1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
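/*
 * Worked example (a sketch, assuming 4KiB VT-d pages, i.e. VTD_PAGE_SHIFT == 12):
 * for a 48-bit guest address width, __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1 and
 * __DOMAIN_MAX_ADDR(48) == (1ULL << 48) - 1; DOMAIN_MAX_PFN() then clamps the
 * PFN so it always fits in an unsigned long.
 */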
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
89 * This bitmap is used to advertise the page sizes our hardware supports
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes that we support.
94 * Traditionally the IOMMU core just handed us the mappings directly,
95 * after making sure the size is an order of a 4KiB page and that the
96 * mapping has natural alignment.
98 * To retain this behavior, we currently advertise that we support
99 * all page sizes that are an order of 4KiB.
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
106 static inline int agaw_to_level(int agaw)
111 static inline int agaw_to_width(int agaw)
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 static inline int width_to_agaw(int width)
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
121 static inline unsigned int level_to_offset_bits(int level)
123 return (level - 1) * LEVEL_STRIDE;
126 static inline int pfn_level_offset(unsigned long pfn, int level)
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 static inline unsigned long level_mask(int level)
133 return -1UL << level_to_offset_bits(level);
136 static inline unsigned long level_size(int level)
138 return 1UL << level_to_offset_bits(level);
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
143 return (pfn + level_size(level) - 1) & level_mask(level);
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
148 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
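/*
 * Note on the level helpers above: each page-table level indexes LEVEL_STRIDE (9)
 * bits of the DMA PFN, so a level-1 PTE maps 4KiB, a level-2 PTE maps 2MiB and a
 * level-3 PTE maps 1GiB of IOVA space.
 */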
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
164 return mm_to_dma_pfn(page_to_pfn(pg));
166 static inline unsigned long virt_to_dma_pfn(void *p)
168 return page_to_dma_pfn(virt_to_page(p));
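/*
 * The mm/dma PFN conversions above account for PAGE_SHIFT differing from
 * VTD_PAGE_SHIFT. On x86 both are 12, so the shifts are by zero; they only
 * change the value when the kernel page size is larger than 4KiB.
 */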
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
178 * set to 1 to panic the kernel if VT-d can't be successfully enabled
179 * (used when the kernel is launched with TXT)
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
196 return re->lo & VTD_PAGE_MASK;
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
208 return re->hi & VTD_PAGE_MASK;
211 static inline void context_clear_pasid_enable(struct context_entry *context)
213 context->lo &= ~(1ULL << 11);
216 static inline bool context_pasid_enabled(struct context_entry *context)
218 return !!(context->lo & (1ULL << 11));
221 static inline void context_set_copied(struct context_entry *context)
223 context->hi |= (1ull << 3);
226 static inline bool context_copied(struct context_entry *context)
228 return !!(context->hi & (1ULL << 3));
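/*
 * context_set_copied()/context_copied() use a software-available bit in the
 * upper qword to mark context entries inherited from a previous kernel
 * (kdump); such entries get an explicit cache/IOTLB flush before being reused.
 */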
231 static inline bool __context_present(struct context_entry *context)
233 return (context->lo & 1);
236 bool context_present(struct context_entry *context)
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
243 static inline void context_set_present(struct context_entry *context)
248 static inline void context_set_fault_enable(struct context_entry *context)
250 context->lo &= (((u64)-1) << 2) | 1;
253 static inline void context_set_translation_type(struct context_entry *context,
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
260 static inline void context_set_address_root(struct context_entry *context,
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
267 static inline void context_set_address_width(struct context_entry *context,
270 context->hi |= value & 7;
273 static inline void context_set_domain_id(struct context_entry *context,
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
279 static inline int context_domain_id(struct context_entry *c)
281 return((c->hi >> 8) & 0xffff);
284 static inline void context_clear_entry(struct context_entry *context)
291 * This domain is a static identity mapping domain.
292 * 1. This domain creates a static 1:1 mapping to all usable memory.
293 * 2. It maps to each iommu if successful.
294 * 3. Each iommu maps to this domain if successful.
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
303 * This is a DMA domain allocated through the iommu domain allocation
304 * interface. But one or more devices belonging to this domain have
305 * been chosen to use a private domain. We should avoid using the
306 * map/unmap/iova_to_phys APIs on it.
308 #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
311 * When VT-d works in the scalable mode, it allows DMA translation to
312 * happen through either first level or second level page table. This
313 * bit marks that the DMA translation for the domain goes through the
314 * first level page table, otherwise, it goes through the second level.
316 #define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(2)
319 * Domain represents a virtual machine which demands iommu nested
320 * translation mode support.
322 #define DOMAIN_FLAG_NESTING_MODE BIT(3)
324 #define for_each_domain_iommu(idx, domain) \
325 for (idx = 0; idx < g_num_of_iommus; idx++) \
326 if (domain->iommu_refcnt[idx])
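/*
 * for_each_domain_iommu() walks every IOMMU index and runs its body only for
 * IOMMUs that currently hold a reference on the domain (iommu_refcnt != 0).
 */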
328 struct dmar_rmrr_unit {
329 struct list_head list; /* list of rmrr units */
330 struct acpi_dmar_header *hdr; /* ACPI header */
331 u64 base_address; /* reserved base address*/
332 u64 end_address; /* reserved end address */
333 struct dmar_dev_scope *devices; /* target devices */
334 int devices_cnt; /* target device count */
337 struct dmar_atsr_unit {
338 struct list_head list; /* list of ATSR units */
339 struct acpi_dmar_header *hdr; /* ACPI header */
340 struct dmar_dev_scope *devices; /* target devices */
341 int devices_cnt; /* target device count */
342 u8 include_all:1; /* include all ports */
345 static LIST_HEAD(dmar_atsr_units);
346 static LIST_HEAD(dmar_rmrr_units);
348 #define for_each_rmrr_units(rmrr) \
349 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
351 /* number of registered IOMMUs; used to size per-IOMMU arrays such as g_iommus */
352 static int g_num_of_iommus;
354 static void domain_exit(struct dmar_domain *domain);
355 static void domain_remove_dev_info(struct dmar_domain *domain);
356 static void dmar_remove_one_dev_info(struct device *dev);
357 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
358 static void domain_context_clear(struct intel_iommu *iommu,
360 static int domain_detach_iommu(struct dmar_domain *domain,
361 struct intel_iommu *iommu);
362 static bool device_is_rmrr_locked(struct device *dev);
363 static int intel_iommu_attach_device(struct iommu_domain *domain,
365 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
368 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
369 int dmar_disabled = 0;
371 int dmar_disabled = 1;
372 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
374 #ifdef INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
375 int intel_iommu_sm = 1;
378 #endif /* INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
380 int intel_iommu_enabled = 0;
381 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
383 static int dmar_map_gfx = 1;
384 static int dmar_forcedac;
385 static int intel_iommu_strict;
386 static int intel_iommu_superpage = 1;
387 static int iommu_identity_mapping;
388 static int intel_no_bounce;
390 #define IDENTMAP_GFX 2
391 #define IDENTMAP_AZALIA 4
393 int intel_iommu_gfx_mapped;
394 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
396 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
397 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
398 DEFINE_SPINLOCK(device_domain_lock);
399 static LIST_HEAD(device_domain_list);
401 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
402 to_pci_dev(d)->untrusted)
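/*
 * Bounce buffering is applied only to untrusted (e.g. externally facing) PCI
 * devices, and can be turned off entirely with the intel_iommu=nobounce
 * command-line option parsed below.
 */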
405 * Iterate over elements in device_domain_list and call the specified
406 * callback @fn against each element.
408 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
409 void *data), void *data)
413 struct device_domain_info *info;
415 spin_lock_irqsave(&device_domain_lock, flags);
416 list_for_each_entry(info, &device_domain_list, global) {
417 ret = fn(info, data);
419 spin_unlock_irqrestore(&device_domain_lock, flags);
423 spin_unlock_irqrestore(&device_domain_lock, flags);
428 const struct iommu_ops intel_iommu_ops;
430 static bool translation_pre_enabled(struct intel_iommu *iommu)
432 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
435 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
437 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
440 static void init_translation_status(struct intel_iommu *iommu)
444 gsts = readl(iommu->reg + DMAR_GSTS_REG);
445 if (gsts & DMA_GSTS_TES)
446 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
449 /* Convert a generic 'struct iommu_domain' to a private 'struct dmar_domain' */
450 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
452 return container_of(dom, struct dmar_domain, domain);
455 static int __init intel_iommu_setup(char *str)
460 if (!strncmp(str, "on", 2)) {
462 pr_info("IOMMU enabled\n");
463 } else if (!strncmp(str, "off", 3)) {
465 no_platform_optin = 1;
466 pr_info("IOMMU disabled\n");
467 } else if (!strncmp(str, "igfx_off", 8)) {
469 pr_info("Disable GFX device mapping\n");
470 } else if (!strncmp(str, "forcedac", 8)) {
471 pr_info("Forcing DAC for PCI devices\n");
473 } else if (!strncmp(str, "strict", 6)) {
474 pr_info("Disable batched IOTLB flush\n");
475 intel_iommu_strict = 1;
476 } else if (!strncmp(str, "sp_off", 6)) {
477 pr_info("Disable supported super page\n");
478 intel_iommu_superpage = 0;
479 } else if (!strncmp(str, "sm_on", 5)) {
480 pr_info("Intel-IOMMU: scalable mode supported\n");
482 } else if (!strncmp(str, "tboot_noforce", 13)) {
484 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
485 intel_iommu_tboot_noforce = 1;
486 } else if (!strncmp(str, "nobounce", 8)) {
487 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
491 str += strcspn(str, ",");
497 __setup("intel_iommu=", intel_iommu_setup);
499 static struct kmem_cache *iommu_domain_cache;
500 static struct kmem_cache *iommu_devinfo_cache;
502 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
504 struct dmar_domain **domains;
507 domains = iommu->domains[idx];
511 return domains[did & 0xff];
514 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
515 struct dmar_domain *domain)
517 struct dmar_domain **domains;
520 if (!iommu->domains[idx]) {
521 size_t size = 256 * sizeof(struct dmar_domain *);
522 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
525 domains = iommu->domains[idx];
526 if (WARN_ON(!domains))
529 domains[did & 0xff] = domain;
532 void *alloc_pgtable_page(int node)
537 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
539 vaddr = page_address(page);
543 void free_pgtable_page(void *vaddr)
545 free_page((unsigned long)vaddr);
548 static inline void *alloc_domain_mem(void)
550 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
553 static void free_domain_mem(void *vaddr)
555 kmem_cache_free(iommu_domain_cache, vaddr);
558 static inline void * alloc_devinfo_mem(void)
560 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
563 static inline void free_devinfo_mem(void *vaddr)
565 kmem_cache_free(iommu_devinfo_cache, vaddr);
568 static inline int domain_type_is_si(struct dmar_domain *domain)
570 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
573 static inline bool domain_use_first_level(struct dmar_domain *domain)
575 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
578 static inline int domain_pfn_supported(struct dmar_domain *domain,
581 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
583 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
586 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
591 sagaw = cap_sagaw(iommu->cap);
592 for (agaw = width_to_agaw(max_gaw);
594 if (test_bit(agaw, &sagaw))
602 * Calculate max SAGAW for each iommu.
604 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
606 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
610 * Calculate the agaw for each iommu.
611 * "SAGAW" may be different across iommus, so use a default agaw and
612 * fall back to a smaller supported agaw for iommus that don't support the default agaw.
614 int iommu_calculate_agaw(struct intel_iommu *iommu)
616 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
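/*
 * Worked example: width_to_agaw(48) == DIV_ROUND_UP(48 - 30, 9) == 2 and
 * width_to_agaw(57) == 3; agaw 2 corresponds to 4-level and agaw 3 to
 * 5-level page tables.
 */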
619 /* This function only returns a single iommu for a domain */
620 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
624 /* si_domain and vm domain should not get here. */
625 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
628 for_each_domain_iommu(iommu_id, domain)
631 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
634 return g_iommus[iommu_id];
637 static void domain_update_iommu_coherency(struct dmar_domain *domain)
639 struct dmar_drhd_unit *drhd;
640 struct intel_iommu *iommu;
644 domain->iommu_coherency = 1;
646 for_each_domain_iommu(i, domain) {
648 if (!ecap_coherent(g_iommus[i]->ecap)) {
649 domain->iommu_coherency = 0;
656 /* No hardware attached; use lowest common denominator */
658 for_each_active_iommu(iommu, drhd) {
659 if (!ecap_coherent(iommu->ecap)) {
660 domain->iommu_coherency = 0;
667 static int domain_update_iommu_snooping(struct intel_iommu *skip)
669 struct dmar_drhd_unit *drhd;
670 struct intel_iommu *iommu;
674 for_each_active_iommu(iommu, drhd) {
676 if (!ecap_sc_support(iommu->ecap)) {
687 static int domain_update_iommu_superpage(struct dmar_domain *domain,
688 struct intel_iommu *skip)
690 struct dmar_drhd_unit *drhd;
691 struct intel_iommu *iommu;
694 if (!intel_iommu_superpage) {
698 /* set iommu_superpage to the smallest common denominator */
700 for_each_active_iommu(iommu, drhd) {
702 if (domain && domain_use_first_level(domain)) {
703 if (!cap_fl1gp_support(iommu->cap))
706 mask &= cap_super_page_val(iommu->cap);
718 /* Some capabilities may be different across iommus */
719 static void domain_update_iommu_cap(struct dmar_domain *domain)
721 domain_update_iommu_coherency(domain);
722 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
723 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
726 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
729 struct root_entry *root = &iommu->root_entry[bus];
730 struct context_entry *context;
734 if (sm_supported(iommu)) {
742 context = phys_to_virt(*entry & VTD_PAGE_MASK);
744 unsigned long phy_addr;
748 context = alloc_pgtable_page(iommu->node);
752 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
753 phy_addr = virt_to_phys((void *)context);
754 *entry = phy_addr | 1;
755 __iommu_flush_cache(iommu, entry, sizeof(*entry));
757 return &context[devfn];
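/*
 * In scalable mode a root entry has two context-table pointers: the lower one
 * covers devfn 0x00-0x7f and the upper one devfn 0x80-0xff, which is why
 * free_context_table() below also frees the table found at devfn 0x80.
 */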
760 static int iommu_dummy(struct device *dev)
762 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
765 static bool attach_deferred(struct device *dev)
767 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
771 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
772 * sub-hierarchy of a candidate PCI-PCI bridge
773 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
774 * @bridge: the candidate PCI-PCI bridge
776 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
779 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
781 struct pci_dev *pdev, *pbridge;
783 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
786 pdev = to_pci_dev(dev);
787 pbridge = to_pci_dev(bridge);
789 if (pbridge->subordinate &&
790 pbridge->subordinate->number <= pdev->bus->number &&
791 pbridge->subordinate->busn_res.end >= pdev->bus->number)
797 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
799 struct dmar_drhd_unit *drhd = NULL;
800 struct intel_iommu *iommu;
802 struct pci_dev *pdev = NULL;
806 if (iommu_dummy(dev))
809 if (dev_is_pci(dev)) {
810 struct pci_dev *pf_pdev;
812 pdev = pci_real_dma_dev(to_pci_dev(dev));
814 /* VFs aren't listed in scope tables; we need to look up
815 * the PF instead to find the IOMMU. */
816 pf_pdev = pci_physfn(pdev);
818 segment = pci_domain_nr(pdev->bus);
819 } else if (has_acpi_companion(dev))
820 dev = &ACPI_COMPANION(dev)->dev;
823 for_each_active_iommu(iommu, drhd) {
824 if (pdev && segment != drhd->segment)
827 for_each_active_dev_scope(drhd->devices,
828 drhd->devices_cnt, i, tmp) {
830 /* For a VF use its original BDF# not that of the PF
831 * which we used for the IOMMU lookup. Strictly speaking
832 * we could do this for all PCI devices; we only need to
833 * get the BDF# from the scope table for ACPI matches. */
834 if (pdev && pdev->is_virtfn)
837 *bus = drhd->devices[i].bus;
838 *devfn = drhd->devices[i].devfn;
842 if (is_downstream_to_pci_bridge(dev, tmp))
846 if (pdev && drhd->include_all) {
848 *bus = pdev->bus->number;
849 *devfn = pdev->devfn;
860 static void domain_flush_cache(struct dmar_domain *domain,
861 void *addr, int size)
863 if (!domain->iommu_coherency)
864 clflush_cache_range(addr, size);
867 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
869 struct context_entry *context;
873 spin_lock_irqsave(&iommu->lock, flags);
874 context = iommu_context_addr(iommu, bus, devfn, 0);
876 ret = context_present(context);
877 spin_unlock_irqrestore(&iommu->lock, flags);
881 static void free_context_table(struct intel_iommu *iommu)
885 struct context_entry *context;
887 spin_lock_irqsave(&iommu->lock, flags);
888 if (!iommu->root_entry) {
891 for (i = 0; i < ROOT_ENTRY_NR; i++) {
892 context = iommu_context_addr(iommu, i, 0, 0);
894 free_pgtable_page(context);
896 if (!sm_supported(iommu))
899 context = iommu_context_addr(iommu, i, 0x80, 0);
901 free_pgtable_page(context);
904 free_pgtable_page(iommu->root_entry);
905 iommu->root_entry = NULL;
907 spin_unlock_irqrestore(&iommu->lock, flags);
910 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
911 unsigned long pfn, int *target_level)
913 struct dma_pte *parent, *pte;
914 int level = agaw_to_level(domain->agaw);
917 BUG_ON(!domain->pgd);
919 if (!domain_pfn_supported(domain, pfn))
920 /* Address beyond IOMMU's addressing capabilities. */
923 parent = domain->pgd;
928 offset = pfn_level_offset(pfn, level);
929 pte = &parent[offset];
930 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
932 if (level == *target_level)
935 if (!dma_pte_present(pte)) {
938 tmp_page = alloc_pgtable_page(domain->nid);
943 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
944 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
945 if (domain_use_first_level(domain))
946 pteval |= DMA_FL_PTE_XD;
947 if (cmpxchg64(&pte->val, 0ULL, pteval))
948 /* Someone else set it while we were thinking; use theirs. */
949 free_pgtable_page(tmp_page);
951 domain_flush_cache(domain, pte, sizeof(*pte));
956 parent = phys_to_virt(dma_pte_addr(pte));
961 *target_level = level;
966 /* return the pte for this address at the given level */
967 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
969 int level, int *large_page)
971 struct dma_pte *parent, *pte;
972 int total = agaw_to_level(domain->agaw);
975 parent = domain->pgd;
976 while (level <= total) {
977 offset = pfn_level_offset(pfn, total);
978 pte = &parent[offset];
982 if (!dma_pte_present(pte)) {
987 if (dma_pte_superpage(pte)) {
992 parent = phys_to_virt(dma_pte_addr(pte));
998 /* clear last-level ptes; a tlb flush must follow */
999 static void dma_pte_clear_range(struct dmar_domain *domain,
1000 unsigned long start_pfn,
1001 unsigned long last_pfn)
1003 unsigned int large_page;
1004 struct dma_pte *first_pte, *pte;
1006 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1007 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1008 BUG_ON(start_pfn > last_pfn);
1010 /* we don't need lock here; nobody else touches the iova range */
1013 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1015 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1020 start_pfn += lvl_to_nr_pages(large_page);
1022 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1024 domain_flush_cache(domain, first_pte,
1025 (void *)pte - (void *)first_pte);
1027 } while (start_pfn && start_pfn <= last_pfn);
1030 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1031 int retain_level, struct dma_pte *pte,
1032 unsigned long pfn, unsigned long start_pfn,
1033 unsigned long last_pfn)
1035 pfn = max(start_pfn, pfn);
1036 pte = &pte[pfn_level_offset(pfn, level)];
1039 unsigned long level_pfn;
1040 struct dma_pte *level_pte;
1042 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1045 level_pfn = pfn & level_mask(level);
1046 level_pte = phys_to_virt(dma_pte_addr(pte));
1049 dma_pte_free_level(domain, level - 1, retain_level,
1050 level_pte, level_pfn, start_pfn,
1055 * Free the page table if we're below the level we want to
1056 * retain and the range covers the entire table.
1058 if (level < retain_level && !(start_pfn > level_pfn ||
1059 last_pfn < level_pfn + level_size(level) - 1)) {
1061 domain_flush_cache(domain, pte, sizeof(*pte));
1062 free_pgtable_page(level_pte);
1065 pfn += level_size(level);
1066 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1070 * clear last level (leaf) ptes and free page table pages below the
1071 * level we wish to keep intact.
1073 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1074 unsigned long start_pfn,
1075 unsigned long last_pfn,
1078 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1079 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1080 BUG_ON(start_pfn > last_pfn);
1082 dma_pte_clear_range(domain, start_pfn, last_pfn);
1084 /* We don't need lock here; nobody else touches the iova range */
1085 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1086 domain->pgd, 0, start_pfn, last_pfn);
1089 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1090 free_pgtable_page(domain->pgd);
1095 /* When a page at a given level is being unlinked from its parent, we don't
1096 need to *modify* it at all. All we need to do is make a list of all the
1097 pages which can be freed just as soon as we've flushed the IOTLB and we
1098 know the hardware page-walk will no longer touch them.
1099 The 'pte' argument is the *parent* PTE, pointing to the page that is to be freed. */
1101 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1102 int level, struct dma_pte *pte,
1103 struct page *freelist)
1107 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1108 pg->freelist = freelist;
1114 pte = page_address(pg);
1116 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1117 freelist = dma_pte_list_pagetables(domain, level - 1,
1120 } while (!first_pte_in_page(pte));
1125 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1126 struct dma_pte *pte, unsigned long pfn,
1127 unsigned long start_pfn,
1128 unsigned long last_pfn,
1129 struct page *freelist)
1131 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1133 pfn = max(start_pfn, pfn);
1134 pte = &pte[pfn_level_offset(pfn, level)];
1137 unsigned long level_pfn;
1139 if (!dma_pte_present(pte))
1142 level_pfn = pfn & level_mask(level);
1144 /* If range covers entire pagetable, free it */
1145 if (start_pfn <= level_pfn &&
1146 last_pfn >= level_pfn + level_size(level) - 1) {
1147 /* These subordinate page tables are going away entirely. Don't
1148 bother to clear them; we're just going to *free* them. */
1149 if (level > 1 && !dma_pte_superpage(pte))
1150 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1156 } else if (level > 1) {
1157 /* Recurse down into a level that isn't *entirely* obsolete */
1158 freelist = dma_pte_clear_level(domain, level - 1,
1159 phys_to_virt(dma_pte_addr(pte)),
1160 level_pfn, start_pfn, last_pfn,
1164 pfn += level_size(level);
1165 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1168 domain_flush_cache(domain, first_pte,
1169 (void *)++last_pte - (void *)first_pte);
1174 /* We can't just free the pages because the IOMMU may still be walking
1175 the page tables, and may have cached the intermediate levels. The
1176 pages can only be freed after the IOTLB flush has been done. */
1177 static struct page *domain_unmap(struct dmar_domain *domain,
1178 unsigned long start_pfn,
1179 unsigned long last_pfn)
1181 struct page *freelist;
1183 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1184 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1185 BUG_ON(start_pfn > last_pfn);
1187 /* we don't need lock here; nobody else touches the iova range */
1188 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1189 domain->pgd, 0, start_pfn, last_pfn, NULL);
1192 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1193 struct page *pgd_page = virt_to_page(domain->pgd);
1194 pgd_page->freelist = freelist;
1195 freelist = pgd_page;
1203 static void dma_free_pagelist(struct page *freelist)
1207 while ((pg = freelist)) {
1208 freelist = pg->freelist;
1209 free_pgtable_page(page_address(pg));
1213 static void iova_entry_free(unsigned long data)
1215 struct page *freelist = (struct page *)data;
1217 dma_free_pagelist(freelist);
1220 /* iommu handling */
1221 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1223 struct root_entry *root;
1224 unsigned long flags;
1226 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1228 pr_err("Allocating root entry for %s failed\n",
1233 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1235 spin_lock_irqsave(&iommu->lock, flags);
1236 iommu->root_entry = root;
1237 spin_unlock_irqrestore(&iommu->lock, flags);
1242 static void iommu_set_root_entry(struct intel_iommu *iommu)
1248 addr = virt_to_phys(iommu->root_entry);
1249 if (sm_supported(iommu))
1250 addr |= DMA_RTADDR_SMT;
1252 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1253 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1255 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1257 /* Make sure hardware completes it */
1258 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1259 readl, (sts & DMA_GSTS_RTPS), sts);
1261 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1264 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1269 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1272 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1273 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1275 /* Make sure hardware completes it */
1276 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1277 readl, (!(val & DMA_GSTS_WBFS)), val);
1279 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1282 /* return value determines if we need a write buffer flush */
1283 static void __iommu_flush_context(struct intel_iommu *iommu,
1284 u16 did, u16 source_id, u8 function_mask,
1291 case DMA_CCMD_GLOBAL_INVL:
1292 val = DMA_CCMD_GLOBAL_INVL;
1294 case DMA_CCMD_DOMAIN_INVL:
1295 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1297 case DMA_CCMD_DEVICE_INVL:
1298 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1299 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1304 val |= DMA_CCMD_ICC;
1306 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1307 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1309 /* Make sure hardware completes it */
1310 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1311 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1313 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1316 /* return value determines if we need a write buffer flush */
1317 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1318 u64 addr, unsigned int size_order, u64 type)
1320 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1321 u64 val = 0, val_iva = 0;
1325 case DMA_TLB_GLOBAL_FLUSH:
1326 /* a global flush doesn't need to set IVA_REG */
1327 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1329 case DMA_TLB_DSI_FLUSH:
1330 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1332 case DMA_TLB_PSI_FLUSH:
1333 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1334 /* IH bit is passed in as part of address */
1335 val_iva = size_order | addr;
1340 /* Note: set drain read/write */
1343 * This is probably meant to be extra secure. Looks like we can
1344 * ignore it without any impact.
1346 if (cap_read_drain(iommu->cap))
1347 val |= DMA_TLB_READ_DRAIN;
1349 if (cap_write_drain(iommu->cap))
1350 val |= DMA_TLB_WRITE_DRAIN;
1352 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1353 /* Note: Only uses first TLB reg currently */
1355 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1356 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1358 /* Make sure hardware completes it */
1359 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1360 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1362 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1364 /* check IOTLB invalidation granularity */
1365 if (DMA_TLB_IAIG(val) == 0)
1366 pr_err("Flush IOTLB failed\n");
1367 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1368 pr_debug("TLB flush request %Lx, actual %Lx\n",
1369 (unsigned long long)DMA_TLB_IIRG(type),
1370 (unsigned long long)DMA_TLB_IAIG(val));
1373 static struct device_domain_info *
1374 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1377 struct device_domain_info *info;
1379 assert_spin_locked(&device_domain_lock);
1384 list_for_each_entry(info, &domain->devices, link)
1385 if (info->iommu == iommu && info->bus == bus &&
1386 info->devfn == devfn) {
1387 if (info->ats_supported && info->dev)
1395 static void domain_update_iotlb(struct dmar_domain *domain)
1397 struct device_domain_info *info;
1398 bool has_iotlb_device = false;
1400 assert_spin_locked(&device_domain_lock);
1402 list_for_each_entry(info, &domain->devices, link) {
1403 struct pci_dev *pdev;
1405 if (!info->dev || !dev_is_pci(info->dev))
1408 pdev = to_pci_dev(info->dev);
1409 if (pdev->ats_enabled) {
1410 has_iotlb_device = true;
1415 domain->has_iotlb_device = has_iotlb_device;
1418 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1420 struct pci_dev *pdev;
1422 assert_spin_locked(&device_domain_lock);
1424 if (!info || !dev_is_pci(info->dev))
1427 pdev = to_pci_dev(info->dev);
1428 /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1429 * the PFSID to the invalidation descriptor of a VF so that the IOMMU HW
1430 * can gauge queue depth at the PF level. If DIT is not set, PFSID is
1431 * treated as reserved and should be set to 0.
1433 if (!ecap_dit(info->iommu->ecap))
1436 struct pci_dev *pf_pdev;
1438 /* pdev will be returned if device is not a vf */
1439 pf_pdev = pci_physfn(pdev);
1440 info->pfsid = pci_dev_id(pf_pdev);
1443 #ifdef CONFIG_INTEL_IOMMU_SVM
1444 /* The PCIe spec, in its wisdom, declares that the behaviour of
1445 the device if you enable PASID support after ATS support is
1446 undefined. So always enable PASID support on devices which
1447 have it, even if we can't yet know if we're ever going to use it. */
1449 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1450 info->pasid_enabled = 1;
1452 if (info->pri_supported &&
1453 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1454 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1455 info->pri_enabled = 1;
1457 if (!pdev->untrusted && info->ats_supported &&
1458 pci_ats_page_aligned(pdev) &&
1459 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1460 info->ats_enabled = 1;
1461 domain_update_iotlb(info->domain);
1462 info->ats_qdep = pci_ats_queue_depth(pdev);
1466 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1468 struct pci_dev *pdev;
1470 assert_spin_locked(&device_domain_lock);
1472 if (!dev_is_pci(info->dev))
1475 pdev = to_pci_dev(info->dev);
1477 if (info->ats_enabled) {
1478 pci_disable_ats(pdev);
1479 info->ats_enabled = 0;
1480 domain_update_iotlb(info->domain);
1482 #ifdef CONFIG_INTEL_IOMMU_SVM
1483 if (info->pri_enabled) {
1484 pci_disable_pri(pdev);
1485 info->pri_enabled = 0;
1487 if (info->pasid_enabled) {
1488 pci_disable_pasid(pdev);
1489 info->pasid_enabled = 0;
1494 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1495 u64 addr, unsigned mask)
1498 unsigned long flags;
1499 struct device_domain_info *info;
1501 if (!domain->has_iotlb_device)
1504 spin_lock_irqsave(&device_domain_lock, flags);
1505 list_for_each_entry(info, &domain->devices, link) {
1506 if (!info->ats_enabled)
1509 sid = info->bus << 8 | info->devfn;
1510 qdep = info->ats_qdep;
1511 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1514 spin_unlock_irqrestore(&device_domain_lock, flags);
1517 static void domain_flush_piotlb(struct intel_iommu *iommu,
1518 struct dmar_domain *domain,
1519 u64 addr, unsigned long npages, bool ih)
1521 u16 did = domain->iommu_did[iommu->seq_id];
1523 if (domain->default_pasid)
1524 qi_flush_piotlb(iommu, did, domain->default_pasid,
1527 if (!list_empty(&domain->devices))
1528 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1531 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1532 struct dmar_domain *domain,
1533 unsigned long pfn, unsigned int pages,
1536 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1537 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1538 u16 did = domain->iommu_did[iommu->seq_id];
1545 if (domain_use_first_level(domain)) {
1546 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1549 * Fall back to domain-selective flush if there is no PSI support or
1550 * the size is too big. PSI requires the page size to be 2 ^ x,
1551 * and the base address to be naturally aligned to the size.
1553 if (!cap_pgsel_inv(iommu->cap) ||
1554 mask > cap_max_amask_val(iommu->cap))
1555 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1558 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1563 * In caching mode, changes of pages from non-present to present require
1564 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1566 if (!cap_caching_mode(iommu->cap) || !map)
1567 iommu_flush_dev_iotlb(domain, addr, mask);
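/*
 * Note on the mask above: a page-selective invalidation covers 2^mask pages
 * aligned to that size, so e.g. a 3-page request is rounded up to mask 2
 * (4 pages) to satisfy the hardware's power-of-two requirement.
 */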
1570 /* Notification for newly created mappings */
1571 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1572 struct dmar_domain *domain,
1573 unsigned long pfn, unsigned int pages)
1576 * It's a non-present to present mapping. Only flush if caching mode is set and the domain uses second-level translation.
1579 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1580 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1582 iommu_flush_write_buffer(iommu);
1585 static void iommu_flush_iova(struct iova_domain *iovad)
1587 struct dmar_domain *domain;
1590 domain = container_of(iovad, struct dmar_domain, iovad);
1592 for_each_domain_iommu(idx, domain) {
1593 struct intel_iommu *iommu = g_iommus[idx];
1594 u16 did = domain->iommu_did[iommu->seq_id];
1596 if (domain_use_first_level(domain))
1597 domain_flush_piotlb(iommu, domain, 0, -1, 0);
1599 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1602 if (!cap_caching_mode(iommu->cap))
1603 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1604 0, MAX_AGAW_PFN_WIDTH);
1608 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1611 unsigned long flags;
1613 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1616 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1617 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1618 pmen &= ~DMA_PMEN_EPM;
1619 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1621 /* wait for the protected region status bit to clear */
1622 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1623 readl, !(pmen & DMA_PMEN_PRS), pmen);
1625 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1628 static void iommu_enable_translation(struct intel_iommu *iommu)
1631 unsigned long flags;
1633 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1634 iommu->gcmd |= DMA_GCMD_TE;
1635 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1637 /* Make sure hardware completes it */
1638 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1639 readl, (sts & DMA_GSTS_TES), sts);
1641 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1644 static void iommu_disable_translation(struct intel_iommu *iommu)
1649 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1650 iommu->gcmd &= ~DMA_GCMD_TE;
1651 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1653 /* Make sure hardware completes it */
1654 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1655 readl, (!(sts & DMA_GSTS_TES)), sts);
1657 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1660 static int iommu_init_domains(struct intel_iommu *iommu)
1662 u32 ndomains, nlongs;
1665 ndomains = cap_ndoms(iommu->cap);
1666 pr_debug("%s: Number of Domains supported <%d>\n",
1667 iommu->name, ndomains);
1668 nlongs = BITS_TO_LONGS(ndomains);
1670 spin_lock_init(&iommu->lock);
1672 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1673 if (!iommu->domain_ids) {
1674 pr_err("%s: Allocating domain id array failed\n",
1679 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1680 iommu->domains = kzalloc(size, GFP_KERNEL);
1682 if (iommu->domains) {
1683 size = 256 * sizeof(struct dmar_domain *);
1684 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1687 if (!iommu->domains || !iommu->domains[0]) {
1688 pr_err("%s: Allocating domain array failed\n",
1690 kfree(iommu->domain_ids);
1691 kfree(iommu->domains);
1692 iommu->domain_ids = NULL;
1693 iommu->domains = NULL;
1698 * If Caching mode is set, then invalid translations are tagged
1699 * with domain-id 0, hence we need to pre-allocate it. We also
1700 * use domain-id 0 as a marker for non-allocated domain-id, so
1701 * make sure it is not used for a real domain.
1703 set_bit(0, iommu->domain_ids);
1706 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1707 * entry for first-level or pass-through translation modes should
1708 * be programmed with a domain id different from those used for
1709 * second-level or nested translation. We reserve a domain id for this purpose.
1712 if (sm_supported(iommu))
1713 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1718 static void disable_dmar_iommu(struct intel_iommu *iommu)
1720 struct device_domain_info *info, *tmp;
1721 unsigned long flags;
1723 if (!iommu->domains || !iommu->domain_ids)
1726 spin_lock_irqsave(&device_domain_lock, flags);
1727 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1728 if (info->iommu != iommu)
1731 if (!info->dev || !info->domain)
1734 __dmar_remove_one_dev_info(info);
1736 spin_unlock_irqrestore(&device_domain_lock, flags);
1738 if (iommu->gcmd & DMA_GCMD_TE)
1739 iommu_disable_translation(iommu);
1742 static void free_dmar_iommu(struct intel_iommu *iommu)
1744 if ((iommu->domains) && (iommu->domain_ids)) {
1745 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1748 for (i = 0; i < elems; i++)
1749 kfree(iommu->domains[i]);
1750 kfree(iommu->domains);
1751 kfree(iommu->domain_ids);
1752 iommu->domains = NULL;
1753 iommu->domain_ids = NULL;
1756 g_iommus[iommu->seq_id] = NULL;
1758 /* free context mapping */
1759 free_context_table(iommu);
1761 #ifdef CONFIG_INTEL_IOMMU_SVM
1762 if (pasid_supported(iommu)) {
1763 if (ecap_prs(iommu->ecap))
1764 intel_svm_finish_prq(iommu);
1770 * Check and return whether first level is used by default for DMA translation.
1773 static bool first_level_by_default(void)
1775 struct dmar_drhd_unit *drhd;
1776 struct intel_iommu *iommu;
1777 static int first_level_support = -1;
1779 if (likely(first_level_support != -1))
1780 return first_level_support;
1782 first_level_support = 1;
1785 for_each_active_iommu(iommu, drhd) {
1786 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1787 first_level_support = 0;
1793 return first_level_support;
1796 static struct dmar_domain *alloc_domain(int flags)
1798 struct dmar_domain *domain;
1800 domain = alloc_domain_mem();
1804 memset(domain, 0, sizeof(*domain));
1805 domain->nid = NUMA_NO_NODE;
1806 domain->flags = flags;
1807 if (first_level_by_default())
1808 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1809 domain->has_iotlb_device = false;
1810 INIT_LIST_HEAD(&domain->devices);
1815 /* Must be called with iommu->lock */
1816 static int domain_attach_iommu(struct dmar_domain *domain,
1817 struct intel_iommu *iommu)
1819 unsigned long ndomains;
1822 assert_spin_locked(&device_domain_lock);
1823 assert_spin_locked(&iommu->lock);
1825 domain->iommu_refcnt[iommu->seq_id] += 1;
1826 domain->iommu_count += 1;
1827 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1828 ndomains = cap_ndoms(iommu->cap);
1829 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1831 if (num >= ndomains) {
1832 pr_err("%s: No free domain ids\n", iommu->name);
1833 domain->iommu_refcnt[iommu->seq_id] -= 1;
1834 domain->iommu_count -= 1;
1838 set_bit(num, iommu->domain_ids);
1839 set_iommu_domain(iommu, num, domain);
1841 domain->iommu_did[iommu->seq_id] = num;
1842 domain->nid = iommu->node;
1844 domain_update_iommu_cap(domain);
1850 static int domain_detach_iommu(struct dmar_domain *domain,
1851 struct intel_iommu *iommu)
1855 assert_spin_locked(&device_domain_lock);
1856 assert_spin_locked(&iommu->lock);
1858 domain->iommu_refcnt[iommu->seq_id] -= 1;
1859 count = --domain->iommu_count;
1860 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1861 num = domain->iommu_did[iommu->seq_id];
1862 clear_bit(num, iommu->domain_ids);
1863 set_iommu_domain(iommu, num, NULL);
1865 domain_update_iommu_cap(domain);
1866 domain->iommu_did[iommu->seq_id] = 0;
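/*
 * domain_attach_iommu()/domain_detach_iommu() above refcount a domain's use of
 * each IOMMU: the first attach allocates a per-IOMMU domain id from
 * iommu->domain_ids and the last detach releases it, with the cached domain
 * capabilities recomputed in both cases.
 */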
1872 static struct iova_domain reserved_iova_list;
1873 static struct lock_class_key reserved_rbtree_key;
1875 static int dmar_init_reserved_ranges(void)
1877 struct pci_dev *pdev = NULL;
1881 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1883 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1884 &reserved_rbtree_key);
1886 /* IOAPIC ranges shouldn't be accessed by DMA */
1887 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1888 IOVA_PFN(IOAPIC_RANGE_END));
1890 pr_err("Reserve IOAPIC range failed\n");
1894 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1895 for_each_pci_dev(pdev) {
1898 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1899 r = &pdev->resource[i];
1900 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1902 iova = reserve_iova(&reserved_iova_list,
1906 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1914 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1916 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1919 static inline int guestwidth_to_adjustwidth(int gaw)
1922 int r = (gaw - 12) % 9;
1933 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1936 int adjust_width, agaw;
1937 unsigned long sagaw;
1940 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1942 if (!intel_iommu_strict) {
1943 ret = init_iova_flush_queue(&domain->iovad,
1944 iommu_flush_iova, iova_entry_free);
1946 pr_info("iova flush queue initialization failed\n");
1949 domain_reserve_special_ranges(domain);
1951 /* calculate AGAW */
1952 if (guest_width > cap_mgaw(iommu->cap))
1953 guest_width = cap_mgaw(iommu->cap);
1954 domain->gaw = guest_width;
1955 adjust_width = guestwidth_to_adjustwidth(guest_width);
1956 agaw = width_to_agaw(adjust_width);
1957 sagaw = cap_sagaw(iommu->cap);
1958 if (!test_bit(agaw, &sagaw)) {
1959 /* hardware doesn't support it, choose a bigger one */
1960 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1961 agaw = find_next_bit(&sagaw, 5, agaw);
1965 domain->agaw = agaw;
1967 if (ecap_coherent(iommu->ecap))
1968 domain->iommu_coherency = 1;
1970 domain->iommu_coherency = 0;
1972 if (ecap_sc_support(iommu->ecap))
1973 domain->iommu_snooping = 1;
1975 domain->iommu_snooping = 0;
1977 if (intel_iommu_superpage)
1978 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1980 domain->iommu_superpage = 0;
1982 domain->nid = iommu->node;
1984 /* always allocate the top pgd */
1985 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1988 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1992 static void domain_exit(struct dmar_domain *domain)
1995 /* Remove associated devices and clear attached or cached domains */
1996 domain_remove_dev_info(domain);
1999 put_iova_domain(&domain->iovad);
2002 struct page *freelist;
2004 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2005 dma_free_pagelist(freelist);
2008 free_domain_mem(domain);
2012 * Get the PASID directory size for scalable mode context entry.
2013 * Value of X in the PDTS field of a scalable mode context entry
2014 * indicates PASID directory with 2^(X + 7) entries.
2016 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2020 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2021 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2029 * Set the RID_PASID field of a scalable mode context entry. The
2030 * IOMMU hardware will use the PASID value set in this field for
2031 * DMA translations of DMA requests without PASID.
2034 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2036 context->hi |= pasid & ((1 << 20) - 1);
2037 context->hi |= (1 << 20);
2041 * Set the DTE (Device-TLB Enable) field of a scalable mode context entry.
2044 static inline void context_set_sm_dte(struct context_entry *context)
2046 context->lo |= (1 << 2);
2050 * Set the PRE (Page Request Enable) field of a scalable mode context entry.
2053 static inline void context_set_sm_pre(struct context_entry *context)
2055 context->lo |= (1 << 4);
2058 /* Convert value to context PASID directory size field coding. */
2059 #define context_pdts(pds) (((pds) & 0x7) << 9)
2061 static int domain_context_mapping_one(struct dmar_domain *domain,
2062 struct intel_iommu *iommu,
2063 struct pasid_table *table,
2066 u16 did = domain->iommu_did[iommu->seq_id];
2067 int translation = CONTEXT_TT_MULTI_LEVEL;
2068 struct device_domain_info *info = NULL;
2069 struct context_entry *context;
2070 unsigned long flags;
2075 if (hw_pass_through && domain_type_is_si(domain))
2076 translation = CONTEXT_TT_PASS_THROUGH;
2078 pr_debug("Set context mapping for %02x:%02x.%d\n",
2079 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2081 BUG_ON(!domain->pgd);
2083 spin_lock_irqsave(&device_domain_lock, flags);
2084 spin_lock(&iommu->lock);
2087 context = iommu_context_addr(iommu, bus, devfn, 1);
2092 if (context_present(context))
2096 * For kdump cases, old valid entries may be cached due to the
2097 * in-flight DMA and copied pgtable, but there is no unmapping
2098 * behaviour for them, thus we need an explicit cache flush for
2099 * the newly-mapped device. For kdump, at this point, the device
2100 * is supposed to finish reset at its driver probe stage, so no
2101 * in-flight DMA will exist, and we don't need to worry anymore hereafter.
2104 if (context_copied(context)) {
2105 u16 did_old = context_domain_id(context);
2107 if (did_old < cap_ndoms(iommu->cap)) {
2108 iommu->flush.flush_context(iommu, did_old,
2109 (((u16)bus) << 8) | devfn,
2110 DMA_CCMD_MASK_NOBIT,
2111 DMA_CCMD_DEVICE_INVL);
2112 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2117 context_clear_entry(context);
2119 if (sm_supported(iommu)) {
2124 /* Setup the PASID DIR pointer: */
2125 pds = context_get_sm_pds(table);
2126 context->lo = (u64)virt_to_phys(table->table) |
2129 /* Setup the RID_PASID field: */
2130 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2133 * Setup the Device-TLB enable bit and the Page request enable bit:
2136 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2137 if (info && info->ats_supported)
2138 context_set_sm_dte(context);
2139 if (info && info->pri_supported)
2140 context_set_sm_pre(context);
2142 struct dma_pte *pgd = domain->pgd;
2145 context_set_domain_id(context, did);
2147 if (translation != CONTEXT_TT_PASS_THROUGH) {
2149 * Skip top levels of page tables for an iommu which has
2150 * less agaw than the default. Unnecessary for PT mode.
2152 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2154 pgd = phys_to_virt(dma_pte_addr(pgd));
2155 if (!dma_pte_present(pgd))
2159 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2160 if (info && info->ats_supported)
2161 translation = CONTEXT_TT_DEV_IOTLB;
2163 translation = CONTEXT_TT_MULTI_LEVEL;
2165 context_set_address_root(context, virt_to_phys(pgd));
2166 context_set_address_width(context, agaw);
2169 * In pass through mode, AW must be programmed to
2170 * indicate the largest AGAW value supported by
2171 * hardware. And ASR is ignored by hardware.
2173 context_set_address_width(context, iommu->msagaw);
2176 context_set_translation_type(context, translation);
2179 context_set_fault_enable(context);
2180 context_set_present(context);
2181 domain_flush_cache(domain, context, sizeof(*context));
2184 * It's a non-present to present mapping. If hardware doesn't cache
2185 * non-present entries we only need to flush the write-buffer. If the hardware
2186 * _does_ cache non-present entries, then it does so in the special
2187 * domain #0, which we have to flush:
2189 if (cap_caching_mode(iommu->cap)) {
2190 iommu->flush.flush_context(iommu, 0,
2191 (((u16)bus) << 8) | devfn,
2192 DMA_CCMD_MASK_NOBIT,
2193 DMA_CCMD_DEVICE_INVL);
2194 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2196 iommu_flush_write_buffer(iommu);
2198 iommu_enable_dev_iotlb(info);
2203 spin_unlock(&iommu->lock);
2204 spin_unlock_irqrestore(&device_domain_lock, flags);
2209 struct domain_context_mapping_data {
2210 struct dmar_domain *domain;
2211 struct intel_iommu *iommu;
2212 struct pasid_table *table;
2215 static int domain_context_mapping_cb(struct pci_dev *pdev,
2216 u16 alias, void *opaque)
2218 struct domain_context_mapping_data *data = opaque;
2220 return domain_context_mapping_one(data->domain, data->iommu,
2221 data->table, PCI_BUS_NUM(alias),
2226 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2228 struct domain_context_mapping_data data;
2229 struct pasid_table *table;
2230 struct intel_iommu *iommu;
2233 iommu = device_to_iommu(dev, &bus, &devfn);
2237 table = intel_pasid_get_table(dev);
2239 if (!dev_is_pci(dev))
2240 return domain_context_mapping_one(domain, iommu, table,
2243 data.domain = domain;
2247 return pci_for_each_dma_alias(to_pci_dev(dev),
2248 &domain_context_mapping_cb, &data);
2251 static int domain_context_mapped_cb(struct pci_dev *pdev,
2252 u16 alias, void *opaque)
2254 struct intel_iommu *iommu = opaque;
2256 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2259 static int domain_context_mapped(struct device *dev)
2261 struct intel_iommu *iommu;
2264 iommu = device_to_iommu(dev, &bus, &devfn);
2268 if (!dev_is_pci(dev))
2269 return device_context_mapped(iommu, bus, devfn);
2271 return !pci_for_each_dma_alias(to_pci_dev(dev),
2272 domain_context_mapped_cb, iommu);
2275 /* Returns the number of VT-d pages, but aligned to the MM page size */
2276 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2279 host_addr &= ~PAGE_MASK;
2280 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
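/*
 * Worked example (assuming 4KiB MM pages): aligned_nrpages(0x1234, 0x2000)
 * masks the address down to its 0x234 in-page offset, rounds 0x234 + 0x2000
 * up to 0x3000 and returns 3 VT-d pages.
 */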
2283 /* Return largest possible superpage level for a given mapping */
2284 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2285 unsigned long iov_pfn,
2286 unsigned long phy_pfn,
2287 unsigned long pages)
2289 int support, level = 1;
2290 unsigned long pfnmerge;
2292 support = domain->iommu_superpage;
2294 /* To use a large page, the virtual *and* physical addresses
2295 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2296 of them will mean we have to use smaller pages. So just
2297 merge them and check both at once. */
2298 pfnmerge = iov_pfn | phy_pfn;
2300 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2301 pages >>= VTD_STRIDE_SHIFT;
2304 pfnmerge >>= VTD_STRIDE_SHIFT;
2311 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2312 struct scatterlist *sg, unsigned long phys_pfn,
2313 unsigned long nr_pages, int prot)
2315 struct dma_pte *first_pte = NULL, *pte = NULL;
2316 phys_addr_t uninitialized_var(pteval);
2317 unsigned long sg_res = 0;
2318 unsigned int largepage_lvl = 0;
2319 unsigned long lvl_pages = 0;
2322 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2324 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2327 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2328 if (domain_use_first_level(domain))
2329 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD;
2333 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2336 while (nr_pages > 0) {
2340 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2342 sg_res = aligned_nrpages(sg->offset, sg->length);
2343 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2344 sg->dma_length = sg->length;
2345 pteval = (sg_phys(sg) - pgoff) | attr;
2346 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2350 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2352 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2355 /* It is a large page */
2356 if (largepage_lvl > 1) {
2357 unsigned long nr_superpages, end_pfn;
2359 pteval |= DMA_PTE_LARGE_PAGE;
2360 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2362 nr_superpages = sg_res / lvl_pages;
2363 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2366 * Ensure that old small page tables are
2367 * removed to make room for superpage(s).
2368 * We're adding new large pages, so make sure
2369 * we don't remove their parent tables.
2371 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2374 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2378 /* We don't need lock here, nobody else
2379 * touches the iova range
2381 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2383 static int dumps = 5;
2384 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2385 iov_pfn, tmp, (unsigned long long)pteval);
2388 debug_dma_dump_mappings(NULL);
2393 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2395 BUG_ON(nr_pages < lvl_pages);
2396 BUG_ON(sg_res < lvl_pages);
2398 nr_pages -= lvl_pages;
2399 iov_pfn += lvl_pages;
2400 phys_pfn += lvl_pages;
2401 pteval += lvl_pages * VTD_PAGE_SIZE;
2402 sg_res -= lvl_pages;
2404 /* If the next PTE would be the first in a new page, then we
2405 need to flush the cache on the entries we've just written.
2406 And then we'll need to recalculate 'pte', so clear it and
2407 let it get set again in the if (!pte) block above.
2409 If we're done (!nr_pages) we need to flush the cache too.
2411 Also if we've been setting superpages, we may need to
2412 recalculate 'pte' and switch back to smaller pages for the
2413 end of the mapping, if the trailing size is not enough to
2414 use another superpage (i.e. sg_res < lvl_pages). */
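/*
 * Concrete case (a sketch, assuming 4KiB page-table pages of 512
 * eight-byte PTEs): once the entry at index 511 has been written and
 * 'pte' advances, first_pte_in_page() sees the start of a new table
 * page, so the span just written is flushed and 'pte' is recomputed
 * through pfn_to_dma_pte() on the next iteration.
 */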
2416 if (!nr_pages || first_pte_in_page(pte) ||
2417 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2418 domain_flush_cache(domain, first_pte,
2419 (void *)pte - (void *)first_pte);
2423 if (!sg_res && nr_pages)
2429 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2430 struct scatterlist *sg, unsigned long phys_pfn,
2431 unsigned long nr_pages, int prot)
2434 struct intel_iommu *iommu;
2436 /* Do the real mapping first */
2437 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2441 for_each_domain_iommu(iommu_id, domain) {
2442 iommu = g_iommus[iommu_id];
2443 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2449 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2450 struct scatterlist *sg, unsigned long nr_pages,
2453 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2456 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2457 unsigned long phys_pfn, unsigned long nr_pages,
2460 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
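/*
 * Usage sketch (hypothetical values, not taken from a real caller): a
 * physically contiguous 8KiB buffer at host PFN 0x1234 could be mapped
 * at IOVA PFN 0x10 with
 *
 *	domain_pfn_mapping(domain, 0x10, 0x1234, 2,
 *			   DMA_PTE_READ | DMA_PTE_WRITE);
 *
 * whereas scatter-gather callers hand their sg list to
 * domain_sg_mapping() and let __domain_mapping() walk the chunks.
 */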
2463 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2465 unsigned long flags;
2466 struct context_entry *context;
2472 spin_lock_irqsave(&iommu->lock, flags);
2473 context = iommu_context_addr(iommu, bus, devfn, 0);
2475 spin_unlock_irqrestore(&iommu->lock, flags);
2478 did_old = context_domain_id(context);
2479 context_clear_entry(context);
2480 __iommu_flush_cache(iommu, context, sizeof(*context));
2481 spin_unlock_irqrestore(&iommu->lock, flags);
2482 iommu->flush.flush_context(iommu,
2484 (((u16)bus) << 8) | devfn,
2485 DMA_CCMD_MASK_NOBIT,
2486 DMA_CCMD_DEVICE_INVL);
2487 iommu->flush.flush_iotlb(iommu,
2494 static inline void unlink_domain_info(struct device_domain_info *info)
2496 assert_spin_locked(&device_domain_lock);
2497 list_del(&info->link);
2498 list_del(&info->global);
2500 info->dev->archdata.iommu = NULL;
2503 static void domain_remove_dev_info(struct dmar_domain *domain)
2505 struct device_domain_info *info, *tmp;
2506 unsigned long flags;
2508 spin_lock_irqsave(&device_domain_lock, flags);
2509 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2510 __dmar_remove_one_dev_info(info);
2511 spin_unlock_irqrestore(&device_domain_lock, flags);
2514 struct dmar_domain *find_domain(struct device *dev)
2516 struct device_domain_info *info;
2518 if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
2521 if (dev_is_pci(dev))
2522 dev = &pci_real_dma_dev(to_pci_dev(dev))->dev;
2524 /* No lock here, assumes no domain exit in normal case */
2525 info = dev->archdata.iommu;
2527 return info->domain;
2532 static void do_deferred_attach(struct device *dev)
2534 struct iommu_domain *domain;
2536 dev->archdata.iommu = NULL;
2537 domain = iommu_get_domain_for_dev(dev);
2539 intel_iommu_attach_device(domain, dev);
2542 static inline struct device_domain_info *
2543 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2545 struct device_domain_info *info;
2547 list_for_each_entry(info, &device_domain_list, global)
2548 if (info->iommu->segment == segment && info->bus == bus &&
2549 info->devfn == devfn)
2555 static int domain_setup_first_level(struct intel_iommu *iommu,
2556 struct dmar_domain *domain,
2560 int flags = PASID_FLAG_SUPERVISOR_MODE;
2561 struct dma_pte *pgd = domain->pgd;
2565 * Skip top levels of the page tables for an iommu which has
2566 * a smaller agaw than the default. Unnecessary for PT mode.
2568 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2569 pgd = phys_to_virt(dma_pte_addr(pgd));
2570 if (!dma_pte_present(pgd))
2574 level = agaw_to_level(agaw);
2575 if (level != 4 && level != 5)
2578 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
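/*
 * Example (a sketch with assumed geometries): a domain built with
 * agaw 3 (57-bit, five levels) attached to an IOMMU that only supports
 * agaw 2 (48-bit, four levels) walks down one level above, ends up
 * with level == 4, and therefore leaves PASID_FLAG_FL5LP clear when
 * programming the first-level PASID entry.
 */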
2580 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2581 domain->iommu_did[iommu->seq_id],
2585 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2588 struct dmar_domain *domain)
2590 struct dmar_domain *found = NULL;
2591 struct device_domain_info *info;
2592 unsigned long flags;
2595 info = alloc_devinfo_mem();
2600 info->devfn = devfn;
2601 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2602 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2605 info->domain = domain;
2606 info->iommu = iommu;
2607 info->pasid_table = NULL;
2608 info->auxd_enabled = 0;
2609 INIT_LIST_HEAD(&info->auxiliary_domains);
2611 if (dev && dev_is_pci(dev)) {
2612 struct pci_dev *pdev = to_pci_dev(info->dev);
2614 if (!pdev->untrusted &&
2615 !pci_ats_disabled() &&
2616 ecap_dev_iotlb_support(iommu->ecap) &&
2617 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2618 dmar_find_matched_atsr_unit(pdev))
2619 info->ats_supported = 1;
2621 if (sm_supported(iommu)) {
2622 if (pasid_supported(iommu)) {
2623 int features = pci_pasid_features(pdev);
2625 info->pasid_supported = features | 1;
2628 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2629 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2630 info->pri_supported = 1;
2634 spin_lock_irqsave(&device_domain_lock, flags);
2636 found = find_domain(dev);
2639 struct device_domain_info *info2;
2640 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2642 found = info2->domain;
2648 spin_unlock_irqrestore(&device_domain_lock, flags);
2649 free_devinfo_mem(info);
2650 /* Caller must free the original domain */
2654 spin_lock(&iommu->lock);
2655 ret = domain_attach_iommu(domain, iommu);
2656 spin_unlock(&iommu->lock);
2659 spin_unlock_irqrestore(&device_domain_lock, flags);
2660 free_devinfo_mem(info);
2664 list_add(&info->link, &domain->devices);
2665 list_add(&info->global, &device_domain_list);
2667 dev->archdata.iommu = info;
2668 spin_unlock_irqrestore(&device_domain_lock, flags);
2670 /* PASID table is mandatory for a PCI device in scalable mode. */
2671 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2672 ret = intel_pasid_alloc_table(dev);
2674 dev_err(dev, "PASID table allocation failed\n");
2675 dmar_remove_one_dev_info(dev);
2679 /* Setup the PASID entry for requests without PASID: */
2680 spin_lock(&iommu->lock);
2681 if (hw_pass_through && domain_type_is_si(domain))
2682 ret = intel_pasid_setup_pass_through(iommu, domain,
2683 dev, PASID_RID2PASID);
2684 else if (domain_use_first_level(domain))
2685 ret = domain_setup_first_level(iommu, domain, dev,
2688 ret = intel_pasid_setup_second_level(iommu, domain,
2689 dev, PASID_RID2PASID);
2690 spin_unlock(&iommu->lock);
2692 dev_err(dev, "Setup RID2PASID failed\n");
2693 dmar_remove_one_dev_info(dev);
2698 if (dev && domain_context_mapping(domain, dev)) {
2699 dev_err(dev, "Domain context map failed\n");
2700 dmar_remove_one_dev_info(dev);
2707 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2709 *(u16 *)opaque = alias;
2713 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2715 struct device_domain_info *info;
2716 struct dmar_domain *domain = NULL;
2717 struct intel_iommu *iommu;
2719 unsigned long flags;
2722 iommu = device_to_iommu(dev, &bus, &devfn);
2726 if (dev_is_pci(dev)) {
2727 struct pci_dev *pdev = to_pci_dev(dev);
2729 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2731 spin_lock_irqsave(&device_domain_lock, flags);
2732 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2733 PCI_BUS_NUM(dma_alias),
2736 iommu = info->iommu;
2737 domain = info->domain;
2739 spin_unlock_irqrestore(&device_domain_lock, flags);
2741 /* DMA alias already has a domain, use it */
2746 /* Allocate and initialize new domain for the device */
2747 domain = alloc_domain(0);
2750 if (domain_init(domain, iommu, gaw)) {
2751 domain_exit(domain);
2759 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2760 struct dmar_domain *domain)
2762 struct intel_iommu *iommu;
2763 struct dmar_domain *tmp;
2764 u16 req_id, dma_alias;
2767 iommu = device_to_iommu(dev, &bus, &devfn);
2771 req_id = ((u16)bus << 8) | devfn;
2773 if (dev_is_pci(dev)) {
2774 struct pci_dev *pdev = to_pci_dev(dev);
2776 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2778 /* register PCI DMA alias device */
2779 if (req_id != dma_alias) {
2780 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2781 dma_alias & 0xff, NULL, domain);
2783 if (!tmp || tmp != domain)
2788 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2789 if (!tmp || tmp != domain)
2795 static int iommu_domain_identity_map(struct dmar_domain *domain,
2796 unsigned long long start,
2797 unsigned long long end)
2799 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2800 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2802 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2803 dma_to_mm_pfn(last_vpfn))) {
2804 pr_err("Reserving iova failed\n");
2808 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2810 * RMRR range might overlap with the physical memory range,
2813 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2815 return __domain_mapping(domain, first_vpfn, NULL,
2816 first_vpfn, last_vpfn - first_vpfn + 1,
2817 DMA_PTE_READ|DMA_PTE_WRITE);
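/*
 * Worked example (hypothetical range): identity-mapping 0x100000-0x1fffff
 * reserves IOVA PFNs 0x100-0x1ff in the domain's iovad and then installs
 * 256 PTEs in which each IOVA PFN translates to the physical PFN of the
 * same value, which is what makes the mapping 1:1.
 */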
2820 static int domain_prepare_identity_map(struct device *dev,
2821 struct dmar_domain *domain,
2822 unsigned long long start,
2823 unsigned long long end)
2825 /* For _hardware_ passthrough, don't bother. But for software
2826 passthrough, we do it anyway -- it may indicate a memory
2827 range which is reserved in E820, and so didn't get set
2828 up to start with in the si_domain */
2829 if (domain == si_domain && hw_pass_through) {
2830 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2835 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2838 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2839 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2840 dmi_get_system_info(DMI_BIOS_VENDOR),
2841 dmi_get_system_info(DMI_BIOS_VERSION),
2842 dmi_get_system_info(DMI_PRODUCT_VERSION));
2846 if (end >> agaw_to_width(domain->agaw)) {
2847 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2848 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2849 agaw_to_width(domain->agaw),
2850 dmi_get_system_info(DMI_BIOS_VENDOR),
2851 dmi_get_system_info(DMI_BIOS_VERSION),
2852 dmi_get_system_info(DMI_PRODUCT_VERSION));
2856 return iommu_domain_identity_map(domain, start, end);
2859 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2861 static int __init si_domain_init(int hw)
2863 struct dmar_rmrr_unit *rmrr;
2867 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2871 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2872 domain_exit(si_domain);
2879 for_each_online_node(nid) {
2880 unsigned long start_pfn, end_pfn;
2883 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2884 ret = iommu_domain_identity_map(si_domain,
2885 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2892 * Identity map the RMRRs so that devices with RMRRs could also use
2895 for_each_rmrr_units(rmrr) {
2896 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2898 unsigned long long start = rmrr->base_address;
2899 unsigned long long end = rmrr->end_address;
2901 if (WARN_ON(end < start ||
2902 end >> agaw_to_width(si_domain->agaw)))
2905 ret = iommu_domain_identity_map(si_domain, start, end);
2914 static int identity_mapping(struct device *dev)
2916 struct device_domain_info *info;
2918 info = dev->archdata.iommu;
2920 return (info->domain == si_domain);
2925 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2927 struct dmar_domain *ndomain;
2928 struct intel_iommu *iommu;
2931 iommu = device_to_iommu(dev, &bus, &devfn);
2935 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2936 if (ndomain != domain)
2942 static bool device_has_rmrr(struct device *dev)
2944 struct dmar_rmrr_unit *rmrr;
2949 for_each_rmrr_units(rmrr) {
2951 * Return TRUE if this RMRR contains the device that
2954 for_each_active_dev_scope(rmrr->devices,
2955 rmrr->devices_cnt, i, tmp)
2957 is_downstream_to_pci_bridge(dev, tmp)) {
2967 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2968 * is relaxable (ie. is allowed to be not enforced under some conditions)
2969 * @dev: device handle
2971 * We assume that PCI USB devices with RMRRs have them largely
2972 * for historical reasons and that the RMRR space is not actively used post
2973 * boot. This exclusion may change if vendors begin to abuse it.
2975 * The same exception is made for graphics devices, with the requirement that
2976 * any use of the RMRR regions will be torn down before assigning the device
2979 * Return: true if the RMRR is relaxable, false otherwise
2981 static bool device_rmrr_is_relaxable(struct device *dev)
2983 struct pci_dev *pdev;
2985 if (!dev_is_pci(dev))
2988 pdev = to_pci_dev(dev);
2989 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2996 * There are a couple cases where we need to restrict the functionality of
2997 * devices associated with RMRRs. The first is when evaluating a device for
2998 * identity mapping because problems exist when devices are moved in and out
2999 * of domains and their respective RMRR information is lost. This means that
3000 * a device with associated RMRRs will never be in a "passthrough" domain.
3001 * The second is use of the device through the IOMMU API. This interface
3002 * expects to have full control of the IOVA space for the device. We cannot
3003 * satisfy both the requirement that RMRR access is maintained and have an
3004 * unencumbered IOVA space. We also have no ability to quiesce the device's
3005 * use of the RMRR space or even inform the IOMMU API user of the restriction.
3006 * We therefore prevent devices associated with an RMRR from participating in
3007 * the IOMMU API, which eliminates them from device assignment.
3009 * In both cases, devices which have relaxable RMRRs are not concerned by this
3010 * restriction. See device_rmrr_is_relaxable comment.
3012 static bool device_is_rmrr_locked(struct device *dev)
3014 if (!device_has_rmrr(dev))
3017 if (device_rmrr_is_relaxable(dev))
3024 * Return the required default domain type for a specific device.
3026 * @dev: the device in question
3027 * @startup: true if this is during early boot
3030 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
3031 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
3032 * - 0: both identity and dynamic domains work for this device
3034 static int device_def_domain_type(struct device *dev)
3036 if (dev_is_pci(dev)) {
3037 struct pci_dev *pdev = to_pci_dev(dev);
3040 * Prevent any device marked as untrusted from getting
3041 * placed into the statically identity mapping domain.
3043 if (pdev->untrusted)
3044 return IOMMU_DOMAIN_DMA;
3046 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
3047 return IOMMU_DOMAIN_IDENTITY;
3049 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
3050 return IOMMU_DOMAIN_IDENTITY;
3053 * We want to start off with all devices in the 1:1 domain, and
3054 * take them out later if we find they can't access all of memory.
3056 * However, we can't do this for PCI devices behind bridges,
3057 * because all PCI devices behind the same bridge will end up
3058 * with the same source-id on their transactions.
3060 * Practically speaking, we can't change things around for these
3061 * devices at run-time, because we can't be sure there'll be no
3062 * DMA transactions in flight for any of their siblings.
3064 * So PCI devices (unless they're on the root bus) as well as
3065 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
3066 * the 1:1 domain, just in _case_ one of their siblings turns out
3067 * not to be able to map all of memory.
3069 if (!pci_is_pcie(pdev)) {
3070 if (!pci_is_root_bus(pdev->bus))
3071 return IOMMU_DOMAIN_DMA;
3072 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
3073 return IOMMU_DOMAIN_DMA;
3074 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
3075 return IOMMU_DOMAIN_DMA;
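/*
 * Example of the reasoning above (a sketch): a conventional-PCI device
 * sitting behind a PCIe-to-PCI bridge shares one requester ID with all
 * of its siblings, so both the device (it is not on the root bus) and
 * the bridge itself are kept in a DMA domain rather than being allowed
 * into the static identity mapping.
 */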
3081 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3084 * Start from the sane iommu hardware state.
3085 * If the queued invalidation is already initialized by us
3086 * (for example, while enabling interrupt-remapping) then
3087 * things are already rolling from a sane state.
3091 * Clear any previous faults.
3093 dmar_fault(-1, iommu);
3095 * Disable queued invalidation if supported and already enabled
3096 * before OS handover.
3098 dmar_disable_qi(iommu);
3101 if (dmar_enable_qi(iommu)) {
3103 * Queued Invalidation is not enabled, so use Register Based Invalidation
3105 iommu->flush.flush_context = __iommu_flush_context;
3106 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3107 pr_info("%s: Using Register based invalidation\n",
3110 iommu->flush.flush_context = qi_flush_context;
3111 iommu->flush.flush_iotlb = qi_flush_iotlb;
3112 pr_info("%s: Using Queued invalidation\n", iommu->name);
3116 static int copy_context_table(struct intel_iommu *iommu,
3117 struct root_entry *old_re,
3118 struct context_entry **tbl,
3121 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3122 struct context_entry *new_ce = NULL, ce;
3123 struct context_entry *old_ce = NULL;
3124 struct root_entry re;
3125 phys_addr_t old_ce_phys;
3127 tbl_idx = ext ? bus * 2 : bus;
3128 memcpy(&re, old_re, sizeof(re));
3130 for (devfn = 0; devfn < 256; devfn++) {
3131 /* First calculate the correct index */
3132 idx = (ext ? devfn * 2 : devfn) % 256;
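/*
 * Index sketch (assuming 16-byte legacy versus 32-byte extended context
 * entries): in extended mode each bus needs two 4KiB context pages, so
 * devfn 0x7f lands at idx 0xfe of the first page while devfn 0x80 wraps
 * to idx 0 and continues in the second page for that bus.
 */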
3135 /* First save what we may have and clean up */
3137 tbl[tbl_idx] = new_ce;
3138 __iommu_flush_cache(iommu, new_ce,
3148 old_ce_phys = root_entry_lctp(&re);
3150 old_ce_phys = root_entry_uctp(&re);
3153 if (ext && devfn == 0) {
3154 /* No LCTP, try UCTP */
3163 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3168 new_ce = alloc_pgtable_page(iommu->node);
3175 /* Now copy the context entry */
3176 memcpy(&ce, old_ce + idx, sizeof(ce));
3178 if (!__context_present(&ce))
3181 did = context_domain_id(&ce);
3182 if (did >= 0 && did < cap_ndoms(iommu->cap))
3183 set_bit(did, iommu->domain_ids);
3186 * We need a marker for copied context entries. This
3187 * marker needs to work for the old format as well as
3188 * for extended context entries.
3190 * Bit 67 of the context entry is used. In the old
3191 * format this bit is available to software, in the
3192 * extended format it is the PGE bit, but PGE is ignored
3193 * by HW if PASIDs are disabled (and thus still
3196 * So disable PASIDs first and then mark the entry
3197 * copied. This means that we don't copy PASID
3198 * translations from the old kernel, but this is fine as
3199 * faults there are not fatal.
3201 context_clear_pasid_enable(&ce);
3202 context_set_copied(&ce);
3207 tbl[tbl_idx + pos] = new_ce;
3209 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3218 static int copy_translation_tables(struct intel_iommu *iommu)
3220 struct context_entry **ctxt_tbls;
3221 struct root_entry *old_rt;
3222 phys_addr_t old_rt_phys;
3223 int ctxt_table_entries;
3224 unsigned long flags;
3229 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3230 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3231 new_ext = !!ecap_ecs(iommu->ecap);
3234 * The RTT bit can only be changed when translation is disabled,
3235 * but disabling translation would open a window for data
3236 * corruption. So bail out and don't copy anything if we would
3237 * have to change the bit.
3242 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3246 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3250 /* This is too big for the stack - allocate it from slab */
3251 ctxt_table_entries = ext ? 512 : 256;
3253 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3257 for (bus = 0; bus < 256; bus++) {
3258 ret = copy_context_table(iommu, &old_rt[bus],
3259 ctxt_tbls, bus, ext);
3261 pr_err("%s: Failed to copy context table for bus %d\n",
3267 spin_lock_irqsave(&iommu->lock, flags);
3269 /* Context tables are copied, now write them to the root_entry table */
3270 for (bus = 0; bus < 256; bus++) {
3271 int idx = ext ? bus * 2 : bus;
3274 if (ctxt_tbls[idx]) {
3275 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3276 iommu->root_entry[bus].lo = val;
3279 if (!ext || !ctxt_tbls[idx + 1])
3282 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3283 iommu->root_entry[bus].hi = val;
3286 spin_unlock_irqrestore(&iommu->lock, flags);
3290 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3300 static int __init init_dmars(void)
3302 struct dmar_drhd_unit *drhd;
3303 struct intel_iommu *iommu;
3309 * initialize and program root entry to not present
3312 for_each_drhd_unit(drhd) {
3314 * lock not needed as this is only incremented in the single-
3315 * threaded kernel __init code path; all other accesses are read-only.
3318 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3322 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3325 /* Preallocate enough resources for IOMMU hot-addition */
3326 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3327 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3329 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3332 pr_err("Allocating global iommu array failed\n");
3337 for_each_iommu(iommu, drhd) {
3338 if (drhd->ignored) {
3339 iommu_disable_translation(iommu);
3344 * Find the max PASID size of all IOMMUs in the system.
3345 * We need to ensure the system PASID table is no bigger
3346 * than the smallest size any IOMMU supports.
3348 if (pasid_supported(iommu)) {
3349 u32 temp = 2 << ecap_pss(iommu->ecap);
3351 intel_pasid_max_id = min_t(u32, temp,
3352 intel_pasid_max_id);
3355 g_iommus[iommu->seq_id] = iommu;
3357 intel_iommu_init_qi(iommu);
3359 ret = iommu_init_domains(iommu);
3363 init_translation_status(iommu);
3365 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3366 iommu_disable_translation(iommu);
3367 clear_translation_pre_enabled(iommu);
3368 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3374 * we could share the same root & context tables
3375 * among all IOMMUs. Need to split them later.
3377 ret = iommu_alloc_root_entry(iommu);
3381 if (translation_pre_enabled(iommu)) {
3382 pr_info("Translation already enabled - trying to copy translation structures\n");
3384 ret = copy_translation_tables(iommu);
3387 * We found the IOMMU with translation
3388 * enabled - but failed to copy over the
3389 * old root-entry table. Try to proceed
3390 * by disabling translation now and
3391 * allocating a clean root-entry table.
3392 * This might cause DMAR faults, but
3393 * probably the dump will still succeed.
3395 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3397 iommu_disable_translation(iommu);
3398 clear_translation_pre_enabled(iommu);
3400 pr_info("Copied translation tables from previous kernel for %s\n",
3405 if (!ecap_pass_through(iommu->ecap))
3406 hw_pass_through = 0;
3407 intel_svm_check(iommu);
3411 * Now that qi is enabled on all iommus, set the root entry and flush
3412 * caches. This is required on some Intel X58 chipsets, otherwise the
3413 * flush_context function will loop forever and the boot hangs.
3415 for_each_active_iommu(iommu, drhd) {
3416 iommu_flush_write_buffer(iommu);
3417 iommu_set_root_entry(iommu);
3418 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3419 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3422 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3427 iommu_identity_mapping |= IDENTMAP_GFX;
3429 check_tylersburg_isoch();
3431 ret = si_domain_init(hw_pass_through);
3438 * global invalidate context cache
3439 * global invalidate iotlb
3440 * enable translation
3442 for_each_iommu(iommu, drhd) {
3443 if (drhd->ignored) {
3445 * we always have to disable PMRs or DMA may fail on
3449 iommu_disable_protect_mem_regions(iommu);
3453 iommu_flush_write_buffer(iommu);
3455 #ifdef CONFIG_INTEL_IOMMU_SVM
3456 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3458 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3459 * could cause a lock race, so drop the lock around the call.
3461 up_write(&dmar_global_lock);
3462 ret = intel_svm_enable_prq(iommu);
3463 down_write(&dmar_global_lock);
3468 ret = dmar_set_interrupt(iommu);
3476 for_each_active_iommu(iommu, drhd) {
3477 disable_dmar_iommu(iommu);
3478 free_dmar_iommu(iommu);
3487 /* This takes a number of _MM_ pages, not VTD pages */
3488 static unsigned long intel_alloc_iova(struct device *dev,
3489 struct dmar_domain *domain,
3490 unsigned long nrpages, uint64_t dma_mask)
3492 unsigned long iova_pfn;
3495 * Restrict dma_mask to the width that the iommu can handle.
3496 * First-level translation restricts the input-address to a
3497 * canonical address (i.e., address bits 63:N have the same
3498 * value as address bit [N-1], where N is 48-bits with 4-level
3499 * paging and 57-bits with 5-level paging). Hence, skip bit
3502 if (domain_use_first_level(domain))
3503 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3506 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3509 /* Ensure we reserve the whole size-aligned region */
3510 nrpages = __roundup_pow_of_two(nrpages);
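/*
 * For instance (a sketch): a 5-page request is rounded up to 8 pages so
 * that the whole size-aligned region the IOVA allocator hands back is
 * reserved, keeping later frees consistent with the allocation.
 */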
3512 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3514 * First try to allocate an io virtual address in
3515 * DMA_BIT_MASK(32) and if that fails then try allocating
3518 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3519 IOVA_PFN(DMA_BIT_MASK(32)), false);
3523 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3524 IOVA_PFN(dma_mask), true);
3525 if (unlikely(!iova_pfn)) {
3526 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3534 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3536 struct dmar_domain *domain, *tmp;
3537 struct dmar_rmrr_unit *rmrr;
3538 struct device *i_dev;
3541 /* The device shouldn't be attached to any domain yet. */
3542 domain = find_domain(dev);
3546 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3550 /* We have a new domain - setup possible RMRRs for the device */
3552 for_each_rmrr_units(rmrr) {
3553 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3558 ret = domain_prepare_identity_map(dev, domain,
3562 dev_err(dev, "Mapping reserved region failed\n");
3567 tmp = set_domain_for_dev(dev, domain);
3568 if (!tmp || domain != tmp) {
3569 domain_exit(domain);
3575 dev_err(dev, "Allocating domain failed\n");
3577 domain->domain.type = IOMMU_DOMAIN_DMA;
3582 /* Check if the dev needs to go through the non-identity map and unmap process. */
3583 static bool iommu_need_mapping(struct device *dev)
3587 if (iommu_dummy(dev))
3590 if (unlikely(attach_deferred(dev)))
3591 do_deferred_attach(dev);
3593 ret = identity_mapping(dev);
3595 u64 dma_mask = *dev->dma_mask;
3597 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3598 dma_mask = dev->coherent_dma_mask;
3600 if (dma_mask >= dma_direct_get_required_mask(dev))
3604 * 32-bit DMA is removed from the si_domain and we fall back
3605 * to a non-identity mapping.
3607 dmar_remove_one_dev_info(dev);
3608 ret = iommu_request_dma_domain_for_dev(dev);
3610 struct iommu_domain *domain;
3611 struct dmar_domain *dmar_domain;
3613 domain = iommu_get_domain_for_dev(dev);
3615 dmar_domain = to_dmar_domain(domain);
3616 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3618 dmar_remove_one_dev_info(dev);
3619 get_private_domain_for_dev(dev);
3622 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3628 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3629 size_t size, int dir, u64 dma_mask)
3631 struct dmar_domain *domain;
3632 phys_addr_t start_paddr;
3633 unsigned long iova_pfn;
3636 struct intel_iommu *iommu;
3637 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3639 BUG_ON(dir == DMA_NONE);
3641 domain = find_domain(dev);
3643 return DMA_MAPPING_ERROR;
3645 iommu = domain_get_iommu(domain);
3646 size = aligned_nrpages(paddr, size);
3648 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3653 * Check if DMAR supports zero-length reads on write only
3656 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3657 !cap_zlr(iommu->cap))
3658 prot |= DMA_PTE_READ;
3659 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3660 prot |= DMA_PTE_WRITE;
3662 * paddr - (paddr + size) might be a partial page; we should map the whole
3663 * page. Note: if two parts of one page are separately mapped, we
3664 * might have two guest addresses mapping to the same host paddr, but this
3665 * is not a big problem
3667 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3668 mm_to_dma_pfn(paddr_pfn), size, prot);
3672 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3673 start_paddr += paddr & ~PAGE_MASK;
3675 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3681 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3682 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3683 size, (unsigned long long)paddr, dir);
3684 return DMA_MAPPING_ERROR;
3687 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3688 unsigned long offset, size_t size,
3689 enum dma_data_direction dir,
3690 unsigned long attrs)
3692 if (iommu_need_mapping(dev))
3693 return __intel_map_single(dev, page_to_phys(page) + offset,
3694 size, dir, *dev->dma_mask);
3695 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3698 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3699 size_t size, enum dma_data_direction dir,
3700 unsigned long attrs)
3702 if (iommu_need_mapping(dev))
3703 return __intel_map_single(dev, phys_addr, size, dir,
3705 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3708 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3710 struct dmar_domain *domain;
3711 unsigned long start_pfn, last_pfn;
3712 unsigned long nrpages;
3713 unsigned long iova_pfn;
3714 struct intel_iommu *iommu;
3715 struct page *freelist;
3716 struct pci_dev *pdev = NULL;
3718 domain = find_domain(dev);
3721 iommu = domain_get_iommu(domain);
3723 iova_pfn = IOVA_PFN(dev_addr);
3725 nrpages = aligned_nrpages(dev_addr, size);
3726 start_pfn = mm_to_dma_pfn(iova_pfn);
3727 last_pfn = start_pfn + nrpages - 1;
3729 if (dev_is_pci(dev))
3730 pdev = to_pci_dev(dev);
3732 freelist = domain_unmap(domain, start_pfn, last_pfn);
3733 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3734 !has_iova_flush_queue(&domain->iovad)) {
3735 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3736 nrpages, !freelist, 0);
3738 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3739 dma_free_pagelist(freelist);
3741 queue_iova(&domain->iovad, iova_pfn, nrpages,
3742 (unsigned long)freelist);
3744 * queue up the release of the unmap to save roughly 1/6th of the
3745 * CPU time used up by the iotlb flush operation...
3749 trace_unmap_single(dev, dev_addr, size);
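/*
 * Illustration of the two paths above (a sketch): with intel_iommu_strict,
 * an untrusted device, or no flush queue, every unmap triggers an immediate
 * IOTLB PSI flush; otherwise a burst of unmaps is merely queued and a single
 * deferred invalidation covers the whole batch when the queue drains.
 */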
3752 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3753 size_t size, enum dma_data_direction dir,
3754 unsigned long attrs)
3756 if (iommu_need_mapping(dev))
3757 intel_unmap(dev, dev_addr, size);
3759 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3762 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3763 size_t size, enum dma_data_direction dir, unsigned long attrs)
3765 if (iommu_need_mapping(dev))
3766 intel_unmap(dev, dev_addr, size);
3769 static void *intel_alloc_coherent(struct device *dev, size_t size,
3770 dma_addr_t *dma_handle, gfp_t flags,
3771 unsigned long attrs)
3773 struct page *page = NULL;
3776 if (!iommu_need_mapping(dev))
3777 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3779 size = PAGE_ALIGN(size);
3780 order = get_order(size);
3782 if (gfpflags_allow_blocking(flags)) {
3783 unsigned int count = size >> PAGE_SHIFT;
3785 page = dma_alloc_from_contiguous(dev, count, order,
3786 flags & __GFP_NOWARN);
3790 page = alloc_pages(flags, order);
3793 memset(page_address(page), 0, size);
3795 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3797 dev->coherent_dma_mask);
3798 if (*dma_handle != DMA_MAPPING_ERROR)
3799 return page_address(page);
3800 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3801 __free_pages(page, order);
3806 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3807 dma_addr_t dma_handle, unsigned long attrs)
3810 struct page *page = virt_to_page(vaddr);
3812 if (!iommu_need_mapping(dev))
3813 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3815 size = PAGE_ALIGN(size);
3816 order = get_order(size);
3818 intel_unmap(dev, dma_handle, size);
3819 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3820 __free_pages(page, order);
3823 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3824 int nelems, enum dma_data_direction dir,
3825 unsigned long attrs)
3827 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3828 unsigned long nrpages = 0;
3829 struct scatterlist *sg;
3832 if (!iommu_need_mapping(dev))
3833 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3835 for_each_sg(sglist, sg, nelems, i) {
3836 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3839 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3841 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3844 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3845 enum dma_data_direction dir, unsigned long attrs)
3848 struct dmar_domain *domain;
3851 unsigned long iova_pfn;
3853 struct scatterlist *sg;
3854 unsigned long start_vpfn;
3855 struct intel_iommu *iommu;
3857 BUG_ON(dir == DMA_NONE);
3858 if (!iommu_need_mapping(dev))
3859 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3861 domain = find_domain(dev);
3865 iommu = domain_get_iommu(domain);
3867 for_each_sg(sglist, sg, nelems, i)
3868 size += aligned_nrpages(sg->offset, sg->length);
3870 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3873 sglist->dma_length = 0;
3878 * Check if DMAR supports zero-length reads on write only
3881 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3882 !cap_zlr(iommu->cap))
3883 prot |= DMA_PTE_READ;
3884 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3885 prot |= DMA_PTE_WRITE;
3887 start_vpfn = mm_to_dma_pfn(iova_pfn);
3889 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3890 if (unlikely(ret)) {
3891 dma_pte_free_pagetable(domain, start_vpfn,
3892 start_vpfn + size - 1,
3893 agaw_to_level(domain->agaw) + 1);
3894 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3898 for_each_sg(sglist, sg, nelems, i)
3899 trace_map_sg(dev, i + 1, nelems, sg);
3904 static u64 intel_get_required_mask(struct device *dev)
3906 if (!iommu_need_mapping(dev))
3907 return dma_direct_get_required_mask(dev);
3908 return DMA_BIT_MASK(32);
3911 static const struct dma_map_ops intel_dma_ops = {
3912 .alloc = intel_alloc_coherent,
3913 .free = intel_free_coherent,
3914 .map_sg = intel_map_sg,
3915 .unmap_sg = intel_unmap_sg,
3916 .map_page = intel_map_page,
3917 .unmap_page = intel_unmap_page,
3918 .map_resource = intel_map_resource,
3919 .unmap_resource = intel_unmap_resource,
3920 .dma_supported = dma_direct_supported,
3921 .mmap = dma_common_mmap,
3922 .get_sgtable = dma_common_get_sgtable,
3923 .get_required_mask = intel_get_required_mask,
3927 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3928 enum dma_data_direction dir, enum dma_sync_target target)
3930 struct dmar_domain *domain;
3931 phys_addr_t tlb_addr;
3933 domain = find_domain(dev);
3934 if (WARN_ON(!domain))
3937 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3938 if (is_swiotlb_buffer(tlb_addr))
3939 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3943 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3944 enum dma_data_direction dir, unsigned long attrs,
3947 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3948 struct dmar_domain *domain;
3949 struct intel_iommu *iommu;
3950 unsigned long iova_pfn;
3951 unsigned long nrpages;
3952 phys_addr_t tlb_addr;
3956 if (unlikely(attach_deferred(dev)))
3957 do_deferred_attach(dev);
3959 domain = find_domain(dev);
3961 if (WARN_ON(dir == DMA_NONE || !domain))
3962 return DMA_MAPPING_ERROR;
3964 iommu = domain_get_iommu(domain);
3965 if (WARN_ON(!iommu))
3966 return DMA_MAPPING_ERROR;
3968 nrpages = aligned_nrpages(0, size);
3969 iova_pfn = intel_alloc_iova(dev, domain,
3970 dma_to_mm_pfn(nrpages), dma_mask);
3972 return DMA_MAPPING_ERROR;
3975 * Check if DMAR supports zero-length reads on write only
3978 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3979 !cap_zlr(iommu->cap))
3980 prot |= DMA_PTE_READ;
3981 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3982 prot |= DMA_PTE_WRITE;
3985 * If both the physical buffer start address and size are
3986 * page aligned, we don't need to use a bounce page.
3988 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3989 tlb_addr = swiotlb_tbl_map_single(dev,
3990 __phys_to_dma(dev, io_tlb_start),
3991 paddr, size, aligned_size, dir, attrs);
3992 if (tlb_addr == DMA_MAPPING_ERROR) {
3995 /* Cleanup the padding area. */
3996 void *padding_start = phys_to_virt(tlb_addr);
3997 size_t padding_size = aligned_size;
3999 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
4000 (dir == DMA_TO_DEVICE ||
4001 dir == DMA_BIDIRECTIONAL)) {
4002 padding_start += size;
4003 padding_size -= size;
4006 memset(padding_start, 0, padding_size);
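/*
 * Worked example (hypothetical sizes): bouncing a 0x1800-byte
 * DMA_TO_DEVICE buffer uses a 0x2000-byte aligned swiotlb slot; the
 * first 0x1800 bytes are filled from the caller's data, so only the
 * trailing 0x800 bytes are zeroed here to keep stale swiotlb contents
 * from being exposed through the page-granular IOMMU mapping.
 */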
4012 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
4013 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
4017 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
4019 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
4022 if (is_swiotlb_buffer(tlb_addr))
4023 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4024 aligned_size, dir, attrs);
4026 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
4027 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
4028 size, (unsigned long long)paddr, dir);
4030 return DMA_MAPPING_ERROR;
4034 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
4035 enum dma_data_direction dir, unsigned long attrs)
4037 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
4038 struct dmar_domain *domain;
4039 phys_addr_t tlb_addr;
4041 domain = find_domain(dev);
4042 if (WARN_ON(!domain))
4045 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
4046 if (WARN_ON(!tlb_addr))
4049 intel_unmap(dev, dev_addr, size);
4050 if (is_swiotlb_buffer(tlb_addr))
4051 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4052 aligned_size, dir, attrs);
4054 trace_bounce_unmap_single(dev, dev_addr, size);
4058 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
4059 size_t size, enum dma_data_direction dir, unsigned long attrs)
4061 return bounce_map_single(dev, page_to_phys(page) + offset,
4062 size, dir, attrs, *dev->dma_mask);
4066 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
4067 enum dma_data_direction dir, unsigned long attrs)
4069 return bounce_map_single(dev, phys_addr, size,
4070 dir, attrs, *dev->dma_mask);
4074 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
4075 enum dma_data_direction dir, unsigned long attrs)
4077 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4081 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
4082 enum dma_data_direction dir, unsigned long attrs)
4084 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4088 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4089 enum dma_data_direction dir, unsigned long attrs)
4091 struct scatterlist *sg;
4094 for_each_sg(sglist, sg, nelems, i)
4095 bounce_unmap_page(dev, sg->dma_address,
4096 sg_dma_len(sg), dir, attrs);
4100 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4101 enum dma_data_direction dir, unsigned long attrs)
4104 struct scatterlist *sg;
4106 for_each_sg(sglist, sg, nelems, i) {
4107 sg->dma_address = bounce_map_page(dev, sg_page(sg),
4108 sg->offset, sg->length,
4110 if (sg->dma_address == DMA_MAPPING_ERROR)
4112 sg_dma_len(sg) = sg->length;
4115 for_each_sg(sglist, sg, nelems, i)
4116 trace_bounce_map_sg(dev, i + 1, nelems, sg);
4121 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
4126 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4127 size_t size, enum dma_data_direction dir)
4129 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4133 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4134 size_t size, enum dma_data_direction dir)
4136 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4140 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4141 int nelems, enum dma_data_direction dir)
4143 struct scatterlist *sg;
4146 for_each_sg(sglist, sg, nelems, i)
4147 bounce_sync_single(dev, sg_dma_address(sg),
4148 sg_dma_len(sg), dir, SYNC_FOR_CPU);
4152 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4153 int nelems, enum dma_data_direction dir)
4155 struct scatterlist *sg;
4158 for_each_sg(sglist, sg, nelems, i)
4159 bounce_sync_single(dev, sg_dma_address(sg),
4160 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4163 static const struct dma_map_ops bounce_dma_ops = {
4164 .alloc = intel_alloc_coherent,
4165 .free = intel_free_coherent,
4166 .map_sg = bounce_map_sg,
4167 .unmap_sg = bounce_unmap_sg,
4168 .map_page = bounce_map_page,
4169 .unmap_page = bounce_unmap_page,
4170 .sync_single_for_cpu = bounce_sync_single_for_cpu,
4171 .sync_single_for_device = bounce_sync_single_for_device,
4172 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
4173 .sync_sg_for_device = bounce_sync_sg_for_device,
4174 .map_resource = bounce_map_resource,
4175 .unmap_resource = bounce_unmap_resource,
4176 .dma_supported = dma_direct_supported,
4179 static inline int iommu_domain_cache_init(void)
4183 iommu_domain_cache = kmem_cache_create("iommu_domain",
4184 sizeof(struct dmar_domain),
4189 if (!iommu_domain_cache) {
4190 pr_err("Couldn't create iommu_domain cache\n");
4197 static inline int iommu_devinfo_cache_init(void)
4201 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4202 sizeof(struct device_domain_info),
4206 if (!iommu_devinfo_cache) {
4207 pr_err("Couldn't create devinfo cache\n");
4214 static int __init iommu_init_mempool(void)
4217 ret = iova_cache_get();
4221 ret = iommu_domain_cache_init();
4225 ret = iommu_devinfo_cache_init();
4229 kmem_cache_destroy(iommu_domain_cache);
4236 static void __init iommu_exit_mempool(void)
4238 kmem_cache_destroy(iommu_devinfo_cache);
4239 kmem_cache_destroy(iommu_domain_cache);
4243 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4245 struct dmar_drhd_unit *drhd;
4249 /* We know that this device on this chipset has its own IOMMU.
4250 * If we find it under a different IOMMU, then the BIOS is lying
4251 * to us. Hope that the IOMMU for this device is actually
4252 * disabled, and it needs no translation...
4254 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4256 /* "can't" happen */
4257 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4260 vtbar &= 0xffff0000;
4262 /* we know that this iommu should be at offset 0xa000 from vtbar */
4263 drhd = dmar_find_matched_drhd_unit(pdev);
4264 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4265 TAINT_FIRMWARE_WORKAROUND,
4266 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4267 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4269 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4271 static void __init init_no_remapping_devices(void)
4273 struct dmar_drhd_unit *drhd;
4277 for_each_drhd_unit(drhd) {
4278 if (!drhd->include_all) {
4279 for_each_active_dev_scope(drhd->devices,
4280 drhd->devices_cnt, i, dev)
4282 /* ignore DMAR unit if no devices exist */
4283 if (i == drhd->devices_cnt)
4288 for_each_active_drhd_unit(drhd) {
4289 if (drhd->include_all)
4292 for_each_active_dev_scope(drhd->devices,
4293 drhd->devices_cnt, i, dev)
4294 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4296 if (i < drhd->devices_cnt)
4299 /* This IOMMU has *only* gfx devices. Either bypass it or
4300 set the gfx_mapped flag, as appropriate */
4301 if (!dmar_map_gfx) {
4303 for_each_active_dev_scope(drhd->devices,
4304 drhd->devices_cnt, i, dev)
4305 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4310 #ifdef CONFIG_SUSPEND
4311 static int init_iommu_hw(void)
4313 struct dmar_drhd_unit *drhd;
4314 struct intel_iommu *iommu = NULL;
4316 for_each_active_iommu(iommu, drhd)
4318 dmar_reenable_qi(iommu);
4320 for_each_iommu(iommu, drhd) {
4321 if (drhd->ignored) {
4323 * we always have to disable PMRs or DMA may fail on
4327 iommu_disable_protect_mem_regions(iommu);
4331 iommu_flush_write_buffer(iommu);
4333 iommu_set_root_entry(iommu);
4335 iommu->flush.flush_context(iommu, 0, 0, 0,
4336 DMA_CCMD_GLOBAL_INVL);
4337 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4338 iommu_enable_translation(iommu);
4339 iommu_disable_protect_mem_regions(iommu);
4345 static void iommu_flush_all(void)
4347 struct dmar_drhd_unit *drhd;
4348 struct intel_iommu *iommu;
4350 for_each_active_iommu(iommu, drhd) {
4351 iommu->flush.flush_context(iommu, 0, 0, 0,
4352 DMA_CCMD_GLOBAL_INVL);
4353 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4354 DMA_TLB_GLOBAL_FLUSH);
4358 static int iommu_suspend(void)
4360 struct dmar_drhd_unit *drhd;
4361 struct intel_iommu *iommu = NULL;
4364 for_each_active_iommu(iommu, drhd) {
4365 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4367 if (!iommu->iommu_state)
4373 for_each_active_iommu(iommu, drhd) {
4374 iommu_disable_translation(iommu);
4376 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4378 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4379 readl(iommu->reg + DMAR_FECTL_REG);
4380 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4381 readl(iommu->reg + DMAR_FEDATA_REG);
4382 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4383 readl(iommu->reg + DMAR_FEADDR_REG);
4384 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4385 readl(iommu->reg + DMAR_FEUADDR_REG);
4387 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4392 for_each_active_iommu(iommu, drhd)
4393 kfree(iommu->iommu_state);
4398 static void iommu_resume(void)
4400 struct dmar_drhd_unit *drhd;
4401 struct intel_iommu *iommu = NULL;
4404 if (init_iommu_hw()) {
4406 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4408 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4412 for_each_active_iommu(iommu, drhd) {
4414 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4416 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4417 iommu->reg + DMAR_FECTL_REG);
4418 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4419 iommu->reg + DMAR_FEDATA_REG);
4420 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4421 iommu->reg + DMAR_FEADDR_REG);
4422 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4423 iommu->reg + DMAR_FEUADDR_REG);
4425 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4428 for_each_active_iommu(iommu, drhd)
4429 kfree(iommu->iommu_state);
4432 static struct syscore_ops iommu_syscore_ops = {
4433 .resume = iommu_resume,
4434 .suspend = iommu_suspend,
4437 static void __init init_iommu_pm_ops(void)
4439 register_syscore_ops(&iommu_syscore_ops);
4443 static inline void init_iommu_pm_ops(void) {}
4444 #endif /* CONFIG_PM */
4446 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4448 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4449 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4450 rmrr->end_address <= rmrr->base_address ||
4451 arch_rmrr_sanity_check(rmrr))
4457 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4459 struct acpi_dmar_reserved_memory *rmrr;
4460 struct dmar_rmrr_unit *rmrru;
4462 rmrr = (struct acpi_dmar_reserved_memory *)header;
4463 if (rmrr_sanity_check(rmrr))
4464 WARN_TAINT(1, TAINT_FIRMWARE_WORKAROUND,
4465 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4466 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4467 rmrr->base_address, rmrr->end_address,
4468 dmi_get_system_info(DMI_BIOS_VENDOR),
4469 dmi_get_system_info(DMI_BIOS_VERSION),
4470 dmi_get_system_info(DMI_PRODUCT_VERSION));
4472 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4476 rmrru->hdr = header;
4478 rmrru->base_address = rmrr->base_address;
4479 rmrru->end_address = rmrr->end_address;
4481 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4482 ((void *)rmrr) + rmrr->header.length,
4483 &rmrru->devices_cnt);
4484 if (rmrru->devices_cnt && rmrru->devices == NULL)
4487 list_add(&rmrru->list, &dmar_rmrr_units);
4496 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4498 struct dmar_atsr_unit *atsru;
4499 struct acpi_dmar_atsr *tmp;
4501 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4502 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4503 if (atsr->segment != tmp->segment)
4505 if (atsr->header.length != tmp->header.length)
4507 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4514 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4516 struct acpi_dmar_atsr *atsr;
4517 struct dmar_atsr_unit *atsru;
4519 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4522 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4523 atsru = dmar_find_atsr(atsr);
4527 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4532 * If memory is allocated from slab by ACPI _DSM method, we need to
4533 * copy the memory content because the memory buffer will be freed
4536 atsru->hdr = (void *)(atsru + 1);
4537 memcpy(atsru->hdr, hdr, hdr->length);
4538 atsru->include_all = atsr->flags & 0x1;
4539 if (!atsru->include_all) {
4540 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4541 (void *)atsr + atsr->header.length,
4542 &atsru->devices_cnt);
4543 if (atsru->devices_cnt && atsru->devices == NULL) {
4549 list_add_rcu(&atsru->list, &dmar_atsr_units);
4554 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4556 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4560 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4562 struct acpi_dmar_atsr *atsr;
4563 struct dmar_atsr_unit *atsru;
4565 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4566 atsru = dmar_find_atsr(atsr);
4568 list_del_rcu(&atsru->list);
4570 intel_iommu_free_atsr(atsru);
4576 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4580 struct acpi_dmar_atsr *atsr;
4581 struct dmar_atsr_unit *atsru;
4583 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4584 atsru = dmar_find_atsr(atsr);
4588 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4589 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4597 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4600 struct intel_iommu *iommu = dmaru->iommu;
4602 if (g_iommus[iommu->seq_id])
4605 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4606 pr_warn("%s: Doesn't support hardware pass through.\n",
4610 if (!ecap_sc_support(iommu->ecap) &&
4611 domain_update_iommu_snooping(iommu)) {
4612 pr_warn("%s: Doesn't support snooping.\n",
4616 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4617 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4618 pr_warn("%s: Doesn't support large page.\n",
4624 * Disable translation if already enabled prior to OS handover.
4626 if (iommu->gcmd & DMA_GCMD_TE)
4627 iommu_disable_translation(iommu);
4629 g_iommus[iommu->seq_id] = iommu;
4630 ret = iommu_init_domains(iommu);
4632 ret = iommu_alloc_root_entry(iommu);
4636 intel_svm_check(iommu);
4638 if (dmaru->ignored) {
4640 * we always have to disable PMRs or DMA may fail on this device
4643 iommu_disable_protect_mem_regions(iommu);
4647 intel_iommu_init_qi(iommu);
4648 iommu_flush_write_buffer(iommu);
4650 #ifdef CONFIG_INTEL_IOMMU_SVM
4651 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4652 ret = intel_svm_enable_prq(iommu);
4657 ret = dmar_set_interrupt(iommu);
4661 iommu_set_root_entry(iommu);
4662 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4663 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4664 iommu_enable_translation(iommu);
4666 iommu_disable_protect_mem_regions(iommu);
4670 disable_dmar_iommu(iommu);
4672 free_dmar_iommu(iommu);
4676 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4679 struct intel_iommu *iommu = dmaru->iommu;
4681 if (!intel_iommu_enabled)
4687 ret = intel_iommu_add(dmaru);
4689 disable_dmar_iommu(iommu);
4690 free_dmar_iommu(iommu);
4696 static void intel_iommu_free_dmars(void)
4698 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4699 struct dmar_atsr_unit *atsru, *atsr_n;
4701 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4702 list_del(&rmrru->list);
4703 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4707 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4708 list_del(&atsru->list);
4709 intel_iommu_free_atsr(atsru);
4713 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4716 struct pci_bus *bus;
4717 struct pci_dev *bridge = NULL;
4719 struct acpi_dmar_atsr *atsr;
4720 struct dmar_atsr_unit *atsru;
4722 dev = pci_physfn(dev);
4723 for (bus = dev->bus; bus; bus = bus->parent) {
4725 /* If it's an integrated device, allow ATS */
4728 /* Connected via non-PCIe: no ATS */
4729 if (!pci_is_pcie(bridge) ||
4730 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4732 /* If we found the root port, look it up in the ATSR */
4733 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4738 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4739 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4740 if (atsr->segment != pci_domain_nr(dev->bus))
4743 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4744 if (tmp == &bridge->dev)
4747 if (atsru->include_all)
4757 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4760 struct dmar_rmrr_unit *rmrru;
4761 struct dmar_atsr_unit *atsru;
4762 struct acpi_dmar_atsr *atsr;
4763 struct acpi_dmar_reserved_memory *rmrr;
4765 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4768 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4769 rmrr = container_of(rmrru->hdr,
4770 struct acpi_dmar_reserved_memory, header);
4771 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4772 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4773 ((void *)rmrr) + rmrr->header.length,
4774 rmrr->segment, rmrru->devices,
4775 rmrru->devices_cnt);
4778 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4779 dmar_remove_dev_scope(info, rmrr->segment,
4780 rmrru->devices, rmrru->devices_cnt);
4784 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4785 if (atsru->include_all)
4788 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4789 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4790 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4791 (void *)atsr + atsr->header.length,
4792 atsr->segment, atsru->devices,
4793 atsru->devices_cnt);
4798 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4799 if (dmar_remove_dev_scope(info, atsr->segment,
4800 atsru->devices, atsru->devices_cnt))
4808 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4809 unsigned long val, void *v)
4811 struct memory_notify *mhp = v;
4812 unsigned long long start, end;
4813 unsigned long start_vpfn, last_vpfn;
4816 case MEM_GOING_ONLINE:
4817 start = mhp->start_pfn << PAGE_SHIFT;
4818 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4819 if (iommu_domain_identity_map(si_domain, start, end)) {
4820 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4827 case MEM_CANCEL_ONLINE:
4828 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4829 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4830 while (start_vpfn <= last_vpfn) {
4832 struct dmar_drhd_unit *drhd;
4833 struct intel_iommu *iommu;
4834 struct page *freelist;
4836 iova = find_iova(&si_domain->iovad, start_vpfn);
4838 pr_debug("Failed to get IOVA for PFN %lx\n",
4843 iova = split_and_remove_iova(&si_domain->iovad, iova,
4844 start_vpfn, last_vpfn);
4846 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4847 start_vpfn, last_vpfn);
4851 freelist = domain_unmap(si_domain, iova->pfn_lo,
4855 for_each_active_iommu(iommu, drhd)
4856 iommu_flush_iotlb_psi(iommu, si_domain,
4857 iova->pfn_lo, iova_size(iova),
4860 dma_free_pagelist(freelist);
4862 start_vpfn = iova->pfn_hi + 1;
4863 free_iova_mem(iova);
4871 static struct notifier_block intel_iommu_memory_nb = {
4872 .notifier_call = intel_iommu_memory_notifier,
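/*
 * Editor's note: the IOVA allocator keeps small per-CPU caches of recently
 * freed IOVA ranges so the fast path can avoid the global rbtree lock.
 * When a CPU goes offline, its cached ranges would otherwise be stranded,
 * so the CPU hotplug "dead" callback below walks every domain on every
 * IOMMU and purges that CPU's cache.
 */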
4876 static void free_all_cpu_cached_iovas(unsigned int cpu)
4880 for (i = 0; i < g_num_of_iommus; i++) {
4881 struct intel_iommu *iommu = g_iommus[i];
4882 struct dmar_domain *domain;
4888 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4889 domain = get_iommu_domain(iommu, (u16)did);
4893 free_cpu_cached_iovas(cpu, &domain->iovad);
4898 static int intel_iommu_cpu_dead(unsigned int cpu)
4900 free_all_cpu_cached_iovas(cpu);
4904 static void intel_disable_iommus(void)
4906 struct intel_iommu *iommu = NULL;
4907 struct dmar_drhd_unit *drhd;
4909 for_each_iommu(iommu, drhd)
4910 iommu_disable_translation(iommu);
4913 void intel_iommu_shutdown(void)
4915 struct dmar_drhd_unit *drhd;
4916 struct intel_iommu *iommu = NULL;
4918 if (no_iommu || dmar_disabled)
4921 down_write(&dmar_global_lock);
4923 /* Disable PMRs explicitly here. */
4924 for_each_iommu(iommu, drhd)
4925 iommu_disable_protect_mem_regions(iommu);
4927 /* Make sure the IOMMUs are switched off */
4928 intel_disable_iommus();
4930 up_write(&dmar_global_lock);
4933 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4935 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4937 return container_of(iommu_dev, struct intel_iommu, iommu);
4940 static ssize_t intel_iommu_show_version(struct device *dev,
4941 struct device_attribute *attr,
4944 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4945 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4946 return sprintf(buf, "%d:%d\n",
4947 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4949 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4951 static ssize_t intel_iommu_show_address(struct device *dev,
4952 struct device_attribute *attr,
4955 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4956 return sprintf(buf, "%llx\n", iommu->reg_phys);
4958 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4960 static ssize_t intel_iommu_show_cap(struct device *dev,
4961 struct device_attribute *attr,
4964 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4965 return sprintf(buf, "%llx\n", iommu->cap);
4967 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4969 static ssize_t intel_iommu_show_ecap(struct device *dev,
4970 struct device_attribute *attr,
4973 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4974 return sprintf(buf, "%llx\n", iommu->ecap);
4976 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4978 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4979 struct device_attribute *attr,
4982 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4983 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4985 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4987 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4988 struct device_attribute *attr,
4991 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4992 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4993 cap_ndoms(iommu->cap)));
4995 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4997 static struct attribute *intel_iommu_attrs[] = {
4998 &dev_attr_version.attr,
4999 &dev_attr_address.attr,
5001 &dev_attr_ecap.attr,
5002 &dev_attr_domains_supported.attr,
5003 &dev_attr_domains_used.attr,
5007 static struct attribute_group intel_iommu_group = {
5008 .name = "intel-iommu",
5009 .attrs = intel_iommu_attrs,
5012 const struct attribute_group *intel_iommu_groups[] = {
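/*
 * Editor's illustrative example (userspace, not part of the original file):
 * the attribute group above is exposed through the IOMMU core's sysfs
 * class device.  Assuming the first DMAR unit is registered as "dmar0"
 * (see the iommu_device_sysfs_add() call in intel_iommu_init() below),
 * the files appear as /sys/class/iommu/dmar0/intel-iommu/{version,
 * address, cap, ecap, domains_supported, domains_used} and can be read
 * like any other sysfs attribute:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char buf[32];
 *		FILE *f = fopen("/sys/class/iommu/dmar0/intel-iommu/version", "r");
 *
 *		if (f && fgets(buf, sizeof(buf), f))
 *			printf("VT-d architecture version: %s", buf);
 *		if (f)
 *			fclose(f);
 *		return 0;
 *	}
 */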
5017 static inline bool has_untrusted_dev(void)
5019 struct pci_dev *pdev = NULL;
5021 for_each_pci_dev(pdev)
5022 if (pdev->untrusted)
5028 static int __init platform_optin_force_iommu(void)
5030 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
5033 if (no_iommu || dmar_disabled)
5034 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
5037 * If Intel-IOMMU is disabled by default, we will apply identity
5038 * map for all devices except those marked as being untrusted.
5041 iommu_set_default_passthrough(false);
5049 static int __init probe_acpi_namespace_devices(void)
5051 struct dmar_drhd_unit *drhd;
5052 /* To avoid a -Wunused-but-set-variable warning. */
5053 struct intel_iommu *iommu __maybe_unused;
5057 for_each_active_iommu(iommu, drhd) {
5058 for_each_active_dev_scope(drhd->devices,
5059 drhd->devices_cnt, i, dev) {
5060 struct acpi_device_physical_node *pn;
5061 struct iommu_group *group;
5062 struct acpi_device *adev;
5064 if (dev->bus != &acpi_bus_type)
5067 adev = to_acpi_device(dev);
5068 mutex_lock(&adev->physical_node_lock);
5069 list_for_each_entry(pn,
5070 &adev->physical_node_list, node) {
5071 group = iommu_group_get(pn->dev);
5073 iommu_group_put(group);
5077 pn->dev->bus->iommu_ops = &intel_iommu_ops;
5078 ret = iommu_probe_device(pn->dev);
5082 mutex_unlock(&adev->physical_node_lock);
5092 int __init intel_iommu_init(void)
5095 struct dmar_drhd_unit *drhd;
5096 struct intel_iommu *iommu;
5099 * Intel IOMMU is required for a TXT/tboot launch or platform
5100 * opt in, so enforce that.
5102 force_on = tboot_force_iommu() || platform_optin_force_iommu();
5104 if (iommu_init_mempool()) {
5106 panic("tboot: Failed to initialize iommu memory\n");
5110 down_write(&dmar_global_lock);
5111 if (dmar_table_init()) {
5113 panic("tboot: Failed to initialize DMAR table\n");
5117 if (dmar_dev_scope_init() < 0) {
5119 panic("tboot: Failed to initialize DMAR device scope\n");
5123 up_write(&dmar_global_lock);
5126 * The bus notifier takes the dmar_global_lock, so lockdep will
5127 * complain later when we register it under the lock.
5129 dmar_register_bus_notifier();
5131 down_write(&dmar_global_lock);
5133 if (no_iommu || dmar_disabled) {
5135 * We exit the function here to ensure IOMMU's remapping and
5136 * mempool aren't setup, which means that the IOMMU's PMRs
5137 * won't be disabled via the call to init_dmars(). So disable
5138 * it explicitly here. The PMRs were setup by tboot prior to
5139 * calling SENTER, but the kernel is expected to reset/tear down the PMRs.
5142 if (intel_iommu_tboot_noforce) {
5143 for_each_iommu(iommu, drhd)
5144 iommu_disable_protect_mem_regions(iommu);
5148 * Make sure the IOMMUs are switched off, even when we
5149 * boot into a kexec kernel and the previous kernel left them enabled.
5152 intel_disable_iommus();
5156 if (list_empty(&dmar_rmrr_units))
5157 pr_info("No RMRR found\n");
5159 if (list_empty(&dmar_atsr_units))
5160 pr_info("No ATSR found\n");
5162 if (dmar_init_reserved_ranges()) {
5164 panic("tboot: Failed to reserve iommu ranges\n");
5165 goto out_free_reserved_range;
5169 intel_iommu_gfx_mapped = 1;
5171 init_no_remapping_devices();
5176 panic("tboot: Failed to initialize DMARs\n");
5177 pr_err("Initialization failed\n");
5178 goto out_free_reserved_range;
5180 up_write(&dmar_global_lock);
5182 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5184 * If the system has no untrusted device or the user has decided
5185 * to disable the bounce page mechanisms, we don't need swiotlb.
5186 * Mark this and the pre-allocated bounce pages will be released later.
5189 if (!has_untrusted_dev() || intel_no_bounce)
5192 dma_ops = &intel_dma_ops;
5194 init_iommu_pm_ops();
5196 for_each_active_iommu(iommu, drhd) {
5197 iommu_device_sysfs_add(&iommu->iommu, NULL,
5200 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5201 iommu_device_register(&iommu->iommu);
5204 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5205 if (si_domain && !hw_pass_through)
5206 register_memory_notifier(&intel_iommu_memory_nb);
5207 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5208 intel_iommu_cpu_dead);
5210 down_read(&dmar_global_lock);
5211 if (probe_acpi_namespace_devices())
5212 pr_warn("ACPI name space devices didn't probe correctly\n");
5213 up_read(&dmar_global_lock);
5215 /* Finally, we enable the DMA remapping hardware. */
5216 for_each_iommu(iommu, drhd) {
5217 if (!drhd->ignored && !translation_pre_enabled(iommu))
5218 iommu_enable_translation(iommu);
5220 iommu_disable_protect_mem_regions(iommu);
5222 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5224 intel_iommu_enabled = 1;
5225 intel_iommu_debugfs_init();
5229 out_free_reserved_range:
5230 put_iova_domain(&reserved_iova_list);
5232 intel_iommu_free_dmars();
5233 up_write(&dmar_global_lock);
5234 iommu_exit_mempool();
5238 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5240 struct intel_iommu *iommu = opaque;
5242 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5247 * NB - intel-iommu lacks any sort of reference counting for the users of
5248 * dependent devices. If multiple endpoints have intersecting dependent
5249 * devices, unbinding the driver from any one of them will possibly leave
5250 * the others unable to operate.
5252 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5254 if (!iommu || !dev || !dev_is_pci(dev))
5257 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
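/*
 * Editor's note: pci_for_each_dma_alias() runs the callback for the device
 * itself and for every other requester ID its DMA can appear under (for
 * example, a PCIe-to-PCI bridge issues transactions on behalf of devices
 * behind it), so the context entries of all aliases are torn down, not
 * just the one at the device's own bus/devfn.
 */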
5260 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5262 struct dmar_domain *domain;
5263 struct intel_iommu *iommu;
5264 unsigned long flags;
5266 assert_spin_locked(&device_domain_lock);
5271 iommu = info->iommu;
5272 domain = info->domain;
5275 if (dev_is_pci(info->dev) && sm_supported(iommu))
5276 intel_pasid_tear_down_entry(iommu, info->dev,
5279 iommu_disable_dev_iotlb(info);
5280 domain_context_clear(iommu, info->dev);
5281 intel_pasid_free_table(info->dev);
5284 unlink_domain_info(info);
5286 spin_lock_irqsave(&iommu->lock, flags);
5287 domain_detach_iommu(domain, iommu);
5288 spin_unlock_irqrestore(&iommu->lock, flags);
5290 /* free the private domain */
5291 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5292 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5293 list_empty(&domain->devices))
5294 domain_exit(info->domain);
5296 free_devinfo_mem(info);
5299 static void dmar_remove_one_dev_info(struct device *dev)
5301 struct device_domain_info *info;
5302 unsigned long flags;
5304 spin_lock_irqsave(&device_domain_lock, flags);
5305 info = dev->archdata.iommu;
5306 if (info && info != DEFER_DEVICE_DOMAIN_INFO
5307 && info != DUMMY_DEVICE_DOMAIN_INFO)
5308 __dmar_remove_one_dev_info(info);
5309 spin_unlock_irqrestore(&device_domain_lock, flags);
5312 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5316 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5317 domain_reserve_special_ranges(domain);
5319 /* calculate AGAW */
5320 domain->gaw = guest_width;
5321 adjust_width = guestwidth_to_adjustwidth(guest_width);
5322 domain->agaw = width_to_agaw(adjust_width);
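/*
 * Editor's note: e.g. a 48-bit guest address width maps to AGAW 2, a
 * 4-level page table in VT-d terms, while a 39-bit width maps to AGAW 1
 * (3 levels).
 */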
5324 domain->iommu_coherency = 0;
5325 domain->iommu_snooping = 0;
5326 domain->iommu_superpage = 0;
5327 domain->max_addr = 0;
5329 /* always allocate the top pgd */
5330 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5333 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5337 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5339 struct dmar_domain *dmar_domain;
5340 struct iommu_domain *domain;
5344 case IOMMU_DOMAIN_DMA:
5346 case IOMMU_DOMAIN_UNMANAGED:
5347 dmar_domain = alloc_domain(0);
5349 pr_err("Can't allocate dmar_domain\n");
5352 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5353 pr_err("Domain initialization failed\n");
5354 domain_exit(dmar_domain);
5358 if (!intel_iommu_strict && type == IOMMU_DOMAIN_DMA) {
5359 ret = init_iova_flush_queue(&dmar_domain->iovad,
5363 pr_info("iova flush queue initialization failed\n");
5366 domain_update_iommu_cap(dmar_domain);
5368 domain = &dmar_domain->domain;
5369 domain->geometry.aperture_start = 0;
5370 domain->geometry.aperture_end =
5371 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5372 domain->geometry.force_aperture = true;
5375 case IOMMU_DOMAIN_IDENTITY:
5376 return &si_domain->domain;
5384 static void intel_iommu_domain_free(struct iommu_domain *domain)
5386 if (domain != &si_domain->domain)
5387 domain_exit(to_dmar_domain(domain));
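/*
 * Editor's illustrative sketch (not part of the original file): how the
 * domain_alloc/domain_free callbacks above and the attach/map/unmap
 * callbacks further down are reached from another kernel driver through
 * the generic IOMMU API.  The function name, the IOVA choice and the
 * terse error handling are hypothetical simplifications.
 */
static int example_map_one_page(struct device *dev, phys_addr_t paddr)
{
	struct iommu_domain *dom;
	const unsigned long iova = 0x100000;	/* arbitrary, page aligned */
	int ret;

	dom = iommu_domain_alloc(dev->bus);	/* an IOMMU_DOMAIN_UNMANAGED domain */
	if (!dom)
		return -ENOMEM;

	ret = iommu_attach_device(dom, dev);	/* -> intel_iommu_attach_device() */
	if (ret)
		goto out_free;

	/* paddr must be page aligned; this ends up in intel_iommu_map() */
	ret = iommu_map(dom, iova, paddr, PAGE_SIZE, IOMMU_READ | IOMMU_WRITE);
	if (ret)
		goto out_detach;

	WARN_ON(iommu_iova_to_phys(dom, iova) != paddr);

	iommu_unmap(dom, iova, PAGE_SIZE);	/* -> intel_iommu_unmap() */
out_detach:
	iommu_detach_device(dom, dev);
out_free:
	iommu_domain_free(dom);
	return ret;
}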
5391 * Check whether a @domain could be attached to the @dev through the
5392 * aux-domain attach/detach APIs.
5395 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5397 struct device_domain_info *info = dev->archdata.iommu;
5399 return info && info->auxd_enabled &&
5400 domain->type == IOMMU_DOMAIN_UNMANAGED;
5403 static void auxiliary_link_device(struct dmar_domain *domain,
5406 struct device_domain_info *info = dev->archdata.iommu;
5408 assert_spin_locked(&device_domain_lock);
5412 domain->auxd_refcnt++;
5413 list_add(&domain->auxd, &info->auxiliary_domains);
5416 static void auxiliary_unlink_device(struct dmar_domain *domain,
5419 struct device_domain_info *info = dev->archdata.iommu;
5421 assert_spin_locked(&device_domain_lock);
5425 list_del(&domain->auxd);
5426 domain->auxd_refcnt--;
5428 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5429 ioasid_free(domain->default_pasid);
5432 static int aux_domain_add_dev(struct dmar_domain *domain,
5437 unsigned long flags;
5438 struct intel_iommu *iommu;
5440 iommu = device_to_iommu(dev, &bus, &devfn);
5444 if (domain->default_pasid <= 0) {
5447 /* No private data needed for the default pasid */
5448 pasid = ioasid_alloc(NULL, PASID_MIN,
5449 pci_max_pasids(to_pci_dev(dev)) - 1,
5451 if (pasid == INVALID_IOASID) {
5452 pr_err("Can't allocate default pasid\n");
5455 domain->default_pasid = pasid;
5458 spin_lock_irqsave(&device_domain_lock, flags);
5460 * iommu->lock must be held to attach domain to iommu and setup the
5461 * pasid entry for second level translation.
5463 spin_lock(&iommu->lock);
5464 ret = domain_attach_iommu(domain, iommu);
5468 /* Setup the PASID entry for mediated devices: */
5469 if (domain_use_first_level(domain))
5470 ret = domain_setup_first_level(iommu, domain, dev,
5471 domain->default_pasid);
5473 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5474 domain->default_pasid);
5477 spin_unlock(&iommu->lock);
5479 auxiliary_link_device(domain, dev);
5481 spin_unlock_irqrestore(&device_domain_lock, flags);
5486 domain_detach_iommu(domain, iommu);
5488 spin_unlock(&iommu->lock);
5489 spin_unlock_irqrestore(&device_domain_lock, flags);
5490 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5491 ioasid_free(domain->default_pasid);
5496 static void aux_domain_remove_dev(struct dmar_domain *domain,
5499 struct device_domain_info *info;
5500 struct intel_iommu *iommu;
5501 unsigned long flags;
5503 if (!is_aux_domain(dev, &domain->domain))
5506 spin_lock_irqsave(&device_domain_lock, flags);
5507 info = dev->archdata.iommu;
5508 iommu = info->iommu;
5510 auxiliary_unlink_device(domain, dev);
5512 spin_lock(&iommu->lock);
5513 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5514 domain_detach_iommu(domain, iommu);
5515 spin_unlock(&iommu->lock);
5517 spin_unlock_irqrestore(&device_domain_lock, flags);
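/*
 * Editor's illustrative sketch (not part of the original file): the aux
 * domain paths above are normally driven through the generic IOMMU API by
 * a mediated-device (mdev) parent driver, roughly as below.  The function
 * name is hypothetical and error handling is abbreviated; the wrappers
 * used here reach this file via the dev_enable_feat/aux_attach_dev/
 * aux_get_pasid callbacks further down.
 */
static int example_attach_aux_domain(struct iommu_domain *dom, struct device *dev)
{
	int pasid, ret;

	ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);
	if (ret)
		return ret;

	ret = iommu_aux_attach_device(dom, dev);	/* -> aux_domain_add_dev() */
	if (ret)
		return ret;

	/* PASID the device must tag its DMA with to hit this domain */
	pasid = iommu_aux_get_pasid(dom, dev);
	if (pasid < 0) {
		iommu_aux_detach_device(dom, dev);
		return pasid;
	}

	dev_info(dev, "aux domain attached with default PASID %d\n", pasid);
	return 0;
}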
5520 static int prepare_domain_attach_device(struct iommu_domain *domain,
5523 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5524 struct intel_iommu *iommu;
5528 iommu = device_to_iommu(dev, &bus, &devfn);
5532 /* check if this iommu agaw is sufficient for max mapped address */
5533 addr_width = agaw_to_width(iommu->agaw);
5534 if (addr_width > cap_mgaw(iommu->cap))
5535 addr_width = cap_mgaw(iommu->cap);
5537 if (dmar_domain->max_addr > (1LL << addr_width)) {
5538 dev_err(dev, "%s: iommu width (%d) is not "
5539 "sufficient for the mapped address (%llx)\n",
5540 __func__, addr_width, dmar_domain->max_addr);
5543 dmar_domain->gaw = addr_width;
5546 * Knock out extra levels of page tables if necessary
5548 while (iommu->agaw < dmar_domain->agaw) {
5549 struct dma_pte *pte;
5551 pte = dmar_domain->pgd;
5552 if (dma_pte_present(pte)) {
5553 dmar_domain->pgd = (struct dma_pte *)
5554 phys_to_virt(dma_pte_addr(pte));
5555 free_pgtable_page(pte);
5557 dmar_domain->agaw--;
5563 static int intel_iommu_attach_device(struct iommu_domain *domain,
5568 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5569 device_is_rmrr_locked(dev)) {
5570 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5574 if (is_aux_domain(dev, domain))
5577 /* normally dev is not mapped */
5578 if (unlikely(domain_context_mapped(dev))) {
5579 struct dmar_domain *old_domain;
5581 old_domain = find_domain(dev);
5583 dmar_remove_one_dev_info(dev);
5586 ret = prepare_domain_attach_device(domain, dev);
5590 return domain_add_dev_info(to_dmar_domain(domain), dev);
5593 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5598 if (!is_aux_domain(dev, domain))
5601 ret = prepare_domain_attach_device(domain, dev);
5605 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5608 static void intel_iommu_detach_device(struct iommu_domain *domain,
5611 dmar_remove_one_dev_info(dev);
5614 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5617 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5620 static int intel_iommu_map(struct iommu_domain *domain,
5621 unsigned long iova, phys_addr_t hpa,
5622 size_t size, int iommu_prot, gfp_t gfp)
5624 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5629 if (iommu_prot & IOMMU_READ)
5630 prot |= DMA_PTE_READ;
5631 if (iommu_prot & IOMMU_WRITE)
5632 prot |= DMA_PTE_WRITE;
5633 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5634 prot |= DMA_PTE_SNP;
5636 max_addr = iova + size;
5637 if (dmar_domain->max_addr < max_addr) {
5640 /* check if minimum agaw is sufficient for mapped address */
5641 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5642 if (end < max_addr) {
5643 pr_err("%s: iommu width (%d) is not "
5644 "sufficient for the mapped address (%llx)\n",
5645 __func__, dmar_domain->gaw, max_addr);
5648 dmar_domain->max_addr = max_addr;
5650 /* Round up size to next multiple of PAGE_SIZE, if it and
5651 the low bits of hpa would take us onto the next page */
5652 size = aligned_nrpages(hpa, size);
5653 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5654 hpa >> VTD_PAGE_SHIFT, size, prot);
5658 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5659 unsigned long iova, size_t size,
5660 struct iommu_iotlb_gather *gather)
5662 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5663 struct page *freelist = NULL;
5664 unsigned long start_pfn, last_pfn;
5665 unsigned int npages;
5666 int iommu_id, level = 0;
5668 /* Cope with horrid API which requires us to unmap more than the
5669 size argument if it happens to be a large-page mapping. */
5670 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5672 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5673 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5675 start_pfn = iova >> VTD_PAGE_SHIFT;
5676 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5678 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5680 npages = last_pfn - start_pfn + 1;
5682 for_each_domain_iommu(iommu_id, dmar_domain)
5683 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5684 start_pfn, npages, !freelist, 0);
5686 dma_free_pagelist(freelist);
5688 if (dmar_domain->max_addr == iova + size)
5689 dmar_domain->max_addr = iova;
5694 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5697 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5698 struct dma_pte *pte;
5702 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5704 phys = dma_pte_addr(pte);
5709 static inline bool scalable_mode_support(void)
5711 struct dmar_drhd_unit *drhd;
5712 struct intel_iommu *iommu;
5716 for_each_active_iommu(iommu, drhd) {
5717 if (!sm_supported(iommu)) {
5727 static inline bool iommu_pasid_support(void)
5729 struct dmar_drhd_unit *drhd;
5730 struct intel_iommu *iommu;
5734 for_each_active_iommu(iommu, drhd) {
5735 if (!pasid_supported(iommu)) {
5745 static inline bool nested_mode_support(void)
5747 struct dmar_drhd_unit *drhd;
5748 struct intel_iommu *iommu;
5752 for_each_active_iommu(iommu, drhd) {
5753 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5763 static bool intel_iommu_capable(enum iommu_cap cap)
5765 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5766 return domain_update_iommu_snooping(NULL) == 1;
5767 if (cap == IOMMU_CAP_INTR_REMAP)
5768 return irq_remapping_enabled == 1;
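/*
 * Editor's illustrative example (not part of the original file): callers
 * query these flags through the generic helper, for instance
 *
 *	coherent = iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY);
 *
 * which is how VFIO decides whether it needs to enforce the IOMMU_CACHE
 * (snoop) attribute on its mappings.
 */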
5773 static int intel_iommu_add_device(struct device *dev)
5775 struct dmar_domain *dmar_domain;
5776 struct iommu_domain *domain;
5777 struct intel_iommu *iommu;
5778 struct iommu_group *group;
5782 iommu = device_to_iommu(dev, &bus, &devfn);
5786 iommu_device_link(&iommu->iommu, dev);
5788 if (translation_pre_enabled(iommu))
5789 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5791 group = iommu_group_get_for_dev(dev);
5793 if (IS_ERR(group)) {
5794 ret = PTR_ERR(group);
5798 iommu_group_put(group);
5800 domain = iommu_get_domain_for_dev(dev);
5801 dmar_domain = to_dmar_domain(domain);
5802 if (domain->type == IOMMU_DOMAIN_DMA) {
5803 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5804 ret = iommu_request_dm_for_dev(dev);
5806 dmar_remove_one_dev_info(dev);
5807 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5808 domain_add_dev_info(si_domain, dev);
5810 "Device uses a private identity domain.\n");
5814 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5815 ret = iommu_request_dma_domain_for_dev(dev);
5817 dmar_remove_one_dev_info(dev);
5818 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5819 if (!get_private_domain_for_dev(dev)) {
5821 "Failed to get a private domain.\n");
5827 "Device uses a private dma domain.\n");
5832 if (device_needs_bounce(dev)) {
5833 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5834 set_dma_ops(dev, &bounce_dma_ops);
5840 iommu_device_unlink(&iommu->iommu, dev);
5844 static void intel_iommu_remove_device(struct device *dev)
5846 struct intel_iommu *iommu;
5849 iommu = device_to_iommu(dev, &bus, &devfn);
5853 dmar_remove_one_dev_info(dev);
5855 iommu_group_remove_device(dev);
5857 iommu_device_unlink(&iommu->iommu, dev);
5859 if (device_needs_bounce(dev))
5860 set_dma_ops(dev, NULL);
5863 static void intel_iommu_get_resv_regions(struct device *device,
5864 struct list_head *head)
5866 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5867 struct iommu_resv_region *reg;
5868 struct dmar_rmrr_unit *rmrr;
5869 struct device *i_dev;
5872 down_read(&dmar_global_lock);
5873 for_each_rmrr_units(rmrr) {
5874 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5876 struct iommu_resv_region *resv;
5877 enum iommu_resv_type type;
5880 if (i_dev != device &&
5881 !is_downstream_to_pci_bridge(device, i_dev))
5884 length = rmrr->end_address - rmrr->base_address + 1;
5886 type = device_rmrr_is_relaxable(device) ?
5887 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5889 resv = iommu_alloc_resv_region(rmrr->base_address,
5890 length, prot, type);
5894 list_add_tail(&resv->list, head);
5897 up_read(&dmar_global_lock);
5899 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5900 if (dev_is_pci(device)) {
5901 struct pci_dev *pdev = to_pci_dev(device);
5903 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5904 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5905 IOMMU_RESV_DIRECT_RELAXABLE);
5907 list_add_tail(&reg->list, head);
5910 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5912 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5913 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5917 list_add_tail(&reg->list, head);
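/*
 * Editor's note: the regions built above are exported by the IOMMU core
 * per group as /sys/kernel/iommu_groups/<N>/reserved_regions (one
 * "start end type" line per region; the IOAPIC window shows up as "msi",
 * RMRRs as "direct" or "direct-relaxable"), and intel_iommu_apply_resv_region()
 * below reserves the matching IOVA ranges so the DMA allocator never
 * hands them out.
 */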
5920 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5922 struct device_domain_info *info;
5923 struct context_entry *context;
5924 struct dmar_domain *domain;
5925 unsigned long flags;
5929 domain = find_domain(dev);
5933 spin_lock_irqsave(&device_domain_lock, flags);
5934 spin_lock(&iommu->lock);
5937 info = dev->archdata.iommu;
5938 if (!info || !info->pasid_supported)
5941 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5942 if (WARN_ON(!context))
5945 ctx_lo = context[0].lo;
5947 if (!(ctx_lo & CONTEXT_PASIDE)) {
5948 ctx_lo |= CONTEXT_PASIDE;
5949 context[0].lo = ctx_lo;
5951 iommu->flush.flush_context(iommu,
5952 domain->iommu_did[iommu->seq_id],
5953 PCI_DEVID(info->bus, info->devfn),
5954 DMA_CCMD_MASK_NOBIT,
5955 DMA_CCMD_DEVICE_INVL);
5958 /* Enable PASID support in the device, if it wasn't already */
5959 if (!info->pasid_enabled)
5960 iommu_enable_dev_iotlb(info);
5965 spin_unlock(&iommu->lock);
5966 spin_unlock_irqrestore(&device_domain_lock, flags);
5971 static void intel_iommu_apply_resv_region(struct device *dev,
5972 struct iommu_domain *domain,
5973 struct iommu_resv_region *region)
5975 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5976 unsigned long start, end;
5978 start = IOVA_PFN(region->start);
5979 end = IOVA_PFN(region->start + region->length - 1);
5981 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5984 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5986 if (dev_is_pci(dev))
5987 return pci_device_group(dev);
5988 return generic_device_group(dev);
5991 #ifdef CONFIG_INTEL_IOMMU_SVM
5992 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5994 struct intel_iommu *iommu;
5997 if (iommu_dummy(dev)) {
5999 "No IOMMU translation for device; cannot enable SVM\n");
6003 iommu = device_to_iommu(dev, &bus, &devfn);
6005 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
6011 #endif /* CONFIG_INTEL_IOMMU_SVM */
6013 static int intel_iommu_enable_auxd(struct device *dev)
6015 struct device_domain_info *info;
6016 struct intel_iommu *iommu;
6017 unsigned long flags;
6021 iommu = device_to_iommu(dev, &bus, &devfn);
6022 if (!iommu || dmar_disabled)
6025 if (!sm_supported(iommu) || !pasid_supported(iommu))
6028 ret = intel_iommu_enable_pasid(iommu, dev);
6032 spin_lock_irqsave(&device_domain_lock, flags);
6033 info = dev->archdata.iommu;
6034 info->auxd_enabled = 1;
6035 spin_unlock_irqrestore(&device_domain_lock, flags);
6040 static int intel_iommu_disable_auxd(struct device *dev)
6042 struct device_domain_info *info;
6043 unsigned long flags;
6045 spin_lock_irqsave(&device_domain_lock, flags);
6046 info = dev->archdata.iommu;
6047 if (!WARN_ON(!info))
6048 info->auxd_enabled = 0;
6049 spin_unlock_irqrestore(&device_domain_lock, flags);
6055 * A PCI Express designated vendor-specific extended capability (DVSEC) is
6056 * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
6057 * spec so that system software and tools can detect endpoint devices
6058 * supporting Intel Scalable I/O Virtualization without a host driver dependency.
6060 * Returns the offset of the matching extended capability structure within
6061 * the device's PCI configuration space, or 0 if the device does not support it.
6064 static int siov_find_pci_dvsec(struct pci_dev *pdev)
6069 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
6071 pci_read_config_word(pdev, pos + 4, &vendor);
6072 pci_read_config_word(pdev, pos + 8, &id);
6073 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
6076 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
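/*
 * Editor's note: 0x23 is the PCIe Designated Vendor-Specific Extended
 * Capability (DVSEC) ID.  Within a DVSEC instance, the 16-bit word at
 * offset 4 is the DVSEC vendor ID and the word at offset 8 is the
 * vendor-defined DVSEC ID, so the loop above matches an Intel (0x8086)
 * DVSEC whose ID is 5, the value the Scalable I/O Virtualization spec
 * assigns for SIOV capability reporting.
 */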
6083 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
6085 if (feat == IOMMU_DEV_FEAT_AUX) {
6088 if (!dev_is_pci(dev) || dmar_disabled ||
6089 !scalable_mode_support() || !iommu_pasid_support())
6092 ret = pci_pasid_features(to_pci_dev(dev));
6096 return !!siov_find_pci_dvsec(to_pci_dev(dev));
6103 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
6105 if (feat == IOMMU_DEV_FEAT_AUX)
6106 return intel_iommu_enable_auxd(dev);
6112 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6114 if (feat == IOMMU_DEV_FEAT_AUX)
6115 return intel_iommu_disable_auxd(dev);
6121 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6123 struct device_domain_info *info = dev->archdata.iommu;
6125 if (feat == IOMMU_DEV_FEAT_AUX)
6126 return scalable_mode_support() && info && info->auxd_enabled;
6132 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6134 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6136 return dmar_domain->default_pasid > 0 ?
6137 dmar_domain->default_pasid : -EINVAL;
6140 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6143 return attach_deferred(dev);
6147 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6148 enum iommu_attr attr, void *data)
6150 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6151 unsigned long flags;
6154 if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6158 case DOMAIN_ATTR_NESTING:
6159 spin_lock_irqsave(&device_domain_lock, flags);
6160 if (nested_mode_support() &&
6161 list_empty(&dmar_domain->devices)) {
6162 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6163 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6167 spin_unlock_irqrestore(&device_domain_lock, flags);
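/*
 * Editor's illustrative example (not part of the original file): a caller
 * such as VFIO requests nesting on a freshly allocated, not-yet-attached
 * unmanaged domain, roughly:
 *
 *	int enable = 1;
 *
 *	if (iommu_domain_set_attr(dom, DOMAIN_ATTR_NESTING, &enable))
 *		pr_info("nesting not supported, using second level only\n");
 *
 * The list_empty() check above is the reason the attribute has to be set
 * before the first device is attached to the domain.
 */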
6177 const struct iommu_ops intel_iommu_ops = {
6178 .capable = intel_iommu_capable,
6179 .domain_alloc = intel_iommu_domain_alloc,
6180 .domain_free = intel_iommu_domain_free,
6181 .domain_set_attr = intel_iommu_domain_set_attr,
6182 .attach_dev = intel_iommu_attach_device,
6183 .detach_dev = intel_iommu_detach_device,
6184 .aux_attach_dev = intel_iommu_aux_attach_device,
6185 .aux_detach_dev = intel_iommu_aux_detach_device,
6186 .aux_get_pasid = intel_iommu_aux_get_pasid,
6187 .map = intel_iommu_map,
6188 .unmap = intel_iommu_unmap,
6189 .iova_to_phys = intel_iommu_iova_to_phys,
6190 .add_device = intel_iommu_add_device,
6191 .remove_device = intel_iommu_remove_device,
6192 .get_resv_regions = intel_iommu_get_resv_regions,
6193 .put_resv_regions = generic_iommu_put_resv_regions,
6194 .apply_resv_region = intel_iommu_apply_resv_region,
6195 .device_group = intel_iommu_device_group,
6196 .dev_has_feat = intel_iommu_dev_has_feat,
6197 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
6198 .dev_enable_feat = intel_iommu_dev_enable_feat,
6199 .dev_disable_feat = intel_iommu_dev_disable_feat,
6200 .is_attach_deferred = intel_iommu_is_attach_deferred,
6201 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
6204 static void quirk_iommu_igfx(struct pci_dev *dev)
6206 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6210 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6211 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6212 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6213 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6214 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6215 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6216 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6217 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6219 /* Broadwell igfx malfunctions with dmar */
6220 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6221 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6222 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6223 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6224 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6225 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6226 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6227 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6228 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6229 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6230 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6231 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6232 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6233 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6234 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6235 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6236 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6237 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6238 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6239 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6240 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6241 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6242 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6243 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6245 static void quirk_iommu_rwbf(struct pci_dev *dev)
6248 * Mobile 4 Series Chipset neglects to set RWBF capability,
6249 * but needs it. Same seems to hold for the desktop versions.
6251 pci_info(dev, "Forcing write-buffer flush capability\n");
6255 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6256 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6257 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6258 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6259 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6260 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6261 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6264 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
6265 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
6266 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
6267 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
6268 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
6269 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
6270 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
6271 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
6273 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6277 if (pci_read_config_word(dev, GGC, &ggc))
6280 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6281 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6283 } else if (dmar_map_gfx) {
6284 /* we have to ensure the gfx device is idle before we flush */
6285 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6286 intel_iommu_strict = 1;
6289 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6290 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6291 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6292 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6294 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6295 ISOCH DMAR unit for the Azalia sound device, but not give it any
6296 TLB entries, which causes it to deadlock. Check for that. We do
6297 this in a function called from init_dmars(), instead of in a PCI
6298 quirk, because we don't want to print the obnoxious "BIOS broken"
6299 message if VT-d is actually disabled.
6301 static void __init check_tylersburg_isoch(void)
6303 struct pci_dev *pdev;
6304 uint32_t vtisochctrl;
6306 /* If there's no Azalia in the system anyway, forget it. */
6307 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6312 /* System Management Registers. Might be hidden, in which case
6313 we can't do the sanity check. But that's OK, because the
6314 known-broken BIOSes _don't_ actually hide it, so far. */
6315 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6319 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6326 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6327 if (vtisochctrl & 1)
6330 /* Drop all bits other than the number of TLB entries */
6331 vtisochctrl &= 0x1c;
6333 /* If we have the recommended number of TLB entries (16), fine. */
6334 if (vtisochctrl == 0x10)
6337 /* Zero TLB entries? You get to ride the short bus to school. */
6339 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6340 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6341 dmi_get_system_info(DMI_BIOS_VENDOR),
6342 dmi_get_system_info(DMI_BIOS_VERSION),
6343 dmi_get_system_info(DMI_PRODUCT_VERSION));
6344 iommu_identity_mapping |= IDENTMAP_AZALIA;
6348 pr_warn("Recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",