1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
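/*
 * Each page-table level resolves LEVEL_STRIDE (9) bits of the DMA pfn,
 * i.e. 512 eight-byte entries per 4KiB table page.  For example, a
 * 48-bit address width decodes as 9+9+9+9 pfn bits above the 12-bit
 * page offset (four levels); a 57-bit width adds a fifth level.
 */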
89 * This bitmap is used to advertise the page sizes our hardware supports
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes
94 * Traditionally the IOMMU core just handed us the mappings directly,
95 * after making sure the size is an order of a 4KiB page and that the
96 * mapping has natural alignment.
98 * To retain this behavior, we currently advertise that we support
99 * all page sizes that are an order of 4KiB.
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
106 static inline int agaw_to_level(int agaw)
111 static inline int agaw_to_width(int agaw)
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 static inline int width_to_agaw(int width)
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
121 static inline unsigned int level_to_offset_bits(int level)
123 return (level - 1) * LEVEL_STRIDE;
126 static inline int pfn_level_offset(unsigned long pfn, int level)
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
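/*
 * Worked example: at level 2 the offset is 9 bits, so
 * pfn_level_offset(pfn, 2) == (pfn >> 9) & 0x1ff, i.e. the index of
 * the level-2 table entry covering this DMA pfn.
 */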
131 static inline unsigned long level_mask(int level)
133 return -1UL << level_to_offset_bits(level);
136 static inline unsigned long level_size(int level)
138 return 1UL << level_to_offset_bits(level);
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
143 return (pfn + level_size(level) - 1) & level_mask(level);
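/*
 * Worked example: align_to_level(0x1234, 2) rounds the pfn up to the
 * next level-2 boundary: level_size(2) is 512 and level_mask(2) clears
 * the low 9 bits, giving 0x1400.
 */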
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
148 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
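/*
 * With 4KiB MM pages, PAGE_SHIFT equals VTD_PAGE_SHIFT and both
 * conversions above are no-ops; on configurations with larger MM pages
 * each MM pfn expands to PAGE_SIZE / VTD_PAGE_SIZE VT-d pfns.
 */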
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
164 return mm_to_dma_pfn(page_to_pfn(pg));
166 static inline unsigned long virt_to_dma_pfn(void *p)
168 return page_to_dma_pfn(virt_to_page(p));
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
178 * set to 1 to panic the kernel if VT-d can't be enabled successfully
179 * (used when the kernel is launched w/ TXT)
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
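/*
 * The root table is a single 4KiB page of 16-byte root entries
 * (lo/hi u64 pairs), i.e. 256 entries, one per PCI bus number.
 */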
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
196 return re->lo & VTD_PAGE_MASK;
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
208 return re->hi & VTD_PAGE_MASK;
211 static inline void context_clear_pasid_enable(struct context_entry *context)
213 context->lo &= ~(1ULL << 11);
216 static inline bool context_pasid_enabled(struct context_entry *context)
218 return !!(context->lo & (1ULL << 11));
221 static inline void context_set_copied(struct context_entry *context)
223 context->hi |= (1ull << 3);
226 static inline bool context_copied(struct context_entry *context)
228 return !!(context->hi & (1ULL << 3));
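/*
 * The helpers below encode the legacy context-entry fields as used by
 * this driver: bit 0 present, bit 1 fault-processing disable (cleared
 * by context_set_fault_enable), bits 3:2 translation type, bits 63:12
 * second-level page-table address, and in the high word bits 2:0
 * address width (AGAW) plus bits 23:8 domain id.
 */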
231 static inline bool __context_present(struct context_entry *context)
233 return (context->lo & 1);
236 bool context_present(struct context_entry *context)
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
243 static inline void context_set_present(struct context_entry *context)
248 static inline void context_set_fault_enable(struct context_entry *context)
250 context->lo &= (((u64)-1) << 2) | 1;
253 static inline void context_set_translation_type(struct context_entry *context,
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
260 static inline void context_set_address_root(struct context_entry *context,
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
267 static inline void context_set_address_width(struct context_entry *context,
270 context->hi |= value & 7;
273 static inline void context_set_domain_id(struct context_entry *context,
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
279 static inline int context_domain_id(struct context_entry *c)
281 return((c->hi >> 8) & 0xffff);
284 static inline void context_clear_entry(struct context_entry *context)
291 * This domain is a static identity mapping domain.
292 * 1. This domain creates a static 1:1 mapping to all usable memory.
293 * 2. It maps to each iommu if successful.
294 * 3. Each iommu maps to this domain if successful.
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
299 #define for_each_domain_iommu(idx, domain) \
300 for (idx = 0; idx < g_num_of_iommus; idx++) \
301 if (domain->iommu_refcnt[idx])
303 struct dmar_rmrr_unit {
304 struct list_head list; /* list of rmrr units */
305 struct acpi_dmar_header *hdr; /* ACPI header */
306 u64 base_address; /* reserved base address*/
307 u64 end_address; /* reserved end address */
308 struct dmar_dev_scope *devices; /* target devices */
309 int devices_cnt; /* target device count */
312 struct dmar_atsr_unit {
313 struct list_head list; /* list of ATSR units */
314 struct acpi_dmar_header *hdr; /* ACPI header */
315 struct dmar_dev_scope *devices; /* target devices */
316 int devices_cnt; /* target device count */
317 u8 include_all:1; /* include all ports */
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
323 #define for_each_rmrr_units(rmrr) \
324 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
326 /* number of IOMMUs in g_iommus, indexed by iommu->seq_id */
327 static int g_num_of_iommus;
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int intel_no_bounce;
360 #define IDENTMAP_GFX 2
361 #define IDENTMAP_AZALIA 4
363 int intel_iommu_gfx_mapped;
364 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
366 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
367 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
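/*
 * dev->archdata.iommu either points at a real device_domain_info or
 * holds one of the two sentinels above: DUMMY for devices the IOMMU
 * must ignore, DEFER for devices whose domain attachment is postponed
 * until first use (see do_deferred_attach()).
 */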
368 struct device_domain_info *get_domain_info(struct device *dev)
370 struct device_domain_info *info;
375 info = dev->archdata.iommu;
376 if (unlikely(info == DUMMY_DEVICE_DOMAIN_INFO ||
377 info == DEFER_DEVICE_DOMAIN_INFO))
383 DEFINE_SPINLOCK(device_domain_lock);
384 static LIST_HEAD(device_domain_list);
386 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
387 to_pci_dev(d)->untrusted)
390 * Iterate over elements in device_domain_list and call the specified
391 * callback @fn against each element.
393 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
394 void *data), void *data)
398 struct device_domain_info *info;
400 spin_lock_irqsave(&device_domain_lock, flags);
401 list_for_each_entry(info, &device_domain_list, global) {
402 ret = fn(info, data);
404 spin_unlock_irqrestore(&device_domain_lock, flags);
408 spin_unlock_irqrestore(&device_domain_lock, flags);
413 const struct iommu_ops intel_iommu_ops;
415 static bool translation_pre_enabled(struct intel_iommu *iommu)
417 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
420 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
422 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
425 static void init_translation_status(struct intel_iommu *iommu)
429 gsts = readl(iommu->reg + DMAR_GSTS_REG);
430 if (gsts & DMA_GSTS_TES)
431 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
434 static int __init intel_iommu_setup(char *str)
439 if (!strncmp(str, "on", 2)) {
441 pr_info("IOMMU enabled\n");
442 } else if (!strncmp(str, "off", 3)) {
444 no_platform_optin = 1;
445 pr_info("IOMMU disabled\n");
446 } else if (!strncmp(str, "igfx_off", 8)) {
448 pr_info("Disable GFX device mapping\n");
449 } else if (!strncmp(str, "forcedac", 8)) {
450 pr_info("Forcing DAC for PCI devices\n");
452 } else if (!strncmp(str, "strict", 6)) {
453 pr_info("Disable batched IOTLB flush\n");
454 intel_iommu_strict = 1;
455 } else if (!strncmp(str, "sp_off", 6)) {
456 pr_info("Disable supported super page\n");
457 intel_iommu_superpage = 0;
458 } else if (!strncmp(str, "sm_on", 5)) {
459 pr_info("Intel-IOMMU: scalable mode supported\n");
461 } else if (!strncmp(str, "tboot_noforce", 13)) {
462 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
463 intel_iommu_tboot_noforce = 1;
464 } else if (!strncmp(str, "nobounce", 8)) {
465 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
469 str += strcspn(str, ",");
475 __setup("intel_iommu=", intel_iommu_setup);
477 static struct kmem_cache *iommu_domain_cache;
478 static struct kmem_cache *iommu_devinfo_cache;
480 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
482 struct dmar_domain **domains;
485 domains = iommu->domains[idx];
489 return domains[did & 0xff];
492 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
493 struct dmar_domain *domain)
495 struct dmar_domain **domains;
498 if (!iommu->domains[idx]) {
499 size_t size = 256 * sizeof(struct dmar_domain *);
500 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
503 domains = iommu->domains[idx];
504 if (WARN_ON(!domains))
507 domains[did & 0xff] = domain;
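/*
 * iommu->domains is a two-level array: the upper bits of the domain id
 * select a lazily allocated chunk of 256 pointers and the low 8 bits
 * index into it, so storage for the full cap_ndoms() space is only
 * allocated as domain ids are actually used.
 */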
510 void *alloc_pgtable_page(int node)
515 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
517 vaddr = page_address(page);
521 void free_pgtable_page(void *vaddr)
523 free_page((unsigned long)vaddr);
526 static inline void *alloc_domain_mem(void)
528 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
531 static void free_domain_mem(void *vaddr)
533 kmem_cache_free(iommu_domain_cache, vaddr);
536 static inline void * alloc_devinfo_mem(void)
538 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
541 static inline void free_devinfo_mem(void *vaddr)
543 kmem_cache_free(iommu_devinfo_cache, vaddr);
546 static inline int domain_type_is_si(struct dmar_domain *domain)
548 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
551 static inline bool domain_use_first_level(struct dmar_domain *domain)
553 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
556 static inline int domain_pfn_supported(struct dmar_domain *domain,
559 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
561 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
564 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
569 sagaw = cap_sagaw(iommu->cap);
570 for (agaw = width_to_agaw(max_gaw);
572 if (test_bit(agaw, &sagaw))
580 * Calculate max SAGAW for each iommu.
582 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
584 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
588 * Calculate agaw for each iommu.
589 * "SAGAW" may be different across iommus; use a default agaw, and
590 * fall back to a smaller supported agaw for iommus that don't support it.
592 int iommu_calculate_agaw(struct intel_iommu *iommu)
594 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
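/*
 * Example: with DEFAULT_DOMAIN_ADDRESS_WIDTH of 57 the preferred agaw
 * is 3 (five-level tables); an IOMMU whose SAGAW only advertises
 * four-level (48-bit) support falls back to agaw 2.
 */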
597 /* This function only returns a single iommu in a domain */
598 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
602 /* si_domain and vm domain should not get here. */
603 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
606 for_each_domain_iommu(iommu_id, domain)
609 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
612 return g_iommus[iommu_id];
615 static void domain_update_iommu_coherency(struct dmar_domain *domain)
617 struct dmar_drhd_unit *drhd;
618 struct intel_iommu *iommu;
622 domain->iommu_coherency = 1;
624 for_each_domain_iommu(i, domain) {
626 if (!ecap_coherent(g_iommus[i]->ecap)) {
627 domain->iommu_coherency = 0;
634 /* No hardware attached; use lowest common denominator */
636 for_each_active_iommu(iommu, drhd) {
637 if (!ecap_coherent(iommu->ecap)) {
638 domain->iommu_coherency = 0;
645 static int domain_update_iommu_snooping(struct intel_iommu *skip)
647 struct dmar_drhd_unit *drhd;
648 struct intel_iommu *iommu;
652 for_each_active_iommu(iommu, drhd) {
654 if (!ecap_sc_support(iommu->ecap)) {
665 static int domain_update_iommu_superpage(struct dmar_domain *domain,
666 struct intel_iommu *skip)
668 struct dmar_drhd_unit *drhd;
669 struct intel_iommu *iommu;
672 if (!intel_iommu_superpage) {
676 /* set iommu_superpage to the smallest common denominator */
678 for_each_active_iommu(iommu, drhd) {
680 if (domain && domain_use_first_level(domain)) {
681 if (!cap_fl1gp_support(iommu->cap))
684 mask &= cap_super_page_val(iommu->cap);
696 /* Some capabilities may be different across iommus */
697 static void domain_update_iommu_cap(struct dmar_domain *domain)
699 domain_update_iommu_coherency(domain);
700 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
701 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
704 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
707 struct root_entry *root = &iommu->root_entry[bus];
708 struct context_entry *context;
712 if (sm_supported(iommu)) {
720 context = phys_to_virt(*entry & VTD_PAGE_MASK);
722 unsigned long phy_addr;
726 context = alloc_pgtable_page(iommu->node);
730 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
731 phy_addr = virt_to_phys((void *)context);
732 *entry = phy_addr | 1;
733 __iommu_flush_cache(iommu, entry, sizeof(*entry));
735 return &context[devfn];
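/*
 * Each root entry points to a 4KiB context table indexed by devfn.  In
 * scalable mode the root entry instead carries two half-size tables
 * (lower and upper context-table pointers), which is why
 * free_context_table() below also walks devfn 0x80 when
 * sm_supported(iommu).
 */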
738 static int iommu_dummy(struct device *dev)
740 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
743 static bool attach_deferred(struct device *dev)
745 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
749 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
750 * sub-hierarchy of a candidate PCI-PCI bridge
751 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
752 * @bridge: the candidate PCI-PCI bridge
754 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
757 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
759 struct pci_dev *pdev, *pbridge;
761 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
764 pdev = to_pci_dev(dev);
765 pbridge = to_pci_dev(bridge);
767 if (pbridge->subordinate &&
768 pbridge->subordinate->number <= pdev->bus->number &&
769 pbridge->subordinate->busn_res.end >= pdev->bus->number)
775 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
777 struct dmar_drhd_unit *drhd = NULL;
778 struct intel_iommu *iommu;
780 struct pci_dev *pdev = NULL;
784 if (iommu_dummy(dev))
787 if (dev_is_pci(dev)) {
788 struct pci_dev *pf_pdev;
790 pdev = pci_real_dma_dev(to_pci_dev(dev));
792 /* VFs aren't listed in scope tables; we need to look up
793 * the PF instead to find the IOMMU. */
794 pf_pdev = pci_physfn(pdev);
796 segment = pci_domain_nr(pdev->bus);
797 } else if (has_acpi_companion(dev))
798 dev = &ACPI_COMPANION(dev)->dev;
801 for_each_active_iommu(iommu, drhd) {
802 if (pdev && segment != drhd->segment)
805 for_each_active_dev_scope(drhd->devices,
806 drhd->devices_cnt, i, tmp) {
808 /* For a VF use its original BDF# not that of the PF
809 * which we used for the IOMMU lookup. Strictly speaking
810 * we could do this for all PCI devices; we only need to
811 * get the BDF# from the scope table for ACPI matches. */
812 if (pdev && pdev->is_virtfn)
815 *bus = drhd->devices[i].bus;
816 *devfn = drhd->devices[i].devfn;
820 if (is_downstream_to_pci_bridge(dev, tmp))
824 if (pdev && drhd->include_all) {
826 *bus = pdev->bus->number;
827 *devfn = pdev->devfn;
838 static void domain_flush_cache(struct dmar_domain *domain,
839 void *addr, int size)
841 if (!domain->iommu_coherency)
842 clflush_cache_range(addr, size);
845 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
847 struct context_entry *context;
851 spin_lock_irqsave(&iommu->lock, flags);
852 context = iommu_context_addr(iommu, bus, devfn, 0);
854 ret = context_present(context);
855 spin_unlock_irqrestore(&iommu->lock, flags);
859 static void free_context_table(struct intel_iommu *iommu)
863 struct context_entry *context;
865 spin_lock_irqsave(&iommu->lock, flags);
866 if (!iommu->root_entry) {
869 for (i = 0; i < ROOT_ENTRY_NR; i++) {
870 context = iommu_context_addr(iommu, i, 0, 0);
872 free_pgtable_page(context);
874 if (!sm_supported(iommu))
877 context = iommu_context_addr(iommu, i, 0x80, 0);
879 free_pgtable_page(context);
882 free_pgtable_page(iommu->root_entry);
883 iommu->root_entry = NULL;
885 spin_unlock_irqrestore(&iommu->lock, flags);
888 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
889 unsigned long pfn, int *target_level)
891 struct dma_pte *parent, *pte;
892 int level = agaw_to_level(domain->agaw);
895 BUG_ON(!domain->pgd);
897 if (!domain_pfn_supported(domain, pfn))
898 /* Address beyond IOMMU's addressing capabilities. */
901 parent = domain->pgd;
906 offset = pfn_level_offset(pfn, level);
907 pte = &parent[offset];
908 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
910 if (level == *target_level)
913 if (!dma_pte_present(pte)) {
916 tmp_page = alloc_pgtable_page(domain->nid);
921 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
922 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
923 if (domain_use_first_level(domain))
924 pteval |= DMA_FL_PTE_XD;
925 if (cmpxchg64(&pte->val, 0ULL, pteval))
926 /* Someone else set it while we were thinking; use theirs. */
927 free_pgtable_page(tmp_page);
929 domain_flush_cache(domain, pte, sizeof(*pte));
934 parent = phys_to_virt(dma_pte_addr(pte));
939 *target_level = level;
944 /* return the pte for this address at the given level */
945 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
947 int level, int *large_page)
949 struct dma_pte *parent, *pte;
950 int total = agaw_to_level(domain->agaw);
953 parent = domain->pgd;
954 while (level <= total) {
955 offset = pfn_level_offset(pfn, total);
956 pte = &parent[offset];
960 if (!dma_pte_present(pte)) {
965 if (dma_pte_superpage(pte)) {
970 parent = phys_to_virt(dma_pte_addr(pte));
976 /* clear last level pte; a tlb flush should follow */
977 static void dma_pte_clear_range(struct dmar_domain *domain,
978 unsigned long start_pfn,
979 unsigned long last_pfn)
981 unsigned int large_page;
982 struct dma_pte *first_pte, *pte;
984 BUG_ON(!domain_pfn_supported(domain, start_pfn));
985 BUG_ON(!domain_pfn_supported(domain, last_pfn));
986 BUG_ON(start_pfn > last_pfn);
988 /* we don't need lock here; nobody else touches the iova range */
991 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
993 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
998 start_pfn += lvl_to_nr_pages(large_page);
1000 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1002 domain_flush_cache(domain, first_pte,
1003 (void *)pte - (void *)first_pte);
1005 } while (start_pfn && start_pfn <= last_pfn);
1008 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1009 int retain_level, struct dma_pte *pte,
1010 unsigned long pfn, unsigned long start_pfn,
1011 unsigned long last_pfn)
1013 pfn = max(start_pfn, pfn);
1014 pte = &pte[pfn_level_offset(pfn, level)];
1017 unsigned long level_pfn;
1018 struct dma_pte *level_pte;
1020 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1023 level_pfn = pfn & level_mask(level);
1024 level_pte = phys_to_virt(dma_pte_addr(pte));
1027 dma_pte_free_level(domain, level - 1, retain_level,
1028 level_pte, level_pfn, start_pfn,
1033 * Free the page table if we're below the level we want to
1034 * retain and the range covers the entire table.
1036 if (level < retain_level && !(start_pfn > level_pfn ||
1037 last_pfn < level_pfn + level_size(level) - 1)) {
1039 domain_flush_cache(domain, pte, sizeof(*pte));
1040 free_pgtable_page(level_pte);
1043 pfn += level_size(level);
1044 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1048 * clear last level (leaf) ptes and free page table pages below the
1049 * level we wish to keep intact.
1051 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1052 unsigned long start_pfn,
1053 unsigned long last_pfn,
1056 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1057 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1058 BUG_ON(start_pfn > last_pfn);
1060 dma_pte_clear_range(domain, start_pfn, last_pfn);
1062 /* We don't need lock here; nobody else touches the iova range */
1063 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1064 domain->pgd, 0, start_pfn, last_pfn);
1067 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1068 free_pgtable_page(domain->pgd);
1073 /* When a page at a given level is being unlinked from its parent, we don't
1074 need to *modify* it at all. All we need to do is make a list of all the
1075 pages which can be freed just as soon as we've flushed the IOTLB and we
1076 know the hardware page-walk will no longer touch them.
1077 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1079 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1080 int level, struct dma_pte *pte,
1081 struct page *freelist)
1085 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1086 pg->freelist = freelist;
1092 pte = page_address(pg);
1094 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1095 freelist = dma_pte_list_pagetables(domain, level - 1,
1098 } while (!first_pte_in_page(pte));
1103 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1104 struct dma_pte *pte, unsigned long pfn,
1105 unsigned long start_pfn,
1106 unsigned long last_pfn,
1107 struct page *freelist)
1109 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1111 pfn = max(start_pfn, pfn);
1112 pte = &pte[pfn_level_offset(pfn, level)];
1115 unsigned long level_pfn;
1117 if (!dma_pte_present(pte))
1120 level_pfn = pfn & level_mask(level);
1122 /* If range covers entire pagetable, free it */
1123 if (start_pfn <= level_pfn &&
1124 last_pfn >= level_pfn + level_size(level) - 1) {
1125 /* These subordinate page tables are going away entirely. Don't
1126 bother to clear them; we're just going to *free* them. */
1127 if (level > 1 && !dma_pte_superpage(pte))
1128 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1134 } else if (level > 1) {
1135 /* Recurse down into a level that isn't *entirely* obsolete */
1136 freelist = dma_pte_clear_level(domain, level - 1,
1137 phys_to_virt(dma_pte_addr(pte)),
1138 level_pfn, start_pfn, last_pfn,
1142 pfn += level_size(level);
1143 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1146 domain_flush_cache(domain, first_pte,
1147 (void *)++last_pte - (void *)first_pte);
1152 /* We can't just free the pages because the IOMMU may still be walking
1153 the page tables, and may have cached the intermediate levels. The
1154 pages can only be freed after the IOTLB flush has been done. */
1155 static struct page *domain_unmap(struct dmar_domain *domain,
1156 unsigned long start_pfn,
1157 unsigned long last_pfn)
1159 struct page *freelist;
1161 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1162 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1163 BUG_ON(start_pfn > last_pfn);
1165 /* we don't need lock here; nobody else touches the iova range */
1166 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1167 domain->pgd, 0, start_pfn, last_pfn, NULL);
1170 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1171 struct page *pgd_page = virt_to_page(domain->pgd);
1172 pgd_page->freelist = freelist;
1173 freelist = pgd_page;
1181 static void dma_free_pagelist(struct page *freelist)
1185 while ((pg = freelist)) {
1186 freelist = pg->freelist;
1187 free_pgtable_page(page_address(pg));
1191 static void iova_entry_free(unsigned long data)
1193 struct page *freelist = (struct page *)data;
1195 dma_free_pagelist(freelist);
1198 /* iommu handling */
1199 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1201 struct root_entry *root;
1202 unsigned long flags;
1204 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1206 pr_err("Allocating root entry for %s failed\n",
1211 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1213 spin_lock_irqsave(&iommu->lock, flags);
1214 iommu->root_entry = root;
1215 spin_unlock_irqrestore(&iommu->lock, flags);
1220 static void iommu_set_root_entry(struct intel_iommu *iommu)
1226 addr = virt_to_phys(iommu->root_entry);
1227 if (sm_supported(iommu))
1228 addr |= DMA_RTADDR_SMT;
1230 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1231 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1233 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1235 /* Make sure the hardware completes it */
1236 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1237 readl, (sts & DMA_GSTS_RTPS), sts);
1239 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1242 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1247 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1250 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1251 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1253 /* Make sure the hardware completes it */
1254 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1255 readl, (!(val & DMA_GSTS_WBFS)), val);
1257 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1260 /* return value determines whether we need a write buffer flush */
1261 static void __iommu_flush_context(struct intel_iommu *iommu,
1262 u16 did, u16 source_id, u8 function_mask,
1269 case DMA_CCMD_GLOBAL_INVL:
1270 val = DMA_CCMD_GLOBAL_INVL;
1272 case DMA_CCMD_DOMAIN_INVL:
1273 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1275 case DMA_CCMD_DEVICE_INVL:
1276 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1277 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1282 val |= DMA_CCMD_ICC;
1284 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1285 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1287 /* Make sure the hardware completes it */
1288 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1289 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1291 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1294 /* return value determines whether we need a write buffer flush */
1295 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1296 u64 addr, unsigned int size_order, u64 type)
1298 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1299 u64 val = 0, val_iva = 0;
1303 case DMA_TLB_GLOBAL_FLUSH:
1304 /* global flush doesn't need to set IVA_REG */
1305 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1307 case DMA_TLB_DSI_FLUSH:
1308 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1310 case DMA_TLB_PSI_FLUSH:
1311 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1312 /* IH bit is passed in as part of address */
1313 val_iva = size_order | addr;
1318 /* Note: set drain read/write */
1321 * This is probably just meant to be extra secure. Looks like we can
1322 * ignore it without any impact.
1324 if (cap_read_drain(iommu->cap))
1325 val |= DMA_TLB_READ_DRAIN;
1327 if (cap_write_drain(iommu->cap))
1328 val |= DMA_TLB_WRITE_DRAIN;
1330 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1331 /* Note: Only uses first TLB reg currently */
1333 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1334 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1336 /* Make sure the hardware completes it */
1337 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1338 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1340 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1342 /* check IOTLB invalidation granularity */
1343 if (DMA_TLB_IAIG(val) == 0)
1344 pr_err("Flush IOTLB failed\n");
1345 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1346 pr_debug("TLB flush request %Lx, actual %Lx\n",
1347 (unsigned long long)DMA_TLB_IIRG(type),
1348 (unsigned long long)DMA_TLB_IAIG(val));
1351 static struct device_domain_info *
1352 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1355 struct device_domain_info *info;
1357 assert_spin_locked(&device_domain_lock);
1362 list_for_each_entry(info, &domain->devices, link)
1363 if (info->iommu == iommu && info->bus == bus &&
1364 info->devfn == devfn) {
1365 if (info->ats_supported && info->dev)
1373 static void domain_update_iotlb(struct dmar_domain *domain)
1375 struct device_domain_info *info;
1376 bool has_iotlb_device = false;
1378 assert_spin_locked(&device_domain_lock);
1380 list_for_each_entry(info, &domain->devices, link) {
1381 struct pci_dev *pdev;
1383 if (!info->dev || !dev_is_pci(info->dev))
1386 pdev = to_pci_dev(info->dev);
1387 if (pdev->ats_enabled) {
1388 has_iotlb_device = true;
1393 domain->has_iotlb_device = has_iotlb_device;
1396 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1398 struct pci_dev *pdev;
1400 assert_spin_locked(&device_domain_lock);
1402 if (!info || !dev_is_pci(info->dev))
1405 pdev = to_pci_dev(info->dev);
1406 /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1407 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1408 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1409 * reserved, which should be set to 0.
1411 if (!ecap_dit(info->iommu->ecap))
1414 struct pci_dev *pf_pdev;
1416 /* pdev will be returned if device is not a vf */
1417 pf_pdev = pci_physfn(pdev);
1418 info->pfsid = pci_dev_id(pf_pdev);
1421 #ifdef CONFIG_INTEL_IOMMU_SVM
1422 /* The PCIe spec, in its wisdom, declares that the behaviour of
1423 the device if you enable PASID support after ATS support is
1424 undefined. So always enable PASID support on devices which
1425 have it, even if we can't yet know if we're ever going to
1427 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1428 info->pasid_enabled = 1;
1430 if (info->pri_supported &&
1431 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1432 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1433 info->pri_enabled = 1;
1435 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1436 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1437 info->ats_enabled = 1;
1438 domain_update_iotlb(info->domain);
1439 info->ats_qdep = pci_ats_queue_depth(pdev);
1443 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1445 struct pci_dev *pdev;
1447 assert_spin_locked(&device_domain_lock);
1449 if (!dev_is_pci(info->dev))
1452 pdev = to_pci_dev(info->dev);
1454 if (info->ats_enabled) {
1455 pci_disable_ats(pdev);
1456 info->ats_enabled = 0;
1457 domain_update_iotlb(info->domain);
1459 #ifdef CONFIG_INTEL_IOMMU_SVM
1460 if (info->pri_enabled) {
1461 pci_disable_pri(pdev);
1462 info->pri_enabled = 0;
1464 if (info->pasid_enabled) {
1465 pci_disable_pasid(pdev);
1466 info->pasid_enabled = 0;
1471 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1472 u64 addr, unsigned mask)
1475 unsigned long flags;
1476 struct device_domain_info *info;
1478 if (!domain->has_iotlb_device)
1481 spin_lock_irqsave(&device_domain_lock, flags);
1482 list_for_each_entry(info, &domain->devices, link) {
1483 if (!info->ats_enabled)
1486 sid = info->bus << 8 | info->devfn;
1487 qdep = info->ats_qdep;
1488 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1491 spin_unlock_irqrestore(&device_domain_lock, flags);
1494 static void domain_flush_piotlb(struct intel_iommu *iommu,
1495 struct dmar_domain *domain,
1496 u64 addr, unsigned long npages, bool ih)
1498 u16 did = domain->iommu_did[iommu->seq_id];
1500 if (domain->default_pasid)
1501 qi_flush_piotlb(iommu, did, domain->default_pasid,
1504 if (!list_empty(&domain->devices))
1505 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1508 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1509 struct dmar_domain *domain,
1510 unsigned long pfn, unsigned int pages,
1513 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1514 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1515 u16 did = domain->iommu_did[iommu->seq_id];
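/*
 * Example: flushing 3 pages rounds up to 4, so mask is 2 and the PSI
 * covers a naturally aligned 4-page (16KiB) region containing the
 * requested range.
 */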
1522 if (domain_use_first_level(domain)) {
1523 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1526 * Fallback to domain selective flush if no PSI support or
1527 * the size is too big. PSI requires page size to be 2 ^ x,
1528 * and the base address is naturally aligned to the size.
1530 if (!cap_pgsel_inv(iommu->cap) ||
1531 mask > cap_max_amask_val(iommu->cap))
1532 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1535 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1540 * In caching mode, changes of pages from non-present to present require
1541 * flush. However, device IOTLB doesn't need to be flushed in this case.
1543 if (!cap_caching_mode(iommu->cap) || !map)
1544 iommu_flush_dev_iotlb(domain, addr, mask);
1547 /* Notification for newly created mappings */
1548 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1549 struct dmar_domain *domain,
1550 unsigned long pfn, unsigned int pages)
1553 * It's a non-present to present mapping. Only flush if caching mode
1556 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1557 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1559 iommu_flush_write_buffer(iommu);
1562 static void iommu_flush_iova(struct iova_domain *iovad)
1564 struct dmar_domain *domain;
1567 domain = container_of(iovad, struct dmar_domain, iovad);
1569 for_each_domain_iommu(idx, domain) {
1570 struct intel_iommu *iommu = g_iommus[idx];
1571 u16 did = domain->iommu_did[iommu->seq_id];
1573 if (domain_use_first_level(domain))
1574 domain_flush_piotlb(iommu, domain, 0, -1, 0);
1576 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1579 if (!cap_caching_mode(iommu->cap))
1580 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1581 0, MAX_AGAW_PFN_WIDTH);
1585 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1588 unsigned long flags;
1590 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1593 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1594 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1595 pmen &= ~DMA_PMEN_EPM;
1596 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1598 /* wait for the protected region status bit to clear */
1599 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1600 readl, !(pmen & DMA_PMEN_PRS), pmen);
1602 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1605 static void iommu_enable_translation(struct intel_iommu *iommu)
1608 unsigned long flags;
1610 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1611 iommu->gcmd |= DMA_GCMD_TE;
1612 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1614 /* Make sure the hardware completes it */
1615 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1616 readl, (sts & DMA_GSTS_TES), sts);
1618 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1621 static void iommu_disable_translation(struct intel_iommu *iommu)
1626 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1627 iommu->gcmd &= ~DMA_GCMD_TE;
1628 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1630 /* Make sure the hardware completes it */
1631 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1632 readl, (!(sts & DMA_GSTS_TES)), sts);
1634 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1637 static int iommu_init_domains(struct intel_iommu *iommu)
1639 u32 ndomains, nlongs;
1642 ndomains = cap_ndoms(iommu->cap);
1643 pr_debug("%s: Number of Domains supported <%d>\n",
1644 iommu->name, ndomains);
1645 nlongs = BITS_TO_LONGS(ndomains);
1647 spin_lock_init(&iommu->lock);
1649 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1650 if (!iommu->domain_ids) {
1651 pr_err("%s: Allocating domain id array failed\n",
1656 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1657 iommu->domains = kzalloc(size, GFP_KERNEL);
1659 if (iommu->domains) {
1660 size = 256 * sizeof(struct dmar_domain *);
1661 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1664 if (!iommu->domains || !iommu->domains[0]) {
1665 pr_err("%s: Allocating domain array failed\n",
1667 kfree(iommu->domain_ids);
1668 kfree(iommu->domains);
1669 iommu->domain_ids = NULL;
1670 iommu->domains = NULL;
1675 * If Caching mode is set, then invalid translations are tagged
1676 * with domain-id 0, hence we need to pre-allocate it. We also
1677 * use domain-id 0 as a marker for non-allocated domain-id, so
1678 * make sure it is not used for a real domain.
1680 set_bit(0, iommu->domain_ids);
1683 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1684 * entry for first-level or pass-through translation modes should
1685 * be programmed with a domain id different from those used for
1686 * second-level or nested translation. We reserve a domain id for
1689 if (sm_supported(iommu))
1690 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1695 static void disable_dmar_iommu(struct intel_iommu *iommu)
1697 struct device_domain_info *info, *tmp;
1698 unsigned long flags;
1700 if (!iommu->domains || !iommu->domain_ids)
1703 spin_lock_irqsave(&device_domain_lock, flags);
1704 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1705 if (info->iommu != iommu)
1708 if (!info->dev || !info->domain)
1711 __dmar_remove_one_dev_info(info);
1713 spin_unlock_irqrestore(&device_domain_lock, flags);
1715 if (iommu->gcmd & DMA_GCMD_TE)
1716 iommu_disable_translation(iommu);
1719 static void free_dmar_iommu(struct intel_iommu *iommu)
1721 if ((iommu->domains) && (iommu->domain_ids)) {
1722 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1725 for (i = 0; i < elems; i++)
1726 kfree(iommu->domains[i]);
1727 kfree(iommu->domains);
1728 kfree(iommu->domain_ids);
1729 iommu->domains = NULL;
1730 iommu->domain_ids = NULL;
1733 g_iommus[iommu->seq_id] = NULL;
1735 /* free context mapping */
1736 free_context_table(iommu);
1738 #ifdef CONFIG_INTEL_IOMMU_SVM
1739 if (pasid_supported(iommu)) {
1740 if (ecap_prs(iommu->ecap))
1741 intel_svm_finish_prq(iommu);
1743 if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
1744 ioasid_unregister_allocator(&iommu->pasid_allocator);
1750 * Check and return whether first level is used by default for
1753 static bool first_level_by_default(void)
1755 struct dmar_drhd_unit *drhd;
1756 struct intel_iommu *iommu;
1757 static int first_level_support = -1;
1759 if (likely(first_level_support != -1))
1760 return first_level_support;
1762 first_level_support = 1;
1765 for_each_active_iommu(iommu, drhd) {
1766 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1767 first_level_support = 0;
1773 return first_level_support;
1776 static struct dmar_domain *alloc_domain(int flags)
1778 struct dmar_domain *domain;
1780 domain = alloc_domain_mem();
1784 memset(domain, 0, sizeof(*domain));
1785 domain->nid = NUMA_NO_NODE;
1786 domain->flags = flags;
1787 if (first_level_by_default())
1788 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1789 domain->has_iotlb_device = false;
1790 INIT_LIST_HEAD(&domain->devices);
1795 /* Must be called with iommu->lock held */
1796 static int domain_attach_iommu(struct dmar_domain *domain,
1797 struct intel_iommu *iommu)
1799 unsigned long ndomains;
1802 assert_spin_locked(&device_domain_lock);
1803 assert_spin_locked(&iommu->lock);
1805 domain->iommu_refcnt[iommu->seq_id] += 1;
1806 domain->iommu_count += 1;
1807 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1808 ndomains = cap_ndoms(iommu->cap);
1809 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1811 if (num >= ndomains) {
1812 pr_err("%s: No free domain ids\n", iommu->name);
1813 domain->iommu_refcnt[iommu->seq_id] -= 1;
1814 domain->iommu_count -= 1;
1818 set_bit(num, iommu->domain_ids);
1819 set_iommu_domain(iommu, num, domain);
1821 domain->iommu_did[iommu->seq_id] = num;
1822 domain->nid = iommu->node;
1824 domain_update_iommu_cap(domain);
1830 static int domain_detach_iommu(struct dmar_domain *domain,
1831 struct intel_iommu *iommu)
1835 assert_spin_locked(&device_domain_lock);
1836 assert_spin_locked(&iommu->lock);
1838 domain->iommu_refcnt[iommu->seq_id] -= 1;
1839 count = --domain->iommu_count;
1840 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1841 num = domain->iommu_did[iommu->seq_id];
1842 clear_bit(num, iommu->domain_ids);
1843 set_iommu_domain(iommu, num, NULL);
1845 domain_update_iommu_cap(domain);
1846 domain->iommu_did[iommu->seq_id] = 0;
1852 static struct iova_domain reserved_iova_list;
1853 static struct lock_class_key reserved_rbtree_key;
1855 static int dmar_init_reserved_ranges(void)
1857 struct pci_dev *pdev = NULL;
1861 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1863 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1864 &reserved_rbtree_key);
1866 /* IOAPIC ranges shouldn't be accessed by DMA */
1867 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1868 IOVA_PFN(IOAPIC_RANGE_END));
1870 pr_err("Reserve IOAPIC range failed\n");
1874 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1875 for_each_pci_dev(pdev) {
1878 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1879 r = &pdev->resource[i];
1880 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1882 iova = reserve_iova(&reserved_iova_list,
1886 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1894 static inline int guestwidth_to_adjustwidth(int gaw)
1897 int r = (gaw - 12) % 9;
1908 static void domain_exit(struct dmar_domain *domain)
1911 /* Remove associated devices and clear attached or cached domains */
1912 domain_remove_dev_info(domain);
1915 if (domain->domain.type == IOMMU_DOMAIN_DMA)
1916 put_iova_domain(&domain->iovad);
1919 struct page *freelist;
1921 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1922 dma_free_pagelist(freelist);
1925 free_domain_mem(domain);
1929 * Get the PASID directory size for scalable mode context entry.
1930 * Value of X in the PDTS field of a scalable mode context entry
1931 * indicates PASID directory with 2^(X + 7) entries.
1933 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1937 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1938 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1946 * Set the RID_PASID field of a scalable mode context entry. The
1947 * IOMMU hardware will use the PASID value set in this field for
1948 * DMA translations of DMA requests without PASID.
1951 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1953 context->hi |= pasid & ((1 << 20) - 1);
1954 context->hi |= (1 << 20);
1958 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1961 static inline void context_set_sm_dte(struct context_entry *context)
1963 context->lo |= (1 << 2);
1967 * Set the PRE(Page Request Enable) field of a scalable mode context
1970 static inline void context_set_sm_pre(struct context_entry *context)
1972 context->lo |= (1 << 4);
1975 /* Convert value to context PASID directory size field coding. */
1976 #define context_pdts(pds) (((pds) & 0x7) << 9)
1978 static int domain_context_mapping_one(struct dmar_domain *domain,
1979 struct intel_iommu *iommu,
1980 struct pasid_table *table,
1983 u16 did = domain->iommu_did[iommu->seq_id];
1984 int translation = CONTEXT_TT_MULTI_LEVEL;
1985 struct device_domain_info *info = NULL;
1986 struct context_entry *context;
1987 unsigned long flags;
1992 if (hw_pass_through && domain_type_is_si(domain))
1993 translation = CONTEXT_TT_PASS_THROUGH;
1995 pr_debug("Set context mapping for %02x:%02x.%d\n",
1996 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1998 BUG_ON(!domain->pgd);
2000 spin_lock_irqsave(&device_domain_lock, flags);
2001 spin_lock(&iommu->lock);
2004 context = iommu_context_addr(iommu, bus, devfn, 1);
2009 if (context_present(context))
2013 * For kdump cases, old valid entries may be cached due to the
2014 * in-flight DMA and copied pgtable, but there is no unmapping
2015 * behaviour for them, thus we need an explicit cache flush for
2016 * the newly-mapped device. For kdump, at this point, the device
2017 * is supposed to finish reset at its driver probe stage, so no
2018 * in-flight DMA will exist, and we don't need to worry anymore
2021 if (context_copied(context)) {
2022 u16 did_old = context_domain_id(context);
2024 if (did_old < cap_ndoms(iommu->cap)) {
2025 iommu->flush.flush_context(iommu, did_old,
2026 (((u16)bus) << 8) | devfn,
2027 DMA_CCMD_MASK_NOBIT,
2028 DMA_CCMD_DEVICE_INVL);
2029 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2034 context_clear_entry(context);
2036 if (sm_supported(iommu)) {
2041 /* Setup the PASID DIR pointer: */
2042 pds = context_get_sm_pds(table);
2043 context->lo = (u64)virt_to_phys(table->table) |
2046 /* Setup the RID_PASID field: */
2047 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2050 * Setup the Device-TLB enable bit and Page request
2053 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2054 if (info && info->ats_supported)
2055 context_set_sm_dte(context);
2056 if (info && info->pri_supported)
2057 context_set_sm_pre(context);
2059 struct dma_pte *pgd = domain->pgd;
2062 context_set_domain_id(context, did);
2064 if (translation != CONTEXT_TT_PASS_THROUGH) {
2066 * Skip top levels of page tables for iommu which has
2067 * less agaw than default. Unnecessary for PT mode.
2069 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2071 pgd = phys_to_virt(dma_pte_addr(pgd));
2072 if (!dma_pte_present(pgd))
2076 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2077 if (info && info->ats_supported)
2078 translation = CONTEXT_TT_DEV_IOTLB;
2080 translation = CONTEXT_TT_MULTI_LEVEL;
2082 context_set_address_root(context, virt_to_phys(pgd));
2083 context_set_address_width(context, agaw);
2086 * In pass through mode, AW must be programmed to
2087 * indicate the largest AGAW value supported by
2088 * hardware. And ASR is ignored by hardware.
2090 context_set_address_width(context, iommu->msagaw);
2093 context_set_translation_type(context, translation);
2096 context_set_fault_enable(context);
2097 context_set_present(context);
2098 domain_flush_cache(domain, context, sizeof(*context));
2101 * It's a non-present to present mapping. If hardware doesn't cache
2102 * non-present entries we only need to flush the write-buffer. If it
2103 * _does_ cache non-present entries, then it does so in the special
2104 * domain #0, which we have to flush:
2106 if (cap_caching_mode(iommu->cap)) {
2107 iommu->flush.flush_context(iommu, 0,
2108 (((u16)bus) << 8) | devfn,
2109 DMA_CCMD_MASK_NOBIT,
2110 DMA_CCMD_DEVICE_INVL);
2111 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2113 iommu_flush_write_buffer(iommu);
2115 iommu_enable_dev_iotlb(info);
2120 spin_unlock(&iommu->lock);
2121 spin_unlock_irqrestore(&device_domain_lock, flags);
2126 struct domain_context_mapping_data {
2127 struct dmar_domain *domain;
2128 struct intel_iommu *iommu;
2129 struct pasid_table *table;
2132 static int domain_context_mapping_cb(struct pci_dev *pdev,
2133 u16 alias, void *opaque)
2135 struct domain_context_mapping_data *data = opaque;
2137 return domain_context_mapping_one(data->domain, data->iommu,
2138 data->table, PCI_BUS_NUM(alias),
2143 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2145 struct domain_context_mapping_data data;
2146 struct pasid_table *table;
2147 struct intel_iommu *iommu;
2150 iommu = device_to_iommu(dev, &bus, &devfn);
2154 table = intel_pasid_get_table(dev);
2156 if (!dev_is_pci(dev))
2157 return domain_context_mapping_one(domain, iommu, table,
2160 data.domain = domain;
2164 return pci_for_each_dma_alias(to_pci_dev(dev),
2165 &domain_context_mapping_cb, &data);
2168 static int domain_context_mapped_cb(struct pci_dev *pdev,
2169 u16 alias, void *opaque)
2171 struct intel_iommu *iommu = opaque;
2173 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2176 static int domain_context_mapped(struct device *dev)
2178 struct intel_iommu *iommu;
2181 iommu = device_to_iommu(dev, &bus, &devfn);
2185 if (!dev_is_pci(dev))
2186 return device_context_mapped(iommu, bus, devfn);
2188 return !pci_for_each_dma_alias(to_pci_dev(dev),
2189 domain_context_mapped_cb, iommu);
2192 /* Returns a number of VTD pages, but aligned to MM page size */
2193 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2196 host_addr &= ~PAGE_MASK;
2197 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
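/*
 * Example (4KiB MM pages): host_addr 0x1003 with size 0x2000 touches
 * VT-d page frames 0x1, 0x2 and 0x3, and PAGE_ALIGN(0x3 + 0x2000) >> 12
 * indeed yields 3.
 */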
2200 /* Return largest possible superpage level for a given mapping */
2201 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2202 unsigned long iov_pfn,
2203 unsigned long phy_pfn,
2204 unsigned long pages)
2206 int support, level = 1;
2207 unsigned long pfnmerge;
2209 support = domain->iommu_superpage;
2211 /* To use a large page, the virtual *and* physical addresses
2212 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2213 of them will mean we have to use smaller pages. So just
2214 merge them and check both at once. */
2215 pfnmerge = iov_pfn | phy_pfn;
2217 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2218 pages >>= VTD_STRIDE_SHIFT;
2221 pfnmerge >>= VTD_STRIDE_SHIFT;
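/*
 * Example: when both iov_pfn and phy_pfn are 2MiB aligned (low 9 bits
 * clear), at least 512 pages are being mapped, and the IOMMU advertises
 * superpage support, hardware_largepage_caps() returns level 2, letting
 * __domain_mapping() install 2MiB superpages.
 */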
2228 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2229 struct scatterlist *sg, unsigned long phys_pfn,
2230 unsigned long nr_pages, int prot)
2232 struct dma_pte *first_pte = NULL, *pte = NULL;
2233 phys_addr_t uninitialized_var(pteval);
2234 unsigned long sg_res = 0;
2235 unsigned int largepage_lvl = 0;
2236 unsigned long lvl_pages = 0;
2239 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2241 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2244 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2245 if (domain_use_first_level(domain))
2246 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD;
2250 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2253 while (nr_pages > 0) {
2257 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2259 sg_res = aligned_nrpages(sg->offset, sg->length);
2260 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2261 sg->dma_length = sg->length;
2262 pteval = (sg_phys(sg) - pgoff) | attr;
2263 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2267 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2269 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2272 /* It is a large page */
2273 if (largepage_lvl > 1) {
2274 unsigned long nr_superpages, end_pfn;
2276 pteval |= DMA_PTE_LARGE_PAGE;
2277 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2279 nr_superpages = sg_res / lvl_pages;
2280 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2283 * Ensure that old small page tables are
2284 * removed to make room for superpage(s).
2285 * We're adding new large pages, so make sure
2286 * we don't remove their parent tables.
2288 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2291 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2295 /* We don't need lock here, nobody else
2296 * touches the iova range
2298 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2300 static int dumps = 5;
2301 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2302 iov_pfn, tmp, (unsigned long long)pteval);
2305 debug_dma_dump_mappings(NULL);
2310 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2312 BUG_ON(nr_pages < lvl_pages);
2313 BUG_ON(sg_res < lvl_pages);
2315 nr_pages -= lvl_pages;
2316 iov_pfn += lvl_pages;
2317 phys_pfn += lvl_pages;
2318 pteval += lvl_pages * VTD_PAGE_SIZE;
2319 sg_res -= lvl_pages;
2321 /* If the next PTE would be the first in a new page, then we
2322 need to flush the cache on the entries we've just written.
2323 And then we'll need to recalculate 'pte', so clear it and
2324 let it get set again in the if (!pte) block above.
2326 If we're done (!nr_pages) we need to flush the cache too.
2328 Also if we've been setting superpages, we may need to
2329 recalculate 'pte' and switch back to smaller pages for the
2330 end of the mapping, if the trailing size is not enough to
2331 use another superpage (i.e. sg_res < lvl_pages). */
2333 if (!nr_pages || first_pte_in_page(pte) ||
2334 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2335 domain_flush_cache(domain, first_pte,
2336 (void *)pte - (void *)first_pte);
2340 if (!sg_res && nr_pages)
2346 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2347 struct scatterlist *sg, unsigned long phys_pfn,
2348 unsigned long nr_pages, int prot)
2351 struct intel_iommu *iommu;
2353 /* Do the real mapping first */
2354 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2358 for_each_domain_iommu(iommu_id, domain) {
2359 iommu = g_iommus[iommu_id];
2360 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2366 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2367 struct scatterlist *sg, unsigned long nr_pages,
2370 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2373 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2374 unsigned long phys_pfn, unsigned long nr_pages,
2377 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2380 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2382 unsigned long flags;
2383 struct context_entry *context;
2389 spin_lock_irqsave(&iommu->lock, flags);
2390 context = iommu_context_addr(iommu, bus, devfn, 0);
2392 spin_unlock_irqrestore(&iommu->lock, flags);
2395 did_old = context_domain_id(context);
2396 context_clear_entry(context);
2397 __iommu_flush_cache(iommu, context, sizeof(*context));
2398 spin_unlock_irqrestore(&iommu->lock, flags);
2399 iommu->flush.flush_context(iommu,
2401 (((u16)bus) << 8) | devfn,
2402 DMA_CCMD_MASK_NOBIT,
2403 DMA_CCMD_DEVICE_INVL);
2404 iommu->flush.flush_iotlb(iommu,
2411 static inline void unlink_domain_info(struct device_domain_info *info)
2413 assert_spin_locked(&device_domain_lock);
2414 list_del(&info->link);
2415 list_del(&info->global);
2417 info->dev->archdata.iommu = NULL;
2420 static void domain_remove_dev_info(struct dmar_domain *domain)
2422 struct device_domain_info *info, *tmp;
2423 unsigned long flags;
2425 spin_lock_irqsave(&device_domain_lock, flags);
2426 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2427 __dmar_remove_one_dev_info(info);
2428 spin_unlock_irqrestore(&device_domain_lock, flags);
2431 struct dmar_domain *find_domain(struct device *dev)
2433 struct device_domain_info *info;
2435 if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
2438 /* No lock here, assumes no domain exit in normal case */
2439 info = get_domain_info(dev);
2441 return info->domain;
2446 static void do_deferred_attach(struct device *dev)
2448 struct iommu_domain *domain;
2450 dev->archdata.iommu = NULL;
2451 domain = iommu_get_domain_for_dev(dev);
2453 intel_iommu_attach_device(domain, dev);
2456 static inline struct device_domain_info *
2457 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2459 struct device_domain_info *info;
2461 list_for_each_entry(info, &device_domain_list, global)
2462 if (info->segment == segment && info->bus == bus &&
2463 info->devfn == devfn)
2469 static int domain_setup_first_level(struct intel_iommu *iommu,
2470 struct dmar_domain *domain,
2474 int flags = PASID_FLAG_SUPERVISOR_MODE;
2475 struct dma_pte *pgd = domain->pgd;
2479 * Skip top levels of page tables for an iommu which has
2480 * less agaw than the default. Unnecessary for PT mode.
2482 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2483 pgd = phys_to_virt(dma_pte_addr(pgd));
2484 if (!dma_pte_present(pgd))
2488 level = agaw_to_level(agaw);
2489 if (level != 4 && level != 5)
2492 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2494 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2495 domain->iommu_did[iommu->seq_id],
2499 static bool dev_is_real_dma_subdevice(struct device *dev)
2501 return dev && dev_is_pci(dev) &&
2502 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
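/*
 * Illustrative note on the level check in domain_setup_first_level() above,
 * derived from the AGAW helpers near the top of the file (no new behaviour):
 *   level 4 corresponds to agaw 2, i.e. 30 + 2 * 9 == 48-bit input addresses
 *   level 5 corresponds to agaw 3, i.e. 30 + 3 * 9 == 57-bit input addresses
 * so first-level (scalable mode) translation is only set up for 4-level or
 * 5-level page tables.
 */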
2505 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2508 struct dmar_domain *domain)
2510 struct dmar_domain *found = NULL;
2511 struct device_domain_info *info;
2512 unsigned long flags;
2515 info = alloc_devinfo_mem();
2519 if (!dev_is_real_dma_subdevice(dev)) {
2521 info->devfn = devfn;
2522 info->segment = iommu->segment;
2524 struct pci_dev *pdev = to_pci_dev(dev);
2526 info->bus = pdev->bus->number;
2527 info->devfn = pdev->devfn;
2528 info->segment = pci_domain_nr(pdev->bus);
2531 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2532 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2535 info->domain = domain;
2536 info->iommu = iommu;
2537 info->pasid_table = NULL;
2538 info->auxd_enabled = 0;
2539 INIT_LIST_HEAD(&info->auxiliary_domains);
2541 if (dev && dev_is_pci(dev)) {
2542 struct pci_dev *pdev = to_pci_dev(info->dev);
2544 if (ecap_dev_iotlb_support(iommu->ecap) &&
2545 pci_ats_supported(pdev) &&
2546 dmar_find_matched_atsr_unit(pdev))
2547 info->ats_supported = 1;
2549 if (sm_supported(iommu)) {
2550 if (pasid_supported(iommu)) {
2551 int features = pci_pasid_features(pdev);
2553 info->pasid_supported = features | 1;
2556 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2557 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2558 info->pri_supported = 1;
2562 spin_lock_irqsave(&device_domain_lock, flags);
2564 found = find_domain(dev);
2567 struct device_domain_info *info2;
2568 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2571 found = info2->domain;
2577 spin_unlock_irqrestore(&device_domain_lock, flags);
2578 free_devinfo_mem(info);
2579 /* Caller must free the original domain */
2583 spin_lock(&iommu->lock);
2584 ret = domain_attach_iommu(domain, iommu);
2585 spin_unlock(&iommu->lock);
2588 spin_unlock_irqrestore(&device_domain_lock, flags);
2589 free_devinfo_mem(info);
2593 list_add(&info->link, &domain->devices);
2594 list_add(&info->global, &device_domain_list);
2596 dev->archdata.iommu = info;
2597 spin_unlock_irqrestore(&device_domain_lock, flags);
2599 /* PASID table is mandatory for a PCI device in scalable mode. */
2600 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2601 ret = intel_pasid_alloc_table(dev);
2603 dev_err(dev, "PASID table allocation failed\n");
2604 dmar_remove_one_dev_info(dev);
2608 /* Set up the PASID entry for requests without PASID: */
2609 spin_lock(&iommu->lock);
2610 if (hw_pass_through && domain_type_is_si(domain))
2611 ret = intel_pasid_setup_pass_through(iommu, domain,
2612 dev, PASID_RID2PASID);
2613 else if (domain_use_first_level(domain))
2614 ret = domain_setup_first_level(iommu, domain, dev,
2617 ret = intel_pasid_setup_second_level(iommu, domain,
2618 dev, PASID_RID2PASID);
2619 spin_unlock(&iommu->lock);
2621 dev_err(dev, "Setup RID2PASID failed\n");
2622 dmar_remove_one_dev_info(dev);
2627 if (dev && domain_context_mapping(domain, dev)) {
2628 dev_err(dev, "Domain context map failed\n");
2629 dmar_remove_one_dev_info(dev);
2636 static int iommu_domain_identity_map(struct dmar_domain *domain,
2637 unsigned long first_vpfn,
2638 unsigned long last_vpfn)
2641 * The RMRR range might overlap with physical memory; clear any existing mapping first.
2644 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2646 return __domain_mapping(domain, first_vpfn, NULL,
2647 first_vpfn, last_vpfn - first_vpfn + 1,
2648 DMA_PTE_READ|DMA_PTE_WRITE);
2651 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2653 static int __init si_domain_init(int hw)
2655 struct dmar_rmrr_unit *rmrr;
2659 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2663 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2664 domain_exit(si_domain);
2671 for_each_online_node(nid) {
2672 unsigned long start_pfn, end_pfn;
2675 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2676 ret = iommu_domain_identity_map(si_domain,
2677 mm_to_dma_pfn(start_pfn),
2678 mm_to_dma_pfn(end_pfn));
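/*
 * Worked example for the call above (values made up): a node spanning MM
 * pfns [0x100000, 0x140000) becomes
 *   iommu_domain_identity_map(si_domain, mm_to_dma_pfn(0x100000),
 *                             mm_to_dma_pfn(0x140000));
 * i.e. a 1:1 mapping of that 1GiB of RAM into the static identity domain.
 */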
2685 * Identity map the RMRRs so that devices with RMRRs could also use the si_domain.
2688 for_each_rmrr_units(rmrr) {
2689 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2691 unsigned long long start = rmrr->base_address;
2692 unsigned long long end = rmrr->end_address;
2694 if (WARN_ON(end < start ||
2695 end >> agaw_to_width(si_domain->agaw)))
2698 ret = iommu_domain_identity_map(si_domain, start, end);
2707 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2709 struct dmar_domain *ndomain;
2710 struct intel_iommu *iommu;
2713 iommu = device_to_iommu(dev, &bus, &devfn);
2717 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2718 if (ndomain != domain)
2724 static bool device_has_rmrr(struct device *dev)
2726 struct dmar_rmrr_unit *rmrr;
2731 for_each_rmrr_units(rmrr) {
2733 * Return TRUE if this RMRR contains the device that is passed in.
2736 for_each_active_dev_scope(rmrr->devices,
2737 rmrr->devices_cnt, i, tmp)
2739 is_downstream_to_pci_bridge(dev, tmp)) {
2749 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2750 * is relaxable (i.e. it is allowed to be left unenforced under some conditions)
2751 * @dev: device handle
2753 * We assume that PCI USB devices with RMRRs have them largely
2754 * for historical reasons and that the RMRR space is not actively used post
2755 * boot. This exclusion may change if vendors begin to abuse it.
2757 * The same exception is made for graphics devices, with the requirement that
2758 * any use of the RMRR regions will be torn down before assigning the device to a guest.
2761 * Return: true if the RMRR is relaxable, false otherwise
2763 static bool device_rmrr_is_relaxable(struct device *dev)
2765 struct pci_dev *pdev;
2767 if (!dev_is_pci(dev))
2770 pdev = to_pci_dev(dev);
2771 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2778 * There are a couple cases where we need to restrict the functionality of
2779 * devices associated with RMRRs. The first is when evaluating a device for
2780 * identity mapping because problems exist when devices are moved in and out
2781 * of domains and their respective RMRR information is lost. This means that
2782 * a device with associated RMRRs will never be in a "passthrough" domain.
2783 * The second is use of the device through the IOMMU API. This interface
2784 * expects to have full control of the IOVA space for the device. We cannot
2785 * satisfy both the requirement that RMRR access is maintained and have an
2786 * unencumbered IOVA space. We also have no ability to quiesce the device's
2787 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2788 * We therefore prevent devices associated with an RMRR from participating in
2789 * the IOMMU API, which eliminates them from device assignment.
2791 * In both cases, devices which have relaxable RMRRs are not concerned by this
2792 * restriction. See device_rmrr_is_relaxable comment.
2794 static bool device_is_rmrr_locked(struct device *dev)
2796 if (!device_has_rmrr(dev))
2799 if (device_rmrr_is_relaxable(dev))
2806 * Return the required default domain type for a specific device.
2808 * @dev: the device to query
2812 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2813 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2814 * - 0: both identity and dynamic domains work for this device
2816 static int device_def_domain_type(struct device *dev)
2818 if (dev_is_pci(dev)) {
2819 struct pci_dev *pdev = to_pci_dev(dev);
2822 * Prevent any device marked as untrusted from getting
2823 * placed into the statically identity mapping domain.
2825 if (pdev->untrusted)
2826 return IOMMU_DOMAIN_DMA;
2828 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2829 return IOMMU_DOMAIN_IDENTITY;
2831 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2832 return IOMMU_DOMAIN_IDENTITY;
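/*
 * Summary of the checks above (restating the code, not additional policy):
 * an untrusted PCI device always gets IOMMU_DOMAIN_DMA; Azalia audio and
 * graphics devices get IOMMU_DOMAIN_IDENTITY only if the corresponding
 * IDENTMAP_AZALIA / IDENTMAP_GFX bit was set at boot; any other device falls
 * through and returns 0, meaning either domain type works for it.
 */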
2838 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2841 * Start from a sane iommu hardware state.
2842 * If queued invalidation was already initialized by us
2843 * (for example, while enabling interrupt remapping), then
2844 * things are already rolling from a sane state.
2848 * Clear any previous faults.
2850 dmar_fault(-1, iommu);
2852 * Disable queued invalidation if supported and already enabled
2853 * before OS handover.
2855 dmar_disable_qi(iommu);
2858 if (dmar_enable_qi(iommu)) {
2860 * Queued Invalidate not enabled, use Register Based Invalidate
2862 iommu->flush.flush_context = __iommu_flush_context;
2863 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2864 pr_info("%s: Using Register based invalidation\n",
2867 iommu->flush.flush_context = qi_flush_context;
2868 iommu->flush.flush_iotlb = qi_flush_iotlb;
2869 pr_info("%s: Using Queued invalidation\n", iommu->name);
2873 static int copy_context_table(struct intel_iommu *iommu,
2874 struct root_entry *old_re,
2875 struct context_entry **tbl,
2878 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2879 struct context_entry *new_ce = NULL, ce;
2880 struct context_entry *old_ce = NULL;
2881 struct root_entry re;
2882 phys_addr_t old_ce_phys;
2884 tbl_idx = ext ? bus * 2 : bus;
2885 memcpy(&re, old_re, sizeof(re));
2887 for (devfn = 0; devfn < 256; devfn++) {
2888 /* First calculate the correct index */
2889 idx = (ext ? devfn * 2 : devfn) % 256;
2892 /* First save what we may have and clean up */
2894 tbl[tbl_idx] = new_ce;
2895 __iommu_flush_cache(iommu, new_ce,
2905 old_ce_phys = root_entry_lctp(&re);
2907 old_ce_phys = root_entry_uctp(&re);
2910 if (ext && devfn == 0) {
2911 /* No LCTP, try UCTP */
2920 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2925 new_ce = alloc_pgtable_page(iommu->node);
2932 /* Now copy the context entry */
2933 memcpy(&ce, old_ce + idx, sizeof(ce));
2935 if (!__context_present(&ce))
2938 did = context_domain_id(&ce);
2939 if (did >= 0 && did < cap_ndoms(iommu->cap))
2940 set_bit(did, iommu->domain_ids);
2943 * We need a marker for copied context entries. This
2944 * marker needs to work for the old format as well as
2945 * for extended context entries.
2947 * Bit 67 of the context entry is used. In the old
2948 * format this bit is available to software, in the
2949 * extended format it is the PGE bit, but PGE is ignored
2950 * by HW if PASIDs are disabled (and thus still available).
2953 * So disable PASIDs first and then mark the entry
2954 * copied. This means that we don't copy PASID
2955 * translations from the old kernel, but this is fine as
2956 * faults there are not fatal.
2958 context_clear_pasid_enable(&ce);
2959 context_set_copied(&ce);
2964 tbl[tbl_idx + pos] = new_ce;
2966 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2975 static int copy_translation_tables(struct intel_iommu *iommu)
2977 struct context_entry **ctxt_tbls;
2978 struct root_entry *old_rt;
2979 phys_addr_t old_rt_phys;
2980 int ctxt_table_entries;
2981 unsigned long flags;
2986 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2987 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
2988 new_ext = !!ecap_ecs(iommu->ecap);
2991 * The RTT bit can only be changed when translation is disabled,
2992 * but disabling translation would open a window for data
2993 * corruption. So bail out and don't copy anything if we would
2994 * have to change the bit.
2999 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3003 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3007 /* This is too big for the stack - allocate it from slab */
3008 ctxt_table_entries = ext ? 512 : 256;
3010 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3014 for (bus = 0; bus < 256; bus++) {
3015 ret = copy_context_table(iommu, &old_rt[bus],
3016 ctxt_tbls, bus, ext);
3018 pr_err("%s: Failed to copy context table for bus %d\n",
3024 spin_lock_irqsave(&iommu->lock, flags);
3026 /* Context tables are copied, now write them to the root_entry table */
3027 for (bus = 0; bus < 256; bus++) {
3028 int idx = ext ? bus * 2 : bus;
3031 if (ctxt_tbls[idx]) {
3032 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3033 iommu->root_entry[bus].lo = val;
3036 if (!ext || !ctxt_tbls[idx + 1])
3039 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3040 iommu->root_entry[bus].hi = val;
3043 spin_unlock_irqrestore(&iommu->lock, flags);
3047 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
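/*
 * Illustrative note on the indexing used above (restating the code, not new
 * documentation): with the extended root-table format (ext), each bus owns
 * two copied context tables, so bus b uses ctxt_tbls[2 * b] for devfn
 * 0x00-0x7f (written to root_entry[b].lo) and ctxt_tbls[2 * b + 1] for devfn
 * 0x80-0xff (written to root_entry[b].hi); in the legacy format there is a
 * single table per bus, ctxt_tbls[b].
 */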
3057 #ifdef CONFIG_INTEL_IOMMU_SVM
3058 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3060 struct intel_iommu *iommu = data;
3064 return INVALID_IOASID;
3066 * VT-d virtual command interface always uses the full 20 bit
3067 * PASID range. Host can partition guest PASID range based on
3068 * policies but it is out of guest's control.
3070 if (min < PASID_MIN || max > intel_pasid_max_id)
3071 return INVALID_IOASID;
3073 if (vcmd_alloc_pasid(iommu, &ioasid))
3074 return INVALID_IOASID;
3079 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3081 struct intel_iommu *iommu = data;
3086 * The sanity check of the ioasid owner is done at the upper layer, e.g. VFIO.
3087 * We can only free the PASID when all the devices are unbound.
3089 if (ioasid_find(NULL, ioasid, NULL)) {
3090 pr_alert("Cannot free active IOASID %d\n", ioasid);
3093 vcmd_free_pasid(iommu, ioasid);
3096 static void register_pasid_allocator(struct intel_iommu *iommu)
3099 * If we are running in the host, there is no need for a custom
3100 * allocator since PASIDs are allocated from the host system-wide.
3102 if (!cap_caching_mode(iommu->cap))
3105 if (!sm_supported(iommu)) {
3106 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3111 * Register a custom PASID allocator if we are running in a guest;
3112 * guest PASIDs must be obtained via the virtual command interface.
3113 * There can be multiple vIOMMUs in each guest but only one allocator
3114 * is active. All vIOMMU allocators will eventually call the same host allocator.
3117 if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3120 pr_info("Register custom PASID allocator\n");
3121 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3122 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3123 iommu->pasid_allocator.pdata = (void *)iommu;
3124 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3125 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3127 * Disable scalable mode on this IOMMU if there
3128 * is no custom allocator. Mixing SM-capable vIOMMUs
3129 * and non-SM vIOMMUs is not supported.
3136 static int __init init_dmars(void)
3138 struct dmar_drhd_unit *drhd;
3139 struct intel_iommu *iommu;
3145 * initialize and program root entry to not present
3148 for_each_drhd_unit(drhd) {
3150 * lock not needed as this is only incremented in the single-
3151 * threaded kernel __init code path; all other accesses are read-only
3154 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3158 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3161 /* Preallocate enough resources for IOMMU hot-addition */
3162 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3163 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3165 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3168 pr_err("Allocating global iommu array failed\n");
3173 for_each_iommu(iommu, drhd) {
3174 if (drhd->ignored) {
3175 iommu_disable_translation(iommu);
3180 * Find the max pasid size of all IOMMUs in the system.
3181 * We need to ensure the system pasid table is no bigger
3182 * than the smallest supported.
3184 if (pasid_supported(iommu)) {
3185 u32 temp = 2 << ecap_pss(iommu->ecap);
3187 intel_pasid_max_id = min_t(u32, temp,
3188 intel_pasid_max_id);
3191 g_iommus[iommu->seq_id] = iommu;
3193 intel_iommu_init_qi(iommu);
3195 ret = iommu_init_domains(iommu);
3199 init_translation_status(iommu);
3201 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3202 iommu_disable_translation(iommu);
3203 clear_translation_pre_enabled(iommu);
3204 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3210 * we could share the same root & context tables
3211 * among all IOMMUs. Need to split them later.
3213 ret = iommu_alloc_root_entry(iommu);
3217 if (translation_pre_enabled(iommu)) {
3218 pr_info("Translation already enabled - trying to copy translation structures\n");
3220 ret = copy_translation_tables(iommu);
3223 * We found the IOMMU with translation
3224 * enabled - but failed to copy over the
3225 * old root-entry table. Try to proceed
3226 * by disabling translation now and
3227 * allocating a clean root-entry table.
3228 * This might cause DMAR faults, but
3229 * probably the dump will still succeed.
3231 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3233 iommu_disable_translation(iommu);
3234 clear_translation_pre_enabled(iommu);
3236 pr_info("Copied translation tables from previous kernel for %s\n",
3241 if (!ecap_pass_through(iommu->ecap))
3242 hw_pass_through = 0;
3243 intel_svm_check(iommu);
3247 * Now that qi is enabled on all iommus, set the root entry and flush
3248 * caches. This is required on some Intel X58 chipsets, otherwise the
3249 * flush_context function will loop forever and the boot hangs.
3251 for_each_active_iommu(iommu, drhd) {
3252 iommu_flush_write_buffer(iommu);
3253 #ifdef CONFIG_INTEL_IOMMU_SVM
3254 register_pasid_allocator(iommu);
3256 iommu_set_root_entry(iommu);
3257 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3258 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3261 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3266 iommu_identity_mapping |= IDENTMAP_GFX;
3268 check_tylersburg_isoch();
3270 ret = si_domain_init(hw_pass_through);
3277 * global invalidate context cache
3278 * global invalidate iotlb
3279 * enable translation
3281 for_each_iommu(iommu, drhd) {
3282 if (drhd->ignored) {
3284 * we always have to disable PMRs or DMA may fail on this device
3288 iommu_disable_protect_mem_regions(iommu);
3292 iommu_flush_write_buffer(iommu);
3294 #ifdef CONFIG_INTEL_IOMMU_SVM
3295 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3297 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3298 * could cause a lock race condition.
3300 up_write(&dmar_global_lock);
3301 ret = intel_svm_enable_prq(iommu);
3302 down_write(&dmar_global_lock);
3307 ret = dmar_set_interrupt(iommu);
3315 for_each_active_iommu(iommu, drhd) {
3316 disable_dmar_iommu(iommu);
3317 free_dmar_iommu(iommu);
3326 /* This takes a number of _MM_ pages, not VTD pages */
3327 static unsigned long intel_alloc_iova(struct device *dev,
3328 struct dmar_domain *domain,
3329 unsigned long nrpages, uint64_t dma_mask)
3331 unsigned long iova_pfn;
3334 * Restrict dma_mask to the width that the iommu can handle.
3335 * First-level translation restricts the input-address to a
3336 * canonical address (i.e., address bits 63:N have the same
3337 * value as address bit [N-1], where N is 48-bits with 4-level
3338 * paging and 57-bits with 5-level paging). Hence, skip bit [N-1].
3341 if (domain_use_first_level(domain))
3342 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3345 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3348 /* Ensure we reserve the whole size-aligned region */
3349 nrpages = __roundup_pow_of_two(nrpages);
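/*
 * Worked example for the round-up above (values made up): a request for 3
 * MM pages becomes 4 and one for 5 pages becomes 8, so the whole
 * power-of-two, size-aligned region can be reserved as the comment above
 * notes.
 */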
3351 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3353 * First try to allocate an io virtual address in
3354 * DMA_BIT_MASK(32), and if that fails then try allocating from the higher range.
3357 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3358 IOVA_PFN(DMA_BIT_MASK(32)), false);
3362 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3363 IOVA_PFN(dma_mask), true);
3364 if (unlikely(!iova_pfn)) {
3365 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3373 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3374 size_t size, int dir, u64 dma_mask)
3376 struct dmar_domain *domain;
3377 phys_addr_t start_paddr;
3378 unsigned long iova_pfn;
3381 struct intel_iommu *iommu;
3382 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3384 BUG_ON(dir == DMA_NONE);
3386 if (unlikely(attach_deferred(dev)))
3387 do_deferred_attach(dev);
3389 domain = find_domain(dev);
3391 return DMA_MAPPING_ERROR;
3393 iommu = domain_get_iommu(domain);
3394 size = aligned_nrpages(paddr, size);
3396 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3401 * Check if DMAR supports zero-length reads on write-only mappings.
3404 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3405 !cap_zlr(iommu->cap))
3406 prot |= DMA_PTE_READ;
3407 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3408 prot |= DMA_PTE_WRITE;
3410 * The range paddr ... (paddr + size) might cover only part of a page, so we
3411 * should map the whole page. Note: if two parts of one page are mapped
3412 * separately, we might have two guest addresses mapping to the same host
3413 * paddr, but this is not a big problem
3415 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3416 mm_to_dma_pfn(paddr_pfn), size, prot);
3420 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3421 start_paddr += paddr & ~PAGE_MASK;
3423 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3429 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3430 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3431 size, (unsigned long long)paddr, dir);
3432 return DMA_MAPPING_ERROR;
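/*
 * Worked example for the start_paddr computation above (values made up): if
 * paddr == 0x12345678 and the allocator returns iova_pfn == 0x9abcd, then
 * with 4KiB pages
 *   start_paddr = (0x9abcd << 12) + (0x12345678 & 0xfff)
 *               = 0x9abcd000 + 0x678
 *               = 0x9abcd678
 * i.e. only the page frame is remapped; the offset within the page is
 * preserved in the DMA address.
 */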
3435 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3436 unsigned long offset, size_t size,
3437 enum dma_data_direction dir,
3438 unsigned long attrs)
3440 return __intel_map_single(dev, page_to_phys(page) + offset,
3441 size, dir, *dev->dma_mask);
3444 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3445 size_t size, enum dma_data_direction dir,
3446 unsigned long attrs)
3448 return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3451 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3453 struct dmar_domain *domain;
3454 unsigned long start_pfn, last_pfn;
3455 unsigned long nrpages;
3456 unsigned long iova_pfn;
3457 struct intel_iommu *iommu;
3458 struct page *freelist;
3459 struct pci_dev *pdev = NULL;
3461 domain = find_domain(dev);
3464 iommu = domain_get_iommu(domain);
3466 iova_pfn = IOVA_PFN(dev_addr);
3468 nrpages = aligned_nrpages(dev_addr, size);
3469 start_pfn = mm_to_dma_pfn(iova_pfn);
3470 last_pfn = start_pfn + nrpages - 1;
3472 if (dev_is_pci(dev))
3473 pdev = to_pci_dev(dev);
3475 freelist = domain_unmap(domain, start_pfn, last_pfn);
3476 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3477 !has_iova_flush_queue(&domain->iovad)) {
3478 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3479 nrpages, !freelist, 0);
3481 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3482 dma_free_pagelist(freelist);
3484 queue_iova(&domain->iovad, iova_pfn, nrpages,
3485 (unsigned long)freelist);
3487 * queue up the release of the unmap to save the 1/6th of the
3488 * cpu used up by the iotlb flush operation...
3492 trace_unmap_single(dev, dev_addr, size);
3495 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3496 size_t size, enum dma_data_direction dir,
3497 unsigned long attrs)
3499 intel_unmap(dev, dev_addr, size);
3502 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3503 size_t size, enum dma_data_direction dir, unsigned long attrs)
3505 intel_unmap(dev, dev_addr, size);
3508 static void *intel_alloc_coherent(struct device *dev, size_t size,
3509 dma_addr_t *dma_handle, gfp_t flags,
3510 unsigned long attrs)
3512 struct page *page = NULL;
3515 if (unlikely(attach_deferred(dev)))
3516 do_deferred_attach(dev);
3518 size = PAGE_ALIGN(size);
3519 order = get_order(size);
3521 if (gfpflags_allow_blocking(flags)) {
3522 unsigned int count = size >> PAGE_SHIFT;
3524 page = dma_alloc_from_contiguous(dev, count, order,
3525 flags & __GFP_NOWARN);
3529 page = alloc_pages(flags, order);
3532 memset(page_address(page), 0, size);
3534 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3536 dev->coherent_dma_mask);
3537 if (*dma_handle != DMA_MAPPING_ERROR)
3538 return page_address(page);
3539 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3540 __free_pages(page, order);
3545 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3546 dma_addr_t dma_handle, unsigned long attrs)
3549 struct page *page = virt_to_page(vaddr);
3551 size = PAGE_ALIGN(size);
3552 order = get_order(size);
3554 intel_unmap(dev, dma_handle, size);
3555 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3556 __free_pages(page, order);
3559 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3560 int nelems, enum dma_data_direction dir,
3561 unsigned long attrs)
3563 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3564 unsigned long nrpages = 0;
3565 struct scatterlist *sg;
3568 for_each_sg(sglist, sg, nelems, i) {
3569 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3572 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3574 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3577 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3578 enum dma_data_direction dir, unsigned long attrs)
3581 struct dmar_domain *domain;
3584 unsigned long iova_pfn;
3586 struct scatterlist *sg;
3587 unsigned long start_vpfn;
3588 struct intel_iommu *iommu;
3590 BUG_ON(dir == DMA_NONE);
3592 if (unlikely(attach_deferred(dev)))
3593 do_deferred_attach(dev);
3595 domain = find_domain(dev);
3599 iommu = domain_get_iommu(domain);
3601 for_each_sg(sglist, sg, nelems, i)
3602 size += aligned_nrpages(sg->offset, sg->length);
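/*
 * Worked example (values made up): an sg element with offset 0x800 and
 * length 0x1000 straddles two 4KiB pages, so aligned_nrpages(0x800, 0x1000)
 * contributes 2 to 'size' even though it carries only one page worth of
 * data.
 */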
3604 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3607 sglist->dma_length = 0;
3612 * Check if DMAR supports zero-length reads on write-only mappings.
3615 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3616 !cap_zlr(iommu->cap))
3617 prot |= DMA_PTE_READ;
3618 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3619 prot |= DMA_PTE_WRITE;
3621 start_vpfn = mm_to_dma_pfn(iova_pfn);
3623 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3624 if (unlikely(ret)) {
3625 dma_pte_free_pagetable(domain, start_vpfn,
3626 start_vpfn + size - 1,
3627 agaw_to_level(domain->agaw) + 1);
3628 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3632 for_each_sg(sglist, sg, nelems, i)
3633 trace_map_sg(dev, i + 1, nelems, sg);
3638 static u64 intel_get_required_mask(struct device *dev)
3640 return DMA_BIT_MASK(32);
3643 static const struct dma_map_ops intel_dma_ops = {
3644 .alloc = intel_alloc_coherent,
3645 .free = intel_free_coherent,
3646 .map_sg = intel_map_sg,
3647 .unmap_sg = intel_unmap_sg,
3648 .map_page = intel_map_page,
3649 .unmap_page = intel_unmap_page,
3650 .map_resource = intel_map_resource,
3651 .unmap_resource = intel_unmap_resource,
3652 .dma_supported = dma_direct_supported,
3653 .mmap = dma_common_mmap,
3654 .get_sgtable = dma_common_get_sgtable,
3655 .get_required_mask = intel_get_required_mask,
3659 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3660 enum dma_data_direction dir, enum dma_sync_target target)
3662 struct dmar_domain *domain;
3663 phys_addr_t tlb_addr;
3665 domain = find_domain(dev);
3666 if (WARN_ON(!domain))
3669 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3670 if (is_swiotlb_buffer(tlb_addr))
3671 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3675 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3676 enum dma_data_direction dir, unsigned long attrs,
3679 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3680 struct dmar_domain *domain;
3681 struct intel_iommu *iommu;
3682 unsigned long iova_pfn;
3683 unsigned long nrpages;
3684 phys_addr_t tlb_addr;
3688 if (unlikely(attach_deferred(dev)))
3689 do_deferred_attach(dev);
3691 domain = find_domain(dev);
3693 if (WARN_ON(dir == DMA_NONE || !domain))
3694 return DMA_MAPPING_ERROR;
3696 iommu = domain_get_iommu(domain);
3697 if (WARN_ON(!iommu))
3698 return DMA_MAPPING_ERROR;
3700 nrpages = aligned_nrpages(0, size);
3701 iova_pfn = intel_alloc_iova(dev, domain,
3702 dma_to_mm_pfn(nrpages), dma_mask);
3704 return DMA_MAPPING_ERROR;
3707 * Check if DMAR supports zero-length reads on write-only mappings.
3710 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3711 !cap_zlr(iommu->cap))
3712 prot |= DMA_PTE_READ;
3713 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3714 prot |= DMA_PTE_WRITE;
3717 * If both the physical buffer start address and size are
3718 * page aligned, we don't need to use a bounce page.
3720 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3721 tlb_addr = swiotlb_tbl_map_single(dev,
3722 __phys_to_dma(dev, io_tlb_start),
3723 paddr, size, aligned_size, dir, attrs);
3724 if (tlb_addr == DMA_MAPPING_ERROR) {
3727 /* Clean up the padding area. */
3728 void *padding_start = phys_to_virt(tlb_addr);
3729 size_t padding_size = aligned_size;
3731 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3732 (dir == DMA_TO_DEVICE ||
3733 dir == DMA_BIDIRECTIONAL)) {
3734 padding_start += size;
3735 padding_size -= size;
3738 memset(padding_start, 0, padding_size);
3744 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3745 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3749 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3751 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3754 if (is_swiotlb_buffer(tlb_addr))
3755 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3756 aligned_size, dir, attrs);
3758 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3759 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3760 size, (unsigned long long)paddr, dir);
3762 return DMA_MAPPING_ERROR;
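/*
 * Illustrative note on the bounce decision above (values made up): a buffer
 * at paddr 0x10000200 with size 0x600 fails the IS_ALIGNED(paddr | size,
 * VTD_PAGE_SIZE) test and is therefore routed through a swiotlb bounce slot,
 * while a buffer at 0x10000000 with size 0x2000 is page aligned at both ends
 * and is mapped in place with no bounce page.
 */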
3766 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3767 enum dma_data_direction dir, unsigned long attrs)
3769 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3770 struct dmar_domain *domain;
3771 phys_addr_t tlb_addr;
3773 domain = find_domain(dev);
3774 if (WARN_ON(!domain))
3777 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3778 if (WARN_ON(!tlb_addr))
3781 intel_unmap(dev, dev_addr, size);
3782 if (is_swiotlb_buffer(tlb_addr))
3783 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3784 aligned_size, dir, attrs);
3786 trace_bounce_unmap_single(dev, dev_addr, size);
3790 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3791 size_t size, enum dma_data_direction dir, unsigned long attrs)
3793 return bounce_map_single(dev, page_to_phys(page) + offset,
3794 size, dir, attrs, *dev->dma_mask);
3798 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3799 enum dma_data_direction dir, unsigned long attrs)
3801 return bounce_map_single(dev, phys_addr, size,
3802 dir, attrs, *dev->dma_mask);
3806 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3807 enum dma_data_direction dir, unsigned long attrs)
3809 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3813 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3814 enum dma_data_direction dir, unsigned long attrs)
3816 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3820 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3821 enum dma_data_direction dir, unsigned long attrs)
3823 struct scatterlist *sg;
3826 for_each_sg(sglist, sg, nelems, i)
3827 bounce_unmap_page(dev, sg->dma_address,
3828 sg_dma_len(sg), dir, attrs);
3832 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3833 enum dma_data_direction dir, unsigned long attrs)
3836 struct scatterlist *sg;
3838 for_each_sg(sglist, sg, nelems, i) {
3839 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3840 sg->offset, sg->length,
3842 if (sg->dma_address == DMA_MAPPING_ERROR)
3844 sg_dma_len(sg) = sg->length;
3847 for_each_sg(sglist, sg, nelems, i)
3848 trace_bounce_map_sg(dev, i + 1, nelems, sg);
3853 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3858 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3859 size_t size, enum dma_data_direction dir)
3861 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3865 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3866 size_t size, enum dma_data_direction dir)
3868 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3872 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3873 int nelems, enum dma_data_direction dir)
3875 struct scatterlist *sg;
3878 for_each_sg(sglist, sg, nelems, i)
3879 bounce_sync_single(dev, sg_dma_address(sg),
3880 sg_dma_len(sg), dir, SYNC_FOR_CPU);
3884 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3885 int nelems, enum dma_data_direction dir)
3887 struct scatterlist *sg;
3890 for_each_sg(sglist, sg, nelems, i)
3891 bounce_sync_single(dev, sg_dma_address(sg),
3892 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3895 static const struct dma_map_ops bounce_dma_ops = {
3896 .alloc = intel_alloc_coherent,
3897 .free = intel_free_coherent,
3898 .map_sg = bounce_map_sg,
3899 .unmap_sg = bounce_unmap_sg,
3900 .map_page = bounce_map_page,
3901 .unmap_page = bounce_unmap_page,
3902 .sync_single_for_cpu = bounce_sync_single_for_cpu,
3903 .sync_single_for_device = bounce_sync_single_for_device,
3904 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
3905 .sync_sg_for_device = bounce_sync_sg_for_device,
3906 .map_resource = bounce_map_resource,
3907 .unmap_resource = bounce_unmap_resource,
3908 .dma_supported = dma_direct_supported,
3911 static inline int iommu_domain_cache_init(void)
3915 iommu_domain_cache = kmem_cache_create("iommu_domain",
3916 sizeof(struct dmar_domain),
3921 if (!iommu_domain_cache) {
3922 pr_err("Couldn't create iommu_domain cache\n");
3929 static inline int iommu_devinfo_cache_init(void)
3933 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3934 sizeof(struct device_domain_info),
3938 if (!iommu_devinfo_cache) {
3939 pr_err("Couldn't create devinfo cache\n");
3946 static int __init iommu_init_mempool(void)
3949 ret = iova_cache_get();
3953 ret = iommu_domain_cache_init();
3957 ret = iommu_devinfo_cache_init();
3961 kmem_cache_destroy(iommu_domain_cache);
3968 static void __init iommu_exit_mempool(void)
3970 kmem_cache_destroy(iommu_devinfo_cache);
3971 kmem_cache_destroy(iommu_domain_cache);
3975 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3977 struct dmar_drhd_unit *drhd;
3981 /* We know that this device on this chipset has its own IOMMU.
3982 * If we find it under a different IOMMU, then the BIOS is lying
3983 * to us. Hope that the IOMMU for this device is actually
3984 * disabled, and it needs no translation...
3986 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3988 /* "can't" happen */
3989 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3992 vtbar &= 0xffff0000;
3994 /* we know that this iommu should be at offset 0xa000 from vtbar */
3995 drhd = dmar_find_matched_drhd_unit(pdev);
3996 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
3997 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
3998 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3999 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4002 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4004 static void __init init_no_remapping_devices(void)
4006 struct dmar_drhd_unit *drhd;
4010 for_each_drhd_unit(drhd) {
4011 if (!drhd->include_all) {
4012 for_each_active_dev_scope(drhd->devices,
4013 drhd->devices_cnt, i, dev)
4015 /* ignore DMAR unit if no devices exist */
4016 if (i == drhd->devices_cnt)
4021 for_each_active_drhd_unit(drhd) {
4022 if (drhd->include_all)
4025 for_each_active_dev_scope(drhd->devices,
4026 drhd->devices_cnt, i, dev)
4027 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4029 if (i < drhd->devices_cnt)
4032 /* This IOMMU has *only* gfx devices. Either bypass it or
4033 set the gfx_mapped flag, as appropriate */
4034 if (!dmar_map_gfx) {
4036 for_each_active_dev_scope(drhd->devices,
4037 drhd->devices_cnt, i, dev)
4038 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4043 #ifdef CONFIG_SUSPEND
4044 static int init_iommu_hw(void)
4046 struct dmar_drhd_unit *drhd;
4047 struct intel_iommu *iommu = NULL;
4049 for_each_active_iommu(iommu, drhd)
4051 dmar_reenable_qi(iommu);
4053 for_each_iommu(iommu, drhd) {
4054 if (drhd->ignored) {
4056 * we always have to disable PMRs or DMA may fail on this device
4060 iommu_disable_protect_mem_regions(iommu);
4064 iommu_flush_write_buffer(iommu);
4066 iommu_set_root_entry(iommu);
4068 iommu->flush.flush_context(iommu, 0, 0, 0,
4069 DMA_CCMD_GLOBAL_INVL);
4070 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4071 iommu_enable_translation(iommu);
4072 iommu_disable_protect_mem_regions(iommu);
4078 static void iommu_flush_all(void)
4080 struct dmar_drhd_unit *drhd;
4081 struct intel_iommu *iommu;
4083 for_each_active_iommu(iommu, drhd) {
4084 iommu->flush.flush_context(iommu, 0, 0, 0,
4085 DMA_CCMD_GLOBAL_INVL);
4086 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4087 DMA_TLB_GLOBAL_FLUSH);
4091 static int iommu_suspend(void)
4093 struct dmar_drhd_unit *drhd;
4094 struct intel_iommu *iommu = NULL;
4097 for_each_active_iommu(iommu, drhd) {
4098 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4100 if (!iommu->iommu_state)
4106 for_each_active_iommu(iommu, drhd) {
4107 iommu_disable_translation(iommu);
4109 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4111 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4112 readl(iommu->reg + DMAR_FECTL_REG);
4113 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4114 readl(iommu->reg + DMAR_FEDATA_REG);
4115 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4116 readl(iommu->reg + DMAR_FEADDR_REG);
4117 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4118 readl(iommu->reg + DMAR_FEUADDR_REG);
4120 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4125 for_each_active_iommu(iommu, drhd)
4126 kfree(iommu->iommu_state);
4131 static void iommu_resume(void)
4133 struct dmar_drhd_unit *drhd;
4134 struct intel_iommu *iommu = NULL;
4137 if (init_iommu_hw()) {
4139 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4141 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4145 for_each_active_iommu(iommu, drhd) {
4147 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4149 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4150 iommu->reg + DMAR_FECTL_REG);
4151 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4152 iommu->reg + DMAR_FEDATA_REG);
4153 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4154 iommu->reg + DMAR_FEADDR_REG);
4155 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4156 iommu->reg + DMAR_FEUADDR_REG);
4158 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4161 for_each_active_iommu(iommu, drhd)
4162 kfree(iommu->iommu_state);
4165 static struct syscore_ops iommu_syscore_ops = {
4166 .resume = iommu_resume,
4167 .suspend = iommu_suspend,
4170 static void __init init_iommu_pm_ops(void)
4172 register_syscore_ops(&iommu_syscore_ops);
4176 static inline void init_iommu_pm_ops(void) {}
4177 #endif /* CONFIG_PM */
4179 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4181 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4182 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4183 rmrr->end_address <= rmrr->base_address ||
4184 arch_rmrr_sanity_check(rmrr))
4190 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4192 struct acpi_dmar_reserved_memory *rmrr;
4193 struct dmar_rmrr_unit *rmrru;
4195 rmrr = (struct acpi_dmar_reserved_memory *)header;
4196 if (rmrr_sanity_check(rmrr)) {
4198 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4199 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4200 rmrr->base_address, rmrr->end_address,
4201 dmi_get_system_info(DMI_BIOS_VENDOR),
4202 dmi_get_system_info(DMI_BIOS_VERSION),
4203 dmi_get_system_info(DMI_PRODUCT_VERSION));
4204 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4207 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4211 rmrru->hdr = header;
4213 rmrru->base_address = rmrr->base_address;
4214 rmrru->end_address = rmrr->end_address;
4216 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4217 ((void *)rmrr) + rmrr->header.length,
4218 &rmrru->devices_cnt);
4219 if (rmrru->devices_cnt && rmrru->devices == NULL)
4222 list_add(&rmrru->list, &dmar_rmrr_units);
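/*
 * Illustrative note on rmrr_sanity_check() above (example values made up):
 * an RMRR of [0x000e0000, 0x000effff] passes the alignment checks because
 * both the base and end_address + 1 (0x000f0000) are 4KiB aligned and the
 * end lies above the base, whereas [0x000e0000, 0x000e07ff] would be
 * reported as a broken-BIOS RMRR and taint the kernel as above.
 */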
4231 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4233 struct dmar_atsr_unit *atsru;
4234 struct acpi_dmar_atsr *tmp;
4236 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4238 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4239 if (atsr->segment != tmp->segment)
4241 if (atsr->header.length != tmp->header.length)
4243 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4250 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4252 struct acpi_dmar_atsr *atsr;
4253 struct dmar_atsr_unit *atsru;
4255 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4258 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4259 atsru = dmar_find_atsr(atsr);
4263 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4268 * If memory is allocated from slab by ACPI _DSM method, we need to
4269 * copy the memory content because the memory buffer will be freed on exit.
4272 atsru->hdr = (void *)(atsru + 1);
4273 memcpy(atsru->hdr, hdr, hdr->length);
4274 atsru->include_all = atsr->flags & 0x1;
4275 if (!atsru->include_all) {
4276 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4277 (void *)atsr + atsr->header.length,
4278 &atsru->devices_cnt);
4279 if (atsru->devices_cnt && atsru->devices == NULL) {
4285 list_add_rcu(&atsru->list, &dmar_atsr_units);
4290 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4292 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4296 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4298 struct acpi_dmar_atsr *atsr;
4299 struct dmar_atsr_unit *atsru;
4301 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4302 atsru = dmar_find_atsr(atsr);
4304 list_del_rcu(&atsru->list);
4306 intel_iommu_free_atsr(atsru);
4312 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4316 struct acpi_dmar_atsr *atsr;
4317 struct dmar_atsr_unit *atsru;
4319 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4320 atsru = dmar_find_atsr(atsr);
4324 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4325 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4333 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4336 struct intel_iommu *iommu = dmaru->iommu;
4338 if (g_iommus[iommu->seq_id])
4341 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4342 pr_warn("%s: Doesn't support hardware pass through.\n",
4346 if (!ecap_sc_support(iommu->ecap) &&
4347 domain_update_iommu_snooping(iommu)) {
4348 pr_warn("%s: Doesn't support snooping.\n",
4352 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4353 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4354 pr_warn("%s: Doesn't support large page.\n",
4360 * Disable translation if already enabled prior to OS handover.
4362 if (iommu->gcmd & DMA_GCMD_TE)
4363 iommu_disable_translation(iommu);
4365 g_iommus[iommu->seq_id] = iommu;
4366 ret = iommu_init_domains(iommu);
4368 ret = iommu_alloc_root_entry(iommu);
4372 intel_svm_check(iommu);
4374 if (dmaru->ignored) {
4376 * we always have to disable PMRs or DMA may fail on this device
4379 iommu_disable_protect_mem_regions(iommu);
4383 intel_iommu_init_qi(iommu);
4384 iommu_flush_write_buffer(iommu);
4386 #ifdef CONFIG_INTEL_IOMMU_SVM
4387 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4388 ret = intel_svm_enable_prq(iommu);
4393 ret = dmar_set_interrupt(iommu);
4397 iommu_set_root_entry(iommu);
4398 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4399 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4400 iommu_enable_translation(iommu);
4402 iommu_disable_protect_mem_regions(iommu);
4406 disable_dmar_iommu(iommu);
4408 free_dmar_iommu(iommu);
4412 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4415 struct intel_iommu *iommu = dmaru->iommu;
4417 if (!intel_iommu_enabled)
4423 ret = intel_iommu_add(dmaru);
4425 disable_dmar_iommu(iommu);
4426 free_dmar_iommu(iommu);
4432 static void intel_iommu_free_dmars(void)
4434 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4435 struct dmar_atsr_unit *atsru, *atsr_n;
4437 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4438 list_del(&rmrru->list);
4439 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4443 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4444 list_del(&atsru->list);
4445 intel_iommu_free_atsr(atsru);
4449 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4452 struct pci_bus *bus;
4453 struct pci_dev *bridge = NULL;
4455 struct acpi_dmar_atsr *atsr;
4456 struct dmar_atsr_unit *atsru;
4458 dev = pci_physfn(dev);
4459 for (bus = dev->bus; bus; bus = bus->parent) {
4461 /* If it's an integrated device, allow ATS */
4464 /* Connected via non-PCIe: no ATS */
4465 if (!pci_is_pcie(bridge) ||
4466 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4468 /* If we found the root port, look it up in the ATSR */
4469 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4474 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4475 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4476 if (atsr->segment != pci_domain_nr(dev->bus))
4479 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4480 if (tmp == &bridge->dev)
4483 if (atsru->include_all)
4493 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4496 struct dmar_rmrr_unit *rmrru;
4497 struct dmar_atsr_unit *atsru;
4498 struct acpi_dmar_atsr *atsr;
4499 struct acpi_dmar_reserved_memory *rmrr;
4501 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4504 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4505 rmrr = container_of(rmrru->hdr,
4506 struct acpi_dmar_reserved_memory, header);
4507 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4508 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4509 ((void *)rmrr) + rmrr->header.length,
4510 rmrr->segment, rmrru->devices,
4511 rmrru->devices_cnt);
4514 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4515 dmar_remove_dev_scope(info, rmrr->segment,
4516 rmrru->devices, rmrru->devices_cnt);
4520 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4521 if (atsru->include_all)
4524 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4525 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4526 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4527 (void *)atsr + atsr->header.length,
4528 atsr->segment, atsru->devices,
4529 atsru->devices_cnt);
4534 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4535 if (dmar_remove_dev_scope(info, atsr->segment,
4536 atsru->devices, atsru->devices_cnt))
4544 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4545 unsigned long val, void *v)
4547 struct memory_notify *mhp = v;
4548 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4549 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4553 case MEM_GOING_ONLINE:
4554 if (iommu_domain_identity_map(si_domain,
4555 start_vpfn, last_vpfn)) {
4556 pr_warn("Failed to build identity map for [%lx-%lx]\n",
4557 start_vpfn, last_vpfn);
4563 case MEM_CANCEL_ONLINE:
4565 struct dmar_drhd_unit *drhd;
4566 struct intel_iommu *iommu;
4567 struct page *freelist;
4569 freelist = domain_unmap(si_domain,
4570 start_vpfn, last_vpfn);
4573 for_each_active_iommu(iommu, drhd)
4574 iommu_flush_iotlb_psi(iommu, si_domain,
4575 start_vpfn, mhp->nr_pages,
4578 dma_free_pagelist(freelist);
4586 static struct notifier_block intel_iommu_memory_nb = {
4587 .notifier_call = intel_iommu_memory_notifier,
4591 static void free_all_cpu_cached_iovas(unsigned int cpu)
4595 for (i = 0; i < g_num_of_iommus; i++) {
4596 struct intel_iommu *iommu = g_iommus[i];
4597 struct dmar_domain *domain;
4603 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4604 domain = get_iommu_domain(iommu, (u16)did);
4606 if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4609 free_cpu_cached_iovas(cpu, &domain->iovad);
4614 static int intel_iommu_cpu_dead(unsigned int cpu)
4616 free_all_cpu_cached_iovas(cpu);
4620 static void intel_disable_iommus(void)
4622 struct intel_iommu *iommu = NULL;
4623 struct dmar_drhd_unit *drhd;
4625 for_each_iommu(iommu, drhd)
4626 iommu_disable_translation(iommu);
4629 void intel_iommu_shutdown(void)
4631 struct dmar_drhd_unit *drhd;
4632 struct intel_iommu *iommu = NULL;
4634 if (no_iommu || dmar_disabled)
4637 down_write(&dmar_global_lock);
4639 /* Disable PMRs explicitly here. */
4640 for_each_iommu(iommu, drhd)
4641 iommu_disable_protect_mem_regions(iommu);
4643 /* Make sure the IOMMUs are switched off */
4644 intel_disable_iommus();
4646 up_write(&dmar_global_lock);
4649 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4651 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4653 return container_of(iommu_dev, struct intel_iommu, iommu);
4656 static ssize_t intel_iommu_show_version(struct device *dev,
4657 struct device_attribute *attr,
4660 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4661 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4662 return sprintf(buf, "%d:%d\n",
4663 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4665 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4667 static ssize_t intel_iommu_show_address(struct device *dev,
4668 struct device_attribute *attr,
4671 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4672 return sprintf(buf, "%llx\n", iommu->reg_phys);
4674 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4676 static ssize_t intel_iommu_show_cap(struct device *dev,
4677 struct device_attribute *attr,
4680 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4681 return sprintf(buf, "%llx\n", iommu->cap);
4683 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4685 static ssize_t intel_iommu_show_ecap(struct device *dev,
4686 struct device_attribute *attr,
4689 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4690 return sprintf(buf, "%llx\n", iommu->ecap);
4692 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4694 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4695 struct device_attribute *attr,
4698 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4699 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4701 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4703 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4704 struct device_attribute *attr,
4707 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4708 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4709 cap_ndoms(iommu->cap)));
4711 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4713 static struct attribute *intel_iommu_attrs[] = {
4714 &dev_attr_version.attr,
4715 &dev_attr_address.attr,
4717 &dev_attr_ecap.attr,
4718 &dev_attr_domains_supported.attr,
4719 &dev_attr_domains_used.attr,
4723 static struct attribute_group intel_iommu_group = {
4724 .name = "intel-iommu",
4725 .attrs = intel_iommu_attrs,
4728 const struct attribute_group *intel_iommu_groups[] = {
4733 static inline bool has_untrusted_dev(void)
4735 struct pci_dev *pdev = NULL;
4737 for_each_pci_dev(pdev)
4738 if (pdev->untrusted)
4744 static int __init platform_optin_force_iommu(void)
4746 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4749 if (no_iommu || dmar_disabled)
4750 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4753 * If Intel-IOMMU is disabled by default, we will apply identity
4754 * map for all devices except those marked as being untrusted.
4757 iommu_set_default_passthrough(false);
4765 static int __init probe_acpi_namespace_devices(void)
4767 struct dmar_drhd_unit *drhd;
4768 /* To avoid a -Wunused-but-set-variable warning. */
4769 struct intel_iommu *iommu __maybe_unused;
4773 for_each_active_iommu(iommu, drhd) {
4774 for_each_active_dev_scope(drhd->devices,
4775 drhd->devices_cnt, i, dev) {
4776 struct acpi_device_physical_node *pn;
4777 struct iommu_group *group;
4778 struct acpi_device *adev;
4780 if (dev->bus != &acpi_bus_type)
4783 adev = to_acpi_device(dev);
4784 mutex_lock(&adev->physical_node_lock);
4785 list_for_each_entry(pn,
4786 &adev->physical_node_list, node) {
4787 group = iommu_group_get(pn->dev);
4789 iommu_group_put(group);
4793 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4794 ret = iommu_probe_device(pn->dev);
4798 mutex_unlock(&adev->physical_node_lock);
4808 int __init intel_iommu_init(void)
4811 struct dmar_drhd_unit *drhd;
4812 struct intel_iommu *iommu;
4815 * Intel IOMMU is required for a TXT/tboot launch or platform
4816 * opt in, so enforce that.
4818 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4820 if (iommu_init_mempool()) {
4822 panic("tboot: Failed to initialize iommu memory\n");
4826 down_write(&dmar_global_lock);
4827 if (dmar_table_init()) {
4829 panic("tboot: Failed to initialize DMAR table\n");
4833 if (dmar_dev_scope_init() < 0) {
4835 panic("tboot: Failed to initialize DMAR device scope\n");
4839 up_write(&dmar_global_lock);
4842 * The bus notifier takes the dmar_global_lock, so lockdep will
4843 * complain later when we register it under the lock.
4845 dmar_register_bus_notifier();
4847 down_write(&dmar_global_lock);
4850 intel_iommu_debugfs_init();
4852 if (no_iommu || dmar_disabled) {
4854 * We exit the function here to ensure IOMMU's remapping and
4855 * mempool aren't set up, which means that the IOMMU's PMRs
4856 * won't be disabled via the call to init_dmars(). So disable
4857 * them explicitly here. The PMRs were set up by tboot prior to
4858 * calling SENTER, but the kernel is expected to reset/tear down the PMRs.
4861 if (intel_iommu_tboot_noforce) {
4862 for_each_iommu(iommu, drhd)
4863 iommu_disable_protect_mem_regions(iommu);
4867 * Make sure the IOMMUs are switched off, even when we
4868 * boot into a kexec kernel and the previous kernel left them enabled.
4871 intel_disable_iommus();
4875 if (list_empty(&dmar_rmrr_units))
4876 pr_info("No RMRR found\n");
4878 if (list_empty(&dmar_atsr_units))
4879 pr_info("No ATSR found\n");
4881 if (dmar_init_reserved_ranges()) {
4883 panic("tboot: Failed to reserve iommu ranges\n");
4884 goto out_free_reserved_range;
4888 intel_iommu_gfx_mapped = 1;
4890 init_no_remapping_devices();
4895 panic("tboot: Failed to initialize DMARs\n");
4896 pr_err("Initialization failed\n");
4897 goto out_free_reserved_range;
4899 up_write(&dmar_global_lock);
4901 init_iommu_pm_ops();
4903 down_read(&dmar_global_lock);
4904 for_each_active_iommu(iommu, drhd) {
4905 iommu_device_sysfs_add(&iommu->iommu, NULL,
4908 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4909 iommu_device_register(&iommu->iommu);
4911 up_read(&dmar_global_lock);
4913 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4914 if (si_domain && !hw_pass_through)
4915 register_memory_notifier(&intel_iommu_memory_nb);
4916 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4917 intel_iommu_cpu_dead);
4919 down_read(&dmar_global_lock);
4920 if (probe_acpi_namespace_devices())
4921 pr_warn("ACPI name space devices didn't probe correctly\n");
4923 /* Finally, we enable the DMA remapping hardware. */
4924 for_each_iommu(iommu, drhd) {
4925 if (!drhd->ignored && !translation_pre_enabled(iommu))
4926 iommu_enable_translation(iommu);
4928 iommu_disable_protect_mem_regions(iommu);
4930 up_read(&dmar_global_lock);
4932 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4934 intel_iommu_enabled = 1;
4938 out_free_reserved_range:
4939 put_iova_domain(&reserved_iova_list);
4941 intel_iommu_free_dmars();
4942 up_write(&dmar_global_lock);
4943 iommu_exit_mempool();
4947 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4949 struct intel_iommu *iommu = opaque;
4951 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4956 * NB - intel-iommu lacks any sort of reference counting for the users of
4957 * dependent devices. If multiple endpoints have intersecting dependent
4958 * devices, unbinding the driver from any one of them will possibly leave
4959 * the others unable to operate.
4961 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4963 if (!iommu || !dev || !dev_is_pci(dev))
4966 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4969 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4971 struct dmar_domain *domain;
4972 struct intel_iommu *iommu;
4973 unsigned long flags;
4975 assert_spin_locked(&device_domain_lock);
4980 iommu = info->iommu;
4981 domain = info->domain;
4984 if (dev_is_pci(info->dev) && sm_supported(iommu))
4985 intel_pasid_tear_down_entry(iommu, info->dev,
4986 PASID_RID2PASID, false);
4988 iommu_disable_dev_iotlb(info);
4989 if (!dev_is_real_dma_subdevice(info->dev))
4990 domain_context_clear(iommu, info->dev);
4991 intel_pasid_free_table(info->dev);
4994 unlink_domain_info(info);
4996 spin_lock_irqsave(&iommu->lock, flags);
4997 domain_detach_iommu(domain, iommu);
4998 spin_unlock_irqrestore(&iommu->lock, flags);
5000 free_devinfo_mem(info);
5003 static void dmar_remove_one_dev_info(struct device *dev)
5005 struct device_domain_info *info;
5006 unsigned long flags;
5008 spin_lock_irqsave(&device_domain_lock, flags);
5009 info = get_domain_info(dev);
5011 __dmar_remove_one_dev_info(info);
5012 spin_unlock_irqrestore(&device_domain_lock, flags);
5015 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5019 /* calculate AGAW */
5020 domain->gaw = guest_width;
5021 adjust_width = guestwidth_to_adjustwidth(guest_width);
5022 domain->agaw = width_to_agaw(adjust_width);
5024 domain->iommu_coherency = 0;
5025 domain->iommu_snooping = 0;
5026 domain->iommu_superpage = 0;
5027 domain->max_addr = 0;
5029 /* always allocate the top pgd */
5030 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5033 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5037 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5039 init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5040 copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5042 if (!intel_iommu_strict &&
5043 init_iova_flush_queue(&dmar_domain->iovad,
5044 iommu_flush_iova, iova_entry_free))
5045 pr_info("iova flush queue initialization failed\n");
5048 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5050 struct dmar_domain *dmar_domain;
5051 struct iommu_domain *domain;
5054 case IOMMU_DOMAIN_DMA:
5056 case IOMMU_DOMAIN_UNMANAGED:
5057 dmar_domain = alloc_domain(0);
5059 pr_err("Can't allocate dmar_domain\n");
5062 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5063 pr_err("Domain initialization failed\n");
5064 domain_exit(dmar_domain);
5068 if (type == IOMMU_DOMAIN_DMA)
5069 intel_init_iova_domain(dmar_domain);
5071 domain_update_iommu_cap(dmar_domain);
5073 domain = &dmar_domain->domain;
5074 domain->geometry.aperture_start = 0;
5075 domain->geometry.aperture_end =
5076 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5077 domain->geometry.force_aperture = true;
5080 case IOMMU_DOMAIN_IDENTITY:
5081 return &si_domain->domain;
5089 static void intel_iommu_domain_free(struct iommu_domain *domain)
5091 if (domain != &si_domain->domain)
5092 domain_exit(to_dmar_domain(domain));
5096 * Check whether a @domain could be attached to the @dev through the
5097 * aux-domain attach/detach APIs.
5100 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5102 struct device_domain_info *info = get_domain_info(dev);
5104 return info && info->auxd_enabled &&
5105 domain->type == IOMMU_DOMAIN_UNMANAGED;
5108 static void auxiliary_link_device(struct dmar_domain *domain,
5111 struct device_domain_info *info = get_domain_info(dev);
5113 assert_spin_locked(&device_domain_lock);
5117 domain->auxd_refcnt++;
5118 list_add(&domain->auxd, &info->auxiliary_domains);
5121 static void auxiliary_unlink_device(struct dmar_domain *domain,
5124 struct device_domain_info *info = get_domain_info(dev);
5126 assert_spin_locked(&device_domain_lock);
5130 list_del(&domain->auxd);
5131 domain->auxd_refcnt--;
5133 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5134 ioasid_free(domain->default_pasid);
5137 static int aux_domain_add_dev(struct dmar_domain *domain,
5142 unsigned long flags;
5143 struct intel_iommu *iommu;
5145 iommu = device_to_iommu(dev, &bus, &devfn);
5149 if (domain->default_pasid <= 0) {
5152 /* No private data needed for the default pasid */
5153 pasid = ioasid_alloc(NULL, PASID_MIN,
5154 pci_max_pasids(to_pci_dev(dev)) - 1,
5156 if (pasid == INVALID_IOASID) {
5157 pr_err("Can't allocate default pasid\n");
5160 domain->default_pasid = pasid;
5163 spin_lock_irqsave(&device_domain_lock, flags);
5165 * iommu->lock must be held to attach domain to iommu and setup the
5166 * pasid entry for second level translation.
5168 spin_lock(&iommu->lock);
5169 ret = domain_attach_iommu(domain, iommu);
5173 /* Setup the PASID entry for mediated devices: */
5174 if (domain_use_first_level(domain))
5175 ret = domain_setup_first_level(iommu, domain, dev,
5176 domain->default_pasid);
5178 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5179 domain->default_pasid);
5182 spin_unlock(&iommu->lock);
5184 auxiliary_link_device(domain, dev);
5186 spin_unlock_irqrestore(&device_domain_lock, flags);
5191 domain_detach_iommu(domain, iommu);
5193 spin_unlock(&iommu->lock);
5194 spin_unlock_irqrestore(&device_domain_lock, flags);
5195 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5196 ioasid_free(domain->default_pasid);
5201 static void aux_domain_remove_dev(struct dmar_domain *domain,
5204 struct device_domain_info *info;
5205 struct intel_iommu *iommu;
5206 unsigned long flags;
5208 if (!is_aux_domain(dev, &domain->domain))
5211 spin_lock_irqsave(&device_domain_lock, flags);
5212 info = get_domain_info(dev);
5213 iommu = info->iommu;
5215 auxiliary_unlink_device(domain, dev);
5217 spin_lock(&iommu->lock);
5218 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5219 domain_detach_iommu(domain, iommu);
5220 spin_unlock(&iommu->lock);
5222 spin_unlock_irqrestore(&device_domain_lock, flags);
5225 static int prepare_domain_attach_device(struct iommu_domain *domain,
5228 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5229 struct intel_iommu *iommu;
5233 iommu = device_to_iommu(dev, &bus, &devfn);
5237 /* check if this iommu agaw is sufficient for max mapped address */
5238 addr_width = agaw_to_width(iommu->agaw);
5239 if (addr_width > cap_mgaw(iommu->cap))
5240 addr_width = cap_mgaw(iommu->cap);
5242 if (dmar_domain->max_addr > (1LL << addr_width)) {
5243 dev_err(dev, "%s: iommu width (%d) is not "
5244 "sufficient for the mapped address (%llx)\n",
5245 __func__, addr_width, dmar_domain->max_addr);
5248 dmar_domain->gaw = addr_width;
5251 * Knock out extra levels of page tables if necessary
5253 while (iommu->agaw < dmar_domain->agaw) {
5254 struct dma_pte *pte;
5256 pte = dmar_domain->pgd;
5257 if (dma_pte_present(pte)) {
5258 dmar_domain->pgd = (struct dma_pte *)
5259 phys_to_virt(dma_pte_addr(pte));
5260 free_pgtable_page(pte);
5262 dmar_domain->agaw--;
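/*
 * For illustration, assuming VT-d's usual 9-bits-per-level layout: agaw 2
 * corresponds to a 4-level, 48-bit page table and agaw 3 to a 5-level,
 * 57-bit one. A domain built wider than the IOMMU supports therefore has
 * its top page-table level freed (descending through the present entry)
 * until domain->agaw matches what the hardware can walk; the max_addr
 * check above ensures no existing mappings are lost in the process.
 */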
5268 static int intel_iommu_attach_device(struct iommu_domain *domain,
5273 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5274 device_is_rmrr_locked(dev)) {
5275 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5279 if (is_aux_domain(dev, domain))
5282 /* normally dev is not mapped */
5283 if (unlikely(domain_context_mapped(dev))) {
5284 struct dmar_domain *old_domain;
5286 old_domain = find_domain(dev);
5288 dmar_remove_one_dev_info(dev);
5291 ret = prepare_domain_attach_device(domain, dev);
5295 return domain_add_dev_info(to_dmar_domain(domain), dev);
5298 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5303 if (!is_aux_domain(dev, domain))
5306 ret = prepare_domain_attach_device(domain, dev);
5310 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5313 static void intel_iommu_detach_device(struct iommu_domain *domain,
5316 dmar_remove_one_dev_info(dev);
5319 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5322 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5326 * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5327 * VT-d granularity. Invalidation is typically included in the unmap operation
5328 * as a result of DMA or VFIO unmap. However, for assigned devices the guest
5329 * owns the first level page tables. Invalidations of translation caches in the
5330 * guest are trapped and passed down to the host.
5332 * vIOMMU in the guest will only expose first level page tables, therefore
5333 * we do not support IOTLB granularity for requests without PASID (second level).
5335 * For example, to find the VT-d granularity encoding for IOTLB
5336 * type and page selective granularity within PASID:
5337 * X: indexed by iommu cache type
5338 * Y: indexed by enum iommu_inv_granularity
5339 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5343 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5345 * PASID based IOTLB invalidation: PASID selective (per PASID),
5346 * page selective (address granularity)
5348 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5349 /* PASID based dev TLBs */
5350 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5352 {-EINVAL, -EINVAL, -EINVAL}
5355 static inline int to_vtd_granularity(int type, int granu)
5357 return inv_type_granu_table[type][granu];
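/*
 * Worked example, assuming the enum ordering shown in the table comment
 * above: to_vtd_granularity(IOMMU_CACHE_INV_TYPE_IOTLB, IOMMU_INV_GRANU_ADDR)
 * returns QI_GRAN_PSI_PASID (page-selective within PASID), while entries
 * marked -EINVAL, such as a domain-selective device-IOTLB flush, are
 * rejected by the caller.
 */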
5360 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5362 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5364 /* VT-d size is encoded as 2^size of 4K pages: 0 for 4KiB, 9 for 2MiB, etc.
5365 * The IOMMU cache invalidate API passes granu_size in bytes and the number
5366 * of granules of that size that are contiguous in memory.
5368 return order_base_2(nr_pages);
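/*
 * Worked example: a single 2MiB granule is 512 4KiB pages, so
 * to_vtd_size(SZ_2M, 1) yields order_base_2(512) == 9, matching the
 * "9 for 2MiB" encoding above, while a single 4KiB granule yields 0.
 */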
5371 #ifdef CONFIG_INTEL_IOMMU_SVM
5373 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5374 struct iommu_cache_invalidate_info *inv_info)
5376 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5377 struct device_domain_info *info;
5378 struct intel_iommu *iommu;
5379 unsigned long flags;
5386 if (!inv_info || !dmar_domain ||
5387 inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
5390 if (!dev || !dev_is_pci(dev))
5393 iommu = device_to_iommu(dev, &bus, &devfn);
5397 if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5400 spin_lock_irqsave(&device_domain_lock, flags);
5401 spin_lock(&iommu->lock);
5402 info = get_domain_info(dev);
5407 did = dmar_domain->iommu_did[iommu->seq_id];
5408 sid = PCI_DEVID(bus, devfn);
5410 /* Size is only valid in address selective invalidation */
5411 if (inv_info->granularity != IOMMU_INV_GRANU_PASID)
5412 size = to_vtd_size(inv_info->addr_info.granule_size,
5413 inv_info->addr_info.nb_granules);
5415 for_each_set_bit(cache_type,
5416 (unsigned long *)&inv_info->cache,
5417 IOMMU_CACHE_INV_TYPE_NR) {
5421 granu = to_vtd_granularity(cache_type, inv_info->granularity);
5422 if (granu == -EINVAL) {
5423 pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5424 cache_type, inv_info->granularity);
5429 * PASID is stored in different locations based on the granularity.
5432 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5433 (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5434 pasid = inv_info->pasid_info.pasid;
5435 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5436 (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5437 pasid = inv_info->addr_info.pasid;
5439 switch (BIT(cache_type)) {
5440 case IOMMU_CACHE_INV_TYPE_IOTLB:
5441 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5443 (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5444 pr_err_ratelimited("Address out of range, 0x%llx, size order %llu\n",
5445 inv_info->addr_info.addr, size);
5451 * If granu is PASID-selective, address is ignored.
5452 * We use npages = -1 to indicate that.
5454 qi_flush_piotlb(iommu, did, pasid,
5455 mm_to_dma_pfn(inv_info->addr_info.addr),
5456 (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5457 inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5460 * Always flush device IOTLB if ATS is enabled. vIOMMU
5461 * in the guest may assume IOTLB flush is inclusive,
5462 * which is more efficient.
5464 if (info->ats_enabled)
5465 qi_flush_dev_iotlb_pasid(iommu, sid,
5468 inv_info->addr_info.addr,
5471 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5472 if (info->ats_enabled)
5473 qi_flush_dev_iotlb_pasid(iommu, sid,
5476 inv_info->addr_info.addr,
5479 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5482 dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5488 spin_unlock(&iommu->lock);
5489 spin_unlock_irqrestore(&device_domain_lock, flags);
5495 static int intel_iommu_map(struct iommu_domain *domain,
5496 unsigned long iova, phys_addr_t hpa,
5497 size_t size, int iommu_prot, gfp_t gfp)
5499 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5504 if (iommu_prot & IOMMU_READ)
5505 prot |= DMA_PTE_READ;
5506 if (iommu_prot & IOMMU_WRITE)
5507 prot |= DMA_PTE_WRITE;
5508 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5509 prot |= DMA_PTE_SNP;
5511 max_addr = iova + size;
5512 if (dmar_domain->max_addr < max_addr) {
5515 /* check if minimum agaw is sufficient for mapped address */
5516 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5517 if (end < max_addr) {
5518 pr_err("%s: iommu width (%d) is not "
5519 "sufficient for the mapped address (%llx)\n",
5520 __func__, dmar_domain->gaw, max_addr);
5523 dmar_domain->max_addr = max_addr;
5525 /* Round up size to next multiple of PAGE_SIZE, if it and
5526 the low bits of hpa would take us onto the next page */
5527 size = aligned_nrpages(hpa, size);
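/*
 * For illustration: if hpa is 0x1000800 and size is 0x1000, the mapping
 * straddles a 4KiB boundary, so the rounding above yields two pages
 * rather than one, and both are mapped below.
 */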
5528 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5529 hpa >> VTD_PAGE_SHIFT, size, prot);
5533 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5534 unsigned long iova, size_t size,
5535 struct iommu_iotlb_gather *gather)
5537 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5538 struct page *freelist = NULL;
5539 unsigned long start_pfn, last_pfn;
5540 unsigned int npages;
5541 int iommu_id, level = 0;
5543 /* Cope with horrid API which requires us to unmap more than the
5544 size argument if it happens to be a large-page mapping. */
5545 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5547 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5548 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
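/*
 * For illustration, assuming the usual 9-bit level stride: if the IOVA
 * resolves to a 2MiB superpage PTE (level 2), size is bumped here to
 * VTD_PAGE_SIZE << 9 == 2MiB even when the caller asked to unmap only
 * 4KiB, since a large page can only be torn down as a whole.
 */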
5550 start_pfn = iova >> VTD_PAGE_SHIFT;
5551 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5553 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5555 npages = last_pfn - start_pfn + 1;
5557 for_each_domain_iommu(iommu_id, dmar_domain)
5558 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5559 start_pfn, npages, !freelist, 0);
5561 dma_free_pagelist(freelist);
5563 if (dmar_domain->max_addr == iova + size)
5564 dmar_domain->max_addr = iova;
5569 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5572 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5573 struct dma_pte *pte;
5577 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5578 if (pte && dma_pte_present(pte))
5579 phys = dma_pte_addr(pte) +
5580 (iova & (BIT_MASK(level_to_offset_bits(level) +
5581 VTD_PAGE_SHIFT) - 1));
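/*
 * Assuming the usual 9-bit level stride, the mask above keeps only the
 * in-page offset bits for the level at which the walk stopped: the low
 * 12 bits of the IOVA for a 4KiB leaf (level 1), or the low 21 bits for
 * a 2MiB superpage (level 2), added to the page-aligned address taken
 * from the PTE.
 */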
5586 static inline bool scalable_mode_support(void)
5588 struct dmar_drhd_unit *drhd;
5589 struct intel_iommu *iommu;
5593 for_each_active_iommu(iommu, drhd) {
5594 if (!sm_supported(iommu)) {
5604 static inline bool iommu_pasid_support(void)
5606 struct dmar_drhd_unit *drhd;
5607 struct intel_iommu *iommu;
5611 for_each_active_iommu(iommu, drhd) {
5612 if (!pasid_supported(iommu)) {
5622 static inline bool nested_mode_support(void)
5624 struct dmar_drhd_unit *drhd;
5625 struct intel_iommu *iommu;
5629 for_each_active_iommu(iommu, drhd) {
5630 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5640 static bool intel_iommu_capable(enum iommu_cap cap)
5642 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5643 return domain_update_iommu_snooping(NULL) == 1;
5644 if (cap == IOMMU_CAP_INTR_REMAP)
5645 return irq_remapping_enabled == 1;
5650 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5652 struct intel_iommu *iommu;
5655 iommu = device_to_iommu(dev, &bus, &devfn);
5657 return ERR_PTR(-ENODEV);
5659 if (translation_pre_enabled(iommu))
5660 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5662 return &iommu->iommu;
5665 static void intel_iommu_release_device(struct device *dev)
5667 struct intel_iommu *iommu;
5670 iommu = device_to_iommu(dev, &bus, &devfn);
5674 dmar_remove_one_dev_info(dev);
5676 set_dma_ops(dev, NULL);
5679 static void intel_iommu_probe_finalize(struct device *dev)
5681 struct iommu_domain *domain;
5683 domain = iommu_get_domain_for_dev(dev);
5684 if (device_needs_bounce(dev))
5685 set_dma_ops(dev, &bounce_dma_ops);
5686 else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5687 set_dma_ops(dev, &intel_dma_ops);
5689 set_dma_ops(dev, NULL);
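/*
 * This leaves three DMA paths for a device: devices that need bounce
 * buffering (typically untrusted ones) get bounce_dma_ops and are bounced
 * through swiotlb buffers, devices in a DMA domain get intel_dma_ops for
 * IOVA-based remapping, and everything else falls back to the kernel's
 * default direct DMA ops via set_dma_ops(dev, NULL).
 */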
5692 static void intel_iommu_get_resv_regions(struct device *device,
5693 struct list_head *head)
5695 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5696 struct iommu_resv_region *reg;
5697 struct dmar_rmrr_unit *rmrr;
5698 struct device *i_dev;
5701 down_read(&dmar_global_lock);
5702 for_each_rmrr_units(rmrr) {
5703 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5705 struct iommu_resv_region *resv;
5706 enum iommu_resv_type type;
5709 if (i_dev != device &&
5710 !is_downstream_to_pci_bridge(device, i_dev))
5713 length = rmrr->end_address - rmrr->base_address + 1;
5715 type = device_rmrr_is_relaxable(device) ?
5716 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5718 resv = iommu_alloc_resv_region(rmrr->base_address,
5719 length, prot, type);
5723 list_add_tail(&resv->list, head);
5726 up_read(&dmar_global_lock);
5728 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5729 if (dev_is_pci(device)) {
5730 struct pci_dev *pdev = to_pci_dev(device);
5732 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5733 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5734 IOMMU_RESV_DIRECT_RELAXABLE);
5736 list_add_tail(&reg->list, head);
5739 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5741 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5742 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5746 list_add_tail(&reg->list, head);
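/*
 * The IOAPIC_RANGE_START..IOAPIC_RANGE_END window registered above is the
 * x86 interrupt-message address range; exposing it as a reserved region
 * keeps the IOVA allocator from ever handing those addresses out for
 * ordinary DMA.
 */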
5749 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5751 struct device_domain_info *info;
5752 struct context_entry *context;
5753 struct dmar_domain *domain;
5754 unsigned long flags;
5758 domain = find_domain(dev);
5762 spin_lock_irqsave(&device_domain_lock, flags);
5763 spin_lock(&iommu->lock);
5766 info = get_domain_info(dev);
5767 if (!info || !info->pasid_supported)
5770 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5771 if (WARN_ON(!context))
5774 ctx_lo = context[0].lo;
5776 if (!(ctx_lo & CONTEXT_PASIDE)) {
5777 ctx_lo |= CONTEXT_PASIDE;
5778 context[0].lo = ctx_lo;
5780 iommu->flush.flush_context(iommu,
5781 domain->iommu_did[iommu->seq_id],
5782 PCI_DEVID(info->bus, info->devfn),
5783 DMA_CCMD_MASK_NOBIT,
5784 DMA_CCMD_DEVICE_INVL);
5787 /* Enable PASID support in the device, if it wasn't already */
5788 if (!info->pasid_enabled)
5789 iommu_enable_dev_iotlb(info);
5794 spin_unlock(&iommu->lock);
5795 spin_unlock_irqrestore(&device_domain_lock, flags);
5800 static void intel_iommu_apply_resv_region(struct device *dev,
5801 struct iommu_domain *domain,
5802 struct iommu_resv_region *region)
5804 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5805 unsigned long start, end;
5807 start = IOVA_PFN(region->start);
5808 end = IOVA_PFN(region->start + region->length - 1);
5810 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5813 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5815 if (dev_is_pci(dev))
5816 return pci_device_group(dev);
5817 return generic_device_group(dev);
5820 #ifdef CONFIG_INTEL_IOMMU_SVM
5821 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5823 struct intel_iommu *iommu;
5826 if (iommu_dummy(dev)) {
5828 "No IOMMU translation for device; cannot enable SVM\n");
5832 iommu = device_to_iommu(dev, &bus, &devfn);
5834 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5840 #endif /* CONFIG_INTEL_IOMMU_SVM */
5842 static int intel_iommu_enable_auxd(struct device *dev)
5844 struct device_domain_info *info;
5845 struct intel_iommu *iommu;
5846 unsigned long flags;
5850 iommu = device_to_iommu(dev, &bus, &devfn);
5851 if (!iommu || dmar_disabled)
5854 if (!sm_supported(iommu) || !pasid_supported(iommu))
5857 ret = intel_iommu_enable_pasid(iommu, dev);
5861 spin_lock_irqsave(&device_domain_lock, flags);
5862 info = get_domain_info(dev);
5863 info->auxd_enabled = 1;
5864 spin_unlock_irqrestore(&device_domain_lock, flags);
5869 static int intel_iommu_disable_auxd(struct device *dev)
5871 struct device_domain_info *info;
5872 unsigned long flags;
5874 spin_lock_irqsave(&device_domain_lock, flags);
5875 info = get_domain_info(dev);
5876 if (!WARN_ON(!info))
5877 info->auxd_enabled = 0;
5878 spin_unlock_irqrestore(&device_domain_lock, flags);
5884 * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5885 * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5886 * spec so that system software and tools can detect endpoint devices that
5887 * support Intel Scalable I/O Virtualization without a host driver dependency.
5889 * Returns the address of the matching extended capability structure within
5890 * the device's PCI configuration space, or 0 if the device does not support it.
5893 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5898 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5900 pci_read_config_word(pdev, pos + 4, &vendor);
5901 pci_read_config_word(pdev, pos + 8, &id);
5902 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5905 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
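/*
 * The loop above walks every extended capability with ID 0x23 (DVSEC); for
 * each instance, the word at offset +4 holds the DVSEC vendor ID and the
 * word at offset +8 the DVSEC ID, so a hit means an Intel DVSEC with ID 5,
 * i.e. the Scalable IOV capability described in the comment above.
 */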
5912 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5914 if (feat == IOMMU_DEV_FEAT_AUX) {
5917 if (!dev_is_pci(dev) || dmar_disabled ||
5918 !scalable_mode_support() || !iommu_pasid_support())
5921 ret = pci_pasid_features(to_pci_dev(dev));
5925 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5928 if (feat == IOMMU_DEV_FEAT_SVA) {
5929 struct device_domain_info *info = get_domain_info(dev);
5931 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5932 info->pasid_supported && info->pri_supported &&
5933 info->ats_supported;
5940 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5942 if (feat == IOMMU_DEV_FEAT_AUX)
5943 return intel_iommu_enable_auxd(dev);
5945 if (feat == IOMMU_DEV_FEAT_SVA) {
5946 struct device_domain_info *info = get_domain_info(dev);
5951 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5959 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5961 if (feat == IOMMU_DEV_FEAT_AUX)
5962 return intel_iommu_disable_auxd(dev);
5968 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5970 struct device_domain_info *info = get_domain_info(dev);
5972 if (feat == IOMMU_DEV_FEAT_AUX)
5973 return scalable_mode_support() && info && info->auxd_enabled;
5979 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5981 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5983 return dmar_domain->default_pasid > 0 ?
5984 dmar_domain->default_pasid : -EINVAL;
5987 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5990 return attach_deferred(dev);
5994 intel_iommu_domain_set_attr(struct iommu_domain *domain,
5995 enum iommu_attr attr, void *data)
5997 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5998 unsigned long flags;
6001 if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6005 case DOMAIN_ATTR_NESTING:
6006 spin_lock_irqsave(&device_domain_lock, flags);
6007 if (nested_mode_support() &&
6008 list_empty(&dmar_domain->devices)) {
6009 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6010 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6014 spin_unlock_irqrestore(&device_domain_lock, flags);
6024 const struct iommu_ops intel_iommu_ops = {
6025 .capable = intel_iommu_capable,
6026 .domain_alloc = intel_iommu_domain_alloc,
6027 .domain_free = intel_iommu_domain_free,
6028 .domain_set_attr = intel_iommu_domain_set_attr,
6029 .attach_dev = intel_iommu_attach_device,
6030 .detach_dev = intel_iommu_detach_device,
6031 .aux_attach_dev = intel_iommu_aux_attach_device,
6032 .aux_detach_dev = intel_iommu_aux_detach_device,
6033 .aux_get_pasid = intel_iommu_aux_get_pasid,
6034 .map = intel_iommu_map,
6035 .unmap = intel_iommu_unmap,
6036 .iova_to_phys = intel_iommu_iova_to_phys,
6037 .probe_device = intel_iommu_probe_device,
6038 .probe_finalize = intel_iommu_probe_finalize,
6039 .release_device = intel_iommu_release_device,
6040 .get_resv_regions = intel_iommu_get_resv_regions,
6041 .put_resv_regions = generic_iommu_put_resv_regions,
6042 .apply_resv_region = intel_iommu_apply_resv_region,
6043 .device_group = intel_iommu_device_group,
6044 .dev_has_feat = intel_iommu_dev_has_feat,
6045 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
6046 .dev_enable_feat = intel_iommu_dev_enable_feat,
6047 .dev_disable_feat = intel_iommu_dev_disable_feat,
6048 .is_attach_deferred = intel_iommu_is_attach_deferred,
6049 .def_domain_type = device_def_domain_type,
6050 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
6051 #ifdef CONFIG_INTEL_IOMMU_SVM
6052 .cache_invalidate = intel_iommu_sva_invalidate,
6053 .sva_bind_gpasid = intel_svm_bind_gpasid,
6054 .sva_unbind_gpasid = intel_svm_unbind_gpasid,
6055 .sva_bind = intel_svm_bind,
6056 .sva_unbind = intel_svm_unbind,
6057 .sva_get_pasid = intel_svm_get_pasid,
6061 static void quirk_iommu_igfx(struct pci_dev *dev)
6063 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6067 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6068 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6069 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6070 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6071 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6072 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6073 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6074 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6076 /* Broadwell igfx malfunctions with dmar */
6077 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6078 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6079 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6080 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6081 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6082 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6083 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6084 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6085 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6086 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6087 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6088 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6089 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6090 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6091 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6092 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6093 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6094 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6095 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6096 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6097 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6098 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6099 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6100 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6102 static void quirk_iommu_rwbf(struct pci_dev *dev)
6105 * Mobile 4 Series Chipset neglects to set RWBF capability,
6106 * but needs it. Same seems to hold for the desktop versions.
6108 pci_info(dev, "Forcing write-buffer flush capability\n");
6112 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6113 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6114 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6115 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6116 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6117 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6118 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6121 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
6122 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
6123 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
6124 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
6125 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
6126 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
6127 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
6128 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
6130 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6134 if (pci_read_config_word(dev, GGC, &ggc))
6137 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6138 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6140 } else if (dmar_map_gfx) {
6141 /* we have to ensure the gfx device is idle before we flush */
6142 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6143 intel_iommu_strict = 1;
6146 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6147 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6148 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6149 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6151 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6152 ISOCH DMAR unit for the Azalia sound device, but not give it any
6153 TLB entries, which causes it to deadlock. Check for that. We do
6154 this in a function called from init_dmars(), instead of in a PCI
6155 quirk, because we don't want to print the obnoxious "BIOS broken"
6156 message if VT-d is actually disabled.
6158 static void __init check_tylersburg_isoch(void)
6160 struct pci_dev *pdev;
6161 uint32_t vtisochctrl;
6163 /* If there's no Azalia in the system anyway, forget it. */
6164 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6169 /* System Management Registers. Might be hidden, in which case
6170 we can't do the sanity check. But that's OK, because the
6171 known-broken BIOSes _don't_ actually hide it, so far. */
6172 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6176 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6183 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6184 if (vtisochctrl & 1)
6187 /* Drop all bits other than the number of TLB entries */
6188 vtisochctrl &= 0x1c;
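/*
 * After the mask above, vtisochctrl is simply the number of TLB entries
 * granted to the isoch DMAR unit: 16 (0x10) is the recommended allocation
 * checked for below, and 0 is the broken "enabled but no TLB entries"
 * configuration that triggers the WARN.
 */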
6190 /* If we have the recommended number of TLB entries (16), fine. */
6191 if (vtisochctrl == 0x10)
6194 /* Zero TLB entries? You get to ride the short bus to school. */
6196 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6197 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6198 dmi_get_system_info(DMI_BIOS_VENDOR),
6199 dmi_get_system_info(DMI_BIOS_VERSION),
6200 dmi_get_system_info(DMI_PRODUCT_VERSION));
6201 iommu_identity_mapping |= IDENTMAP_AZALIA;
6205 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",