1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
50 #include "../irq_remapping.h"
51 #include "intel-pasid.h"
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
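/*
 * For example, with a 48-bit guest address width (gaw == 48),
 * __DOMAIN_MAX_PFN(48) is (1ULL << 36) - 1 and DOMAIN_MAX_ADDR(48) is
 * 0xfffffffff000, the last 4KiB page below 2^48.  On 32-bit kernels
 * DOMAIN_MAX_PFN() additionally clamps the value to ULONG_MAX.
 */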
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
89 * This bitmap is used to advertise the page sizes our hardware supports
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes that we support.
94 * Traditionally the IOMMU core just handed us the mappings directly,
95 * after making sure the size is an order of a 4KiB page and that the
96 * mapping has natural alignment.
98 * To retain this behavior, we currently advertise that we support
99 * all page sizes that are an order of 4KiB.
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
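/*
 * ~0xFFFUL sets every bit from bit 12 upwards, so this advertises 4KiB,
 * 8KiB, 16KiB, ... -- every power-of-two size that is a multiple of 4KiB,
 * matching the "order of 4KiB" behaviour described above (the IOMMU core
 * treats a set bit N as a supported page size of 2^N bytes).
 */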
106 static inline int agaw_to_level(int agaw)
111 static inline int agaw_to_width(int agaw)
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 static inline int width_to_agaw(int width)
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
121 static inline unsigned int level_to_offset_bits(int level)
123 return (level - 1) * LEVEL_STRIDE;
126 static inline int pfn_level_offset(unsigned long pfn, int level)
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 static inline unsigned long level_mask(int level)
133 return -1UL << level_to_offset_bits(level);
136 static inline unsigned long level_size(int level)
138 return 1UL << level_to_offset_bits(level);
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
143 return (pfn + level_size(level) - 1) & level_mask(level);
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
148 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
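/*
 * Worked example for the helpers above, assuming 4KiB VT-d pages and the
 * 9-bit stride: a 48-bit address width gives width_to_agaw(48) == 2 and
 * agaw_to_width(2) == 48, i.e. 4-level paging.  level_to_offset_bits()
 * is then 0/9/18/27 for levels 1..4, level_size(2) == 512 pages (2MiB),
 * and pfn_level_offset(pfn, 2) picks bits 9..17 of the pfn.
 */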
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
164 return mm_to_dma_pfn(page_to_pfn(pg));
166 static inline unsigned long virt_to_dma_pfn(void *p)
168 return page_to_dma_pfn(virt_to_page(p));
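/*
 * With 4KiB MM pages (PAGE_SHIFT == VTD_PAGE_SHIFT == 12) the pfn
 * conversions above are no-ops.  If an architecture used, say, 64KiB MM
 * pages, mm_to_dma_pfn() would shift left by 4 so each MM pfn expands to
 * sixteen 4KiB VT-d pfns -- hence the rule above that VT-d pages must
 * always be _smaller_ than MM pages.
 */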
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
178 * set to 1 to panic the kernel if VT-d cannot be successfully enabled
179 * (used when kernel is launched w/ TXT)
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
196 return re->lo & VTD_PAGE_MASK;
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
208 return re->hi & VTD_PAGE_MASK;
211 static inline void context_clear_pasid_enable(struct context_entry *context)
213 context->lo &= ~(1ULL << 11);
216 static inline bool context_pasid_enabled(struct context_entry *context)
218 return !!(context->lo & (1ULL << 11));
221 static inline void context_set_copied(struct context_entry *context)
223 context->hi |= (1ull << 3);
226 static inline bool context_copied(struct context_entry *context)
228 return !!(context->hi & (1ULL << 3));
231 static inline bool __context_present(struct context_entry *context)
233 return (context->lo & 1);
236 bool context_present(struct context_entry *context)
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
243 static inline void context_set_present(struct context_entry *context)
248 static inline void context_set_fault_enable(struct context_entry *context)
250 context->lo &= (((u64)-1) << 2) | 1;
253 static inline void context_set_translation_type(struct context_entry *context,
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
260 static inline void context_set_address_root(struct context_entry *context,
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
267 static inline void context_set_address_width(struct context_entry *context,
270 context->hi |= value & 7;
273 static inline void context_set_domain_id(struct context_entry *context,
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
279 static inline int context_domain_id(struct context_entry *c)
281 return (c->hi >> 8) & 0xffff;
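/*
 * Rough map of the legacy context-entry bits touched by the helpers
 * above (the VT-d specification is authoritative):
 *
 *	lo[0]      present
 *	lo[1]      fault processing disable (cleared by context_set_fault_enable)
 *	lo[3:2]    translation type
 *	lo[11]     PASID enable
 *	lo[63:12]  address space root, i.e. the page-table pointer
 *	hi[2:0]    address width (AGAW)
 *	hi[3]      "copied" marker used during kdump handover
 *	hi[23:8]   domain id
 */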
284 static inline void context_clear_entry(struct context_entry *context)
291 * This domain is a statically identity mapping domain.
292 * 1. This domain creates a static 1:1 mapping to all usable memory.
293 * 2. It maps to each iommu if successful.
294 * 3. Each iommu maps to this domain if successful.
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
299 #define for_each_domain_iommu(idx, domain) \
300 for (idx = 0; idx < g_num_of_iommus; idx++) \
301 if (domain->iommu_refcnt[idx])
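/*
 * Typical use, mirroring domain_mapping() further down: iterate only the
 * iommus this domain is currently attached to, e.g.
 *
 *	for_each_domain_iommu(idx, domain)
 *		__mapping_notify_one(g_iommus[idx], domain, iov_pfn, nr_pages);
 */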
303 struct dmar_rmrr_unit {
304 struct list_head list; /* list of rmrr units */
305 struct acpi_dmar_header *hdr; /* ACPI header */
306 u64 base_address; /* reserved base address*/
307 u64 end_address; /* reserved end address */
308 struct dmar_dev_scope *devices; /* target devices */
309 int devices_cnt; /* target device count */
312 struct dmar_atsr_unit {
313 struct list_head list; /* list of ATSR units */
314 struct acpi_dmar_header *hdr; /* ACPI header */
315 struct dmar_dev_scope *devices; /* target devices */
316 int devices_cnt; /* target device count */
317 u8 include_all:1; /* include all ports */
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
323 #define for_each_rmrr_units(rmrr) \
324 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
326 /* bitmap for indexing intel_iommus */
327 static int g_num_of_iommus;
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int intel_no_bounce;
360 #define IDENTMAP_GFX 2
361 #define IDENTMAP_AZALIA 4
363 int intel_iommu_gfx_mapped;
364 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
366 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
367 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
368 struct device_domain_info *get_domain_info(struct device *dev)
370 struct device_domain_info *info;
375 info = dev->archdata.iommu;
376 if (unlikely(info == DUMMY_DEVICE_DOMAIN_INFO ||
377 info == DEFER_DEVICE_DOMAIN_INFO))
383 DEFINE_SPINLOCK(device_domain_lock);
384 static LIST_HEAD(device_domain_list);
386 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
387 to_pci_dev(d)->untrusted)
390 * Iterate over elements in device_domain_list and call the specified
391 * callback @fn against each element.
393 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
394 void *data), void *data)
398 struct device_domain_info *info;
400 spin_lock_irqsave(&device_domain_lock, flags);
401 list_for_each_entry(info, &device_domain_list, global) {
402 ret = fn(info, data);
404 spin_unlock_irqrestore(&device_domain_lock, flags);
408 spin_unlock_irqrestore(&device_domain_lock, flags);
413 const struct iommu_ops intel_iommu_ops;
415 static bool translation_pre_enabled(struct intel_iommu *iommu)
417 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
420 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
422 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
425 static void init_translation_status(struct intel_iommu *iommu)
429 gsts = readl(iommu->reg + DMAR_GSTS_REG);
430 if (gsts & DMA_GSTS_TES)
431 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
434 static int __init intel_iommu_setup(char *str)
439 if (!strncmp(str, "on", 2)) {
441 pr_info("IOMMU enabled\n");
442 } else if (!strncmp(str, "off", 3)) {
444 no_platform_optin = 1;
445 pr_info("IOMMU disabled\n");
446 } else if (!strncmp(str, "igfx_off", 8)) {
448 pr_info("Disable GFX device mapping\n");
449 } else if (!strncmp(str, "forcedac", 8)) {
450 pr_info("Forcing DAC for PCI devices\n");
452 } else if (!strncmp(str, "strict", 6)) {
453 pr_info("Disable batched IOTLB flush\n");
454 intel_iommu_strict = 1;
455 } else if (!strncmp(str, "sp_off", 6)) {
456 pr_info("Disable supported super page\n");
457 intel_iommu_superpage = 0;
458 } else if (!strncmp(str, "sm_on", 5)) {
459 pr_info("Intel-IOMMU: scalable mode supported\n");
461 } else if (!strncmp(str, "tboot_noforce", 13)) {
462 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
463 intel_iommu_tboot_noforce = 1;
464 } else if (!strncmp(str, "nobounce", 8)) {
465 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
469 str += strcspn(str, ",");
475 __setup("intel_iommu=", intel_iommu_setup);
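/*
 * Options above are comma separated on the kernel command line, e.g.
 * "intel_iommu=on,sm_on,strict" enables the IOMMU, scalable mode and
 * strict (unbatched) IOTLB flushing.
 */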
477 static struct kmem_cache *iommu_domain_cache;
478 static struct kmem_cache *iommu_devinfo_cache;
480 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
482 struct dmar_domain **domains;
485 domains = iommu->domains[idx];
489 return domains[did & 0xff];
492 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
493 struct dmar_domain *domain)
495 struct dmar_domain **domains;
498 if (!iommu->domains[idx]) {
499 size_t size = 256 * sizeof(struct dmar_domain *);
500 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
503 domains = iommu->domains[idx];
504 if (WARN_ON(!domains))
507 domains[did & 0xff] = domain;
510 void *alloc_pgtable_page(int node)
515 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
517 vaddr = page_address(page);
521 void free_pgtable_page(void *vaddr)
523 free_page((unsigned long)vaddr);
526 static inline void *alloc_domain_mem(void)
528 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
531 static void free_domain_mem(void *vaddr)
533 kmem_cache_free(iommu_domain_cache, vaddr);
536 static inline void *alloc_devinfo_mem(void)
538 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
541 static inline void free_devinfo_mem(void *vaddr)
543 kmem_cache_free(iommu_devinfo_cache, vaddr);
546 static inline int domain_type_is_si(struct dmar_domain *domain)
548 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
551 static inline bool domain_use_first_level(struct dmar_domain *domain)
553 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
556 static inline int domain_pfn_supported(struct dmar_domain *domain,
559 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
561 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
564 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
569 sagaw = cap_sagaw(iommu->cap);
570 for (agaw = width_to_agaw(max_gaw);
572 if (test_bit(agaw, &sagaw))
580 * Calculate max SAGAW for each iommu.
582 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
584 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
588 * Calculate agaw for each iommu.
589 * "SAGAW" may be different across iommus; use a default agaw, and
590 * fall back to a lesser supported agaw for iommus that don't support the default agaw.
592 int iommu_calculate_agaw(struct intel_iommu *iommu)
594 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
597 /* This function only returns a single iommu in a domain */
598 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
602 /* si_domain and vm domain should not get here. */
603 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
606 for_each_domain_iommu(iommu_id, domain)
609 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
612 return g_iommus[iommu_id];
615 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
617 return sm_supported(iommu) ?
618 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
621 static void domain_update_iommu_coherency(struct dmar_domain *domain)
623 struct dmar_drhd_unit *drhd;
624 struct intel_iommu *iommu;
628 domain->iommu_coherency = 1;
630 for_each_domain_iommu(i, domain) {
632 if (!iommu_paging_structure_coherency(g_iommus[i])) {
633 domain->iommu_coherency = 0;
640 /* No hardware attached; use lowest common denominator */
642 for_each_active_iommu(iommu, drhd) {
643 if (!iommu_paging_structure_coherency(iommu)) {
644 domain->iommu_coherency = 0;
651 static int domain_update_iommu_snooping(struct intel_iommu *skip)
653 struct dmar_drhd_unit *drhd;
654 struct intel_iommu *iommu;
658 for_each_active_iommu(iommu, drhd) {
660 if (!ecap_sc_support(iommu->ecap)) {
671 static int domain_update_iommu_superpage(struct dmar_domain *domain,
672 struct intel_iommu *skip)
674 struct dmar_drhd_unit *drhd;
675 struct intel_iommu *iommu;
678 if (!intel_iommu_superpage) {
682 /* set iommu_superpage to the smallest common denominator */
684 for_each_active_iommu(iommu, drhd) {
686 if (domain && domain_use_first_level(domain)) {
687 if (!cap_fl1gp_support(iommu->cap))
690 mask &= cap_super_page_val(iommu->cap);
702 /* Some capabilities may be different across iommus */
703 static void domain_update_iommu_cap(struct dmar_domain *domain)
705 domain_update_iommu_coherency(domain);
706 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
707 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
710 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
713 struct root_entry *root = &iommu->root_entry[bus];
714 struct context_entry *context;
718 if (sm_supported(iommu)) {
726 context = phys_to_virt(*entry & VTD_PAGE_MASK);
728 unsigned long phy_addr;
732 context = alloc_pgtable_page(iommu->node);
736 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
737 phy_addr = virt_to_phys((void *)context);
738 *entry = phy_addr | 1;
739 __iommu_flush_cache(iommu, entry, sizeof(*entry));
741 return &context[devfn];
744 static int iommu_dummy(struct device *dev)
746 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
749 static bool attach_deferred(struct device *dev)
751 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
755 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
756 * sub-hierarchy of a candidate PCI-PCI bridge
757 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
758 * @bridge: the candidate PCI-PCI bridge
760 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
763 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
765 struct pci_dev *pdev, *pbridge;
767 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
770 pdev = to_pci_dev(dev);
771 pbridge = to_pci_dev(bridge);
773 if (pbridge->subordinate &&
774 pbridge->subordinate->number <= pdev->bus->number &&
775 pbridge->subordinate->busn_res.end >= pdev->bus->number)
781 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
783 struct dmar_drhd_unit *drhd = NULL;
784 struct intel_iommu *iommu;
786 struct pci_dev *pdev = NULL;
790 if (iommu_dummy(dev))
793 if (dev_is_pci(dev)) {
794 struct pci_dev *pf_pdev;
796 pdev = pci_real_dma_dev(to_pci_dev(dev));
798 /* VFs aren't listed in scope tables; we need to look up
799 * the PF instead to find the IOMMU. */
800 pf_pdev = pci_physfn(pdev);
802 segment = pci_domain_nr(pdev->bus);
803 } else if (has_acpi_companion(dev))
804 dev = &ACPI_COMPANION(dev)->dev;
807 for_each_active_iommu(iommu, drhd) {
808 if (pdev && segment != drhd->segment)
811 for_each_active_dev_scope(drhd->devices,
812 drhd->devices_cnt, i, tmp) {
814 /* For a VF use its original BDF# not that of the PF
815 * which we used for the IOMMU lookup. Strictly speaking
816 * we could do this for all PCI devices; we only need to
817 * get the BDF# from the scope table for ACPI matches. */
818 if (pdev && pdev->is_virtfn)
821 *bus = drhd->devices[i].bus;
822 *devfn = drhd->devices[i].devfn;
826 if (is_downstream_to_pci_bridge(dev, tmp))
830 if (pdev && drhd->include_all) {
832 *bus = pdev->bus->number;
833 *devfn = pdev->devfn;
844 static void domain_flush_cache(struct dmar_domain *domain,
845 void *addr, int size)
847 if (!domain->iommu_coherency)
848 clflush_cache_range(addr, size);
851 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
853 struct context_entry *context;
857 spin_lock_irqsave(&iommu->lock, flags);
858 context = iommu_context_addr(iommu, bus, devfn, 0);
860 ret = context_present(context);
861 spin_unlock_irqrestore(&iommu->lock, flags);
865 static void free_context_table(struct intel_iommu *iommu)
869 struct context_entry *context;
871 spin_lock_irqsave(&iommu->lock, flags);
872 if (!iommu->root_entry) {
875 for (i = 0; i < ROOT_ENTRY_NR; i++) {
876 context = iommu_context_addr(iommu, i, 0, 0);
878 free_pgtable_page(context);
880 if (!sm_supported(iommu))
883 context = iommu_context_addr(iommu, i, 0x80, 0);
885 free_pgtable_page(context);
888 free_pgtable_page(iommu->root_entry);
889 iommu->root_entry = NULL;
891 spin_unlock_irqrestore(&iommu->lock, flags);
894 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
895 unsigned long pfn, int *target_level)
897 struct dma_pte *parent, *pte;
898 int level = agaw_to_level(domain->agaw);
901 BUG_ON(!domain->pgd);
903 if (!domain_pfn_supported(domain, pfn))
904 /* Address beyond IOMMU's addressing capabilities. */
907 parent = domain->pgd;
912 offset = pfn_level_offset(pfn, level);
913 pte = &parent[offset];
914 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
916 if (level == *target_level)
919 if (!dma_pte_present(pte)) {
922 tmp_page = alloc_pgtable_page(domain->nid);
927 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
928 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
929 if (domain_use_first_level(domain))
930 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
931 if (cmpxchg64(&pte->val, 0ULL, pteval))
932 /* Someone else set it while we were thinking; use theirs. */
933 free_pgtable_page(tmp_page);
935 domain_flush_cache(domain, pte, sizeof(*pte));
940 parent = phys_to_virt(dma_pte_addr(pte));
945 *target_level = level;
950 /* return address's pte at specific level */
951 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
953 int level, int *large_page)
955 struct dma_pte *parent, *pte;
956 int total = agaw_to_level(domain->agaw);
959 parent = domain->pgd;
960 while (level <= total) {
961 offset = pfn_level_offset(pfn, total);
962 pte = &parent[offset];
966 if (!dma_pte_present(pte)) {
971 if (dma_pte_superpage(pte)) {
976 parent = phys_to_virt(dma_pte_addr(pte));
982 /* clear last level pte, a tlb flush should follow */
983 static void dma_pte_clear_range(struct dmar_domain *domain,
984 unsigned long start_pfn,
985 unsigned long last_pfn)
987 unsigned int large_page;
988 struct dma_pte *first_pte, *pte;
990 BUG_ON(!domain_pfn_supported(domain, start_pfn));
991 BUG_ON(!domain_pfn_supported(domain, last_pfn));
992 BUG_ON(start_pfn > last_pfn);
994 /* we don't need lock here; nobody else touches the iova range */
997 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
999 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1004 start_pfn += lvl_to_nr_pages(large_page);
1006 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1008 domain_flush_cache(domain, first_pte,
1009 (void *)pte - (void *)first_pte);
1011 } while (start_pfn && start_pfn <= last_pfn);
1014 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1015 int retain_level, struct dma_pte *pte,
1016 unsigned long pfn, unsigned long start_pfn,
1017 unsigned long last_pfn)
1019 pfn = max(start_pfn, pfn);
1020 pte = &pte[pfn_level_offset(pfn, level)];
1023 unsigned long level_pfn;
1024 struct dma_pte *level_pte;
1026 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1029 level_pfn = pfn & level_mask(level);
1030 level_pte = phys_to_virt(dma_pte_addr(pte));
1033 dma_pte_free_level(domain, level - 1, retain_level,
1034 level_pte, level_pfn, start_pfn,
1039 * Free the page table if we're below the level we want to
1040 * retain and the range covers the entire table.
1042 if (level < retain_level && !(start_pfn > level_pfn ||
1043 last_pfn < level_pfn + level_size(level) - 1)) {
1045 domain_flush_cache(domain, pte, sizeof(*pte));
1046 free_pgtable_page(level_pte);
1049 pfn += level_size(level);
1050 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1054 * clear last level (leaf) ptes and free page table pages below the
1055 * level we wish to keep intact.
1057 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1058 unsigned long start_pfn,
1059 unsigned long last_pfn,
1062 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1063 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1064 BUG_ON(start_pfn > last_pfn);
1066 dma_pte_clear_range(domain, start_pfn, last_pfn);
1068 /* We don't need lock here; nobody else touches the iova range */
1069 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1070 domain->pgd, 0, start_pfn, last_pfn);
1073 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1074 free_pgtable_page(domain->pgd);
1079 /* When a page at a given level is being unlinked from its parent, we don't
1080 need to *modify* it at all. All we need to do is make a list of all the
1081 pages which can be freed just as soon as we've flushed the IOTLB and we
1082 know the hardware page-walk will no longer touch them.
1083 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1085 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1086 int level, struct dma_pte *pte,
1087 struct page *freelist)
1091 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1092 pg->freelist = freelist;
1098 pte = page_address(pg);
1100 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1101 freelist = dma_pte_list_pagetables(domain, level - 1,
1104 } while (!first_pte_in_page(pte));
1109 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1110 struct dma_pte *pte, unsigned long pfn,
1111 unsigned long start_pfn,
1112 unsigned long last_pfn,
1113 struct page *freelist)
1115 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1117 pfn = max(start_pfn, pfn);
1118 pte = &pte[pfn_level_offset(pfn, level)];
1121 unsigned long level_pfn;
1123 if (!dma_pte_present(pte))
1126 level_pfn = pfn & level_mask(level);
1128 /* If range covers entire pagetable, free it */
1129 if (start_pfn <= level_pfn &&
1130 last_pfn >= level_pfn + level_size(level) - 1) {
1131 /* These subordinate page tables are going away entirely. Don't
1132 bother to clear them; we're just going to *free* them. */
1133 if (level > 1 && !dma_pte_superpage(pte))
1134 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1140 } else if (level > 1) {
1141 /* Recurse down into a level that isn't *entirely* obsolete */
1142 freelist = dma_pte_clear_level(domain, level - 1,
1143 phys_to_virt(dma_pte_addr(pte)),
1144 level_pfn, start_pfn, last_pfn,
1148 pfn += level_size(level);
1149 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1152 domain_flush_cache(domain, first_pte,
1153 (void *)++last_pte - (void *)first_pte);
1158 /* We can't just free the pages because the IOMMU may still be walking
1159 the page tables, and may have cached the intermediate levels. The
1160 pages can only be freed after the IOTLB flush has been done. */
1161 static struct page *domain_unmap(struct dmar_domain *domain,
1162 unsigned long start_pfn,
1163 unsigned long last_pfn)
1165 struct page *freelist;
1167 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1168 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1169 BUG_ON(start_pfn > last_pfn);
1171 /* we don't need lock here; nobody else touches the iova range */
1172 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1173 domain->pgd, 0, start_pfn, last_pfn, NULL);
1176 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1177 struct page *pgd_page = virt_to_page(domain->pgd);
1178 pgd_page->freelist = freelist;
1179 freelist = pgd_page;
1187 static void dma_free_pagelist(struct page *freelist)
1191 while ((pg = freelist)) {
1192 freelist = pg->freelist;
1193 free_pgtable_page(page_address(pg));
1197 static void iova_entry_free(unsigned long data)
1199 struct page *freelist = (struct page *)data;
1201 dma_free_pagelist(freelist);
1204 /* iommu handling */
1205 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1207 struct root_entry *root;
1208 unsigned long flags;
1210 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1212 pr_err("Allocating root entry for %s failed\n",
1217 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1219 spin_lock_irqsave(&iommu->lock, flags);
1220 iommu->root_entry = root;
1221 spin_unlock_irqrestore(&iommu->lock, flags);
1226 static void iommu_set_root_entry(struct intel_iommu *iommu)
1232 addr = virt_to_phys(iommu->root_entry);
1233 if (sm_supported(iommu))
1234 addr |= DMA_RTADDR_SMT;
1236 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1237 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1239 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1241 /* Make sure hardware completes it */
1242 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1243 readl, (sts & DMA_GSTS_RTPS), sts);
1245 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1248 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1253 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1256 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1257 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1259 /* Make sure hardware completes it */
1260 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1261 readl, (!(val & DMA_GSTS_WBFS)), val);
1263 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1266 /* return value determines if we need a write buffer flush */
1267 static void __iommu_flush_context(struct intel_iommu *iommu,
1268 u16 did, u16 source_id, u8 function_mask,
1275 case DMA_CCMD_GLOBAL_INVL:
1276 val = DMA_CCMD_GLOBAL_INVL;
1278 case DMA_CCMD_DOMAIN_INVL:
1279 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1281 case DMA_CCMD_DEVICE_INVL:
1282 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1283 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1288 val |= DMA_CCMD_ICC;
1290 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1291 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1293 /* Make sure hardware completes it */
1294 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1295 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1297 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1300 /* return value determines if we need a write buffer flush */
1301 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1302 u64 addr, unsigned int size_order, u64 type)
1304 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1305 u64 val = 0, val_iva = 0;
1309 case DMA_TLB_GLOBAL_FLUSH:
1310 /* global flush doesn't need to set IVA_REG */
1311 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1313 case DMA_TLB_DSI_FLUSH:
1314 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1316 case DMA_TLB_PSI_FLUSH:
1317 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1318 /* IH bit is passed in as part of address */
1319 val_iva = size_order | addr;
1324 /* Note: set drain read/write */
1327 * This is probably meant to be extra secure. Looks like we can
1328 * ignore it without any impact.
1330 if (cap_read_drain(iommu->cap))
1331 val |= DMA_TLB_READ_DRAIN;
1333 if (cap_write_drain(iommu->cap))
1334 val |= DMA_TLB_WRITE_DRAIN;
1336 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1337 /* Note: Only uses first TLB reg currently */
1339 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1340 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1342 /* Make sure hardware completes it */
1343 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1344 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1346 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1348 /* check IOTLB invalidation granularity */
1349 if (DMA_TLB_IAIG(val) == 0)
1350 pr_err("Flush IOTLB failed\n");
1351 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1352 pr_debug("TLB flush request %Lx, actual %Lx\n",
1353 (unsigned long long)DMA_TLB_IIRG(type),
1354 (unsigned long long)DMA_TLB_IAIG(val));
1357 static struct device_domain_info *
1358 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1361 struct device_domain_info *info;
1363 assert_spin_locked(&device_domain_lock);
1368 list_for_each_entry(info, &domain->devices, link)
1369 if (info->iommu == iommu && info->bus == bus &&
1370 info->devfn == devfn) {
1371 if (info->ats_supported && info->dev)
1379 static void domain_update_iotlb(struct dmar_domain *domain)
1381 struct device_domain_info *info;
1382 bool has_iotlb_device = false;
1384 assert_spin_locked(&device_domain_lock);
1386 list_for_each_entry(info, &domain->devices, link) {
1387 struct pci_dev *pdev;
1389 if (!info->dev || !dev_is_pci(info->dev))
1392 pdev = to_pci_dev(info->dev);
1393 if (pdev->ats_enabled) {
1394 has_iotlb_device = true;
1399 domain->has_iotlb_device = has_iotlb_device;
1402 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1404 struct pci_dev *pdev;
1406 assert_spin_locked(&device_domain_lock);
1408 if (!info || !dev_is_pci(info->dev))
1411 pdev = to_pci_dev(info->dev);
1412 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1413 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1414 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1415 * reserved, which should be set to 0.
1417 if (!ecap_dit(info->iommu->ecap))
1420 struct pci_dev *pf_pdev;
1422 /* pdev will be returned if device is not a vf */
1423 pf_pdev = pci_physfn(pdev);
1424 info->pfsid = pci_dev_id(pf_pdev);
1427 #ifdef CONFIG_INTEL_IOMMU_SVM
1428 /* The PCIe spec, in its wisdom, declares that the behaviour of
1429 the device if you enable PASID support after ATS support is
1430 undefined. So always enable PASID support on devices which
1431 have it, even if we can't yet know if we're ever going to
1433 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1434 info->pasid_enabled = 1;
1436 if (info->pri_supported &&
1437 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1438 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1439 info->pri_enabled = 1;
1441 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1442 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1443 info->ats_enabled = 1;
1444 domain_update_iotlb(info->domain);
1445 info->ats_qdep = pci_ats_queue_depth(pdev);
1449 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1451 struct pci_dev *pdev;
1453 assert_spin_locked(&device_domain_lock);
1455 if (!dev_is_pci(info->dev))
1458 pdev = to_pci_dev(info->dev);
1460 if (info->ats_enabled) {
1461 pci_disable_ats(pdev);
1462 info->ats_enabled = 0;
1463 domain_update_iotlb(info->domain);
1465 #ifdef CONFIG_INTEL_IOMMU_SVM
1466 if (info->pri_enabled) {
1467 pci_disable_pri(pdev);
1468 info->pri_enabled = 0;
1470 if (info->pasid_enabled) {
1471 pci_disable_pasid(pdev);
1472 info->pasid_enabled = 0;
1477 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1478 u64 addr, unsigned mask)
1481 unsigned long flags;
1482 struct device_domain_info *info;
1484 if (!domain->has_iotlb_device)
1487 spin_lock_irqsave(&device_domain_lock, flags);
1488 list_for_each_entry(info, &domain->devices, link) {
1489 if (!info->ats_enabled)
1492 sid = info->bus << 8 | info->devfn;
1493 qdep = info->ats_qdep;
1494 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1497 spin_unlock_irqrestore(&device_domain_lock, flags);
1500 static void domain_flush_piotlb(struct intel_iommu *iommu,
1501 struct dmar_domain *domain,
1502 u64 addr, unsigned long npages, bool ih)
1504 u16 did = domain->iommu_did[iommu->seq_id];
1506 if (domain->default_pasid)
1507 qi_flush_piotlb(iommu, did, domain->default_pasid,
1510 if (!list_empty(&domain->devices))
1511 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1514 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1515 struct dmar_domain *domain,
1516 unsigned long pfn, unsigned int pages,
1519 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1520 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1521 u16 did = domain->iommu_did[iommu->seq_id];
1528 if (domain_use_first_level(domain)) {
1529 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1532 * Fallback to domain selective flush if no PSI support or
1533 * the size is too big. PSI requires page size to be 2 ^ x,
1534 * and the base address is naturally aligned to the size.
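 * For example, flushing 300 pages rounds up to 512 and yields mask == 9,
 * i.e. one invalidation covering 512 4KiB pages (2MiB) whose base address
 * must be 2MiB aligned.
 */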
1536 if (!cap_pgsel_inv(iommu->cap) ||
1537 mask > cap_max_amask_val(iommu->cap))
1538 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1541 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1546 * In caching mode, changes of pages from non-present to present require
1547 * flush. However, device IOTLB doesn't need to be flushed in this case.
1549 if (!cap_caching_mode(iommu->cap) || !map)
1550 iommu_flush_dev_iotlb(domain, addr, mask);
1553 /* Notification for newly created mappings */
1554 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1555 struct dmar_domain *domain,
1556 unsigned long pfn, unsigned int pages)
1559 * It's a non-present to present mapping. Only flush if caching mode and second level.
1562 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1563 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1565 iommu_flush_write_buffer(iommu);
1568 static void iommu_flush_iova(struct iova_domain *iovad)
1570 struct dmar_domain *domain;
1573 domain = container_of(iovad, struct dmar_domain, iovad);
1575 for_each_domain_iommu(idx, domain) {
1576 struct intel_iommu *iommu = g_iommus[idx];
1577 u16 did = domain->iommu_did[iommu->seq_id];
1579 if (domain_use_first_level(domain))
1580 domain_flush_piotlb(iommu, domain, 0, -1, 0);
1582 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1585 if (!cap_caching_mode(iommu->cap))
1586 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1587 0, MAX_AGAW_PFN_WIDTH);
1591 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1594 unsigned long flags;
1596 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1599 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1600 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1601 pmen &= ~DMA_PMEN_EPM;
1602 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1604 /* wait for the protected region status bit to clear */
1605 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1606 readl, !(pmen & DMA_PMEN_PRS), pmen);
1608 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1611 static void iommu_enable_translation(struct intel_iommu *iommu)
1614 unsigned long flags;
1616 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1617 iommu->gcmd |= DMA_GCMD_TE;
1618 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1620 /* Make sure hardware completes it */
1621 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1622 readl, (sts & DMA_GSTS_TES), sts);
1624 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1627 static void iommu_disable_translation(struct intel_iommu *iommu)
1632 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1633 iommu->gcmd &= ~DMA_GCMD_TE;
1634 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1636 /* Make sure hardware completes it */
1637 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1638 readl, (!(sts & DMA_GSTS_TES)), sts);
1640 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1643 static int iommu_init_domains(struct intel_iommu *iommu)
1645 u32 ndomains, nlongs;
1648 ndomains = cap_ndoms(iommu->cap);
1649 pr_debug("%s: Number of Domains supported <%d>\n",
1650 iommu->name, ndomains);
1651 nlongs = BITS_TO_LONGS(ndomains);
1653 spin_lock_init(&iommu->lock);
1655 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1656 if (!iommu->domain_ids) {
1657 pr_err("%s: Allocating domain id array failed\n",
1662 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1663 iommu->domains = kzalloc(size, GFP_KERNEL);
1665 if (iommu->domains) {
1666 size = 256 * sizeof(struct dmar_domain *);
1667 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1670 if (!iommu->domains || !iommu->domains[0]) {
1671 pr_err("%s: Allocating domain array failed\n",
1673 kfree(iommu->domain_ids);
1674 kfree(iommu->domains);
1675 iommu->domain_ids = NULL;
1676 iommu->domains = NULL;
1681 * If Caching mode is set, then invalid translations are tagged
1682 * with domain-id 0, hence we need to pre-allocate it. We also
1683 * use domain-id 0 as a marker for non-allocated domain-id, so
1684 * make sure it is not used for a real domain.
1686 set_bit(0, iommu->domain_ids);
1689 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1690 * entry for first-level or pass-through translation modes should
1691 * be programmed with a domain id different from those used for
1692 * second-level or nested translation. We reserve a domain id for
1695 if (sm_supported(iommu))
1696 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1701 static void disable_dmar_iommu(struct intel_iommu *iommu)
1703 struct device_domain_info *info, *tmp;
1704 unsigned long flags;
1706 if (!iommu->domains || !iommu->domain_ids)
1709 spin_lock_irqsave(&device_domain_lock, flags);
1710 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1711 if (info->iommu != iommu)
1714 if (!info->dev || !info->domain)
1717 __dmar_remove_one_dev_info(info);
1719 spin_unlock_irqrestore(&device_domain_lock, flags);
1721 if (iommu->gcmd & DMA_GCMD_TE)
1722 iommu_disable_translation(iommu);
1725 static void free_dmar_iommu(struct intel_iommu *iommu)
1727 if ((iommu->domains) && (iommu->domain_ids)) {
1728 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1731 for (i = 0; i < elems; i++)
1732 kfree(iommu->domains[i]);
1733 kfree(iommu->domains);
1734 kfree(iommu->domain_ids);
1735 iommu->domains = NULL;
1736 iommu->domain_ids = NULL;
1739 g_iommus[iommu->seq_id] = NULL;
1741 /* free context mapping */
1742 free_context_table(iommu);
1744 #ifdef CONFIG_INTEL_IOMMU_SVM
1745 if (pasid_supported(iommu)) {
1746 if (ecap_prs(iommu->ecap))
1747 intel_svm_finish_prq(iommu);
1749 if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
1750 ioasid_unregister_allocator(&iommu->pasid_allocator);
1756 * Check and return whether first level is used by default for DMA translation.
1759 static bool first_level_by_default(void)
1761 struct dmar_drhd_unit *drhd;
1762 struct intel_iommu *iommu;
1763 static int first_level_support = -1;
1765 if (likely(first_level_support != -1))
1766 return first_level_support;
1768 first_level_support = 1;
1771 for_each_active_iommu(iommu, drhd) {
1772 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1773 first_level_support = 0;
1779 return first_level_support;
1782 static struct dmar_domain *alloc_domain(int flags)
1784 struct dmar_domain *domain;
1786 domain = alloc_domain_mem();
1790 memset(domain, 0, sizeof(*domain));
1791 domain->nid = NUMA_NO_NODE;
1792 domain->flags = flags;
1793 if (first_level_by_default())
1794 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1795 domain->has_iotlb_device = false;
1796 INIT_LIST_HEAD(&domain->devices);
1801 /* Must be called with iommu->lock */
1802 static int domain_attach_iommu(struct dmar_domain *domain,
1803 struct intel_iommu *iommu)
1805 unsigned long ndomains;
1808 assert_spin_locked(&device_domain_lock);
1809 assert_spin_locked(&iommu->lock);
1811 domain->iommu_refcnt[iommu->seq_id] += 1;
1812 domain->iommu_count += 1;
1813 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1814 ndomains = cap_ndoms(iommu->cap);
1815 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1817 if (num >= ndomains) {
1818 pr_err("%s: No free domain ids\n", iommu->name);
1819 domain->iommu_refcnt[iommu->seq_id] -= 1;
1820 domain->iommu_count -= 1;
1824 set_bit(num, iommu->domain_ids);
1825 set_iommu_domain(iommu, num, domain);
1827 domain->iommu_did[iommu->seq_id] = num;
1828 domain->nid = iommu->node;
1830 domain_update_iommu_cap(domain);
1836 static int domain_detach_iommu(struct dmar_domain *domain,
1837 struct intel_iommu *iommu)
1841 assert_spin_locked(&device_domain_lock);
1842 assert_spin_locked(&iommu->lock);
1844 domain->iommu_refcnt[iommu->seq_id] -= 1;
1845 count = --domain->iommu_count;
1846 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1847 num = domain->iommu_did[iommu->seq_id];
1848 clear_bit(num, iommu->domain_ids);
1849 set_iommu_domain(iommu, num, NULL);
1851 domain_update_iommu_cap(domain);
1852 domain->iommu_did[iommu->seq_id] = 0;
1858 static struct iova_domain reserved_iova_list;
1859 static struct lock_class_key reserved_rbtree_key;
1861 static int dmar_init_reserved_ranges(void)
1863 struct pci_dev *pdev = NULL;
1867 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1869 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1870 &reserved_rbtree_key);
1872 /* IOAPIC ranges shouldn't be accessed by DMA */
1873 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1874 IOVA_PFN(IOAPIC_RANGE_END));
1876 pr_err("Reserve IOAPIC range failed\n");
1880 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1881 for_each_pci_dev(pdev) {
1884 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1885 r = &pdev->resource[i];
1886 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1888 iova = reserve_iova(&reserved_iova_list,
1892 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1900 static inline int guestwidth_to_adjustwidth(int gaw)
1903 int r = (gaw - 12) % 9;
1914 static void domain_exit(struct dmar_domain *domain)
1917 /* Remove associated devices and clear attached or cached domains */
1918 domain_remove_dev_info(domain);
1921 if (domain->domain.type == IOMMU_DOMAIN_DMA)
1922 put_iova_domain(&domain->iovad);
1925 struct page *freelist;
1927 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1928 dma_free_pagelist(freelist);
1931 free_domain_mem(domain);
1935 * Get the PASID directory size for scalable mode context entry.
1936 * Value of X in the PDTS field of a scalable mode context entry
1937 * indicates PASID directory with 2^(X + 7) entries.
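 * For example, assuming PASID_PDE_SHIFT is 6 (64 PASIDs per directory
 * entry), a 20-bit PASID space needs a 2^14-entry directory, encoded as
 * X == 7 since 2^(7 + 7) == 2^14.
 */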
1939 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1943 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1944 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1952 * Set the RID_PASID field of a scalable mode context entry. The
1953 * IOMMU hardware will use the PASID value set in this field for
1954 * DMA translations of DMA requests without PASID.
1957 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1959 context->hi |= pasid & ((1 << 20) - 1);
1963 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1966 static inline void context_set_sm_dte(struct context_entry *context)
1968 context->lo |= (1 << 2);
1972 * Set the PRE(Page Request Enable) field of a scalable mode context
1975 static inline void context_set_sm_pre(struct context_entry *context)
1977 context->lo |= (1 << 4);
1980 /* Convert value to context PASID directory size field coding. */
1981 #define context_pdts(pds) (((pds) & 0x7) << 9)
1983 static int domain_context_mapping_one(struct dmar_domain *domain,
1984 struct intel_iommu *iommu,
1985 struct pasid_table *table,
1988 u16 did = domain->iommu_did[iommu->seq_id];
1989 int translation = CONTEXT_TT_MULTI_LEVEL;
1990 struct device_domain_info *info = NULL;
1991 struct context_entry *context;
1992 unsigned long flags;
1997 if (hw_pass_through && domain_type_is_si(domain))
1998 translation = CONTEXT_TT_PASS_THROUGH;
2000 pr_debug("Set context mapping for %02x:%02x.%d\n",
2001 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2003 BUG_ON(!domain->pgd);
2005 spin_lock_irqsave(&device_domain_lock, flags);
2006 spin_lock(&iommu->lock);
2009 context = iommu_context_addr(iommu, bus, devfn, 1);
2014 if (context_present(context))
2018 * For kdump cases, old valid entries may be cached due to the
2019 * in-flight DMA and copied pgtable, but there is no unmapping
2020 * behaviour for them, thus we need an explicit cache flush for
2021 * the newly-mapped device. For kdump, at this point, the device
2022 * is supposed to finish reset at its driver probe stage, so no
2023 * in-flight DMA will exist, and we don't need to worry anymore
2026 if (context_copied(context)) {
2027 u16 did_old = context_domain_id(context);
2029 if (did_old < cap_ndoms(iommu->cap)) {
2030 iommu->flush.flush_context(iommu, did_old,
2031 (((u16)bus) << 8) | devfn,
2032 DMA_CCMD_MASK_NOBIT,
2033 DMA_CCMD_DEVICE_INVL);
2034 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2039 context_clear_entry(context);
2041 if (sm_supported(iommu)) {
2046 /* Setup the PASID DIR pointer: */
2047 pds = context_get_sm_pds(table);
2048 context->lo = (u64)virt_to_phys(table->table) |
2051 /* Setup the RID_PASID field: */
2052 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2055 * Setup the Device-TLB enable bit and Page request
2058 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2059 if (info && info->ats_supported)
2060 context_set_sm_dte(context);
2061 if (info && info->pri_supported)
2062 context_set_sm_pre(context);
2064 struct dma_pte *pgd = domain->pgd;
2067 context_set_domain_id(context, did);
2069 if (translation != CONTEXT_TT_PASS_THROUGH) {
2071 * Skip top levels of page tables for iommu which has
2072 * less agaw than default. Unnecessary for PT mode.
2074 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2076 pgd = phys_to_virt(dma_pte_addr(pgd));
2077 if (!dma_pte_present(pgd))
2081 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2082 if (info && info->ats_supported)
2083 translation = CONTEXT_TT_DEV_IOTLB;
2085 translation = CONTEXT_TT_MULTI_LEVEL;
2087 context_set_address_root(context, virt_to_phys(pgd));
2088 context_set_address_width(context, agaw);
2091 * In pass through mode, AW must be programmed to
2092 * indicate the largest AGAW value supported by
2093 * hardware. And ASR is ignored by hardware.
2095 context_set_address_width(context, iommu->msagaw);
2098 context_set_translation_type(context, translation);
2101 context_set_fault_enable(context);
2102 context_set_present(context);
2103 if (!ecap_coherent(iommu->ecap))
2104 clflush_cache_range(context, sizeof(*context));
2107 * It's a non-present to present mapping. If hardware doesn't cache
2108 * non-present entry we only need to flush the write-buffer. If it
2109 * _does_ cache non-present entries, then it does so in the special
2110 * domain #0, which we have to flush:
2112 if (cap_caching_mode(iommu->cap)) {
2113 iommu->flush.flush_context(iommu, 0,
2114 (((u16)bus) << 8) | devfn,
2115 DMA_CCMD_MASK_NOBIT,
2116 DMA_CCMD_DEVICE_INVL);
2117 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2119 iommu_flush_write_buffer(iommu);
2121 iommu_enable_dev_iotlb(info);
2126 spin_unlock(&iommu->lock);
2127 spin_unlock_irqrestore(&device_domain_lock, flags);
2132 struct domain_context_mapping_data {
2133 struct dmar_domain *domain;
2134 struct intel_iommu *iommu;
2135 struct pasid_table *table;
2138 static int domain_context_mapping_cb(struct pci_dev *pdev,
2139 u16 alias, void *opaque)
2141 struct domain_context_mapping_data *data = opaque;
2143 return domain_context_mapping_one(data->domain, data->iommu,
2144 data->table, PCI_BUS_NUM(alias),
2149 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2151 struct domain_context_mapping_data data;
2152 struct pasid_table *table;
2153 struct intel_iommu *iommu;
2156 iommu = device_to_iommu(dev, &bus, &devfn);
2160 table = intel_pasid_get_table(dev);
2162 if (!dev_is_pci(dev))
2163 return domain_context_mapping_one(domain, iommu, table,
2166 data.domain = domain;
2170 return pci_for_each_dma_alias(to_pci_dev(dev),
2171 &domain_context_mapping_cb, &data);
2174 static int domain_context_mapped_cb(struct pci_dev *pdev,
2175 u16 alias, void *opaque)
2177 struct intel_iommu *iommu = opaque;
2179 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2182 static int domain_context_mapped(struct device *dev)
2184 struct intel_iommu *iommu;
2187 iommu = device_to_iommu(dev, &bus, &devfn);
2191 if (!dev_is_pci(dev))
2192 return device_context_mapped(iommu, bus, devfn);
2194 return !pci_for_each_dma_alias(to_pci_dev(dev),
2195 domain_context_mapped_cb, iommu);
2198 /* Returns a number of VTD pages, but aligned to MM page size */
2199 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2202 host_addr &= ~PAGE_MASK;
2203 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
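/*
 * Worked example, assuming 4KiB pages: aligned_nrpages(0x1234, 0x2000)
 * keeps the sub-page offset 0x234, rounds 0x2234 up to 0x3000 and so
 * returns 3 VT-d pages.
 */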
2206 /* Return largest possible superpage level for a given mapping */
2207 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2208 unsigned long iov_pfn,
2209 unsigned long phy_pfn,
2210 unsigned long pages)
2212 int support, level = 1;
2213 unsigned long pfnmerge;
2215 support = domain->iommu_superpage;
2217 /* To use a large page, the virtual *and* physical addresses
2218 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2219 of them will mean we have to use smaller pages. So just
2220 merge them and check both at once. */
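/*
 * For example, if both pfns have their low nine bits clear (a 2MiB
 * boundary) and at least 512 pages are being mapped, the loop below
 * settles on level 2, i.e. a 2MiB superpage, provided the hardware
 * advertised that size via domain->iommu_superpage.
 */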
2221 pfnmerge = iov_pfn | phy_pfn;
2223 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2224 pages >>= VTD_STRIDE_SHIFT;
2227 pfnmerge >>= VTD_STRIDE_SHIFT;
2234 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2235 struct scatterlist *sg, unsigned long phys_pfn,
2236 unsigned long nr_pages, int prot)
2238 struct dma_pte *first_pte = NULL, *pte = NULL;
2239 phys_addr_t uninitialized_var(pteval);
2240 unsigned long sg_res = 0;
2241 unsigned int largepage_lvl = 0;
2242 unsigned long lvl_pages = 0;
2245 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2247 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2250 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2251 if (domain_use_first_level(domain))
2252 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2256 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2259 while (nr_pages > 0) {
2263 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2265 sg_res = aligned_nrpages(sg->offset, sg->length);
2266 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2267 sg->dma_length = sg->length;
2268 pteval = (sg_phys(sg) - pgoff) | attr;
2269 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2273 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2275 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2278 /* It is a large page */
2279 if (largepage_lvl > 1) {
2280 unsigned long nr_superpages, end_pfn;
2282 pteval |= DMA_PTE_LARGE_PAGE;
2283 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2285 nr_superpages = sg_res / lvl_pages;
2286 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2289 * Ensure that old small page tables are
2290 * removed to make room for superpage(s).
2291 * We're adding new large pages, so make sure
2292 * we don't remove their parent tables.
2294 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2297 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2301 /* We don't need lock here, nobody else
2302 * touches the iova range
2304 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2306 static int dumps = 5;
2307 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2308 iov_pfn, tmp, (unsigned long long)pteval);
2311 debug_dma_dump_mappings(NULL);
2316 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2318 BUG_ON(nr_pages < lvl_pages);
2319 BUG_ON(sg_res < lvl_pages);
2321 nr_pages -= lvl_pages;
2322 iov_pfn += lvl_pages;
2323 phys_pfn += lvl_pages;
2324 pteval += lvl_pages * VTD_PAGE_SIZE;
2325 sg_res -= lvl_pages;
2327 /* If the next PTE would be the first in a new page, then we
2328 need to flush the cache on the entries we've just written.
2329 And then we'll need to recalculate 'pte', so clear it and
2330 let it get set again in the if (!pte) block above.
2332 If we're done (!nr_pages) we need to flush the cache too.
2334 Also if we've been setting superpages, we may need to
2335 recalculate 'pte' and switch back to smaller pages for the
2336 end of the mapping, if the trailing size is not enough to
2337 use another superpage (i.e. sg_res < lvl_pages). */
2339 if (!nr_pages || first_pte_in_page(pte) ||
2340 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2341 domain_flush_cache(domain, first_pte,
2342 (void *)pte - (void *)first_pte);
2346 if (!sg_res && nr_pages)
2352 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2353 struct scatterlist *sg, unsigned long phys_pfn,
2354 unsigned long nr_pages, int prot)
2357 struct intel_iommu *iommu;
2359 /* Do the real mapping first */
2360 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2364 for_each_domain_iommu(iommu_id, domain) {
2365 iommu = g_iommus[iommu_id];
2366 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2372 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2373 struct scatterlist *sg, unsigned long nr_pages,
2376 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2379 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2380 unsigned long phys_pfn, unsigned long nr_pages,
2383 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2386 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2388 unsigned long flags;
2389 struct context_entry *context;
2395 spin_lock_irqsave(&iommu->lock, flags);
2396 context = iommu_context_addr(iommu, bus, devfn, 0);
2398 spin_unlock_irqrestore(&iommu->lock, flags);
2401 did_old = context_domain_id(context);
2402 context_clear_entry(context);
2403 __iommu_flush_cache(iommu, context, sizeof(*context));
2404 spin_unlock_irqrestore(&iommu->lock, flags);
2405 iommu->flush.flush_context(iommu,
2407 (((u16)bus) << 8) | devfn,
2408 DMA_CCMD_MASK_NOBIT,
2409 DMA_CCMD_DEVICE_INVL);
2410 iommu->flush.flush_iotlb(iommu,
2417 static inline void unlink_domain_info(struct device_domain_info *info)
2419 assert_spin_locked(&device_domain_lock);
2420 list_del(&info->link);
2421 list_del(&info->global);
2423 info->dev->archdata.iommu = NULL;
2426 static void domain_remove_dev_info(struct dmar_domain *domain)
2428 struct device_domain_info *info, *tmp;
2429 unsigned long flags;
2431 spin_lock_irqsave(&device_domain_lock, flags);
2432 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2433 __dmar_remove_one_dev_info(info);
2434 spin_unlock_irqrestore(&device_domain_lock, flags);
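/*
 * Look up the dmar_domain a device is currently attached to; devices
 * with deferred attachment or without an IOMMU are not resolved here.
 */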
2437 struct dmar_domain *find_domain(struct device *dev)
2439 struct device_domain_info *info;
2441 if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
2444 /* No lock here, assumes no domain exit in normal case */
2445 info = get_domain_info(dev);
2447 return info->domain;
2452 static void do_deferred_attach(struct device *dev)
2454 struct iommu_domain *domain;
2456 dev->archdata.iommu = NULL;
2457 domain = iommu_get_domain_for_dev(dev);
2459 intel_iommu_attach_device(domain, dev);
2462 static inline struct device_domain_info *
2463 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2465 struct device_domain_info *info;
2467 list_for_each_entry(info, &device_domain_list, global)
2468 if (info->segment == segment && info->bus == bus &&
2469 info->devfn == devfn)
2475 static int domain_setup_first_level(struct intel_iommu *iommu,
2476 struct dmar_domain *domain,
2480 int flags = PASID_FLAG_SUPERVISOR_MODE;
2481 struct dma_pte *pgd = domain->pgd;
2485 * Skip the top levels of the page tables for an IOMMU whose
2486 * AGAW is smaller than the default. Unnecessary for PT mode.
2488 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2489 pgd = phys_to_virt(dma_pte_addr(pgd));
2490 if (!dma_pte_present(pgd))
2494 level = agaw_to_level(agaw);
2495 if (level != 4 && level != 5)
2498 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2500 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2501 domain->iommu_did[iommu->seq_id],
2505 static bool dev_is_real_dma_subdevice(struct device *dev)
2507 return dev && dev_is_pci(dev) &&
2508 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
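/*
 * Allocate a device_domain_info for @dev and bind the device to
 * @domain on @iommu: record segment/bus/devfn, probe ATS/PASID/PRI
 * support, attach the domain to the IOMMU, set up the RID2PASID entry
 * in scalable mode, and map the context entry. If another thread
 * attached the device first, the already-attached domain is returned
 * instead and the caller must free the domain it passed in.
 */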
2511 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2514 struct dmar_domain *domain)
2516 struct dmar_domain *found = NULL;
2517 struct device_domain_info *info;
2518 unsigned long flags;
2521 info = alloc_devinfo_mem();
2525 if (!dev_is_real_dma_subdevice(dev)) {
2527 info->devfn = devfn;
2528 info->segment = iommu->segment;
2530 struct pci_dev *pdev = to_pci_dev(dev);
2532 info->bus = pdev->bus->number;
2533 info->devfn = pdev->devfn;
2534 info->segment = pci_domain_nr(pdev->bus);
2537 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2538 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2541 info->domain = domain;
2542 info->iommu = iommu;
2543 info->pasid_table = NULL;
2544 info->auxd_enabled = 0;
2545 INIT_LIST_HEAD(&info->auxiliary_domains);
2547 if (dev && dev_is_pci(dev)) {
2548 struct pci_dev *pdev = to_pci_dev(info->dev);
2550 if (ecap_dev_iotlb_support(iommu->ecap) &&
2551 pci_ats_supported(pdev) &&
2552 dmar_find_matched_atsr_unit(pdev))
2553 info->ats_supported = 1;
2555 if (sm_supported(iommu)) {
2556 if (pasid_supported(iommu)) {
2557 int features = pci_pasid_features(pdev);
2559 info->pasid_supported = features | 1;
2562 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2563 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2564 info->pri_supported = 1;
2568 spin_lock_irqsave(&device_domain_lock, flags);
2570 found = find_domain(dev);
2573 struct device_domain_info *info2;
2574 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2577 found = info2->domain;
2583 spin_unlock_irqrestore(&device_domain_lock, flags);
2584 free_devinfo_mem(info);
2585 /* Caller must free the original domain */
2589 spin_lock(&iommu->lock);
2590 ret = domain_attach_iommu(domain, iommu);
2591 spin_unlock(&iommu->lock);
2594 spin_unlock_irqrestore(&device_domain_lock, flags);
2595 free_devinfo_mem(info);
2599 list_add(&info->link, &domain->devices);
2600 list_add(&info->global, &device_domain_list);
2602 dev->archdata.iommu = info;
2603 spin_unlock_irqrestore(&device_domain_lock, flags);
2605 /* PASID table is mandatory for a PCI device in scalable mode. */
2606 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2607 ret = intel_pasid_alloc_table(dev);
2609 dev_err(dev, "PASID table allocation failed\n");
2610 dmar_remove_one_dev_info(dev);
2614 /* Set up the PASID entry for requests without PASID: */
2615 spin_lock(&iommu->lock);
2616 if (hw_pass_through && domain_type_is_si(domain))
2617 ret = intel_pasid_setup_pass_through(iommu, domain,
2618 dev, PASID_RID2PASID);
2619 else if (domain_use_first_level(domain))
2620 ret = domain_setup_first_level(iommu, domain, dev,
2623 ret = intel_pasid_setup_second_level(iommu, domain,
2624 dev, PASID_RID2PASID);
2625 spin_unlock(&iommu->lock);
2627 dev_err(dev, "Setup RID2PASID failed\n");
2628 dmar_remove_one_dev_info(dev);
2633 if (dev && domain_context_mapping(domain, dev)) {
2634 dev_err(dev, "Domain context map failed\n");
2635 dmar_remove_one_dev_info(dev);
2642 static int iommu_domain_identity_map(struct dmar_domain *domain,
2643 unsigned long first_vpfn,
2644 unsigned long last_vpfn)
2647 * The RMRR range might overlap a physical memory range,
2650 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2652 return __domain_mapping(domain, first_vpfn, NULL,
2653 first_vpfn, last_vpfn - first_vpfn + 1,
2654 DMA_PTE_READ|DMA_PTE_WRITE);
2657 static int md_domain_init(struct dmar_domain *domain, int guest_width);
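/*
 * Build the static identity domain: identity-map the usable memory of
 * every online node plus all RMRR regions, so devices attached to
 * si_domain can access them 1:1.
 */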
2659 static int __init si_domain_init(int hw)
2661 struct dmar_rmrr_unit *rmrr;
2665 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2669 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2670 domain_exit(si_domain);
2677 for_each_online_node(nid) {
2678 unsigned long start_pfn, end_pfn;
2681 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2682 ret = iommu_domain_identity_map(si_domain,
2683 mm_to_dma_pfn(start_pfn),
2684 mm_to_dma_pfn(end_pfn));
2691 * Identity map the RMRRs so that devices with RMRRs could also use
2694 for_each_rmrr_units(rmrr) {
2695 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2697 unsigned long long start = rmrr->base_address;
2698 unsigned long long end = rmrr->end_address;
2700 if (WARN_ON(end < start ||
2701 end >> agaw_to_width(si_domain->agaw)))
2704 ret = iommu_domain_identity_map(si_domain,
2705 mm_to_dma_pfn(start >> PAGE_SHIFT),
2706 mm_to_dma_pfn(end >> PAGE_SHIFT));
2715 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2717 struct dmar_domain *ndomain;
2718 struct intel_iommu *iommu;
2721 iommu = device_to_iommu(dev, &bus, &devfn);
2725 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2726 if (ndomain != domain)
2732 static bool device_has_rmrr(struct device *dev)
2734 struct dmar_rmrr_unit *rmrr;
2739 for_each_rmrr_units(rmrr) {
2741 * Return TRUE if this RMRR contains the device that
2744 for_each_active_dev_scope(rmrr->devices,
2745 rmrr->devices_cnt, i, tmp)
2747 is_downstream_to_pci_bridge(dev, tmp)) {
2757 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2758 * is relaxable (i.e., is allowed to go unenforced under some conditions)
2759 * @dev: device handle
2761 * We assume that PCI USB devices with RMRRs have them largely
2762 * for historical reasons and that the RMRR space is not actively used post
2763 * boot. This exclusion may change if vendors begin to abuse it.
2765 * The same exception is made for graphics devices, with the requirement that
2766 * any use of the RMRR regions will be torn down before assigning the device
2769 * Return: true if the RMRR is relaxable, false otherwise
2771 static bool device_rmrr_is_relaxable(struct device *dev)
2773 struct pci_dev *pdev;
2775 if (!dev_is_pci(dev))
2778 pdev = to_pci_dev(dev);
2779 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2786 * There are a couple cases where we need to restrict the functionality of
2787 * devices associated with RMRRs. The first is when evaluating a device for
2788 * identity mapping because problems exist when devices are moved in and out
2789 * of domains and their respective RMRR information is lost. This means that
2790 * a device with associated RMRRs will never be in a "passthrough" domain.
2791 * The second is use of the device through the IOMMU API. This interface
2792 * expects to have full control of the IOVA space for the device. We cannot
2793 * satisfy both the requirement that RMRR access is maintained and have an
2794 * unencumbered IOVA space. We also have no ability to quiesce the device's
2795 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2796 * We therefore prevent devices associated with an RMRR from participating in
2797 * the IOMMU API, which eliminates them from device assignment.
2799 * In both cases, devices which have relaxable RMRRs are not concerned by this
2800 * restriction. See device_rmrr_is_relaxable comment.
2802 static bool device_is_rmrr_locked(struct device *dev)
2804 if (!device_has_rmrr(dev))
2807 if (device_rmrr_is_relaxable(dev))
2814 * Return the required default domain type for a specific device.
2816 * @dev: the device in question
2817 * @startup: true if this is during early boot
2820 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2821 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2822 * - 0: both identity and dynamic domains work for this device
2824 static int device_def_domain_type(struct device *dev)
2826 if (dev_is_pci(dev)) {
2827 struct pci_dev *pdev = to_pci_dev(dev);
2830 * Prevent any device marked as untrusted from getting
2831 * placed into the static identity mapping domain.
2833 if (pdev->untrusted)
2834 return IOMMU_DOMAIN_DMA;
2836 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2837 return IOMMU_DOMAIN_IDENTITY;
2839 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2840 return IOMMU_DOMAIN_IDENTITY;
2846 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2849 * Start from a sane IOMMU hardware state.
2850 * If queued invalidation has already been initialized by us
2851 * (for example, while enabling interrupt remapping), then
2852 * things are already rolling from a sane state.
2856 * Clear any previous faults.
2858 dmar_fault(-1, iommu);
2860 * Disable queued invalidation if supported and already enabled
2861 * before OS handover.
2863 dmar_disable_qi(iommu);
2866 if (dmar_enable_qi(iommu)) {
2868 * Queued invalidation is not enabled, use register-based invalidation
2870 iommu->flush.flush_context = __iommu_flush_context;
2871 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2872 pr_info("%s: Using Register based invalidation\n",
2875 iommu->flush.flush_context = qi_flush_context;
2876 iommu->flush.flush_iotlb = qi_flush_iotlb;
2877 pr_info("%s: Using Queued invalidation\n", iommu->name);
2881 static int copy_context_table(struct intel_iommu *iommu,
2882 struct root_entry *old_re,
2883 struct context_entry **tbl,
2886 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2887 struct context_entry *new_ce = NULL, ce;
2888 struct context_entry *old_ce = NULL;
2889 struct root_entry re;
2890 phys_addr_t old_ce_phys;
2892 tbl_idx = ext ? bus * 2 : bus;
2893 memcpy(&re, old_re, sizeof(re));
2895 for (devfn = 0; devfn < 256; devfn++) {
2896 /* First calculate the correct index */
2897 idx = (ext ? devfn * 2 : devfn) % 256;
2900 /* First save what we may have and clean up */
2902 tbl[tbl_idx] = new_ce;
2903 __iommu_flush_cache(iommu, new_ce,
2913 old_ce_phys = root_entry_lctp(&re);
2915 old_ce_phys = root_entry_uctp(&re);
2918 if (ext && devfn == 0) {
2919 /* No LCTP, try UCTP */
2928 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2933 new_ce = alloc_pgtable_page(iommu->node);
2940 /* Now copy the context entry */
2941 memcpy(&ce, old_ce + idx, sizeof(ce));
2943 if (!__context_present(&ce))
2946 did = context_domain_id(&ce);
2947 if (did >= 0 && did < cap_ndoms(iommu->cap))
2948 set_bit(did, iommu->domain_ids);
2951 * We need a marker for copied context entries. This
2952 * marker needs to work for the old format as well as
2953 * for extended context entries.
2955 * Bit 67 of the context entry is used. In the old
2956 * format this bit is available to software, in the
2957 * extended format it is the PGE bit, but PGE is ignored
2958 * by HW if PASIDs are disabled (and thus still
2961 * So disable PASIDs first and then mark the entry
2962 * copied. This means that we don't copy PASID
2963 * translations from the old kernel, but this is fine as
2964 * faults there are not fatal.
2966 context_clear_pasid_enable(&ce);
2967 context_set_copied(&ce);
2972 tbl[tbl_idx + pos] = new_ce;
2974 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
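/*
 * Copy the translation tables left by the previous kernel (kdump):
 * read the old root table address from DMAR_RTADDR_REG, copy the
 * per-bus context tables, and install the copies into our own
 * root_entry table. Nothing is copied if the old tables use a
 * different root table format than we would program.
 */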
2983 static int copy_translation_tables(struct intel_iommu *iommu)
2985 struct context_entry **ctxt_tbls;
2986 struct root_entry *old_rt;
2987 phys_addr_t old_rt_phys;
2988 int ctxt_table_entries;
2989 unsigned long flags;
2994 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2995 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
2996 new_ext = !!ecap_ecs(iommu->ecap);
2999 * The RTT bit can only be changed when translation is disabled,
3000 * but disabling translation would open a window for data
3001 * corruption. So bail out and don't copy anything if we would
3002 * have to change the bit.
3007 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3011 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3015 /* This is too big for the stack - allocate it from slab */
3016 ctxt_table_entries = ext ? 512 : 256;
3018 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3022 for (bus = 0; bus < 256; bus++) {
3023 ret = copy_context_table(iommu, &old_rt[bus],
3024 ctxt_tbls, bus, ext);
3026 pr_err("%s: Failed to copy context table for bus %d\n",
3032 spin_lock_irqsave(&iommu->lock, flags);
3034 /* Context tables are copied, now write them to the root_entry table */
3035 for (bus = 0; bus < 256; bus++) {
3036 int idx = ext ? bus * 2 : bus;
3039 if (ctxt_tbls[idx]) {
3040 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3041 iommu->root_entry[bus].lo = val;
3044 if (!ext || !ctxt_tbls[idx + 1])
3047 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3048 iommu->root_entry[bus].hi = val;
3051 spin_unlock_irqrestore(&iommu->lock, flags);
3055 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3065 #ifdef CONFIG_INTEL_IOMMU_SVM
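/*
 * Custom IOASID (PASID) allocation callback: when running behind a
 * caching-mode vIOMMU, PASIDs must come from the VT-d virtual command
 * interface rather than a local allocator.
 */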
3066 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3068 struct intel_iommu *iommu = data;
3072 return INVALID_IOASID;
3074 * The VT-d virtual command interface always uses the full 20-bit
3075 * PASID range. The host can partition the guest PASID range based on
3076 * policies, but this is out of the guest's control.
3078 if (min < PASID_MIN || max > intel_pasid_max_id)
3079 return INVALID_IOASID;
3081 if (vcmd_alloc_pasid(iommu, &ioasid))
3082 return INVALID_IOASID;
3087 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3089 struct intel_iommu *iommu = data;
3094 * The sanity check of the IOASID owner is done at the upper layer, e.g. VFIO.
3095 * We can only free the PASID when all the devices are unbound.
3097 if (ioasid_find(NULL, ioasid, NULL)) {
3098 pr_alert("Cannot free active IOASID %d\n", ioasid);
3101 vcmd_free_pasid(iommu, ioasid);
3104 static void register_pasid_allocator(struct intel_iommu *iommu)
3107 * If we are running in the host, there is no need for a custom
3108 * allocator, since PASIDs are allocated host-wide by the system.
3110 if (!cap_caching_mode(iommu->cap))
3113 if (!sm_supported(iommu)) {
3114 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3119 * Register a custom PASID allocator if we are running in a guest;
3120 * guest PASIDs must be obtained via the virtual command interface.
3121 * There can be multiple vIOMMUs in each guest but only one allocator
3122 * is active. All vIOMMU allocators will eventually be calling the same
3125 if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3128 pr_info("Register custom PASID allocator\n");
3129 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3130 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3131 iommu->pasid_allocator.pdata = (void *)iommu;
3132 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3133 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3135 * Disable scalable mode on this IOMMU if there
3136 * is no custom allocator. Mixing SM-capable and
3137 * non-SM vIOMMUs is not supported.
3144 static int __init init_dmars(void)
3146 struct dmar_drhd_unit *drhd;
3147 struct intel_iommu *iommu;
3153 * initialize and program root entry to not present
3156 for_each_drhd_unit(drhd) {
3158 * Lock not needed, as this is only incremented in the single-
3159 * threaded kernel __init code path; all other accesses are read
3162 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3166 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3169 /* Preallocate enough resources for IOMMU hot-addition */
3170 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3171 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3173 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3176 pr_err("Allocating global iommu array failed\n");
3181 for_each_iommu(iommu, drhd) {
3182 if (drhd->ignored) {
3183 iommu_disable_translation(iommu);
3188 * Find the max PASID size of all IOMMUs in the system.
3189 * We need to ensure the system PASID table is no bigger
3190 * than the smallest supported size.
3192 if (pasid_supported(iommu)) {
3193 u32 temp = 2 << ecap_pss(iommu->ecap);
3195 intel_pasid_max_id = min_t(u32, temp,
3196 intel_pasid_max_id);
3199 g_iommus[iommu->seq_id] = iommu;
3201 intel_iommu_init_qi(iommu);
3203 ret = iommu_init_domains(iommu);
3207 init_translation_status(iommu);
3209 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3210 iommu_disable_translation(iommu);
3211 clear_translation_pre_enabled(iommu);
3212 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3218 * we could share the same root & context tables
3219 * among all IOMMUs. Needs to be split later.
3221 ret = iommu_alloc_root_entry(iommu);
3225 if (translation_pre_enabled(iommu)) {
3226 pr_info("Translation already enabled - trying to copy translation structures\n");
3228 ret = copy_translation_tables(iommu);
3231 * We found the IOMMU with translation
3232 * enabled - but failed to copy over the
3233 * old root-entry table. Try to proceed
3234 * by disabling translation now and
3235 * allocating a clean root-entry table.
3236 * This might cause DMAR faults, but
3237 * probably the dump will still succeed.
3239 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3241 iommu_disable_translation(iommu);
3242 clear_translation_pre_enabled(iommu);
3244 pr_info("Copied translation tables from previous kernel for %s\n",
3249 if (!ecap_pass_through(iommu->ecap))
3250 hw_pass_through = 0;
3251 intel_svm_check(iommu);
3255 * Now that qi is enabled on all iommus, set the root entry and flush
3256 * caches. This is required on some Intel X58 chipsets, otherwise the
3257 * flush_context function will loop forever and the boot hangs.
3259 for_each_active_iommu(iommu, drhd) {
3260 iommu_flush_write_buffer(iommu);
3261 #ifdef CONFIG_INTEL_IOMMU_SVM
3262 register_pasid_allocator(iommu);
3264 iommu_set_root_entry(iommu);
3265 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3266 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3269 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3274 iommu_identity_mapping |= IDENTMAP_GFX;
3276 check_tylersburg_isoch();
3278 ret = si_domain_init(hw_pass_through);
3285 * global invalidate context cache
3286 * global invalidate iotlb
3287 * enable translation
3289 for_each_iommu(iommu, drhd) {
3290 if (drhd->ignored) {
3292 * we always have to disable PMRs or DMA may fail on
3296 iommu_disable_protect_mem_regions(iommu);
3300 iommu_flush_write_buffer(iommu);
3302 #ifdef CONFIG_INTEL_IOMMU_SVM
3303 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3305 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3306 * could cause a lock race condition.
3308 up_write(&dmar_global_lock);
3309 ret = intel_svm_enable_prq(iommu);
3310 down_write(&dmar_global_lock);
3315 ret = dmar_set_interrupt(iommu);
3323 for_each_active_iommu(iommu, drhd) {
3324 disable_dmar_iommu(iommu);
3325 free_dmar_iommu(iommu);
3334 /* This takes a number of _MM_ pages, not VTD pages */
3335 static unsigned long intel_alloc_iova(struct device *dev,
3336 struct dmar_domain *domain,
3337 unsigned long nrpages, uint64_t dma_mask)
3339 unsigned long iova_pfn;
3342 * Restrict dma_mask to the width that the iommu can handle.
3343 * First-level translation restricts the input-address to a
3344 * canonical address (i.e., address bits 63:N have the same
3345 * value as address bit [N-1], where N is 48-bits with 4-level
3346 * paging and 57-bits with 5-level paging). Hence, skip bit
3349 if (domain_use_first_level(domain))
3350 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3353 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3356 /* Ensure we reserve the whole size-aligned region */
3357 nrpages = __roundup_pow_of_two(nrpages);
3359 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3361 * First try to allocate an io virtual address in
3362 * DMA_BIT_MASK(32) and if that fails then try allocating
3365 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3366 IOVA_PFN(DMA_BIT_MASK(32)), false);
3370 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3371 IOVA_PFN(dma_mask), true);
3372 if (unlikely(!iova_pfn)) {
3373 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3381 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3382 size_t size, int dir, u64 dma_mask)
3384 struct dmar_domain *domain;
3385 phys_addr_t start_paddr;
3386 unsigned long iova_pfn;
3389 struct intel_iommu *iommu;
3390 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3392 BUG_ON(dir == DMA_NONE);
3394 if (unlikely(attach_deferred(dev)))
3395 do_deferred_attach(dev);
3397 domain = find_domain(dev);
3399 return DMA_MAPPING_ERROR;
3401 iommu = domain_get_iommu(domain);
3402 size = aligned_nrpages(paddr, size);
3404 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3409 * Check if DMAR supports zero-length reads on write only
3412 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3413 !cap_zlr(iommu->cap))
3414 prot |= DMA_PTE_READ;
3415 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3416 prot |= DMA_PTE_WRITE;
3418 * The range paddr .. paddr + size might cover a partial page, so map
3419 * the whole page. Note: if two parts of one page are separately mapped,
3420 * we might have two guest_addr mappings to the same host paddr, but this
3421 * is not a big problem
3423 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3424 mm_to_dma_pfn(paddr_pfn), size, prot);
3428 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3429 start_paddr += paddr & ~PAGE_MASK;
3431 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3437 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3438 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3439 size, (unsigned long long)paddr, dir);
3440 return DMA_MAPPING_ERROR;
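/*
 * ->map_page / ->map_resource callbacks: thin wrappers that feed a
 * physical address into __intel_map_single() with the device's DMA mask.
 */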
3443 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3444 unsigned long offset, size_t size,
3445 enum dma_data_direction dir,
3446 unsigned long attrs)
3448 return __intel_map_single(dev, page_to_phys(page) + offset,
3449 size, dir, *dev->dma_mask);
3452 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3453 size_t size, enum dma_data_direction dir,
3454 unsigned long attrs)
3456 return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
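/*
 * Common unmap path: clear the page tables for the IOVA range, then
 * either flush the IOTLB right away (strict mode, untrusted devices,
 * or no flush queue) or defer the flush by queueing the IOVA range.
 */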
3459 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3461 struct dmar_domain *domain;
3462 unsigned long start_pfn, last_pfn;
3463 unsigned long nrpages;
3464 unsigned long iova_pfn;
3465 struct intel_iommu *iommu;
3466 struct page *freelist;
3467 struct pci_dev *pdev = NULL;
3469 domain = find_domain(dev);
3472 iommu = domain_get_iommu(domain);
3474 iova_pfn = IOVA_PFN(dev_addr);
3476 nrpages = aligned_nrpages(dev_addr, size);
3477 start_pfn = mm_to_dma_pfn(iova_pfn);
3478 last_pfn = start_pfn + nrpages - 1;
3480 if (dev_is_pci(dev))
3481 pdev = to_pci_dev(dev);
3483 freelist = domain_unmap(domain, start_pfn, last_pfn);
3484 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3485 !has_iova_flush_queue(&domain->iovad)) {
3486 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3487 nrpages, !freelist, 0);
3489 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3490 dma_free_pagelist(freelist);
3492 queue_iova(&domain->iovad, iova_pfn, nrpages,
3493 (unsigned long)freelist);
3495 * queue up the release of the unmap to save the 1/6th of the
3496 * cpu used up by the iotlb flush operation...
3500 trace_unmap_single(dev, dev_addr, size);
3503 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3504 size_t size, enum dma_data_direction dir,
3505 unsigned long attrs)
3507 intel_unmap(dev, dev_addr, size);
3510 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3511 size_t size, enum dma_data_direction dir, unsigned long attrs)
3513 intel_unmap(dev, dev_addr, size);
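/*
 * Coherent allocation backend: allocate zeroed pages (from CMA when
 * the caller may block) and map them for the device through the IOMMU.
 */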
3516 static void *intel_alloc_coherent(struct device *dev, size_t size,
3517 dma_addr_t *dma_handle, gfp_t flags,
3518 unsigned long attrs)
3520 struct page *page = NULL;
3523 if (unlikely(attach_deferred(dev)))
3524 do_deferred_attach(dev);
3526 size = PAGE_ALIGN(size);
3527 order = get_order(size);
3529 if (gfpflags_allow_blocking(flags)) {
3530 unsigned int count = size >> PAGE_SHIFT;
3532 page = dma_alloc_from_contiguous(dev, count, order,
3533 flags & __GFP_NOWARN);
3537 page = alloc_pages(flags, order);
3540 memset(page_address(page), 0, size);
3542 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3544 dev->coherent_dma_mask);
3545 if (*dma_handle != DMA_MAPPING_ERROR)
3546 return page_address(page);
3547 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3548 __free_pages(page, order);
3553 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3554 dma_addr_t dma_handle, unsigned long attrs)
3557 struct page *page = virt_to_page(vaddr);
3559 size = PAGE_ALIGN(size);
3560 order = get_order(size);
3562 intel_unmap(dev, dma_handle, size);
3563 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3564 __free_pages(page, order);
3567 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3568 int nelems, enum dma_data_direction dir,
3569 unsigned long attrs)
3571 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3572 unsigned long nrpages = 0;
3573 struct scatterlist *sg;
3576 for_each_sg(sglist, sg, nelems, i) {
3577 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3580 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3582 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
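/*
 * Map a scatterlist: allocate one IOVA range large enough for every
 * segment, map them contiguously in IOVA space, and undo the page
 * tables and IOVA allocation if the mapping fails.
 */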
3585 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3586 enum dma_data_direction dir, unsigned long attrs)
3589 struct dmar_domain *domain;
3592 unsigned long iova_pfn;
3594 struct scatterlist *sg;
3595 unsigned long start_vpfn;
3596 struct intel_iommu *iommu;
3598 BUG_ON(dir == DMA_NONE);
3600 if (unlikely(attach_deferred(dev)))
3601 do_deferred_attach(dev);
3603 domain = find_domain(dev);
3607 iommu = domain_get_iommu(domain);
3609 for_each_sg(sglist, sg, nelems, i)
3610 size += aligned_nrpages(sg->offset, sg->length);
3612 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3615 sglist->dma_length = 0;
3620 * Check if DMAR supports zero-length reads on write only
3623 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3624 !cap_zlr(iommu->cap))
3625 prot |= DMA_PTE_READ;
3626 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3627 prot |= DMA_PTE_WRITE;
3629 start_vpfn = mm_to_dma_pfn(iova_pfn);
3631 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3632 if (unlikely(ret)) {
3633 dma_pte_free_pagetable(domain, start_vpfn,
3634 start_vpfn + size - 1,
3635 agaw_to_level(domain->agaw) + 1);
3636 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3640 for_each_sg(sglist, sg, nelems, i)
3641 trace_map_sg(dev, i + 1, nelems, sg);
3646 static u64 intel_get_required_mask(struct device *dev)
3648 return DMA_BIT_MASK(32);
3651 static const struct dma_map_ops intel_dma_ops = {
3652 .alloc = intel_alloc_coherent,
3653 .free = intel_free_coherent,
3654 .map_sg = intel_map_sg,
3655 .unmap_sg = intel_unmap_sg,
3656 .map_page = intel_map_page,
3657 .unmap_page = intel_unmap_page,
3658 .map_resource = intel_map_resource,
3659 .unmap_resource = intel_unmap_resource,
3660 .dma_supported = dma_direct_supported,
3661 .mmap = dma_common_mmap,
3662 .get_sgtable = dma_common_get_sgtable,
3663 .get_required_mask = intel_get_required_mask,
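/*
 * Bounce-buffer DMA helpers: when a buffer is not page aligned it is
 * redirected through swiotlb, so the IOMMU mapping only ever exposes
 * whole, page-aligned pages to the device.
 */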
3667 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3668 enum dma_data_direction dir, enum dma_sync_target target)
3670 struct dmar_domain *domain;
3671 phys_addr_t tlb_addr;
3673 domain = find_domain(dev);
3674 if (WARN_ON(!domain))
3677 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3678 if (is_swiotlb_buffer(tlb_addr))
3679 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3683 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3684 enum dma_data_direction dir, unsigned long attrs,
3687 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3688 struct dmar_domain *domain;
3689 struct intel_iommu *iommu;
3690 unsigned long iova_pfn;
3691 unsigned long nrpages;
3692 phys_addr_t tlb_addr;
3696 if (unlikely(attach_deferred(dev)))
3697 do_deferred_attach(dev);
3699 domain = find_domain(dev);
3701 if (WARN_ON(dir == DMA_NONE || !domain))
3702 return DMA_MAPPING_ERROR;
3704 iommu = domain_get_iommu(domain);
3705 if (WARN_ON(!iommu))
3706 return DMA_MAPPING_ERROR;
3708 nrpages = aligned_nrpages(0, size);
3709 iova_pfn = intel_alloc_iova(dev, domain,
3710 dma_to_mm_pfn(nrpages), dma_mask);
3712 return DMA_MAPPING_ERROR;
3715 * Check if DMAR supports zero-length reads on write only
3718 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3719 !cap_zlr(iommu->cap))
3720 prot |= DMA_PTE_READ;
3721 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3722 prot |= DMA_PTE_WRITE;
3725 * If both the physical buffer start address and size are
3726 * page aligned, we don't need to use a bounce page.
3728 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3729 tlb_addr = swiotlb_tbl_map_single(dev,
3730 __phys_to_dma(dev, io_tlb_start),
3731 paddr, size, aligned_size, dir, attrs);
3732 if (tlb_addr == DMA_MAPPING_ERROR) {
3735 /* Clean up the padding area. */
3736 void *padding_start = phys_to_virt(tlb_addr);
3737 size_t padding_size = aligned_size;
3739 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3740 (dir == DMA_TO_DEVICE ||
3741 dir == DMA_BIDIRECTIONAL)) {
3742 padding_start += size;
3743 padding_size -= size;
3746 memset(padding_start, 0, padding_size);
3752 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3753 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3757 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3759 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3762 if (is_swiotlb_buffer(tlb_addr))
3763 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3764 aligned_size, dir, attrs);
3766 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3767 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3768 size, (unsigned long long)paddr, dir);
3770 return DMA_MAPPING_ERROR;
3774 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3775 enum dma_data_direction dir, unsigned long attrs)
3777 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3778 struct dmar_domain *domain;
3779 phys_addr_t tlb_addr;
3781 domain = find_domain(dev);
3782 if (WARN_ON(!domain))
3785 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3786 if (WARN_ON(!tlb_addr))
3789 intel_unmap(dev, dev_addr, size);
3790 if (is_swiotlb_buffer(tlb_addr))
3791 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3792 aligned_size, dir, attrs);
3794 trace_bounce_unmap_single(dev, dev_addr, size);
3798 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3799 size_t size, enum dma_data_direction dir, unsigned long attrs)
3801 return bounce_map_single(dev, page_to_phys(page) + offset,
3802 size, dir, attrs, *dev->dma_mask);
3806 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3807 enum dma_data_direction dir, unsigned long attrs)
3809 return bounce_map_single(dev, phys_addr, size,
3810 dir, attrs, *dev->dma_mask);
3814 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3815 enum dma_data_direction dir, unsigned long attrs)
3817 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3821 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3822 enum dma_data_direction dir, unsigned long attrs)
3824 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3828 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3829 enum dma_data_direction dir, unsigned long attrs)
3831 struct scatterlist *sg;
3834 for_each_sg(sglist, sg, nelems, i)
3835 bounce_unmap_page(dev, sg->dma_address,
3836 sg_dma_len(sg), dir, attrs);
3840 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3841 enum dma_data_direction dir, unsigned long attrs)
3844 struct scatterlist *sg;
3846 for_each_sg(sglist, sg, nelems, i) {
3847 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3848 sg->offset, sg->length,
3850 if (sg->dma_address == DMA_MAPPING_ERROR)
3852 sg_dma_len(sg) = sg->length;
3855 for_each_sg(sglist, sg, nelems, i)
3856 trace_bounce_map_sg(dev, i + 1, nelems, sg);
3861 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3866 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3867 size_t size, enum dma_data_direction dir)
3869 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3873 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3874 size_t size, enum dma_data_direction dir)
3876 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3880 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3881 int nelems, enum dma_data_direction dir)
3883 struct scatterlist *sg;
3886 for_each_sg(sglist, sg, nelems, i)
3887 bounce_sync_single(dev, sg_dma_address(sg),
3888 sg_dma_len(sg), dir, SYNC_FOR_CPU);
3892 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3893 int nelems, enum dma_data_direction dir)
3895 struct scatterlist *sg;
3898 for_each_sg(sglist, sg, nelems, i)
3899 bounce_sync_single(dev, sg_dma_address(sg),
3900 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3903 static const struct dma_map_ops bounce_dma_ops = {
3904 .alloc = intel_alloc_coherent,
3905 .free = intel_free_coherent,
3906 .map_sg = bounce_map_sg,
3907 .unmap_sg = bounce_unmap_sg,
3908 .map_page = bounce_map_page,
3909 .unmap_page = bounce_unmap_page,
3910 .sync_single_for_cpu = bounce_sync_single_for_cpu,
3911 .sync_single_for_device = bounce_sync_single_for_device,
3912 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
3913 .sync_sg_for_device = bounce_sync_sg_for_device,
3914 .map_resource = bounce_map_resource,
3915 .unmap_resource = bounce_unmap_resource,
3916 .dma_supported = dma_direct_supported,
3919 static inline int iommu_domain_cache_init(void)
3923 iommu_domain_cache = kmem_cache_create("iommu_domain",
3924 sizeof(struct dmar_domain),
3929 if (!iommu_domain_cache) {
3930 pr_err("Couldn't create iommu_domain cache\n");
3937 static inline int iommu_devinfo_cache_init(void)
3941 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3942 sizeof(struct device_domain_info),
3946 if (!iommu_devinfo_cache) {
3947 pr_err("Couldn't create devinfo cache\n");
3954 static int __init iommu_init_mempool(void)
3957 ret = iova_cache_get();
3961 ret = iommu_domain_cache_init();
3965 ret = iommu_devinfo_cache_init();
3969 kmem_cache_destroy(iommu_domain_cache);
3976 static void __init iommu_exit_mempool(void)
3978 kmem_cache_destroy(iommu_devinfo_cache);
3979 kmem_cache_destroy(iommu_domain_cache);
3983 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3985 struct dmar_drhd_unit *drhd;
3989 /* We know that this device on this chipset has its own IOMMU.
3990 * If we find it under a different IOMMU, then the BIOS is lying
3991 * to us. Hope that the IOMMU for this device is actually
3992 * disabled, and it needs no translation...
3994 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3996 /* "can't" happen */
3997 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4000 vtbar &= 0xffff0000;
4002 /* we know that this IOMMU should be at offset 0xa000 from vtbar */
4003 drhd = dmar_find_matched_drhd_unit(pdev);
4004 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
4005 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
4006 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4007 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4010 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
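/*
 * Mark DMAR units that need no remapping: units with an empty device
 * scope, and graphics-only units when dmar_map_gfx is disabled. The
 * devices behind such units get the dummy domain info so they bypass
 * translation.
 */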
4012 static void __init init_no_remapping_devices(void)
4014 struct dmar_drhd_unit *drhd;
4018 for_each_drhd_unit(drhd) {
4019 if (!drhd->include_all) {
4020 for_each_active_dev_scope(drhd->devices,
4021 drhd->devices_cnt, i, dev)
4023 /* ignore DMAR unit if no devices exist */
4024 if (i == drhd->devices_cnt)
4029 for_each_active_drhd_unit(drhd) {
4030 if (drhd->include_all)
4033 for_each_active_dev_scope(drhd->devices,
4034 drhd->devices_cnt, i, dev)
4035 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4037 if (i < drhd->devices_cnt)
4040 /* This IOMMU has *only* gfx devices. Either bypass it or
4041 set the gfx_mapped flag, as appropriate */
4042 if (!dmar_map_gfx) {
4044 for_each_active_dev_scope(drhd->devices,
4045 drhd->devices_cnt, i, dev)
4046 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4051 #ifdef CONFIG_SUSPEND
4052 static int init_iommu_hw(void)
4054 struct dmar_drhd_unit *drhd;
4055 struct intel_iommu *iommu = NULL;
4057 for_each_active_iommu(iommu, drhd)
4059 dmar_reenable_qi(iommu);
4061 for_each_iommu(iommu, drhd) {
4062 if (drhd->ignored) {
4064 * we always have to disable PMRs or DMA may fail on
4068 iommu_disable_protect_mem_regions(iommu);
4072 iommu_flush_write_buffer(iommu);
4074 iommu_set_root_entry(iommu);
4076 iommu->flush.flush_context(iommu, 0, 0, 0,
4077 DMA_CCMD_GLOBAL_INVL);
4078 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4079 iommu_enable_translation(iommu);
4080 iommu_disable_protect_mem_regions(iommu);
4086 static void iommu_flush_all(void)
4088 struct dmar_drhd_unit *drhd;
4089 struct intel_iommu *iommu;
4091 for_each_active_iommu(iommu, drhd) {
4092 iommu->flush.flush_context(iommu, 0, 0, 0,
4093 DMA_CCMD_GLOBAL_INVL);
4094 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4095 DMA_TLB_GLOBAL_FLUSH);
4099 static int iommu_suspend(void)
4101 struct dmar_drhd_unit *drhd;
4102 struct intel_iommu *iommu = NULL;
4105 for_each_active_iommu(iommu, drhd) {
4106 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4108 if (!iommu->iommu_state)
4114 for_each_active_iommu(iommu, drhd) {
4115 iommu_disable_translation(iommu);
4117 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4119 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4120 readl(iommu->reg + DMAR_FECTL_REG);
4121 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4122 readl(iommu->reg + DMAR_FEDATA_REG);
4123 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4124 readl(iommu->reg + DMAR_FEADDR_REG);
4125 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4126 readl(iommu->reg + DMAR_FEUADDR_REG);
4128 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4133 for_each_active_iommu(iommu, drhd)
4134 kfree(iommu->iommu_state);
4139 static void iommu_resume(void)
4141 struct dmar_drhd_unit *drhd;
4142 struct intel_iommu *iommu = NULL;
4145 if (init_iommu_hw()) {
4147 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4149 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4153 for_each_active_iommu(iommu, drhd) {
4155 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4157 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4158 iommu->reg + DMAR_FECTL_REG);
4159 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4160 iommu->reg + DMAR_FEDATA_REG);
4161 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4162 iommu->reg + DMAR_FEADDR_REG);
4163 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4164 iommu->reg + DMAR_FEUADDR_REG);
4166 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4169 for_each_active_iommu(iommu, drhd)
4170 kfree(iommu->iommu_state);
4173 static struct syscore_ops iommu_syscore_ops = {
4174 .resume = iommu_resume,
4175 .suspend = iommu_suspend,
4178 static void __init init_iommu_pm_ops(void)
4180 register_syscore_ops(&iommu_syscore_ops);
4184 static inline void init_iommu_pm_ops(void) {}
4185 #endif /* CONFIG_PM */
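/*
 * An RMRR must describe a page-aligned, non-empty region and pass the
 * architecture-specific check; anything else is reported as firmware
 * breakage by the caller.
 */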
4187 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4189 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4190 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4191 rmrr->end_address <= rmrr->base_address ||
4192 arch_rmrr_sanity_check(rmrr))
4198 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4200 struct acpi_dmar_reserved_memory *rmrr;
4201 struct dmar_rmrr_unit *rmrru;
4203 rmrr = (struct acpi_dmar_reserved_memory *)header;
4204 if (rmrr_sanity_check(rmrr)) {
4206 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4207 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4208 rmrr->base_address, rmrr->end_address,
4209 dmi_get_system_info(DMI_BIOS_VENDOR),
4210 dmi_get_system_info(DMI_BIOS_VERSION),
4211 dmi_get_system_info(DMI_PRODUCT_VERSION));
4212 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4215 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4219 rmrru->hdr = header;
4221 rmrru->base_address = rmrr->base_address;
4222 rmrru->end_address = rmrr->end_address;
4224 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4225 ((void *)rmrr) + rmrr->header.length,
4226 &rmrru->devices_cnt);
4227 if (rmrru->devices_cnt && rmrru->devices == NULL)
4230 list_add(&rmrru->list, &dmar_rmrr_units);
4239 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4241 struct dmar_atsr_unit *atsru;
4242 struct acpi_dmar_atsr *tmp;
4244 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4246 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4247 if (atsr->segment != tmp->segment)
4249 if (atsr->header.length != tmp->header.length)
4251 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4258 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4260 struct acpi_dmar_atsr *atsr;
4261 struct dmar_atsr_unit *atsru;
4263 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4266 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4267 atsru = dmar_find_atsr(atsr);
4271 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4276 * If memory is allocated from slab by ACPI _DSM method, we need to
4277 * copy the memory content because the memory buffer will be freed
4280 atsru->hdr = (void *)(atsru + 1);
4281 memcpy(atsru->hdr, hdr, hdr->length);
4282 atsru->include_all = atsr->flags & 0x1;
4283 if (!atsru->include_all) {
4284 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4285 (void *)atsr + atsr->header.length,
4286 &atsru->devices_cnt);
4287 if (atsru->devices_cnt && atsru->devices == NULL) {
4293 list_add_rcu(&atsru->list, &dmar_atsr_units);
4298 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4300 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4304 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4306 struct acpi_dmar_atsr *atsr;
4307 struct dmar_atsr_unit *atsru;
4309 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4310 atsru = dmar_find_atsr(atsr);
4312 list_del_rcu(&atsru->list);
4314 intel_iommu_free_atsr(atsru);
4320 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4324 struct acpi_dmar_atsr *atsr;
4325 struct dmar_atsr_unit *atsru;
4327 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4328 atsru = dmar_find_atsr(atsr);
4332 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4333 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4341 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4344 struct intel_iommu *iommu = dmaru->iommu;
4346 if (g_iommus[iommu->seq_id])
4349 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4350 pr_warn("%s: Doesn't support hardware pass through.\n",
4354 if (!ecap_sc_support(iommu->ecap) &&
4355 domain_update_iommu_snooping(iommu)) {
4356 pr_warn("%s: Doesn't support snooping.\n",
4360 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4361 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4362 pr_warn("%s: Doesn't support large page.\n",
4368 * Disable translation if already enabled prior to OS handover.
4370 if (iommu->gcmd & DMA_GCMD_TE)
4371 iommu_disable_translation(iommu);
4373 g_iommus[iommu->seq_id] = iommu;
4374 ret = iommu_init_domains(iommu);
4376 ret = iommu_alloc_root_entry(iommu);
4380 intel_svm_check(iommu);
4382 if (dmaru->ignored) {
4384 * we always have to disable PMRs or DMA may fail on this device
4387 iommu_disable_protect_mem_regions(iommu);
4391 intel_iommu_init_qi(iommu);
4392 iommu_flush_write_buffer(iommu);
4394 #ifdef CONFIG_INTEL_IOMMU_SVM
4395 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4396 ret = intel_svm_enable_prq(iommu);
4401 ret = dmar_set_interrupt(iommu);
4405 iommu_set_root_entry(iommu);
4406 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4407 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4408 iommu_enable_translation(iommu);
4410 iommu_disable_protect_mem_regions(iommu);
4414 disable_dmar_iommu(iommu);
4416 free_dmar_iommu(iommu);
4420 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4423 struct intel_iommu *iommu = dmaru->iommu;
4425 if (!intel_iommu_enabled)
4431 ret = intel_iommu_add(dmaru);
4433 disable_dmar_iommu(iommu);
4434 free_dmar_iommu(iommu);
4440 static void intel_iommu_free_dmars(void)
4442 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4443 struct dmar_atsr_unit *atsru, *atsr_n;
4445 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4446 list_del(&rmrru->list);
4447 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4451 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4452 list_del(&atsru->list);
4453 intel_iommu_free_atsr(atsru);
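/*
 * Check whether ATS may be used for @dev: walk up the bus hierarchy to
 * the PCIe root port and see if an ATSR unit for this segment lists
 * that root port (or is an include-all ATSR).
 */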
4457 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4460 struct pci_bus *bus;
4461 struct pci_dev *bridge = NULL;
4463 struct acpi_dmar_atsr *atsr;
4464 struct dmar_atsr_unit *atsru;
4466 dev = pci_physfn(dev);
4467 for (bus = dev->bus; bus; bus = bus->parent) {
4469 /* If it's an integrated device, allow ATS */
4472 /* Connected via non-PCIe: no ATS */
4473 if (!pci_is_pcie(bridge) ||
4474 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4476 /* If we found the root port, look it up in the ATSR */
4477 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4482 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4483 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4484 if (atsr->segment != pci_domain_nr(dev->bus))
4487 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4488 if (tmp == &bridge->dev)
4491 if (atsru->include_all)
4501 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4504 struct dmar_rmrr_unit *rmrru;
4505 struct dmar_atsr_unit *atsru;
4506 struct acpi_dmar_atsr *atsr;
4507 struct acpi_dmar_reserved_memory *rmrr;
4509 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4512 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4513 rmrr = container_of(rmrru->hdr,
4514 struct acpi_dmar_reserved_memory, header);
4515 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4516 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4517 ((void *)rmrr) + rmrr->header.length,
4518 rmrr->segment, rmrru->devices,
4519 rmrru->devices_cnt);
4522 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4523 dmar_remove_dev_scope(info, rmrr->segment,
4524 rmrru->devices, rmrru->devices_cnt);
4528 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4529 if (atsru->include_all)
4532 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4533 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4534 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4535 (void *)atsr + atsr->header.length,
4536 atsr->segment, atsru->devices,
4537 atsru->devices_cnt);
4542 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4543 if (dmar_remove_dev_scope(info, atsr->segment,
4544 atsru->devices, atsru->devices_cnt))
4552 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4553 unsigned long val, void *v)
4555 struct memory_notify *mhp = v;
4556 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4557 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4561 case MEM_GOING_ONLINE:
4562 if (iommu_domain_identity_map(si_domain,
4563 start_vpfn, last_vpfn)) {
4564 pr_warn("Failed to build identity map for [%lx-%lx]\n",
4565 start_vpfn, last_vpfn);
4571 case MEM_CANCEL_ONLINE:
4573 struct dmar_drhd_unit *drhd;
4574 struct intel_iommu *iommu;
4575 struct page *freelist;
4577 freelist = domain_unmap(si_domain,
4578 start_vpfn, last_vpfn);
4581 for_each_active_iommu(iommu, drhd)
4582 iommu_flush_iotlb_psi(iommu, si_domain,
4583 start_vpfn, mhp->nr_pages,
4586 dma_free_pagelist(freelist);
4594 static struct notifier_block intel_iommu_memory_nb = {
4595 .notifier_call = intel_iommu_memory_notifier,
4599 static void free_all_cpu_cached_iovas(unsigned int cpu)
4603 for (i = 0; i < g_num_of_iommus; i++) {
4604 struct intel_iommu *iommu = g_iommus[i];
4605 struct dmar_domain *domain;
4611 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4612 domain = get_iommu_domain(iommu, (u16)did);
4614 if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4617 free_cpu_cached_iovas(cpu, &domain->iovad);
4622 static int intel_iommu_cpu_dead(unsigned int cpu)
4624 free_all_cpu_cached_iovas(cpu);
4628 static void intel_disable_iommus(void)
4630 struct intel_iommu *iommu = NULL;
4631 struct dmar_drhd_unit *drhd;
4633 for_each_iommu(iommu, drhd)
4634 iommu_disable_translation(iommu);
4637 void intel_iommu_shutdown(void)
4639 struct dmar_drhd_unit *drhd;
4640 struct intel_iommu *iommu = NULL;
4642 if (no_iommu || dmar_disabled)
4645 down_write(&dmar_global_lock);
4647 /* Disable PMRs explicitly here. */
4648 for_each_iommu(iommu, drhd)
4649 iommu_disable_protect_mem_regions(iommu);
4651 /* Make sure the IOMMUs are switched off */
4652 intel_disable_iommus();
4654 up_write(&dmar_global_lock);
4657 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4659 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4661 return container_of(iommu_dev, struct intel_iommu, iommu);
4664 static ssize_t intel_iommu_show_version(struct device *dev,
4665 struct device_attribute *attr,
4668 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4669 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4670 return sprintf(buf, "%d:%d\n",
4671 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4673 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4675 static ssize_t intel_iommu_show_address(struct device *dev,
4676 struct device_attribute *attr,
4679 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4680 return sprintf(buf, "%llx\n", iommu->reg_phys);
4682 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4684 static ssize_t intel_iommu_show_cap(struct device *dev,
4685 struct device_attribute *attr,
4688 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4689 return sprintf(buf, "%llx\n", iommu->cap);
4691 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4693 static ssize_t intel_iommu_show_ecap(struct device *dev,
4694 struct device_attribute *attr,
4697 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4698 return sprintf(buf, "%llx\n", iommu->ecap);
4700 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4702 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4703 struct device_attribute *attr,
4706 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4707 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4709 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4711 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4712 struct device_attribute *attr,
4715 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4716 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4717 cap_ndoms(iommu->cap)));
4719 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4721 static struct attribute *intel_iommu_attrs[] = {
4722 &dev_attr_version.attr,
4723 &dev_attr_address.attr,
4725 &dev_attr_ecap.attr,
4726 &dev_attr_domains_supported.attr,
4727 &dev_attr_domains_used.attr,
4731 static struct attribute_group intel_iommu_group = {
4732 .name = "intel-iommu",
4733 .attrs = intel_iommu_attrs,
4736 const struct attribute_group *intel_iommu_groups[] = {
4741 static inline bool has_untrusted_dev(void)
4743 struct pci_dev *pdev = NULL;
4745 for_each_pci_dev(pdev)
4746 if (pdev->untrusted)
4752 static int __init platform_optin_force_iommu(void)
4754 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4757 if (no_iommu || dmar_disabled)
4758 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4761 * If Intel-IOMMU is disabled by default, we will apply identity
4762 * map for all devices except those marked as being untrusted.
4765 iommu_set_default_passthrough(false);
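/*
 * Walk the ACPI namespace devices listed in the DRHD device scopes and
 * probe their physical companion devices that are not already part of
 * an IOMMU group.
 */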
4773 static int __init probe_acpi_namespace_devices(void)
4775 struct dmar_drhd_unit *drhd;
4776 /* To avoid a -Wunused-but-set-variable warning. */
4777 struct intel_iommu *iommu __maybe_unused;
4781 for_each_active_iommu(iommu, drhd) {
4782 for_each_active_dev_scope(drhd->devices,
4783 drhd->devices_cnt, i, dev) {
4784 struct acpi_device_physical_node *pn;
4785 struct iommu_group *group;
4786 struct acpi_device *adev;
4788 if (dev->bus != &acpi_bus_type)
4791 adev = to_acpi_device(dev);
4792 mutex_lock(&adev->physical_node_lock);
4793 list_for_each_entry(pn,
4794 &adev->physical_node_list, node) {
4795 group = iommu_group_get(pn->dev);
4797 iommu_group_put(group);
4801 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4802 ret = iommu_probe_device(pn->dev);
4806 mutex_unlock(&adev->physical_node_lock);
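/*
 * Main VT-d initialization: parse the DMAR table and device scopes,
 * set up the remapping hardware via init_dmars(), register the IOMMUs
 * with sysfs and the IOMMU core, hook up memory/CPU-hotplug notifiers,
 * and finally enable translation on units that do not already have it
 * enabled.
 */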
4816 int __init intel_iommu_init(void)
4819 struct dmar_drhd_unit *drhd;
4820 struct intel_iommu *iommu;
4823 * Intel IOMMU is required for a TXT/tboot launch or platform
4824 * opt in, so enforce that.
4826 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4828 if (iommu_init_mempool()) {
4830 panic("tboot: Failed to initialize iommu memory\n");
4834 down_write(&dmar_global_lock);
4835 if (dmar_table_init()) {
4837 panic("tboot: Failed to initialize DMAR table\n");
4841 if (dmar_dev_scope_init() < 0) {
4843 panic("tboot: Failed to initialize DMAR device scope\n");
4847 up_write(&dmar_global_lock);
4850 * The bus notifier takes the dmar_global_lock, so lockdep will
4851 * complain later when we register it under the lock.
4853 dmar_register_bus_notifier();
4855 down_write(&dmar_global_lock);
4858 intel_iommu_debugfs_init();
4860 if (no_iommu || dmar_disabled) {
4862 * We exit the function here to ensure the IOMMU's remapping and
4863 * mempool aren't set up, which means that the IOMMU's PMRs
4864 * won't be disabled via the call to init_dmars(). So disable
4865 * them explicitly here. The PMRs were set up by tboot prior to
4866 * calling SENTER, but the kernel is expected to reset/tear
4869 if (intel_iommu_tboot_noforce) {
4870 for_each_iommu(iommu, drhd)
4871 iommu_disable_protect_mem_regions(iommu);
4875 * Make sure the IOMMUs are switched off, even when we
4876 * boot into a kexec kernel and the previous kernel left
4879 intel_disable_iommus();
4883 if (list_empty(&dmar_rmrr_units))
4884 pr_info("No RMRR found\n");
4886 if (list_empty(&dmar_atsr_units))
4887 pr_info("No ATSR found\n");
4889 if (dmar_init_reserved_ranges()) {
4891 panic("tboot: Failed to reserve iommu ranges\n");
4892 goto out_free_reserved_range;
4896 intel_iommu_gfx_mapped = 1;
4898 init_no_remapping_devices();
4903 panic("tboot: Failed to initialize DMARs\n");
4904 pr_err("Initialization failed\n");
4905 goto out_free_reserved_range;
4907 up_write(&dmar_global_lock);
4909 init_iommu_pm_ops();
4911 down_read(&dmar_global_lock);
4912 for_each_active_iommu(iommu, drhd) {
4913 iommu_device_sysfs_add(&iommu->iommu, NULL,
4916 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4917 iommu_device_register(&iommu->iommu);
4919 up_read(&dmar_global_lock);
4921 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4922 if (si_domain && !hw_pass_through)
4923 register_memory_notifier(&intel_iommu_memory_nb);
4924 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4925 intel_iommu_cpu_dead);
4927 down_read(&dmar_global_lock);
4928 if (probe_acpi_namespace_devices())
4929 pr_warn("ACPI name space devices didn't probe correctly\n");
4931 /* Finally, we enable the DMA remapping hardware. */
4932 for_each_iommu(iommu, drhd) {
4933 if (!drhd->ignored && !translation_pre_enabled(iommu))
4934 iommu_enable_translation(iommu);
4936 iommu_disable_protect_mem_regions(iommu);
4938 up_read(&dmar_global_lock);
4940 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4942 intel_iommu_enabled = 1;
4946 out_free_reserved_range:
4947 put_iova_domain(&reserved_iova_list);
4949 intel_iommu_free_dmars();
4950 up_write(&dmar_global_lock);
4951 iommu_exit_mempool();
4955 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4957 struct intel_iommu *iommu = opaque;
4959 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4964 * NB - intel-iommu lacks any sort of reference counting for the users of
4965 * dependent devices. If multiple endpoints have intersecting dependent
4966 * devices, unbinding the driver from any one of them may leave the
4967 * others unable to operate.
4969 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4971 if (!iommu || !dev || !dev_is_pci(dev))
4974 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4977 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4979 struct dmar_domain *domain;
4980 struct intel_iommu *iommu;
4981 unsigned long flags;
4983 assert_spin_locked(&device_domain_lock);
4988 iommu = info->iommu;
4989 domain = info->domain;
4992 if (dev_is_pci(info->dev) && sm_supported(iommu))
4993 intel_pasid_tear_down_entry(iommu, info->dev,
4994 PASID_RID2PASID, false);
4996 iommu_disable_dev_iotlb(info);
4997 if (!dev_is_real_dma_subdevice(info->dev))
4998 domain_context_clear(iommu, info->dev);
4999 intel_pasid_free_table(info->dev);
5002 unlink_domain_info(info);
5004 spin_lock_irqsave(&iommu->lock, flags);
5005 domain_detach_iommu(domain, iommu);
5006 spin_unlock_irqrestore(&iommu->lock, flags);
5008 free_devinfo_mem(info);
5011 static void dmar_remove_one_dev_info(struct device *dev)
5013 struct device_domain_info *info;
5014 unsigned long flags;
5016 spin_lock_irqsave(&device_domain_lock, flags);
5017 info = get_domain_info(dev);
5019 __dmar_remove_one_dev_info(info);
5020 spin_unlock_irqrestore(&device_domain_lock, flags);
5023 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5027 /* calculate AGAW */
5028 domain->gaw = guest_width;
5029 adjust_width = guestwidth_to_adjustwidth(guest_width);
5030 domain->agaw = width_to_agaw(adjust_width);
5032 domain->iommu_coherency = 0;
5033 domain->iommu_snooping = 0;
5034 domain->iommu_superpage = 0;
5035 domain->max_addr = 0;
5037 /* always allocate the top pgd */
5038 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5041 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5045 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5047 init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5048 copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5050 if (!intel_iommu_strict &&
5051 init_iova_flush_queue(&dmar_domain->iovad,
5052 iommu_flush_iova, iova_entry_free))
5053 pr_info("iova flush queue initialization failed\n");
5056 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5058 struct dmar_domain *dmar_domain;
5059 struct iommu_domain *domain;
5062 case IOMMU_DOMAIN_DMA:
5064 case IOMMU_DOMAIN_UNMANAGED:
5065 dmar_domain = alloc_domain(0);
5067 pr_err("Can't allocate dmar_domain\n");
5070 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5071 pr_err("Domain initialization failed\n");
5072 domain_exit(dmar_domain);
5076 if (type == IOMMU_DOMAIN_DMA)
5077 intel_init_iova_domain(dmar_domain);
5079 domain_update_iommu_cap(dmar_domain);
5081 domain = &dmar_domain->domain;
5082 domain->geometry.aperture_start = 0;
5083 domain->geometry.aperture_end =
5084 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5085 domain->geometry.force_aperture = true;
5088 case IOMMU_DOMAIN_IDENTITY:
5089 return &si_domain->domain;
5097 static void intel_iommu_domain_free(struct iommu_domain *domain)
5099 if (domain != &si_domain->domain)
5100 domain_exit(to_dmar_domain(domain));
5104 * Check whether a @domain could be attached to the @dev through the
5105 * aux-domain attach/detach APIs.
5108 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5110 struct device_domain_info *info = get_domain_info(dev);
5112 return info && info->auxd_enabled &&
5113 domain->type == IOMMU_DOMAIN_UNMANAGED;
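/*
 * Illustrative sketch (not part of the driver): the calling sequence this
 * check supports, as seen from a driver using the generic IOMMU API wrappers
 * of this kernel era (names assumed here, matching the ops table at the end
 * of this file):
 *
 *   iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX); // intel_iommu_enable_auxd()
 *   iommu_aux_attach_device(domain, dev);              // aux_domain_add_dev()
 *   pasid = iommu_aux_get_pasid(domain, dev);          // default PASID of the aux domain
 */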
5116 static void auxiliary_link_device(struct dmar_domain *domain,
5119 struct device_domain_info *info = get_domain_info(dev);
5121 assert_spin_locked(&device_domain_lock);
5125 domain->auxd_refcnt++;
5126 list_add(&domain->auxd, &info->auxiliary_domains);
5129 static void auxiliary_unlink_device(struct dmar_domain *domain,
5132 struct device_domain_info *info = get_domain_info(dev);
5134 assert_spin_locked(&device_domain_lock);
5138 list_del(&domain->auxd);
5139 domain->auxd_refcnt--;
5141 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5142 ioasid_free(domain->default_pasid);
5145 static int aux_domain_add_dev(struct dmar_domain *domain,
5150 unsigned long flags;
5151 struct intel_iommu *iommu;
5153 iommu = device_to_iommu(dev, &bus, &devfn);
5157 if (domain->default_pasid <= 0) {
5160 /* No private data needed for the default pasid */
5161 pasid = ioasid_alloc(NULL, PASID_MIN,
5162 pci_max_pasids(to_pci_dev(dev)) - 1,
5164 if (pasid == INVALID_IOASID) {
5165 pr_err("Can't allocate default pasid\n");
5168 domain->default_pasid = pasid;
5171 spin_lock_irqsave(&device_domain_lock, flags);
5173 * iommu->lock must be held to attach the domain to the iommu and to
5174 * set up the PASID entry for second-level translation.
5176 spin_lock(&iommu->lock);
5177 ret = domain_attach_iommu(domain, iommu);
5181 /* Set up the PASID entry for mediated devices: */
5182 if (domain_use_first_level(domain))
5183 ret = domain_setup_first_level(iommu, domain, dev,
5184 domain->default_pasid);
5186 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5187 domain->default_pasid);
5190 spin_unlock(&iommu->lock);
5192 auxiliary_link_device(domain, dev);
5194 spin_unlock_irqrestore(&device_domain_lock, flags);
5199 domain_detach_iommu(domain, iommu);
5201 spin_unlock(&iommu->lock);
5202 spin_unlock_irqrestore(&device_domain_lock, flags);
5203 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5204 ioasid_free(domain->default_pasid);
5209 static void aux_domain_remove_dev(struct dmar_domain *domain,
5212 struct device_domain_info *info;
5213 struct intel_iommu *iommu;
5214 unsigned long flags;
5216 if (!is_aux_domain(dev, &domain->domain))
5219 spin_lock_irqsave(&device_domain_lock, flags);
5220 info = get_domain_info(dev);
5221 iommu = info->iommu;
5223 auxiliary_unlink_device(domain, dev);
5225 spin_lock(&iommu->lock);
5226 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5227 domain_detach_iommu(domain, iommu);
5228 spin_unlock(&iommu->lock);
5230 spin_unlock_irqrestore(&device_domain_lock, flags);
5233 static int prepare_domain_attach_device(struct iommu_domain *domain,
5236 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5237 struct intel_iommu *iommu;
5241 iommu = device_to_iommu(dev, &bus, &devfn);
5245 /* check if this iommu agaw is sufficient for max mapped address */
5246 addr_width = agaw_to_width(iommu->agaw);
5247 if (addr_width > cap_mgaw(iommu->cap))
5248 addr_width = cap_mgaw(iommu->cap);
5250 if (dmar_domain->max_addr > (1LL << addr_width)) {
5251 dev_err(dev, "%s: iommu width (%d) is not "
5252 "sufficient for the mapped address (%llx)\n",
5253 __func__, addr_width, dmar_domain->max_addr);
5256 dmar_domain->gaw = addr_width;
5259 * Knock out extra levels of page tables if necessary
5261 while (iommu->agaw < dmar_domain->agaw) {
5262 struct dma_pte *pte;
5264 pte = dmar_domain->pgd;
5265 if (dma_pte_present(pte)) {
5266 dmar_domain->pgd = (struct dma_pte *)
5267 phys_to_virt(dma_pte_addr(pte));
5268 free_pgtable_page(pte);
5270 dmar_domain->agaw--;
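/*
 * Illustrative example (an assumption, not from the original source): a
 * domain created with DEFAULT_DOMAIN_ADDRESS_WIDTH (57-bit, i.e. 5-level
 * tables, agaw 3) being attached to an IOMMU that only supports 4-level
 * (48-bit, agaw 2) translation. Because dmar_domain->gaw was clamped to
 * addr_width above, only the first slot of each surplus top-level table can
 * be in use, so the loop frees one top level and decrements agaw until it
 * matches iommu->agaw.
 */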
5276 static int intel_iommu_attach_device(struct iommu_domain *domain,
5281 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5282 device_is_rmrr_locked(dev)) {
5283 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5287 if (is_aux_domain(dev, domain))
5290 /* normally dev is not mapped */
5291 if (unlikely(domain_context_mapped(dev))) {
5292 struct dmar_domain *old_domain;
5294 old_domain = find_domain(dev);
5296 dmar_remove_one_dev_info(dev);
5299 ret = prepare_domain_attach_device(domain, dev);
5303 return domain_add_dev_info(to_dmar_domain(domain), dev);
5306 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5311 if (!is_aux_domain(dev, domain))
5314 ret = prepare_domain_attach_device(domain, dev);
5318 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5321 static void intel_iommu_detach_device(struct iommu_domain *domain,
5324 dmar_remove_one_dev_info(dev);
5327 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5330 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5334 * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5335 * VT-d granularity. Invalidation is typically included in the unmap operation
5336 * as a result of DMA or VFIO unmap. However, for assigned devices the guest
5337 * owns the first-level page tables. Invalidations of translation caches in the
5338 * guest are trapped and passed down to the host.
5340 * The vIOMMU in the guest will only expose first-level page tables, therefore
5341 * we do not support IOTLB granularity for requests without PASID (second level).
5343 * For example, to find the VT-d granularity encoding for IOTLB
5344 * type and page selective granularity within PASID:
5345 * X: indexed by iommu cache type
5346 * Y: indexed by enum iommu_inv_granularity
5347 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5351 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5353 * PASID based IOTLB invalidation: PASID selective (per PASID),
5354 * page selective (address granularity)
5356 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5357 /* PASID based dev TLBs */
5358 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5360 {-EINVAL, -EINVAL, -EINVAL}
5363 static inline int to_vtd_granularity(int type, int granu)
5365 return inv_type_granu_table[type][granu];
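/*
 * Illustrative lookup (mirrors the example given in the table comment
 * above): an address-selective IOTLB invalidation that carries a PASID
 * indexes
 *
 *   to_vtd_granularity(IOMMU_CACHE_INV_TYPE_IOTLB, IOMMU_INV_GRANU_ADDR)
 *
 * which yields QI_GRAN_PSI_PASID (page-selective-within-PASID), while a
 * PASID-selective request in the same row yields QI_GRAN_NONG_PASID.
 */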
5368 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5370 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5372 /* The VT-d size field is encoded as 2^size 4KiB pages: 0 for 4KiB, 9 for 2MiB, etc.
5373 * The IOMMU cache invalidate API passes granu_size in bytes and the number of
5374 * granules of that size in contiguous memory.
5376 return order_base_2(nr_pages);
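/*
 * Illustrative arithmetic (assumption: a 2MiB range expressed as 4KiB
 * granules): granu_size = 4096 and nr_granules = 512 give
 *
 *   nr_pages = (4096 * 512) >> VTD_PAGE_SHIFT = 512
 *   order_base_2(512) = 9
 *
 * i.e. the "9 for 2MiB" encoding mentioned above.
 */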
5379 #ifdef CONFIG_INTEL_IOMMU_SVM
5381 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5382 struct iommu_cache_invalidate_info *inv_info)
5384 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5385 struct device_domain_info *info;
5386 struct intel_iommu *iommu;
5387 unsigned long flags;
5394 if (!inv_info || !dmar_domain ||
5395 inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
5398 if (!dev || !dev_is_pci(dev))
5401 iommu = device_to_iommu(dev, &bus, &devfn);
5405 if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5408 spin_lock_irqsave(&device_domain_lock, flags);
5409 spin_lock(&iommu->lock);
5410 info = get_domain_info(dev);
5415 did = dmar_domain->iommu_did[iommu->seq_id];
5416 sid = PCI_DEVID(bus, devfn);
5418 /* Size is only valid in address selective invalidation */
5419 if (inv_info->granularity != IOMMU_INV_GRANU_PASID)
5420 size = to_vtd_size(inv_info->addr_info.granule_size,
5421 inv_info->addr_info.nb_granules);
5423 for_each_set_bit(cache_type,
5424 (unsigned long *)&inv_info->cache,
5425 IOMMU_CACHE_INV_TYPE_NR) {
5429 granu = to_vtd_granularity(cache_type, inv_info->granularity);
5430 if (granu == -EINVAL) {
5431 pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5432 cache_type, inv_info->granularity);
5437 * PASID is stored in different locations based on the
5440 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5441 (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5442 pasid = inv_info->pasid_info.pasid;
5443 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5444 (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5445 pasid = inv_info->addr_info.pasid;
5447 switch (BIT(cache_type)) {
5448 case IOMMU_CACHE_INV_TYPE_IOTLB:
5449 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5451 (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5452 pr_err_ratelimited("Address out of range, 0x%llx, size order %llu\n",
5453 inv_info->addr_info.addr, size);
5459 * If granu is PASID-selective, address is ignored.
5460 * We use npages = -1 to indicate that.
5462 qi_flush_piotlb(iommu, did, pasid,
5463 mm_to_dma_pfn(inv_info->addr_info.addr),
5464 (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5465 inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5468 * Always flush device IOTLB if ATS is enabled. vIOMMU
5469 * in the guest may assume IOTLB flush is inclusive,
5470 * which is more efficient.
5472 if (info->ats_enabled)
5473 qi_flush_dev_iotlb_pasid(iommu, sid,
5476 inv_info->addr_info.addr,
5479 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5480 if (info->ats_enabled)
5481 qi_flush_dev_iotlb_pasid(iommu, sid,
5484 inv_info->addr_info.addr,
5487 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5490 dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5496 spin_unlock(&iommu->lock);
5497 spin_unlock_irqrestore(&device_domain_lock, flags);
5503 static int intel_iommu_map(struct iommu_domain *domain,
5504 unsigned long iova, phys_addr_t hpa,
5505 size_t size, int iommu_prot, gfp_t gfp)
5507 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5512 if (iommu_prot & IOMMU_READ)
5513 prot |= DMA_PTE_READ;
5514 if (iommu_prot & IOMMU_WRITE)
5515 prot |= DMA_PTE_WRITE;
5516 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5517 prot |= DMA_PTE_SNP;
5519 max_addr = iova + size;
5520 if (dmar_domain->max_addr < max_addr) {
5523 /* check if minimum agaw is sufficient for mapped address */
5524 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5525 if (end < max_addr) {
5526 pr_err("%s: iommu width (%d) is not "
5527 "sufficient for the mapped address (%llx)\n",
5528 __func__, dmar_domain->gaw, max_addr);
5531 dmar_domain->max_addr = max_addr;
5533 /* Round up size to the next multiple of PAGE_SIZE if it, together with
5534 the low bits of hpa, would take us onto the next page */
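/*
 * Illustrative example (assuming 4KiB pages): an hpa whose low 12 bits are
 * 0xe00, mapped with size = 0x400, crosses a page boundary, so the mapping
 * must cover two pages even though size itself is below PAGE_SIZE.
 */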
5535 size = aligned_nrpages(hpa, size);
5536 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5537 hpa >> VTD_PAGE_SHIFT, size, prot);
5541 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5542 unsigned long iova, size_t size,
5543 struct iommu_iotlb_gather *gather)
5545 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5546 struct page *freelist = NULL;
5547 unsigned long start_pfn, last_pfn;
5548 unsigned int npages;
5549 int iommu_id, level = 0;
5551 /* Cope with the horrid API, which requires us to unmap more than the
5552 size argument if the IOVA happens to fall in a large-page mapping. */
5553 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5555 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5556 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
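/*
 * Illustrative example (assuming the usual 9-bit stride per level): a 4KiB
 * unmap request that hits a 2MiB superpage comes back with level == 2, so
 * level_to_offset_bits(2) == 9 and size is widened to
 * VTD_PAGE_SIZE << 9 == 2MiB, covering the whole superpage.
 */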
5558 start_pfn = iova >> VTD_PAGE_SHIFT;
5559 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5561 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5563 npages = last_pfn - start_pfn + 1;
5565 for_each_domain_iommu(iommu_id, dmar_domain)
5566 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5567 start_pfn, npages, !freelist, 0);
5569 dma_free_pagelist(freelist);
5571 if (dmar_domain->max_addr == iova + size)
5572 dmar_domain->max_addr = iova;
5577 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5580 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5581 struct dma_pte *pte;
5585 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5586 if (pte && dma_pte_present(pte))
5587 phys = dma_pte_addr(pte) +
5588 (iova & (BIT_MASK(level_to_offset_bits(level) +
5589 VTD_PAGE_SHIFT) - 1));
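/*
 * Illustrative example: for an ordinary 4KiB mapping the walk stops at
 * level 1, so the low 12 bits of the IOVA are added to the page address;
 * for a 2MiB superpage (level 2) the low 21 bits are kept instead.
 */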
5594 static inline bool scalable_mode_support(void)
5596 struct dmar_drhd_unit *drhd;
5597 struct intel_iommu *iommu;
5601 for_each_active_iommu(iommu, drhd) {
5602 if (!sm_supported(iommu)) {
5612 static inline bool iommu_pasid_support(void)
5614 struct dmar_drhd_unit *drhd;
5615 struct intel_iommu *iommu;
5619 for_each_active_iommu(iommu, drhd) {
5620 if (!pasid_supported(iommu)) {
5630 static inline bool nested_mode_support(void)
5632 struct dmar_drhd_unit *drhd;
5633 struct intel_iommu *iommu;
5637 for_each_active_iommu(iommu, drhd) {
5638 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5648 static bool intel_iommu_capable(enum iommu_cap cap)
5650 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5651 return domain_update_iommu_snooping(NULL) == 1;
5652 if (cap == IOMMU_CAP_INTR_REMAP)
5653 return irq_remapping_enabled == 1;
5658 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5660 struct intel_iommu *iommu;
5663 iommu = device_to_iommu(dev, &bus, &devfn);
5665 return ERR_PTR(-ENODEV);
5667 if (translation_pre_enabled(iommu))
5668 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5670 return &iommu->iommu;
5673 static void intel_iommu_release_device(struct device *dev)
5675 struct intel_iommu *iommu;
5678 iommu = device_to_iommu(dev, &bus, &devfn);
5682 dmar_remove_one_dev_info(dev);
5684 set_dma_ops(dev, NULL);
5687 static void intel_iommu_probe_finalize(struct device *dev)
5689 struct iommu_domain *domain;
5691 domain = iommu_get_domain_for_dev(dev);
5692 if (device_needs_bounce(dev))
5693 set_dma_ops(dev, &bounce_dma_ops);
5694 else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5695 set_dma_ops(dev, &intel_dma_ops);
5697 set_dma_ops(dev, NULL);
5700 static void intel_iommu_get_resv_regions(struct device *device,
5701 struct list_head *head)
5703 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5704 struct iommu_resv_region *reg;
5705 struct dmar_rmrr_unit *rmrr;
5706 struct device *i_dev;
5709 down_read(&dmar_global_lock);
5710 for_each_rmrr_units(rmrr) {
5711 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5713 struct iommu_resv_region *resv;
5714 enum iommu_resv_type type;
5717 if (i_dev != device &&
5718 !is_downstream_to_pci_bridge(device, i_dev))
5721 length = rmrr->end_address - rmrr->base_address + 1;
5723 type = device_rmrr_is_relaxable(device) ?
5724 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5726 resv = iommu_alloc_resv_region(rmrr->base_address,
5727 length, prot, type);
5731 list_add_tail(&resv->list, head);
5734 up_read(&dmar_global_lock);
5736 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5737 if (dev_is_pci(device)) {
5738 struct pci_dev *pdev = to_pci_dev(device);
5740 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5741 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5742 IOMMU_RESV_DIRECT_RELAXABLE);
5744 list_add_tail(&reg->list, head);
5747 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5749 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5750 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5754 list_add_tail(&reg->list, head);
5757 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5759 struct device_domain_info *info;
5760 struct context_entry *context;
5761 struct dmar_domain *domain;
5762 unsigned long flags;
5766 domain = find_domain(dev);
5770 spin_lock_irqsave(&device_domain_lock, flags);
5771 spin_lock(&iommu->lock);
5774 info = get_domain_info(dev);
5775 if (!info || !info->pasid_supported)
5778 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5779 if (WARN_ON(!context))
5782 ctx_lo = context[0].lo;
5784 if (!(ctx_lo & CONTEXT_PASIDE)) {
5785 ctx_lo |= CONTEXT_PASIDE;
5786 context[0].lo = ctx_lo;
5788 iommu->flush.flush_context(iommu,
5789 domain->iommu_did[iommu->seq_id],
5790 PCI_DEVID(info->bus, info->devfn),
5791 DMA_CCMD_MASK_NOBIT,
5792 DMA_CCMD_DEVICE_INVL);
5795 /* Enable PASID support in the device, if it wasn't already */
5796 if (!info->pasid_enabled)
5797 iommu_enable_dev_iotlb(info);
5802 spin_unlock(&iommu->lock);
5803 spin_unlock_irqrestore(&device_domain_lock, flags);
5808 static void intel_iommu_apply_resv_region(struct device *dev,
5809 struct iommu_domain *domain,
5810 struct iommu_resv_region *region)
5812 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5813 unsigned long start, end;
5815 start = IOVA_PFN(region->start);
5816 end = IOVA_PFN(region->start + region->length - 1);
5818 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5821 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5823 if (dev_is_pci(dev))
5824 return pci_device_group(dev);
5825 return generic_device_group(dev);
5828 #ifdef CONFIG_INTEL_IOMMU_SVM
5829 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5831 struct intel_iommu *iommu;
5834 if (iommu_dummy(dev)) {
5836 "No IOMMU translation for device; cannot enable SVM\n");
5840 iommu = device_to_iommu(dev, &bus, &devfn);
5842 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5848 #endif /* CONFIG_INTEL_IOMMU_SVM */
5850 static int intel_iommu_enable_auxd(struct device *dev)
5852 struct device_domain_info *info;
5853 struct intel_iommu *iommu;
5854 unsigned long flags;
5858 iommu = device_to_iommu(dev, &bus, &devfn);
5859 if (!iommu || dmar_disabled)
5862 if (!sm_supported(iommu) || !pasid_supported(iommu))
5865 ret = intel_iommu_enable_pasid(iommu, dev);
5869 spin_lock_irqsave(&device_domain_lock, flags);
5870 info = get_domain_info(dev);
5871 info->auxd_enabled = 1;
5872 spin_unlock_irqrestore(&device_domain_lock, flags);
5877 static int intel_iommu_disable_auxd(struct device *dev)
5879 struct device_domain_info *info;
5880 unsigned long flags;
5882 spin_lock_irqsave(&device_domain_lock, flags);
5883 info = get_domain_info(dev);
5884 if (!WARN_ON(!info))
5885 info->auxd_enabled = 0;
5886 spin_unlock_irqrestore(&device_domain_lock, flags);
5892 * A PCI Express Designated Vendor-Specific Extended Capability is defined
5893 * in section 3.7 of the Intel Scalable I/O Virtualization technical spec
5894 * for system software and tools to detect endpoint devices supporting
5895 * Intel Scalable I/O Virtualization without any host driver dependency.
5897 * Returns the offset of the matching extended capability structure within
5898 * the device's PCI configuration space, or 0 if the device does not support it.
5901 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5906 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5908 pci_read_config_word(pdev, pos + 4, &vendor);
5909 pci_read_config_word(pdev, pos + 8, &id);
5910 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5913 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
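/*
 * Illustrative usage (mirrors the IOMMU_DEV_FEAT_AUX probe just below in
 * intel_iommu_dev_has_feat()): callers only care whether the DVSEC is
 * present at all, e.g.
 *
 *   if (dev_is_pci(dev) && siov_find_pci_dvsec(to_pci_dev(dev)))
 *           the device advertises Intel Scalable I/O Virtualization
 */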
5920 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5922 if (feat == IOMMU_DEV_FEAT_AUX) {
5925 if (!dev_is_pci(dev) || dmar_disabled ||
5926 !scalable_mode_support() || !iommu_pasid_support())
5929 ret = pci_pasid_features(to_pci_dev(dev));
5933 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5936 if (feat == IOMMU_DEV_FEAT_SVA) {
5937 struct device_domain_info *info = get_domain_info(dev);
5939 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5940 info->pasid_supported && info->pri_supported &&
5941 info->ats_supported;
5948 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5950 if (feat == IOMMU_DEV_FEAT_AUX)
5951 return intel_iommu_enable_auxd(dev);
5953 if (feat == IOMMU_DEV_FEAT_SVA) {
5954 struct device_domain_info *info = get_domain_info(dev);
5959 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5967 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5969 if (feat == IOMMU_DEV_FEAT_AUX)
5970 return intel_iommu_disable_auxd(dev);
5976 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5978 struct device_domain_info *info = get_domain_info(dev);
5980 if (feat == IOMMU_DEV_FEAT_AUX)
5981 return scalable_mode_support() && info && info->auxd_enabled;
5987 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5989 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5991 return dmar_domain->default_pasid > 0 ?
5992 dmar_domain->default_pasid : -EINVAL;
5995 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5998 return attach_deferred(dev);
6002 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6003 enum iommu_attr attr, void *data)
6005 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6006 unsigned long flags;
6009 if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6013 case DOMAIN_ATTR_NESTING:
6014 spin_lock_irqsave(&device_domain_lock, flags);
6015 if (nested_mode_support() &&
6016 list_empty(&dmar_domain->devices)) {
6017 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6018 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6022 spin_unlock_irqrestore(&device_domain_lock, flags);
6033 * Check that the device does not live on an external-facing PCI port that is
6034 * marked as untrusted. Such devices should not have quirks applied to them,
6035 * and thus should not be able to bypass the IOMMU restrictions.
6037 static bool risky_device(struct pci_dev *pdev)
6039 if (pdev->untrusted) {
6041 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6042 pdev->vendor, pdev->device);
6043 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6049 const struct iommu_ops intel_iommu_ops = {
6050 .capable = intel_iommu_capable,
6051 .domain_alloc = intel_iommu_domain_alloc,
6052 .domain_free = intel_iommu_domain_free,
6053 .domain_set_attr = intel_iommu_domain_set_attr,
6054 .attach_dev = intel_iommu_attach_device,
6055 .detach_dev = intel_iommu_detach_device,
6056 .aux_attach_dev = intel_iommu_aux_attach_device,
6057 .aux_detach_dev = intel_iommu_aux_detach_device,
6058 .aux_get_pasid = intel_iommu_aux_get_pasid,
6059 .map = intel_iommu_map,
6060 .unmap = intel_iommu_unmap,
6061 .iova_to_phys = intel_iommu_iova_to_phys,
6062 .probe_device = intel_iommu_probe_device,
6063 .probe_finalize = intel_iommu_probe_finalize,
6064 .release_device = intel_iommu_release_device,
6065 .get_resv_regions = intel_iommu_get_resv_regions,
6066 .put_resv_regions = generic_iommu_put_resv_regions,
6067 .apply_resv_region = intel_iommu_apply_resv_region,
6068 .device_group = intel_iommu_device_group,
6069 .dev_has_feat = intel_iommu_dev_has_feat,
6070 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
6071 .dev_enable_feat = intel_iommu_dev_enable_feat,
6072 .dev_disable_feat = intel_iommu_dev_disable_feat,
6073 .is_attach_deferred = intel_iommu_is_attach_deferred,
6074 .def_domain_type = device_def_domain_type,
6075 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
6076 #ifdef CONFIG_INTEL_IOMMU_SVM
6077 .cache_invalidate = intel_iommu_sva_invalidate,
6078 .sva_bind_gpasid = intel_svm_bind_gpasid,
6079 .sva_unbind_gpasid = intel_svm_unbind_gpasid,
6080 .sva_bind = intel_svm_bind,
6081 .sva_unbind = intel_svm_unbind,
6082 .sva_get_pasid = intel_svm_get_pasid,
6086 static void quirk_iommu_igfx(struct pci_dev *dev)
6088 if (risky_device(dev))
6091 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6095 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6096 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6097 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6098 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6099 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6100 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6101 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6102 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6104 /* Broadwell igfx malfunctions with dmar */
6105 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6106 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6107 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6108 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6109 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6110 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6111 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6112 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6113 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6114 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6115 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6116 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6117 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6118 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6119 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6120 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6121 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6122 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6123 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6124 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6125 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6126 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6127 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6128 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6130 static void quirk_iommu_rwbf(struct pci_dev *dev)
6132 if (risky_device(dev))
6136 * Mobile 4 Series Chipset neglects to set RWBF capability,
6137 * but needs it. Same seems to hold for the desktop versions.
6139 pci_info(dev, "Forcing write-buffer flush capability\n");
6143 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6144 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6145 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6146 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6147 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6148 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6149 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6152 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
6153 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
6154 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
6155 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
6156 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
6157 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
6158 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
6159 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
6161 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6165 if (risky_device(dev))
6168 if (pci_read_config_word(dev, GGC, &ggc))
6171 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6172 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6174 } else if (dmar_map_gfx) {
6175 /* we have to ensure the gfx device is idle before we flush */
6176 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6177 intel_iommu_strict = 1;
6180 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6181 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6182 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6183 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6185 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6186 ISOCH DMAR unit for the Azalia sound device, but not give it any
6187 TLB entries, which causes it to deadlock. Check for that. We do
6188 this in a function called from init_dmars(), instead of in a PCI
6189 quirk, because we don't want to print the obnoxious "BIOS broken"
6190 message if VT-d is actually disabled.
6192 static void __init check_tylersburg_isoch(void)
6194 struct pci_dev *pdev;
6195 uint32_t vtisochctrl;
6197 /* If there's no Azalia in the system anyway, forget it. */
6198 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6202 if (risky_device(pdev)) {
6209 /* System Management Registers. Might be hidden, in which case
6210 we can't do the sanity check. But that's OK, because the
6211 known-broken BIOSes _don't_ actually hide it, so far. */
6212 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6216 if (risky_device(pdev)) {
6221 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6228 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6229 if (vtisochctrl & 1)
6232 /* Drop all bits other than the number of TLB entries */
6233 vtisochctrl &= 0x1c;
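/*
 * Illustrative decode (an assumption based on the checks below): after the
 * 0x1c mask the remaining value is treated directly as the number of ISOCH
 * TLB entries, so 0x10 means the recommended 16 entries, while 0 means the
 * Azalia traffic has no TLB entries at all and the device must be identity
 * mapped (IDENTMAP_AZALIA).
 */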
6235 /* If we have the recommended number of TLB entries (16), fine. */
6236 if (vtisochctrl == 0x10)
6239 /* Zero TLB entries? You get to ride the short bus to school. */
6241 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6242 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6243 dmi_get_system_info(DMI_BIOS_VENDOR),
6244 dmi_get_system_info(DMI_BIOS_VERSION),
6245 dmi_get_system_info(DMI_PRODUCT_VERSION));
6246 iommu_identity_mapping |= IDENTMAP_AZALIA;
6250 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",