2 * Copyright © 2006-2014 Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
21 #define pr_fmt(fmt) "DMAR: " fmt
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/dma-direct.h>
35 #include <linux/mempool.h>
36 #include <linux/memory.h>
37 #include <linux/cpu.h>
38 #include <linux/timer.h>
40 #include <linux/iova.h>
41 #include <linux/iommu.h>
42 #include <linux/intel-iommu.h>
43 #include <linux/syscore_ops.h>
44 #include <linux/tboot.h>
45 #include <linux/dmi.h>
46 #include <linux/pci-ats.h>
47 #include <linux/memblock.h>
48 #include <linux/dma-contiguous.h>
49 #include <linux/dma-direct.h>
50 #include <linux/crash_dump.h>
51 #include <asm/irq_remapping.h>
52 #include <asm/cacheflush.h>
53 #include <asm/iommu.h>
55 #include "irq_remapping.h"
57 #define ROOT_SIZE VTD_PAGE_SIZE
58 #define CONTEXT_SIZE VTD_PAGE_SIZE
60 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
61 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
62 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
63 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
65 #define IOAPIC_RANGE_START (0xfee00000)
66 #define IOAPIC_RANGE_END (0xfeefffff)
67 #define IOVA_START_ADDR (0x1000)
69 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
71 #define MAX_AGAW_WIDTH 64
72 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
74 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
75 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
77 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
78 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
79 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
80 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
81 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
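/*
 * Editor's note (illustrative, assuming VTD_PAGE_SHIFT == 12): for gaw = 48,
 * __DOMAIN_MAX_PFN(48) = (1ULL << 36) - 1. On a 64-bit build that already
 * fits in an unsigned long, so DOMAIN_MAX_PFN(48) returns it unchanged; on a
 * 32-bit build the min_t() clamps it to ULONG_MAX so PFN arithmetic can
 * still use plain 'unsigned long', as the comment above intends.
 */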
83 /* IO virtual address start page frame number */
84 #define IOVA_START_PFN (1)
86 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
88 /* page table handling */
89 #define LEVEL_STRIDE (9)
90 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
93 * This bitmap is used to advertise the page sizes our hardware supports
94 * to the IOMMU core, which will then use this information to split
95 * physically contiguous memory regions it is mapping into page sizes
98 * Traditionally the IOMMU core just handed us the mappings directly,
99 * after making sure the size is a power-of-two multiple of 4KiB and that the
100 * mapping has natural alignment.
102 * To retain this behavior, we currently advertise that we support
103 * all page sizes that are a power-of-two multiple of 4KiB.
105 * If at some point we'd like to utilize the IOMMU core's new behavior,
106 * we could change this to advertise the real page sizes we support.
108 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
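/*
 * Editor's note (illustrative): on a 64-bit build ~0xFFFUL sets bits 12..63,
 * i.e. the bitmap advertises 4KiB (bit 12), 8KiB, ..., 2MiB (bit 21),
 * 1GiB (bit 30) and every larger power-of-two size, matching the
 * "power-of-two multiple of 4KiB" behaviour described above.
 */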
110 static inline int agaw_to_level(int agaw)
115 static inline int agaw_to_width(int agaw)
117 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
120 static inline int width_to_agaw(int width)
122 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
125 static inline unsigned int level_to_offset_bits(int level)
127 return (level - 1) * LEVEL_STRIDE;
130 static inline int pfn_level_offset(unsigned long pfn, int level)
132 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
135 static inline unsigned long level_mask(int level)
137 return -1UL << level_to_offset_bits(level);
140 static inline unsigned long level_size(int level)
142 return 1UL << level_to_offset_bits(level);
145 static inline unsigned long align_to_level(unsigned long pfn, int level)
147 return (pfn + level_size(level) - 1) & level_mask(level);
150 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
152 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
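/*
 * Worked example (editor's illustration, using the 9-bit LEVEL_STRIDE above):
 * width_to_agaw(48) = DIV_ROUND_UP(18, 9) = 2 and agaw_to_width(2) = 48.
 * At level 3, level_to_offset_bits(3) = 18, so pfn_level_offset(pfn, 3)
 * selects bits 18-26 of the DMA pfn, while level_size(3) and
 * lvl_to_nr_pages(3) both come to 2^18 pages, i.e. 1GiB with 4KiB pages.
 */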
155 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
156 are never going to work. */
157 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
159 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
164 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
166 static inline unsigned long page_to_dma_pfn(struct page *pg)
168 return mm_to_dma_pfn(page_to_pfn(pg));
170 static inline unsigned long virt_to_dma_pfn(void *p)
172 return page_to_dma_pfn(virt_to_page(p));
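/*
 * Editor's note (illustrative): when PAGE_SHIFT == VTD_PAGE_SHIFT == 12 the
 * two conversions above are identities. On an arch with 64KiB MM pages
 * (PAGE_SHIFT = 16), mm_to_dma_pfn() shifts left by 4, so one struct page
 * spans 16 consecutive DMA pfns - hence the rule above that VT-d pages must
 * never be larger than MM pages.
 */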
175 /* global iommu list, set NULL for ignored DMAR units */
176 static struct intel_iommu **g_iommus;
178 static void __init check_tylersburg_isoch(void);
179 static int rwbf_quirk;
182 * set to 1 to panic the kernel if VT-d can't be enabled successfully
183 * (used when the kernel is launched with TXT)
185 static int force_on = 0;
186 int intel_iommu_tboot_noforce;
191 * 12-63: Context Ptr (12 - (haw-1))
198 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
201 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
204 static phys_addr_t root_entry_lctp(struct root_entry *re)
209 return re->lo & VTD_PAGE_MASK;
213 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
216 static phys_addr_t root_entry_uctp(struct root_entry *re)
221 return re->hi & VTD_PAGE_MASK;
226 * 1: fault processing disable
227 * 2-3: translation type
228 * 12-63: address space root
234 struct context_entry {
239 static inline void context_clear_pasid_enable(struct context_entry *context)
241 context->lo &= ~(1ULL << 11);
244 static inline bool context_pasid_enabled(struct context_entry *context)
246 return !!(context->lo & (1ULL << 11));
249 static inline void context_set_copied(struct context_entry *context)
251 context->hi |= (1ull << 3);
254 static inline bool context_copied(struct context_entry *context)
256 return !!(context->hi & (1ULL << 3));
259 static inline bool __context_present(struct context_entry *context)
261 return (context->lo & 1);
264 static inline bool context_present(struct context_entry *context)
266 return context_pasid_enabled(context) ?
267 __context_present(context) :
268 __context_present(context) && !context_copied(context);
271 static inline void context_set_present(struct context_entry *context)
276 static inline void context_set_fault_enable(struct context_entry *context)
278 context->lo &= (((u64)-1) << 2) | 1;
281 static inline void context_set_translation_type(struct context_entry *context,
284 context->lo &= (((u64)-1) << 4) | 3;
285 context->lo |= (value & 3) << 2;
288 static inline void context_set_address_root(struct context_entry *context,
291 context->lo &= ~VTD_PAGE_MASK;
292 context->lo |= value & VTD_PAGE_MASK;
295 static inline void context_set_address_width(struct context_entry *context,
298 context->hi |= value & 7;
301 static inline void context_set_domain_id(struct context_entry *context,
304 context->hi |= (value & ((1 << 16) - 1)) << 8;
307 static inline int context_domain_id(struct context_entry *c)
309 return((c->hi >> 8) & 0xffff);
312 static inline void context_clear_entry(struct context_entry *context)
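/*
 * Editor's summary of the layout implied by the helpers above (lo/hi being
 * the two 64-bit halves of struct context_entry): lo bit 0 = present,
 * bit 1 = fault processing disable, bits 2-3 = translation type,
 * bit 11 = PASID enable, bits 12-63 = address space root; hi bits 0-2 =
 * address width, bit 3 = software "copied" flag, bits 8-23 = domain id.
 */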
325 * 12-63: Host physical address
331 static inline void dma_clear_pte(struct dma_pte *pte)
336 static inline u64 dma_pte_addr(struct dma_pte *pte)
339 return pte->val & VTD_PAGE_MASK;
341 /* Must have a full atomic 64-bit read */
342 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
346 static inline bool dma_pte_present(struct dma_pte *pte)
348 return (pte->val & 3) != 0;
351 static inline bool dma_pte_superpage(struct dma_pte *pte)
353 return (pte->val & DMA_PTE_LARGE_PAGE);
356 static inline int first_pte_in_page(struct dma_pte *pte)
358 return !((unsigned long)pte & ~VTD_PAGE_MASK);
362 * This domain is a static identity mapping domain.
363 * 1. This domain creates a static 1:1 mapping to all usable memory.
364 * 2. It maps to each iommu if successful.
365 * 3. Each iommu maps to this domain if successful.
367 static struct dmar_domain *si_domain;
368 static int hw_pass_through = 1;
371 * Domain represents a virtual machine; more than one device
372 * across iommus may be owned by one domain, e.g. a kvm guest.
374 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 0)
376 /* si_domain contains multiple devices */
377 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 1)
379 #define for_each_domain_iommu(idx, domain) \
380 for (idx = 0; idx < g_num_of_iommus; idx++) \
381 if (domain->iommu_refcnt[idx])
384 int nid; /* node id */
386 unsigned iommu_refcnt[DMAR_UNITS_SUPPORTED];
387 /* Refcount of devices per iommu */
390 u16 iommu_did[DMAR_UNITS_SUPPORTED];
391 /* Domain ids per IOMMU. Use u16 since
392 * domain ids are 16 bit wide according
393 * to VT-d spec, section 9.3 */
395 bool has_iotlb_device;
396 struct list_head devices; /* all devices' list */
397 struct iova_domain iovad; /* iova's that belong to this domain */
399 struct dma_pte *pgd; /* virtual address */
400 int gaw; /* max guest address width */
402 /* adjusted guest address width, 0 is level 2 30-bit */
405 int flags; /* flags to find out type of domain */
407 int iommu_coherency;/* indicate coherency of iommu access */
408 int iommu_snooping; /* indicate snooping control feature*/
409 int iommu_count; /* reference count of iommu */
410 int iommu_superpage;/* Level of superpages supported:
411 0 == 4KiB (no superpages), 1 == 2MiB,
412 2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
413 u64 max_addr; /* maximum mapped address */
415 struct iommu_domain domain; /* generic domain data structure for
419 /* PCI domain-device relationship */
420 struct device_domain_info {
421 struct list_head link; /* link to domain siblings */
422 struct list_head global; /* link to global list */
423 u8 bus; /* PCI bus number */
424 u8 devfn; /* PCI devfn number */
425 u8 pasid_supported:3;
432 struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
433 struct intel_iommu *iommu; /* IOMMU used by this device */
434 struct dmar_domain *domain; /* pointer to domain */
437 struct dmar_rmrr_unit {
438 struct list_head list; /* list of rmrr units */
439 struct acpi_dmar_header *hdr; /* ACPI header */
440 u64 base_address; /* reserved base address*/
441 u64 end_address; /* reserved end address */
442 struct dmar_dev_scope *devices; /* target devices */
443 int devices_cnt; /* target device count */
444 struct iommu_resv_region *resv; /* reserved region handle */
447 struct dmar_atsr_unit {
448 struct list_head list; /* list of ATSR units */
449 struct acpi_dmar_header *hdr; /* ACPI header */
450 struct dmar_dev_scope *devices; /* target devices */
451 int devices_cnt; /* target device count */
452 u8 include_all:1; /* include all ports */
455 static LIST_HEAD(dmar_atsr_units);
456 static LIST_HEAD(dmar_rmrr_units);
458 #define for_each_rmrr_units(rmrr) \
459 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
461 /* bitmap for indexing intel_iommus */
462 static int g_num_of_iommus;
464 static void domain_exit(struct dmar_domain *domain);
465 static void domain_remove_dev_info(struct dmar_domain *domain);
466 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
468 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
469 static void domain_context_clear(struct intel_iommu *iommu,
471 static int domain_detach_iommu(struct dmar_domain *domain,
472 struct intel_iommu *iommu);
474 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
475 int dmar_disabled = 0;
477 int dmar_disabled = 1;
478 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
480 int intel_iommu_enabled = 0;
481 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
483 static int dmar_map_gfx = 1;
484 static int dmar_forcedac;
485 static int intel_iommu_strict;
486 static int intel_iommu_superpage = 1;
487 static int intel_iommu_ecs = 1;
488 static int iommu_identity_mapping;
490 #define IDENTMAP_ALL 1
491 #define IDENTMAP_GFX 2
492 #define IDENTMAP_AZALIA 4
494 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap))
495 #define pasid_enabled(iommu) (ecs_enabled(iommu) && ecap_pasid(iommu->ecap))
497 int intel_iommu_gfx_mapped;
498 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
500 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
501 static DEFINE_SPINLOCK(device_domain_lock);
502 static LIST_HEAD(device_domain_list);
504 const struct iommu_ops intel_iommu_ops;
506 static bool translation_pre_enabled(struct intel_iommu *iommu)
508 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
511 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
513 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
516 static void init_translation_status(struct intel_iommu *iommu)
520 gsts = readl(iommu->reg + DMAR_GSTS_REG);
521 if (gsts & DMA_GSTS_TES)
522 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
525 /* Convert generic 'struct iommu_domain' to private 'struct dmar_domain' */
526 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
528 return container_of(dom, struct dmar_domain, domain);
531 static int __init intel_iommu_setup(char *str)
536 if (!strncmp(str, "on", 2)) {
538 pr_info("IOMMU enabled\n");
539 } else if (!strncmp(str, "off", 3)) {
541 pr_info("IOMMU disabled\n");
542 } else if (!strncmp(str, "igfx_off", 8)) {
544 pr_info("Disable GFX device mapping\n");
545 } else if (!strncmp(str, "forcedac", 8)) {
546 pr_info("Forcing DAC for PCI devices\n");
548 } else if (!strncmp(str, "strict", 6)) {
549 pr_info("Disable batched IOTLB flush\n");
550 intel_iommu_strict = 1;
551 } else if (!strncmp(str, "sp_off", 6)) {
552 pr_info("Disable supported super page\n");
553 intel_iommu_superpage = 0;
554 } else if (!strncmp(str, "ecs_off", 7)) {
556 "Intel-IOMMU: disable extended context table support\n");
558 } else if (!strncmp(str, "tboot_noforce", 13)) {
560 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
561 intel_iommu_tboot_noforce = 1;
564 str += strcspn(str, ",");
570 __setup("intel_iommu=", intel_iommu_setup);
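/*
 * Illustrative usage (editor's note): the handler above walks a
 * comma-separated list from the kernel command line, e.g.
 *
 *   intel_iommu=on,strict,sp_off
 *
 * which enables the IOMMU, sets intel_iommu_strict = 1 (no batched IOTLB
 * flushing) and intel_iommu_superpage = 0 (no superpages), advancing to the
 * next option with str += strcspn(str, ",").
 */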
572 static struct kmem_cache *iommu_domain_cache;
573 static struct kmem_cache *iommu_devinfo_cache;
575 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
577 struct dmar_domain **domains;
580 domains = iommu->domains[idx];
584 return domains[did & 0xff];
587 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
588 struct dmar_domain *domain)
590 struct dmar_domain **domains;
593 if (!iommu->domains[idx]) {
594 size_t size = 256 * sizeof(struct dmar_domain *);
595 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
598 domains = iommu->domains[idx];
599 if (WARN_ON(!domains))
602 domains[did & 0xff] = domain;
605 static inline void *alloc_pgtable_page(int node)
610 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
612 vaddr = page_address(page);
616 static inline void free_pgtable_page(void *vaddr)
618 free_page((unsigned long)vaddr);
621 static inline void *alloc_domain_mem(void)
623 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
626 static void free_domain_mem(void *vaddr)
628 kmem_cache_free(iommu_domain_cache, vaddr);
631 static inline void * alloc_devinfo_mem(void)
633 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
636 static inline void free_devinfo_mem(void *vaddr)
638 kmem_cache_free(iommu_devinfo_cache, vaddr);
641 static inline int domain_type_is_vm(struct dmar_domain *domain)
643 return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
646 static inline int domain_type_is_si(struct dmar_domain *domain)
648 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
651 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
653 return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
654 DOMAIN_FLAG_STATIC_IDENTITY);
657 static inline int domain_pfn_supported(struct dmar_domain *domain,
660 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
662 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
665 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
670 sagaw = cap_sagaw(iommu->cap);
671 for (agaw = width_to_agaw(max_gaw);
673 if (test_bit(agaw, &sagaw))
681 * Calculate max SAGAW for each iommu.
683 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
685 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
689 * Calculate agaw for each iommu.
690 * "SAGAW" may be different across iommus; use a default agaw, and
691 * fall back to a smaller supported agaw for iommus that don't support the default agaw.
693 int iommu_calculate_agaw(struct intel_iommu *iommu)
695 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
698 /* This function only returns a single iommu in a domain */
699 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
703 /* si_domain and vm domain should not get here. */
704 BUG_ON(domain_type_is_vm_or_si(domain));
705 for_each_domain_iommu(iommu_id, domain)
708 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
711 return g_iommus[iommu_id];
714 static void domain_update_iommu_coherency(struct dmar_domain *domain)
716 struct dmar_drhd_unit *drhd;
717 struct intel_iommu *iommu;
721 domain->iommu_coherency = 1;
723 for_each_domain_iommu(i, domain) {
725 if (!ecap_coherent(g_iommus[i]->ecap)) {
726 domain->iommu_coherency = 0;
733 /* No hardware attached; use lowest common denominator */
735 for_each_active_iommu(iommu, drhd) {
736 if (!ecap_coherent(iommu->ecap)) {
737 domain->iommu_coherency = 0;
744 static int domain_update_iommu_snooping(struct intel_iommu *skip)
746 struct dmar_drhd_unit *drhd;
747 struct intel_iommu *iommu;
751 for_each_active_iommu(iommu, drhd) {
753 if (!ecap_sc_support(iommu->ecap)) {
764 static int domain_update_iommu_superpage(struct intel_iommu *skip)
766 struct dmar_drhd_unit *drhd;
767 struct intel_iommu *iommu;
770 if (!intel_iommu_superpage) {
774 /* set iommu_superpage to the smallest common denominator */
776 for_each_active_iommu(iommu, drhd) {
778 mask &= cap_super_page_val(iommu->cap);
788 /* Some capabilities may be different across iommus */
789 static void domain_update_iommu_cap(struct dmar_domain *domain)
791 domain_update_iommu_coherency(domain);
792 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
793 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
796 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
797 u8 bus, u8 devfn, int alloc)
799 struct root_entry *root = &iommu->root_entry[bus];
800 struct context_entry *context;
804 if (ecs_enabled(iommu)) {
812 context = phys_to_virt(*entry & VTD_PAGE_MASK);
814 unsigned long phy_addr;
818 context = alloc_pgtable_page(iommu->node);
822 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
823 phy_addr = virt_to_phys((void *)context);
824 *entry = phy_addr | 1;
825 __iommu_flush_cache(iommu, entry, sizeof(*entry));
827 return &context[devfn];
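/*
 * Editor's note: in the legacy (non-ECS) layout the lookup above is simply
 * root_entry[bus] -> context_table[devfn], one context table per bus. With
 * ECS enabled, root_entry_lctp()/root_entry_uctp() split the devfn space
 * across a lower and an upper context table, with devfn 0x80 and above
 * reached through the upper pointer - which is why free_context_table()
 * below also walks iommu_context_addr(iommu, i, 0x80, 0).
 */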
830 static int iommu_dummy(struct device *dev)
832 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
835 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
837 struct dmar_drhd_unit *drhd = NULL;
838 struct intel_iommu *iommu;
840 struct pci_dev *ptmp, *pdev = NULL;
844 if (iommu_dummy(dev))
847 if (dev_is_pci(dev)) {
848 struct pci_dev *pf_pdev;
850 pdev = to_pci_dev(dev);
853 /* VMD child devices currently cannot be handled individually */
854 if (is_vmd(pdev->bus))
858 /* VFs aren't listed in scope tables; we need to look up
859 * the PF instead to find the IOMMU. */
860 pf_pdev = pci_physfn(pdev);
862 segment = pci_domain_nr(pdev->bus);
863 } else if (has_acpi_companion(dev))
864 dev = &ACPI_COMPANION(dev)->dev;
867 for_each_active_iommu(iommu, drhd) {
868 if (pdev && segment != drhd->segment)
871 for_each_active_dev_scope(drhd->devices,
872 drhd->devices_cnt, i, tmp) {
874 /* For a VF use its original BDF# not that of the PF
875 * which we used for the IOMMU lookup. Strictly speaking
876 * we could do this for all PCI devices; we only need to
877 * get the BDF# from the scope table for ACPI matches. */
878 if (pdev && pdev->is_virtfn)
881 *bus = drhd->devices[i].bus;
882 *devfn = drhd->devices[i].devfn;
886 if (!pdev || !dev_is_pci(tmp))
889 ptmp = to_pci_dev(tmp);
890 if (ptmp->subordinate &&
891 ptmp->subordinate->number <= pdev->bus->number &&
892 ptmp->subordinate->busn_res.end >= pdev->bus->number)
896 if (pdev && drhd->include_all) {
898 *bus = pdev->bus->number;
899 *devfn = pdev->devfn;
910 static void domain_flush_cache(struct dmar_domain *domain,
911 void *addr, int size)
913 if (!domain->iommu_coherency)
914 clflush_cache_range(addr, size);
917 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
919 struct context_entry *context;
923 spin_lock_irqsave(&iommu->lock, flags);
924 context = iommu_context_addr(iommu, bus, devfn, 0);
926 ret = context_present(context);
927 spin_unlock_irqrestore(&iommu->lock, flags);
931 static void free_context_table(struct intel_iommu *iommu)
935 struct context_entry *context;
937 spin_lock_irqsave(&iommu->lock, flags);
938 if (!iommu->root_entry) {
941 for (i = 0; i < ROOT_ENTRY_NR; i++) {
942 context = iommu_context_addr(iommu, i, 0, 0);
944 free_pgtable_page(context);
946 if (!ecs_enabled(iommu))
949 context = iommu_context_addr(iommu, i, 0x80, 0);
951 free_pgtable_page(context);
954 free_pgtable_page(iommu->root_entry);
955 iommu->root_entry = NULL;
957 spin_unlock_irqrestore(&iommu->lock, flags);
960 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
961 unsigned long pfn, int *target_level)
963 struct dma_pte *parent, *pte = NULL;
964 int level = agaw_to_level(domain->agaw);
967 BUG_ON(!domain->pgd);
969 if (!domain_pfn_supported(domain, pfn))
970 /* Address beyond IOMMU's addressing capabilities. */
973 parent = domain->pgd;
978 offset = pfn_level_offset(pfn, level);
979 pte = &parent[offset];
980 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
982 if (level == *target_level)
985 if (!dma_pte_present(pte)) {
988 tmp_page = alloc_pgtable_page(domain->nid);
993 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
994 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
995 if (cmpxchg64(&pte->val, 0ULL, pteval))
996 /* Someone else set it while we were thinking; use theirs. */
997 free_pgtable_page(tmp_page);
999 domain_flush_cache(domain, pte, sizeof(*pte));
1004 parent = phys_to_virt(dma_pte_addr(pte));
1009 *target_level = level;
1015 /* return the pte for an address at a specific level */
1016 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1018 int level, int *large_page)
1020 struct dma_pte *parent, *pte = NULL;
1021 int total = agaw_to_level(domain->agaw);
1024 parent = domain->pgd;
1025 while (level <= total) {
1026 offset = pfn_level_offset(pfn, total);
1027 pte = &parent[offset];
1031 if (!dma_pte_present(pte)) {
1032 *large_page = total;
1036 if (dma_pte_superpage(pte)) {
1037 *large_page = total;
1041 parent = phys_to_virt(dma_pte_addr(pte));
1047 /* clear last level pte; a tlb flush should follow */
1048 static void dma_pte_clear_range(struct dmar_domain *domain,
1049 unsigned long start_pfn,
1050 unsigned long last_pfn)
1052 unsigned int large_page = 1;
1053 struct dma_pte *first_pte, *pte;
1055 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1056 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1057 BUG_ON(start_pfn > last_pfn);
1059 /* we don't need lock here; nobody else touches the iova range */
1062 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1064 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1069 start_pfn += lvl_to_nr_pages(large_page);
1071 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1073 domain_flush_cache(domain, first_pte,
1074 (void *)pte - (void *)first_pte);
1076 } while (start_pfn && start_pfn <= last_pfn);
1079 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1080 int retain_level, struct dma_pte *pte,
1081 unsigned long pfn, unsigned long start_pfn,
1082 unsigned long last_pfn)
1084 pfn = max(start_pfn, pfn);
1085 pte = &pte[pfn_level_offset(pfn, level)];
1088 unsigned long level_pfn;
1089 struct dma_pte *level_pte;
1091 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1094 level_pfn = pfn & level_mask(level);
1095 level_pte = phys_to_virt(dma_pte_addr(pte));
1098 dma_pte_free_level(domain, level - 1, retain_level,
1099 level_pte, level_pfn, start_pfn,
1104 * Free the page table if we're below the level we want to
1105 * retain and the range covers the entire table.
1107 if (level < retain_level && !(start_pfn > level_pfn ||
1108 last_pfn < level_pfn + level_size(level) - 1)) {
1110 domain_flush_cache(domain, pte, sizeof(*pte));
1111 free_pgtable_page(level_pte);
1114 pfn += level_size(level);
1115 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1119 * clear last level (leaf) ptes and free page table pages below the
1120 * level we wish to keep intact.
1122 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1123 unsigned long start_pfn,
1124 unsigned long last_pfn,
1127 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1128 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1129 BUG_ON(start_pfn > last_pfn);
1131 dma_pte_clear_range(domain, start_pfn, last_pfn);
1133 /* We don't need lock here; nobody else touches the iova range */
1134 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1135 domain->pgd, 0, start_pfn, last_pfn);
1138 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1139 free_pgtable_page(domain->pgd);
1144 /* When a page at a given level is being unlinked from its parent, we don't
1145 need to *modify* it at all. All we need to do is make a list of all the
1146 pages which can be freed just as soon as we've flushed the IOTLB and we
1147 know the hardware page-walk will no longer touch them.
1148 The 'pte' argument is the *parent* PTE, pointing to the page that is to be freed. */
1150 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1151 int level, struct dma_pte *pte,
1152 struct page *freelist)
1156 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1157 pg->freelist = freelist;
1163 pte = page_address(pg);
1165 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1166 freelist = dma_pte_list_pagetables(domain, level - 1,
1169 } while (!first_pte_in_page(pte));
1174 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1175 struct dma_pte *pte, unsigned long pfn,
1176 unsigned long start_pfn,
1177 unsigned long last_pfn,
1178 struct page *freelist)
1180 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1182 pfn = max(start_pfn, pfn);
1183 pte = &pte[pfn_level_offset(pfn, level)];
1186 unsigned long level_pfn;
1188 if (!dma_pte_present(pte))
1191 level_pfn = pfn & level_mask(level);
1193 /* If range covers entire pagetable, free it */
1194 if (start_pfn <= level_pfn &&
1195 last_pfn >= level_pfn + level_size(level) - 1) {
1196 /* These subordinate page tables are going away entirely. Don't
1197 bother to clear them; we're just going to *free* them. */
1198 if (level > 1 && !dma_pte_superpage(pte))
1199 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1205 } else if (level > 1) {
1206 /* Recurse down into a level that isn't *entirely* obsolete */
1207 freelist = dma_pte_clear_level(domain, level - 1,
1208 phys_to_virt(dma_pte_addr(pte)),
1209 level_pfn, start_pfn, last_pfn,
1213 pfn += level_size(level);
1214 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1217 domain_flush_cache(domain, first_pte,
1218 (void *)++last_pte - (void *)first_pte);
1223 /* We can't just free the pages because the IOMMU may still be walking
1224 the page tables, and may have cached the intermediate levels. The
1225 pages can only be freed after the IOTLB flush has been done. */
1226 static struct page *domain_unmap(struct dmar_domain *domain,
1227 unsigned long start_pfn,
1228 unsigned long last_pfn)
1230 struct page *freelist = NULL;
1232 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1233 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1234 BUG_ON(start_pfn > last_pfn);
1236 /* we don't need lock here; nobody else touches the iova range */
1237 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1238 domain->pgd, 0, start_pfn, last_pfn, NULL);
1241 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1242 struct page *pgd_page = virt_to_page(domain->pgd);
1243 pgd_page->freelist = freelist;
1244 freelist = pgd_page;
1252 static void dma_free_pagelist(struct page *freelist)
1256 while ((pg = freelist)) {
1257 freelist = pg->freelist;
1258 free_pgtable_page(page_address(pg));
1262 static void iova_entry_free(unsigned long data)
1264 struct page *freelist = (struct page *)data;
1266 dma_free_pagelist(freelist);
1269 /* iommu handling */
1270 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1272 struct root_entry *root;
1273 unsigned long flags;
1275 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1277 pr_err("Allocating root entry for %s failed\n",
1282 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1284 spin_lock_irqsave(&iommu->lock, flags);
1285 iommu->root_entry = root;
1286 spin_unlock_irqrestore(&iommu->lock, flags);
1291 static void iommu_set_root_entry(struct intel_iommu *iommu)
1297 addr = virt_to_phys(iommu->root_entry);
1298 if (ecs_enabled(iommu))
1299 addr |= DMA_RTADDR_RTT;
1301 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1302 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1304 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1306 /* Make sure hardware complete it */
1307 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1308 readl, (sts & DMA_GSTS_RTPS), sts);
1310 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1313 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1318 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1321 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1322 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1324 /* Make sure hardware complete it */
1325 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1326 readl, (!(val & DMA_GSTS_WBFS)), val);
1328 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1331 /* return value determines if we need a write buffer flush */
1332 static void __iommu_flush_context(struct intel_iommu *iommu,
1333 u16 did, u16 source_id, u8 function_mask,
1340 case DMA_CCMD_GLOBAL_INVL:
1341 val = DMA_CCMD_GLOBAL_INVL;
1343 case DMA_CCMD_DOMAIN_INVL:
1344 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1346 case DMA_CCMD_DEVICE_INVL:
1347 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1348 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1353 val |= DMA_CCMD_ICC;
1355 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1356 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1358 /* Make sure hardware complete it */
1359 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1360 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1362 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1365 /* return value determines if we need a write buffer flush */
1366 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1367 u64 addr, unsigned int size_order, u64 type)
1369 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1370 u64 val = 0, val_iva = 0;
1374 case DMA_TLB_GLOBAL_FLUSH:
1375 /* global flush doesn't need set IVA_REG */
1376 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1378 case DMA_TLB_DSI_FLUSH:
1379 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1381 case DMA_TLB_PSI_FLUSH:
1382 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1383 /* IH bit is passed in as part of address */
1384 val_iva = size_order | addr;
1389 /* Note: set drain read/write */
1392 * This is probably just to be extra safe. It looks like we can
1393 * ignore it without any impact.
1395 if (cap_read_drain(iommu->cap))
1396 val |= DMA_TLB_READ_DRAIN;
1398 if (cap_write_drain(iommu->cap))
1399 val |= DMA_TLB_WRITE_DRAIN;
1401 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1402 /* Note: Only uses first TLB reg currently */
1404 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1405 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1407 /* Make sure hardware complete it */
1408 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1409 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1411 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1413 /* check IOTLB invalidation granularity */
1414 if (DMA_TLB_IAIG(val) == 0)
1415 pr_err("Flush IOTLB failed\n");
1416 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1417 pr_debug("TLB flush request %Lx, actual %Lx\n",
1418 (unsigned long long)DMA_TLB_IIRG(type),
1419 (unsigned long long)DMA_TLB_IAIG(val));
1422 static struct device_domain_info *
1423 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1426 struct device_domain_info *info;
1428 assert_spin_locked(&device_domain_lock);
1433 list_for_each_entry(info, &domain->devices, link)
1434 if (info->iommu == iommu && info->bus == bus &&
1435 info->devfn == devfn) {
1436 if (info->ats_supported && info->dev)
1444 static void domain_update_iotlb(struct dmar_domain *domain)
1446 struct device_domain_info *info;
1447 bool has_iotlb_device = false;
1449 assert_spin_locked(&device_domain_lock);
1451 list_for_each_entry(info, &domain->devices, link) {
1452 struct pci_dev *pdev;
1454 if (!info->dev || !dev_is_pci(info->dev))
1457 pdev = to_pci_dev(info->dev);
1458 if (pdev->ats_enabled) {
1459 has_iotlb_device = true;
1464 domain->has_iotlb_device = has_iotlb_device;
1467 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1469 struct pci_dev *pdev;
1471 assert_spin_locked(&device_domain_lock);
1473 if (!info || !dev_is_pci(info->dev))
1476 pdev = to_pci_dev(info->dev);
1478 #ifdef CONFIG_INTEL_IOMMU_SVM
1479 /* The PCIe spec, in its wisdom, declares that the behaviour of
1480 the device if you enable PASID support after ATS support is
1481 undefined. So always enable PASID support on devices which
1482 * have it, even if we can't yet know if we're ever going to use it. */
1484 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1485 info->pasid_enabled = 1;
1487 if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1488 info->pri_enabled = 1;
1490 if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1491 info->ats_enabled = 1;
1492 domain_update_iotlb(info->domain);
1493 info->ats_qdep = pci_ats_queue_depth(pdev);
1497 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1499 struct pci_dev *pdev;
1501 assert_spin_locked(&device_domain_lock);
1503 if (!dev_is_pci(info->dev))
1506 pdev = to_pci_dev(info->dev);
1508 if (info->ats_enabled) {
1509 pci_disable_ats(pdev);
1510 info->ats_enabled = 0;
1511 domain_update_iotlb(info->domain);
1513 #ifdef CONFIG_INTEL_IOMMU_SVM
1514 if (info->pri_enabled) {
1515 pci_disable_pri(pdev);
1516 info->pri_enabled = 0;
1518 if (info->pasid_enabled) {
1519 pci_disable_pasid(pdev);
1520 info->pasid_enabled = 0;
1525 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1526 u64 addr, unsigned mask)
1529 unsigned long flags;
1530 struct device_domain_info *info;
1532 if (!domain->has_iotlb_device)
1535 spin_lock_irqsave(&device_domain_lock, flags);
1536 list_for_each_entry(info, &domain->devices, link) {
1537 if (!info->ats_enabled)
1540 sid = info->bus << 8 | info->devfn;
1541 qdep = info->ats_qdep;
1542 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1544 spin_unlock_irqrestore(&device_domain_lock, flags);
1547 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1548 struct dmar_domain *domain,
1549 unsigned long pfn, unsigned int pages,
1552 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1553 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1554 u16 did = domain->iommu_did[iommu->seq_id];
1561 * Fall back to domain-selective flush if there is no PSI support or the size is too big.
1563 * PSI requires the page size to be 2 ^ x, and the base address to be naturally
1564 * aligned to that size.
1566 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1567 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1570 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1574 * In caching mode, changes of pages from non-present to present require
1575 * flush. However, device IOTLB doesn't need to be flushed in this case.
1577 if (!cap_caching_mode(iommu->cap) || !map)
1578 iommu_flush_dev_iotlb(domain, addr, mask);
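/*
 * Editor's illustration of the PSI mask above: for pages = 9,
 * __roundup_pow_of_two(9) = 16 and mask = ilog2(16) = 4, so the request
 * invalidates a 16-page (64KiB with 4KiB pages) naturally aligned region;
 * when mask exceeds cap_max_amask_val() the code falls back to the
 * domain-selective flush instead.
 */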
1581 /* Notification for newly created mappings */
1582 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1583 struct dmar_domain *domain,
1584 unsigned long pfn, unsigned int pages)
1586 /* It's a non-present to present mapping. Only flush if caching mode */
1587 if (cap_caching_mode(iommu->cap))
1588 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1590 iommu_flush_write_buffer(iommu);
1593 static void iommu_flush_iova(struct iova_domain *iovad)
1595 struct dmar_domain *domain;
1598 domain = container_of(iovad, struct dmar_domain, iovad);
1600 for_each_domain_iommu(idx, domain) {
1601 struct intel_iommu *iommu = g_iommus[idx];
1602 u16 did = domain->iommu_did[iommu->seq_id];
1604 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1606 if (!cap_caching_mode(iommu->cap))
1607 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1608 0, MAX_AGAW_PFN_WIDTH);
1612 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1615 unsigned long flags;
1617 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1618 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1619 pmen &= ~DMA_PMEN_EPM;
1620 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1622 /* wait for the protected region status bit to clear */
1623 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1624 readl, !(pmen & DMA_PMEN_PRS), pmen);
1626 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1629 static void iommu_enable_translation(struct intel_iommu *iommu)
1632 unsigned long flags;
1634 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1635 iommu->gcmd |= DMA_GCMD_TE;
1636 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1638 /* Make sure hardware complete it */
1639 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1640 readl, (sts & DMA_GSTS_TES), sts);
1642 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1645 static void iommu_disable_translation(struct intel_iommu *iommu)
1650 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1651 iommu->gcmd &= ~DMA_GCMD_TE;
1652 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1654 /* Make sure hardware complete it */
1655 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1656 readl, (!(sts & DMA_GSTS_TES)), sts);
1658 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1662 static int iommu_init_domains(struct intel_iommu *iommu)
1664 u32 ndomains, nlongs;
1667 ndomains = cap_ndoms(iommu->cap);
1668 pr_debug("%s: Number of Domains supported <%d>\n",
1669 iommu->name, ndomains);
1670 nlongs = BITS_TO_LONGS(ndomains);
1672 spin_lock_init(&iommu->lock);
1674 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1675 if (!iommu->domain_ids) {
1676 pr_err("%s: Allocating domain id array failed\n",
1681 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1682 iommu->domains = kzalloc(size, GFP_KERNEL);
1684 if (iommu->domains) {
1685 size = 256 * sizeof(struct dmar_domain *);
1686 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1689 if (!iommu->domains || !iommu->domains[0]) {
1690 pr_err("%s: Allocating domain array failed\n",
1692 kfree(iommu->domain_ids);
1693 kfree(iommu->domains);
1694 iommu->domain_ids = NULL;
1695 iommu->domains = NULL;
1702 * If Caching mode is set, then invalid translations are tagged
1703 * with domain-id 0, hence we need to pre-allocate it. We also
1704 * use domain-id 0 as a marker for non-allocated domain-id, so
1705 * make sure it is not used for a real domain.
1707 set_bit(0, iommu->domain_ids);
1712 static void disable_dmar_iommu(struct intel_iommu *iommu)
1714 struct device_domain_info *info, *tmp;
1715 unsigned long flags;
1717 if (!iommu->domains || !iommu->domain_ids)
1721 spin_lock_irqsave(&device_domain_lock, flags);
1722 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1723 struct dmar_domain *domain;
1725 if (info->iommu != iommu)
1728 if (!info->dev || !info->domain)
1731 domain = info->domain;
1733 __dmar_remove_one_dev_info(info);
1735 if (!domain_type_is_vm_or_si(domain)) {
1737 * The domain_exit() function can't be called under
1738 * device_domain_lock, as it takes this lock itself.
1739 * So release the lock here and re-run the loop
1742 spin_unlock_irqrestore(&device_domain_lock, flags);
1743 domain_exit(domain);
1747 spin_unlock_irqrestore(&device_domain_lock, flags);
1749 if (iommu->gcmd & DMA_GCMD_TE)
1750 iommu_disable_translation(iommu);
1753 static void free_dmar_iommu(struct intel_iommu *iommu)
1755 if ((iommu->domains) && (iommu->domain_ids)) {
1756 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1759 for (i = 0; i < elems; i++)
1760 kfree(iommu->domains[i]);
1761 kfree(iommu->domains);
1762 kfree(iommu->domain_ids);
1763 iommu->domains = NULL;
1764 iommu->domain_ids = NULL;
1767 g_iommus[iommu->seq_id] = NULL;
1769 /* free context mapping */
1770 free_context_table(iommu);
1772 #ifdef CONFIG_INTEL_IOMMU_SVM
1773 if (pasid_enabled(iommu)) {
1774 if (ecap_prs(iommu->ecap))
1775 intel_svm_finish_prq(iommu);
1776 intel_svm_free_pasid_tables(iommu);
1781 static struct dmar_domain *alloc_domain(int flags)
1783 struct dmar_domain *domain;
1785 domain = alloc_domain_mem();
1789 memset(domain, 0, sizeof(*domain));
1791 domain->flags = flags;
1792 domain->has_iotlb_device = false;
1793 INIT_LIST_HEAD(&domain->devices);
1798 /* Must be called with iommu->lock */
1799 static int domain_attach_iommu(struct dmar_domain *domain,
1800 struct intel_iommu *iommu)
1802 unsigned long ndomains;
1805 assert_spin_locked(&device_domain_lock);
1806 assert_spin_locked(&iommu->lock);
1808 domain->iommu_refcnt[iommu->seq_id] += 1;
1809 domain->iommu_count += 1;
1810 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1811 ndomains = cap_ndoms(iommu->cap);
1812 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1814 if (num >= ndomains) {
1815 pr_err("%s: No free domain ids\n", iommu->name);
1816 domain->iommu_refcnt[iommu->seq_id] -= 1;
1817 domain->iommu_count -= 1;
1821 set_bit(num, iommu->domain_ids);
1822 set_iommu_domain(iommu, num, domain);
1824 domain->iommu_did[iommu->seq_id] = num;
1825 domain->nid = iommu->node;
1827 domain_update_iommu_cap(domain);
1833 static int domain_detach_iommu(struct dmar_domain *domain,
1834 struct intel_iommu *iommu)
1836 int num, count = INT_MAX;
1838 assert_spin_locked(&device_domain_lock);
1839 assert_spin_locked(&iommu->lock);
1841 domain->iommu_refcnt[iommu->seq_id] -= 1;
1842 count = --domain->iommu_count;
1843 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1844 num = domain->iommu_did[iommu->seq_id];
1845 clear_bit(num, iommu->domain_ids);
1846 set_iommu_domain(iommu, num, NULL);
1848 domain_update_iommu_cap(domain);
1849 domain->iommu_did[iommu->seq_id] = 0;
1855 static struct iova_domain reserved_iova_list;
1856 static struct lock_class_key reserved_rbtree_key;
1858 static int dmar_init_reserved_ranges(void)
1860 struct pci_dev *pdev = NULL;
1864 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1866 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1867 &reserved_rbtree_key);
1869 /* IOAPIC ranges shouldn't be accessed by DMA */
1870 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1871 IOVA_PFN(IOAPIC_RANGE_END));
1873 pr_err("Reserve IOAPIC range failed\n");
1877 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1878 for_each_pci_dev(pdev) {
1881 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1882 r = &pdev->resource[i];
1883 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1885 iova = reserve_iova(&reserved_iova_list,
1889 pr_err("Reserve iova failed\n");
1897 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1899 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1902 static inline int guestwidth_to_adjustwidth(int gaw)
1905 int r = (gaw - 12) % 9;
1916 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1919 int adjust_width, agaw;
1920 unsigned long sagaw;
1923 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1925 err = init_iova_flush_queue(&domain->iovad,
1926 iommu_flush_iova, iova_entry_free);
1930 domain_reserve_special_ranges(domain);
1932 /* calculate AGAW */
1933 if (guest_width > cap_mgaw(iommu->cap))
1934 guest_width = cap_mgaw(iommu->cap);
1935 domain->gaw = guest_width;
1936 adjust_width = guestwidth_to_adjustwidth(guest_width);
1937 agaw = width_to_agaw(adjust_width);
1938 sagaw = cap_sagaw(iommu->cap);
1939 if (!test_bit(agaw, &sagaw)) {
1940 /* hardware doesn't support it, choose a bigger one */
1941 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1942 agaw = find_next_bit(&sagaw, 5, agaw);
1946 domain->agaw = agaw;
1948 if (ecap_coherent(iommu->ecap))
1949 domain->iommu_coherency = 1;
1951 domain->iommu_coherency = 0;
1953 if (ecap_sc_support(iommu->ecap))
1954 domain->iommu_snooping = 1;
1956 domain->iommu_snooping = 0;
1958 if (intel_iommu_superpage)
1959 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1961 domain->iommu_superpage = 0;
1963 domain->nid = iommu->node;
1965 /* always allocate the top pgd */
1966 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1969 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1973 static void domain_exit(struct dmar_domain *domain)
1975 struct page *freelist = NULL;
1977 /* Domain 0 is reserved, so don't process it */
1981 /* Remove associated devices and clear attached or cached domains */
1983 domain_remove_dev_info(domain);
1987 put_iova_domain(&domain->iovad);
1989 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1991 dma_free_pagelist(freelist);
1993 free_domain_mem(domain);
1996 static int domain_context_mapping_one(struct dmar_domain *domain,
1997 struct intel_iommu *iommu,
2000 u16 did = domain->iommu_did[iommu->seq_id];
2001 int translation = CONTEXT_TT_MULTI_LEVEL;
2002 struct device_domain_info *info = NULL;
2003 struct context_entry *context;
2004 unsigned long flags;
2005 struct dma_pte *pgd;
2010 if (hw_pass_through && domain_type_is_si(domain))
2011 translation = CONTEXT_TT_PASS_THROUGH;
2013 pr_debug("Set context mapping for %02x:%02x.%d\n",
2014 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2016 BUG_ON(!domain->pgd);
2018 spin_lock_irqsave(&device_domain_lock, flags);
2019 spin_lock(&iommu->lock);
2022 context = iommu_context_addr(iommu, bus, devfn, 1);
2027 if (context_present(context))
2031 * For kdump cases, old valid entries may be cached due to the
2032 * in-flight DMA and copied pgtable, but there is no unmapping
2033 * behaviour for them, thus we need an explicit cache flush for
2034 * the newly-mapped device. For kdump, at this point, the device
2035 * is supposed to finish reset at its driver probe stage, so no
2036 * in-flight DMA will exist, and we don't need to worry anymore
2039 if (context_copied(context)) {
2040 u16 did_old = context_domain_id(context);
2042 if (did_old < cap_ndoms(iommu->cap)) {
2043 iommu->flush.flush_context(iommu, did_old,
2044 (((u16)bus) << 8) | devfn,
2045 DMA_CCMD_MASK_NOBIT,
2046 DMA_CCMD_DEVICE_INVL);
2047 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2054 context_clear_entry(context);
2055 context_set_domain_id(context, did);
2058 * Skip top levels of page tables for an iommu which has a smaller agaw
2059 * than the default. Unnecessary for PT mode.
2061 if (translation != CONTEXT_TT_PASS_THROUGH) {
2062 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
2064 pgd = phys_to_virt(dma_pte_addr(pgd));
2065 if (!dma_pte_present(pgd))
2069 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2070 if (info && info->ats_supported)
2071 translation = CONTEXT_TT_DEV_IOTLB;
2073 translation = CONTEXT_TT_MULTI_LEVEL;
2075 context_set_address_root(context, virt_to_phys(pgd));
2076 context_set_address_width(context, iommu->agaw);
2079 * In pass through mode, AW must be programmed to
2080 * indicate the largest AGAW value supported by
2081 * hardware. And ASR is ignored by hardware.
2083 context_set_address_width(context, iommu->msagaw);
2086 context_set_translation_type(context, translation);
2087 context_set_fault_enable(context);
2088 context_set_present(context);
2089 domain_flush_cache(domain, context, sizeof(*context));
2092 * It's a non-present to present mapping. If hardware doesn't cache
2093 * non-present entries we only need to flush the write-buffer. If it
2094 * _does_ cache non-present entries, then it does so in the special
2095 * domain #0, which we have to flush:
2097 if (cap_caching_mode(iommu->cap)) {
2098 iommu->flush.flush_context(iommu, 0,
2099 (((u16)bus) << 8) | devfn,
2100 DMA_CCMD_MASK_NOBIT,
2101 DMA_CCMD_DEVICE_INVL);
2102 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2104 iommu_flush_write_buffer(iommu);
2106 iommu_enable_dev_iotlb(info);
2111 spin_unlock(&iommu->lock);
2112 spin_unlock_irqrestore(&device_domain_lock, flags);
2117 struct domain_context_mapping_data {
2118 struct dmar_domain *domain;
2119 struct intel_iommu *iommu;
2122 static int domain_context_mapping_cb(struct pci_dev *pdev,
2123 u16 alias, void *opaque)
2125 struct domain_context_mapping_data *data = opaque;
2127 return domain_context_mapping_one(data->domain, data->iommu,
2128 PCI_BUS_NUM(alias), alias & 0xff);
2132 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2134 struct intel_iommu *iommu;
2136 struct domain_context_mapping_data data;
2138 iommu = device_to_iommu(dev, &bus, &devfn);
2142 if (!dev_is_pci(dev))
2143 return domain_context_mapping_one(domain, iommu, bus, devfn);
2145 data.domain = domain;
2148 return pci_for_each_dma_alias(to_pci_dev(dev),
2149 &domain_context_mapping_cb, &data);
2152 static int domain_context_mapped_cb(struct pci_dev *pdev,
2153 u16 alias, void *opaque)
2155 struct intel_iommu *iommu = opaque;
2157 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2160 static int domain_context_mapped(struct device *dev)
2162 struct intel_iommu *iommu;
2165 iommu = device_to_iommu(dev, &bus, &devfn);
2169 if (!dev_is_pci(dev))
2170 return device_context_mapped(iommu, bus, devfn);
2172 return !pci_for_each_dma_alias(to_pci_dev(dev),
2173 domain_context_mapped_cb, iommu);
2176 /* Returns a number of VTD pages, but aligned to MM page size */
2177 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2180 host_addr &= ~PAGE_MASK;
2181 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
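/*
 * Editor's illustration (assuming 4KiB MM pages): aligned_nrpages(0x1234,
 * 0x2000) keeps only the sub-page offset 0x234, rounds 0x234 + 0x2000 up to
 * the MM page boundary 0x3000, and returns 3, i.e. the number of 4KiB VT-d
 * pages needed once the buffer is padded out to MM page granularity.
 */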
2184 /* Return largest possible superpage level for a given mapping */
2185 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2186 unsigned long iov_pfn,
2187 unsigned long phy_pfn,
2188 unsigned long pages)
2190 int support, level = 1;
2191 unsigned long pfnmerge;
2193 support = domain->iommu_superpage;
2195 /* To use a large page, the virtual *and* physical addresses
2196 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2197 of them will mean we have to use smaller pages. So just
2198 merge them and check both at once. */
2199 pfnmerge = iov_pfn | phy_pfn;
2201 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2202 pages >>= VTD_STRIDE_SHIFT;
2205 pfnmerge >>= VTD_STRIDE_SHIFT;
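/*
 * Editor's illustration (hypothetical values): with domain->iommu_superpage
 * = 2 and both iov_pfn and phy_pfn aligned to 512 pages (2MiB with 4KiB
 * pages), the merged pfnmerge passes the alignment test, so the loop above
 * can raise the level to 2 as long as at least 512 pages remain to be
 * mapped; any misalignment in either pfn shows up in pfnmerge and keeps the
 * mapping at smaller page sizes.
 */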
2212 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2213 struct scatterlist *sg, unsigned long phys_pfn,
2214 unsigned long nr_pages, int prot)
2216 struct dma_pte *first_pte = NULL, *pte = NULL;
2217 phys_addr_t uninitialized_var(pteval);
2218 unsigned long sg_res = 0;
2219 unsigned int largepage_lvl = 0;
2220 unsigned long lvl_pages = 0;
2222 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2224 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2227 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2231 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2234 while (nr_pages > 0) {
2238 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2240 sg_res = aligned_nrpages(sg->offset, sg->length);
2241 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2242 sg->dma_length = sg->length;
2243 pteval = (sg_phys(sg) - pgoff) | prot;
2244 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2248 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2250 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2253 /* It is a large page */
2254 if (largepage_lvl > 1) {
2255 unsigned long nr_superpages, end_pfn;
2257 pteval |= DMA_PTE_LARGE_PAGE;
2258 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2260 nr_superpages = sg_res / lvl_pages;
2261 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2264 * Ensure that old small page tables are
2265 * removed to make room for superpage(s).
2266 * We're adding new large pages, so make sure
2267 * we don't remove their parent tables.
2269 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2272 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2276 /* We don't need lock here, nobody else
2277 * touches the iova range
2279 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2281 static int dumps = 5;
2282 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2283 iov_pfn, tmp, (unsigned long long)pteval);
2286 debug_dma_dump_mappings(NULL);
2291 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2293 BUG_ON(nr_pages < lvl_pages);
2294 BUG_ON(sg_res < lvl_pages);
2296 nr_pages -= lvl_pages;
2297 iov_pfn += lvl_pages;
2298 phys_pfn += lvl_pages;
2299 pteval += lvl_pages * VTD_PAGE_SIZE;
2300 sg_res -= lvl_pages;
2302 /* If the next PTE would be the first in a new page, then we
2303 need to flush the cache on the entries we've just written.
2304 And then we'll need to recalculate 'pte', so clear it and
2305 let it get set again in the if (!pte) block above.
2307 If we're done (!nr_pages) we need to flush the cache too.
2309 Also if we've been setting superpages, we may need to
2310 recalculate 'pte' and switch back to smaller pages for the
2311 end of the mapping, if the trailing size is not enough to
2312 use another superpage (i.e. sg_res < lvl_pages). */
2314 if (!nr_pages || first_pte_in_page(pte) ||
2315 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2316 domain_flush_cache(domain, first_pte,
2317 (void *)pte - (void *)first_pte);
2321 if (!sg_res && nr_pages)
2327 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2328 struct scatterlist *sg, unsigned long phys_pfn,
2329 unsigned long nr_pages, int prot)
2332 struct intel_iommu *iommu;
2334 /* Do the real mapping first */
2335 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2339 /* Notify about the new mapping */
2340 if (domain_type_is_vm(domain)) {
2341 /* VM-typed domains can have more than one IOMMU */
2343 for_each_domain_iommu(iommu_id, domain) {
2344 iommu = g_iommus[iommu_id];
2345 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2348 /* General domains only have one IOMMU */
2349 iommu = domain_get_iommu(domain);
2350 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2356 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2357 struct scatterlist *sg, unsigned long nr_pages,
2360 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2363 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2364 unsigned long phys_pfn, unsigned long nr_pages,
2367 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2370 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2372 unsigned long flags;
2373 struct context_entry *context;
2379 spin_lock_irqsave(&iommu->lock, flags);
2380 context = iommu_context_addr(iommu, bus, devfn, 0);
2382 spin_unlock_irqrestore(&iommu->lock, flags);
2385 did_old = context_domain_id(context);
2386 context_clear_entry(context);
2387 __iommu_flush_cache(iommu, context, sizeof(*context));
2388 spin_unlock_irqrestore(&iommu->lock, flags);
2389 iommu->flush.flush_context(iommu,
2391 (((u16)bus) << 8) | devfn,
2392 DMA_CCMD_MASK_NOBIT,
2393 DMA_CCMD_DEVICE_INVL);
2394 iommu->flush.flush_iotlb(iommu,
2401 static inline void unlink_domain_info(struct device_domain_info *info)
2403 assert_spin_locked(&device_domain_lock);
2404 list_del(&info->link);
2405 list_del(&info->global);
2407 info->dev->archdata.iommu = NULL;
2410 static void domain_remove_dev_info(struct dmar_domain *domain)
2412 struct device_domain_info *info, *tmp;
2413 unsigned long flags;
2415 spin_lock_irqsave(&device_domain_lock, flags);
2416 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2417 __dmar_remove_one_dev_info(info);
2418 spin_unlock_irqrestore(&device_domain_lock, flags);
2423 * Note: we use struct device->archdata.iommu to store the info
2425 static struct dmar_domain *find_domain(struct device *dev)
2427 struct device_domain_info *info;
2429 /* No lock here, assumes no domain exit in normal case */
2430 info = dev->archdata.iommu;
2432 return info->domain;
2436 static inline struct device_domain_info *
2437 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2439 struct device_domain_info *info;
2441 list_for_each_entry(info, &device_domain_list, global)
2442 if (info->iommu->segment == segment && info->bus == bus &&
2443 info->devfn == devfn)
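/*
 * Illustrative sketch, not part of the original driver: a lookup by
 * segment/bus/devfn must be done under device_domain_lock, exactly as
 * find_or_alloc_domain() does below. The helper name is hypothetical.
 */
static struct dmar_domain * __maybe_unused
example_lookup_domain(int segment, int bus, int devfn)
{
	struct device_domain_info *info;
	struct dmar_domain *domain = NULL;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = dmar_search_domain_by_dev_info(segment, bus, devfn);
	if (info)
		domain = info->domain;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return domain;
}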
2449 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2452 struct dmar_domain *domain)
2454 struct dmar_domain *found = NULL;
2455 struct device_domain_info *info;
2456 unsigned long flags;
2459 info = alloc_devinfo_mem();
2464 info->devfn = devfn;
2465 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2466 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2469 info->domain = domain;
2470 info->iommu = iommu;
2472 if (dev && dev_is_pci(dev)) {
2473 struct pci_dev *pdev = to_pci_dev(info->dev);
2475 if (!pci_ats_disabled() &&
2476 ecap_dev_iotlb_support(iommu->ecap) &&
2477 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2478 dmar_find_matched_atsr_unit(pdev))
2479 info->ats_supported = 1;
2481 if (ecs_enabled(iommu)) {
2482 if (pasid_enabled(iommu)) {
2483 int features = pci_pasid_features(pdev);
2485 info->pasid_supported = features | 1;
2488 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2489 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2490 info->pri_supported = 1;
2494 spin_lock_irqsave(&device_domain_lock, flags);
2496 found = find_domain(dev);
2499 struct device_domain_info *info2;
2500 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2502 found = info2->domain;
2508 spin_unlock_irqrestore(&device_domain_lock, flags);
2509 free_devinfo_mem(info);
2510 /* Caller must free the original domain */
2514 spin_lock(&iommu->lock);
2515 ret = domain_attach_iommu(domain, iommu);
2516 spin_unlock(&iommu->lock);
2519 spin_unlock_irqrestore(&device_domain_lock, flags);
2520 free_devinfo_mem(info);
2524 list_add(&info->link, &domain->devices);
2525 list_add(&info->global, &device_domain_list);
2527 dev->archdata.iommu = info;
2528 spin_unlock_irqrestore(&device_domain_lock, flags);
2530 if (dev && domain_context_mapping(domain, dev)) {
2531 pr_err("Domain context map for %s failed\n", dev_name(dev));
2532 dmar_remove_one_dev_info(domain, dev);
2539 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2541 *(u16 *)opaque = alias;
2545 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2547 struct device_domain_info *info = NULL;
2548 struct dmar_domain *domain = NULL;
2549 struct intel_iommu *iommu;
2551 unsigned long flags;
2554 iommu = device_to_iommu(dev, &bus, &devfn);
2558 if (dev_is_pci(dev)) {
2559 struct pci_dev *pdev = to_pci_dev(dev);
2561 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2563 spin_lock_irqsave(&device_domain_lock, flags);
2564 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2565 PCI_BUS_NUM(dma_alias),
2568 iommu = info->iommu;
2569 domain = info->domain;
2571 spin_unlock_irqrestore(&device_domain_lock, flags);
2573 /* DMA alias already has a domain, use it */
2578 /* Allocate and initialize new domain for the device */
2579 domain = alloc_domain(0);
2582 if (domain_init(domain, iommu, gaw)) {
2583 domain_exit(domain);
2592 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2593 struct dmar_domain *domain)
2595 struct intel_iommu *iommu;
2596 struct dmar_domain *tmp;
2597 u16 req_id, dma_alias;
2600 iommu = device_to_iommu(dev, &bus, &devfn);
2604 req_id = ((u16)bus << 8) | devfn;
2606 if (dev_is_pci(dev)) {
2607 struct pci_dev *pdev = to_pci_dev(dev);
2609 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2611 /* register PCI DMA alias device */
2612 if (req_id != dma_alias) {
2613 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2614 dma_alias & 0xff, NULL, domain);
2616 if (!tmp || tmp != domain)
2621 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2622 if (!tmp || tmp != domain)
2628 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2630 struct dmar_domain *domain, *tmp;
2632 domain = find_domain(dev);
2636 domain = find_or_alloc_domain(dev, gaw);
2640 tmp = set_domain_for_dev(dev, domain);
2641 if (!tmp || domain != tmp) {
2642 domain_exit(domain);
2651 static int iommu_domain_identity_map(struct dmar_domain *domain,
2652 unsigned long long start,
2653 unsigned long long end)
2655 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2656 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2658 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2659 dma_to_mm_pfn(last_vpfn))) {
2660 pr_err("Reserving iova failed\n");
2664 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2666 * The RMRR range might overlap with a physical memory range,
2669 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2671 return __domain_mapping(domain, first_vpfn, NULL,
2672 first_vpfn, last_vpfn - first_vpfn + 1,
2673 DMA_PTE_READ|DMA_PTE_WRITE);
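/*
 * Illustrative sketch, not part of the original driver: a caller that wants
 * a 1:1 mapping of a physical window (as done for RMRRs and for si_domain
 * below) only has to pass the inclusive byte range. The base/size values
 * are hypothetical.
 */
static int __maybe_unused example_identity_map(struct dmar_domain *domain,
					       phys_addr_t base, size_t size)
{
	return iommu_domain_identity_map(domain, base, base + size - 1);
}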
2676 static int domain_prepare_identity_map(struct device *dev,
2677 struct dmar_domain *domain,
2678 unsigned long long start,
2679 unsigned long long end)
2681 /* For _hardware_ passthrough, don't bother. But for software
2682 passthrough, we do it anyway -- it may indicate a memory
2683 range which is reserved in E820 and so didn't get set
2684 up in si_domain to start with */
2685 if (domain == si_domain && hw_pass_through) {
2686 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2687 dev_name(dev), start, end);
2691 pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2692 dev_name(dev), start, end);
2695 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2696 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2697 dmi_get_system_info(DMI_BIOS_VENDOR),
2698 dmi_get_system_info(DMI_BIOS_VERSION),
2699 dmi_get_system_info(DMI_PRODUCT_VERSION));
2703 if (end >> agaw_to_width(domain->agaw)) {
2704 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2705 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2706 agaw_to_width(domain->agaw),
2707 dmi_get_system_info(DMI_BIOS_VENDOR),
2708 dmi_get_system_info(DMI_BIOS_VERSION),
2709 dmi_get_system_info(DMI_PRODUCT_VERSION));
2713 return iommu_domain_identity_map(domain, start, end);
2716 static int iommu_prepare_identity_map(struct device *dev,
2717 unsigned long long start,
2718 unsigned long long end)
2720 struct dmar_domain *domain;
2723 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2727 ret = domain_prepare_identity_map(dev, domain, start, end);
2729 domain_exit(domain);
2734 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2737 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2739 return iommu_prepare_identity_map(dev, rmrr->base_address,
2743 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2744 static inline void iommu_prepare_isa(void)
2746 struct pci_dev *pdev;
2749 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2753 pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2754 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2757 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2762 static inline void iommu_prepare_isa(void)
2766 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2768 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2770 static int __init si_domain_init(int hw)
2774 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2778 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2779 domain_exit(si_domain);
2783 pr_debug("Identity mapping domain allocated\n");
2788 for_each_online_node(nid) {
2789 unsigned long start_pfn, end_pfn;
2792 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2793 ret = iommu_domain_identity_map(si_domain,
2794 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2803 static int identity_mapping(struct device *dev)
2805 struct device_domain_info *info;
2807 if (likely(!iommu_identity_mapping))
2810 info = dev->archdata.iommu;
2811 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2812 return (info->domain == si_domain);
2817 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2819 struct dmar_domain *ndomain;
2820 struct intel_iommu *iommu;
2823 iommu = device_to_iommu(dev, &bus, &devfn);
2827 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2828 if (ndomain != domain)
2834 static bool device_has_rmrr(struct device *dev)
2836 struct dmar_rmrr_unit *rmrr;
2841 for_each_rmrr_units(rmrr) {
2843 * Return TRUE if this RMRR contains the device that you're querying
2846 for_each_active_dev_scope(rmrr->devices,
2847 rmrr->devices_cnt, i, tmp)
2858 * There are a couple cases where we need to restrict the functionality of
2859 * devices associated with RMRRs. The first is when evaluating a device for
2860 * identity mapping because problems exist when devices are moved in and out
2861 * of domains and their respective RMRR information is lost. This means that
2862 * a device with associated RMRRs will never be in a "passthrough" domain.
2863 * The second is use of the device through the IOMMU API. This interface
2864 * expects to have full control of the IOVA space for the device. We cannot
2865 * satisfy both the requirement that RMRR access is maintained and have an
2866 * unencumbered IOVA space. We also have no ability to quiesce the device's
2867 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2868 * We therefore prevent devices associated with an RMRR from participating in
2869 * the IOMMU API, which eliminates them from device assignment.
2871 * In both cases we assume that PCI USB devices with RMRRs have them largely
2872 * for historical reasons and that the RMRR space is not actively used post
2873 * boot. This exclusion may change if vendors begin to abuse it.
2875 * The same exception is made for graphics devices, with the requirement that
2876 * any use of the RMRR regions will be torn down before assigning the device
2879 static bool device_is_rmrr_locked(struct device *dev)
2881 if (!device_has_rmrr(dev))
2884 if (dev_is_pci(dev)) {
2885 struct pci_dev *pdev = to_pci_dev(dev);
2887 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2894 static int iommu_should_identity_map(struct device *dev, int startup)
2897 if (dev_is_pci(dev)) {
2898 struct pci_dev *pdev = to_pci_dev(dev);
2900 if (device_is_rmrr_locked(dev))
2903 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2906 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2909 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2913 * We want to start off with all devices in the 1:1 domain, and
2914 * take them out later if we find they can't access all of memory.
2916 * However, we can't do this for PCI devices behind bridges,
2917 * because all PCI devices behind the same bridge will end up
2918 * with the same source-id on their transactions.
2920 * Practically speaking, we can't change things around for these
2921 * devices at run-time, because we can't be sure there'll be no
2922 * DMA transactions in flight for any of their siblings.
2924 * So PCI devices (unless they're on the root bus) as well as
2925 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2926 * the 1:1 domain, just in _case_ one of their siblings turns out
2927 * not to be able to map all of memory.
2929 if (!pci_is_pcie(pdev)) {
2930 if (!pci_is_root_bus(pdev->bus))
2932 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2934 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2937 if (device_has_rmrr(dev))
2942 * At boot time, we don't yet know if devices will be 64-bit capable.
2943 * Assume that they will — if they turn out not to be, then we can
2944 * take them out of the 1:1 domain later.
2948 * If the device's dma_mask is less than the system's memory
2949 * size then this is not a candidate for identity mapping.
2951 u64 dma_mask = *dev->dma_mask;
2953 if (dev->coherent_dma_mask &&
2954 dev->coherent_dma_mask < dma_mask)
2955 dma_mask = dev->coherent_dma_mask;
2957 return dma_mask >= dma_get_required_mask(dev);
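/*
 * Illustrative note, not from the original source: the mask compared above
 * is whatever the driver set for its device, e.g.
 *
 *	if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)))
 *		return -EIO;
 *
 * On a machine with memory above 4GiB such a device fails the
 * dma_get_required_mask() check and is therefore not a candidate for the
 * static 1:1 domain.
 */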
2963 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2967 if (!iommu_should_identity_map(dev, 1))
2970 ret = domain_add_dev_info(si_domain, dev);
2972 pr_info("%s identity mapping for device %s\n",
2973 hw ? "Hardware" : "Software", dev_name(dev));
2974 else if (ret == -ENODEV)
2975 /* device not associated with an iommu */
2982 static int __init iommu_prepare_static_identity_mapping(int hw)
2984 struct pci_dev *pdev = NULL;
2985 struct dmar_drhd_unit *drhd;
2986 struct intel_iommu *iommu;
2991 for_each_pci_dev(pdev) {
2992 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2997 for_each_active_iommu(iommu, drhd)
2998 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2999 struct acpi_device_physical_node *pn;
3000 struct acpi_device *adev;
3002 if (dev->bus != &acpi_bus_type)
3005 adev = to_acpi_device(dev);
3006 mutex_lock(&adev->physical_node_lock);
3007 list_for_each_entry(pn, &adev->physical_node_list, node) {
3008 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3012 mutex_unlock(&adev->physical_node_lock);
3020 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3023 * Start from the sane iommu hardware state.
3024 * If the queued invalidation is already initialized by us
3025 * (for example, while enabling interrupt-remapping) then
3026 * things are already rolling from a sane state.
3030 * Clear any previous faults.
3032 dmar_fault(-1, iommu);
3034 * Disable queued invalidation if supported and already enabled
3035 * before OS handover.
3037 dmar_disable_qi(iommu);
3040 if (dmar_enable_qi(iommu)) {
3042 * Queued Invalidate not enabled, use Register Based Invalidate
3044 iommu->flush.flush_context = __iommu_flush_context;
3045 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3046 pr_info("%s: Using Register based invalidation\n",
3049 iommu->flush.flush_context = qi_flush_context;
3050 iommu->flush.flush_iotlb = qi_flush_iotlb;
3051 pr_info("%s: Using Queued invalidation\n", iommu->name);
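/*
 * Illustrative sketch, not part of the original driver: once
 * intel_iommu_init_qi() has picked a backend, callers invalidate through the
 * same two hooks regardless of whether queued or register-based invalidation
 * was selected (this mirrors the global flushes issued in init_dmars() below).
 */
static void __maybe_unused example_global_flush(struct intel_iommu *iommu)
{
	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}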
3055 static int copy_context_table(struct intel_iommu *iommu,
3056 struct root_entry *old_re,
3057 struct context_entry **tbl,
3060 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3061 struct context_entry *new_ce = NULL, ce;
3062 struct context_entry *old_ce = NULL;
3063 struct root_entry re;
3064 phys_addr_t old_ce_phys;
3066 tbl_idx = ext ? bus * 2 : bus;
3067 memcpy(&re, old_re, sizeof(re));
3069 for (devfn = 0; devfn < 256; devfn++) {
3070 /* First calculate the correct index */
3071 idx = (ext ? devfn * 2 : devfn) % 256;
3074 /* First save what we may have and clean up */
3076 tbl[tbl_idx] = new_ce;
3077 __iommu_flush_cache(iommu, new_ce,
3087 old_ce_phys = root_entry_lctp(&re);
3089 old_ce_phys = root_entry_uctp(&re);
3092 if (ext && devfn == 0) {
3093 /* No LCTP, try UCTP */
3102 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3107 new_ce = alloc_pgtable_page(iommu->node);
3114 /* Now copy the context entry */
3115 memcpy(&ce, old_ce + idx, sizeof(ce));
3117 if (!__context_present(&ce))
3120 did = context_domain_id(&ce);
3121 if (did >= 0 && did < cap_ndoms(iommu->cap))
3122 set_bit(did, iommu->domain_ids);
3125 * We need a marker for copied context entries. This
3126 * marker needs to work for the old format as well as
3127 * for extended context entries.
3129 * Bit 67 of the context entry is used. In the old
3130 * format this bit is available to software, in the
3131 * extended format it is the PGE bit, but PGE is ignored
3132 * by HW if PASIDs are disabled (and thus still
3135 * So disable PASIDs first and then mark the entry
3136 * copied. This means that we don't copy PASID
3137 * translations from the old kernel, but this is fine as
3138 * faults there are not fatal.
3140 context_clear_pasid_enable(&ce);
3141 context_set_copied(&ce);
3146 tbl[tbl_idx + pos] = new_ce;
3148 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3157 static int copy_translation_tables(struct intel_iommu *iommu)
3159 struct context_entry **ctxt_tbls;
3160 struct root_entry *old_rt;
3161 phys_addr_t old_rt_phys;
3162 int ctxt_table_entries;
3163 unsigned long flags;
3168 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3169 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3170 new_ext = !!ecap_ecs(iommu->ecap);
3173 * The RTT bit can only be changed when translation is disabled,
3174 * but disabling translation would open a window for data
3175 * corruption. So bail out and don't copy anything if we would
3176 * have to change the bit.
3181 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3185 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3189 /* This is too big for the stack - allocate it from slab */
3190 ctxt_table_entries = ext ? 512 : 256;
3192 ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
3196 for (bus = 0; bus < 256; bus++) {
3197 ret = copy_context_table(iommu, &old_rt[bus],
3198 ctxt_tbls, bus, ext);
3200 pr_err("%s: Failed to copy context table for bus %d\n",
3206 spin_lock_irqsave(&iommu->lock, flags);
3208 /* Context tables are copied, now write them to the root_entry table */
3209 for (bus = 0; bus < 256; bus++) {
3210 int idx = ext ? bus * 2 : bus;
3213 if (ctxt_tbls[idx]) {
3214 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3215 iommu->root_entry[bus].lo = val;
3218 if (!ext || !ctxt_tbls[idx + 1])
3221 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3222 iommu->root_entry[bus].hi = val;
3225 spin_unlock_irqrestore(&iommu->lock, flags);
3229 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
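/*
 * Illustrative sketch, not part of the original driver: the root-entry writes
 * above follow the VT-d layout in which bit 0 of the low word is the present
 * bit and the upper bits hold the page-aligned physical address of the
 * (lower) context table; in extended mode the high word points to the upper
 * context table in the same way.
 */
static void __maybe_unused example_set_root_entry(struct root_entry *re,
						  struct context_entry *ctxt_tbl)
{
	re->lo = virt_to_phys(ctxt_tbl) | 1;	/* address | present */
}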
3239 static int __init init_dmars(void)
3241 struct dmar_drhd_unit *drhd;
3242 struct dmar_rmrr_unit *rmrr;
3243 bool copied_tables = false;
3245 struct intel_iommu *iommu;
3251 * initialize and program root entry to not present
3254 for_each_drhd_unit(drhd) {
3256 * lock not needed as this is only incremented in the single-
3257 * threaded kernel __init code path; all other accesses are read
3260 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3264 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3267 /* Preallocate enough resources for IOMMU hot-addition */
3268 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3269 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3271 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3274 pr_err("Allocating global iommu array failed\n");
3279 for_each_active_iommu(iommu, drhd) {
3280 g_iommus[iommu->seq_id] = iommu;
3282 intel_iommu_init_qi(iommu);
3284 ret = iommu_init_domains(iommu);
3288 init_translation_status(iommu);
3290 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3291 iommu_disable_translation(iommu);
3292 clear_translation_pre_enabled(iommu);
3293 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3299 * we could share the same root & context tables
3300 * among all IOMMUs. Need to split it later.
3302 ret = iommu_alloc_root_entry(iommu);
3306 if (translation_pre_enabled(iommu)) {
3307 pr_info("Translation already enabled - trying to copy translation structures\n");
3309 ret = copy_translation_tables(iommu);
3312 * We found the IOMMU with translation
3313 * enabled - but failed to copy over the
3314 * old root-entry table. Try to proceed
3315 * by disabling translation now and
3316 * allocating a clean root-entry table.
3317 * This might cause DMAR faults, but
3318 * probably the dump will still succeed.
3320 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3322 iommu_disable_translation(iommu);
3323 clear_translation_pre_enabled(iommu);
3325 pr_info("Copied translation tables from previous kernel for %s\n",
3327 copied_tables = true;
3331 if (!ecap_pass_through(iommu->ecap))
3332 hw_pass_through = 0;
3333 #ifdef CONFIG_INTEL_IOMMU_SVM
3334 if (pasid_enabled(iommu))
3335 intel_svm_alloc_pasid_tables(iommu);
3340 * Now that qi is enabled on all iommus, set the root entry and flush
3341 * caches. This is required on some Intel X58 chipsets, otherwise the
3342 * flush_context function will loop forever and the boot hangs.
3344 for_each_active_iommu(iommu, drhd) {
3345 iommu_flush_write_buffer(iommu);
3346 iommu_set_root_entry(iommu);
3347 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3348 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3351 if (iommu_pass_through)
3352 iommu_identity_mapping |= IDENTMAP_ALL;
3354 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3355 iommu_identity_mapping |= IDENTMAP_GFX;
3358 check_tylersburg_isoch();
3360 if (iommu_identity_mapping) {
3361 ret = si_domain_init(hw_pass_through);
3368 * If we copied translations from a previous kernel in the kdump
3369 * case, we can not assign the devices to domains now, as that
3370 * would eliminate the old mappings. So skip this part and defer
3371 * the assignment to device driver initialization time.
3377 * If pass through is not set or not enabled, set up context entries for
3378 * identity mappings for rmrr, gfx, and isa, and may fall back to static
3379 * identity mapping if iommu_identity_mapping is set.
3381 if (iommu_identity_mapping) {
3382 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3384 pr_crit("Failed to setup IOMMU pass-through\n");
3390 * for each dev attached to rmrr
3392 * locate drhd for dev, alloc domain for dev
3393 * allocate free domain
3394 * allocate page table entries for rmrr
3395 * if context not allocated for bus
3396 * allocate and init context
3397 * set present in root table for this bus
3398 * init context with domain, translation etc
3402 pr_info("Setting RMRR:\n");
3403 for_each_rmrr_units(rmrr) {
3404 /* some BIOSes list non-existent devices in the DMAR table. */
3405 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3407 ret = iommu_prepare_rmrr_dev(rmrr, dev);
3409 pr_err("Mapping reserved region failed\n");
3413 iommu_prepare_isa();
3420 * global invalidate context cache
3421 * global invalidate iotlb
3422 * enable translation
3424 for_each_iommu(iommu, drhd) {
3425 if (drhd->ignored) {
3427 * we always have to disable PMRs or DMA may fail on
3431 iommu_disable_protect_mem_regions(iommu);
3435 iommu_flush_write_buffer(iommu);
3437 #ifdef CONFIG_INTEL_IOMMU_SVM
3438 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3439 ret = intel_svm_enable_prq(iommu);
3444 ret = dmar_set_interrupt(iommu);
3448 if (!translation_pre_enabled(iommu))
3449 iommu_enable_translation(iommu);
3451 iommu_disable_protect_mem_regions(iommu);
3457 for_each_active_iommu(iommu, drhd) {
3458 disable_dmar_iommu(iommu);
3459 free_dmar_iommu(iommu);
3468 /* This takes a number of _MM_ pages, not VTD pages */
3469 static unsigned long intel_alloc_iova(struct device *dev,
3470 struct dmar_domain *domain,
3471 unsigned long nrpages, uint64_t dma_mask)
3473 unsigned long iova_pfn = 0;
3475 /* Restrict dma_mask to the width that the iommu can handle */
3476 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3477 /* Ensure we reserve the whole size-aligned region */
3478 nrpages = __roundup_pow_of_two(nrpages);
3480 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3482 * First try to allocate an io virtual address in
3483 * DMA_BIT_MASK(32) and if that fails then try allocating
3486 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3487 IOVA_PFN(DMA_BIT_MASK(32)), false);
3491 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3492 IOVA_PFN(dma_mask), true);
3493 if (unlikely(!iova_pfn)) {
3494 pr_err("Allocating %ld-page iova for %s failed",
3495 nrpages, dev_name(dev));
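/*
 * Illustrative sketch, not part of the original driver: how the mapping paths
 * below call intel_alloc_iova(). Note the conversion with dma_to_mm_pfn();
 * the caller counts VT-d pages but the allocator works in MM pages.
 */
static unsigned long __maybe_unused example_alloc_iova(struct device *dev,
							struct dmar_domain *domain,
							unsigned long vtd_pages)
{
	return intel_alloc_iova(dev, domain, dma_to_mm_pfn(vtd_pages),
				*dev->dma_mask);
}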
3502 static struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3504 struct dmar_domain *domain, *tmp;
3505 struct dmar_rmrr_unit *rmrr;
3506 struct device *i_dev;
3509 domain = find_domain(dev);
3513 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3517 /* We have a new domain - setup possible RMRRs for the device */
3519 for_each_rmrr_units(rmrr) {
3520 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3525 ret = domain_prepare_identity_map(dev, domain,
3529 dev_err(dev, "Mapping reserved region failed\n");
3534 tmp = set_domain_for_dev(dev, domain);
3535 if (!tmp || domain != tmp) {
3536 domain_exit(domain);
3543 pr_err("Allocating domain for %s failed\n", dev_name(dev));
3549 /* Check if the dev needs to go through non-identity map and unmap process.*/
3550 static int iommu_no_mapping(struct device *dev)
3554 if (iommu_dummy(dev))
3557 if (!iommu_identity_mapping)
3560 found = identity_mapping(dev);
3562 if (iommu_should_identity_map(dev, 0))
3566 * 32 bit DMA is removed from si_domain and we fall back
3567 * to non-identity mapping.
3569 dmar_remove_one_dev_info(si_domain, dev);
3570 pr_info("32bit %s uses non-identity mapping\n",
3576 * In case a 64 bit DMA device is detached from a VM, the device
3577 * is put into si_domain for identity mapping.
3579 if (iommu_should_identity_map(dev, 0)) {
3581 ret = domain_add_dev_info(si_domain, dev);
3583 pr_info("64bit %s uses identity mapping\n",
3593 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3594 size_t size, int dir, u64 dma_mask)
3596 struct dmar_domain *domain;
3597 phys_addr_t start_paddr;
3598 unsigned long iova_pfn;
3601 struct intel_iommu *iommu;
3602 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3604 BUG_ON(dir == DMA_NONE);
3606 if (iommu_no_mapping(dev))
3609 domain = get_valid_domain_for_dev(dev);
3613 iommu = domain_get_iommu(domain);
3614 size = aligned_nrpages(paddr, size);
3616 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3621 * Check if DMAR supports zero-length reads on write only
3624 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3625 !cap_zlr(iommu->cap))
3626 prot |= DMA_PTE_READ;
3627 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3628 prot |= DMA_PTE_WRITE;
3630 * paddr .. paddr + size might cover only part of a page, so map the whole
3631 * page. Note: if two parts of one page are mapped separately, we
3632 * might have two guest addresses mapping to the same host paddr, but this
3633 * is not a big problem
3635 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3636 mm_to_dma_pfn(paddr_pfn), size, prot);
3640 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3641 start_paddr += paddr & ~PAGE_MASK;
3646 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3647 pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3648 dev_name(dev), size, (unsigned long long)paddr, dir);
3652 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3653 unsigned long offset, size_t size,
3654 enum dma_data_direction dir,
3655 unsigned long attrs)
3657 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3658 dir, *dev->dma_mask);
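/*
 * Illustrative sketch, not part of the original driver: what a device driver
 * sees once intel_dma_ops is installed. dma_map_page() lands in
 * intel_map_page() above; checking dma_mapping_error() is mandatory. The
 * helper name is hypothetical and 0 is used here as an invalid cookie.
 */
static dma_addr_t __maybe_unused example_driver_map_page(struct device *dev,
							  struct page *page,
							  size_t len)
{
	dma_addr_t handle = dma_map_page(dev, page, 0, len, DMA_TO_DEVICE);

	if (dma_mapping_error(dev, handle))
		return 0;

	return handle;
}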
3661 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3663 struct dmar_domain *domain;
3664 unsigned long start_pfn, last_pfn;
3665 unsigned long nrpages;
3666 unsigned long iova_pfn;
3667 struct intel_iommu *iommu;
3668 struct page *freelist;
3670 if (iommu_no_mapping(dev))
3673 domain = find_domain(dev);
3676 iommu = domain_get_iommu(domain);
3678 iova_pfn = IOVA_PFN(dev_addr);
3680 nrpages = aligned_nrpages(dev_addr, size);
3681 start_pfn = mm_to_dma_pfn(iova_pfn);
3682 last_pfn = start_pfn + nrpages - 1;
3684 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3685 dev_name(dev), start_pfn, last_pfn);
3687 freelist = domain_unmap(domain, start_pfn, last_pfn);
3689 if (intel_iommu_strict) {
3690 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3691 nrpages, !freelist, 0);
3693 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3694 dma_free_pagelist(freelist);
3696 queue_iova(&domain->iovad, iova_pfn, nrpages,
3697 (unsigned long)freelist);
3699 * queue up the release of the unmap to save roughly 1/6th of the
3700 * CPU time used up by the iotlb flush operation...
3705 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3706 size_t size, enum dma_data_direction dir,
3707 unsigned long attrs)
3709 intel_unmap(dev, dev_addr, size);
3712 static void *intel_alloc_coherent(struct device *dev, size_t size,
3713 dma_addr_t *dma_handle, gfp_t flags,
3714 unsigned long attrs)
3718 vaddr = dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3719 if (iommu_no_mapping(dev) || !vaddr)
3722 *dma_handle = __intel_map_single(dev, virt_to_phys(vaddr),
3723 PAGE_ALIGN(size), DMA_BIDIRECTIONAL,
3724 dev->coherent_dma_mask);
3726 goto out_free_pages;
3730 dma_direct_free(dev, size, vaddr, *dma_handle, attrs);
3734 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3735 dma_addr_t dma_handle, unsigned long attrs)
3737 if (!iommu_no_mapping(dev))
3738 intel_unmap(dev, dma_handle, PAGE_ALIGN(size));
3739 dma_direct_free(dev, size, vaddr, dma_handle, attrs);
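/*
 * Illustrative sketch, not part of the original driver: the coherent API as
 * used by a driver, which reaches intel_alloc_coherent() and
 * intel_free_coherent() above.
 */
static void __maybe_unused example_driver_coherent(struct device *dev)
{
	dma_addr_t dma;
	void *cpu = dma_alloc_coherent(dev, PAGE_SIZE, &dma, GFP_KERNEL);

	if (cpu)
		dma_free_coherent(dev, PAGE_SIZE, cpu, dma);
}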
3742 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3743 int nelems, enum dma_data_direction dir,
3744 unsigned long attrs)
3746 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3747 unsigned long nrpages = 0;
3748 struct scatterlist *sg;
3751 for_each_sg(sglist, sg, nelems, i) {
3752 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3755 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3758 static int intel_nontranslate_map_sg(struct device *hddev,
3759 struct scatterlist *sglist, int nelems, int dir)
3762 struct scatterlist *sg;
3764 for_each_sg(sglist, sg, nelems, i) {
3765 BUG_ON(!sg_page(sg));
3766 sg->dma_address = sg_phys(sg);
3767 sg->dma_length = sg->length;
3772 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3773 enum dma_data_direction dir, unsigned long attrs)
3776 struct dmar_domain *domain;
3779 unsigned long iova_pfn;
3781 struct scatterlist *sg;
3782 unsigned long start_vpfn;
3783 struct intel_iommu *iommu;
3785 BUG_ON(dir == DMA_NONE);
3786 if (iommu_no_mapping(dev))
3787 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3789 domain = get_valid_domain_for_dev(dev);
3793 iommu = domain_get_iommu(domain);
3795 for_each_sg(sglist, sg, nelems, i)
3796 size += aligned_nrpages(sg->offset, sg->length);
3798 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3801 sglist->dma_length = 0;
3806 * Check if DMAR supports zero-length reads on write only
3809 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3810 !cap_zlr(iommu->cap))
3811 prot |= DMA_PTE_READ;
3812 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3813 prot |= DMA_PTE_WRITE;
3815 start_vpfn = mm_to_dma_pfn(iova_pfn);
3817 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3818 if (unlikely(ret)) {
3819 dma_pte_free_pagetable(domain, start_vpfn,
3820 start_vpfn + size - 1,
3821 agaw_to_level(domain->agaw) + 1);
3822 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3829 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3834 const struct dma_map_ops intel_dma_ops = {
3835 .alloc = intel_alloc_coherent,
3836 .free = intel_free_coherent,
3837 .map_sg = intel_map_sg,
3838 .unmap_sg = intel_unmap_sg,
3839 .map_page = intel_map_page,
3840 .unmap_page = intel_unmap_page,
3841 .mapping_error = intel_mapping_error,
3843 .dma_supported = dma_direct_supported,
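/*
 * Illustrative sketch, not part of the original driver: a driver-side
 * scatter-gather round trip through the ops table above. The scatterlist is
 * assumed to have been built by the caller.
 */
static int __maybe_unused example_driver_map_sg(struct device *dev,
						struct scatterlist *sg,
						int nents)
{
	int mapped = dma_map_sg(dev, sg, nents, DMA_FROM_DEVICE);

	if (!mapped)
		return -ENOMEM;

	/* ... program the hardware with sg_dma_address()/sg_dma_len() ... */

	dma_unmap_sg(dev, sg, nents, DMA_FROM_DEVICE);
	return 0;
}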
3847 static inline int iommu_domain_cache_init(void)
3851 iommu_domain_cache = kmem_cache_create("iommu_domain",
3852 sizeof(struct dmar_domain),
3857 if (!iommu_domain_cache) {
3858 pr_err("Couldn't create iommu_domain cache\n");
3865 static inline int iommu_devinfo_cache_init(void)
3869 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3870 sizeof(struct device_domain_info),
3874 if (!iommu_devinfo_cache) {
3875 pr_err("Couldn't create devinfo cache\n");
3882 static int __init iommu_init_mempool(void)
3885 ret = iova_cache_get();
3889 ret = iommu_domain_cache_init();
3893 ret = iommu_devinfo_cache_init();
3897 kmem_cache_destroy(iommu_domain_cache);
3904 static void __init iommu_exit_mempool(void)
3906 kmem_cache_destroy(iommu_devinfo_cache);
3907 kmem_cache_destroy(iommu_domain_cache);
3911 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3913 struct dmar_drhd_unit *drhd;
3917 /* We know that this device on this chipset has its own IOMMU.
3918 * If we find it under a different IOMMU, then the BIOS is lying
3919 * to us. Hope that the IOMMU for this device is actually
3920 * disabled, and it needs no translation...
3922 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3924 /* "can't" happen */
3925 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3928 vtbar &= 0xffff0000;
3930 /* we know that this iommu should be at offset 0xa000 from vtbar */
3931 drhd = dmar_find_matched_drhd_unit(pdev);
3932 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3933 TAINT_FIRMWARE_WORKAROUND,
3934 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3935 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3937 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3939 static void __init init_no_remapping_devices(void)
3941 struct dmar_drhd_unit *drhd;
3945 for_each_drhd_unit(drhd) {
3946 if (!drhd->include_all) {
3947 for_each_active_dev_scope(drhd->devices,
3948 drhd->devices_cnt, i, dev)
3950 /* ignore DMAR unit if no devices exist */
3951 if (i == drhd->devices_cnt)
3956 for_each_active_drhd_unit(drhd) {
3957 if (drhd->include_all)
3960 for_each_active_dev_scope(drhd->devices,
3961 drhd->devices_cnt, i, dev)
3962 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3964 if (i < drhd->devices_cnt)
3967 /* This IOMMU has *only* gfx devices. Either bypass it or
3968 set the gfx_mapped flag, as appropriate */
3970 intel_iommu_gfx_mapped = 1;
3973 for_each_active_dev_scope(drhd->devices,
3974 drhd->devices_cnt, i, dev)
3975 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3980 #ifdef CONFIG_SUSPEND
3981 static int init_iommu_hw(void)
3983 struct dmar_drhd_unit *drhd;
3984 struct intel_iommu *iommu = NULL;
3986 for_each_active_iommu(iommu, drhd)
3988 dmar_reenable_qi(iommu);
3990 for_each_iommu(iommu, drhd) {
3991 if (drhd->ignored) {
3993 * we always have to disable PMRs or DMA may fail on
3997 iommu_disable_protect_mem_regions(iommu);
4001 iommu_flush_write_buffer(iommu);
4003 iommu_set_root_entry(iommu);
4005 iommu->flush.flush_context(iommu, 0, 0, 0,
4006 DMA_CCMD_GLOBAL_INVL);
4007 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4008 iommu_enable_translation(iommu);
4009 iommu_disable_protect_mem_regions(iommu);
4015 static void iommu_flush_all(void)
4017 struct dmar_drhd_unit *drhd;
4018 struct intel_iommu *iommu;
4020 for_each_active_iommu(iommu, drhd) {
4021 iommu->flush.flush_context(iommu, 0, 0, 0,
4022 DMA_CCMD_GLOBAL_INVL);
4023 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4024 DMA_TLB_GLOBAL_FLUSH);
4028 static int iommu_suspend(void)
4030 struct dmar_drhd_unit *drhd;
4031 struct intel_iommu *iommu = NULL;
4034 for_each_active_iommu(iommu, drhd) {
4035 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
4037 if (!iommu->iommu_state)
4043 for_each_active_iommu(iommu, drhd) {
4044 iommu_disable_translation(iommu);
4046 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4048 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4049 readl(iommu->reg + DMAR_FECTL_REG);
4050 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4051 readl(iommu->reg + DMAR_FEDATA_REG);
4052 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4053 readl(iommu->reg + DMAR_FEADDR_REG);
4054 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4055 readl(iommu->reg + DMAR_FEUADDR_REG);
4057 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4062 for_each_active_iommu(iommu, drhd)
4063 kfree(iommu->iommu_state);
4068 static void iommu_resume(void)
4070 struct dmar_drhd_unit *drhd;
4071 struct intel_iommu *iommu = NULL;
4074 if (init_iommu_hw()) {
4076 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4078 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4082 for_each_active_iommu(iommu, drhd) {
4084 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4086 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4087 iommu->reg + DMAR_FECTL_REG);
4088 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4089 iommu->reg + DMAR_FEDATA_REG);
4090 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4091 iommu->reg + DMAR_FEADDR_REG);
4092 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4093 iommu->reg + DMAR_FEUADDR_REG);
4095 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4098 for_each_active_iommu(iommu, drhd)
4099 kfree(iommu->iommu_state);
4102 static struct syscore_ops iommu_syscore_ops = {
4103 .resume = iommu_resume,
4104 .suspend = iommu_suspend,
4107 static void __init init_iommu_pm_ops(void)
4109 register_syscore_ops(&iommu_syscore_ops);
4113 static inline void init_iommu_pm_ops(void) {}
4114 #endif /* CONFIG_PM */
4117 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4119 struct acpi_dmar_reserved_memory *rmrr;
4120 int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4121 struct dmar_rmrr_unit *rmrru;
4124 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4128 rmrru->hdr = header;
4129 rmrr = (struct acpi_dmar_reserved_memory *)header;
4130 rmrru->base_address = rmrr->base_address;
4131 rmrru->end_address = rmrr->end_address;
4133 length = rmrr->end_address - rmrr->base_address + 1;
4134 rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4139 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4140 ((void *)rmrr) + rmrr->header.length,
4141 &rmrru->devices_cnt);
4142 if (rmrru->devices_cnt && rmrru->devices == NULL)
4145 list_add(&rmrru->list, &dmar_rmrr_units);
4156 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4158 struct dmar_atsr_unit *atsru;
4159 struct acpi_dmar_atsr *tmp;
4161 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4162 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4163 if (atsr->segment != tmp->segment)
4165 if (atsr->header.length != tmp->header.length)
4167 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4174 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4176 struct acpi_dmar_atsr *atsr;
4177 struct dmar_atsr_unit *atsru;
4179 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4182 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4183 atsru = dmar_find_atsr(atsr);
4187 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4192 * If memory is allocated from slab by ACPI _DSM method, we need to
4193 * copy the memory content because the memory buffer will be freed
4196 atsru->hdr = (void *)(atsru + 1);
4197 memcpy(atsru->hdr, hdr, hdr->length);
4198 atsru->include_all = atsr->flags & 0x1;
4199 if (!atsru->include_all) {
4200 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4201 (void *)atsr + atsr->header.length,
4202 &atsru->devices_cnt);
4203 if (atsru->devices_cnt && atsru->devices == NULL) {
4209 list_add_rcu(&atsru->list, &dmar_atsr_units);
4214 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4216 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4220 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4222 struct acpi_dmar_atsr *atsr;
4223 struct dmar_atsr_unit *atsru;
4225 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4226 atsru = dmar_find_atsr(atsr);
4228 list_del_rcu(&atsru->list);
4230 intel_iommu_free_atsr(atsru);
4236 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4240 struct acpi_dmar_atsr *atsr;
4241 struct dmar_atsr_unit *atsru;
4243 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4244 atsru = dmar_find_atsr(atsr);
4248 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4249 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4257 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4260 struct intel_iommu *iommu = dmaru->iommu;
4262 if (g_iommus[iommu->seq_id])
4265 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4266 pr_warn("%s: Doesn't support hardware pass through.\n",
4270 if (!ecap_sc_support(iommu->ecap) &&
4271 domain_update_iommu_snooping(iommu)) {
4272 pr_warn("%s: Doesn't support snooping.\n",
4276 sp = domain_update_iommu_superpage(iommu) - 1;
4277 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4278 pr_warn("%s: Doesn't support large page.\n",
4284 * Disable translation if already enabled prior to OS handover.
4286 if (iommu->gcmd & DMA_GCMD_TE)
4287 iommu_disable_translation(iommu);
4289 g_iommus[iommu->seq_id] = iommu;
4290 ret = iommu_init_domains(iommu);
4292 ret = iommu_alloc_root_entry(iommu);
4296 #ifdef CONFIG_INTEL_IOMMU_SVM
4297 if (pasid_enabled(iommu))
4298 intel_svm_alloc_pasid_tables(iommu);
4301 if (dmaru->ignored) {
4303 * we always have to disable PMRs or DMA may fail on this device
4306 iommu_disable_protect_mem_regions(iommu);
4310 intel_iommu_init_qi(iommu);
4311 iommu_flush_write_buffer(iommu);
4313 #ifdef CONFIG_INTEL_IOMMU_SVM
4314 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4315 ret = intel_svm_enable_prq(iommu);
4320 ret = dmar_set_interrupt(iommu);
4324 iommu_set_root_entry(iommu);
4325 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4326 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4327 iommu_enable_translation(iommu);
4329 iommu_disable_protect_mem_regions(iommu);
4333 disable_dmar_iommu(iommu);
4335 free_dmar_iommu(iommu);
4339 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4342 struct intel_iommu *iommu = dmaru->iommu;
4344 if (!intel_iommu_enabled)
4350 ret = intel_iommu_add(dmaru);
4352 disable_dmar_iommu(iommu);
4353 free_dmar_iommu(iommu);
4359 static void intel_iommu_free_dmars(void)
4361 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4362 struct dmar_atsr_unit *atsru, *atsr_n;
4364 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4365 list_del(&rmrru->list);
4366 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4371 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4372 list_del(&atsru->list);
4373 intel_iommu_free_atsr(atsru);
4377 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4380 struct pci_bus *bus;
4381 struct pci_dev *bridge = NULL;
4383 struct acpi_dmar_atsr *atsr;
4384 struct dmar_atsr_unit *atsru;
4386 dev = pci_physfn(dev);
4387 for (bus = dev->bus; bus; bus = bus->parent) {
4389 /* If it's an integrated device, allow ATS */
4392 /* Connected via non-PCIe: no ATS */
4393 if (!pci_is_pcie(bridge) ||
4394 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4396 /* If we found the root port, look it up in the ATSR */
4397 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4402 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4403 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4404 if (atsr->segment != pci_domain_nr(dev->bus))
4407 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4408 if (tmp == &bridge->dev)
4411 if (atsru->include_all)
4421 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4424 struct dmar_rmrr_unit *rmrru;
4425 struct dmar_atsr_unit *atsru;
4426 struct acpi_dmar_atsr *atsr;
4427 struct acpi_dmar_reserved_memory *rmrr;
4429 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4432 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4433 rmrr = container_of(rmrru->hdr,
4434 struct acpi_dmar_reserved_memory, header);
4435 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4436 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4437 ((void *)rmrr) + rmrr->header.length,
4438 rmrr->segment, rmrru->devices,
4439 rmrru->devices_cnt);
4442 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4443 dmar_remove_dev_scope(info, rmrr->segment,
4444 rmrru->devices, rmrru->devices_cnt);
4448 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4449 if (atsru->include_all)
4452 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4453 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4454 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4455 (void *)atsr + atsr->header.length,
4456 atsr->segment, atsru->devices,
4457 atsru->devices_cnt);
4462 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4463 if (dmar_remove_dev_scope(info, atsr->segment,
4464 atsru->devices, atsru->devices_cnt))
4473 * Here we only respond to the action of a device being unbound from its driver.
4475 * A newly added device is not attached to its DMAR domain here yet; that happens
4476 * when the device is first mapped to an iova.
4478 static int device_notifier(struct notifier_block *nb,
4479 unsigned long action, void *data)
4481 struct device *dev = data;
4482 struct dmar_domain *domain;
4484 if (iommu_dummy(dev))
4487 if (action != BUS_NOTIFY_REMOVED_DEVICE)
4490 domain = find_domain(dev);
4494 dmar_remove_one_dev_info(domain, dev);
4495 if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4496 domain_exit(domain);
4501 static struct notifier_block device_nb = {
4502 .notifier_call = device_notifier,
4505 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4506 unsigned long val, void *v)
4508 struct memory_notify *mhp = v;
4509 unsigned long long start, end;
4510 unsigned long start_vpfn, last_vpfn;
4513 case MEM_GOING_ONLINE:
4514 start = mhp->start_pfn << PAGE_SHIFT;
4515 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4516 if (iommu_domain_identity_map(si_domain, start, end)) {
4517 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4524 case MEM_CANCEL_ONLINE:
4525 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4526 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4527 while (start_vpfn <= last_vpfn) {
4529 struct dmar_drhd_unit *drhd;
4530 struct intel_iommu *iommu;
4531 struct page *freelist;
4533 iova = find_iova(&si_domain->iovad, start_vpfn);
4535 pr_debug("Failed get IOVA for PFN %lx\n",
4540 iova = split_and_remove_iova(&si_domain->iovad, iova,
4541 start_vpfn, last_vpfn);
4543 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4544 start_vpfn, last_vpfn);
4548 freelist = domain_unmap(si_domain, iova->pfn_lo,
4552 for_each_active_iommu(iommu, drhd)
4553 iommu_flush_iotlb_psi(iommu, si_domain,
4554 iova->pfn_lo, iova_size(iova),
4557 dma_free_pagelist(freelist);
4559 start_vpfn = iova->pfn_hi + 1;
4560 free_iova_mem(iova);
4568 static struct notifier_block intel_iommu_memory_nb = {
4569 .notifier_call = intel_iommu_memory_notifier,
4573 static void free_all_cpu_cached_iovas(unsigned int cpu)
4577 for (i = 0; i < g_num_of_iommus; i++) {
4578 struct intel_iommu *iommu = g_iommus[i];
4579 struct dmar_domain *domain;
4585 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4586 domain = get_iommu_domain(iommu, (u16)did);
4590 free_cpu_cached_iovas(cpu, &domain->iovad);
4595 static int intel_iommu_cpu_dead(unsigned int cpu)
4597 free_all_cpu_cached_iovas(cpu);
4601 static void intel_disable_iommus(void)
4603 struct intel_iommu *iommu = NULL;
4604 struct dmar_drhd_unit *drhd;
4606 for_each_iommu(iommu, drhd)
4607 iommu_disable_translation(iommu);
4610 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4612 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4614 return container_of(iommu_dev, struct intel_iommu, iommu);
4617 static ssize_t intel_iommu_show_version(struct device *dev,
4618 struct device_attribute *attr,
4621 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4622 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4623 return sprintf(buf, "%d:%d\n",
4624 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4626 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4628 static ssize_t intel_iommu_show_address(struct device *dev,
4629 struct device_attribute *attr,
4632 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4633 return sprintf(buf, "%llx\n", iommu->reg_phys);
4635 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4637 static ssize_t intel_iommu_show_cap(struct device *dev,
4638 struct device_attribute *attr,
4641 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4642 return sprintf(buf, "%llx\n", iommu->cap);
4644 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4646 static ssize_t intel_iommu_show_ecap(struct device *dev,
4647 struct device_attribute *attr,
4650 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4651 return sprintf(buf, "%llx\n", iommu->ecap);
4653 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4655 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4656 struct device_attribute *attr,
4659 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4660 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4662 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4664 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4665 struct device_attribute *attr,
4668 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4669 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4670 cap_ndoms(iommu->cap)));
4672 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4674 static struct attribute *intel_iommu_attrs[] = {
4675 &dev_attr_version.attr,
4676 &dev_attr_address.attr,
4678 &dev_attr_ecap.attr,
4679 &dev_attr_domains_supported.attr,
4680 &dev_attr_domains_used.attr,
4684 static struct attribute_group intel_iommu_group = {
4685 .name = "intel-iommu",
4686 .attrs = intel_iommu_attrs,
4689 const struct attribute_group *intel_iommu_groups[] = {
4694 int __init intel_iommu_init(void)
4697 struct dmar_drhd_unit *drhd;
4698 struct intel_iommu *iommu;
4700 /* VT-d is required for a TXT/tboot launch, so enforce that */
4701 force_on = tboot_force_iommu();
4703 if (iommu_init_mempool()) {
4705 panic("tboot: Failed to initialize iommu memory\n");
4709 down_write(&dmar_global_lock);
4710 if (dmar_table_init()) {
4712 panic("tboot: Failed to initialize DMAR table\n");
4716 if (dmar_dev_scope_init() < 0) {
4718 panic("tboot: Failed to initialize DMAR device scope\n");
4722 up_write(&dmar_global_lock);
4725 * The bus notifier takes the dmar_global_lock, so lockdep will
4726 * complain later when we register it under the lock.
4728 dmar_register_bus_notifier();
4730 down_write(&dmar_global_lock);
4732 if (no_iommu || dmar_disabled) {
4734 * We exit the function here to ensure IOMMU's remapping and
4735 * mempool aren't set up, which means that the IOMMU's PMRs
4736 * won't be disabled via the call to init_dmars(). So disable
4737 * them explicitly here. The PMRs were set up by tboot prior to
4738 * calling SENTER, but the kernel is expected to reset/tear
4741 if (intel_iommu_tboot_noforce) {
4742 for_each_iommu(iommu, drhd)
4743 iommu_disable_protect_mem_regions(iommu);
4747 * Make sure the IOMMUs are switched off, even when we
4748 * boot into a kexec kernel and the previous kernel left
4751 intel_disable_iommus();
4755 if (list_empty(&dmar_rmrr_units))
4756 pr_info("No RMRR found\n");
4758 if (list_empty(&dmar_atsr_units))
4759 pr_info("No ATSR found\n");
4761 if (dmar_init_reserved_ranges()) {
4763 panic("tboot: Failed to reserve iommu ranges\n");
4764 goto out_free_reserved_range;
4767 init_no_remapping_devices();
4772 panic("tboot: Failed to initialize DMARs\n");
4773 pr_err("Initialization failed\n");
4774 goto out_free_reserved_range;
4776 up_write(&dmar_global_lock);
4777 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4779 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4782 dma_ops = &intel_dma_ops;
4784 init_iommu_pm_ops();
4786 for_each_active_iommu(iommu, drhd) {
4787 iommu_device_sysfs_add(&iommu->iommu, NULL,
4790 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4791 iommu_device_register(&iommu->iommu);
4794 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4795 bus_register_notifier(&pci_bus_type, &device_nb);
4796 if (si_domain && !hw_pass_through)
4797 register_memory_notifier(&intel_iommu_memory_nb);
4798 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4799 intel_iommu_cpu_dead);
4800 intel_iommu_enabled = 1;
4804 out_free_reserved_range:
4805 put_iova_domain(&reserved_iova_list);
4807 intel_iommu_free_dmars();
4808 up_write(&dmar_global_lock);
4809 iommu_exit_mempool();
4813 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4815 struct intel_iommu *iommu = opaque;
4817 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4822 * NB - intel-iommu lacks any sort of reference counting for the users of
4823 * dependent devices. If multiple endpoints have intersecting dependent
4824 * devices, unbinding the driver from any one of them will possibly leave
4825 * the others unable to operate.
4827 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4829 if (!iommu || !dev || !dev_is_pci(dev))
4832 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4835 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4837 struct intel_iommu *iommu;
4838 unsigned long flags;
4840 assert_spin_locked(&device_domain_lock);
4845 iommu = info->iommu;
4848 iommu_disable_dev_iotlb(info);
4849 domain_context_clear(iommu, info->dev);
4852 unlink_domain_info(info);
4854 spin_lock_irqsave(&iommu->lock, flags);
4855 domain_detach_iommu(info->domain, iommu);
4856 spin_unlock_irqrestore(&iommu->lock, flags);
4858 free_devinfo_mem(info);
4861 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4864 struct device_domain_info *info;
4865 unsigned long flags;
4867 spin_lock_irqsave(&device_domain_lock, flags);
4868 info = dev->archdata.iommu;
4869 __dmar_remove_one_dev_info(info);
4870 spin_unlock_irqrestore(&device_domain_lock, flags);
4873 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4877 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4878 domain_reserve_special_ranges(domain);
4880 /* calculate AGAW */
4881 domain->gaw = guest_width;
4882 adjust_width = guestwidth_to_adjustwidth(guest_width);
4883 domain->agaw = width_to_agaw(adjust_width);
4885 domain->iommu_coherency = 0;
4886 domain->iommu_snooping = 0;
4887 domain->iommu_superpage = 0;
4888 domain->max_addr = 0;
4890 /* always allocate the top pgd */
4891 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4894 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4898 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4900 struct dmar_domain *dmar_domain;
4901 struct iommu_domain *domain;
4903 if (type != IOMMU_DOMAIN_UNMANAGED)
4906 dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4908 pr_err("Can't allocate dmar_domain\n");
4911 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4912 pr_err("Domain initialization failed\n");
4913 domain_exit(dmar_domain);
4916 domain_update_iommu_cap(dmar_domain);
4918 domain = &dmar_domain->domain;
4919 domain->geometry.aperture_start = 0;
4920 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4921 domain->geometry.force_aperture = true;
4926 static void intel_iommu_domain_free(struct iommu_domain *domain)
4928 domain_exit(to_dmar_domain(domain));
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu;
	int addr_width;
	u8 bus, devfn;

	if (device_is_rmrr_locked(dev)) {
		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
		return -EPERM;
	}

	/* normally dev is not mapped */
	if (unlikely(domain_context_mapped(dev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(dev);
		if (old_domain) {
			rcu_read_lock();
			dmar_remove_one_dev_info(old_domain, dev);
			rcu_read_unlock();

			if (!domain_type_is_vm_or_si(old_domain) &&
			    list_empty(&old_domain->devices))
				domain_exit(old_domain);
		}
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		pr_err("%s: iommu width (%d) is not "
		       "sufficient for the mapped address (%llx)\n",
		       __func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return domain_add_dev_info(dmar_domain, dev);
}
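
/*
 * Worked example (illustrative, not from the original source): md_domain_init()
 * builds the domain with DEFAULT_DOMAIN_ADDRESS_WIDTH (57 bits), i.e. a
 * five-level page table. If the attached IOMMU only supports a 48-bit AGAW
 * (four levels), the loop above runs once: the top-level table is collapsed
 * into its first entry (the only one reachable after the address-width check)
 * and dmar_domain->agaw drops by one, so the domain's table depth matches what
 * the hardware can walk.
 */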
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
}
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	u64 max_addr;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}
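
/*
 * Illustrative only (not part of the original file): the IOMMU core reaches
 * this callback through iommu_map(), which has already validated the size
 * against pgsize_bitmap. A typical caller would look roughly like:
 *
 *	ret = iommu_map(domain, iova, phys_addr, SZ_4K,
 *			IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE);
 *
 * IOMMU_READ/IOMMU_WRITE become DMA_PTE_READ/DMA_PTE_WRITE above, and
 * IOMMU_CACHE is honoured as DMA_PTE_SNP only when the domain supports
 * snoop control.
 */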
static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct page *freelist = NULL;
	unsigned long start_pfn, last_pfn;
	unsigned int npages;
	int iommu_id, level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));

	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);

	npages = last_pfn - start_pfn + 1;

	for_each_domain_iommu(iommu_id, dmar_domain)
		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
				      start_pfn, npages, !freelist, 0);

	dma_free_pagelist(freelist);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return size;
}
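
/*
 * Worked example (illustrative): with LEVEL_STRIDE = 9, a level-2 PTE (a 2MiB
 * superpage) gives level_to_offset_bits(2) = 9, so the minimum unmap granule
 * above is VTD_PAGE_SIZE << 9 = 2MiB. A 4KiB unmap request that lands inside
 * such a mapping is widened to the full 2MiB, and the widened size is what
 * gets returned to the caller.
 */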
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct dma_pte *pte;
	int level = 0;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}
static bool intel_iommu_capable(enum iommu_cap cap)
{
	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return domain_update_iommu_snooping(NULL) == 1;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return irq_remapping_enabled == 1;

	return false;
}
static int intel_iommu_add_device(struct device *dev)
{
	struct intel_iommu *iommu;
	struct iommu_group *group;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	iommu_device_link(&iommu->iommu, dev);

	group = iommu_group_get_for_dev(dev);
	if (IS_ERR(group))
		return PTR_ERR(group);

	iommu_group_put(group);
	return 0;
}
static void intel_iommu_remove_device(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return;

	iommu_group_remove_device(dev);
	iommu_device_unlink(&iommu->iommu, dev);
}
static void intel_iommu_get_resv_regions(struct device *device,
					 struct list_head *head)
{
	struct iommu_resv_region *reg;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i;

	rcu_read_lock();
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			if (i_dev != device)
				continue;

			list_add_tail(&rmrr->resv->list, head);
		}
	}
	rcu_read_unlock();

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}
static void intel_iommu_put_resv_regions(struct device *dev,
					 struct list_head *head)
{
	struct iommu_resv_region *entry, *next;

	list_for_each_entry_safe(entry, next, head, list) {
		if (entry->type == IOMMU_RESV_RESERVED)
			kfree(entry);
	}
}
#ifdef CONFIG_INTEL_IOMMU_SVM
#define MAX_NR_PASID_BITS (20)
static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
{
	/*
	 * Convert ecap_pss to the extended context entry pts encoding, also
	 * respect the soft pasid_max value set by the iommu.
	 * - number of PASID bits = ecap_pss + 1
	 * - number of PASID table entries = 2^(pts + 5)
	 * Therefore, pts = ecap_pss - 4
	 * e.g. KBL ecap_pss = 0x13, PASID has 20 bits, pts = 15
	 */
	if (ecap_pss(iommu->ecap) < 5)
		return 0;

	/* pasid_max is encoded as the actual number of entries, not the bits */
	return find_first_bit((unsigned long *)&iommu->pasid_max,
			      MAX_NR_PASID_BITS) - 5;
}
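
/*
 * Worked example (illustrative): if the driver has capped iommu->pasid_max at
 * 2^20 table entries, find_first_bit() on that value returns 20, giving
 * pts = 20 - 5 = 15, which encodes a 2^(15 + 5) = 2^20 entry PASID table --
 * the same result as the ecap_pss-based derivation in the comment above.
 */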
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
{
	struct device_domain_info *info;
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = get_valid_domain_for_dev(sdev->dev);
	if (!domain)
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	info = sdev->dev->archdata.iommu;
	if (!info || !info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	sdev->did = domain->iommu_did[iommu->seq_id];
	sdev->sid = PCI_DEVID(info->bus, info->devfn);

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		if (iommu->pasid_state_table)
			context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
		context[1].lo = (u64)virt_to_phys(iommu->pasid_table) |
			intel_iommu_get_pts(iommu);

		wmb();
		/* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
		 * extended to permit requests-with-PASID if the PASIDE bit
		 * is set, which makes sense. For CONTEXT_TT_PASS_THROUGH,
		 * however, the PASIDE bit is ignored and requests-with-PASID
		 * are unconditionally blocked, which makes less sense.
		 * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
		 * "guest mode" translation types depending on whether ATS
		 * is available or not. Annoyingly, we can't use the new
		 * modes *unless* PASIDE is set. */
		if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
			ctx_lo &= ~CONTEXT_TT_MASK;
			if (info->ats_supported)
				ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
			else
				ctx_lo |= CONTEXT_TT_PT_PASID << 2;
		}
		ctx_lo |= CONTEXT_PASIDE;
		if (iommu->pasid_state_table)
			ctx_lo |= CONTEXT_DINVE;
		if (info->pri_supported)
			ctx_lo |= CONTEXT_PRS;
		context[0].lo = ctx_lo;
		wmb();
		iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}
	ret = 0;

 out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	if (iommu_dummy(dev)) {
		dev_warn(dev,
			 "No IOMMU translation for device; cannot enable SVM\n");
		return NULL;
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu) {
		dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
		return NULL;
	}

	if (!iommu->pasid_table) {
		dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
		return NULL;
	}

	return iommu;
}
#endif /* CONFIG_INTEL_IOMMU_SVM */
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.map_sg			= default_iommu_map_sg,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.add_device		= intel_iommu_add_device,
	.remove_device		= intel_iommu_remove_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= intel_iommu_put_resv_regions,
	.device_group		= pci_device_group,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
};
static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
{
	/* G4x/GM45 integrated gfx dmar support is totally busted. */
	pr_info("Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pr_info("Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
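
/*
 * Illustrative decode (not from the original source): GGC is read as a 16-bit
 * config word and the fields above live in bits [11:8]. A read of 0x0b00
 * matches GGC_MEMORY_SIZE_4M_VT; since bit 11 (GGC_MEMORY_VT_ENABLED) is set,
 * quirk_calpella_no_shadow_gtt() below leaves graphics translation enabled.
 * A read of 0x0100 (GGC_MEMORY_SIZE_1M, VT bit clear) instead disables the
 * IOMMU for graphics.
 */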
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pr_info("Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that.  We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}
	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
		vtisochctrl);
}