1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva.h"
32 #include "cap_audit.h"
35 #define ROOT_SIZE VTD_PAGE_SIZE
36 #define CONTEXT_SIZE VTD_PAGE_SIZE
38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
43 #define IOAPIC_RANGE_START (0xfee00000)
44 #define IOAPIC_RANGE_END (0xfeefffff)
45 #define IOVA_START_ADDR (0x1000)
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
49 #define MAX_AGAW_WIDTH 64
50 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
52 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
53 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
55 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
56 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
57 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
58 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
59 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
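/*
 * Illustrative sketch (not part of the driver): how the width macros above
 * work out for the default 57-bit guest address width, assuming the usual
 * VTD_PAGE_SHIFT of 12. The DEMO_*/demo_* names are hypothetical, user-space
 * only, and simply mirror the arithmetic of __DOMAIN_MAX_PFN/__DOMAIN_MAX_ADDR.
 */
#include <stdint.h>

#define DEMO_VTD_PAGE_SHIFT     12
#define DEMO_MAX_PFN(gaw)       ((((uint64_t)1) << ((gaw) - DEMO_VTD_PAGE_SHIFT)) - 1)
#define DEMO_MAX_ADDR(gaw)      ((((uint64_t)1) << (gaw)) - 1)

static inline void demo_domain_limits(void)
{
        uint64_t max_pfn  = DEMO_MAX_PFN(57);   /* 0x00001fffffffffff (45 bits) */
        uint64_t max_addr = DEMO_MAX_ADDR(57);  /* 0x01ffffffffffffff (57 bits) */

        (void)max_pfn;
        (void)max_addr;
}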
61 /* IO virtual address start page frame number */
62 #define IOVA_START_PFN (1)
64 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
66 /* page table handling */
67 #define LEVEL_STRIDE (9)
68 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
70 static inline int agaw_to_level(int agaw)
75 static inline int agaw_to_width(int agaw)
77 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
80 static inline int width_to_agaw(int width)
82 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
85 static inline unsigned int level_to_offset_bits(int level)
87 return (level - 1) * LEVEL_STRIDE;
90 static inline int pfn_level_offset(u64 pfn, int level)
92 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
95 static inline u64 level_mask(int level)
97 return -1ULL << level_to_offset_bits(level);
100 static inline u64 level_size(int level)
102 return 1ULL << level_to_offset_bits(level);
105 static inline u64 align_to_level(u64 pfn, int level)
107 return (pfn + level_size(level) - 1) & level_mask(level);
110 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
112 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
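/*
 * Illustrative sketch (not part of the driver): the 9-bit-per-level page
 * table arithmetic used by the level helpers above, assuming 4KiB VT-d
 * pages and 512-entry tables. The demo_* names are hypothetical.
 */
#include <stdint.h>

#define DEMO_LEVEL_STRIDE       9
#define DEMO_LEVEL_MASK         (((uint64_t)1 << DEMO_LEVEL_STRIDE) - 1)

/* Index into the page table at 'level' (1 == leaf) for a given DMA pfn. */
static inline unsigned int demo_pfn_level_offset(uint64_t pfn, int level)
{
        return (pfn >> ((level - 1) * DEMO_LEVEL_STRIDE)) & DEMO_LEVEL_MASK;
}

/*
 * Example: pfn 0x12345 decomposes into level-3 index 0x000, level-2 index
 * 0x091 and level-1 index 0x145, since 0x12345 == (0x91 << 9) | 0x145.
 */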
115 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
116 are never going to work. */
117 static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn)
119 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
121 static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn)
123 return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1;
125 static inline unsigned long page_to_dma_pfn(struct page *pg)
127 return mm_to_dma_pfn_start(page_to_pfn(pg));
129 static inline unsigned long virt_to_dma_pfn(void *p)
131 return page_to_dma_pfn(virt_to_page(p));
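/*
 * Illustrative sketch (not part of the driver): the MM-pfn to DMA-pfn
 * conversion above, shown for a hypothetical 16KiB kernel page size
 * (PAGE_SHIFT == 14) over fixed 4KiB VT-d pages. On x86 both shifts are 12,
 * so the conversion degenerates to the identity. demo_* is made up.
 */
#define DEMO_MM_PAGE_SHIFT      14      /* hypothetical MM page size  */
#define DEMO_VTD_PAGE_SHIFT     12      /* VT-d pages are always 4KiB */

static inline unsigned long demo_mm_to_dma_pfn_start(unsigned long mm_pfn)
{
        return mm_pfn << (DEMO_MM_PAGE_SHIFT - DEMO_VTD_PAGE_SHIFT);
}

static inline unsigned long demo_mm_to_dma_pfn_end(unsigned long mm_pfn)
{
        return ((mm_pfn + 1) << (DEMO_MM_PAGE_SHIFT - DEMO_VTD_PAGE_SHIFT)) - 1;
}

/* With these shifts, MM pfn 3 covers DMA pfns 12..15. */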
134 static void __init check_tylersburg_isoch(void);
135 static int rwbf_quirk;
138 * set to 1 to panic the kernel if VT-d can't successfully be enabled
139 * (used when the kernel is launched w/ TXT)
141 static int force_on = 0;
142 static int intel_iommu_tboot_noforce;
143 static int no_platform_optin;
145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
148 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
151 static phys_addr_t root_entry_lctp(struct root_entry *re)
156 return re->lo & VTD_PAGE_MASK;
160 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
163 static phys_addr_t root_entry_uctp(struct root_entry *re)
168 return re->hi & VTD_PAGE_MASK;
171 static inline void context_set_present(struct context_entry *context)
176 static inline void context_set_fault_enable(struct context_entry *context)
178 context->lo &= (((u64)-1) << 2) | 1;
181 static inline void context_set_translation_type(struct context_entry *context,
184 context->lo &= (((u64)-1) << 4) | 3;
185 context->lo |= (value & 3) << 2;
188 static inline void context_set_address_root(struct context_entry *context,
191 context->lo &= ~VTD_PAGE_MASK;
192 context->lo |= value & VTD_PAGE_MASK;
195 static inline void context_set_address_width(struct context_entry *context,
198 context->hi |= value & 7;
201 static inline void context_set_domain_id(struct context_entry *context,
204 context->hi |= (value & ((1 << 16) - 1)) << 8;
207 static inline void context_set_pasid(struct context_entry *context)
209 context->lo |= CONTEXT_PASIDE;
212 static inline int context_domain_id(struct context_entry *c)
214 return((c->hi >> 8) & 0xffff);
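/*
 * Illustrative sketch (not part of the driver): the legacy-mode context
 * entry layout that the setters above poke at, assuming a 4KiB page mask.
 * lo: P(0), FPD(1), T(3:2), ASR(63:12); hi: AW(2:0), DID(23:8).
 * demo_* is a hypothetical user-space rendering of those helpers combined.
 */
#include <stdint.h>

struct demo_context_entry {
        uint64_t lo;
        uint64_t hi;
};

static inline void demo_context_build(struct demo_context_entry *ce,
                                      uint64_t pgtable_phys, int agaw,
                                      uint16_t did, int translation)
{
        ce->lo  = pgtable_phys & ~0xfffULL;             /* address root (ASR)   */
        ce->lo |= ((uint64_t)translation & 3) << 2;     /* translation type (T) */
        ce->lo |= 1;                                    /* present (P)          */
        ce->hi  = (uint64_t)(agaw & 7);                 /* address width (AW)   */
        ce->hi |= ((uint64_t)did & 0xffff) << 8;        /* domain id (DID)      */
}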
217 static inline void context_clear_entry(struct context_entry *context)
223 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
225 if (!iommu->copied_tables)
228 return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
232 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
234 set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
238 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
240 clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
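/*
 * Illustrative sketch (not part of the driver): the copied-tables bitmap is
 * indexed by the 16-bit PCI source-id, (bus << 8) | devfn, so one segment
 * needs 65536 bits (8KiB). demo_* below is a hypothetical user-space
 * equivalent of the test_bit() lookup used above.
 */
#include <stdbool.h>
#include <stdint.h>

#define DEMO_BITS_PER_LONG      (8 * sizeof(unsigned long))

static inline bool demo_context_copied(const unsigned long *bitmap,
                                       uint8_t bus, uint8_t devfn)
{
        unsigned long sid = ((unsigned long)bus << 8) | devfn;

        return bitmap[sid / DEMO_BITS_PER_LONG] &
               (1UL << (sid % DEMO_BITS_PER_LONG));
}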
244 * This domain is a static identity-mapping domain.
245 * 1. This domain creates a static 1:1 mapping to all usable memory.
246 * 2. It maps to each iommu if successful.
247 * 3. Each iommu maps to this domain if successful.
249 static struct dmar_domain *si_domain;
250 static int hw_pass_through = 1;
252 struct dmar_rmrr_unit {
253 struct list_head list; /* list of rmrr units */
254 struct acpi_dmar_header *hdr; /* ACPI header */
255 u64 base_address; /* reserved base address*/
256 u64 end_address; /* reserved end address */
257 struct dmar_dev_scope *devices; /* target devices */
258 int devices_cnt; /* target device count */
261 struct dmar_atsr_unit {
262 struct list_head list; /* list of ATSR units */
263 struct acpi_dmar_header *hdr; /* ACPI header */
264 struct dmar_dev_scope *devices; /* target devices */
265 int devices_cnt; /* target device count */
266 u8 include_all:1; /* include all ports */
269 struct dmar_satc_unit {
270 struct list_head list; /* list of SATC units */
271 struct acpi_dmar_header *hdr; /* ACPI header */
272 struct dmar_dev_scope *devices; /* target devices */
273 struct intel_iommu *iommu; /* the corresponding iommu */
274 int devices_cnt; /* target device count */
275 u8 atc_required:1; /* ATS is required */
278 static LIST_HEAD(dmar_atsr_units);
279 static LIST_HEAD(dmar_rmrr_units);
280 static LIST_HEAD(dmar_satc_units);
282 #define for_each_rmrr_units(rmrr) \
283 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
285 static void intel_iommu_domain_free(struct iommu_domain *domain);
287 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
288 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
290 int intel_iommu_enabled = 0;
291 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
293 static int dmar_map_gfx = 1;
294 static int intel_iommu_superpage = 1;
295 static int iommu_identity_mapping;
296 static int iommu_skip_te_disable;
298 #define IDENTMAP_GFX 2
299 #define IDENTMAP_AZALIA 4
301 const struct iommu_ops intel_iommu_ops;
302 const struct iommu_dirty_ops intel_dirty_ops;
304 static bool translation_pre_enabled(struct intel_iommu *iommu)
306 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
309 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
311 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
314 static void init_translation_status(struct intel_iommu *iommu)
318 gsts = readl(iommu->reg + DMAR_GSTS_REG);
319 if (gsts & DMA_GSTS_TES)
320 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
323 static int __init intel_iommu_setup(char *str)
329 if (!strncmp(str, "on", 2)) {
331 pr_info("IOMMU enabled\n");
332 } else if (!strncmp(str, "off", 3)) {
334 no_platform_optin = 1;
335 pr_info("IOMMU disabled\n");
336 } else if (!strncmp(str, "igfx_off", 8)) {
338 pr_info("Disable GFX device mapping\n");
339 } else if (!strncmp(str, "forcedac", 8)) {
340 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
341 iommu_dma_forcedac = true;
342 } else if (!strncmp(str, "strict", 6)) {
343 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
344 iommu_set_dma_strict();
345 } else if (!strncmp(str, "sp_off", 6)) {
346 pr_info("Disable supported super page\n");
347 intel_iommu_superpage = 0;
348 } else if (!strncmp(str, "sm_on", 5)) {
349 pr_info("Enable scalable mode if hardware supports\n");
351 } else if (!strncmp(str, "sm_off", 6)) {
352 pr_info("Scalable mode is disallowed\n");
354 } else if (!strncmp(str, "tboot_noforce", 13)) {
355 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
356 intel_iommu_tboot_noforce = 1;
358 pr_notice("Unknown option - '%s'\n", str);
361 str += strcspn(str, ",");
368 __setup("intel_iommu=", intel_iommu_setup);
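/*
 * Example usage (illustrative, not taken from the source): the parser above
 * walks a comma-separated option list on the kernel command line, e.g.
 *
 *     intel_iommu=on,sm_on      enable VT-d and request scalable mode
 *     intel_iommu=off           opt out of DMA remapping altogether
 *     intel_iommu=on,igfx_off   enable VT-d but leave the GFX device unmapped
 *
 * Unrecognized tokens are reported via the "Unknown option" notice above.
 */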
370 void *alloc_pgtable_page(int node, gfp_t gfp)
375 page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
377 vaddr = page_address(page);
381 void free_pgtable_page(void *vaddr)
383 free_page((unsigned long)vaddr);
386 static inline int domain_type_is_si(struct dmar_domain *domain)
388 return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
391 static inline int domain_pfn_supported(struct dmar_domain *domain,
394 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
396 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
400 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
401 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
402 * the returned SAGAW.
404 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
406 unsigned long fl_sagaw, sl_sagaw;
408 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
409 sl_sagaw = cap_sagaw(iommu->cap);
411 /* Second level only. */
412 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
415 /* First level only. */
416 if (!ecap_slts(iommu->ecap))
419 return fl_sagaw & sl_sagaw;
422 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
427 sagaw = __iommu_calculate_sagaw(iommu);
428 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
429 if (test_bit(agaw, &sagaw))
437 * Calculate max SAGAW for each iommu.
439 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
441 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
445 * Calculate the agaw for each iommu.
446 * "SAGAW" may differ across iommus, so use a default agaw and fall back
447 * to a smaller supported agaw for iommus that don't support the default.
449 int iommu_calculate_agaw(struct intel_iommu *iommu)
451 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
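/*
 * Illustrative sketch (not part of the driver): choosing an AGAW from the
 * SAGAW capability bits, assuming the spec encoding where bit 2 means
 * 4-level (48-bit) and bit 3 means 5-level (57-bit) paging. demo_* is
 * hypothetical; it mirrors width_to_agaw() plus the walk-down above.
 */
static inline int demo_width_to_agaw(int width)
{
        return (width - 30 + 8) / 9;            /* DIV_ROUND_UP(width - 30, 9) */
}

static inline int demo_calculate_agaw(unsigned long sagaw, int max_gaw)
{
        int agaw;

        /* Walk down from the requested width to the widest supported AGAW. */
        for (agaw = demo_width_to_agaw(max_gaw); agaw >= 0; agaw--)
                if (sagaw & (1UL << agaw))
                        return agaw;

        return -1;
}

/* demo_calculate_agaw(0x4, 57) == 2: only 4-level paging is advertised. */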
454 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
456 return sm_supported(iommu) ?
457 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
460 static void domain_update_iommu_coherency(struct dmar_domain *domain)
462 struct iommu_domain_info *info;
463 struct dmar_drhd_unit *drhd;
464 struct intel_iommu *iommu;
468 domain->iommu_coherency = true;
469 xa_for_each(&domain->iommu_array, i, info) {
471 if (!iommu_paging_structure_coherency(info->iommu)) {
472 domain->iommu_coherency = false;
479 /* No hardware attached; use lowest common denominator */
481 for_each_active_iommu(iommu, drhd) {
482 if (!iommu_paging_structure_coherency(iommu)) {
483 domain->iommu_coherency = false;
490 static int domain_update_iommu_superpage(struct dmar_domain *domain,
491 struct intel_iommu *skip)
493 struct dmar_drhd_unit *drhd;
494 struct intel_iommu *iommu;
497 if (!intel_iommu_superpage)
500 /* set iommu_superpage to the smallest common denominator */
502 for_each_active_iommu(iommu, drhd) {
504 if (domain && domain->use_first_level) {
505 if (!cap_fl1gp_support(iommu->cap))
508 mask &= cap_super_page_val(iommu->cap);
520 static int domain_update_device_node(struct dmar_domain *domain)
522 struct device_domain_info *info;
523 int nid = NUMA_NO_NODE;
526 spin_lock_irqsave(&domain->lock, flags);
527 list_for_each_entry(info, &domain->devices, link) {
529 * There could be multiple device NUMA nodes, as devices within
530 * the same domain may sit behind different IOMMUs. There is no
531 * perfect answer in such a situation, so we use a first-come,
532 * first-served policy.
534 nid = dev_to_node(info->dev);
535 if (nid != NUMA_NO_NODE)
538 spin_unlock_irqrestore(&domain->lock, flags);
543 static void domain_update_iotlb(struct dmar_domain *domain);
545 /* Return the super pagesize bitmap if supported. */
546 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
548 unsigned long bitmap = 0;
551 * 1-level super page supports page size of 2MiB, 2-level super page
552 * supports page size of both 2MiB and 1GiB.
554 if (domain->iommu_superpage == 1)
556 else if (domain->iommu_superpage == 2)
557 bitmap |= SZ_2M | SZ_1G;
562 /* Some capabilities may be different across iommus */
563 void domain_update_iommu_cap(struct dmar_domain *domain)
565 domain_update_iommu_coherency(domain);
566 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
569 * If RHSA is missing, we should default to the device numa domain
572 if (domain->nid == NUMA_NO_NODE)
573 domain->nid = domain_update_device_node(domain);
576 * First-level translation restricts the input-address to a
577 * canonical address (i.e., address bits 63:N have the same
578 * value as address bit [N-1], where N is 48-bits with 4-level
579 * paging and 57-bits with 5-level paging). Hence, skip bit
582 if (domain->use_first_level)
583 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
585 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
587 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
588 domain_update_iotlb(domain);
591 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
594 struct root_entry *root = &iommu->root_entry[bus];
595 struct context_entry *context;
599 * Unless the caller requested to allocate a new entry,
600 * returning a copied context entry makes no sense.
602 if (!alloc && context_copied(iommu, bus, devfn))
606 if (sm_supported(iommu)) {
614 context = phys_to_virt(*entry & VTD_PAGE_MASK);
616 unsigned long phy_addr;
620 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
624 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
625 phy_addr = virt_to_phys((void *)context);
626 *entry = phy_addr | 1;
627 __iommu_flush_cache(iommu, entry, sizeof(*entry));
629 return &context[devfn];
633 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
634 * sub-hierarchy of a candidate PCI-PCI bridge
635 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
636 * @bridge: the candidate PCI-PCI bridge
638 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
641 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
643 struct pci_dev *pdev, *pbridge;
645 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
648 pdev = to_pci_dev(dev);
649 pbridge = to_pci_dev(bridge);
651 if (pbridge->subordinate &&
652 pbridge->subordinate->number <= pdev->bus->number &&
653 pbridge->subordinate->busn_res.end >= pdev->bus->number)
659 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
661 struct dmar_drhd_unit *drhd;
665 /* We know that this device on this chipset has its own IOMMU.
666 * If we find it under a different IOMMU, then the BIOS is lying
667 * to us. Hope that the IOMMU for this device is actually
668 * disabled, and it needs no translation...
670 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
673 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
678 /* we know that this iommu should be at offset 0xa000 from vtbar */
679 drhd = dmar_find_matched_drhd_unit(pdev);
680 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
681 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
682 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
689 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
691 if (!iommu || iommu->drhd->ignored)
694 if (dev_is_pci(dev)) {
695 struct pci_dev *pdev = to_pci_dev(dev);
697 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
698 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
699 quirk_ioat_snb_local_iommu(pdev))
706 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
708 struct dmar_drhd_unit *drhd = NULL;
709 struct pci_dev *pdev = NULL;
710 struct intel_iommu *iommu;
718 if (dev_is_pci(dev)) {
719 struct pci_dev *pf_pdev;
721 pdev = pci_real_dma_dev(to_pci_dev(dev));
723 /* VFs aren't listed in scope tables; we need to look up
724 * the PF instead to find the IOMMU. */
725 pf_pdev = pci_physfn(pdev);
727 segment = pci_domain_nr(pdev->bus);
728 } else if (has_acpi_companion(dev))
729 dev = &ACPI_COMPANION(dev)->dev;
732 for_each_iommu(iommu, drhd) {
733 if (pdev && segment != drhd->segment)
736 for_each_active_dev_scope(drhd->devices,
737 drhd->devices_cnt, i, tmp) {
739 /* For a VF use its original BDF# not that of the PF
740 * which we used for the IOMMU lookup. Strictly speaking
741 * we could do this for all PCI devices; we only need to
742 * get the BDF# from the scope table for ACPI matches. */
743 if (pdev && pdev->is_virtfn)
747 *bus = drhd->devices[i].bus;
748 *devfn = drhd->devices[i].devfn;
753 if (is_downstream_to_pci_bridge(dev, tmp))
757 if (pdev && drhd->include_all) {
760 *bus = pdev->bus->number;
761 *devfn = pdev->devfn;
768 if (iommu_is_dummy(iommu, dev))
776 static void domain_flush_cache(struct dmar_domain *domain,
777 void *addr, int size)
779 if (!domain->iommu_coherency)
780 clflush_cache_range(addr, size);
783 static void free_context_table(struct intel_iommu *iommu)
785 struct context_entry *context;
788 if (!iommu->root_entry)
791 for (i = 0; i < ROOT_ENTRY_NR; i++) {
792 context = iommu_context_addr(iommu, i, 0, 0);
794 free_pgtable_page(context);
796 if (!sm_supported(iommu))
799 context = iommu_context_addr(iommu, i, 0x80, 0);
801 free_pgtable_page(context);
804 free_pgtable_page(iommu->root_entry);
805 iommu->root_entry = NULL;
808 #ifdef CONFIG_DMAR_DEBUG
809 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
810 u8 bus, u8 devfn, struct dma_pte *parent, int level)
816 offset = pfn_level_offset(pfn, level);
817 pte = &parent[offset];
818 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
819 pr_info("PTE not present at level %d\n", level);
823 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
828 parent = phys_to_virt(dma_pte_addr(pte));
833 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
834 unsigned long long addr, u32 pasid)
836 struct pasid_dir_entry *dir, *pde;
837 struct pasid_entry *entries, *pte;
838 struct context_entry *ctx_entry;
839 struct root_entry *rt_entry;
840 int i, dir_index, index, level;
841 u8 devfn = source_id & 0xff;
842 u8 bus = source_id >> 8;
843 struct dma_pte *pgtable;
845 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
847 /* root entry dump */
848 rt_entry = &iommu->root_entry[bus];
850 pr_info("root table entry is not present\n");
854 if (sm_supported(iommu))
855 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
856 rt_entry->hi, rt_entry->lo);
858 pr_info("root entry: 0x%016llx", rt_entry->lo);
860 /* context entry dump */
861 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
863 pr_info("context table entry is not present\n");
867 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
868 ctx_entry->hi, ctx_entry->lo);
870 /* legacy mode does not require PASID entries */
871 if (!sm_supported(iommu)) {
872 level = agaw_to_level(ctx_entry->hi & 7);
873 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
877 /* get the pointer to pasid directory entry */
878 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
880 pr_info("pasid directory entry is not present\n");
883 /* For request-without-pasid, get the pasid from context entry */
884 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
885 pasid = IOMMU_NO_PASID;
887 dir_index = pasid >> PASID_PDE_SHIFT;
888 pde = &dir[dir_index];
889 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
891 /* get the pointer to the pasid table entry */
892 entries = get_pasid_table_from_pde(pde);
894 pr_info("pasid table entry is not present\n");
897 index = pasid & PASID_PTE_MASK;
898 pte = &entries[index];
899 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
900 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
902 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
903 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
904 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
906 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
907 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
911 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
915 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
916 unsigned long pfn, int *target_level,
919 struct dma_pte *parent, *pte;
920 int level = agaw_to_level(domain->agaw);
923 if (!domain_pfn_supported(domain, pfn))
924 /* Address beyond IOMMU's addressing capabilities. */
927 parent = domain->pgd;
932 offset = pfn_level_offset(pfn, level);
933 pte = &parent[offset];
934 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
936 if (level == *target_level)
939 if (!dma_pte_present(pte)) {
942 tmp_page = alloc_pgtable_page(domain->nid, gfp);
947 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
948 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
949 if (domain->use_first_level)
950 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
952 if (cmpxchg64(&pte->val, 0ULL, pteval))
953 /* Someone else set it while we were thinking; use theirs. */
954 free_pgtable_page(tmp_page);
956 domain_flush_cache(domain, pte, sizeof(*pte));
961 parent = phys_to_virt(dma_pte_addr(pte));
966 *target_level = level;
971 /* return address's pte at specific level */
972 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
974 int level, int *large_page)
976 struct dma_pte *parent, *pte;
977 int total = agaw_to_level(domain->agaw);
980 parent = domain->pgd;
981 while (level <= total) {
982 offset = pfn_level_offset(pfn, total);
983 pte = &parent[offset];
987 if (!dma_pte_present(pte)) {
992 if (dma_pte_superpage(pte)) {
997 parent = phys_to_virt(dma_pte_addr(pte));
1003 /* clear last level pte; a tlb flush should follow */
1004 static void dma_pte_clear_range(struct dmar_domain *domain,
1005 unsigned long start_pfn,
1006 unsigned long last_pfn)
1008 unsigned int large_page;
1009 struct dma_pte *first_pte, *pte;
1011 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1012 WARN_ON(start_pfn > last_pfn))
1015 /* we don't need lock here; nobody else touches the iova range */
1018 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1020 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1025 start_pfn += lvl_to_nr_pages(large_page);
1027 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1029 domain_flush_cache(domain, first_pte,
1030 (void *)pte - (void *)first_pte);
1032 } while (start_pfn && start_pfn <= last_pfn);
1035 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1036 int retain_level, struct dma_pte *pte,
1037 unsigned long pfn, unsigned long start_pfn,
1038 unsigned long last_pfn)
1040 pfn = max(start_pfn, pfn);
1041 pte = &pte[pfn_level_offset(pfn, level)];
1044 unsigned long level_pfn;
1045 struct dma_pte *level_pte;
1047 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1050 level_pfn = pfn & level_mask(level);
1051 level_pte = phys_to_virt(dma_pte_addr(pte));
1054 dma_pte_free_level(domain, level - 1, retain_level,
1055 level_pte, level_pfn, start_pfn,
1060 * Free the page table if we're below the level we want to
1061 * retain and the range covers the entire table.
1063 if (level < retain_level && !(start_pfn > level_pfn ||
1064 last_pfn < level_pfn + level_size(level) - 1)) {
1066 domain_flush_cache(domain, pte, sizeof(*pte));
1067 free_pgtable_page(level_pte);
1070 pfn += level_size(level);
1071 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1075 * clear last level (leaf) ptes and free page table pages below the
1076 * level we wish to keep intact.
1078 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1079 unsigned long start_pfn,
1080 unsigned long last_pfn,
1083 dma_pte_clear_range(domain, start_pfn, last_pfn);
1085 /* We don't need lock here; nobody else touches the iova range */
1086 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1087 domain->pgd, 0, start_pfn, last_pfn);
1090 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1091 free_pgtable_page(domain->pgd);
1096 /* When a page at a given level is being unlinked from its parent, we don't
1097 need to *modify* it at all. All we need to do is make a list of all the
1098 pages which can be freed just as soon as we've flushed the IOTLB and we
1099 know the hardware page-walk will no longer touch them.
1100 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1102 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1103 int level, struct dma_pte *pte,
1104 struct list_head *freelist)
1108 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1109 list_add_tail(&pg->lru, freelist);
1114 pte = page_address(pg);
1116 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1117 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1119 } while (!first_pte_in_page(pte));
1122 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1123 struct dma_pte *pte, unsigned long pfn,
1124 unsigned long start_pfn, unsigned long last_pfn,
1125 struct list_head *freelist)
1127 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1129 pfn = max(start_pfn, pfn);
1130 pte = &pte[pfn_level_offset(pfn, level)];
1133 unsigned long level_pfn = pfn & level_mask(level);
1135 if (!dma_pte_present(pte))
1138 /* If range covers entire pagetable, free it */
1139 if (start_pfn <= level_pfn &&
1140 last_pfn >= level_pfn + level_size(level) - 1) {
1141 /* These subordinate page tables are going away entirely. Don't
1142 bother to clear them; we're just going to *free* them. */
1143 if (level > 1 && !dma_pte_superpage(pte))
1144 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1150 } else if (level > 1) {
1151 /* Recurse down into a level that isn't *entirely* obsolete */
1152 dma_pte_clear_level(domain, level - 1,
1153 phys_to_virt(dma_pte_addr(pte)),
1154 level_pfn, start_pfn, last_pfn,
1158 pfn = level_pfn + level_size(level);
1159 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1162 domain_flush_cache(domain, first_pte,
1163 (void *)++last_pte - (void *)first_pte);
1166 /* We can't just free the pages because the IOMMU may still be walking
1167 the page tables, and may have cached the intermediate levels. The
1168 pages can only be freed after the IOTLB flush has been done. */
1169 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1170 unsigned long last_pfn, struct list_head *freelist)
1172 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1173 WARN_ON(start_pfn > last_pfn))
1176 /* we don't need lock here; nobody else touches the iova range */
1177 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1178 domain->pgd, 0, start_pfn, last_pfn, freelist);
1181 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1182 struct page *pgd_page = virt_to_page(domain->pgd);
1183 list_add_tail(&pgd_page->lru, freelist);
1188 /* iommu handling */
1189 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1191 struct root_entry *root;
1193 root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1195 pr_err("Allocating root entry for %s failed\n",
1200 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1201 iommu->root_entry = root;
1206 static void iommu_set_root_entry(struct intel_iommu *iommu)
1212 addr = virt_to_phys(iommu->root_entry);
1213 if (sm_supported(iommu))
1214 addr |= DMA_RTADDR_SMT;
1216 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1217 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1219 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1221 /* Make sure hardware completes it */
1222 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1223 readl, (sts & DMA_GSTS_RTPS), sts);
1225 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1228 * Hardware invalidates all DMA remapping hardware translation
1229 * caches as part of SRTP flow.
1231 if (cap_esrtps(iommu->cap))
1234 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1235 if (sm_supported(iommu))
1236 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1237 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1240 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1245 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1248 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1249 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1251 /* Make sure hardware completes it */
1252 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1253 readl, (!(val & DMA_GSTS_WBFS)), val);
1255 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1258 /* return value determines if we need a write buffer flush */
1259 static void __iommu_flush_context(struct intel_iommu *iommu,
1260 u16 did, u16 source_id, u8 function_mask,
1267 case DMA_CCMD_GLOBAL_INVL:
1268 val = DMA_CCMD_GLOBAL_INVL;
1270 case DMA_CCMD_DOMAIN_INVL:
1271 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1273 case DMA_CCMD_DEVICE_INVL:
1274 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1275 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1278 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1282 val |= DMA_CCMD_ICC;
1284 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1285 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1287 /* Make sure hardware completes it */
1288 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1289 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1291 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1294 /* return value determines if we need a write buffer flush */
1295 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1296 u64 addr, unsigned int size_order, u64 type)
1298 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1299 u64 val = 0, val_iva = 0;
1303 case DMA_TLB_GLOBAL_FLUSH:
1304 /* global flush doesn't need to set IVA_REG */
1305 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1307 case DMA_TLB_DSI_FLUSH:
1308 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1310 case DMA_TLB_PSI_FLUSH:
1311 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1312 /* IH bit is passed in as part of address */
1313 val_iva = size_order | addr;
1316 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1321 if (cap_write_drain(iommu->cap))
1322 val |= DMA_TLB_WRITE_DRAIN;
1324 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1325 /* Note: Only uses first TLB reg currently */
1327 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1328 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1330 /* Make sure hardware completes it */
1331 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1332 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1334 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1336 /* check IOTLB invalidation granularity */
1337 if (DMA_TLB_IAIG(val) == 0)
1338 pr_err("Flush IOTLB failed\n");
1339 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1340 pr_debug("TLB flush request %Lx, actual %Lx\n",
1341 (unsigned long long)DMA_TLB_IIRG(type),
1342 (unsigned long long)DMA_TLB_IAIG(val));
1345 static struct device_domain_info *
1346 domain_lookup_dev_info(struct dmar_domain *domain,
1347 struct intel_iommu *iommu, u8 bus, u8 devfn)
1349 struct device_domain_info *info;
1350 unsigned long flags;
1352 spin_lock_irqsave(&domain->lock, flags);
1353 list_for_each_entry(info, &domain->devices, link) {
1354 if (info->iommu == iommu && info->bus == bus &&
1355 info->devfn == devfn) {
1356 spin_unlock_irqrestore(&domain->lock, flags);
1360 spin_unlock_irqrestore(&domain->lock, flags);
1365 static void domain_update_iotlb(struct dmar_domain *domain)
1367 struct dev_pasid_info *dev_pasid;
1368 struct device_domain_info *info;
1369 bool has_iotlb_device = false;
1370 unsigned long flags;
1372 spin_lock_irqsave(&domain->lock, flags);
1373 list_for_each_entry(info, &domain->devices, link) {
1374 if (info->ats_enabled) {
1375 has_iotlb_device = true;
1380 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1381 info = dev_iommu_priv_get(dev_pasid->dev);
1382 if (info->ats_enabled) {
1383 has_iotlb_device = true;
1387 domain->has_iotlb_device = has_iotlb_device;
1388 spin_unlock_irqrestore(&domain->lock, flags);
1392 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1393 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1394 * check because it applies only to the built-in QAT devices and it doesn't
1395 * grant additional privileges.
1397 #define BUGGY_QAT_DEVID_MASK 0x4940
1398 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1400 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1403 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
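/*
 * Illustrative check (not part of the driver): masking off the two low bits
 * with 0xfffc collapses device IDs 0x4940..0x4943 onto BUGGY_QAT_DEVID_MASK,
 * which is exactly the range named in the comment above. demo_* is made up.
 */
#include <stdbool.h>
#include <stdint.h>

static inline bool demo_is_buggy_qat_devid(uint16_t device)
{
        return (device & 0xfffc) == 0x4940;     /* matches 0x4940, 41, 42, 43 */
}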
1409 static void iommu_enable_pci_caps(struct device_domain_info *info)
1411 struct pci_dev *pdev;
1413 if (!dev_is_pci(info->dev))
1416 pdev = to_pci_dev(info->dev);
1418 /* The PCIe spec, in its wisdom, declares that the behaviour of
1419 the device if you enable PASID support after ATS support is
1420 undefined. So always enable PASID support on devices which
1421 have it, even if we can't yet know if we're ever going to
1423 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1424 info->pasid_enabled = 1;
1426 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1427 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1428 info->ats_enabled = 1;
1429 domain_update_iotlb(info->domain);
1433 static void iommu_disable_pci_caps(struct device_domain_info *info)
1435 struct pci_dev *pdev;
1437 if (!dev_is_pci(info->dev))
1440 pdev = to_pci_dev(info->dev);
1442 if (info->ats_enabled) {
1443 pci_disable_ats(pdev);
1444 info->ats_enabled = 0;
1445 domain_update_iotlb(info->domain);
1448 if (info->pasid_enabled) {
1449 pci_disable_pasid(pdev);
1450 info->pasid_enabled = 0;
1454 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1455 u64 addr, unsigned int mask)
1459 if (!info || !info->ats_enabled)
1462 sid = info->bus << 8 | info->devfn;
1463 qdep = info->ats_qdep;
1464 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1466 quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1469 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1470 u64 addr, unsigned mask)
1472 struct dev_pasid_info *dev_pasid;
1473 struct device_domain_info *info;
1474 unsigned long flags;
1476 if (!domain->has_iotlb_device)
1479 spin_lock_irqsave(&domain->lock, flags);
1480 list_for_each_entry(info, &domain->devices, link)
1481 __iommu_flush_dev_iotlb(info, addr, mask);
1483 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1484 info = dev_iommu_priv_get(dev_pasid->dev);
1486 if (!info->ats_enabled)
1489 qi_flush_dev_iotlb_pasid(info->iommu,
1490 PCI_DEVID(info->bus, info->devfn),
1491 info->pfsid, dev_pasid->pasid,
1492 info->ats_qdep, addr,
1495 spin_unlock_irqrestore(&domain->lock, flags);
1498 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1499 struct dmar_domain *domain, u64 addr,
1500 unsigned long npages, bool ih)
1502 u16 did = domain_id_iommu(domain, iommu);
1503 struct dev_pasid_info *dev_pasid;
1504 unsigned long flags;
1506 spin_lock_irqsave(&domain->lock, flags);
1507 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1508 qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1510 if (!list_empty(&domain->devices))
1511 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1512 spin_unlock_irqrestore(&domain->lock, flags);
1515 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1516 struct dmar_domain *domain,
1517 unsigned long pfn, unsigned int pages,
1520 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1521 unsigned int mask = ilog2(aligned_pages);
1522 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1523 u16 did = domain_id_iommu(domain, iommu);
1525 if (WARN_ON(!pages))
1531 if (domain->use_first_level) {
1532 domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1534 unsigned long bitmask = aligned_pages - 1;
1537 * PSI masks the low order bits of the base address. If the
1538 * address isn't aligned to the mask, then compute a mask value
1539 * needed to ensure the target range is flushed.
1541 if (unlikely(bitmask & pfn)) {
1542 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1545 * Since end_pfn <= pfn + bitmask, the only way bits
1546 * higher than bitmask can differ in pfn and end_pfn is
1547 * by carrying. This means after masking out bitmask,
1548 * high bits starting with the first set bit in
1549 * shared_bits are all equal in both pfn and end_pfn.
1551 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1552 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1556 * Fall back to a domain-selective flush if there is no PSI support
1557 * or the size is too big.
1559 if (!cap_pgsel_inv(iommu->cap) ||
1560 mask > cap_max_amask_val(iommu->cap))
1561 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1564 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1569 * In caching mode, changes of pages from non-present to present require
1570 * flush. However, device IOTLB doesn't need to be flushed in this case.
1572 if (!cap_caching_mode(iommu->cap) || !map)
1573 iommu_flush_dev_iotlb(domain, addr, mask);
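/*
 * Illustrative sketch (not part of the driver): the PSI mask computation in
 * iommu_flush_iotlb_psi() above for an unaligned base pfn. demo_* is a
 * hypothetical user-space rendering of the same arithmetic; __ffs() and
 * __roundup_pow_of_two() are replaced by a compiler builtin and a loop.
 */
static inline unsigned int demo_psi_mask(unsigned long pfn, unsigned long pages)
{
        unsigned long aligned_pages = 1, bitmask, end_pfn, shared_bits;
        unsigned int mask;

        while (aligned_pages < pages)           /* __roundup_pow_of_two() */
                aligned_pages <<= 1;
        mask = __builtin_ctzl(aligned_pages);   /* ilog2() of a power of two */
        bitmask = aligned_pages - 1;

        if (!(bitmask & pfn))
                return mask;                    /* base is already aligned */

        end_pfn = pfn + pages - 1;
        shared_bits = ~(pfn ^ end_pfn) & ~bitmask;

        return shared_bits ? __builtin_ctzl(shared_bits) : 8 * sizeof(long);
}

/*
 * Example: pfn 0x1003, 2 pages. end_pfn is 0x1004, shared_bits is ...fff8,
 * so the mask becomes 3 and one aligned 8-page flush at 0x1000 covers the
 * whole 0x1003..0x1004 range.
 */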
1576 /* Notification for newly created mappings */
1577 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1578 struct dmar_domain *domain,
1579 unsigned long pfn, unsigned int pages)
1582 * It's a non-present to present mapping. Only flush if caching mode
1585 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1586 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1588 iommu_flush_write_buffer(iommu);
1591 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1593 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1594 struct iommu_domain_info *info;
1597 xa_for_each(&dmar_domain->iommu_array, idx, info) {
1598 struct intel_iommu *iommu = info->iommu;
1599 u16 did = domain_id_iommu(dmar_domain, iommu);
1601 if (dmar_domain->use_first_level)
1602 domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1604 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1607 if (!cap_caching_mode(iommu->cap))
1608 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1612 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1615 unsigned long flags;
1617 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1620 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1621 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1622 pmen &= ~DMA_PMEN_EPM;
1623 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1625 /* wait for the protected region status bit to clear */
1626 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1627 readl, !(pmen & DMA_PMEN_PRS), pmen);
1629 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1632 static void iommu_enable_translation(struct intel_iommu *iommu)
1635 unsigned long flags;
1637 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1638 iommu->gcmd |= DMA_GCMD_TE;
1639 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1641 /* Make sure hardware completes it */
1642 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1643 readl, (sts & DMA_GSTS_TES), sts);
1645 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1648 static void iommu_disable_translation(struct intel_iommu *iommu)
1653 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1654 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1657 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1658 iommu->gcmd &= ~DMA_GCMD_TE;
1659 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1661 /* Make sure hardware completes it */
1662 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1663 readl, (!(sts & DMA_GSTS_TES)), sts);
1665 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1668 static int iommu_init_domains(struct intel_iommu *iommu)
1672 ndomains = cap_ndoms(iommu->cap);
1673 pr_debug("%s: Number of Domains supported <%d>\n",
1674 iommu->name, ndomains);
1676 spin_lock_init(&iommu->lock);
1678 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1679 if (!iommu->domain_ids)
1683 * If Caching mode is set, then invalid translations are tagged
1684 * with domain-id 0, hence we need to pre-allocate it. We also
1685 * use domain-id 0 as a marker for non-allocated domain-id, so
1686 * make sure it is not used for a real domain.
1688 set_bit(0, iommu->domain_ids);
1691 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1692 * entry for first-level or pass-through translation modes should
1693 * be programmed with a domain id different from those used for
1694 * second-level or nested translation. We reserve a domain id for
1697 if (sm_supported(iommu))
1698 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1703 static void disable_dmar_iommu(struct intel_iommu *iommu)
1705 if (!iommu->domain_ids)
1709 * All iommu domains must have been detached from the devices,
1710 * hence there should be no domain IDs in use.
1712 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1713 > NUM_RESERVED_DID))
1716 if (iommu->gcmd & DMA_GCMD_TE)
1717 iommu_disable_translation(iommu);
1720 static void free_dmar_iommu(struct intel_iommu *iommu)
1722 if (iommu->domain_ids) {
1723 bitmap_free(iommu->domain_ids);
1724 iommu->domain_ids = NULL;
1727 if (iommu->copied_tables) {
1728 bitmap_free(iommu->copied_tables);
1729 iommu->copied_tables = NULL;
1732 /* free context mapping */
1733 free_context_table(iommu);
1735 #ifdef CONFIG_INTEL_IOMMU_SVM
1736 if (pasid_supported(iommu)) {
1737 if (ecap_prs(iommu->ecap))
1738 intel_svm_finish_prq(iommu);
1744 * Check and return whether first level is used by default for
1747 static bool first_level_by_default(unsigned int type)
1749 /* Only SL is available in legacy mode */
1750 if (!scalable_mode_support())
1753 /* Only level (either FL or SL) is available, just use it */
1754 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1755 return intel_cap_flts_sanity();
1757 /* Both levels are available, decide it based on domain type */
1758 return type != IOMMU_DOMAIN_UNMANAGED;
1761 static struct dmar_domain *alloc_domain(unsigned int type)
1763 struct dmar_domain *domain;
1765 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1769 domain->nid = NUMA_NO_NODE;
1770 if (first_level_by_default(type))
1771 domain->use_first_level = true;
1772 domain->has_iotlb_device = false;
1773 INIT_LIST_HEAD(&domain->devices);
1774 INIT_LIST_HEAD(&domain->dev_pasids);
1775 spin_lock_init(&domain->lock);
1776 xa_init(&domain->iommu_array);
1781 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1783 struct iommu_domain_info *info, *curr;
1784 unsigned long ndomains;
1785 int num, ret = -ENOSPC;
1787 info = kzalloc(sizeof(*info), GFP_KERNEL);
1791 spin_lock(&iommu->lock);
1792 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1795 spin_unlock(&iommu->lock);
1800 ndomains = cap_ndoms(iommu->cap);
1801 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1802 if (num >= ndomains) {
1803 pr_err("%s: No free domain ids\n", iommu->name);
1807 set_bit(num, iommu->domain_ids);
1810 info->iommu = iommu;
1811 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1812 NULL, info, GFP_ATOMIC);
1814 ret = xa_err(curr) ? : -EBUSY;
1817 domain_update_iommu_cap(domain);
1819 spin_unlock(&iommu->lock);
1823 clear_bit(info->did, iommu->domain_ids);
1825 spin_unlock(&iommu->lock);
1830 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1832 struct iommu_domain_info *info;
1834 spin_lock(&iommu->lock);
1835 info = xa_load(&domain->iommu_array, iommu->seq_id);
1836 if (--info->refcnt == 0) {
1837 clear_bit(info->did, iommu->domain_ids);
1838 xa_erase(&domain->iommu_array, iommu->seq_id);
1839 domain->nid = NUMA_NO_NODE;
1840 domain_update_iommu_cap(domain);
1843 spin_unlock(&iommu->lock);
1846 static inline int guestwidth_to_adjustwidth(int gaw)
1849 int r = (gaw - 12) % 9;
1860 static void domain_exit(struct dmar_domain *domain)
1863 LIST_HEAD(freelist);
1865 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1866 put_pages_list(&freelist);
1869 if (WARN_ON(!list_empty(&domain->devices)))
1876 * Get the PASID directory size for scalable mode context entry.
1877 * Value of X in the PDTS field of a scalable mode context entry
1878 * indicates PASID directory with 2^(X + 7) entries.
1880 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1882 unsigned long pds, max_pde;
1884 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1885 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
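/*
 * Illustrative sketch (not part of the driver): the PDTS encoding. A PDTS
 * value of X selects a PASID directory with 2^(X + 7) entries, and each
 * directory entry covers 2^6 PASIDs (PASID_PDE_SHIFT == 6). For the full
 * 20-bit PASID space that gives 2^14 directory entries, i.e. PDTS == 7.
 * demo_* is hypothetical and only handles power-of-two max_pasid values.
 */
#include <stdint.h>

static inline unsigned int demo_context_pdts(uint32_t max_pasid)
{
        uint32_t max_pde = max_pasid >> 6;              /* PASID_PDE_SHIFT */
        unsigned int bit = max_pde ? __builtin_ctz(max_pde) : 0;

        return bit < 7 ? 0 : bit - 7;   /* directory holds 2^(PDTS + 7) PDEs */
}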
1893 * Set the RID_PASID field of a scalable mode context entry. The
1894 * IOMMU hardware will use the PASID value set in this field for
1895 * DMA translations of DMA requests without PASID.
1898 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1900 context->hi |= pasid & ((1 << 20) - 1);
1904 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1907 static inline void context_set_sm_dte(struct context_entry *context)
1909 context->lo |= BIT_ULL(2);
1913 * Set the PRE(Page Request Enable) field of a scalable mode context
1916 static inline void context_set_sm_pre(struct context_entry *context)
1918 context->lo |= BIT_ULL(4);
1921 /* Convert value to context PASID directory size field coding. */
1922 #define context_pdts(pds) (((pds) & 0x7) << 9)
1924 static int domain_context_mapping_one(struct dmar_domain *domain,
1925 struct intel_iommu *iommu,
1926 struct pasid_table *table,
1929 struct device_domain_info *info =
1930 domain_lookup_dev_info(domain, iommu, bus, devfn);
1931 u16 did = domain_id_iommu(domain, iommu);
1932 int translation = CONTEXT_TT_MULTI_LEVEL;
1933 struct context_entry *context;
1936 if (hw_pass_through && domain_type_is_si(domain))
1937 translation = CONTEXT_TT_PASS_THROUGH;
1939 pr_debug("Set context mapping for %02x:%02x.%d\n",
1940 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1942 spin_lock(&iommu->lock);
1944 context = iommu_context_addr(iommu, bus, devfn, 1);
1949 if (context_present(context) && !context_copied(iommu, bus, devfn))
1953 * For kdump cases, old valid entries may be cached due to the
1954 * in-flight DMA and copied pgtable, but there is no unmapping
1955 * behaviour for them, thus we need an explicit cache flush for
1956 * the newly-mapped device. For kdump, at this point, the device
1957 * is supposed to finish reset at its driver probe stage, so no
1958 * in-flight DMA will exist, and we don't need to worry anymore
1961 if (context_copied(iommu, bus, devfn)) {
1962 u16 did_old = context_domain_id(context);
1964 if (did_old < cap_ndoms(iommu->cap)) {
1965 iommu->flush.flush_context(iommu, did_old,
1966 (((u16)bus) << 8) | devfn,
1967 DMA_CCMD_MASK_NOBIT,
1968 DMA_CCMD_DEVICE_INVL);
1969 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1973 clear_context_copied(iommu, bus, devfn);
1976 context_clear_entry(context);
1978 if (sm_supported(iommu)) {
1981 /* Setup the PASID DIR pointer: */
1982 pds = context_get_sm_pds(table);
1983 context->lo = (u64)virt_to_phys(table->table) |
1986 /* Setup the RID_PASID field: */
1987 context_set_sm_rid2pasid(context, IOMMU_NO_PASID);
1990 * Setup the Device-TLB enable bit and Page request
1993 if (info && info->ats_supported)
1994 context_set_sm_dte(context);
1995 if (info && info->pri_supported)
1996 context_set_sm_pre(context);
1997 if (info && info->pasid_supported)
1998 context_set_pasid(context);
2000 struct dma_pte *pgd = domain->pgd;
2003 context_set_domain_id(context, did);
2005 if (translation != CONTEXT_TT_PASS_THROUGH) {
2007 * Skip top levels of page tables for iommu which has
2008 * less agaw than default. Unnecessary for PT mode.
2010 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2012 pgd = phys_to_virt(dma_pte_addr(pgd));
2013 if (!dma_pte_present(pgd))
2017 if (info && info->ats_supported)
2018 translation = CONTEXT_TT_DEV_IOTLB;
2020 translation = CONTEXT_TT_MULTI_LEVEL;
2022 context_set_address_root(context, virt_to_phys(pgd));
2023 context_set_address_width(context, agaw);
2026 * In pass through mode, AW must be programmed to
2027 * indicate the largest AGAW value supported by
2028 * hardware. And ASR is ignored by hardware.
2030 context_set_address_width(context, iommu->msagaw);
2033 context_set_translation_type(context, translation);
2036 context_set_fault_enable(context);
2037 context_set_present(context);
2038 if (!ecap_coherent(iommu->ecap))
2039 clflush_cache_range(context, sizeof(*context));
2042 * It's a non-present to present mapping. If hardware doesn't cache
2043 * non-present entries we only need to flush the write-buffer. If it
2044 * _does_ cache non-present entries, then it does so in the special
2045 * domain #0, which we have to flush:
2047 if (cap_caching_mode(iommu->cap)) {
2048 iommu->flush.flush_context(iommu, 0,
2049 (((u16)bus) << 8) | devfn,
2050 DMA_CCMD_MASK_NOBIT,
2051 DMA_CCMD_DEVICE_INVL);
2052 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2054 iommu_flush_write_buffer(iommu);
2060 spin_unlock(&iommu->lock);
2065 struct domain_context_mapping_data {
2066 struct dmar_domain *domain;
2067 struct intel_iommu *iommu;
2068 struct pasid_table *table;
2071 static int domain_context_mapping_cb(struct pci_dev *pdev,
2072 u16 alias, void *opaque)
2074 struct domain_context_mapping_data *data = opaque;
2076 return domain_context_mapping_one(data->domain, data->iommu,
2077 data->table, PCI_BUS_NUM(alias),
2082 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2084 struct domain_context_mapping_data data;
2085 struct pasid_table *table;
2086 struct intel_iommu *iommu;
2089 iommu = device_to_iommu(dev, &bus, &devfn);
2093 table = intel_pasid_get_table(dev);
2095 if (!dev_is_pci(dev))
2096 return domain_context_mapping_one(domain, iommu, table,
2099 data.domain = domain;
2103 return pci_for_each_dma_alias(to_pci_dev(dev),
2104 &domain_context_mapping_cb, &data);
2107 /* Returns a number of VTD pages, but aligned to MM page size */
2108 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2111 host_addr &= ~PAGE_MASK;
2112 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
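/*
 * Illustrative sketch (not part of the driver): counting VT-d pages for a
 * possibly unaligned request, assuming 4KiB pages on both sides. demo_* is
 * hypothetical; PAGE_ALIGN() becomes an explicit round-up by 0xfff.
 */
static inline unsigned long demo_aligned_nrpages(unsigned long host_addr,
                                                 unsigned long size)
{
        host_addr &= 0xfffUL;                   /* offset within the MM page */
        return (host_addr + size + 0xfffUL) >> 12;      /* round up to 4KiB */
}

/* A 0x2000-byte mapping starting at page offset 0x234 needs 3 VT-d pages. */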
2115 /* Return largest possible superpage level for a given mapping */
2116 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2117 unsigned long iov_pfn,
2118 unsigned long phy_pfn,
2119 unsigned long pages)
2121 int support, level = 1;
2122 unsigned long pfnmerge;
2124 support = domain->iommu_superpage;
2126 /* To use a large page, the virtual *and* physical addresses
2127 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2128 of them will mean we have to use smaller pages. So just
2129 merge them and check both at once. */
2130 pfnmerge = iov_pfn | phy_pfn;
2132 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2133 pages >>= VTD_STRIDE_SHIFT;
2136 pfnmerge >>= VTD_STRIDE_SHIFT;
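/*
 * Illustrative sketch (not part of the driver): the superpage-level pick
 * above, assuming a 9-bit stride (VTD_STRIDE_SHIFT == 9). Both pfns must be
 * 512-aligned and at least 512 pages must remain before a 2MiB (level 2)
 * page can be used; another factor of 512 is needed for 1GiB (level 3).
 * demo_* is a hypothetical user-space rendering of the same loop.
 */
static inline int demo_largepage_level(unsigned long iov_pfn,
                                       unsigned long phy_pfn,
                                       unsigned long pages, int support)
{
        unsigned long pfnmerge = iov_pfn | phy_pfn;
        int level = 1;

        while (support && !(pfnmerge & 0x1ff)) {
                pages >>= 9;
                if (!pages)
                        break;
                pfnmerge >>= 9;
                level++;
                support--;
        }

        return level;
}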
2144 * Ensure that old small page tables are removed to make room for superpage(s).
2145 * We're going to add new large pages, so make sure we don't remove their parent
2146 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2148 static void switch_to_super_page(struct dmar_domain *domain,
2149 unsigned long start_pfn,
2150 unsigned long end_pfn, int level)
2152 unsigned long lvl_pages = lvl_to_nr_pages(level);
2153 struct iommu_domain_info *info;
2154 struct dma_pte *pte = NULL;
2157 while (start_pfn <= end_pfn) {
2159 pte = pfn_to_dma_pte(domain, start_pfn, &level,
2162 if (dma_pte_present(pte)) {
2163 dma_pte_free_pagetable(domain, start_pfn,
2164 start_pfn + lvl_pages - 1,
2167 xa_for_each(&domain->iommu_array, i, info)
2168 iommu_flush_iotlb_psi(info->iommu, domain,
2169 start_pfn, lvl_pages,
2174 start_pfn += lvl_pages;
2175 if (first_pte_in_page(pte))
2181 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2182 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2185 struct dma_pte *first_pte = NULL, *pte = NULL;
2186 unsigned int largepage_lvl = 0;
2187 unsigned long lvl_pages = 0;
2191 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2194 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2197 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
2198 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
2202 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2203 attr |= DMA_FL_PTE_PRESENT;
2204 if (domain->use_first_level) {
2205 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2206 if (prot & DMA_PTE_WRITE)
2207 attr |= DMA_FL_PTE_DIRTY;
2210 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2212 while (nr_pages > 0) {
2216 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2217 phys_pfn, nr_pages);
2219 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2225 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2227 /* It is a large page */
2228 if (largepage_lvl > 1) {
2229 unsigned long end_pfn;
2230 unsigned long pages_to_remove;
2232 pteval |= DMA_PTE_LARGE_PAGE;
2233 pages_to_remove = min_t(unsigned long, nr_pages,
2234 nr_pte_to_next_page(pte) * lvl_pages);
2235 end_pfn = iov_pfn + pages_to_remove - 1;
2236 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2238 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2242 /* We don't need lock here, nobody else
2243 * touches the iova range
2245 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2247 static int dumps = 5;
2248 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2249 iov_pfn, tmp, (unsigned long long)pteval);
2252 debug_dma_dump_mappings(NULL);
2257 nr_pages -= lvl_pages;
2258 iov_pfn += lvl_pages;
2259 phys_pfn += lvl_pages;
2260 pteval += lvl_pages * VTD_PAGE_SIZE;
2262 /* If the next PTE would be the first in a new page, then we
2263 * need to flush the cache on the entries we've just written.
2264 * And then we'll need to recalculate 'pte', so clear it and
2265 * let it get set again in the if (!pte) block above.
2267 * If we're done (!nr_pages) we need to flush the cache too.
2269 * Also if we've been setting superpages, we may need to
2270 * recalculate 'pte' and switch back to smaller pages for the
2271 * end of the mapping, if the trailing size is not enough to
2272 * use another superpage (i.e. nr_pages < lvl_pages).
2275 if (!nr_pages || first_pte_in_page(pte) ||
2276 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2277 domain_flush_cache(domain, first_pte,
2278 (void *)pte - (void *)first_pte);
2286 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2288 struct intel_iommu *iommu = info->iommu;
2289 struct context_entry *context;
2295 spin_lock(&iommu->lock);
2296 context = iommu_context_addr(iommu, bus, devfn, 0);
2298 spin_unlock(&iommu->lock);
2302 if (sm_supported(iommu)) {
2303 if (hw_pass_through && domain_type_is_si(info->domain))
2304 did_old = FLPT_DEFAULT_DID;
2306 did_old = domain_id_iommu(info->domain, iommu);
2308 did_old = context_domain_id(context);
2311 context_clear_entry(context);
2312 __iommu_flush_cache(iommu, context, sizeof(*context));
2313 spin_unlock(&iommu->lock);
2314 iommu->flush.flush_context(iommu,
2316 (((u16)bus) << 8) | devfn,
2317 DMA_CCMD_MASK_NOBIT,
2318 DMA_CCMD_DEVICE_INVL);
2320 if (sm_supported(iommu))
2321 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2323 iommu->flush.flush_iotlb(iommu,
2329 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2332 static int domain_setup_first_level(struct intel_iommu *iommu,
2333 struct dmar_domain *domain,
2337 struct dma_pte *pgd = domain->pgd;
2342 * Skip top levels of page tables for an IOMMU which has
2343 * less agaw than the default. Unnecessary for PT mode.
2345 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2346 pgd = phys_to_virt(dma_pte_addr(pgd));
2347 if (!dma_pte_present(pgd))
2351 level = agaw_to_level(agaw);
2352 if (level != 4 && level != 5)
2356 flags |= PASID_FLAG_FL5LP;
2358 if (domain->force_snooping)
2359 flags |= PASID_FLAG_PAGE_SNOOP;
2361 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2362 domain_id_iommu(domain, iommu),
2366 static bool dev_is_real_dma_subdevice(struct device *dev)
2368 return dev && dev_is_pci(dev) &&
2369 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2372 static int iommu_domain_identity_map(struct dmar_domain *domain,
2373 unsigned long first_vpfn,
2374 unsigned long last_vpfn)
2377 * RMRR range might have overlap with physical memory range; clear it first.
2380 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2382 return __domain_mapping(domain, first_vpfn,
2383 first_vpfn, last_vpfn - first_vpfn + 1,
2384 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
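/*
 * Worked example (illustrative, assuming 4KiB VT-d pages): identity-mapping
 * the physical range [0x10000000, 0x1fffffff] translates to
 *
 *	iommu_domain_identity_map(domain, 0x10000, 0x1ffff);
 *
 * i.e. 0x10000 PTEs set up so that IOVA == physical address, which is how
 * si_domain_init() below covers each usable memory range.
 */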
2387 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2389 static int __init si_domain_init(int hw)
2391 struct dmar_rmrr_unit *rmrr;
2395 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2399 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2400 domain_exit(si_domain);
2408 for_each_online_node(nid) {
2409 unsigned long start_pfn, end_pfn;
2412 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2413 ret = iommu_domain_identity_map(si_domain,
2414 mm_to_dma_pfn_start(start_pfn),
2415 mm_to_dma_pfn_end(end_pfn));
2422 * Identity map the RMRRs so that devices with RMRRs could also use the si_domain.
2425 for_each_rmrr_units(rmrr) {
2426 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2428 unsigned long long start = rmrr->base_address;
2429 unsigned long long end = rmrr->end_address;
2431 if (WARN_ON(end < start ||
2432 end >> agaw_to_width(si_domain->agaw)))
2435 ret = iommu_domain_identity_map(si_domain,
2436 mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2437 mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2446 static int dmar_domain_attach_device(struct dmar_domain *domain,
2449 struct device_domain_info *info = dev_iommu_priv_get(dev);
2450 struct intel_iommu *iommu;
2451 unsigned long flags;
2455 iommu = device_to_iommu(dev, &bus, &devfn);
2459 ret = domain_attach_iommu(domain, iommu);
2462 info->domain = domain;
2463 spin_lock_irqsave(&domain->lock, flags);
2464 list_add(&info->link, &domain->devices);
2465 spin_unlock_irqrestore(&domain->lock, flags);
2467 /* PASID table is mandatory for a PCI device in scalable mode. */
2468 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2469 /* Setup the PASID entry for requests without PASID: */
2470 if (hw_pass_through && domain_type_is_si(domain))
2471 ret = intel_pasid_setup_pass_through(iommu, domain,
2472 dev, IOMMU_NO_PASID);
2473 else if (domain->use_first_level)
2474 ret = domain_setup_first_level(iommu, domain, dev,
2477 ret = intel_pasid_setup_second_level(iommu, domain,
2478 dev, IOMMU_NO_PASID);
2480 dev_err(dev, "Setup RID2PASID failed\n");
2481 device_block_translation(dev);
2486 ret = domain_context_mapping(domain, dev);
2488 dev_err(dev, "Domain context map failed\n");
2489 device_block_translation(dev);
2493 iommu_enable_pci_caps(info);
2499 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2500 * is relaxable (i.e. is allowed not to be enforced under some conditions)
2501 * @dev: device handle
2503 * We assume that PCI USB devices with RMRRs have them largely
2504 * for historical reasons and that the RMRR space is not actively used post
2505 * boot. This exclusion may change if vendors begin to abuse it.
2507 * The same exception is made for graphics devices, with the requirement that
2508 * any use of the RMRR regions will be torn down before assigning the device to a guest.
2511 * Return: true if the RMRR is relaxable, false otherwise
2513 static bool device_rmrr_is_relaxable(struct device *dev)
2515 struct pci_dev *pdev;
2517 if (!dev_is_pci(dev))
2520 pdev = to_pci_dev(dev);
2521 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2528 * Return the required default domain type for a specific device.
2530 * @dev: the device in query
2531 * @startup: true if this is during early boot
2534 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2535 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2536 * - 0: both identity and dynamic domains work for this device
2538 static int device_def_domain_type(struct device *dev)
2540 if (dev_is_pci(dev)) {
2541 struct pci_dev *pdev = to_pci_dev(dev);
2543 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2544 return IOMMU_DOMAIN_IDENTITY;
2546 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2547 return IOMMU_DOMAIN_IDENTITY;
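/*
 * Illustrative note: IDENTMAP_AZALIA and IDENTMAP_GFX are set by quirks and
 * workarounds (for instance, the CONFIG_INTEL_IOMMU_BROKEN_GFX_WA block in
 * init_dmars() below sets IDENTMAP_GFX), so with such a workaround active an
 * integrated graphics device resolves to
 *
 *	device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY
 *
 * and ends up in the static identity (si) domain.
 */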
2553 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2556 * Start from a sane IOMMU hardware state.
2557 * If queued invalidation was already initialized by us
2558 * (for example, while enabling interrupt remapping), then
2559 * things are already rolling from a sane state.
2563 * Clear any previous faults.
2565 dmar_fault(-1, iommu);
2567 * Disable queued invalidation if supported and already enabled
2568 * before OS handover.
2570 dmar_disable_qi(iommu);
2573 if (dmar_enable_qi(iommu)) {
2575 * Queued Invalidate not enabled, use Register Based Invalidate
2577 iommu->flush.flush_context = __iommu_flush_context;
2578 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2579 pr_info("%s: Using Register based invalidation\n",
2582 iommu->flush.flush_context = qi_flush_context;
2583 iommu->flush.flush_iotlb = qi_flush_iotlb;
2584 pr_info("%s: Using Queued invalidation\n", iommu->name);
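/*
 * Illustrative sketch: after intel_iommu_init_qi() the rest of the driver
 * invalidates through the selected callbacks without caring which mechanism
 * was chosen, e.g.
 *
 *	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
 *	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
 *
 * as iommu_flush_all() below does for a global flush.
 */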
2588 static int copy_context_table(struct intel_iommu *iommu,
2589 struct root_entry *old_re,
2590 struct context_entry **tbl,
2593 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2594 struct context_entry *new_ce = NULL, ce;
2595 struct context_entry *old_ce = NULL;
2596 struct root_entry re;
2597 phys_addr_t old_ce_phys;
2599 tbl_idx = ext ? bus * 2 : bus;
2600 memcpy(&re, old_re, sizeof(re));
2602 for (devfn = 0; devfn < 256; devfn++) {
2603 /* First calculate the correct index */
2604 idx = (ext ? devfn * 2 : devfn) % 256;
2607 /* First save what we may have and clean up */
2609 tbl[tbl_idx] = new_ce;
2610 __iommu_flush_cache(iommu, new_ce,
2620 old_ce_phys = root_entry_lctp(&re);
2622 old_ce_phys = root_entry_uctp(&re);
2625 if (ext && devfn == 0) {
2626 /* No LCTP, try UCTP */
2635 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2640 new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2647 /* Now copy the context entry */
2648 memcpy(&ce, old_ce + idx, sizeof(ce));
2650 if (!context_present(&ce))
2653 did = context_domain_id(&ce);
2654 if (did >= 0 && did < cap_ndoms(iommu->cap))
2655 set_bit(did, iommu->domain_ids);
2657 set_context_copied(iommu, bus, devfn);
2661 tbl[tbl_idx + pos] = new_ce;
2663 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2672 static int copy_translation_tables(struct intel_iommu *iommu)
2674 struct context_entry **ctxt_tbls;
2675 struct root_entry *old_rt;
2676 phys_addr_t old_rt_phys;
2677 int ctxt_table_entries;
2682 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2683 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2684 new_ext = !!sm_supported(iommu);
2687 * The RTT bit can only be changed when translation is disabled,
2688 * but disabling translation means opening a window for data
2689 * corruption. So bail out and don't copy anything if we would
2690 * have to change the bit.
2695 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2696 if (!iommu->copied_tables)
2699 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2703 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2707 /* This is too big for the stack - allocate it from slab */
2708 ctxt_table_entries = ext ? 512 : 256;
2710 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2714 for (bus = 0; bus < 256; bus++) {
2715 ret = copy_context_table(iommu, &old_rt[bus],
2716 ctxt_tbls, bus, ext);
2718 pr_err("%s: Failed to copy context table for bus %d\n",
2724 spin_lock(&iommu->lock);
2726 /* Context tables are copied, now write them to the root_entry table */
2727 for (bus = 0; bus < 256; bus++) {
2728 int idx = ext ? bus * 2 : bus;
2731 if (ctxt_tbls[idx]) {
2732 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2733 iommu->root_entry[bus].lo = val;
2736 if (!ext || !ctxt_tbls[idx + 1])
2739 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2740 iommu->root_entry[bus].hi = val;
2743 spin_unlock(&iommu->lock);
2747 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
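/*
 * Illustrative sketch: when the previous kernel used a scalable-mode root
 * table (ext), each bus owns two copied context tables and the loop above
 * publishes them roughly as
 *
 *	iommu->root_entry[bus].lo = virt_to_phys(ctxt_tbls[2 * bus]) | 1;
 *	iommu->root_entry[bus].hi = virt_to_phys(ctxt_tbls[2 * bus + 1]) | 1;
 *
 * with bit 0 acting as the present bit; legacy mode only fills .lo.
 */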
2757 static int __init init_dmars(void)
2759 struct dmar_drhd_unit *drhd;
2760 struct intel_iommu *iommu;
2763 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2767 for_each_iommu(iommu, drhd) {
2768 if (drhd->ignored) {
2769 iommu_disable_translation(iommu);
2774 * Find the max pasid size of all IOMMUs in the system.
2775 * We need to ensure the system pasid table is no bigger
2776 * than the smallest supported.
2778 if (pasid_supported(iommu)) {
2779 u32 temp = 2 << ecap_pss(iommu->ecap);
2781 intel_pasid_max_id = min_t(u32, temp,
2782 intel_pasid_max_id);
2785 intel_iommu_init_qi(iommu);
2787 ret = iommu_init_domains(iommu);
2791 init_translation_status(iommu);
2793 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2794 iommu_disable_translation(iommu);
2795 clear_translation_pre_enabled(iommu);
2796 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2802 * we could share the same root & context tables
2803 * among all IOMMUs; need to split them later.
2805 ret = iommu_alloc_root_entry(iommu);
2809 if (translation_pre_enabled(iommu)) {
2810 pr_info("Translation already enabled - trying to copy translation structures\n");
2812 ret = copy_translation_tables(iommu);
2815 * We found the IOMMU with translation
2816 * enabled - but failed to copy over the
2817 * old root-entry table. Try to proceed
2818 * by disabling translation now and
2819 * allocating a clean root-entry table.
2820 * This might cause DMAR faults, but
2821 * probably the dump will still succeed.
2823 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2825 iommu_disable_translation(iommu);
2826 clear_translation_pre_enabled(iommu);
2828 pr_info("Copied translation tables from previous kernel for %s\n",
2833 if (!ecap_pass_through(iommu->ecap))
2834 hw_pass_through = 0;
2835 intel_svm_check(iommu);
2839 * Now that qi is enabled on all iommus, set the root entry and flush
2840 * caches. This is required on some Intel X58 chipsets, otherwise the
2841 * flush_context function will loop forever and the boot hangs.
2843 for_each_active_iommu(iommu, drhd) {
2844 iommu_flush_write_buffer(iommu);
2845 iommu_set_root_entry(iommu);
2848 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2853 iommu_identity_mapping |= IDENTMAP_GFX;
2855 check_tylersburg_isoch();
2857 ret = si_domain_init(hw_pass_through);
2864 * global invalidate context cache
2865 * global invalidate iotlb
2866 * enable translation
2868 for_each_iommu(iommu, drhd) {
2869 if (drhd->ignored) {
2871 * we always have to disable PMRs or DMA may fail on this device
2875 iommu_disable_protect_mem_regions(iommu);
2879 iommu_flush_write_buffer(iommu);
2881 #ifdef CONFIG_INTEL_IOMMU_SVM
2882 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2884 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2885 * could cause a lock race condition.
2887 up_write(&dmar_global_lock);
2888 ret = intel_svm_enable_prq(iommu);
2889 down_write(&dmar_global_lock);
2894 ret = dmar_set_interrupt(iommu);
2902 for_each_active_iommu(iommu, drhd) {
2903 disable_dmar_iommu(iommu);
2904 free_dmar_iommu(iommu);
2907 domain_exit(si_domain);
2914 static void __init init_no_remapping_devices(void)
2916 struct dmar_drhd_unit *drhd;
2920 for_each_drhd_unit(drhd) {
2921 if (!drhd->include_all) {
2922 for_each_active_dev_scope(drhd->devices,
2923 drhd->devices_cnt, i, dev)
2925 /* ignore DMAR unit if no devices exist */
2926 if (i == drhd->devices_cnt)
2931 for_each_active_drhd_unit(drhd) {
2932 if (drhd->include_all)
2935 for_each_active_dev_scope(drhd->devices,
2936 drhd->devices_cnt, i, dev)
2937 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2939 if (i < drhd->devices_cnt)
2942 /* This IOMMU has *only* gfx devices. Either bypass it or
2943 set the gfx_mapped flag, as appropriate */
2944 drhd->gfx_dedicated = 1;
2950 #ifdef CONFIG_SUSPEND
2951 static int init_iommu_hw(void)
2953 struct dmar_drhd_unit *drhd;
2954 struct intel_iommu *iommu = NULL;
2957 for_each_active_iommu(iommu, drhd) {
2959 ret = dmar_reenable_qi(iommu);
2965 for_each_iommu(iommu, drhd) {
2966 if (drhd->ignored) {
2968 * we always have to disable PMRs or DMA may fail on this device
2972 iommu_disable_protect_mem_regions(iommu);
2976 iommu_flush_write_buffer(iommu);
2977 iommu_set_root_entry(iommu);
2978 iommu_enable_translation(iommu);
2979 iommu_disable_protect_mem_regions(iommu);
2985 static void iommu_flush_all(void)
2987 struct dmar_drhd_unit *drhd;
2988 struct intel_iommu *iommu;
2990 for_each_active_iommu(iommu, drhd) {
2991 iommu->flush.flush_context(iommu, 0, 0, 0,
2992 DMA_CCMD_GLOBAL_INVL);
2993 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2994 DMA_TLB_GLOBAL_FLUSH);
2998 static int iommu_suspend(void)
3000 struct dmar_drhd_unit *drhd;
3001 struct intel_iommu *iommu = NULL;
3006 for_each_active_iommu(iommu, drhd) {
3007 iommu_disable_translation(iommu);
3009 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3011 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3012 readl(iommu->reg + DMAR_FECTL_REG);
3013 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3014 readl(iommu->reg + DMAR_FEDATA_REG);
3015 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3016 readl(iommu->reg + DMAR_FEADDR_REG);
3017 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3018 readl(iommu->reg + DMAR_FEUADDR_REG);
3020 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3025 static void iommu_resume(void)
3027 struct dmar_drhd_unit *drhd;
3028 struct intel_iommu *iommu = NULL;
3031 if (init_iommu_hw()) {
3033 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3035 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3039 for_each_active_iommu(iommu, drhd) {
3041 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3043 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3044 iommu->reg + DMAR_FECTL_REG);
3045 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3046 iommu->reg + DMAR_FEDATA_REG);
3047 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3048 iommu->reg + DMAR_FEADDR_REG);
3049 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3050 iommu->reg + DMAR_FEUADDR_REG);
3052 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3056 static struct syscore_ops iommu_syscore_ops = {
3057 .resume = iommu_resume,
3058 .suspend = iommu_suspend,
3061 static void __init init_iommu_pm_ops(void)
3063 register_syscore_ops(&iommu_syscore_ops);
3067 static inline void init_iommu_pm_ops(void) {}
3068 #endif /* CONFIG_PM */
3070 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3072 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3073 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3074 rmrr->end_address <= rmrr->base_address ||
3075 arch_rmrr_sanity_check(rmrr))
3081 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3083 struct acpi_dmar_reserved_memory *rmrr;
3084 struct dmar_rmrr_unit *rmrru;
3086 rmrr = (struct acpi_dmar_reserved_memory *)header;
3087 if (rmrr_sanity_check(rmrr)) {
3089 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3090 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3091 rmrr->base_address, rmrr->end_address,
3092 dmi_get_system_info(DMI_BIOS_VENDOR),
3093 dmi_get_system_info(DMI_BIOS_VERSION),
3094 dmi_get_system_info(DMI_PRODUCT_VERSION));
3095 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3098 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3102 rmrru->hdr = header;
3104 rmrru->base_address = rmrr->base_address;
3105 rmrru->end_address = rmrr->end_address;
3107 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3108 ((void *)rmrr) + rmrr->header.length,
3109 &rmrru->devices_cnt);
3110 if (rmrru->devices_cnt && rmrru->devices == NULL)
3113 list_add(&rmrru->list, &dmar_rmrr_units);
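/*
 * Worked example (illustrative, assuming 4KiB pages) for rmrr_sanity_check()
 * above: an RMRR of [0x000e0000, 0x000effff] passes (base and end + 1 are
 * page aligned and end > base), while [0x000e0000, 0x000e07ff] trips the
 * check because 0xe0800 is not page aligned, producing the broken-BIOS
 * warning and firmware taint above.
 */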
3122 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3124 struct dmar_atsr_unit *atsru;
3125 struct acpi_dmar_atsr *tmp;
3127 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3129 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3130 if (atsr->segment != tmp->segment)
3132 if (atsr->header.length != tmp->header.length)
3134 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3141 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3143 struct acpi_dmar_atsr *atsr;
3144 struct dmar_atsr_unit *atsru;
3146 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3149 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3150 atsru = dmar_find_atsr(atsr);
3154 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3159 * If memory is allocated from slab by ACPI _DSM method, we need to
3160 * copy the memory content because the memory buffer will be freed on exit.
3163 atsru->hdr = (void *)(atsru + 1);
3164 memcpy(atsru->hdr, hdr, hdr->length);
3165 atsru->include_all = atsr->flags & 0x1;
3166 if (!atsru->include_all) {
3167 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3168 (void *)atsr + atsr->header.length,
3169 &atsru->devices_cnt);
3170 if (atsru->devices_cnt && atsru->devices == NULL) {
3176 list_add_rcu(&atsru->list, &dmar_atsr_units);
3181 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3183 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3187 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3189 struct acpi_dmar_atsr *atsr;
3190 struct dmar_atsr_unit *atsru;
3192 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3193 atsru = dmar_find_atsr(atsr);
3195 list_del_rcu(&atsru->list);
3197 intel_iommu_free_atsr(atsru);
3203 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3207 struct acpi_dmar_atsr *atsr;
3208 struct dmar_atsr_unit *atsru;
3210 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3211 atsru = dmar_find_atsr(atsr);
3215 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3216 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3224 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3226 struct dmar_satc_unit *satcu;
3227 struct acpi_dmar_satc *tmp;
3229 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3231 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3232 if (satc->segment != tmp->segment)
3234 if (satc->header.length != tmp->header.length)
3236 if (memcmp(satc, tmp, satc->header.length) == 0)
3243 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3245 struct acpi_dmar_satc *satc;
3246 struct dmar_satc_unit *satcu;
3248 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3251 satc = container_of(hdr, struct acpi_dmar_satc, header);
3252 satcu = dmar_find_satc(satc);
3256 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3260 satcu->hdr = (void *)(satcu + 1);
3261 memcpy(satcu->hdr, hdr, hdr->length);
3262 satcu->atc_required = satc->flags & 0x1;
3263 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3264 (void *)satc + satc->header.length,
3265 &satcu->devices_cnt);
3266 if (satcu->devices_cnt && !satcu->devices) {
3270 list_add_rcu(&satcu->list, &dmar_satc_units);
3275 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3278 struct intel_iommu *iommu = dmaru->iommu;
3280 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3284 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3285 pr_warn("%s: Doesn't support hardware pass through.\n",
3290 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3291 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3292 pr_warn("%s: Doesn't support large page.\n",
3298 * Disable translation if already enabled prior to OS handover.
3300 if (iommu->gcmd & DMA_GCMD_TE)
3301 iommu_disable_translation(iommu);
3303 ret = iommu_init_domains(iommu);
3305 ret = iommu_alloc_root_entry(iommu);
3309 intel_svm_check(iommu);
3311 if (dmaru->ignored) {
3313 * we always have to disable PMRs or DMA may fail on this device
3316 iommu_disable_protect_mem_regions(iommu);
3320 intel_iommu_init_qi(iommu);
3321 iommu_flush_write_buffer(iommu);
3323 #ifdef CONFIG_INTEL_IOMMU_SVM
3324 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3325 ret = intel_svm_enable_prq(iommu);
3330 ret = dmar_set_interrupt(iommu);
3334 iommu_set_root_entry(iommu);
3335 iommu_enable_translation(iommu);
3337 iommu_disable_protect_mem_regions(iommu);
3341 disable_dmar_iommu(iommu);
3343 free_dmar_iommu(iommu);
3347 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3350 struct intel_iommu *iommu = dmaru->iommu;
3352 if (!intel_iommu_enabled)
3358 ret = intel_iommu_add(dmaru);
3360 disable_dmar_iommu(iommu);
3361 free_dmar_iommu(iommu);
3367 static void intel_iommu_free_dmars(void)
3369 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3370 struct dmar_atsr_unit *atsru, *atsr_n;
3371 struct dmar_satc_unit *satcu, *satc_n;
3373 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3374 list_del(&rmrru->list);
3375 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3379 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3380 list_del(&atsru->list);
3381 intel_iommu_free_atsr(atsru);
3383 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3384 list_del(&satcu->list);
3385 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3390 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3392 struct dmar_satc_unit *satcu;
3393 struct acpi_dmar_satc *satc;
3397 dev = pci_physfn(dev);
3400 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3401 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3402 if (satc->segment != pci_domain_nr(dev->bus))
3404 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3405 if (to_pci_dev(tmp) == dev)
3414 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3417 struct pci_bus *bus;
3418 struct pci_dev *bridge = NULL;
3420 struct acpi_dmar_atsr *atsr;
3421 struct dmar_atsr_unit *atsru;
3422 struct dmar_satc_unit *satcu;
3424 dev = pci_physfn(dev);
3425 satcu = dmar_find_matched_satc_unit(dev);
3428 * This device supports ATS as it is in the SATC table.
3429 * When the IOMMU is in legacy mode, ATS is enabled
3430 * automatically by HW for a device that requires it,
3431 * hence the OS should not enable ATS for this device
3432 * to avoid duplicate TLB invalidations.
3434 return !(satcu->atc_required && !sm_supported(iommu));
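/*
 * Illustrative truth table for the expression above:
 *
 *	atc_required  sm_supported(iommu)  ->  ATS reported as supported
 *	     0                 0                yes
 *	     0                 1                yes
 *	     1                 0                no  (legacy mode: HW owns ATS)
 *	     1                 1                yes
 */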
3436 for (bus = dev->bus; bus; bus = bus->parent) {
3438 /* If it's an integrated device, allow ATS */
3441 /* Connected via non-PCIe: no ATS */
3442 if (!pci_is_pcie(bridge) ||
3443 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3445 /* If we found the root port, look it up in the ATSR */
3446 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3451 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3452 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3453 if (atsr->segment != pci_domain_nr(dev->bus))
3456 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3457 if (tmp == &bridge->dev)
3460 if (atsru->include_all)
3470 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3473 struct dmar_rmrr_unit *rmrru;
3474 struct dmar_atsr_unit *atsru;
3475 struct dmar_satc_unit *satcu;
3476 struct acpi_dmar_atsr *atsr;
3477 struct acpi_dmar_reserved_memory *rmrr;
3478 struct acpi_dmar_satc *satc;
3480 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3483 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3484 rmrr = container_of(rmrru->hdr,
3485 struct acpi_dmar_reserved_memory, header);
3486 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3487 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3488 ((void *)rmrr) + rmrr->header.length,
3489 rmrr->segment, rmrru->devices,
3490 rmrru->devices_cnt);
3493 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3494 dmar_remove_dev_scope(info, rmrr->segment,
3495 rmrru->devices, rmrru->devices_cnt);
3499 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3500 if (atsru->include_all)
3503 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3504 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3505 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3506 (void *)atsr + atsr->header.length,
3507 atsr->segment, atsru->devices,
3508 atsru->devices_cnt);
3513 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3514 if (dmar_remove_dev_scope(info, atsr->segment,
3515 atsru->devices, atsru->devices_cnt))
3519 list_for_each_entry(satcu, &dmar_satc_units, list) {
3520 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3521 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3522 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3523 (void *)satc + satc->header.length,
3524 satc->segment, satcu->devices,
3525 satcu->devices_cnt);
3530 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3531 if (dmar_remove_dev_scope(info, satc->segment,
3532 satcu->devices, satcu->devices_cnt))
3540 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3541 unsigned long val, void *v)
3543 struct memory_notify *mhp = v;
3544 unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3545 unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3549 case MEM_GOING_ONLINE:
3550 if (iommu_domain_identity_map(si_domain,
3551 start_vpfn, last_vpfn)) {
3552 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3553 start_vpfn, last_vpfn);
3559 case MEM_CANCEL_ONLINE:
3561 struct dmar_drhd_unit *drhd;
3562 struct intel_iommu *iommu;
3563 LIST_HEAD(freelist);
3565 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3568 for_each_active_iommu(iommu, drhd)
3569 iommu_flush_iotlb_psi(iommu, si_domain,
3570 start_vpfn, mhp->nr_pages,
3571 list_empty(&freelist), 0);
3573 put_pages_list(&freelist);
3581 static struct notifier_block intel_iommu_memory_nb = {
3582 .notifier_call = intel_iommu_memory_notifier,
3586 static void intel_disable_iommus(void)
3588 struct intel_iommu *iommu = NULL;
3589 struct dmar_drhd_unit *drhd;
3591 for_each_iommu(iommu, drhd)
3592 iommu_disable_translation(iommu);
3595 void intel_iommu_shutdown(void)
3597 struct dmar_drhd_unit *drhd;
3598 struct intel_iommu *iommu = NULL;
3600 if (no_iommu || dmar_disabled)
3603 down_write(&dmar_global_lock);
3605 /* Disable PMRs explicitly here. */
3606 for_each_iommu(iommu, drhd)
3607 iommu_disable_protect_mem_regions(iommu);
3609 /* Make sure the IOMMUs are switched off */
3610 intel_disable_iommus();
3612 up_write(&dmar_global_lock);
3615 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3617 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3619 return container_of(iommu_dev, struct intel_iommu, iommu);
3622 static ssize_t version_show(struct device *dev,
3623 struct device_attribute *attr, char *buf)
3625 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3626 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3627 return sysfs_emit(buf, "%d:%d\n",
3628 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3630 static DEVICE_ATTR_RO(version);
3632 static ssize_t address_show(struct device *dev,
3633 struct device_attribute *attr, char *buf)
3635 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3636 return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3638 static DEVICE_ATTR_RO(address);
3640 static ssize_t cap_show(struct device *dev,
3641 struct device_attribute *attr, char *buf)
3643 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3644 return sysfs_emit(buf, "%llx\n", iommu->cap);
3646 static DEVICE_ATTR_RO(cap);
3648 static ssize_t ecap_show(struct device *dev,
3649 struct device_attribute *attr, char *buf)
3651 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3652 return sysfs_emit(buf, "%llx\n", iommu->ecap);
3654 static DEVICE_ATTR_RO(ecap);
3656 static ssize_t domains_supported_show(struct device *dev,
3657 struct device_attribute *attr, char *buf)
3659 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3660 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3662 static DEVICE_ATTR_RO(domains_supported);
3664 static ssize_t domains_used_show(struct device *dev,
3665 struct device_attribute *attr, char *buf)
3667 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3668 return sysfs_emit(buf, "%d\n",
3669 bitmap_weight(iommu->domain_ids,
3670 cap_ndoms(iommu->cap)));
3672 static DEVICE_ATTR_RO(domains_used);
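/*
 * Usage note (illustrative): these attributes are grouped under the
 * "intel-iommu" name registered below and typically show up as
 * /sys/class/iommu/dmar<N>/intel-iommu/{version,address,cap,ecap,
 * domains_supported,domains_used}; e.g. reading domains_used reports the
 * number of domain IDs currently allocated on that IOMMU.
 */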
3674 static struct attribute *intel_iommu_attrs[] = {
3675 &dev_attr_version.attr,
3676 &dev_attr_address.attr,
3678 &dev_attr_ecap.attr,
3679 &dev_attr_domains_supported.attr,
3680 &dev_attr_domains_used.attr,
3684 static struct attribute_group intel_iommu_group = {
3685 .name = "intel-iommu",
3686 .attrs = intel_iommu_attrs,
3689 const struct attribute_group *intel_iommu_groups[] = {
3694 static inline bool has_external_pci(void)
3696 struct pci_dev *pdev = NULL;
3698 for_each_pci_dev(pdev)
3699 if (pdev->external_facing) {
3707 static int __init platform_optin_force_iommu(void)
3709 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3712 if (no_iommu || dmar_disabled)
3713 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3716 * If Intel-IOMMU is disabled by default, we will apply identity
3717 * map for all devices except those marked as being untrusted.
3720 iommu_set_default_passthrough(false);
3728 static int __init probe_acpi_namespace_devices(void)
3730 struct dmar_drhd_unit *drhd;
3731 /* To avoid a -Wunused-but-set-variable warning. */
3732 struct intel_iommu *iommu __maybe_unused;
3736 for_each_active_iommu(iommu, drhd) {
3737 for_each_active_dev_scope(drhd->devices,
3738 drhd->devices_cnt, i, dev) {
3739 struct acpi_device_physical_node *pn;
3740 struct acpi_device *adev;
3742 if (dev->bus != &acpi_bus_type)
3745 adev = to_acpi_device(dev);
3746 mutex_lock(&adev->physical_node_lock);
3747 list_for_each_entry(pn,
3748 &adev->physical_node_list, node) {
3749 ret = iommu_probe_device(pn->dev);
3753 mutex_unlock(&adev->physical_node_lock);
3763 static __init int tboot_force_iommu(void)
3765 if (!tboot_enabled())
3768 if (no_iommu || dmar_disabled)
3769 pr_warn("Forcing Intel-IOMMU to enabled\n");
3777 int __init intel_iommu_init(void)
3780 struct dmar_drhd_unit *drhd;
3781 struct intel_iommu *iommu;
3784 * Intel IOMMU is required for a TXT/tboot launch or platform
3785 * opt in, so enforce that.
3787 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3788 platform_optin_force_iommu();
3790 down_write(&dmar_global_lock);
3791 if (dmar_table_init()) {
3793 panic("tboot: Failed to initialize DMAR table\n");
3797 if (dmar_dev_scope_init() < 0) {
3799 panic("tboot: Failed to initialize DMAR device scope\n");
3803 up_write(&dmar_global_lock);
3806 * The bus notifier takes the dmar_global_lock, so lockdep will
3807 * complain later when we register it under the lock.
3809 dmar_register_bus_notifier();
3811 down_write(&dmar_global_lock);
3814 intel_iommu_debugfs_init();
3816 if (no_iommu || dmar_disabled) {
3818 * We exit the function here to ensure IOMMU's remapping and
3819 * mempool aren't setup, which means that the IOMMU's PMRs
3820 * won't be disabled via the call to init_dmars(). So disable
3821 * it explicitly here. The PMRs were setup by tboot prior to
3822 * calling SENTER, but the kernel is expected to reset/tear down the PMRs.
3825 if (intel_iommu_tboot_noforce) {
3826 for_each_iommu(iommu, drhd)
3827 iommu_disable_protect_mem_regions(iommu);
3831 * Make sure the IOMMUs are switched off, even when we
3832 * boot into a kexec kernel and the previous kernel left them enabled.
3835 intel_disable_iommus();
3839 if (list_empty(&dmar_rmrr_units))
3840 pr_info("No RMRR found\n");
3842 if (list_empty(&dmar_atsr_units))
3843 pr_info("No ATSR found\n");
3845 if (list_empty(&dmar_satc_units))
3846 pr_info("No SATC found\n");
3848 init_no_remapping_devices();
3853 panic("tboot: Failed to initialize DMARs\n");
3854 pr_err("Initialization failed\n");
3857 up_write(&dmar_global_lock);
3859 init_iommu_pm_ops();
3861 down_read(&dmar_global_lock);
3862 for_each_active_iommu(iommu, drhd) {
3864 * The flush queue implementation does not perform
3865 * page-selective invalidations that are required for efficient
3866 * TLB flushes in virtual environments. The benefit of batching
3867 * is likely to be much lower than the overhead of synchronizing
3868 * the virtual and physical IOMMU page-tables.
3870 if (cap_caching_mode(iommu->cap) &&
3871 !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3872 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3873 iommu_set_dma_strict();
3875 iommu_device_sysfs_add(&iommu->iommu, NULL,
3878 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3880 iommu_pmu_register(iommu);
3882 up_read(&dmar_global_lock);
3884 if (si_domain && !hw_pass_through)
3885 register_memory_notifier(&intel_iommu_memory_nb);
3887 down_read(&dmar_global_lock);
3888 if (probe_acpi_namespace_devices())
3889 pr_warn("ACPI name space devices didn't probe correctly\n");
3891 /* Finally, we enable the DMA remapping hardware. */
3892 for_each_iommu(iommu, drhd) {
3893 if (!drhd->ignored && !translation_pre_enabled(iommu))
3894 iommu_enable_translation(iommu);
3896 iommu_disable_protect_mem_regions(iommu);
3898 up_read(&dmar_global_lock);
3900 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3902 intel_iommu_enabled = 1;
3907 intel_iommu_free_dmars();
3908 up_write(&dmar_global_lock);
3912 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3914 struct device_domain_info *info = opaque;
3916 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3921 * NB - intel-iommu lacks any sort of reference counting for the users of
3922 * dependent devices. If multiple endpoints have intersecting dependent
3923 * devices, unbinding the driver from any one of them will possibly leave
3924 * the others unable to operate.
3926 static void domain_context_clear(struct device_domain_info *info)
3928 if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
3931 pci_for_each_dma_alias(to_pci_dev(info->dev),
3932 &domain_context_clear_one_cb, info);
3935 static void dmar_remove_one_dev_info(struct device *dev)
3937 struct device_domain_info *info = dev_iommu_priv_get(dev);
3938 struct dmar_domain *domain = info->domain;
3939 struct intel_iommu *iommu = info->iommu;
3940 unsigned long flags;
3942 if (!dev_is_real_dma_subdevice(info->dev)) {
3943 if (dev_is_pci(info->dev) && sm_supported(iommu))
3944 intel_pasid_tear_down_entry(iommu, info->dev,
3945 IOMMU_NO_PASID, false);
3947 iommu_disable_pci_caps(info);
3948 domain_context_clear(info);
3951 spin_lock_irqsave(&domain->lock, flags);
3952 list_del(&info->link);
3953 spin_unlock_irqrestore(&domain->lock, flags);
3955 domain_detach_iommu(domain, iommu);
3956 info->domain = NULL;
3960 * Clear the page table pointer in context or pasid table entries so that
3961 * all DMA requests without PASID from the device are blocked. If the page
3962 * table has been set, clean up the data structures.
3964 void device_block_translation(struct device *dev)
3966 struct device_domain_info *info = dev_iommu_priv_get(dev);
3967 struct intel_iommu *iommu = info->iommu;
3968 unsigned long flags;
3970 iommu_disable_pci_caps(info);
3971 if (!dev_is_real_dma_subdevice(dev)) {
3972 if (sm_supported(iommu))
3973 intel_pasid_tear_down_entry(iommu, dev,
3974 IOMMU_NO_PASID, false);
3976 domain_context_clear(info);
3982 spin_lock_irqsave(&info->domain->lock, flags);
3983 list_del(&info->link);
3984 spin_unlock_irqrestore(&info->domain->lock, flags);
3986 domain_detach_iommu(info->domain, iommu);
3987 info->domain = NULL;
3990 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3994 /* calculate AGAW */
3995 domain->gaw = guest_width;
3996 adjust_width = guestwidth_to_adjustwidth(guest_width);
3997 domain->agaw = width_to_agaw(adjust_width);
3999 domain->iommu_coherency = false;
4000 domain->iommu_superpage = 0;
4001 domain->max_addr = 0;
4003 /* always allocate the top pgd */
4004 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4007 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
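/*
 * Worked example (illustrative, assuming guestwidth_to_adjustwidth() leaves
 * a 48-bit width unchanged): width_to_agaw(48) = DIV_ROUND_UP(48 - 30, 9) = 2,
 * which corresponds to a 4-level page table (agaw_to_level(2) == 4) spanning
 * a 48-bit address space (agaw_to_width(2) == 48).
 */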
4011 static int blocking_domain_attach_dev(struct iommu_domain *domain,
4014 device_block_translation(dev);
4018 static struct iommu_domain blocking_domain = {
4019 .ops = &(const struct iommu_domain_ops) {
4020 .attach_dev = blocking_domain_attach_dev,
4021 .free = intel_iommu_domain_free
4025 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4027 struct dmar_domain *dmar_domain;
4028 struct iommu_domain *domain;
4031 case IOMMU_DOMAIN_BLOCKED:
4032 return &blocking_domain;
4033 case IOMMU_DOMAIN_DMA:
4034 case IOMMU_DOMAIN_UNMANAGED:
4035 dmar_domain = alloc_domain(type);
4037 pr_err("Can't allocate dmar_domain\n");
4040 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4041 pr_err("Domain initialization failed\n");
4042 domain_exit(dmar_domain);
4046 domain = &dmar_domain->domain;
4047 domain->geometry.aperture_start = 0;
4048 domain->geometry.aperture_end =
4049 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4050 domain->geometry.force_aperture = true;
4053 case IOMMU_DOMAIN_IDENTITY:
4054 return &si_domain->domain;
4055 case IOMMU_DOMAIN_SVA:
4056 return intel_svm_domain_alloc();
4064 static struct iommu_domain *
4065 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
4066 struct iommu_domain *parent,
4067 const struct iommu_user_data *user_data)
4069 struct device_domain_info *info = dev_iommu_priv_get(dev);
4070 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
4071 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
4072 struct intel_iommu *iommu = info->iommu;
4073 struct iommu_domain *domain;
4075 /* Must be NESTING domain */
4077 if (!nested_supported(iommu) || flags)
4078 return ERR_PTR(-EOPNOTSUPP);
4079 return intel_nested_domain_alloc(parent, user_data);
4083 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
4084 return ERR_PTR(-EOPNOTSUPP);
4085 if (nested_parent && !nested_supported(iommu))
4086 return ERR_PTR(-EOPNOTSUPP);
4087 if (user_data || (dirty_tracking && !ssads_supported(iommu)))
4088 return ERR_PTR(-EOPNOTSUPP);
4091 * The domain_alloc_user op needs to fully initialize a domain before
4092 * returning, so use iommu_domain_alloc() here for simplicity.
4094 domain = iommu_domain_alloc(dev->bus);
4096 return ERR_PTR(-ENOMEM);
4099 to_dmar_domain(domain)->nested_parent = true;
4101 if (dirty_tracking) {
4102 if (to_dmar_domain(domain)->use_first_level) {
4103 iommu_domain_free(domain);
4104 return ERR_PTR(-EOPNOTSUPP);
4106 domain->dirty_ops = &intel_dirty_ops;
4112 static void intel_iommu_domain_free(struct iommu_domain *domain)
4114 if (domain != &si_domain->domain && domain != &blocking_domain)
4115 domain_exit(to_dmar_domain(domain));
4118 int prepare_domain_attach_device(struct iommu_domain *domain,
4121 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4122 struct intel_iommu *iommu;
4125 iommu = device_to_iommu(dev, NULL, NULL);
4129 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4132 if (domain->dirty_ops && !ssads_supported(iommu))
4135 /* check if this iommu agaw is sufficient for max mapped address */
4136 addr_width = agaw_to_width(iommu->agaw);
4137 if (addr_width > cap_mgaw(iommu->cap))
4138 addr_width = cap_mgaw(iommu->cap);
4140 if (dmar_domain->max_addr > (1LL << addr_width))
4142 dmar_domain->gaw = addr_width;
4145 * Knock out extra levels of page tables if necessary
4147 while (iommu->agaw < dmar_domain->agaw) {
4148 struct dma_pte *pte;
4150 pte = dmar_domain->pgd;
4151 if (dma_pte_present(pte)) {
4152 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4153 free_pgtable_page(pte);
4155 dmar_domain->agaw--;
4161 static int intel_iommu_attach_device(struct iommu_domain *domain,
4164 struct device_domain_info *info = dev_iommu_priv_get(dev);
4168 device_block_translation(dev);
4170 ret = prepare_domain_attach_device(domain, dev);
4174 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4177 static int intel_iommu_map(struct iommu_domain *domain,
4178 unsigned long iova, phys_addr_t hpa,
4179 size_t size, int iommu_prot, gfp_t gfp)
4181 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4185 if (iommu_prot & IOMMU_READ)
4186 prot |= DMA_PTE_READ;
4187 if (iommu_prot & IOMMU_WRITE)
4188 prot |= DMA_PTE_WRITE;
4189 if (dmar_domain->set_pte_snp)
4190 prot |= DMA_PTE_SNP;
4192 max_addr = iova + size;
4193 if (dmar_domain->max_addr < max_addr) {
4196 /* check if minimum agaw is sufficient for mapped address */
4197 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4198 if (end < max_addr) {
4199 pr_err("%s: iommu width (%d) is not "
4200 "sufficient for the mapped address (%llx)\n",
4201 __func__, dmar_domain->gaw, max_addr);
4204 dmar_domain->max_addr = max_addr;
4206 /* Round up size to next multiple of PAGE_SIZE, if it and
4207 the low bits of hpa would take us onto the next page */
4208 size = aligned_nrpages(hpa, size);
4209 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4210 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
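/*
 * Worked example (illustrative, assuming 4KiB VT-d pages): hpa = 0x1234 and
 * size = 0x2000 touch three page frames (0x1000, 0x2000 and 0x3000), so
 * aligned_nrpages(hpa, size) evaluates to 3 and __domain_mapping() is asked
 * for three PTEs even though size alone is only two pages.
 */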
4213 static int intel_iommu_map_pages(struct iommu_domain *domain,
4214 unsigned long iova, phys_addr_t paddr,
4215 size_t pgsize, size_t pgcount,
4216 int prot, gfp_t gfp, size_t *mapped)
4218 unsigned long pgshift = __ffs(pgsize);
4219 size_t size = pgcount << pgshift;
4222 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4225 if (!IS_ALIGNED(iova | paddr, pgsize))
4228 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4235 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4236 unsigned long iova, size_t size,
4237 struct iommu_iotlb_gather *gather)
4239 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4240 unsigned long start_pfn, last_pfn;
4243 /* Cope with horrid API which requires us to unmap more than the
4244 size argument if it happens to be a large-page mapping. */
4245 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4246 &level, GFP_ATOMIC)))
4249 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4250 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4252 start_pfn = iova >> VTD_PAGE_SHIFT;
4253 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4255 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4257 if (dmar_domain->max_addr == iova + size)
4258 dmar_domain->max_addr = iova;
4261 * We do not use page-selective IOTLB invalidation in flush queue,
4262 * so there is no need to track page and sync iotlb.
4264 if (!iommu_iotlb_gather_queued(gather))
4265 iommu_iotlb_gather_add_page(domain, gather, iova, size);
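/*
 * Worked example (illustrative): a request to unmap 4KiB that happens to be
 * backed by a level-2 superpage gets its size bumped to
 * VTD_PAGE_SIZE << level_to_offset_bits(2) = 2MiB, so the whole superpage is
 * unmapped and the enlarged size is what the caller sees returned.
 */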
4270 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4272 size_t pgsize, size_t pgcount,
4273 struct iommu_iotlb_gather *gather)
4275 unsigned long pgshift = __ffs(pgsize);
4276 size_t size = pgcount << pgshift;
4278 return intel_iommu_unmap(domain, iova, size, gather);
4281 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4282 struct iommu_iotlb_gather *gather)
4284 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4285 unsigned long iova_pfn = IOVA_PFN(gather->start);
4286 size_t size = gather->end - gather->start;
4287 struct iommu_domain_info *info;
4288 unsigned long start_pfn;
4289 unsigned long nrpages;
4292 nrpages = aligned_nrpages(gather->start, size);
4293 start_pfn = mm_to_dma_pfn_start(iova_pfn);
4295 xa_for_each(&dmar_domain->iommu_array, i, info)
4296 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4298 list_empty(&gather->freelist), 0);
4300 put_pages_list(&gather->freelist);
4303 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4306 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4307 struct dma_pte *pte;
4311 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4313 if (pte && dma_pte_present(pte))
4314 phys = dma_pte_addr(pte) +
4315 (iova & (BIT_MASK(level_to_offset_bits(level) +
4316 VTD_PAGE_SHIFT) - 1));
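/*
 * Worked example (illustrative): if the walk stops at a level-2 (2MiB)
 * superpage PTE, level_to_offset_bits(2) + VTD_PAGE_SHIFT = 21, so the result
 * is dma_pte_addr(pte) + (iova & 0x1fffff), i.e. the superpage base plus the
 * 2MiB page offset.
 */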
4321 static bool domain_support_force_snooping(struct dmar_domain *domain)
4323 struct device_domain_info *info;
4324 bool support = true;
4326 assert_spin_locked(&domain->lock);
4327 list_for_each_entry(info, &domain->devices, link) {
4328 if (!ecap_sc_support(info->iommu->ecap)) {
4337 static void domain_set_force_snooping(struct dmar_domain *domain)
4339 struct device_domain_info *info;
4341 assert_spin_locked(&domain->lock);
4343 * The second-level page table supports per-PTE snoop control. The
4344 * iommu_map() interface will handle this by setting the SNP bit.
4346 if (!domain->use_first_level) {
4347 domain->set_pte_snp = true;
4351 list_for_each_entry(info, &domain->devices, link)
4352 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4356 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4358 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4359 unsigned long flags;
4361 if (dmar_domain->force_snooping)
4364 spin_lock_irqsave(&dmar_domain->lock, flags);
4365 if (!domain_support_force_snooping(dmar_domain)) {
4366 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4370 domain_set_force_snooping(dmar_domain);
4371 dmar_domain->force_snooping = true;
4372 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4377 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4379 struct device_domain_info *info = dev_iommu_priv_get(dev);
4382 case IOMMU_CAP_CACHE_COHERENCY:
4383 case IOMMU_CAP_DEFERRED_FLUSH:
4385 case IOMMU_CAP_PRE_BOOT_PROTECTION:
4386 return dmar_platform_optin();
4387 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4388 return ecap_sc_support(info->iommu->ecap);
4389 case IOMMU_CAP_DIRTY_TRACKING:
4390 return ssads_supported(info->iommu);
4396 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4398 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4399 struct device_domain_info *info;
4400 struct intel_iommu *iommu;
4404 iommu = device_to_iommu(dev, &bus, &devfn);
4405 if (!iommu || !iommu->iommu.ops)
4406 return ERR_PTR(-ENODEV);
4408 info = kzalloc(sizeof(*info), GFP_KERNEL);
4410 return ERR_PTR(-ENOMEM);
4412 if (dev_is_real_dma_subdevice(dev)) {
4413 info->bus = pdev->bus->number;
4414 info->devfn = pdev->devfn;
4415 info->segment = pci_domain_nr(pdev->bus);
4418 info->devfn = devfn;
4419 info->segment = iommu->segment;
4423 info->iommu = iommu;
4424 if (dev_is_pci(dev)) {
4425 if (ecap_dev_iotlb_support(iommu->ecap) &&
4426 pci_ats_supported(pdev) &&
4427 dmar_ats_supported(pdev, iommu)) {
4428 info->ats_supported = 1;
4429 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4432 * For an IOMMU that supports device IOTLB throttling
4433 * (DIT), we assign the PFSID to the invalidation desc
4434 * of a VF so that the IOMMU HW can gauge queue depth
4435 * at the PF level. If DIT is not set, the PFSID is
4436 * treated as reserved and should be set to 0.
4438 if (ecap_dit(iommu->ecap))
4439 info->pfsid = pci_dev_id(pci_physfn(pdev));
4440 info->ats_qdep = pci_ats_queue_depth(pdev);
4442 if (sm_supported(iommu)) {
4443 if (pasid_supported(iommu)) {
4444 int features = pci_pasid_features(pdev);
4447 info->pasid_supported = features | 1;
4450 if (info->ats_supported && ecap_prs(iommu->ecap) &&
4451 pci_pri_supported(pdev))
4452 info->pri_supported = 1;
4456 dev_iommu_priv_set(dev, info);
4458 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4459 ret = intel_pasid_alloc_table(dev);
4461 dev_err(dev, "PASID table allocation failed\n");
4462 dev_iommu_priv_set(dev, NULL);
4464 return ERR_PTR(ret);
4468 return &iommu->iommu;
4471 static void intel_iommu_release_device(struct device *dev)
4473 struct device_domain_info *info = dev_iommu_priv_get(dev);
4475 dmar_remove_one_dev_info(dev);
4476 intel_pasid_free_table(dev);
4477 dev_iommu_priv_set(dev, NULL);
4479 set_dma_ops(dev, NULL);
4482 static void intel_iommu_probe_finalize(struct device *dev)
4484 set_dma_ops(dev, NULL);
4485 iommu_setup_dma_ops(dev, 0, U64_MAX);
4488 static void intel_iommu_get_resv_regions(struct device *device,
4489 struct list_head *head)
4491 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4492 struct iommu_resv_region *reg;
4493 struct dmar_rmrr_unit *rmrr;
4494 struct device *i_dev;
4498 for_each_rmrr_units(rmrr) {
4499 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4501 struct iommu_resv_region *resv;
4502 enum iommu_resv_type type;
4505 if (i_dev != device &&
4506 !is_downstream_to_pci_bridge(device, i_dev))
4509 length = rmrr->end_address - rmrr->base_address + 1;
4511 type = device_rmrr_is_relaxable(device) ?
4512 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4514 resv = iommu_alloc_resv_region(rmrr->base_address,
4520 list_add_tail(&resv->list, head);
4525 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4526 if (dev_is_pci(device)) {
4527 struct pci_dev *pdev = to_pci_dev(device);
4529 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4530 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4531 IOMMU_RESV_DIRECT_RELAXABLE,
4534 list_add_tail(®->list, head);
4537 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4539 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4540 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4541 0, IOMMU_RESV_MSI, GFP_KERNEL);
4544 list_add_tail(®->list, head);
4547 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4549 if (dev_is_pci(dev))
4550 return pci_device_group(dev);
4551 return generic_device_group(dev);
4554 static int intel_iommu_enable_sva(struct device *dev)
4556 struct device_domain_info *info = dev_iommu_priv_get(dev);
4557 struct intel_iommu *iommu;
4559 if (!info || dmar_disabled)
4562 iommu = info->iommu;
4566 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4569 if (!info->pasid_enabled || !info->ats_enabled)
4573 * Devices having device-specific I/O fault handling should not
4574 * support PCI/PRI. The IOMMU side has no means to check the
4575 * capability of device-specific IOPF. Therefore, the IOMMU can only
4576 * assume that if the device driver enables SVA on a non-PRI
4577 * device, it will handle IOPF in its own way.
4579 if (!info->pri_supported)
4582 /* Devices supporting PRI should have it enabled. */
4583 if (!info->pri_enabled)
4589 static int intel_iommu_enable_iopf(struct device *dev)
4591 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4592 struct device_domain_info *info = dev_iommu_priv_get(dev);
4593 struct intel_iommu *iommu;
4596 if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4599 if (info->pri_enabled)
4602 iommu = info->iommu;
4606 /* PASID is required in PRG Response Message. */
4607 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4610 ret = pci_reset_pri(pdev);
4614 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4618 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4620 goto iopf_remove_device;
4622 ret = pci_enable_pri(pdev, PRQ_DEPTH);
4624 goto iopf_unregister_handler;
4625 info->pri_enabled = 1;
4629 iopf_unregister_handler:
4630 iommu_unregister_device_fault_handler(dev);
4632 iopf_queue_remove_device(iommu->iopf_queue, dev);
4637 static int intel_iommu_disable_iopf(struct device *dev)
4639 struct device_domain_info *info = dev_iommu_priv_get(dev);
4640 struct intel_iommu *iommu = info->iommu;
4642 if (!info->pri_enabled)
4646 * PCIe spec states that by clearing PRI enable bit, the Page
4647 * Request Interface will not issue new page requests, but may still have
4648 * outstanding page requests that have been transmitted or are
4649 * queued for transmission. This is supposed to be called after
4650 * the device driver has stopped DMA, all PASIDs have been
4651 * unbound and the outstanding PRQs have been drained.
4653 pci_disable_pri(to_pci_dev(dev));
4654 info->pri_enabled = 0;
4657 * With PRI disabled and outstanding PRQs drained, unregistering
4658 * fault handler and removing device from iopf queue should never
4661 WARN_ON(iommu_unregister_device_fault_handler(dev));
4662 WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
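/*
 * Illustrative summary: teardown mirrors intel_iommu_enable_iopf() in
 * reverse order,
 *
 *	enable:  pci_reset_pri() -> iopf_queue_add_device() ->
 *		 iommu_register_device_fault_handler() -> pci_enable_pri()
 *	disable: pci_disable_pri() ->
 *		 iommu_unregister_device_fault_handler() ->
 *		 iopf_queue_remove_device()
 */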
4668 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4671 case IOMMU_DEV_FEAT_IOPF:
4672 return intel_iommu_enable_iopf(dev);
4674 case IOMMU_DEV_FEAT_SVA:
4675 return intel_iommu_enable_sva(dev);
4683 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4686 case IOMMU_DEV_FEAT_IOPF:
4687 return intel_iommu_disable_iopf(dev);
4689 case IOMMU_DEV_FEAT_SVA:
4697 static bool intel_iommu_is_attach_deferred(struct device *dev)
4699 struct device_domain_info *info = dev_iommu_priv_get(dev);
4701 return translation_pre_enabled(info->iommu) && !info->domain;
4705 * Check that the device does not live on an external-facing PCI port that is
4706 * marked as untrusted. Such devices should not be able to apply quirks and
4707 * thus not be able to bypass the IOMMU restrictions.
4709 static bool risky_device(struct pci_dev *pdev)
4711 if (pdev->untrusted) {
4713 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4714 pdev->vendor, pdev->device);
4715 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4721 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4722 unsigned long iova, size_t size)
4724 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4725 unsigned long pages = aligned_nrpages(iova, size);
4726 unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4727 struct iommu_domain_info *info;
4730 xa_for_each(&dmar_domain->iommu_array, i, info)
4731 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4734 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4736 struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4737 struct dev_pasid_info *curr, *dev_pasid = NULL;
4738 struct dmar_domain *dmar_domain;
4739 struct iommu_domain *domain;
4740 unsigned long flags;
4742 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4743 if (WARN_ON_ONCE(!domain))
4747 * The SVA implementation needs to handle its own stuff, like the mm
4748 * notification. Before consolidating that code into iommu core, let
4749 * the intel sva code handle it.
4751 if (domain->type == IOMMU_DOMAIN_SVA) {
4752 intel_svm_remove_dev_pasid(dev, pasid);
4756 dmar_domain = to_dmar_domain(domain);
4757 spin_lock_irqsave(&dmar_domain->lock, flags);
4758 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4759 if (curr->dev == dev && curr->pasid == pasid) {
4760 list_del(&curr->link_domain);
4765 WARN_ON_ONCE(!dev_pasid);
4766 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4768 domain_detach_iommu(dmar_domain, iommu);
4771 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4772 intel_drain_pasid_prq(dev, pasid);
4775 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4776 struct device *dev, ioasid_t pasid)
4778 struct device_domain_info *info = dev_iommu_priv_get(dev);
4779 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4780 struct intel_iommu *iommu = info->iommu;
4781 struct dev_pasid_info *dev_pasid;
4782 unsigned long flags;
4785 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4788 if (domain->dirty_ops)
4791 if (context_copied(iommu, info->bus, info->devfn))
4794 ret = prepare_domain_attach_device(domain, dev);
4798 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4802 ret = domain_attach_iommu(dmar_domain, iommu);
4806 if (domain_type_is_si(dmar_domain))
4807 ret = intel_pasid_setup_pass_through(iommu, dmar_domain,
4809 else if (dmar_domain->use_first_level)
4810 ret = domain_setup_first_level(iommu, dmar_domain,
4813 ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4816 goto out_detach_iommu;
4818 dev_pasid->dev = dev;
4819 dev_pasid->pasid = pasid;
4820 spin_lock_irqsave(&dmar_domain->lock, flags);
4821 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4822 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4826 domain_detach_iommu(dmar_domain, iommu);
4832 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4834 struct device_domain_info *info = dev_iommu_priv_get(dev);
4835 struct intel_iommu *iommu = info->iommu;
4836 struct iommu_hw_info_vtd *vtd;
4838 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4840 return ERR_PTR(-ENOMEM);
4842 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4843 vtd->cap_reg = iommu->cap;
4844 vtd->ecap_reg = iommu->ecap;
4845 *length = sizeof(*vtd);
4846 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
					  bool enable)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct device_domain_info *info;
	int ret;

	spin_lock(&dmar_domain->lock);
	if (dmar_domain->dirty_tracking == enable)
		goto out_unlock;

	list_for_each_entry(info, &dmar_domain->devices, link) {
		ret = intel_pasid_setup_dirty_tracking(info->iommu,
						       info->domain, info->dev,
						       IOMMU_NO_PASID, enable);
		if (ret)
			goto err_unwind;
	}

	dmar_domain->dirty_tracking = enable;
out_unlock:
	spin_unlock(&dmar_domain->lock);
	return 0;

err_unwind:
	list_for_each_entry(info, &dmar_domain->devices, link)
		intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
						 info->dev, IOMMU_NO_PASID,
						 dmar_domain->dirty_tracking);
	spin_unlock(&dmar_domain->lock);
	return ret;
}

static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
					    unsigned long iova, size_t size,
					    unsigned long flags,
					    struct iommu_dirty_bitmap *dirty)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long end = iova + size - 1;
	unsigned long pgsize;

	/*
	 * IOMMUFD core calls into a dirty tracking disabled domain without an
	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
	 * have occurred when we stopped dirty tracking. This ensures that we
	 * never inherit dirtied bits from a previous cycle. (See the
	 * illustrative caller sketch after this function.)
	 */
	if (!dmar_domain->dirty_tracking && dirty->bitmap)
		return -EINVAL;

	do {
		struct dma_pte *pte;
		int lvl = 0;

		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
				     GFP_ATOMIC);
		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
		if (!pte || !dma_pte_present(pte)) {
			iova += pgsize;
			continue;
		}

		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
			iommu_dirty_bitmap_record(dirty, iova, pgsize);
		iova += pgsize;
	} while (iova < end);

	return 0;
}
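
/*
 * Illustrative sketch, not part of the upstream driver: how a caller such as
 * IOMMUFD is expected to drive the two dirty-tracking ops above. The function
 * name and the caller-provided bitmap are assumptions for illustration only.
 */
static int __maybe_unused example_harvest_dirty(struct iommu_domain *domain,
						unsigned long iova, size_t size,
						struct iommu_dirty_bitmap *dirty)
{
	/* Empty bitmap used to scrub stale dirty bits after tracking stops. */
	struct iommu_dirty_bitmap clear = {};
	int ret;

	/* Enable dirty tracking on every device attached to the domain. */
	ret = intel_iommu_set_dirty_tracking(domain, true);
	if (ret)
		return ret;

	/* Record dirtied ranges into dirty->bitmap, clearing bits as we read. */
	ret = intel_iommu_read_and_clear_dirty(domain, iova, size, 0, dirty);

	/*
	 * Stop tracking and make a final pass without a bitmap to scrub any
	 * remaining dirty bits, matching the IOMMUFD behavior described in
	 * the comment inside intel_iommu_read_and_clear_dirty().
	 */
	intel_iommu_set_dirty_tracking(domain, false);
	intel_iommu_read_and_clear_dirty(domain, iova, size, 0, &clear);

	return ret;
}
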
const struct iommu_dirty_ops intel_dirty_ops = {
	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
};

const struct iommu_ops intel_iommu_ops = {
	.capable = intel_iommu_capable,
	.hw_info = intel_iommu_hw_info,
	.domain_alloc = intel_iommu_domain_alloc,
	.domain_alloc_user = intel_iommu_domain_alloc_user,
	.probe_device = intel_iommu_probe_device,
	.probe_finalize = intel_iommu_probe_finalize,
	.release_device = intel_iommu_release_device,
	.get_resv_regions = intel_iommu_get_resv_regions,
	.device_group = intel_iommu_device_group,
	.dev_enable_feat = intel_iommu_dev_enable_feat,
	.dev_disable_feat = intel_iommu_dev_disable_feat,
	.is_attach_deferred = intel_iommu_is_attach_deferred,
	.def_domain_type = device_def_domain_type,
	.remove_dev_pasid = intel_iommu_remove_dev_pasid,
	.pgsize_bitmap = SZ_4K,
#ifdef CONFIG_INTEL_IOMMU_SVM
	.page_response = intel_svm_page_response,
#endif
	.default_domain_ops = &(const struct iommu_domain_ops) {
		.attach_dev = intel_iommu_attach_device,
		.set_dev_pasid = intel_iommu_set_dev_pasid,
		.map_pages = intel_iommu_map_pages,
		.unmap_pages = intel_iommu_unmap_pages,
		.iotlb_sync_map = intel_iommu_iotlb_sync_map,
		.flush_iotlb_all = intel_flush_iotlb_all,
		.iotlb_sync = intel_iommu_tlb_sync,
		.iova_to_phys = intel_iommu_iova_to_phys,
		.free = intel_iommu_domain_free,
		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
	}
};

static void quirk_iommu_igfx(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

/* G4x/GM45 integrated gfx dmar support is totally busted. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);

static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pci_info(dev, "Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);

#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)

static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (risky_device(dev))
		return;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
		iommu_set_dma_strict();
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);

static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
{
	unsigned short ver;

	if (!IS_GFX_DEVICE(dev))
		return;

	ver = (dev->device >> 8) & 0xff;
	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
	    ver != 0x9a && ver != 0xa7)
		return;
	if (risky_device(dev))
		return;

	pci_info(dev, "Skip IOMMU disabling for graphics\n");
	iommu_skip_te_disable = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);

/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;
	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}
	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}
	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
		vtisochctrl);
}

/*
 * Here we deal with a device TLB defect where the device may inadvertently
 * issue an ATS invalidation completion before posted writes initiated with
 * a translated address that used translations matching the invalidation
 * address range, violating the invalidation completion ordering.
 * Therefore, any use cases that cannot guarantee DMA is stopped before
 * unmap are vulnerable to this defect. In other words, any dTLB invalidation
 * initiated not under the control of the trusted/privileged host device
 * driver must use this quirk.
 * Device TLBs are invalidated under the following six conditions:
 * 1. Device driver does DMA API unmap IOVA
 * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
 *    exit_mmap() due to crash
 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
 *    VM has to free pages that were unmapped
 * 5. Userspace driver unmaps a DMA buffer
 * 6. Cache invalidation in vSVA usage (upcoming)
 *
 * For #1 and #2, device drivers are responsible for stopping DMA traffic
 * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier to
 * invalidate the TLB the same way as a normal user unmap, which will use
 * this quirk. The dTLB invalidation after PASID cache flush does not need
 * this quirk.
 *
 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
 * (An illustrative, non-upstream caller sketch follows the function below.)
 */
void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
			       unsigned long address, unsigned long mask,
			       u32 pasid, u16 qdep)
{
	u16 sid;

	if (likely(!info->dtlb_extra_inval))
		return;

	sid = PCI_DEVID(info->bus, info->devfn);
	if (pasid == IOMMU_NO_PASID) {
		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
				   qdep, address, mask);
	} else {
		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
					 pasid, qdep, address, mask);
	}
}
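
/*
 * Illustrative sketch, not upstream code: a PASID-granular device-TLB
 * invalidation path that applies quirk_extra_dev_tlb_flush(). The function
 * name and parameters are assumptions for illustration; the pattern simply
 * follows the regular ATS invalidation with the extra flush on affected
 * hardware, as described in the comment above.
 */
static void __maybe_unused example_flush_dev_iotlb_pasid(struct device_domain_info *info,
							  u32 pasid, u64 addr,
							  unsigned int mask)
{
	u16 sid = PCI_DEVID(info->bus, info->devfn);
	u16 qdep = info->ats_qdep;

	if (!info->ats_enabled)
		return;

	/* The normal ATS invalidation for this PASID and address range. */
	qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
				 pasid, qdep, addr, mask);

	/* Repeat the flush on devices that need the extra invalidation. */
	quirk_extra_dev_tlb_flush(info, addr, mask, pasid, qdep);
}
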
#define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)

/*
 * Function to submit a command to the enhanced command interface. The
 * valid enhanced command descriptions are defined in Table 47 of the
 * VT-d spec. The VT-d hardware implementation may support some but not
 * all commands, which can be determined by checking the Enhanced
 * Command Capability Register.
 *
 * Return values:
 *  - 0: Command successful without any error;
 *  - Negative: software error value;
 *  - Nonzero positive: failure status code defined in Table 48.
 *
 * (An illustrative caller sketch follows the function below.)
 */
int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
{
	unsigned long flags;
	u64 res;
	int ret;

	if (!cap_ecmds(iommu->cap))
		return -ENODEV;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
	if (res & DMA_ECMD_ECRSP_IP) {
		ret = -EBUSY;
		goto err;
	}

	/*
	 * Unconditionally write the operand B, because
	 * - There is no side effect if an ecmd doesn't require an
	 *   operand B, but we set the register to some value.
	 * - It's not invoked in any critical path. The extra MMIO
	 *   write doesn't bring any performance concerns.
	 */
	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));

	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
		      !(res & DMA_ECMD_ECRSP_IP), res);
	if (res & DMA_ECMD_ECRSP_IP) {
		ret = -ETIMEDOUT;
		goto err;
	}

	ret = ecmd_get_status_code(res);
err:
	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
	return ret;
}
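
/*
 * Illustrative sketch, not upstream code: how a caller interprets the
 * three-way return convention of ecmd_submit_sync() documented above. The
 * wrapper name is hypothetical; opcode and operands come from the caller.
 */
static int __maybe_unused example_issue_ecmd(struct intel_iommu *iommu,
					     u8 ecmd, u64 oa, u64 ob)
{
	int ret = ecmd_submit_sync(iommu, ecmd, oa, ob);

	/* Negative: software error (e.g. ecmds unsupported or interface busy). */
	if (ret < 0)
		return ret;

	/* Positive: failure status code from Table 48 of the VT-d spec. */
	if (ret > 0) {
		pr_warn("ecmd %u failed with status 0x%x\n", ecmd, ret);
		return -EIO;
	}

	/* Zero: command completed successfully. */
	return 0;
}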