1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <[email protected]>,
6  *          Ashok Raj <[email protected]>,
7  *          Shaohua Li <[email protected]>,
8  *          Anil S Keshavamurthy <[email protected]>,
9  *          Fenghua Yu <[email protected]>
10  *          Joerg Roedel <[email protected]>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25
26 #include "iommu.h"
27 #include "../dma-iommu.h"
28 #include "../irq_remapping.h"
29 #include "../iommu-sva.h"
30 #include "pasid.h"
31 #include "cap_audit.h"
32 #include "perfmon.h"
33
34 #define ROOT_SIZE               VTD_PAGE_SIZE
35 #define CONTEXT_SIZE            VTD_PAGE_SIZE
36
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41
42 #define IOAPIC_RANGE_START      (0xfee00000)
43 #define IOAPIC_RANGE_END        (0xfeefffff)
44 #define IOVA_START_ADDR         (0x1000)
45
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50
51 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
57                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
59
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN          (1)
62
63 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
64
65 /* page table handling */
66 #define LEVEL_STRIDE            (9)
67 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
68
69 static inline int agaw_to_level(int agaw)
70 {
71         return agaw + 2;
72 }
73
74 static inline int agaw_to_width(int agaw)
75 {
76         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78
79 static inline int width_to_agaw(int width)
80 {
81         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
83
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86         return (level - 1) * LEVEL_STRIDE;
87 }
88
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93
94 static inline u64 level_mask(int level)
95 {
96         return -1ULL << level_to_offset_bits(level);
97 }
98
99 static inline u64 level_size(int level)
100 {
101         return 1ULL << level_to_offset_bits(level);
102 }
103
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106         return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
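
/*
 * Worked example (illustrative, not from the original source): with
 * LEVEL_STRIDE = 9, agaw 2 selects a 4-level table covering
 * 30 + 2 * 9 = 48 address bits, and agaw 3 a 5-level table covering
 * 57 bits.  For pfn 0x12345 at level 2, level_to_offset_bits() is 9,
 * so pfn_level_offset() returns (0x12345 >> 9) & 0x1ff = 0x91, i.e.
 * the index into the level-2 table.
 */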
113
114 /* VT-d pages must never be larger than MM pages. Otherwise things
115    are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117 {
118         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
121 {
122         return mm_to_dma_pfn(page_to_pfn(pg));
123 }
124 static inline unsigned long virt_to_dma_pfn(void *p)
125 {
126         return page_to_dma_pfn(virt_to_page(p));
127 }
128
129 static void __init check_tylersburg_isoch(void);
130 static int rwbf_quirk;
131
132 /*
133  * Set to 1 to panic the kernel if VT-d can't be enabled successfully
134  * (used when the kernel is launched with TXT)
135  */
136 static int force_on = 0;
137 static int intel_iommu_tboot_noforce;
138 static int no_platform_optin;
139
140 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
141
142 /*
143  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
144  * if marked present.
145  */
146 static phys_addr_t root_entry_lctp(struct root_entry *re)
147 {
148         if (!(re->lo & 1))
149                 return 0;
150
151         return re->lo & VTD_PAGE_MASK;
152 }
153
154 /*
155  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
156  * if marked present.
157  */
158 static phys_addr_t root_entry_uctp(struct root_entry *re)
159 {
160         if (!(re->hi & 1))
161                 return 0;
162
163         return re->hi & VTD_PAGE_MASK;
164 }
165
166 static inline void context_set_present(struct context_entry *context)
167 {
168         context->lo |= 1;
169 }
170
171 static inline void context_set_fault_enable(struct context_entry *context)
172 {
173         context->lo &= (((u64)-1) << 2) | 1;
174 }
175
176 static inline void context_set_translation_type(struct context_entry *context,
177                                                 unsigned long value)
178 {
179         context->lo &= (((u64)-1) << 4) | 3;
180         context->lo |= (value & 3) << 2;
181 }
182
183 static inline void context_set_address_root(struct context_entry *context,
184                                             unsigned long value)
185 {
186         context->lo &= ~VTD_PAGE_MASK;
187         context->lo |= value & VTD_PAGE_MASK;
188 }
189
190 static inline void context_set_address_width(struct context_entry *context,
191                                              unsigned long value)
192 {
193         context->hi |= value & 7;
194 }
195
196 static inline void context_set_domain_id(struct context_entry *context,
197                                          unsigned long value)
198 {
199         context->hi |= (value & ((1 << 16) - 1)) << 8;
200 }
201
202 static inline void context_set_pasid(struct context_entry *context)
203 {
204         context->lo |= CONTEXT_PASIDE;
205 }
206
207 static inline int context_domain_id(struct context_entry *c)
208 {
209         return((c->hi >> 8) & 0xffff);
210 }
211
212 static inline void context_clear_entry(struct context_entry *context)
213 {
214         context->lo = 0;
215         context->hi = 0;
216 }
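
/*
 * Illustrative sketch of how these helpers combine when a legacy-mode
 * context entry is programmed ('did', 'agaw' and 'pgd_phys' are
 * placeholders, not values taken from this file):
 *
 *	struct context_entry ce = { 0 };
 *
 *	context_set_domain_id(&ce, did);
 *	context_set_address_width(&ce, agaw);
 *	context_set_address_root(&ce, pgd_phys);
 *	context_set_translation_type(&ce, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(&ce);
 *	context_set_present(&ce);
 *
 * The real setup path additionally flushes caches and publishes the
 * entry under the proper locks.
 */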
217
218 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
219 {
220         if (!iommu->copied_tables)
221                 return false;
222
223         return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
224 }
225
226 static inline void
227 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
228 {
229         set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
230 }
231
232 static inline void
233 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
234 {
235         clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
236 }
237
238 /*
239  * This domain is a static identity mapping domain.
240  *      1. This domain creates a static 1:1 mapping to all usable memory.
241  *      2. It maps to each iommu if successful.
242  *      3. Each iommu maps to this domain if successful.
243  */
244 static struct dmar_domain *si_domain;
245 static int hw_pass_through = 1;
246
247 struct dmar_rmrr_unit {
248         struct list_head list;          /* list of rmrr units   */
249         struct acpi_dmar_header *hdr;   /* ACPI header          */
250         u64     base_address;           /* reserved base address*/
251         u64     end_address;            /* reserved end address */
252         struct dmar_dev_scope *devices; /* target devices */
253         int     devices_cnt;            /* target device count */
254 };
255
256 struct dmar_atsr_unit {
257         struct list_head list;          /* list of ATSR units */
258         struct acpi_dmar_header *hdr;   /* ACPI header */
259         struct dmar_dev_scope *devices; /* target devices */
260         int devices_cnt;                /* target device count */
261         u8 include_all:1;               /* include all ports */
262 };
263
264 struct dmar_satc_unit {
265         struct list_head list;          /* list of SATC units */
266         struct acpi_dmar_header *hdr;   /* ACPI header */
267         struct dmar_dev_scope *devices; /* target devices */
268         struct intel_iommu *iommu;      /* the corresponding iommu */
269         int devices_cnt;                /* target device count */
270         u8 atc_required:1;              /* ATS is required */
271 };
272
273 static LIST_HEAD(dmar_atsr_units);
274 static LIST_HEAD(dmar_rmrr_units);
275 static LIST_HEAD(dmar_satc_units);
276
277 #define for_each_rmrr_units(rmrr) \
278         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
279
280 static void device_block_translation(struct device *dev);
281 static void intel_iommu_domain_free(struct iommu_domain *domain);
282
283 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
284 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
285
286 int intel_iommu_enabled = 0;
287 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
288
289 static int dmar_map_gfx = 1;
290 static int intel_iommu_superpage = 1;
291 static int iommu_identity_mapping;
292 static int iommu_skip_te_disable;
293
294 #define IDENTMAP_GFX            2
295 #define IDENTMAP_AZALIA         4
296
297 const struct iommu_ops intel_iommu_ops;
298
299 static bool translation_pre_enabled(struct intel_iommu *iommu)
300 {
301         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
302 }
303
304 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
305 {
306         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
307 }
308
309 static void init_translation_status(struct intel_iommu *iommu)
310 {
311         u32 gsts;
312
313         gsts = readl(iommu->reg + DMAR_GSTS_REG);
314         if (gsts & DMA_GSTS_TES)
315                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
316 }
317
318 static int __init intel_iommu_setup(char *str)
319 {
320         if (!str)
321                 return -EINVAL;
322
323         while (*str) {
324                 if (!strncmp(str, "on", 2)) {
325                         dmar_disabled = 0;
326                         pr_info("IOMMU enabled\n");
327                 } else if (!strncmp(str, "off", 3)) {
328                         dmar_disabled = 1;
329                         no_platform_optin = 1;
330                         pr_info("IOMMU disabled\n");
331                 } else if (!strncmp(str, "igfx_off", 8)) {
332                         dmar_map_gfx = 0;
333                         pr_info("Disable GFX device mapping\n");
334                 } else if (!strncmp(str, "forcedac", 8)) {
335                         pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
336                         iommu_dma_forcedac = true;
337                 } else if (!strncmp(str, "strict", 6)) {
338                         pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
339                         iommu_set_dma_strict();
340                 } else if (!strncmp(str, "sp_off", 6)) {
341                         pr_info("Disable super page support\n");
342                         intel_iommu_superpage = 0;
343                 } else if (!strncmp(str, "sm_on", 5)) {
344                         pr_info("Enable scalable mode if hardware supports it\n");
345                         intel_iommu_sm = 1;
346                 } else if (!strncmp(str, "sm_off", 6)) {
347                         pr_info("Scalable mode is disallowed\n");
348                         intel_iommu_sm = 0;
349                 } else if (!strncmp(str, "tboot_noforce", 13)) {
350                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose a security risk for tboot\n");
351                         intel_iommu_tboot_noforce = 1;
352                 } else {
353                         pr_notice("Unknown option - '%s'\n", str);
354                 }
355
356                 str += strcspn(str, ",");
357                 while (*str == ',')
358                         str++;
359         }
360
361         return 1;
362 }
363 __setup("intel_iommu=", intel_iommu_setup);
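
/*
 * Usage example (kernel command line; options are comma separated, as
 * the parser above expects):
 *
 *	intel_iommu=on,sm_on
 *	intel_iommu=off
 *	intel_iommu=on,igfx_off,tboot_noforce
 */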
364
365 void *alloc_pgtable_page(int node, gfp_t gfp)
366 {
367         struct page *page;
368         void *vaddr = NULL;
369
370         page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
371         if (page)
372                 vaddr = page_address(page);
373         return vaddr;
374 }
375
376 void free_pgtable_page(void *vaddr)
377 {
378         free_page((unsigned long)vaddr);
379 }
380
381 static inline int domain_type_is_si(struct dmar_domain *domain)
382 {
383         return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
384 }
385
386 static inline int domain_pfn_supported(struct dmar_domain *domain,
387                                        unsigned long pfn)
388 {
389         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
390
391         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
392 }
393
394 /*
395  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
396  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
397  * the returned SAGAW.
398  */
399 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
400 {
401         unsigned long fl_sagaw, sl_sagaw;
402
403         fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
404         sl_sagaw = cap_sagaw(iommu->cap);
405
406         /* Second level only. */
407         if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
408                 return sl_sagaw;
409
410         /* First level only. */
411         if (!ecap_slts(iommu->ecap))
412                 return fl_sagaw;
413
414         return fl_sagaw & sl_sagaw;
415 }
416
417 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
418 {
419         unsigned long sagaw;
420         int agaw;
421
422         sagaw = __iommu_calculate_sagaw(iommu);
423         for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
424                 if (test_bit(agaw, &sagaw))
425                         break;
426         }
427
428         return agaw;
429 }
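
/*
 * Worked example (illustrative): if an IOMMU reports SAGAW = 0x4 (only
 * bit 2 set, i.e. only 4-level second-level tables) and max_gaw is the
 * default 57, width_to_agaw(57) = DIV_ROUND_UP(27, 9) = 3, so the loop
 * starts at agaw 3, finds bit 3 clear, and settles on agaw 2, giving a
 * 48-bit, 4-level page table for the domain.
 */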
430
431 /*
432  * Calculate max SAGAW for each iommu.
433  */
434 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
435 {
436         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
437 }
438
439 /*
440  * Calculate the agaw for each iommu.
441  * "SAGAW" may be different across iommus, so use a default agaw and
442  * fall back to a smaller supported agaw for iommus that don't support it.
443  */
444 int iommu_calculate_agaw(struct intel_iommu *iommu)
445 {
446         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
447 }
448
449 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
450 {
451         return sm_supported(iommu) ?
452                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
453 }
454
455 static void domain_update_iommu_coherency(struct dmar_domain *domain)
456 {
457         struct iommu_domain_info *info;
458         struct dmar_drhd_unit *drhd;
459         struct intel_iommu *iommu;
460         bool found = false;
461         unsigned long i;
462
463         domain->iommu_coherency = true;
464         xa_for_each(&domain->iommu_array, i, info) {
465                 found = true;
466                 if (!iommu_paging_structure_coherency(info->iommu)) {
467                         domain->iommu_coherency = false;
468                         break;
469                 }
470         }
471         if (found)
472                 return;
473
474         /* No hardware attached; use lowest common denominator */
475         rcu_read_lock();
476         for_each_active_iommu(iommu, drhd) {
477                 if (!iommu_paging_structure_coherency(iommu)) {
478                         domain->iommu_coherency = false;
479                         break;
480                 }
481         }
482         rcu_read_unlock();
483 }
484
485 static int domain_update_iommu_superpage(struct dmar_domain *domain,
486                                          struct intel_iommu *skip)
487 {
488         struct dmar_drhd_unit *drhd;
489         struct intel_iommu *iommu;
490         int mask = 0x3;
491
492         if (!intel_iommu_superpage)
493                 return 0;
494
495         /* set iommu_superpage to the smallest common denominator */
496         rcu_read_lock();
497         for_each_active_iommu(iommu, drhd) {
498                 if (iommu != skip) {
499                         if (domain && domain->use_first_level) {
500                                 if (!cap_fl1gp_support(iommu->cap))
501                                         mask = 0x1;
502                         } else {
503                                 mask &= cap_super_page_val(iommu->cap);
504                         }
505
506                         if (!mask)
507                                 break;
508                 }
509         }
510         rcu_read_unlock();
511
512         return fls(mask);
513 }
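
/*
 * Worked example (illustrative): the starting mask 0x3 stands for 2MiB
 * (bit 0) and 1GiB (bit 1) superpage support.  If one active IOMMU
 * reports a super-page capability field of 0x1 and another 0x3, the
 * loop above reduces the mask to 0x1 and fls(0x1) = 1, so the domain
 * ends up using 2MiB superpages only.
 */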
514
515 static int domain_update_device_node(struct dmar_domain *domain)
516 {
517         struct device_domain_info *info;
518         int nid = NUMA_NO_NODE;
519         unsigned long flags;
520
521         spin_lock_irqsave(&domain->lock, flags);
522         list_for_each_entry(info, &domain->devices, link) {
523                 /*
524                  * There could be multiple device NUMA nodes, as devices
525                  * within the same domain may sit behind different IOMMUs.
526                  * There isn't a perfect answer in such a situation, so we
527                  * use a first-come, first-served policy.
528                  */
529                 nid = dev_to_node(info->dev);
530                 if (nid != NUMA_NO_NODE)
531                         break;
532         }
533         spin_unlock_irqrestore(&domain->lock, flags);
534
535         return nid;
536 }
537
538 static void domain_update_iotlb(struct dmar_domain *domain);
539
540 /* Return the super pagesize bitmap if supported. */
541 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
542 {
543         unsigned long bitmap = 0;
544
545         /*
546          * 1-level super page supports page size of 2MiB, 2-level super page
547          * supports page size of both 2MiB and 1GiB.
548          */
549         if (domain->iommu_superpage == 1)
550                 bitmap |= SZ_2M;
551         else if (domain->iommu_superpage == 2)
552                 bitmap |= SZ_2M | SZ_1G;
553
554         return bitmap;
555 }
556
557 /* Some capabilities may be different across iommus */
558 static void domain_update_iommu_cap(struct dmar_domain *domain)
559 {
560         domain_update_iommu_coherency(domain);
561         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
562
563         /*
564          * If RHSA is missing, we should default to the device NUMA node
565          * as a fallback.
566          */
567         if (domain->nid == NUMA_NO_NODE)
568                 domain->nid = domain_update_device_node(domain);
569
570         /*
571          * First-level translation restricts the input-address to a
572          * canonical address (i.e., address bits 63:N have the same
573          * value as address bit [N-1], where N is 48-bits with 4-level
574          * paging and 57-bits with 5-level paging). Hence, skip bit
575          * [N-1].
576          */
577         if (domain->use_first_level)
578                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
579         else
580                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
581
582         domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
583         domain_update_iotlb(domain);
584 }
585
586 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
587                                          u8 devfn, int alloc)
588 {
589         struct root_entry *root = &iommu->root_entry[bus];
590         struct context_entry *context;
591         u64 *entry;
592
593         /*
594          * Unless the caller requested to allocate a new entry,
595          * returning a copied context entry makes no sense.
596          */
597         if (!alloc && context_copied(iommu, bus, devfn))
598                 return NULL;
599
600         entry = &root->lo;
601         if (sm_supported(iommu)) {
602                 if (devfn >= 0x80) {
603                         devfn -= 0x80;
604                         entry = &root->hi;
605                 }
606                 devfn *= 2;
607         }
608         if (*entry & 1)
609                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
610         else {
611                 unsigned long phy_addr;
612                 if (!alloc)
613                         return NULL;
614
615                 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
616                 if (!context)
617                         return NULL;
618
619                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
620                 phy_addr = virt_to_phys((void *)context);
621                 *entry = phy_addr | 1;
622                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
623         }
624         return &context[devfn];
625 }
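
/*
 * Worked example (illustrative): in scalable mode the lower and upper
 * halves of a root entry each point at a context table for 128 device
 * functions, so for devfn 0x85 the code above switches to &root->hi,
 * rebases devfn to 0x05 and doubles it to 0x0a, because each
 * scalable-mode context entry takes the space of two legacy 128-bit
 * entries.
 */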
626
627 /**
628  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
629  *                               sub-hierarchy of a candidate PCI-PCI bridge
630  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
631  * @bridge: the candidate PCI-PCI bridge
632  *
633  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
634  */
635 static bool
636 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
637 {
638         struct pci_dev *pdev, *pbridge;
639
640         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
641                 return false;
642
643         pdev = to_pci_dev(dev);
644         pbridge = to_pci_dev(bridge);
645
646         if (pbridge->subordinate &&
647             pbridge->subordinate->number <= pdev->bus->number &&
648             pbridge->subordinate->busn_res.end >= pdev->bus->number)
649                 return true;
650
651         return false;
652 }
653
654 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
655 {
656         struct dmar_drhd_unit *drhd;
657         u32 vtbar;
658         int rc;
659
660         /* We know that this device on this chipset has its own IOMMU.
661          * If we find it under a different IOMMU, then the BIOS is lying
662          * to us. Hope that the IOMMU for this device is actually
663          * disabled, and it needs no translation...
664          */
665         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
666         if (rc) {
667                 /* "can't" happen */
668                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
669                 return false;
670         }
671         vtbar &= 0xffff0000;
672
673         /* we know that this iommu should be at offset 0xa000 from vtbar */
674         drhd = dmar_find_matched_drhd_unit(pdev);
675         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
676                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
677                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
678                 return true;
679         }
680
681         return false;
682 }
683
684 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
685 {
686         if (!iommu || iommu->drhd->ignored)
687                 return true;
688
689         if (dev_is_pci(dev)) {
690                 struct pci_dev *pdev = to_pci_dev(dev);
691
692                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
693                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
694                     quirk_ioat_snb_local_iommu(pdev))
695                         return true;
696         }
697
698         return false;
699 }
700
701 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
702 {
703         struct dmar_drhd_unit *drhd = NULL;
704         struct pci_dev *pdev = NULL;
705         struct intel_iommu *iommu;
706         struct device *tmp;
707         u16 segment = 0;
708         int i;
709
710         if (!dev)
711                 return NULL;
712
713         if (dev_is_pci(dev)) {
714                 struct pci_dev *pf_pdev;
715
716                 pdev = pci_real_dma_dev(to_pci_dev(dev));
717
718                 /* VFs aren't listed in scope tables; we need to look up
719                  * the PF instead to find the IOMMU. */
720                 pf_pdev = pci_physfn(pdev);
721                 dev = &pf_pdev->dev;
722                 segment = pci_domain_nr(pdev->bus);
723         } else if (has_acpi_companion(dev))
724                 dev = &ACPI_COMPANION(dev)->dev;
725
726         rcu_read_lock();
727         for_each_iommu(iommu, drhd) {
728                 if (pdev && segment != drhd->segment)
729                         continue;
730
731                 for_each_active_dev_scope(drhd->devices,
732                                           drhd->devices_cnt, i, tmp) {
733                         if (tmp == dev) {
734                                 /* For a VF, use its original BDF#, not that of the PF
735                                  * which we used for the IOMMU lookup. Strictly speaking
736                                  * we could do this for all PCI devices; we only need to
737                                  * get the BDF# from the scope table for ACPI matches. */
738                                 if (pdev && pdev->is_virtfn)
739                                         goto got_pdev;
740
741                                 if (bus && devfn) {
742                                         *bus = drhd->devices[i].bus;
743                                         *devfn = drhd->devices[i].devfn;
744                                 }
745                                 goto out;
746                         }
747
748                         if (is_downstream_to_pci_bridge(dev, tmp))
749                                 goto got_pdev;
750                 }
751
752                 if (pdev && drhd->include_all) {
753 got_pdev:
754                         if (bus && devfn) {
755                                 *bus = pdev->bus->number;
756                                 *devfn = pdev->devfn;
757                         }
758                         goto out;
759                 }
760         }
761         iommu = NULL;
762 out:
763         if (iommu_is_dummy(iommu, dev))
764                 iommu = NULL;
765
766         rcu_read_unlock();
767
768         return iommu;
769 }
770
771 static void domain_flush_cache(struct dmar_domain *domain,
772                                void *addr, int size)
773 {
774         if (!domain->iommu_coherency)
775                 clflush_cache_range(addr, size);
776 }
777
778 static void free_context_table(struct intel_iommu *iommu)
779 {
780         struct context_entry *context;
781         int i;
782
783         if (!iommu->root_entry)
784                 return;
785
786         for (i = 0; i < ROOT_ENTRY_NR; i++) {
787                 context = iommu_context_addr(iommu, i, 0, 0);
788                 if (context)
789                         free_pgtable_page(context);
790
791                 if (!sm_supported(iommu))
792                         continue;
793
794                 context = iommu_context_addr(iommu, i, 0x80, 0);
795                 if (context)
796                         free_pgtable_page(context);
797         }
798
799         free_pgtable_page(iommu->root_entry);
800         iommu->root_entry = NULL;
801 }
802
803 #ifdef CONFIG_DMAR_DEBUG
804 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
805                          u8 bus, u8 devfn, struct dma_pte *parent, int level)
806 {
807         struct dma_pte *pte;
808         int offset;
809
810         while (1) {
811                 offset = pfn_level_offset(pfn, level);
812                 pte = &parent[offset];
813                 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
814                         pr_info("PTE not present at level %d\n", level);
815                         break;
816                 }
817
818                 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
819
820                 if (level == 1)
821                         break;
822
823                 parent = phys_to_virt(dma_pte_addr(pte));
824                 level--;
825         }
826 }
827
828 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
829                           unsigned long long addr, u32 pasid)
830 {
831         struct pasid_dir_entry *dir, *pde;
832         struct pasid_entry *entries, *pte;
833         struct context_entry *ctx_entry;
834         struct root_entry *rt_entry;
835         int i, dir_index, index, level;
836         u8 devfn = source_id & 0xff;
837         u8 bus = source_id >> 8;
838         struct dma_pte *pgtable;
839
840         pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
841
842         /* root entry dump */
843         rt_entry = &iommu->root_entry[bus];
844         if (!rt_entry) {
845                 pr_info("root table entry is not present\n");
846                 return;
847         }
848
849         if (sm_supported(iommu))
850                 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
851                         rt_entry->hi, rt_entry->lo);
852         else
853                 pr_info("root entry: 0x%016llx\n", rt_entry->lo);
854
855         /* context entry dump */
856         ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
857         if (!ctx_entry) {
858                 pr_info("context table entry is not present\n");
859                 return;
860         }
861
862         pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
863                 ctx_entry->hi, ctx_entry->lo);
864
865         /* legacy mode does not require PASID entries */
866         if (!sm_supported(iommu)) {
867                 level = agaw_to_level(ctx_entry->hi & 7);
868                 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
869                 goto pgtable_walk;
870         }
871
872         /* get the pointer to pasid directory entry */
873         dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
874         if (!dir) {
875                 pr_info("pasid directory entry is not present\n");
876                 return;
877         }
878         /* For request-without-pasid, get the pasid from context entry */
879         if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
880                 pasid = PASID_RID2PASID;
881
882         dir_index = pasid >> PASID_PDE_SHIFT;
883         pde = &dir[dir_index];
884         pr_info("pasid dir entry: 0x%016llx\n", pde->val);
885
886         /* get the pointer to the pasid table entry */
887         entries = get_pasid_table_from_pde(pde);
888         if (!entries) {
889                 pr_info("pasid table entry is not present\n");
890                 return;
891         }
892         index = pasid & PASID_PTE_MASK;
893         pte = &entries[index];
894         for (i = 0; i < ARRAY_SIZE(pte->val); i++)
895                 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
896
897         if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
898                 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
899                 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
900         } else {
901                 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
902                 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
903         }
904
905 pgtable_walk:
906         pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
907 }
908 #endif
909
910 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
911                                       unsigned long pfn, int *target_level,
912                                       gfp_t gfp)
913 {
914         struct dma_pte *parent, *pte;
915         int level = agaw_to_level(domain->agaw);
916         int offset;
917
918         if (!domain_pfn_supported(domain, pfn))
919                 /* Address beyond IOMMU's addressing capabilities. */
920                 return NULL;
921
922         parent = domain->pgd;
923
924         while (1) {
925                 void *tmp_page;
926
927                 offset = pfn_level_offset(pfn, level);
928                 pte = &parent[offset];
929                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
930                         break;
931                 if (level == *target_level)
932                         break;
933
934                 if (!dma_pte_present(pte)) {
935                         uint64_t pteval;
936
937                         tmp_page = alloc_pgtable_page(domain->nid, gfp);
938
939                         if (!tmp_page)
940                                 return NULL;
941
942                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
943                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
944                         if (domain->use_first_level)
945                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
946
947                         if (cmpxchg64(&pte->val, 0ULL, pteval))
948                                 /* Someone else set it while we were thinking; use theirs. */
949                                 free_pgtable_page(tmp_page);
950                         else
951                                 domain_flush_cache(domain, pte, sizeof(*pte));
952                 }
953                 if (level == 1)
954                         break;
955
956                 parent = phys_to_virt(dma_pte_addr(pte));
957                 level--;
958         }
959
960         if (!*target_level)
961                 *target_level = level;
962
963         return pte;
964 }
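
/*
 * Usage sketch (illustrative; 'domain' and 'iova' are placeholders): a
 * caller that wants the leaf entry passes a target level of 0 and lets
 * the walk stop wherever the mapping ends, while a caller installing a
 * 2MiB superpage asks for level 2 explicitly:
 *
 *	int level = 2;
 *	struct dma_pte *pte;
 *
 *	pte = pfn_to_dma_pte(domain, iova >> VTD_PAGE_SHIFT, &level,
 *			     GFP_KERNEL);
 *
 * On success 'level' reports the level at which the returned PTE lives.
 */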
965
966 /* return the address's pte at the specified level */
967 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
968                                          unsigned long pfn,
969                                          int level, int *large_page)
970 {
971         struct dma_pte *parent, *pte;
972         int total = agaw_to_level(domain->agaw);
973         int offset;
974
975         parent = domain->pgd;
976         while (level <= total) {
977                 offset = pfn_level_offset(pfn, total);
978                 pte = &parent[offset];
979                 if (level == total)
980                         return pte;
981
982                 if (!dma_pte_present(pte)) {
983                         *large_page = total;
984                         break;
985                 }
986
987                 if (dma_pte_superpage(pte)) {
988                         *large_page = total;
989                         return pte;
990                 }
991
992                 parent = phys_to_virt(dma_pte_addr(pte));
993                 total--;
994         }
995         return NULL;
996 }
997
998 /* clear last level pte; a tlb flush must follow */
999 static void dma_pte_clear_range(struct dmar_domain *domain,
1000                                 unsigned long start_pfn,
1001                                 unsigned long last_pfn)
1002 {
1003         unsigned int large_page;
1004         struct dma_pte *first_pte, *pte;
1005
1006         if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1007             WARN_ON(start_pfn > last_pfn))
1008                 return;
1009
1010         /* we don't need a lock here; nobody else touches the iova range */
1011         do {
1012                 large_page = 1;
1013                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1014                 if (!pte) {
1015                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1016                         continue;
1017                 }
1018                 do {
1019                         dma_clear_pte(pte);
1020                         start_pfn += lvl_to_nr_pages(large_page);
1021                         pte++;
1022                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1023
1024                 domain_flush_cache(domain, first_pte,
1025                                    (void *)pte - (void *)first_pte);
1026
1027         } while (start_pfn && start_pfn <= last_pfn);
1028 }
1029
1030 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1031                                int retain_level, struct dma_pte *pte,
1032                                unsigned long pfn, unsigned long start_pfn,
1033                                unsigned long last_pfn)
1034 {
1035         pfn = max(start_pfn, pfn);
1036         pte = &pte[pfn_level_offset(pfn, level)];
1037
1038         do {
1039                 unsigned long level_pfn;
1040                 struct dma_pte *level_pte;
1041
1042                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1043                         goto next;
1044
1045                 level_pfn = pfn & level_mask(level);
1046                 level_pte = phys_to_virt(dma_pte_addr(pte));
1047
1048                 if (level > 2) {
1049                         dma_pte_free_level(domain, level - 1, retain_level,
1050                                            level_pte, level_pfn, start_pfn,
1051                                            last_pfn);
1052                 }
1053
1054                 /*
1055                  * Free the page table if we're below the level we want to
1056                  * retain and the range covers the entire table.
1057                  */
1058                 if (level < retain_level && !(start_pfn > level_pfn ||
1059                       last_pfn < level_pfn + level_size(level) - 1)) {
1060                         dma_clear_pte(pte);
1061                         domain_flush_cache(domain, pte, sizeof(*pte));
1062                         free_pgtable_page(level_pte);
1063                 }
1064 next:
1065                 pfn += level_size(level);
1066         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1067 }
1068
1069 /*
1070  * clear last level (leaf) ptes and free page table pages below the
1071  * level we wish to keep intact.
1072  */
1073 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1074                                    unsigned long start_pfn,
1075                                    unsigned long last_pfn,
1076                                    int retain_level)
1077 {
1078         dma_pte_clear_range(domain, start_pfn, last_pfn);
1079
1080         /* We don't need a lock here; nobody else touches the iova range */
1081         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1082                            domain->pgd, 0, start_pfn, last_pfn);
1083
1084         /* free pgd */
1085         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1086                 free_pgtable_page(domain->pgd);
1087                 domain->pgd = NULL;
1088         }
1089 }
1090
1091 /* When a page at a given level is being unlinked from its parent, we don't
1092    need to *modify* it at all. All we need to do is make a list of all the
1093    pages which can be freed just as soon as we've flushed the IOTLB and we
1094    know the hardware page-walk will no longer touch them.
1095    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1096    be freed. */
1097 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1098                                     int level, struct dma_pte *pte,
1099                                     struct list_head *freelist)
1100 {
1101         struct page *pg;
1102
1103         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1104         list_add_tail(&pg->lru, freelist);
1105
1106         if (level == 1)
1107                 return;
1108
1109         pte = page_address(pg);
1110         do {
1111                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1112                         dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1113                 pte++;
1114         } while (!first_pte_in_page(pte));
1115 }
1116
1117 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1118                                 struct dma_pte *pte, unsigned long pfn,
1119                                 unsigned long start_pfn, unsigned long last_pfn,
1120                                 struct list_head *freelist)
1121 {
1122         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1123
1124         pfn = max(start_pfn, pfn);
1125         pte = &pte[pfn_level_offset(pfn, level)];
1126
1127         do {
1128                 unsigned long level_pfn = pfn & level_mask(level);
1129
1130                 if (!dma_pte_present(pte))
1131                         goto next;
1132
1133                 /* If range covers entire pagetable, free it */
1134                 if (start_pfn <= level_pfn &&
1135                     last_pfn >= level_pfn + level_size(level) - 1) {
1136                         /* These subordinate page tables are going away entirely. Don't
1137                            bother to clear them; we're just going to *free* them. */
1138                         if (level > 1 && !dma_pte_superpage(pte))
1139                                 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1140
1141                         dma_clear_pte(pte);
1142                         if (!first_pte)
1143                                 first_pte = pte;
1144                         last_pte = pte;
1145                 } else if (level > 1) {
1146                         /* Recurse down into a level that isn't *entirely* obsolete */
1147                         dma_pte_clear_level(domain, level - 1,
1148                                             phys_to_virt(dma_pte_addr(pte)),
1149                                             level_pfn, start_pfn, last_pfn,
1150                                             freelist);
1151                 }
1152 next:
1153                 pfn = level_pfn + level_size(level);
1154         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1155
1156         if (first_pte)
1157                 domain_flush_cache(domain, first_pte,
1158                                    (void *)++last_pte - (void *)first_pte);
1159 }
1160
1161 /* We can't just free the pages because the IOMMU may still be walking
1162    the page tables, and may have cached the intermediate levels. The
1163    pages can only be freed after the IOTLB flush has been done. */
1164 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1165                          unsigned long last_pfn, struct list_head *freelist)
1166 {
1167         if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1168             WARN_ON(start_pfn > last_pfn))
1169                 return;
1170
1171         /* we don't need a lock here; nobody else touches the iova range */
1172         dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1173                             domain->pgd, 0, start_pfn, last_pfn, freelist);
1174
1175         /* free pgd */
1176         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1177                 struct page *pgd_page = virt_to_page(domain->pgd);
1178                 list_add_tail(&pgd_page->lru, freelist);
1179                 domain->pgd = NULL;
1180         }
1181 }
1182
1183 /* iommu handling */
1184 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1185 {
1186         struct root_entry *root;
1187
1188         root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1189         if (!root) {
1190                 pr_err("Allocating root entry for %s failed\n",
1191                         iommu->name);
1192                 return -ENOMEM;
1193         }
1194
1195         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1196         iommu->root_entry = root;
1197
1198         return 0;
1199 }
1200
1201 static void iommu_set_root_entry(struct intel_iommu *iommu)
1202 {
1203         u64 addr;
1204         u32 sts;
1205         unsigned long flag;
1206
1207         addr = virt_to_phys(iommu->root_entry);
1208         if (sm_supported(iommu))
1209                 addr |= DMA_RTADDR_SMT;
1210
1211         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1212         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1213
1214         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1215
1216         /* Make sure the hardware completes it */
1217         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1218                       readl, (sts & DMA_GSTS_RTPS), sts);
1219
1220         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1221
1222         /*
1223          * Hardware invalidates all DMA remapping hardware translation
1224          * caches as part of SRTP flow.
1225          */
1226         if (cap_esrtps(iommu->cap))
1227                 return;
1228
1229         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1230         if (sm_supported(iommu))
1231                 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1232         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1233 }
1234
1235 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1236 {
1237         u32 val;
1238         unsigned long flag;
1239
1240         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1241                 return;
1242
1243         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1244         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1245
1246         /* Make sure the hardware completes it */
1247         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1248                       readl, (!(val & DMA_GSTS_WBFS)), val);
1249
1250         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1251 }
1252
1253 /* return value determines if we need a write buffer flush */
1254 static void __iommu_flush_context(struct intel_iommu *iommu,
1255                                   u16 did, u16 source_id, u8 function_mask,
1256                                   u64 type)
1257 {
1258         u64 val = 0;
1259         unsigned long flag;
1260
1261         switch (type) {
1262         case DMA_CCMD_GLOBAL_INVL:
1263                 val = DMA_CCMD_GLOBAL_INVL;
1264                 break;
1265         case DMA_CCMD_DOMAIN_INVL:
1266                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1267                 break;
1268         case DMA_CCMD_DEVICE_INVL:
1269                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1270                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1271                 break;
1272         default:
1273                 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1274                         iommu->name, type);
1275                 return;
1276         }
1277         val |= DMA_CCMD_ICC;
1278
1279         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1280         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1281
1282         /* Make sure the hardware completes it */
1283         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1284                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1285
1286         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1287 }
1288
1289 /* return value determines if we need a write buffer flush */
1290 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1291                                 u64 addr, unsigned int size_order, u64 type)
1292 {
1293         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1294         u64 val = 0, val_iva = 0;
1295         unsigned long flag;
1296
1297         switch (type) {
1298         case DMA_TLB_GLOBAL_FLUSH:
1299                 /* global flush doesn't need to set IVA_REG */
1300                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1301                 break;
1302         case DMA_TLB_DSI_FLUSH:
1303                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1304                 break;
1305         case DMA_TLB_PSI_FLUSH:
1306                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1307                 /* IH bit is passed in as part of address */
1308                 val_iva = size_order | addr;
1309                 break;
1310         default:
1311                 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1312                         iommu->name, type);
1313                 return;
1314         }
1315
1316         if (cap_write_drain(iommu->cap))
1317                 val |= DMA_TLB_WRITE_DRAIN;
1318
1319         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1320         /* Note: Only uses first TLB reg currently */
1321         if (val_iva)
1322                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1323         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1324
1325         /* Make sure the hardware completes it */
1326         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1327                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1328
1329         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1330
1331         /* check IOTLB invalidation granularity */
1332         if (DMA_TLB_IAIG(val) == 0)
1333                 pr_err("Flush IOTLB failed\n");
1334         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1335                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1336                         (unsigned long long)DMA_TLB_IIRG(type),
1337                         (unsigned long long)DMA_TLB_IAIG(val));
1338 }
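
/*
 * Worked example (illustrative): a page-selective flush of 16 pages at
 * IOVA 0x200000 for domain 5 programs the IVA register with
 * 0x200000 | 4 (address-mask order 4, since 2^4 = 16 pages) and the
 * IOTLB register with DMA_TLB_PSI_FLUSH | DMA_TLB_IVT | DMA_TLB_DID(5)
 * (plus write drain when the capability advertises it), then polls
 * until hardware clears the IVT bit.
 */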
1339
1340 static struct device_domain_info *
1341 domain_lookup_dev_info(struct dmar_domain *domain,
1342                        struct intel_iommu *iommu, u8 bus, u8 devfn)
1343 {
1344         struct device_domain_info *info;
1345         unsigned long flags;
1346
1347         spin_lock_irqsave(&domain->lock, flags);
1348         list_for_each_entry(info, &domain->devices, link) {
1349                 if (info->iommu == iommu && info->bus == bus &&
1350                     info->devfn == devfn) {
1351                         spin_unlock_irqrestore(&domain->lock, flags);
1352                         return info;
1353                 }
1354         }
1355         spin_unlock_irqrestore(&domain->lock, flags);
1356
1357         return NULL;
1358 }
1359
1360 static void domain_update_iotlb(struct dmar_domain *domain)
1361 {
1362         struct device_domain_info *info;
1363         bool has_iotlb_device = false;
1364         unsigned long flags;
1365
1366         spin_lock_irqsave(&domain->lock, flags);
1367         list_for_each_entry(info, &domain->devices, link) {
1368                 if (info->ats_enabled) {
1369                         has_iotlb_device = true;
1370                         break;
1371                 }
1372         }
1373         domain->has_iotlb_device = has_iotlb_device;
1374         spin_unlock_irqrestore(&domain->lock, flags);
1375 }
1376
1377 /*
1378  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1379  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1380  * check because it applies only to the built-in QAT devices and it doesn't
1381  * grant additional privileges.
1382  */
1383 #define BUGGY_QAT_DEVID_MASK 0x4940
1384 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1385 {
1386         if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1387                 return false;
1388
1389         if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1390                 return false;
1391
1392         return true;
1393 }
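
/*
 * Worked example (illustrative): device IDs 0x4940..0x4943 all satisfy
 * (id & 0xfffc) == 0x4940, e.g. 0x4942 & 0xfffc = 0x4940, while a
 * neighbouring ID such as 0x4944 yields 0x4944 and is not matched.
 */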
1394
1395 static void iommu_enable_pci_caps(struct device_domain_info *info)
1396 {
1397         struct pci_dev *pdev;
1398
1399         if (!dev_is_pci(info->dev))
1400                 return;
1401
1402         pdev = to_pci_dev(info->dev);
1403
1404         /* The PCIe spec, in its wisdom, declares that the behaviour of
1405            the device if you enable PASID support after ATS support is
1406            undefined. So always enable PASID support on devices which
1407            have it, even if we can't yet know if we're ever going to
1408            use it. */
1409         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1410                 info->pasid_enabled = 1;
1411
1412         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1413             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1414                 info->ats_enabled = 1;
1415                 domain_update_iotlb(info->domain);
1416         }
1417 }
1418
1419 static void iommu_disable_pci_caps(struct device_domain_info *info)
1420 {
1421         struct pci_dev *pdev;
1422
1423         if (!dev_is_pci(info->dev))
1424                 return;
1425
1426         pdev = to_pci_dev(info->dev);
1427
1428         if (info->ats_enabled) {
1429                 pci_disable_ats(pdev);
1430                 info->ats_enabled = 0;
1431                 domain_update_iotlb(info->domain);
1432         }
1433
1434         if (info->pasid_enabled) {
1435                 pci_disable_pasid(pdev);
1436                 info->pasid_enabled = 0;
1437         }
1438 }
1439
1440 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1441                                     u64 addr, unsigned int mask)
1442 {
1443         u16 sid, qdep;
1444
1445         if (!info || !info->ats_enabled)
1446                 return;
1447
1448         sid = info->bus << 8 | info->devfn;
1449         qdep = info->ats_qdep;
1450         qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1451                            qdep, addr, mask);
1452         quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep);
1453 }
1454
1455 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1456                                   u64 addr, unsigned mask)
1457 {
1458         struct device_domain_info *info;
1459         unsigned long flags;
1460
1461         if (!domain->has_iotlb_device)
1462                 return;
1463
1464         spin_lock_irqsave(&domain->lock, flags);
1465         list_for_each_entry(info, &domain->devices, link)
1466                 __iommu_flush_dev_iotlb(info, addr, mask);
1467         spin_unlock_irqrestore(&domain->lock, flags);
1468 }
1469
1470 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1471                                   struct dmar_domain *domain,
1472                                   unsigned long pfn, unsigned int pages,
1473                                   int ih, int map)
1474 {
1475         unsigned int aligned_pages = __roundup_pow_of_two(pages);
1476         unsigned int mask = ilog2(aligned_pages);
1477         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1478         u16 did = domain_id_iommu(domain, iommu);
1479
1480         if (WARN_ON(!pages))
1481                 return;
1482
1483         if (ih)
1484                 ih = 1 << 6;
1485
1486         if (domain->use_first_level) {
1487                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1488         } else {
1489                 unsigned long bitmask = aligned_pages - 1;
1490
1491                 /*
1492                  * PSI masks the low order bits of the base address. If the
1493                  * address isn't aligned to the mask, then compute a mask value
1494                  * needed to ensure the target range is flushed.
1495                  */
1496                 if (unlikely(bitmask & pfn)) {
1497                         unsigned long end_pfn = pfn + pages - 1, shared_bits;
1498
1499                         /*
1500                          * Since end_pfn <= pfn + bitmask, the only way bits
1501                          * higher than bitmask can differ in pfn and end_pfn is
1502                          * by carrying. This means after masking out bitmask,
1503                          * high bits starting with the first set bit in
1504                          * shared_bits are all equal in both pfn and end_pfn.
1505                          */
1506                         shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1507                         mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1508                 }
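                /*
                 * Worked example with illustrative values: pfn = 0x105 and
                 * pages = 8 give aligned_pages = 8, bitmask = 0x7 and an
                 * initial mask of 3. Since 0x105 & 0x7 != 0, end_pfn = 0x10c,
                 * pfn ^ end_pfn = 0x9 and shared_bits = ~0x9 & ~0x7, whose
                 * lowest set bit is bit 4, so mask becomes 4. A PSI with
                 * mask 4 covers pfns 0x100-0x10f, which contains the whole
                 * 0x105-0x10c range.
                 */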
1509
1510                 /*
1511                  * Fallback to domain selective flush if no PSI support or
1512                  * the size is too big.
1513                  */
1514                 if (!cap_pgsel_inv(iommu->cap) ||
1515                     mask > cap_max_amask_val(iommu->cap))
1516                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1517                                                         DMA_TLB_DSI_FLUSH);
1518                 else
1519                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1520                                                         DMA_TLB_PSI_FLUSH);
1521         }
1522
1523         /*
1524          * In caching mode, changes of pages from non-present to present
1525          * require a flush. However, the device IOTLB doesn't need to be flushed in this case.
1526          */
1527         if (!cap_caching_mode(iommu->cap) || !map)
1528                 iommu_flush_dev_iotlb(domain, addr, mask);
1529 }
1530
1531 /* Notification for newly created mappings */
1532 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1533                                         struct dmar_domain *domain,
1534                                         unsigned long pfn, unsigned int pages)
1535 {
1536         /*
1537          * It's a non-present to present mapping. Only flush if caching mode
1538          * is enabled and the domain uses second-level translation.
1539          */
1540         if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1541                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1542         else
1543                 iommu_flush_write_buffer(iommu);
1544 }
1545
1546 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1547 {
1548         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1549         struct iommu_domain_info *info;
1550         unsigned long idx;
1551
1552         xa_for_each(&dmar_domain->iommu_array, idx, info) {
1553                 struct intel_iommu *iommu = info->iommu;
1554                 u16 did = domain_id_iommu(dmar_domain, iommu);
1555
1556                 if (dmar_domain->use_first_level)
1557                         qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1558                 else
1559                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1560                                                  DMA_TLB_DSI_FLUSH);
1561
1562                 if (!cap_caching_mode(iommu->cap))
1563                         iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1564         }
1565 }
1566
1567 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1568 {
1569         u32 pmen;
1570         unsigned long flags;
1571
1572         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1573                 return;
1574
1575         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1576         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1577         pmen &= ~DMA_PMEN_EPM;
1578         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1579
1580         /* wait for the protected region status bit to clear */
1581         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1582                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1583
1584         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1585 }
1586
1587 static void iommu_enable_translation(struct intel_iommu *iommu)
1588 {
1589         u32 sts;
1590         unsigned long flags;
1591
1592         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1593         iommu->gcmd |= DMA_GCMD_TE;
1594         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1595
1596         /* Make sure the hardware completes it */
1597         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1598                       readl, (sts & DMA_GSTS_TES), sts);
1599
1600         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1601 }
1602
1603 static void iommu_disable_translation(struct intel_iommu *iommu)
1604 {
1605         u32 sts;
1606         unsigned long flag;
1607
1608         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1609             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1610                 return;
1611
1612         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1613         iommu->gcmd &= ~DMA_GCMD_TE;
1614         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1615
1616         /* Make sure the hardware completes it */
1617         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1618                       readl, (!(sts & DMA_GSTS_TES)), sts);
1619
1620         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1621 }
1622
1623 static int iommu_init_domains(struct intel_iommu *iommu)
1624 {
1625         u32 ndomains;
1626
1627         ndomains = cap_ndoms(iommu->cap);
1628         pr_debug("%s: Number of Domains supported <%d>\n",
1629                  iommu->name, ndomains);
1630
1631         spin_lock_init(&iommu->lock);
1632
1633         iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1634         if (!iommu->domain_ids)
1635                 return -ENOMEM;
1636
1637         /*
1638          * If Caching mode is set, then invalid translations are tagged
1639          * with domain-id 0, hence we need to pre-allocate it. We also
1640          * use domain-id 0 as a marker for non-allocated domain-id, so
1641          * make sure it is not used for a real domain.
1642          */
1643         set_bit(0, iommu->domain_ids);
1644
1645         /*
1646          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1647          * entry for first-level or pass-through translation modes
1648          * be programmed with a domain id different from those used for
1649          * second-level or nested translation. We reserve a domain id for
1650          * this purpose.
1651          */
1652         if (sm_supported(iommu))
1653                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1654
1655         return 0;
1656 }
1657
1658 static void disable_dmar_iommu(struct intel_iommu *iommu)
1659 {
1660         if (!iommu->domain_ids)
1661                 return;
1662
1663         /*
1664          * All iommu domains must have been detached from the devices,
1665          * hence there should be no domain IDs in use.
1666          */
1667         if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1668                     > NUM_RESERVED_DID))
1669                 return;
1670
1671         if (iommu->gcmd & DMA_GCMD_TE)
1672                 iommu_disable_translation(iommu);
1673 }
1674
1675 static void free_dmar_iommu(struct intel_iommu *iommu)
1676 {
1677         if (iommu->domain_ids) {
1678                 bitmap_free(iommu->domain_ids);
1679                 iommu->domain_ids = NULL;
1680         }
1681
1682         if (iommu->copied_tables) {
1683                 bitmap_free(iommu->copied_tables);
1684                 iommu->copied_tables = NULL;
1685         }
1686
1687         /* free context mapping */
1688         free_context_table(iommu);
1689
1690 #ifdef CONFIG_INTEL_IOMMU_SVM
1691         if (pasid_supported(iommu)) {
1692                 if (ecap_prs(iommu->ecap))
1693                         intel_svm_finish_prq(iommu);
1694         }
1695 #endif
1696 }
1697
1698 /*
1699  * Check and return whether first level is used by default for
1700  * DMA translation.
1701  */
1702 static bool first_level_by_default(unsigned int type)
1703 {
1704         /* Only SL is available in legacy mode */
1705         if (!scalable_mode_support())
1706                 return false;
1707
1708         /* Only one level (either FL or SL) is available, just use it */
1709         if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1710                 return intel_cap_flts_sanity();
1711
1712         /* Both levels are available, decide it based on domain type */
1713         return type != IOMMU_DOMAIN_UNMANAGED;
1714 }
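/*
 * For example (hypothetical capability mix): if first-level is usable on
 * every IOMMU but second-level is not, the XOR above is true and the
 * first-level capability decides on its own; only when both levels are
 * usable does the domain type matter.
 */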
1715
1716 static struct dmar_domain *alloc_domain(unsigned int type)
1717 {
1718         struct dmar_domain *domain;
1719
1720         domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1721         if (!domain)
1722                 return NULL;
1723
1724         domain->nid = NUMA_NO_NODE;
1725         if (first_level_by_default(type))
1726                 domain->use_first_level = true;
1727         domain->has_iotlb_device = false;
1728         INIT_LIST_HEAD(&domain->devices);
1729         spin_lock_init(&domain->lock);
1730         xa_init(&domain->iommu_array);
1731
1732         return domain;
1733 }
1734
1735 static int domain_attach_iommu(struct dmar_domain *domain,
1736                                struct intel_iommu *iommu)
1737 {
1738         struct iommu_domain_info *info, *curr;
1739         unsigned long ndomains;
1740         int num, ret = -ENOSPC;
1741
1742         info = kzalloc(sizeof(*info), GFP_KERNEL);
1743         if (!info)
1744                 return -ENOMEM;
1745
1746         spin_lock(&iommu->lock);
1747         curr = xa_load(&domain->iommu_array, iommu->seq_id);
1748         if (curr) {
1749                 curr->refcnt++;
1750                 spin_unlock(&iommu->lock);
1751                 kfree(info);
1752                 return 0;
1753         }
1754
1755         ndomains = cap_ndoms(iommu->cap);
1756         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1757         if (num >= ndomains) {
1758                 pr_err("%s: No free domain ids\n", iommu->name);
1759                 goto err_unlock;
1760         }
1761
1762         set_bit(num, iommu->domain_ids);
1763         info->refcnt    = 1;
1764         info->did       = num;
1765         info->iommu     = iommu;
1766         curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1767                           NULL, info, GFP_ATOMIC);
1768         if (curr) {
1769                 ret = xa_err(curr) ? : -EBUSY;
1770                 goto err_clear;
1771         }
1772         domain_update_iommu_cap(domain);
1773
1774         spin_unlock(&iommu->lock);
1775         return 0;
1776
1777 err_clear:
1778         clear_bit(info->did, iommu->domain_ids);
1779 err_unlock:
1780         spin_unlock(&iommu->lock);
1781         kfree(info);
1782         return ret;
1783 }
1784
1785 static void domain_detach_iommu(struct dmar_domain *domain,
1786                                 struct intel_iommu *iommu)
1787 {
1788         struct iommu_domain_info *info;
1789
1790         spin_lock(&iommu->lock);
1791         info = xa_load(&domain->iommu_array, iommu->seq_id);
1792         if (--info->refcnt == 0) {
1793                 clear_bit(info->did, iommu->domain_ids);
1794                 xa_erase(&domain->iommu_array, iommu->seq_id);
1795                 domain->nid = NUMA_NO_NODE;
1796                 domain_update_iommu_cap(domain);
1797                 kfree(info);
1798         }
1799         spin_unlock(&iommu->lock);
1800 }
1801
1802 static inline int guestwidth_to_adjustwidth(int gaw)
1803 {
1804         int agaw;
1805         int r = (gaw - 12) % 9;
1806
1807         if (r == 0)
1808                 agaw = gaw;
1809         else
1810                 agaw = gaw + 9 - r;
1811         if (agaw > 64)
1812                 agaw = 64;
1813         return agaw;
1814 }
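/*
 * Worked examples: gaw = 48 gives r = (48 - 12) % 9 = 0 and agaw = 48,
 * while gaw = 40 gives r = 1 and agaw = 40 + 9 - 1 = 48, i.e. the guest
 * width is rounded up to the next 9-bit page-table level boundary above
 * the 12-bit page offset, capped at 64.
 */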
1815
1816 static void domain_exit(struct dmar_domain *domain)
1817 {
1818         if (domain->pgd) {
1819                 LIST_HEAD(freelist);
1820
1821                 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1822                 put_pages_list(&freelist);
1823         }
1824
1825         if (WARN_ON(!list_empty(&domain->devices)))
1826                 return;
1827
1828         kfree(domain);
1829 }
1830
1831 /*
1832  * Get the PASID directory size for scalable mode context entry.
1833  * Value of X in the PDTS field of a scalable mode context entry
1834  * indicates PASID directory with 2^(X + 7) entries.
1835  */
1836 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1837 {
1838         unsigned long pds, max_pde;
1839
1840         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1841         pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1842         if (pds < 7)
1843                 return 0;
1844
1845         return pds - 7;
1846 }
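/*
 * Worked example, assuming PASID_PDE_SHIFT is 6: max_pasid = 1 << 20
 * gives max_pde = 1 << 14, find_first_bit() returns 14 and pds = 7, so
 * the PDTS field encodes a directory of 2^(7 + 7) = 2^14 entries, each
 * covering 64 PASIDs, i.e. the full 2^20 PASID space.
 */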
1847
1848 /*
1849  * Set the RID_PASID field of a scalable mode context entry. The
1850  * IOMMU hardware will use the PASID value set in this field for
1851  * translating DMA requests without PASID.
1852  */
1853 static inline void
1854 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1855 {
1856         context->hi |= pasid & ((1 << 20) - 1);
1857 }
1858
1859 /*
1860  * Set the DTE (Device-TLB Enable) field of a scalable mode context
1861  * entry.
1862  */
1863 static inline void context_set_sm_dte(struct context_entry *context)
1864 {
1865         context->lo |= BIT_ULL(2);
1866 }
1867
1868 /*
1869  * Set the PRE (Page Request Enable) field of a scalable mode context
1870  * entry.
1871  */
1872 static inline void context_set_sm_pre(struct context_entry *context)
1873 {
1874         context->lo |= BIT_ULL(4);
1875 }
1876
1877 /* Convert value to context PASID directory size field coding. */
1878 #define context_pdts(pds)       (((pds) & 0x7) << 9)
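/*
 * For example, pds = 2 (a PASID directory of 2^(2 + 7) = 512 entries) is
 * encoded as 0x2 << 9, i.e. the three-bit value lands in bits 11:9 of the
 * low qword of the scalable mode context entry.
 */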
1879
1880 static int domain_context_mapping_one(struct dmar_domain *domain,
1881                                       struct intel_iommu *iommu,
1882                                       struct pasid_table *table,
1883                                       u8 bus, u8 devfn)
1884 {
1885         struct device_domain_info *info =
1886                         domain_lookup_dev_info(domain, iommu, bus, devfn);
1887         u16 did = domain_id_iommu(domain, iommu);
1888         int translation = CONTEXT_TT_MULTI_LEVEL;
1889         struct context_entry *context;
1890         int ret;
1891
1892         if (hw_pass_through && domain_type_is_si(domain))
1893                 translation = CONTEXT_TT_PASS_THROUGH;
1894
1895         pr_debug("Set context mapping for %02x:%02x.%d\n",
1896                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1897
1898         spin_lock(&iommu->lock);
1899         ret = -ENOMEM;
1900         context = iommu_context_addr(iommu, bus, devfn, 1);
1901         if (!context)
1902                 goto out_unlock;
1903
1904         ret = 0;
1905         if (context_present(context) && !context_copied(iommu, bus, devfn))
1906                 goto out_unlock;
1907
1908         /*
1909          * For kdump cases, old valid entries may be cached due to
1910          * in-flight DMA and the copied pgtable, but there is no
1911          * unmapping path for them, so we need an explicit cache flush
1912          * for the newly-mapped device. At this point the device is
1913          * expected to have finished reset during its driver probe, so
1914          * no in-flight DMA will exist and we don't need to worry
1915          * about it hereafter.
1916          */
1917         if (context_copied(iommu, bus, devfn)) {
1918                 u16 did_old = context_domain_id(context);
1919
1920                 if (did_old < cap_ndoms(iommu->cap)) {
1921                         iommu->flush.flush_context(iommu, did_old,
1922                                                    (((u16)bus) << 8) | devfn,
1923                                                    DMA_CCMD_MASK_NOBIT,
1924                                                    DMA_CCMD_DEVICE_INVL);
1925                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1926                                                  DMA_TLB_DSI_FLUSH);
1927                 }
1928
1929                 clear_context_copied(iommu, bus, devfn);
1930         }
1931
1932         context_clear_entry(context);
1933
1934         if (sm_supported(iommu)) {
1935                 unsigned long pds;
1936
1937                 /* Setup the PASID DIR pointer: */
1938                 pds = context_get_sm_pds(table);
1939                 context->lo = (u64)virt_to_phys(table->table) |
1940                                 context_pdts(pds);
1941
1942                 /* Setup the RID_PASID field: */
1943                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
1944
1945                 /*
1946                  * Setup the Device-TLB Enable bit and Page Request
1947                  * Enable bit:
1948                  */
1949                 if (info && info->ats_supported)
1950                         context_set_sm_dte(context);
1951                 if (info && info->pri_supported)
1952                         context_set_sm_pre(context);
1953                 if (info && info->pasid_supported)
1954                         context_set_pasid(context);
1955         } else {
1956                 struct dma_pte *pgd = domain->pgd;
1957                 int agaw;
1958
1959                 context_set_domain_id(context, did);
1960
1961                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1962                         /*
1963                          * Skip top levels of page tables for iommu which has
1964                          * less agaw than default. Unnecessary for PT mode.
1965                          */
1966                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1967                                 ret = -ENOMEM;
1968                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1969                                 if (!dma_pte_present(pgd))
1970                                         goto out_unlock;
1971                         }
1972
1973                         if (info && info->ats_supported)
1974                                 translation = CONTEXT_TT_DEV_IOTLB;
1975                         else
1976                                 translation = CONTEXT_TT_MULTI_LEVEL;
1977
1978                         context_set_address_root(context, virt_to_phys(pgd));
1979                         context_set_address_width(context, agaw);
1980                 } else {
1981                         /*
1982                          * In pass through mode, AW must be programmed to
1983                          * indicate the largest AGAW value supported by
1984                          * hardware. And ASR is ignored by hardware.
1985                          */
1986                         context_set_address_width(context, iommu->msagaw);
1987                 }
1988
1989                 context_set_translation_type(context, translation);
1990         }
1991
1992         context_set_fault_enable(context);
1993         context_set_present(context);
1994         if (!ecap_coherent(iommu->ecap))
1995                 clflush_cache_range(context, sizeof(*context));
1996
1997         /*
1998          * It's a non-present to present mapping. If the hardware doesn't
1999          * cache non-present entries we only need to flush the write-buffer.
2000          * If it _does_ cache non-present entries, then it does so in the
2001          * special domain #0, which we have to flush:
2002          */
2003         if (cap_caching_mode(iommu->cap)) {
2004                 iommu->flush.flush_context(iommu, 0,
2005                                            (((u16)bus) << 8) | devfn,
2006                                            DMA_CCMD_MASK_NOBIT,
2007                                            DMA_CCMD_DEVICE_INVL);
2008                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2009         } else {
2010                 iommu_flush_write_buffer(iommu);
2011         }
2012
2013         ret = 0;
2014
2015 out_unlock:
2016         spin_unlock(&iommu->lock);
2017
2018         return ret;
2019 }
2020
2021 struct domain_context_mapping_data {
2022         struct dmar_domain *domain;
2023         struct intel_iommu *iommu;
2024         struct pasid_table *table;
2025 };
2026
2027 static int domain_context_mapping_cb(struct pci_dev *pdev,
2028                                      u16 alias, void *opaque)
2029 {
2030         struct domain_context_mapping_data *data = opaque;
2031
2032         return domain_context_mapping_one(data->domain, data->iommu,
2033                                           data->table, PCI_BUS_NUM(alias),
2034                                           alias & 0xff);
2035 }
2036
2037 static int
2038 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2039 {
2040         struct domain_context_mapping_data data;
2041         struct pasid_table *table;
2042         struct intel_iommu *iommu;
2043         u8 bus, devfn;
2044
2045         iommu = device_to_iommu(dev, &bus, &devfn);
2046         if (!iommu)
2047                 return -ENODEV;
2048
2049         table = intel_pasid_get_table(dev);
2050
2051         if (!dev_is_pci(dev))
2052                 return domain_context_mapping_one(domain, iommu, table,
2053                                                   bus, devfn);
2054
2055         data.domain = domain;
2056         data.iommu = iommu;
2057         data.table = table;
2058
2059         return pci_for_each_dma_alias(to_pci_dev(dev),
2060                                       &domain_context_mapping_cb, &data);
2061 }
2062
2063 /* Returns the number of VTD pages, but aligned to the MM page size */
2064 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2065                                             size_t size)
2066 {
2067         host_addr &= ~PAGE_MASK;
2068         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2069 }
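/*
 * For example, with 4KiB MM and VT-d pages, host_addr = 0x1234 and
 * size = 0x2000 leave an offset of 0x234; PAGE_ALIGN(0x2234) = 0x3000,
 * so three VT-d pages are needed even though the size alone is only two
 * pages.
 */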
2070
2071 /* Return largest possible superpage level for a given mapping */
2072 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2073                                           unsigned long iov_pfn,
2074                                           unsigned long phy_pfn,
2075                                           unsigned long pages)
2076 {
2077         int support, level = 1;
2078         unsigned long pfnmerge;
2079
2080         support = domain->iommu_superpage;
2081
2082         /* To use a large page, the virtual *and* physical addresses
2083            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2084            of them will mean we have to use smaller pages. So just
2085            merge them and check both at once. */
2086         pfnmerge = iov_pfn | phy_pfn;
2087
2088         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2089                 pages >>= VTD_STRIDE_SHIFT;
2090                 if (!pages)
2091                         break;
2092                 pfnmerge >>= VTD_STRIDE_SHIFT;
2093                 level++;
2094                 support--;
2095         }
2096         return level;
2097 }
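/*
 * Illustrative values, assuming the domain supports both 2MiB and 1GiB
 * superpages: iov_pfn = 0x200, phy_pfn = 0x400 and pages = 1024 give
 * pfnmerge = 0x600, whose low nine bits are clear, so the first loop
 * iteration raises the level to 2 (2MiB pages); the next iteration
 * stops because the shifted pfnmerge (0x3) is no longer stride-aligned,
 * so level 2 is returned rather than 3.
 */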
2098
2099 /*
2100  * Ensure that old small page tables are removed to make room for superpage(s).
2101  * We're going to add new large pages, so make sure we don't remove their parent
2102  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2103  */
2104 static void switch_to_super_page(struct dmar_domain *domain,
2105                                  unsigned long start_pfn,
2106                                  unsigned long end_pfn, int level)
2107 {
2108         unsigned long lvl_pages = lvl_to_nr_pages(level);
2109         struct iommu_domain_info *info;
2110         struct dma_pte *pte = NULL;
2111         unsigned long i;
2112
2113         while (start_pfn <= end_pfn) {
2114                 if (!pte)
2115                         pte = pfn_to_dma_pte(domain, start_pfn, &level,
2116                                              GFP_ATOMIC);
2117
2118                 if (dma_pte_present(pte)) {
2119                         dma_pte_free_pagetable(domain, start_pfn,
2120                                                start_pfn + lvl_pages - 1,
2121                                                level + 1);
2122
2123                         xa_for_each(&domain->iommu_array, i, info)
2124                                 iommu_flush_iotlb_psi(info->iommu, domain,
2125                                                       start_pfn, lvl_pages,
2126                                                       0, 0);
2127                 }
2128
2129                 pte++;
2130                 start_pfn += lvl_pages;
2131                 if (first_pte_in_page(pte))
2132                         pte = NULL;
2133         }
2134 }
2135
2136 static int
2137 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2138                  unsigned long phys_pfn, unsigned long nr_pages, int prot,
2139                  gfp_t gfp)
2140 {
2141         struct dma_pte *first_pte = NULL, *pte = NULL;
2142         unsigned int largepage_lvl = 0;
2143         unsigned long lvl_pages = 0;
2144         phys_addr_t pteval;
2145         u64 attr;
2146
2147         if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2148                 return -EINVAL;
2149
2150         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2151                 return -EINVAL;
2152
2153         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2154         attr |= DMA_FL_PTE_PRESENT;
2155         if (domain->use_first_level) {
2156                 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2157                 if (prot & DMA_PTE_WRITE)
2158                         attr |= DMA_FL_PTE_DIRTY;
2159         }
2160
2161         pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2162
2163         while (nr_pages > 0) {
2164                 uint64_t tmp;
2165
2166                 if (!pte) {
2167                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2168                                         phys_pfn, nr_pages);
2169
2170                         pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2171                                              gfp);
2172                         if (!pte)
2173                                 return -ENOMEM;
2174                         first_pte = pte;
2175
2176                         lvl_pages = lvl_to_nr_pages(largepage_lvl);
2177
2178                         /* It is a large page */
2179                         if (largepage_lvl > 1) {
2180                                 unsigned long end_pfn;
2181                                 unsigned long pages_to_remove;
2182
2183                                 pteval |= DMA_PTE_LARGE_PAGE;
2184                                 pages_to_remove = min_t(unsigned long, nr_pages,
2185                                                         nr_pte_to_next_page(pte) * lvl_pages);
2186                                 end_pfn = iov_pfn + pages_to_remove - 1;
2187                                 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2188                         } else {
2189                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2190                         }
2191
2192                 }
2193                 /* We don't need a lock here; nobody else
2194                  * touches the iova range.
2195                  */
2196                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2197                 if (tmp) {
2198                         static int dumps = 5;
2199                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2200                                 iov_pfn, tmp, (unsigned long long)pteval);
2201                         if (dumps) {
2202                                 dumps--;
2203                                 debug_dma_dump_mappings(NULL);
2204                         }
2205                         WARN_ON(1);
2206                 }
2207
2208                 nr_pages -= lvl_pages;
2209                 iov_pfn += lvl_pages;
2210                 phys_pfn += lvl_pages;
2211                 pteval += lvl_pages * VTD_PAGE_SIZE;
2212
2213                 /* If the next PTE would be the first in a new page, then we
2214                  * need to flush the cache on the entries we've just written.
2215                  * And then we'll need to recalculate 'pte', so clear it and
2216                  * let it get set again in the if (!pte) block above.
2217                  *
2218                  * If we're done (!nr_pages) we need to flush the cache too.
2219                  *
2220                  * Also if we've been setting superpages, we may need to
2221                  * recalculate 'pte' and switch back to smaller pages for the
2222                  * end of the mapping, if the trailing size is not enough to
2223                  * use another superpage (i.e. nr_pages < lvl_pages).
2224                  */
2225                 pte++;
2226                 if (!nr_pages || first_pte_in_page(pte) ||
2227                     (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2228                         domain_flush_cache(domain, first_pte,
2229                                            (void *)pte - (void *)first_pte);
2230                         pte = NULL;
2231                 }
2232         }
2233
2234         return 0;
2235 }
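/*
 * Illustrative walk-through: mapping 1028 pages at a 2MiB-aligned
 * iov_pfn/phys_pfn pair (with 2MiB superpage support) selects
 * largepage_lvl = 2, so two 512-page superpage PTEs are written first;
 * once nr_pages drops to 4 (< lvl_pages) the cache is flushed, pte is
 * recomputed at level 1, and the remaining four pages are mapped with
 * ordinary 4KiB PTEs.
 */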
2236
2237 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2238 {
2239         struct intel_iommu *iommu = info->iommu;
2240         struct context_entry *context;
2241         u16 did_old;
2242
2243         if (!iommu)
2244                 return;
2245
2246         spin_lock(&iommu->lock);
2247         context = iommu_context_addr(iommu, bus, devfn, 0);
2248         if (!context) {
2249                 spin_unlock(&iommu->lock);
2250                 return;
2251         }
2252
2253         if (sm_supported(iommu)) {
2254                 if (hw_pass_through && domain_type_is_si(info->domain))
2255                         did_old = FLPT_DEFAULT_DID;
2256                 else
2257                         did_old = domain_id_iommu(info->domain, iommu);
2258         } else {
2259                 did_old = context_domain_id(context);
2260         }
2261
2262         context_clear_entry(context);
2263         __iommu_flush_cache(iommu, context, sizeof(*context));
2264         spin_unlock(&iommu->lock);
2265         iommu->flush.flush_context(iommu,
2266                                    did_old,
2267                                    (((u16)bus) << 8) | devfn,
2268                                    DMA_CCMD_MASK_NOBIT,
2269                                    DMA_CCMD_DEVICE_INVL);
2270
2271         if (sm_supported(iommu))
2272                 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2273
2274         iommu->flush.flush_iotlb(iommu,
2275                                  did_old,
2276                                  0,
2277                                  0,
2278                                  DMA_TLB_DSI_FLUSH);
2279
2280         __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2281 }
2282
2283 static int domain_setup_first_level(struct intel_iommu *iommu,
2284                                     struct dmar_domain *domain,
2285                                     struct device *dev,
2286                                     u32 pasid)
2287 {
2288         struct dma_pte *pgd = domain->pgd;
2289         int agaw, level;
2290         int flags = 0;
2291
2292         /*
2293          * Skip top levels of page tables for iommu which has
2294          * less agaw than default. Unnecessary for PT mode.
2295          */
2296         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2297                 pgd = phys_to_virt(dma_pte_addr(pgd));
2298                 if (!dma_pte_present(pgd))
2299                         return -ENOMEM;
2300         }
2301
2302         level = agaw_to_level(agaw);
2303         if (level != 4 && level != 5)
2304                 return -EINVAL;
2305
2306         if (level == 5)
2307                 flags |= PASID_FLAG_FL5LP;
2308
2309         if (domain->force_snooping)
2310                 flags |= PASID_FLAG_PAGE_SNOOP;
2311
2312         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2313                                              domain_id_iommu(domain, iommu),
2314                                              flags);
2315 }
2316
2317 static bool dev_is_real_dma_subdevice(struct device *dev)
2318 {
2319         return dev && dev_is_pci(dev) &&
2320                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2321 }
2322
2323 static int iommu_domain_identity_map(struct dmar_domain *domain,
2324                                      unsigned long first_vpfn,
2325                                      unsigned long last_vpfn)
2326 {
2327         /*
2328          * The RMRR range might overlap with the physical memory
2329          * range, so clear it first.
2330          */
2331         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2332
2333         return __domain_mapping(domain, first_vpfn,
2334                                 first_vpfn, last_vpfn - first_vpfn + 1,
2335                                 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2336 }
2337
2338 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2339
2340 static int __init si_domain_init(int hw)
2341 {
2342         struct dmar_rmrr_unit *rmrr;
2343         struct device *dev;
2344         int i, nid, ret;
2345
2346         si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2347         if (!si_domain)
2348                 return -EFAULT;
2349
2350         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2351                 domain_exit(si_domain);
2352                 si_domain = NULL;
2353                 return -EFAULT;
2354         }
2355
2356         if (hw)
2357                 return 0;
2358
2359         for_each_online_node(nid) {
2360                 unsigned long start_pfn, end_pfn;
2361                 int i;
2362
2363                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2364                         ret = iommu_domain_identity_map(si_domain,
2365                                         mm_to_dma_pfn(start_pfn),
2366                                         mm_to_dma_pfn(end_pfn));
2367                         if (ret)
2368                                 return ret;
2369                 }
2370         }
2371
2372         /*
2373          * Identity map the RMRRs so that devices with RMRRs could also use
2374          * the si_domain.
2375          */
2376         for_each_rmrr_units(rmrr) {
2377                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2378                                           i, dev) {
2379                         unsigned long long start = rmrr->base_address;
2380                         unsigned long long end = rmrr->end_address;
2381
2382                         if (WARN_ON(end < start ||
2383                                     end >> agaw_to_width(si_domain->agaw)))
2384                                 continue;
2385
2386                         ret = iommu_domain_identity_map(si_domain,
2387                                         mm_to_dma_pfn(start >> PAGE_SHIFT),
2388                                         mm_to_dma_pfn(end >> PAGE_SHIFT));
2389                         if (ret)
2390                                 return ret;
2391                 }
2392         }
2393
2394         return 0;
2395 }
2396
2397 static int dmar_domain_attach_device(struct dmar_domain *domain,
2398                                      struct device *dev)
2399 {
2400         struct device_domain_info *info = dev_iommu_priv_get(dev);
2401         struct intel_iommu *iommu;
2402         unsigned long flags;
2403         u8 bus, devfn;
2404         int ret;
2405
2406         iommu = device_to_iommu(dev, &bus, &devfn);
2407         if (!iommu)
2408                 return -ENODEV;
2409
2410         ret = domain_attach_iommu(domain, iommu);
2411         if (ret)
2412                 return ret;
2413         info->domain = domain;
2414         spin_lock_irqsave(&domain->lock, flags);
2415         list_add(&info->link, &domain->devices);
2416         spin_unlock_irqrestore(&domain->lock, flags);
2417
2418         /* PASID table is mandatory for a PCI device in scalable mode. */
2419         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2420                 /* Setup the PASID entry for requests without PASID: */
2421                 if (hw_pass_through && domain_type_is_si(domain))
2422                         ret = intel_pasid_setup_pass_through(iommu, domain,
2423                                         dev, PASID_RID2PASID);
2424                 else if (domain->use_first_level)
2425                         ret = domain_setup_first_level(iommu, domain, dev,
2426                                         PASID_RID2PASID);
2427                 else
2428                         ret = intel_pasid_setup_second_level(iommu, domain,
2429                                         dev, PASID_RID2PASID);
2430                 if (ret) {
2431                         dev_err(dev, "Setup RID2PASID failed\n");
2432                         device_block_translation(dev);
2433                         return ret;
2434                 }
2435         }
2436
2437         ret = domain_context_mapping(domain, dev);
2438         if (ret) {
2439                 dev_err(dev, "Domain context map failed\n");
2440                 device_block_translation(dev);
2441                 return ret;
2442         }
2443
2444         iommu_enable_pci_caps(info);
2445
2446         return 0;
2447 }
2448
2449 static bool device_has_rmrr(struct device *dev)
2450 {
2451         struct dmar_rmrr_unit *rmrr;
2452         struct device *tmp;
2453         int i;
2454
2455         rcu_read_lock();
2456         for_each_rmrr_units(rmrr) {
2457                 /*
2458                  * Return TRUE if this RMRR contains the device that
2459                  * is passed in.
2460                  */
2461                 for_each_active_dev_scope(rmrr->devices,
2462                                           rmrr->devices_cnt, i, tmp)
2463                         if (tmp == dev ||
2464                             is_downstream_to_pci_bridge(dev, tmp)) {
2465                                 rcu_read_unlock();
2466                                 return true;
2467                         }
2468         }
2469         rcu_read_unlock();
2470         return false;
2471 }
2472
2473 /**
2474  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2475  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2476  * @dev: device handle
2477  *
2478  * We assume that PCI USB devices with RMRRs have them largely
2479  * for historical reasons and that the RMRR space is not actively used post
2480  * boot.  This exclusion may change if vendors begin to abuse it.
2481  *
2482  * The same exception is made for graphics devices, with the requirement that
2483  * any use of the RMRR regions will be torn down before assigning the device
2484  * to a guest.
2485  *
2486  * Return: true if the RMRR is relaxable, false otherwise
2487  */
2488 static bool device_rmrr_is_relaxable(struct device *dev)
2489 {
2490         struct pci_dev *pdev;
2491
2492         if (!dev_is_pci(dev))
2493                 return false;
2494
2495         pdev = to_pci_dev(dev);
2496         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2497                 return true;
2498         else
2499                 return false;
2500 }
2501
2502 /*
2503  * There are a couple of cases where we need to restrict the functionality of
2504  * devices associated with RMRRs.  The first is when evaluating a device for
2505  * identity mapping because problems exist when devices are moved in and out
2506  * of domains and their respective RMRR information is lost.  This means that
2507  * a device with associated RMRRs will never be in a "passthrough" domain.
2508  * The second is use of the device through the IOMMU API.  This interface
2509  * expects to have full control of the IOVA space for the device.  We cannot
2510  * satisfy both the requirement that RMRR access is maintained and have an
2511  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2512  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2513  * We therefore prevent devices associated with an RMRR from participating in
2514  * the IOMMU API, which eliminates them from device assignment.
2515  *
2516  * In both cases, devices which have relaxable RMRRs are not concerned by this
2517  * restriction. See device_rmrr_is_relaxable comment.
2518  */
2519 static bool device_is_rmrr_locked(struct device *dev)
2520 {
2521         if (!device_has_rmrr(dev))
2522                 return false;
2523
2524         if (device_rmrr_is_relaxable(dev))
2525                 return false;
2526
2527         return true;
2528 }
2529
2530 /*
2531  * Return the required default domain type for a specific device.
2532  *
2533  * @dev: the device in question
2535  *
2536  * Returns:
2537  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2538  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2539  *  - 0: both identity and dynamic domains work for this device
2540  */
2541 static int device_def_domain_type(struct device *dev)
2542 {
2543         if (dev_is_pci(dev)) {
2544                 struct pci_dev *pdev = to_pci_dev(dev);
2545
2546                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2547                         return IOMMU_DOMAIN_IDENTITY;
2548
2549                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2550                         return IOMMU_DOMAIN_IDENTITY;
2551         }
2552
2553         return 0;
2554 }
2555
2556 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2557 {
2558         /*
2559          * Start from a sane iommu hardware state.
2560          * If queued invalidation was already initialized by us
2561          * (for example, while enabling interrupt-remapping) then
2562          * things are already rolling from a sane state.
2563          */
2564         if (!iommu->qi) {
2565                 /*
2566                  * Clear any previous faults.
2567                  */
2568                 dmar_fault(-1, iommu);
2569                 /*
2570                  * Disable queued invalidation if supported and already enabled
2571                  * before OS handover.
2572                  */
2573                 dmar_disable_qi(iommu);
2574         }
2575
2576         if (dmar_enable_qi(iommu)) {
2577                 /*
2578                  * Queued invalidation is not enabled, use register-based invalidation
2579                  */
2580                 iommu->flush.flush_context = __iommu_flush_context;
2581                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2582                 pr_info("%s: Using Register based invalidation\n",
2583                         iommu->name);
2584         } else {
2585                 iommu->flush.flush_context = qi_flush_context;
2586                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2587                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2588         }
2589 }
2590
2591 static int copy_context_table(struct intel_iommu *iommu,
2592                               struct root_entry *old_re,
2593                               struct context_entry **tbl,
2594                               int bus, bool ext)
2595 {
2596         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2597         struct context_entry *new_ce = NULL, ce;
2598         struct context_entry *old_ce = NULL;
2599         struct root_entry re;
2600         phys_addr_t old_ce_phys;
2601
2602         tbl_idx = ext ? bus * 2 : bus;
2603         memcpy(&re, old_re, sizeof(re));
2604
2605         for (devfn = 0; devfn < 256; devfn++) {
2606                 /* First calculate the correct index */
2607                 idx = (ext ? devfn * 2 : devfn) % 256;
2608
2609                 if (idx == 0) {
2610                         /* First save what we may have and clean up */
2611                         if (new_ce) {
2612                                 tbl[tbl_idx] = new_ce;
2613                                 __iommu_flush_cache(iommu, new_ce,
2614                                                     VTD_PAGE_SIZE);
2615                                 pos = 1;
2616                         }
2617
2618                         if (old_ce)
2619                                 memunmap(old_ce);
2620
2621                         ret = 0;
2622                         if (devfn < 0x80)
2623                                 old_ce_phys = root_entry_lctp(&re);
2624                         else
2625                                 old_ce_phys = root_entry_uctp(&re);
2626
2627                         if (!old_ce_phys) {
2628                                 if (ext && devfn == 0) {
2629                                         /* No LCTP, try UCTP */
2630                                         devfn = 0x7f;
2631                                         continue;
2632                                 } else {
2633                                         goto out;
2634                                 }
2635                         }
2636
2637                         ret = -ENOMEM;
2638                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2639                                         MEMREMAP_WB);
2640                         if (!old_ce)
2641                                 goto out;
2642
2643                         new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2644                         if (!new_ce)
2645                                 goto out_unmap;
2646
2647                         ret = 0;
2648                 }
2649
2650                 /* Now copy the context entry */
2651                 memcpy(&ce, old_ce + idx, sizeof(ce));
2652
2653                 if (!context_present(&ce))
2654                         continue;
2655
2656                 did = context_domain_id(&ce);
2657                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2658                         set_bit(did, iommu->domain_ids);
2659
2660                 set_context_copied(iommu, bus, devfn);
2661                 new_ce[idx] = ce;
2662         }
2663
2664         tbl[tbl_idx + pos] = new_ce;
2665
2666         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2667
2668 out_unmap:
2669         memunmap(old_ce);
2670
2671 out:
2672         return ret;
2673 }
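/*
 * Layout implied by the indexing above: in extended mode every source
 * bus is backed by two context tables, one reached through the lower
 * and one through the upper context-table pointer of the old root
 * entry, so devfn 0x00-0x7f land in tbl[bus * 2] and devfn 0x80-0xff
 * in tbl[bus * 2 + 1], with idx = (devfn * 2) % 256 stepping two
 * context_entry slots per device within each table.
 */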
2674
2675 static int copy_translation_tables(struct intel_iommu *iommu)
2676 {
2677         struct context_entry **ctxt_tbls;
2678         struct root_entry *old_rt;
2679         phys_addr_t old_rt_phys;
2680         int ctxt_table_entries;
2681         u64 rtaddr_reg;
2682         int bus, ret;
2683         bool new_ext, ext;
2684
2685         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2686         ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2687         new_ext    = !!sm_supported(iommu);
2688
2689         /*
2690          * The RTT bit can only be changed when translation is disabled,
2691          * but disabling translation would open a window for data
2692          * corruption. So bail out and don't copy anything if we would
2693          * have to change the bit.
2694          */
2695         if (new_ext != ext)
2696                 return -EINVAL;
2697
2698         iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2699         if (!iommu->copied_tables)
2700                 return -ENOMEM;
2701
2702         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2703         if (!old_rt_phys)
2704                 return -EINVAL;
2705
2706         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2707         if (!old_rt)
2708                 return -ENOMEM;
2709
2710         /* This is too big for the stack - allocate it from slab */
2711         ctxt_table_entries = ext ? 512 : 256;
2712         ret = -ENOMEM;
2713         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2714         if (!ctxt_tbls)
2715                 goto out_unmap;
2716
2717         for (bus = 0; bus < 256; bus++) {
2718                 ret = copy_context_table(iommu, &old_rt[bus],
2719                                          ctxt_tbls, bus, ext);
2720                 if (ret) {
2721                         pr_err("%s: Failed to copy context table for bus %d\n",
2722                                 iommu->name, bus);
2723                         continue;
2724                 }
2725         }
2726
2727         spin_lock(&iommu->lock);
2728
2729         /* Context tables are copied, now write them to the root_entry table */
2730         for (bus = 0; bus < 256; bus++) {
2731                 int idx = ext ? bus * 2 : bus;
2732                 u64 val;
2733
2734                 if (ctxt_tbls[idx]) {
2735                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
2736                         iommu->root_entry[bus].lo = val;
2737                 }
2738
2739                 if (!ext || !ctxt_tbls[idx + 1])
2740                         continue;
2741
2742                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2743                 iommu->root_entry[bus].hi = val;
2744         }
2745
2746         spin_unlock(&iommu->lock);
2747
2748         kfree(ctxt_tbls);
2749
2750         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2751
2752         ret = 0;
2753
2754 out_unmap:
2755         memunmap(old_rt);
2756
2757         return ret;
2758 }
2759
2760 static int __init init_dmars(void)
2761 {
2762         struct dmar_drhd_unit *drhd;
2763         struct intel_iommu *iommu;
2764         int ret;
2765
2766         ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2767         if (ret)
2768                 goto free_iommu;
2769
2770         for_each_iommu(iommu, drhd) {
2771                 if (drhd->ignored) {
2772                         iommu_disable_translation(iommu);
2773                         continue;
2774                 }
2775
2776                 /*
2777                  * Find the max pasid size of all IOMMUs in the system.
2778                  * We need to ensure the system pasid table is no bigger
2779                  * than the smallest supported size.
2780                  */
2781                 if (pasid_supported(iommu)) {
2782                         u32 temp = 2 << ecap_pss(iommu->ecap);
2783
2784                         intel_pasid_max_id = min_t(u32, temp,
2785                                                    intel_pasid_max_id);
2786                 }
2787
2788                 intel_iommu_init_qi(iommu);
2789
2790                 ret = iommu_init_domains(iommu);
2791                 if (ret)
2792                         goto free_iommu;
2793
2794                 init_translation_status(iommu);
2795
2796                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2797                         iommu_disable_translation(iommu);
2798                         clear_translation_pre_enabled(iommu);
2799                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2800                                 iommu->name);
2801                 }
2802
2803                 /*
2804                  * TBD:
2805                  * we could share the same root & context tables
2806                  * among all IOMMUs. Need to split it later.
2807                  */
2808                 ret = iommu_alloc_root_entry(iommu);
2809                 if (ret)
2810                         goto free_iommu;
2811
2812                 if (translation_pre_enabled(iommu)) {
2813                         pr_info("Translation already enabled - trying to copy translation structures\n");
2814
2815                         ret = copy_translation_tables(iommu);
2816                         if (ret) {
2817                                 /*
2818                                  * We found the IOMMU with translation
2819                                  * enabled - but failed to copy over the
2820                                  * old root-entry table. Try to proceed
2821                                  * by disabling translation now and
2822                                  * allocating a clean root-entry table.
2823                                  * This might cause DMAR faults, but
2824                                  * probably the dump will still succeed.
2825                                  */
2826                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2827                                        iommu->name);
2828                                 iommu_disable_translation(iommu);
2829                                 clear_translation_pre_enabled(iommu);
2830                         } else {
2831                                 pr_info("Copied translation tables from previous kernel for %s\n",
2832                                         iommu->name);
2833                         }
2834                 }
2835
2836                 if (!ecap_pass_through(iommu->ecap))
2837                         hw_pass_through = 0;
2838                 intel_svm_check(iommu);
2839         }
2840
2841         /*
2842          * Now that qi is enabled on all iommus, set the root entry and flush
2843          * caches. This is required on some Intel X58 chipsets; otherwise the
2844          * flush_context function will loop forever and the boot hangs.
2845          */
2846         for_each_active_iommu(iommu, drhd) {
2847                 iommu_flush_write_buffer(iommu);
2848                 iommu_set_root_entry(iommu);
2849         }
2850
2851 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2852         dmar_map_gfx = 0;
2853 #endif
2854
2855         if (!dmar_map_gfx)
2856                 iommu_identity_mapping |= IDENTMAP_GFX;
2857
2858         check_tylersburg_isoch();
2859
2860         ret = si_domain_init(hw_pass_through);
2861         if (ret)
2862                 goto free_iommu;
2863
2864         /*
2865          * for each drhd
2866          *   enable fault log
2867          *   global invalidate context cache
2868          *   global invalidate iotlb
2869          *   enable translation
2870          */
2871         for_each_iommu(iommu, drhd) {
2872                 if (drhd->ignored) {
2873                         /*
2874                          * we always have to disable PMRs or DMA may fail on
2875                          * this device
2876                          */
2877                         if (force_on)
2878                                 iommu_disable_protect_mem_regions(iommu);
2879                         continue;
2880                 }
2881
2882                 iommu_flush_write_buffer(iommu);
2883
2884 #ifdef CONFIG_INTEL_IOMMU_SVM
2885                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2886                         /*
2887                          * Calling dmar_alloc_hwirq() with dmar_global_lock
2888                          * held could cause a lock race condition.
2889                          */
2890                         up_write(&dmar_global_lock);
2891                         ret = intel_svm_enable_prq(iommu);
2892                         down_write(&dmar_global_lock);
2893                         if (ret)
2894                                 goto free_iommu;
2895                 }
2896 #endif
2897                 ret = dmar_set_interrupt(iommu);
2898                 if (ret)
2899                         goto free_iommu;
2900         }
2901
2902         return 0;
2903
2904 free_iommu:
2905         for_each_active_iommu(iommu, drhd) {
2906                 disable_dmar_iommu(iommu);
2907                 free_dmar_iommu(iommu);
2908         }
2909         if (si_domain) {
2910                 domain_exit(si_domain);
2911                 si_domain = NULL;
2912         }
2913
2914         return ret;
2915 }
2916
2917 static void __init init_no_remapping_devices(void)
2918 {
2919         struct dmar_drhd_unit *drhd;
2920         struct device *dev;
2921         int i;
2922
2923         for_each_drhd_unit(drhd) {
2924                 if (!drhd->include_all) {
2925                         for_each_active_dev_scope(drhd->devices,
2926                                                   drhd->devices_cnt, i, dev)
2927                                 break;
2928                         /* ignore DMAR unit if no devices exist */
2929                         if (i == drhd->devices_cnt)
2930                                 drhd->ignored = 1;
2931                 }
2932         }
2933
2934         for_each_active_drhd_unit(drhd) {
2935                 if (drhd->include_all)
2936                         continue;
2937
2938                 for_each_active_dev_scope(drhd->devices,
2939                                           drhd->devices_cnt, i, dev)
2940                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2941                                 break;
2942                 if (i < drhd->devices_cnt)
2943                         continue;
2944
2945                 /* This IOMMU has *only* gfx devices. Mark it as gfx-dedicated
2946                    and bypass it entirely if gfx mapping is disabled. */
2947                 drhd->gfx_dedicated = 1;
2948                 if (!dmar_map_gfx)
2949                         drhd->ignored = 1;
2950         }
2951 }
2952
2953 #ifdef CONFIG_SUSPEND
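     /*
      * Re-initialize the IOMMU hardware on resume: re-enable queued
      * invalidation where it was in use, then reprogram the root entry,
      * re-enable translation and disable protected memory regions on every
      * unit that is not ignored.
      */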
2954 static int init_iommu_hw(void)
2955 {
2956         struct dmar_drhd_unit *drhd;
2957         struct intel_iommu *iommu = NULL;
2958         int ret;
2959
2960         for_each_active_iommu(iommu, drhd) {
2961                 if (iommu->qi) {
2962                         ret = dmar_reenable_qi(iommu);
2963                         if (ret)
2964                                 return ret;
2965                 }
2966         }
2967
2968         for_each_iommu(iommu, drhd) {
2969                 if (drhd->ignored) {
2970                         /*
2971                          * we always have to disable PMRs or DMA may fail on
2972                          * this device
2973                          */
2974                         if (force_on)
2975                                 iommu_disable_protect_mem_regions(iommu);
2976                         continue;
2977                 }
2978
2979                 iommu_flush_write_buffer(iommu);
2980                 iommu_set_root_entry(iommu);
2981                 iommu_enable_translation(iommu);
2982                 iommu_disable_protect_mem_regions(iommu);
2983         }
2984
2985         return 0;
2986 }
2987
2988 static void iommu_flush_all(void)
2989 {
2990         struct dmar_drhd_unit *drhd;
2991         struct intel_iommu *iommu;
2992
2993         for_each_active_iommu(iommu, drhd) {
2994                 iommu->flush.flush_context(iommu, 0, 0, 0,
2995                                            DMA_CCMD_GLOBAL_INVL);
2996                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2997                                          DMA_TLB_GLOBAL_FLUSH);
2998         }
2999 }
3000
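     /*
      * Save the fault-event registers and disable translation on all active
      * IOMMUs so that the hardware state can be restored by iommu_resume()
      * after a system suspend.
      */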
3001 static int iommu_suspend(void)
3002 {
3003         struct dmar_drhd_unit *drhd;
3004         struct intel_iommu *iommu = NULL;
3005         unsigned long flag;
3006
3007         for_each_active_iommu(iommu, drhd) {
3008                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3009                                              GFP_KERNEL);
3010                 if (!iommu->iommu_state)
3011                         goto nomem;
3012         }
3013
3014         iommu_flush_all();
3015
3016         for_each_active_iommu(iommu, drhd) {
3017                 iommu_disable_translation(iommu);
3018
3019                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3020
3021                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3022                         readl(iommu->reg + DMAR_FECTL_REG);
3023                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3024                         readl(iommu->reg + DMAR_FEDATA_REG);
3025                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3026                         readl(iommu->reg + DMAR_FEADDR_REG);
3027                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3028                         readl(iommu->reg + DMAR_FEUADDR_REG);
3029
3030                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3031         }
3032         return 0;
3033
3034 nomem:
3035         for_each_active_iommu(iommu, drhd)
3036                 kfree(iommu->iommu_state);
3037
3038         return -ENOMEM;
3039 }
3040
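     /*
      * Restore the IOMMU hardware after suspend: re-initialize each unit
      * through init_iommu_hw(), write back the saved fault-event registers
      * and free the saved state.
      */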
3041 static void iommu_resume(void)
3042 {
3043         struct dmar_drhd_unit *drhd;
3044         struct intel_iommu *iommu = NULL;
3045         unsigned long flag;
3046
3047         if (init_iommu_hw()) {
3048                 if (force_on)
3049                         panic("tboot: IOMMU setup failed, DMAR cannot resume!\n");
3050                 else
3051                         WARN(1, "IOMMU setup failed, DMAR cannot resume!\n");
3052                 return;
3053         }
3054
3055         for_each_active_iommu(iommu, drhd) {
3056
3057                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3058
3059                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3060                         iommu->reg + DMAR_FECTL_REG);
3061                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3062                         iommu->reg + DMAR_FEDATA_REG);
3063                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3064                         iommu->reg + DMAR_FEADDR_REG);
3065                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3066                         iommu->reg + DMAR_FEUADDR_REG);
3067
3068                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3069         }
3070
3071         for_each_active_iommu(iommu, drhd)
3072                 kfree(iommu->iommu_state);
3073 }
3074
3075 static struct syscore_ops iommu_syscore_ops = {
3076         .resume         = iommu_resume,
3077         .suspend        = iommu_suspend,
3078 };
3079
3080 static void __init init_iommu_pm_ops(void)
3081 {
3082         register_syscore_ops(&iommu_syscore_ops);
3083 }
3084
3085 #else
3086 static inline void init_iommu_pm_ops(void) {}
3087 #endif  /* CONFIG_SUSPEND */
3088
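     /*
      * Reject RMRRs that are not page aligned, describe an empty or
      * inverted range, or fail the arch-specific sanity check.
      */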
3089 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3090 {
3091         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3092             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3093             rmrr->end_address <= rmrr->base_address ||
3094             arch_rmrr_sanity_check(rmrr))
3095                 return -EINVAL;
3096
3097         return 0;
3098 }
3099
3100 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3101 {
3102         struct acpi_dmar_reserved_memory *rmrr;
3103         struct dmar_rmrr_unit *rmrru;
3104
3105         rmrr = (struct acpi_dmar_reserved_memory *)header;
3106         if (rmrr_sanity_check(rmrr)) {
3107                 pr_warn(FW_BUG
3108                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3109                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3110                            rmrr->base_address, rmrr->end_address,
3111                            dmi_get_system_info(DMI_BIOS_VENDOR),
3112                            dmi_get_system_info(DMI_BIOS_VERSION),
3113                            dmi_get_system_info(DMI_PRODUCT_VERSION));
3114                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3115         }
3116
3117         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3118         if (!rmrru)
3119                 goto out;
3120
3121         rmrru->hdr = header;
3122
3123         rmrru->base_address = rmrr->base_address;
3124         rmrru->end_address = rmrr->end_address;
3125
3126         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3127                                 ((void *)rmrr) + rmrr->header.length,
3128                                 &rmrru->devices_cnt);
3129         if (rmrru->devices_cnt && rmrru->devices == NULL)
3130                 goto free_rmrru;
3131
3132         list_add(&rmrru->list, &dmar_rmrr_units);
3133
3134         return 0;
3135 free_rmrru:
3136         kfree(rmrru);
3137 out:
3138         return -ENOMEM;
3139 }
3140
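     /*
      * Look up an already-registered ATSR unit that matches the given ACPI
      * ATSR structure by segment, length and content.
      */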
3141 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3142 {
3143         struct dmar_atsr_unit *atsru;
3144         struct acpi_dmar_atsr *tmp;
3145
3146         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3147                                 dmar_rcu_check()) {
3148                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3149                 if (atsr->segment != tmp->segment)
3150                         continue;
3151                 if (atsr->header.length != tmp->header.length)
3152                         continue;
3153                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3154                         return atsru;
3155         }
3156
3157         return NULL;
3158 }
3159
3160 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3161 {
3162         struct acpi_dmar_atsr *atsr;
3163         struct dmar_atsr_unit *atsru;
3164
3165         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3166                 return 0;
3167
3168         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3169         atsru = dmar_find_atsr(atsr);
3170         if (atsru)
3171                 return 0;
3172
3173         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3174         if (!atsru)
3175                 return -ENOMEM;
3176
3177         /*
3178          * The header may have been allocated from the slab by an ACPI _DSM
3179          * method and will be freed on return, so copy its contents into the
3180          * new unit now.
3181          */
3182         atsru->hdr = (void *)(atsru + 1);
3183         memcpy(atsru->hdr, hdr, hdr->length);
3184         atsru->include_all = atsr->flags & 0x1;
3185         if (!atsru->include_all) {
3186                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3187                                 (void *)atsr + atsr->header.length,
3188                                 &atsru->devices_cnt);
3189                 if (atsru->devices_cnt && atsru->devices == NULL) {
3190                         kfree(atsru);
3191                         return -ENOMEM;
3192                 }
3193         }
3194
3195         list_add_rcu(&atsru->list, &dmar_atsr_units);
3196
3197         return 0;
3198 }
3199
3200 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3201 {
3202         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3203         kfree(atsru);
3204 }
3205
3206 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3207 {
3208         struct acpi_dmar_atsr *atsr;
3209         struct dmar_atsr_unit *atsru;
3210
3211         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3212         atsru = dmar_find_atsr(atsr);
3213         if (atsru) {
3214                 list_del_rcu(&atsru->list);
3215                 synchronize_rcu();
3216                 intel_iommu_free_atsr(atsru);
3217         }
3218
3219         return 0;
3220 }
3221
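     /*
      * Check whether an ATSR unit is still in use: return -EBUSY if any
      * device in its scope is active, 0 otherwise.
      */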
3222 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3223 {
3224         int i;
3225         struct device *dev;
3226         struct acpi_dmar_atsr *atsr;
3227         struct dmar_atsr_unit *atsru;
3228
3229         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3230         atsru = dmar_find_atsr(atsr);
3231         if (!atsru)
3232                 return 0;
3233
3234         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3235                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3236                                           i, dev)
3237                         return -EBUSY;
3238         }
3239
3240         return 0;
3241 }
3242
3243 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3244 {
3245         struct dmar_satc_unit *satcu;
3246         struct acpi_dmar_satc *tmp;
3247
3248         list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3249                                 dmar_rcu_check()) {
3250                 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3251                 if (satc->segment != tmp->segment)
3252                         continue;
3253                 if (satc->header.length != tmp->header.length)
3254                         continue;
3255                 if (memcmp(satc, tmp, satc->header.length) == 0)
3256                         return satcu;
3257         }
3258
3259         return NULL;
3260 }
3261
3262 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3263 {
3264         struct acpi_dmar_satc *satc;
3265         struct dmar_satc_unit *satcu;
3266
3267         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3268                 return 0;
3269
3270         satc = container_of(hdr, struct acpi_dmar_satc, header);
3271         satcu = dmar_find_satc(satc);
3272         if (satcu)
3273                 return 0;
3274
3275         satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3276         if (!satcu)
3277                 return -ENOMEM;
3278
3279         satcu->hdr = (void *)(satcu + 1);
3280         memcpy(satcu->hdr, hdr, hdr->length);
3281         satcu->atc_required = satc->flags & 0x1;
3282         satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3283                                               (void *)satc + satc->header.length,
3284                                               &satcu->devices_cnt);
3285         if (satcu->devices_cnt && !satcu->devices) {
3286                 kfree(satcu);
3287                 return -ENOMEM;
3288         }
3289         list_add_rcu(&satcu->list, &dmar_satc_units);
3290
3291         return 0;
3292 }
3293
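     /*
      * Bring up a hot-added DMAR unit: audit its capabilities, allocate
      * domain IDs and a root entry, enable queued invalidation, the page
      * request queue and the fault interrupt, and finally turn translation
      * on unless the unit is ignored.
      */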
3294 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3295 {
3296         int sp, ret;
3297         struct intel_iommu *iommu = dmaru->iommu;
3298
3299         ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3300         if (ret)
3301                 goto out;
3302
3303         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3304                 pr_warn("%s: Doesn't support hardware pass through.\n",
3305                         iommu->name);
3306                 return -ENXIO;
3307         }
3308
3309         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3310         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3311                 pr_warn("%s: Doesn't support large page.\n",
3312                         iommu->name);
3313                 return -ENXIO;
3314         }
3315
3316         /*
3317          * Disable translation if already enabled prior to OS handover.
3318          */
3319         if (iommu->gcmd & DMA_GCMD_TE)
3320                 iommu_disable_translation(iommu);
3321
3322         ret = iommu_init_domains(iommu);
3323         if (ret == 0)
3324                 ret = iommu_alloc_root_entry(iommu);
3325         if (ret)
3326                 goto out;
3327
3328         intel_svm_check(iommu);
3329
3330         if (dmaru->ignored) {
3331                 /*
3332                  * we always have to disable PMRs or DMA may fail on this device
3333                  */
3334                 if (force_on)
3335                         iommu_disable_protect_mem_regions(iommu);
3336                 return 0;
3337         }
3338
3339         intel_iommu_init_qi(iommu);
3340         iommu_flush_write_buffer(iommu);
3341
3342 #ifdef CONFIG_INTEL_IOMMU_SVM
3343         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3344                 ret = intel_svm_enable_prq(iommu);
3345                 if (ret)
3346                         goto disable_iommu;
3347         }
3348 #endif
3349         ret = dmar_set_interrupt(iommu);
3350         if (ret)
3351                 goto disable_iommu;
3352
3353         iommu_set_root_entry(iommu);
3354         iommu_enable_translation(iommu);
3355
3356         iommu_disable_protect_mem_regions(iommu);
3357         return 0;
3358
3359 disable_iommu:
3360         disable_dmar_iommu(iommu);
3361 out:
3362         free_dmar_iommu(iommu);
3363         return ret;
3364 }
3365
3366 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3367 {
3368         int ret = 0;
3369         struct intel_iommu *iommu = dmaru->iommu;
3370
3371         if (!intel_iommu_enabled)
3372                 return 0;
3373         if (iommu == NULL)
3374                 return -EINVAL;
3375
3376         if (insert) {
3377                 ret = intel_iommu_add(dmaru);
3378         } else {
3379                 disable_dmar_iommu(iommu);
3380                 free_dmar_iommu(iommu);
3381         }
3382
3383         return ret;
3384 }
3385
3386 static void intel_iommu_free_dmars(void)
3387 {
3388         struct dmar_rmrr_unit *rmrru, *rmrr_n;
3389         struct dmar_atsr_unit *atsru, *atsr_n;
3390         struct dmar_satc_unit *satcu, *satc_n;
3391
3392         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3393                 list_del(&rmrru->list);
3394                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3395                 kfree(rmrru);
3396         }
3397
3398         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3399                 list_del(&atsru->list);
3400                 intel_iommu_free_atsr(atsru);
3401         }
3402         list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3403                 list_del(&satcu->list);
3404                 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3405                 kfree(satcu);
3406         }
3407 }
3408
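     /*
      * Find the SATC unit whose device scope contains the given PCI device
      * (or its physical function), if any.
      */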
3409 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3410 {
3411         struct dmar_satc_unit *satcu;
3412         struct acpi_dmar_satc *satc;
3413         struct device *tmp;
3414         int i;
3415
3416         dev = pci_physfn(dev);
3417         rcu_read_lock();
3418
3419         list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3420                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3421                 if (satc->segment != pci_domain_nr(dev->bus))
3422                         continue;
3423                 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3424                         if (to_pci_dev(tmp) == dev)
3425                                 goto out;
3426         }
3427         satcu = NULL;
3428 out:
3429         rcu_read_unlock();
3430         return satcu;
3431 }
3432
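     /*
      * Decide whether the OS may enable ATS for a PCI device: consult the
      * SATC table first, otherwise walk up to the root port and check
      * whether it is covered by an ATSR unit.
      */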
3433 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3434 {
3435         int i, ret = 1;
3436         struct pci_bus *bus;
3437         struct pci_dev *bridge = NULL;
3438         struct device *tmp;
3439         struct acpi_dmar_atsr *atsr;
3440         struct dmar_atsr_unit *atsru;
3441         struct dmar_satc_unit *satcu;
3442
3443         dev = pci_physfn(dev);
3444         satcu = dmar_find_matched_satc_unit(dev);
3445         if (satcu)
3446                 /*
3447                  * This device supports ATS because it appears in the SATC
3448                  * table. When the IOMMU is in legacy mode, the hardware
3449                  * enables ATS automatically for devices that require it,
3450                  * so the OS must not enable ATS here as well; doing so
3451                  * would duplicate the TLB invalidations.
3452                  */
3453                 return !(satcu->atc_required && !sm_supported(iommu));
3454
3455         for (bus = dev->bus; bus; bus = bus->parent) {
3456                 bridge = bus->self;
3457                 /* If it's an integrated device, allow ATS */
3458                 if (!bridge)
3459                         return 1;
3460                 /* Connected via non-PCIe: no ATS */
3461                 if (!pci_is_pcie(bridge) ||
3462                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3463                         return 0;
3464                 /* If we found the root port, look it up in the ATSR */
3465                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3466                         break;
3467         }
3468
3469         rcu_read_lock();
3470         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3471                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3472                 if (atsr->segment != pci_domain_nr(dev->bus))
3473                         continue;
3474
3475                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3476                         if (tmp == &bridge->dev)
3477                                 goto out;
3478
3479                 if (atsru->include_all)
3480                         goto out;
3481         }
3482         ret = 0;
3483 out:
3484         rcu_read_unlock();
3485
3486         return ret;
3487 }
3488
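     /*
      * PCI bus notifier callback for DMAR device scopes: add the notified
      * device to, or remove it from, the matching RMRR, ATSR and SATC
      * device lists.
      */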
3489 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3490 {
3491         int ret;
3492         struct dmar_rmrr_unit *rmrru;
3493         struct dmar_atsr_unit *atsru;
3494         struct dmar_satc_unit *satcu;
3495         struct acpi_dmar_atsr *atsr;
3496         struct acpi_dmar_reserved_memory *rmrr;
3497         struct acpi_dmar_satc *satc;
3498
3499         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3500                 return 0;
3501
3502         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3503                 rmrr = container_of(rmrru->hdr,
3504                                     struct acpi_dmar_reserved_memory, header);
3505                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3506                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3507                                 ((void *)rmrr) + rmrr->header.length,
3508                                 rmrr->segment, rmrru->devices,
3509                                 rmrru->devices_cnt);
3510                         if (ret < 0)
3511                                 return ret;
3512                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3513                         dmar_remove_dev_scope(info, rmrr->segment,
3514                                 rmrru->devices, rmrru->devices_cnt);
3515                 }
3516         }
3517
3518         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3519                 if (atsru->include_all)
3520                         continue;
3521
3522                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3523                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3524                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3525                                         (void *)atsr + atsr->header.length,
3526                                         atsr->segment, atsru->devices,
3527                                         atsru->devices_cnt);
3528                         if (ret > 0)
3529                                 break;
3530                         else if (ret < 0)
3531                                 return ret;
3532                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3533                         if (dmar_remove_dev_scope(info, atsr->segment,
3534                                         atsru->devices, atsru->devices_cnt))
3535                                 break;
3536                 }
3537         }
3538         list_for_each_entry(satcu, &dmar_satc_units, list) {
3539                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3540                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3541                         ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3542                                         (void *)satc + satc->header.length,
3543                                         satc->segment, satcu->devices,
3544                                         satcu->devices_cnt);
3545                         if (ret > 0)
3546                                 break;
3547                         else if (ret < 0)
3548                                 return ret;
3549                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3550                         if (dmar_remove_dev_scope(info, satc->segment,
3551                                         satcu->devices, satcu->devices_cnt))
3552                                 break;
3553                 }
3554         }
3555
3556         return 0;
3557 }
3558
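     /*
      * Memory hotplug notifier: keep the identity-mapped si_domain in sync
      * by mapping memory that is about to come online and unmapping it
      * again when it goes offline.
      */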
3559 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3560                                        unsigned long val, void *v)
3561 {
3562         struct memory_notify *mhp = v;
3563         unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3564         unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3565                         mhp->nr_pages - 1);
3566
3567         switch (val) {
3568         case MEM_GOING_ONLINE:
3569                 if (iommu_domain_identity_map(si_domain,
3570                                               start_vpfn, last_vpfn)) {
3571                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
3572                                 start_vpfn, last_vpfn);
3573                         return NOTIFY_BAD;
3574                 }
3575                 break;
3576
3577         case MEM_OFFLINE:
3578         case MEM_CANCEL_ONLINE:
3579                 {
3580                         struct dmar_drhd_unit *drhd;
3581                         struct intel_iommu *iommu;
3582                         LIST_HEAD(freelist);
3583
3584                         domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3585
3586                         rcu_read_lock();
3587                         for_each_active_iommu(iommu, drhd)
3588                                 iommu_flush_iotlb_psi(iommu, si_domain,
3589                                         start_vpfn, mhp->nr_pages,
3590                                         list_empty(&freelist), 0);
3591                         rcu_read_unlock();
3592                         put_pages_list(&freelist);
3593                 }
3594                 break;
3595         }
3596
3597         return NOTIFY_OK;
3598 }
3599
3600 static struct notifier_block intel_iommu_memory_nb = {
3601         .notifier_call = intel_iommu_memory_notifier,
3602         .priority = 0
3603 };
3604
3605 static void intel_disable_iommus(void)
3606 {
3607         struct intel_iommu *iommu = NULL;
3608         struct dmar_drhd_unit *drhd;
3609
3610         for_each_iommu(iommu, drhd)
3611                 iommu_disable_translation(iommu);
3612 }
3613
3614 void intel_iommu_shutdown(void)
3615 {
3616         struct dmar_drhd_unit *drhd;
3617         struct intel_iommu *iommu = NULL;
3618
3619         if (no_iommu || dmar_disabled)
3620                 return;
3621
3622         down_write(&dmar_global_lock);
3623
3624         /* Disable PMRs explicitly here. */
3625         for_each_iommu(iommu, drhd)
3626                 iommu_disable_protect_mem_regions(iommu);
3627
3628         /* Make sure the IOMMUs are switched off */
3629         intel_disable_iommus();
3630
3631         up_write(&dmar_global_lock);
3632 }
3633
3634 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3635 {
3636         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3637
3638         return container_of(iommu_dev, struct intel_iommu, iommu);
3639 }
3640
3641 static ssize_t version_show(struct device *dev,
3642                             struct device_attribute *attr, char *buf)
3643 {
3644         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3645         u32 ver = readl(iommu->reg + DMAR_VER_REG);
3646         return sysfs_emit(buf, "%d:%d\n",
3647                           DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3648 }
3649 static DEVICE_ATTR_RO(version);
3650
3651 static ssize_t address_show(struct device *dev,
3652                             struct device_attribute *attr, char *buf)
3653 {
3654         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3655         return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3656 }
3657 static DEVICE_ATTR_RO(address);
3658
3659 static ssize_t cap_show(struct device *dev,
3660                         struct device_attribute *attr, char *buf)
3661 {
3662         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3663         return sysfs_emit(buf, "%llx\n", iommu->cap);
3664 }
3665 static DEVICE_ATTR_RO(cap);
3666
3667 static ssize_t ecap_show(struct device *dev,
3668                          struct device_attribute *attr, char *buf)
3669 {
3670         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3671         return sysfs_emit(buf, "%llx\n", iommu->ecap);
3672 }
3673 static DEVICE_ATTR_RO(ecap);
3674
3675 static ssize_t domains_supported_show(struct device *dev,
3676                                       struct device_attribute *attr, char *buf)
3677 {
3678         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3679         return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3680 }
3681 static DEVICE_ATTR_RO(domains_supported);
3682
3683 static ssize_t domains_used_show(struct device *dev,
3684                                  struct device_attribute *attr, char *buf)
3685 {
3686         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3687         return sysfs_emit(buf, "%d\n",
3688                           bitmap_weight(iommu->domain_ids,
3689                                         cap_ndoms(iommu->cap)));
3690 }
3691 static DEVICE_ATTR_RO(domains_used);
3692
3693 static struct attribute *intel_iommu_attrs[] = {
3694         &dev_attr_version.attr,
3695         &dev_attr_address.attr,
3696         &dev_attr_cap.attr,
3697         &dev_attr_ecap.attr,
3698         &dev_attr_domains_supported.attr,
3699         &dev_attr_domains_used.attr,
3700         NULL,
3701 };
3702
3703 static struct attribute_group intel_iommu_group = {
3704         .name = "intel-iommu",
3705         .attrs = intel_iommu_attrs,
3706 };
3707
3708 const struct attribute_group *intel_iommu_groups[] = {
3709         &intel_iommu_group,
3710         NULL,
3711 };
3712
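     /* Return true if any PCI device is marked as external-facing. */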
3713 static inline bool has_external_pci(void)
3714 {
3715         struct pci_dev *pdev = NULL;
3716
3717         for_each_pci_dev(pdev)
3718                 if (pdev->external_facing) {
3719                         pci_dev_put(pdev);
3720                         return true;
3721                 }
3722
3723         return false;
3724 }
3725
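     /*
      * Honor the DMAR platform opt-in: force the IOMMU on when the platform
      * opted in and an external-facing PCI device is present, even if the
      * IOMMU was disabled on the command line.
      */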
3726 static int __init platform_optin_force_iommu(void)
3727 {
3728         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3729                 return 0;
3730
3731         if (no_iommu || dmar_disabled)
3732                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3733
3734         /*
3735          * If Intel-IOMMU is disabled by default, apply the identity map
3736          * to all devices except those marked as untrusted.
3737          */
3738         if (dmar_disabled)
3739                 iommu_set_default_passthrough(false);
3740
3741         dmar_disabled = 0;
3742         no_iommu = 0;
3743
3744         return 1;
3745 }
3746
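     /*
      * Walk the ACPI namespace devices listed in the DRHD device scopes and
      * probe their physical-node companions that do not yet belong to an
      * IOMMU group.
      */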
3747 static int __init probe_acpi_namespace_devices(void)
3748 {
3749         struct dmar_drhd_unit *drhd;
3750         /* To avoid a -Wunused-but-set-variable warning. */
3751         struct intel_iommu *iommu __maybe_unused;
3752         struct device *dev;
3753         int i, ret = 0;
3754
3755         for_each_active_iommu(iommu, drhd) {
3756                 for_each_active_dev_scope(drhd->devices,
3757                                           drhd->devices_cnt, i, dev) {
3758                         struct acpi_device_physical_node *pn;
3759                         struct iommu_group *group;
3760                         struct acpi_device *adev;
3761
3762                         if (dev->bus != &acpi_bus_type)
3763                                 continue;
3764
3765                         adev = to_acpi_device(dev);
3766                         mutex_lock(&adev->physical_node_lock);
3767                         list_for_each_entry(pn,
3768                                             &adev->physical_node_list, node) {
3769                                 group = iommu_group_get(pn->dev);
3770                                 if (group) {
3771                                         iommu_group_put(group);
3772                                         continue;
3773                                 }
3774
3775                                 ret = iommu_probe_device(pn->dev);
3776                                 if (ret)
3777                                         break;
3778                         }
3779                         mutex_unlock(&adev->physical_node_lock);
3780
3781                         if (ret)
3782                                 return ret;
3783                 }
3784         }
3785
3786         return 0;
3787 }
3788
3789 static __init int tboot_force_iommu(void)
3790 {
3791         if (!tboot_enabled())
3792                 return 0;
3793
3794         if (no_iommu || dmar_disabled)
3795                 pr_warn("Forcing Intel-IOMMU to enabled\n");
3796
3797         dmar_disabled = 0;
3798         no_iommu = 0;
3799
3800         return 1;
3801 }
3802
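     /*
      * Main entry point for VT-d initialization: parse the DMAR table and
      * device scopes, set up the remapping hardware via init_dmars(),
      * register each IOMMU with the IOMMU core and sysfs, and finally
      * enable translation on units where firmware had not already done so.
      */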
3803 int __init intel_iommu_init(void)
3804 {
3805         int ret = -ENODEV;
3806         struct dmar_drhd_unit *drhd;
3807         struct intel_iommu *iommu;
3808
3809         /*
3810          * Intel IOMMU is required for a TXT/tboot launch or platform
3811          * opt in, so enforce that.
3812          */
3813         force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3814                     platform_optin_force_iommu();
3815
3816         down_write(&dmar_global_lock);
3817         if (dmar_table_init()) {
3818                 if (force_on)
3819                         panic("tboot: Failed to initialize DMAR table\n");
3820                 goto out_free_dmar;
3821         }
3822
3823         if (dmar_dev_scope_init() < 0) {
3824                 if (force_on)
3825                         panic("tboot: Failed to initialize DMAR device scope\n");
3826                 goto out_free_dmar;
3827         }
3828
3829         up_write(&dmar_global_lock);
3830
3831         /*
3832          * The bus notifier takes the dmar_global_lock, so lockdep will
3833          * complain later when we register it under the lock.
3834          */
3835         dmar_register_bus_notifier();
3836
3837         down_write(&dmar_global_lock);
3838
3839         if (!no_iommu)
3840                 intel_iommu_debugfs_init();
3841
3842         if (no_iommu || dmar_disabled) {
3843                 /*
3844                  * We exit the function here to ensure IOMMU's remapping and
3845                  * mempool aren't setup, which means that the IOMMU's PMRs
3846                  * won't be disabled via the call to init_dmars(). So disable
3847                  * it explicitly here. The PMRs were setup by tboot prior to
3848                  * calling SENTER, but the kernel is expected to reset/tear
3849                  * down the PMRs.
3850                  */
3851                 if (intel_iommu_tboot_noforce) {
3852                         for_each_iommu(iommu, drhd)
3853                                 iommu_disable_protect_mem_regions(iommu);
3854                 }
3855
3856                 /*
3857                  * Make sure the IOMMUs are switched off, even when we
3858                  * boot into a kexec kernel and the previous kernel left
3859                  * them enabled
3860                  */
3861                 intel_disable_iommus();
3862                 goto out_free_dmar;
3863         }
3864
3865         if (list_empty(&dmar_rmrr_units))
3866                 pr_info("No RMRR found\n");
3867
3868         if (list_empty(&dmar_atsr_units))
3869                 pr_info("No ATSR found\n");
3870
3871         if (list_empty(&dmar_satc_units))
3872                 pr_info("No SATC found\n");
3873
3874         init_no_remapping_devices();
3875
3876         ret = init_dmars();
3877         if (ret) {
3878                 if (force_on)
3879                         panic("tboot: Failed to initialize DMARs\n");
3880                 pr_err("Initialization failed\n");
3881                 goto out_free_dmar;
3882         }
3883         up_write(&dmar_global_lock);
3884
3885         init_iommu_pm_ops();
3886
3887         down_read(&dmar_global_lock);
3888         for_each_active_iommu(iommu, drhd) {
3889                 /*
3890                  * The flush queue implementation does not perform
3891                  * page-selective invalidations that are required for efficient
3892                  * TLB flushes in virtual environments.  The benefit of batching
3893                  * is likely to be much lower than the overhead of synchronizing
3894                  * the virtual and physical IOMMU page-tables.
3895                  */
3896                 if (cap_caching_mode(iommu->cap) &&
3897                     !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3898                         pr_info_once("IOMMU batching disallowed due to virtualization\n");
3899                         iommu_set_dma_strict();
3900                 }
3901                 iommu_device_sysfs_add(&iommu->iommu, NULL,
3902                                        intel_iommu_groups,
3903                                        "%s", iommu->name);
3904                 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3905
3906                 iommu_pmu_register(iommu);
3907         }
3908         up_read(&dmar_global_lock);
3909
3910         if (si_domain && !hw_pass_through)
3911                 register_memory_notifier(&intel_iommu_memory_nb);
3912
3913         down_read(&dmar_global_lock);
3914         if (probe_acpi_namespace_devices())
3915                 pr_warn("ACPI name space devices didn't probe correctly\n");
3916
3917         /* Finally, we enable the DMA remapping hardware. */
3918         for_each_iommu(iommu, drhd) {
3919                 if (!drhd->ignored && !translation_pre_enabled(iommu))
3920                         iommu_enable_translation(iommu);
3921
3922                 iommu_disable_protect_mem_regions(iommu);
3923         }
3924         up_read(&dmar_global_lock);
3925
3926         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3927
3928         intel_iommu_enabled = 1;
3929
3930         return 0;
3931
3932 out_free_dmar:
3933         intel_iommu_free_dmars();
3934         up_write(&dmar_global_lock);
3935         return ret;
3936 }
3937
3938 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3939 {
3940         struct device_domain_info *info = opaque;
3941
3942         domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3943         return 0;
3944 }
3945
3946 /*
3947  * NB - intel-iommu lacks any sort of reference counting for the users of
3948  * dependent devices.  If multiple endpoints have intersecting dependent
3949  * devices, unbinding the driver from any one of them will possibly leave
3950  * the others unable to operate.
3951  */
3952 static void domain_context_clear(struct device_domain_info *info)
3953 {
3954         if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
3955                 return;
3956
3957         pci_for_each_dma_alias(to_pci_dev(info->dev),
3958                                &domain_context_clear_one_cb, info);
3959 }
3960
3961 static void dmar_remove_one_dev_info(struct device *dev)
3962 {
3963         struct device_domain_info *info = dev_iommu_priv_get(dev);
3964         struct dmar_domain *domain = info->domain;
3965         struct intel_iommu *iommu = info->iommu;
3966         unsigned long flags;
3967
3968         if (!dev_is_real_dma_subdevice(info->dev)) {
3969                 if (dev_is_pci(info->dev) && sm_supported(iommu))
3970                         intel_pasid_tear_down_entry(iommu, info->dev,
3971                                         PASID_RID2PASID, false);
3972
3973                 iommu_disable_pci_caps(info);
3974                 domain_context_clear(info);
3975         }
3976
3977         spin_lock_irqsave(&domain->lock, flags);
3978         list_del(&info->link);
3979         spin_unlock_irqrestore(&domain->lock, flags);
3980
3981         domain_detach_iommu(domain, iommu);
3982         info->domain = NULL;
3983 }
3984
3985 /*
3986  * Clear the page table pointer in context or pasid table entries so that
3987  * all DMA requests without PASID from the device are blocked. If the page
3988  * table has been set, clean up the data structures.
3989  */
3990 static void device_block_translation(struct device *dev)
3991 {
3992         struct device_domain_info *info = dev_iommu_priv_get(dev);
3993         struct intel_iommu *iommu = info->iommu;
3994         unsigned long flags;
3995
3996         iommu_disable_pci_caps(info);
3997         if (!dev_is_real_dma_subdevice(dev)) {
3998                 if (sm_supported(iommu))
3999                         intel_pasid_tear_down_entry(iommu, dev,
4000                                                     PASID_RID2PASID, false);
4001                 else
4002                         domain_context_clear(info);
4003         }
4004
4005         if (!info->domain)
4006                 return;
4007
4008         spin_lock_irqsave(&info->domain->lock, flags);
4009         list_del(&info->link);
4010         spin_unlock_irqrestore(&info->domain->lock, flags);
4011
4012         domain_detach_iommu(info->domain, iommu);
4013         info->domain = NULL;
4014 }
4015
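     /*
      * Initialize the address width and allocate the top-level page table
      * for a domain created through the IOMMU core.
      */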
4016 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4017 {
4018         int adjust_width;
4019
4020         /* calculate AGAW */
4021         domain->gaw = guest_width;
4022         adjust_width = guestwidth_to_adjustwidth(guest_width);
4023         domain->agaw = width_to_agaw(adjust_width);
4024
4025         domain->iommu_coherency = false;
4026         domain->iommu_superpage = 0;
4027         domain->max_addr = 0;
4028
4029         /* always allocate the top pgd */
4030         domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4031         if (!domain->pgd)
4032                 return -ENOMEM;
4033         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4034         return 0;
4035 }
4036
4037 static int blocking_domain_attach_dev(struct iommu_domain *domain,
4038                                       struct device *dev)
4039 {
4040         device_block_translation(dev);
4041         return 0;
4042 }
4043
4044 static struct iommu_domain blocking_domain = {
4045         .ops = &(const struct iommu_domain_ops) {
4046                 .attach_dev     = blocking_domain_attach_dev,
4047                 .free           = intel_iommu_domain_free
4048         }
4049 };
4050
4051 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4052 {
4053         struct dmar_domain *dmar_domain;
4054         struct iommu_domain *domain;
4055
4056         switch (type) {
4057         case IOMMU_DOMAIN_BLOCKED:
4058                 return &blocking_domain;
4059         case IOMMU_DOMAIN_DMA:
4060         case IOMMU_DOMAIN_DMA_FQ:
4061         case IOMMU_DOMAIN_UNMANAGED:
4062                 dmar_domain = alloc_domain(type);
4063                 if (!dmar_domain) {
4064                         pr_err("Can't allocate dmar_domain\n");
4065                         return NULL;
4066                 }
4067                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4068                         pr_err("Domain initialization failed\n");
4069                         domain_exit(dmar_domain);
4070                         return NULL;
4071                 }
4072
4073                 domain = &dmar_domain->domain;
4074                 domain->geometry.aperture_start = 0;
4075                 domain->geometry.aperture_end   =
4076                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4077                 domain->geometry.force_aperture = true;
4078
4079                 return domain;
4080         case IOMMU_DOMAIN_IDENTITY:
4081                 return &si_domain->domain;
4082         case IOMMU_DOMAIN_SVA:
4083                 return intel_svm_domain_alloc();
4084         default:
4085                 return NULL;
4086         }
4087
4088         return NULL;
4089 }
4090
4091 static void intel_iommu_domain_free(struct iommu_domain *domain)
4092 {
4093         if (domain != &si_domain->domain && domain != &blocking_domain)
4094                 domain_exit(to_dmar_domain(domain));
4095 }
4096
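     /*
      * Validate that the IOMMU serving the device can handle the domain's
      * address width and snooping requirements, trimming extra page-table
      * levels when the IOMMU supports a smaller AGAW.
      */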
4097 static int prepare_domain_attach_device(struct iommu_domain *domain,
4098                                         struct device *dev)
4099 {
4100         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4101         struct intel_iommu *iommu;
4102         int addr_width;
4103
4104         iommu = device_to_iommu(dev, NULL, NULL);
4105         if (!iommu)
4106                 return -ENODEV;
4107
4108         if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4109                 return -EINVAL;
4110
4111         /* check if this iommu agaw is sufficient for max mapped address */
4112         addr_width = agaw_to_width(iommu->agaw);
4113         if (addr_width > cap_mgaw(iommu->cap))
4114                 addr_width = cap_mgaw(iommu->cap);
4115
4116         if (dmar_domain->max_addr > (1LL << addr_width))
4117                 return -EINVAL;
4118         dmar_domain->gaw = addr_width;
4119
4120         /*
4121          * Knock out extra levels of page tables if necessary
4122          */
4123         while (iommu->agaw < dmar_domain->agaw) {
4124                 struct dma_pte *pte;
4125
4126                 pte = dmar_domain->pgd;
4127                 if (dma_pte_present(pte)) {
4128                         dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4129                         free_pgtable_page(pte);
4130                 }
4131                 dmar_domain->agaw--;
4132         }
4133
4134         return 0;
4135 }
4136
4137 static int intel_iommu_attach_device(struct iommu_domain *domain,
4138                                      struct device *dev)
4139 {
4140         struct device_domain_info *info = dev_iommu_priv_get(dev);
4141         int ret;
4142
4143         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4144             device_is_rmrr_locked(dev)) {
4145                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4146                 return -EPERM;
4147         }
4148
4149         if (info->domain)
4150                 device_block_translation(dev);
4151
4152         ret = prepare_domain_attach_device(domain, dev);
4153         if (ret)
4154                 return ret;
4155
4156         return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4157 }
4158
4159 static int intel_iommu_map(struct iommu_domain *domain,
4160                            unsigned long iova, phys_addr_t hpa,
4161                            size_t size, int iommu_prot, gfp_t gfp)
4162 {
4163         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4164         u64 max_addr;
4165         int prot = 0;
4166
4167         if (iommu_prot & IOMMU_READ)
4168                 prot |= DMA_PTE_READ;
4169         if (iommu_prot & IOMMU_WRITE)
4170                 prot |= DMA_PTE_WRITE;
4171         if (dmar_domain->set_pte_snp)
4172                 prot |= DMA_PTE_SNP;
4173
4174         max_addr = iova + size;
4175         if (dmar_domain->max_addr < max_addr) {
4176                 u64 end;
4177
4178                 /* check if minimum agaw is sufficient for mapped address */
4179                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4180                 if (end < max_addr) {
4181                         pr_err("%s: iommu width (%d) is not "
4182                                "sufficient for the mapped address (%llx)\n",
4183                                __func__, dmar_domain->gaw, max_addr);
4184                         return -EFAULT;
4185                 }
4186                 dmar_domain->max_addr = max_addr;
4187         }
4188         /* Round up size to next multiple of PAGE_SIZE, if it and
4189            the low bits of hpa would take us onto the next page */
4190         size = aligned_nrpages(hpa, size);
4191         return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4192                                 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4193 }
4194
4195 static int intel_iommu_map_pages(struct iommu_domain *domain,
4196                                  unsigned long iova, phys_addr_t paddr,
4197                                  size_t pgsize, size_t pgcount,
4198                                  int prot, gfp_t gfp, size_t *mapped)
4199 {
4200         unsigned long pgshift = __ffs(pgsize);
4201         size_t size = pgcount << pgshift;
4202         int ret;
4203
4204         if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4205                 return -EINVAL;
4206
4207         if (!IS_ALIGNED(iova | paddr, pgsize))
4208                 return -EINVAL;
4209
4210         ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4211         if (!ret && mapped)
4212                 *mapped = size;
4213
4214         return ret;
4215 }
4216
4217 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4218                                 unsigned long iova, size_t size,
4219                                 struct iommu_iotlb_gather *gather)
4220 {
4221         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4222         unsigned long start_pfn, last_pfn;
4223         int level = 0;
4224
4225         /* Cope with horrid API which requires us to unmap more than the
4226            size argument if it happens to be a large-page mapping. */
4227         if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4228                                      &level, GFP_ATOMIC)))
4229                 return 0;
4230
4231         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4232                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4233
4234         start_pfn = iova >> VTD_PAGE_SHIFT;
4235         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4236
4237         domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4238
4239         if (dmar_domain->max_addr == iova + size)
4240                 dmar_domain->max_addr = iova;
4241
4242         /*
4243          * We do not use page-selective IOTLB invalidation in the flush
4244          * queue, so there is no need to track pages and sync the IOTLB.
4245          */
4246         if (!iommu_iotlb_gather_queued(gather))
4247                 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4248
4249         return size;
4250 }
4251
4252 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4253                                       unsigned long iova,
4254                                       size_t pgsize, size_t pgcount,
4255                                       struct iommu_iotlb_gather *gather)
4256 {
4257         unsigned long pgshift = __ffs(pgsize);
4258         size_t size = pgcount << pgshift;
4259
4260         return intel_iommu_unmap(domain, iova, size, gather);
4261 }
4262
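     /*
      * Flush the IOTLB of every IOMMU the domain is attached to for the
      * range collected in the gather, then free the page-table pages that
      * were unlinked during unmap.
      */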
4263 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4264                                  struct iommu_iotlb_gather *gather)
4265 {
4266         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4267         unsigned long iova_pfn = IOVA_PFN(gather->start);
4268         size_t size = gather->end - gather->start;
4269         struct iommu_domain_info *info;
4270         unsigned long start_pfn;
4271         unsigned long nrpages;
4272         unsigned long i;
4273
4274         nrpages = aligned_nrpages(gather->start, size);
4275         start_pfn = mm_to_dma_pfn(iova_pfn);
4276
4277         xa_for_each(&dmar_domain->iommu_array, i, info)
4278                 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4279                                       start_pfn, nrpages,
4280                                       list_empty(&gather->freelist), 0);
4281
4282         put_pages_list(&gather->freelist);
4283 }
4284
4285 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4286                                             dma_addr_t iova)
4287 {
4288         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4289         struct dma_pte *pte;
4290         int level = 0;
4291         u64 phys = 0;
4292
4293         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4294                              GFP_ATOMIC);
4295         if (pte && dma_pte_present(pte))
4296                 phys = dma_pte_addr(pte) +
4297                         (iova & (BIT_MASK(level_to_offset_bits(level) +
4298                                                 VTD_PAGE_SHIFT) - 1));
4299
4300         return phys;
4301 }
4302
4303 static bool domain_support_force_snooping(struct dmar_domain *domain)
4304 {
4305         struct device_domain_info *info;
4306         bool support = true;
4307
4308         assert_spin_locked(&domain->lock);
4309         list_for_each_entry(info, &domain->devices, link) {
4310                 if (!ecap_sc_support(info->iommu->ecap)) {
4311                         support = false;
4312                         break;
4313                 }
4314         }
4315
4316         return support;
4317 }
4318
4319 static void domain_set_force_snooping(struct dmar_domain *domain)
4320 {
4321         struct device_domain_info *info;
4322
4323         assert_spin_locked(&domain->lock);
4324         /*
4325          * The second-level page table supports per-PTE snoop control; the
4326          * iommu_map() interface handles this by setting the SNP bit.
4327          */
4328         if (!domain->use_first_level) {
4329                 domain->set_pte_snp = true;
4330                 return;
4331         }
4332
4333         list_for_each_entry(info, &domain->devices, link)
4334                 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4335                                                      PASID_RID2PASID);
4336 }
4337
4338 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4339 {
4340         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4341         unsigned long flags;
4342
4343         if (dmar_domain->force_snooping)
4344                 return true;
4345
4346         spin_lock_irqsave(&dmar_domain->lock, flags);
4347         if (!domain_support_force_snooping(dmar_domain)) {
4348                 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4349                 return false;
4350         }
4351
4352         domain_set_force_snooping(dmar_domain);
4353         dmar_domain->force_snooping = true;
4354         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4355
4356         return true;
4357 }
4358
4359 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4360 {
4361         struct device_domain_info *info = dev_iommu_priv_get(dev);
4362
4363         switch (cap) {
4364         case IOMMU_CAP_CACHE_COHERENCY:
4365                 return true;
4366         case IOMMU_CAP_PRE_BOOT_PROTECTION:
4367                 return dmar_platform_optin();
4368         case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4369                 return ecap_sc_support(info->iommu->ecap);
4370         default:
4371                 return false;
4372         }
4373 }
4374
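     /*
      * Set up per-device IOMMU data when a device is added: record its
      * bus/devfn/segment, probe ATS, PRI and PASID capabilities, and
      * allocate a PASID table when scalable mode is supported.
      */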
4375 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4376 {
4377         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4378         struct device_domain_info *info;
4379         struct intel_iommu *iommu;
4380         u8 bus, devfn;
4381         int ret;
4382
4383         iommu = device_to_iommu(dev, &bus, &devfn);
4384         if (!iommu || !iommu->iommu.ops)
4385                 return ERR_PTR(-ENODEV);
4386
4387         info = kzalloc(sizeof(*info), GFP_KERNEL);
4388         if (!info)
4389                 return ERR_PTR(-ENOMEM);
4390
4391         if (dev_is_real_dma_subdevice(dev)) {
4392                 info->bus = pdev->bus->number;
4393                 info->devfn = pdev->devfn;
4394                 info->segment = pci_domain_nr(pdev->bus);
4395         } else {
4396                 info->bus = bus;
4397                 info->devfn = devfn;
4398                 info->segment = iommu->segment;
4399         }
4400
4401         info->dev = dev;
4402         info->iommu = iommu;
4403         if (dev_is_pci(dev)) {
4404                 if (ecap_dev_iotlb_support(iommu->ecap) &&
4405                     pci_ats_supported(pdev) &&
4406                     dmar_ats_supported(pdev, iommu)) {
4407                         info->ats_supported = 1;
4408                         info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4409
4410                         /*
4411                          * For IOMMUs that support device IOTLB throttling
4412                          * (DIT), assign the PF's source ID (PFSID) to a VF's
4413                          * invalidation descriptors so that the hardware can
4414                          * gauge queue depth at the PF level. If DIT is not
4415                          * supported, PFSID is treated as reserved (left 0).
4416                          */
4417                         if (ecap_dit(iommu->ecap))
4418                                 info->pfsid = pci_dev_id(pci_physfn(pdev));
4419                         info->ats_qdep = pci_ats_queue_depth(pdev);
4420                 }
4421                 if (sm_supported(iommu)) {
4422                         if (pasid_supported(iommu)) {
4423                                 int features = pci_pasid_features(pdev);
4424
4425                                 if (features >= 0)
4426                                         info->pasid_supported = features | 1;
4427                         }
4428
4429                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
4430                             pci_pri_supported(pdev))
4431                                 info->pri_supported = 1;
4432                 }
4433         }
4434
4435         dev_iommu_priv_set(dev, info);
4436
4437         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4438                 ret = intel_pasid_alloc_table(dev);
4439                 if (ret) {
4440                         dev_err(dev, "PASID table allocation failed\n");
4441                         dev_iommu_priv_set(dev, NULL);
4442                         kfree(info);
4443                         return ERR_PTR(ret);
4444                 }
4445         }
4446
4447         return &iommu->iommu;
4448 }
4449
4450 static void intel_iommu_release_device(struct device *dev)
4451 {
4452         struct device_domain_info *info = dev_iommu_priv_get(dev);
4453
4454         dmar_remove_one_dev_info(dev);
4455         intel_pasid_free_table(dev);
4456         dev_iommu_priv_set(dev, NULL);
4457         kfree(info);
4458         set_dma_ops(dev, NULL);
4459 }
4460
4461 static void intel_iommu_probe_finalize(struct device *dev)
4462 {
4463         set_dma_ops(dev, NULL);
4464         iommu_setup_dma_ops(dev, 0, U64_MAX);
4465 }
4466
4467 static void intel_iommu_get_resv_regions(struct device *device,
4468                                          struct list_head *head)
4469 {
4470         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4471         struct iommu_resv_region *reg;
4472         struct dmar_rmrr_unit *rmrr;
4473         struct device *i_dev;
4474         int i;
4475
4476         rcu_read_lock();
4477         for_each_rmrr_units(rmrr) {
4478                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4479                                           i, i_dev) {
4480                         struct iommu_resv_region *resv;
4481                         enum iommu_resv_type type;
4482                         size_t length;
4483
4484                         if (i_dev != device &&
4485                             !is_downstream_to_pci_bridge(device, i_dev))
4486                                 continue;
4487
4488                         length = rmrr->end_address - rmrr->base_address + 1;
4489
4490                         type = device_rmrr_is_relaxable(device) ?
4491                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4492
4493                         resv = iommu_alloc_resv_region(rmrr->base_address,
4494                                                        length, prot, type,
4495                                                        GFP_ATOMIC);
4496                         if (!resv)
4497                                 break;
4498
4499                         list_add_tail(&resv->list, head);
4500                 }
4501         }
4502         rcu_read_unlock();
4503
4504 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4505         if (dev_is_pci(device)) {
4506                 struct pci_dev *pdev = to_pci_dev(device);
4507
4508                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4509                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4510                                         IOMMU_RESV_DIRECT_RELAXABLE,
4511                                         GFP_KERNEL);
4512                         if (reg)
4513                                 list_add_tail(&reg->list, head);
4514                 }
4515         }
4516 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4517
4518         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4519                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4520                                       0, IOMMU_RESV_MSI, GFP_KERNEL);
4521         if (!reg)
4522                 return;
4523         list_add_tail(&reg->list, head);
4524 }
4525
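/*
 * Editor's illustrative sketch, not part of the driver: a consumer normally
 * gathers and releases reserved regions through the core helpers rather than
 * calling intel_iommu_get_resv_regions() directly. The function below only
 * demonstrates the expected list handling and is an assumption, not existing
 * code.
 */
static void __maybe_unused example_dump_resv_regions(struct device *dev)
{
        struct iommu_resv_region *region;
        LIST_HEAD(resv_regions);

        iommu_get_resv_regions(dev, &resv_regions);
        list_for_each_entry(region, &resv_regions, list)
                dev_info(dev, "resv region: start %pa length %zu type %d\n",
                         &region->start, region->length, region->type);
        iommu_put_resv_regions(dev, &resv_regions);
}
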
4526 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4527 {
4528         if (dev_is_pci(dev))
4529                 return pci_device_group(dev);
4530         return generic_device_group(dev);
4531 }
4532
4533 static int intel_iommu_enable_sva(struct device *dev)
4534 {
4535         struct device_domain_info *info = dev_iommu_priv_get(dev);
4536         struct intel_iommu *iommu;
4537
4538         if (!info || dmar_disabled)
4539                 return -EINVAL;
4540
4541         iommu = info->iommu;
4542         if (!iommu)
4543                 return -EINVAL;
4544
4545         if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4546                 return -ENODEV;
4547
4548         if (!info->pasid_enabled || !info->ats_enabled)
4549                 return -EINVAL;
4550
4551         /*
4552          * Devices with device-specific I/O fault handling should not
4553          * support PCI/PRI. The IOMMU side has no means to check the
4554          * capability of device-specific IOPF, so it can only assume
4555          * that if the device driver enables SVA on a non-PRI device,
4556          * the driver will handle IOPF in its own way.
4557          */
4558         if (!info->pri_supported)
4559                 return 0;
4560
4561         /* Devices supporting PRI should have it enabled. */
4562         if (!info->pri_enabled)
4563                 return -EINVAL;
4564
4565         return 0;
4566 }
4567
4568 static int intel_iommu_enable_iopf(struct device *dev)
4569 {
4570         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4571         struct device_domain_info *info = dev_iommu_priv_get(dev);
4572         struct intel_iommu *iommu;
4573         int ret;
4574
4575         if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4576                 return -ENODEV;
4577
4578         if (info->pri_enabled)
4579                 return -EBUSY;
4580
4581         iommu = info->iommu;
4582         if (!iommu)
4583                 return -EINVAL;
4584
4585         /* PASID is required in PRG Response Message. */
4586         if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4587                 return -EINVAL;
4588
4589         ret = pci_reset_pri(pdev);
4590         if (ret)
4591                 return ret;
4592
4593         ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4594         if (ret)
4595                 return ret;
4596
4597         ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4598         if (ret)
4599                 goto iopf_remove_device;
4600
4601         ret = pci_enable_pri(pdev, PRQ_DEPTH);
4602         if (ret)
4603                 goto iopf_unregister_handler;
4604         info->pri_enabled = 1;
4605
4606         return 0;
4607
4608 iopf_unregister_handler:
4609         iommu_unregister_device_fault_handler(dev);
4610 iopf_remove_device:
4611         iopf_queue_remove_device(iommu->iopf_queue, dev);
4612
4613         return ret;
4614 }
4615
4616 static int intel_iommu_disable_iopf(struct device *dev)
4617 {
4618         struct device_domain_info *info = dev_iommu_priv_get(dev);
4619         struct intel_iommu *iommu = info->iommu;
4620
4621         if (!info->pri_enabled)
4622                 return -EINVAL;
4623
4624         /*
4625          * The PCIe spec states that clearing the PRI enable bit stops the
4626          * Page Request Interface from issuing new page requests, but page
4627          * requests already transmitted or queued for transmission may
4628          * still be outstanding. This is supposed to be called after the
4629          * device driver has stopped DMA, all PASIDs have been unbound and
4630          * the outstanding PRQs have been drained.
4631          */
4632         pci_disable_pri(to_pci_dev(dev));
4633         info->pri_enabled = 0;
4634
4635         /*
4636          * With PRI disabled and outstanding PRQs drained, unregistering
4637          * the fault handler and removing the device from the iopf queue
4638          * should never fail.
4639          */
4640         WARN_ON(iommu_unregister_device_fault_handler(dev));
4641         WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4642
4643         return 0;
4644 }
4645
4646 static int
4647 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4648 {
4649         switch (feat) {
4650         case IOMMU_DEV_FEAT_IOPF:
4651                 return intel_iommu_enable_iopf(dev);
4652
4653         case IOMMU_DEV_FEAT_SVA:
4654                 return intel_iommu_enable_sva(dev);
4655
4656         default:
4657                 return -ENODEV;
4658         }
4659 }
4660
4661 static int
4662 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4663 {
4664         switch (feat) {
4665         case IOMMU_DEV_FEAT_IOPF:
4666                 return intel_iommu_disable_iopf(dev);
4667
4668         case IOMMU_DEV_FEAT_SVA:
4669                 return 0;
4670
4671         default:
4672                 return -ENODEV;
4673         }
4674 }
4675
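/*
 * Editor's illustrative sketch, not part of the driver: a device driver opts
 * into these features through the core API, enabling IOPF before SVA and
 * tearing them down in the reverse order. The function name and condensed
 * error handling are assumptions for illustration only.
 */
static int __maybe_unused example_enable_sva_features(struct device *dev)
{
        int ret;

        ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_IOPF);
        if (ret)
                return ret;

        ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA);
        if (ret)
                iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_IOPF);

        return ret;
}
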
4676 static bool intel_iommu_is_attach_deferred(struct device *dev)
4677 {
4678         struct device_domain_info *info = dev_iommu_priv_get(dev);
4679
4680         return translation_pre_enabled(info->iommu) && !info->domain;
4681 }
4682
4683 /*
4684  * Check that the device does not live on an external-facing PCI port that is
4685  * marked as untrusted. Quirks must not be applied to such devices, so that
4686  * they cannot use a quirk to bypass the IOMMU restrictions.
4687  */
4688 static bool risky_device(struct pci_dev *pdev)
4689 {
4690         if (pdev->untrusted) {
4691                 pci_info(pdev,
4692                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4693                          pdev->vendor, pdev->device);
4694                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4695                 return true;
4696         }
4697         return false;
4698 }
4699
4700 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4701                                        unsigned long iova, size_t size)
4702 {
4703         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4704         unsigned long pages = aligned_nrpages(iova, size);
4705         unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4706         struct iommu_domain_info *info;
4707         unsigned long i;
4708
4709         xa_for_each(&dmar_domain->iommu_array, i, info)
4710                 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4711 }
4712
4713 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4714 {
4715         struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4716         struct iommu_domain *domain;
4717
4718         /* Domain type specific cleanup: */
4719         domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4720         if (domain) {
4721                 switch (domain->type) {
4722                 case IOMMU_DOMAIN_SVA:
4723                         intel_svm_remove_dev_pasid(dev, pasid);
4724                         break;
4725                 default:
4726                         /* should never reach here */
4727                         WARN_ON(1);
4728                         break;
4729                 }
4730         }
4731
4732         intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4733 }
4734
4735 const struct iommu_ops intel_iommu_ops = {
4736         .capable                = intel_iommu_capable,
4737         .domain_alloc           = intel_iommu_domain_alloc,
4738         .probe_device           = intel_iommu_probe_device,
4739         .probe_finalize         = intel_iommu_probe_finalize,
4740         .release_device         = intel_iommu_release_device,
4741         .get_resv_regions       = intel_iommu_get_resv_regions,
4742         .device_group           = intel_iommu_device_group,
4743         .dev_enable_feat        = intel_iommu_dev_enable_feat,
4744         .dev_disable_feat       = intel_iommu_dev_disable_feat,
4745         .is_attach_deferred     = intel_iommu_is_attach_deferred,
4746         .def_domain_type        = device_def_domain_type,
4747         .remove_dev_pasid       = intel_iommu_remove_dev_pasid,
4748         .pgsize_bitmap          = SZ_4K,
4749 #ifdef CONFIG_INTEL_IOMMU_SVM
4750         .page_response          = intel_svm_page_response,
4751 #endif
4752         .default_domain_ops = &(const struct iommu_domain_ops) {
4753                 .attach_dev             = intel_iommu_attach_device,
4754                 .map_pages              = intel_iommu_map_pages,
4755                 .unmap_pages            = intel_iommu_unmap_pages,
4756                 .iotlb_sync_map         = intel_iommu_iotlb_sync_map,
4757                 .flush_iotlb_all        = intel_flush_iotlb_all,
4758                 .iotlb_sync             = intel_iommu_tlb_sync,
4759                 .iova_to_phys           = intel_iommu_iova_to_phys,
4760                 .free                   = intel_iommu_domain_free,
4761                 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4762         }
4763 };
4764
4765 static void quirk_iommu_igfx(struct pci_dev *dev)
4766 {
4767         if (risky_device(dev))
4768                 return;
4769
4770         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4771         dmar_map_gfx = 0;
4772 }
4773
4774 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4775 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4776 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4777 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4778 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4779 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4780 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4781 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4782
4783 /* Broadwell igfx malfunctions with dmar */
4784 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4785 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4786 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4787 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4788 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4789 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4790 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4791 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4792 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4793 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4794 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4795 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4796 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4797 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4798 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4799 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4800 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4801 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4802 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4803 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4805 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4806 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4807 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4808
4809 static void quirk_iommu_rwbf(struct pci_dev *dev)
4810 {
4811         if (risky_device(dev))
4812                 return;
4813
4814         /*
4815          * The Mobile 4 Series Chipset neglects to set the RWBF capability
4816          * but needs it. The same seems to hold for the desktop versions.
4817          */
4818         pci_info(dev, "Forcing write-buffer flush capability\n");
4819         rwbf_quirk = 1;
4820 }
4821
4822 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4823 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4824 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4825 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4826 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4827 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4828 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4829
4830 #define GGC 0x52
4831 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4832 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4833 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4834 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4835 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4836 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4837 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4838 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4839
4840 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4841 {
4842         unsigned short ggc;
4843
4844         if (risky_device(dev))
4845                 return;
4846
4847         if (pci_read_config_word(dev, GGC, &ggc))
4848                 return;
4849
4850         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4851                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4852                 dmar_map_gfx = 0;
4853         } else if (dmar_map_gfx) {
4854                 /* we have to ensure the gfx device is idle before we flush */
4855                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4856                 iommu_set_dma_strict();
4857         }
4858 }
4859 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4860 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4861 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4862 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4863
4864 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4865 {
4866         unsigned short ver;
4867
4868         if (!IS_GFX_DEVICE(dev))
4869                 return;
4870
4871         ver = (dev->device >> 8) & 0xff;
4872         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4873             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4874             ver != 0x9a && ver != 0xa7)
4875                 return;
4876
4877         if (risky_device(dev))
4878                 return;
4879
4880         pci_info(dev, "Skip IOMMU disabling for graphics\n");
4881         iommu_skip_te_disable = 1;
4882 }
4883 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4884
4885 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4886    ISOCH DMAR unit for the Azalia sound device, but not give it any
4887    TLB entries, which causes it to deadlock. Check for that.  We do
4888    this in a function called from init_dmars(), instead of in a PCI
4889    quirk, because we don't want to print the obnoxious "BIOS broken"
4890    message if VT-d is actually disabled.
4891 */
4892 static void __init check_tylersburg_isoch(void)
4893 {
4894         struct pci_dev *pdev;
4895         uint32_t vtisochctrl;
4896
4897         /* If there's no Azalia in the system anyway, forget it. */
4898         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4899         if (!pdev)
4900                 return;
4901
4902         if (risky_device(pdev)) {
4903                 pci_dev_put(pdev);
4904                 return;
4905         }
4906
4907         pci_dev_put(pdev);
4908
4909         /* System Management Registers. Might be hidden, in which case
4910            we can't do the sanity check. But that's OK, because the
4911            known-broken BIOSes _don't_ actually hide it, so far. */
4912         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4913         if (!pdev)
4914                 return;
4915
4916         if (risky_device(pdev)) {
4917                 pci_dev_put(pdev);
4918                 return;
4919         }
4920
4921         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4922                 pci_dev_put(pdev);
4923                 return;
4924         }
4925
4926         pci_dev_put(pdev);
4927
4928         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4929         if (vtisochctrl & 1)
4930                 return;
4931
4932         /* Drop all bits other than the number of TLB entries */
4933         vtisochctrl &= 0x1c;
4934
4935         /* If we have the recommended number of TLB entries (16), fine. */
4936         if (vtisochctrl == 0x10)
4937                 return;
4938
4939         /* Zero TLB entries? The BIOS is badly broken; warn and work around it. */
4940         if (!vtisochctrl) {
4941                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4942                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4943                      dmi_get_system_info(DMI_BIOS_VENDOR),
4944                      dmi_get_system_info(DMI_BIOS_VERSION),
4945                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4946                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4947                 return;
4948         }
4949
4950         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4951                vtisochctrl);
4952 }
4953
4954 /*
4955  * Here we deal with a device TLB defect where the device may inadvertently
4956  * issue an ATS invalidation completion before posted writes that were initiated
4957  * with a translated address using translations matching the invalidation
4958  * address range, violating the invalidation completion ordering.
4959  * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
4960  * vulnerable to this defect. In other words, any dTLB invalidation that is not
4961  * initiated under the control of the trusted/privileged host device driver must
4962  * use this quirk.
4963  * Device TLBs are invalidated under the following six conditions:
4964  * 1. Device driver does DMA API unmap IOVA
4965  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4966  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4967  *    exit_mmap() due to crash
4968  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4969  *    VM has to free pages that were unmapped
4970  * 5. Userspace driver unmaps a DMA buffer
4971  * 6. Cache invalidation in vSVA usage (upcoming)
4972  *
4973  * For #1 and #2, device drivers are responsible for stopping DMA traffic
4974  * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier callback
4975  * to invalidate the TLB the same way as a normal user unmap, which uses this quirk.
4976  * The dTLB invalidation after PASID cache flush does not need this quirk.
4977  *
4978  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4979  */
4980 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4981                                unsigned long address, unsigned long mask,
4982                                u32 pasid, u16 qdep)
4983 {
4984         u16 sid;
4985
4986         if (likely(!info->dtlb_extra_inval))
4987                 return;
4988
4989         sid = PCI_DEVID(info->bus, info->devfn);
4990         if (pasid == PASID_RID2PASID) {
4991                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4992                                    qdep, address, mask);
4993         } else {
4994                 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4995                                          pasid, qdep, address, mask);
4996         }
4997 }
4998
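/*
 * Editor's illustrative sketch, not part of the driver: a device-TLB flush
 * path pairs the regular device IOTLB invalidation with the quirk above,
 * passing the same address, mask and queue depth. The helper name is an
 * assumption for illustration only.
 */
static void __maybe_unused example_flush_dev_iotlb(struct device_domain_info *info,
                                                   u64 addr, unsigned int mask)
{
        u16 sid = PCI_DEVID(info->bus, info->devfn);

        qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, info->ats_qdep,
                           addr, mask);
        quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID,
                                  info->ats_qdep);
}
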
4999 #define ecmd_get_status_code(res)       (((res) & 0xff) >> 1)
5000
5001 /*
5002  * Function to submit a command to the enhanced command interface. The
5003  * valid enhanced command descriptions are defined in Table 47 of the
5004  * VT-d spec. The VT-d hardware implementation may support some but not
5005  * all commands, which can be determined by checking the Enhanced
5006  * Command Capability Register.
5007  *
5008  * Return values:
5009  *  - 0: Command successful without any error;
5010  *  - Negative: software error value;
5011  *  - Nonzero positive: failure status code defined in Table 48.
5012  */
5013 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5014 {
5015         unsigned long flags;
5016         u64 res;
5017         int ret;
5018
5019         if (!cap_ecmds(iommu->cap))
5020                 return -ENODEV;
5021
5022         raw_spin_lock_irqsave(&iommu->register_lock, flags);
5023
5024         res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5025         if (res & DMA_ECMD_ECRSP_IP) {
5026                 ret = -EBUSY;
5027                 goto err;
5028         }
5029
5030         /*
5031          * Unconditionally write operand B, because:
5032          * - There is no side effect if an ecmd doesn't require operand B
5033          *   and we set the register to some value anyway.
5034          * - This is not invoked in any critical path, so the extra MMIO
5035          *   write is not a performance concern.
5036          */
5037         dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5038         dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5039
5040         IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5041                       !(res & DMA_ECMD_ECRSP_IP), res);
5042
5043         if (res & DMA_ECMD_ECRSP_IP) {
5044                 ret = -ETIMEDOUT;
5045                 goto err;
5046         }
5047
5048         ret = ecmd_get_status_code(res);
5049 err:
5050         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5051
5052         return ret;
5053 }
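
/*
 * Editor's illustrative sketch, not part of the driver: how a caller (e.g.
 * the perfmon code) might interpret the tri-state return value documented
 * above. The wrapper name is an assumption for illustration only.
 */
static int __maybe_unused example_ecmd(struct intel_iommu *iommu, u8 ecmd)
{
        int ret = ecmd_submit_sync(iommu, ecmd, 0, 0);

        if (ret < 0)            /* software error, e.g. -ENODEV or -ETIMEDOUT */
                return ret;
        if (ret > 0)            /* hardware status code from Table 48 */
                return -EIO;

        return 0;               /* command completed without error */
}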