]> Git Repo - linux.git/blob - drivers/iommu/intel/iommu.c
Merge tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg...
[linux.git] / drivers / iommu / intel / iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <[email protected]>,
6  *          Ashok Raj <[email protected]>,
7  *          Shaohua Li <[email protected]>,
8  *          Anil S Keshavamurthy <[email protected]>,
9  *          Fenghua Yu <[email protected]>
10  *          Joerg Roedel <[email protected]>
11  */
12
/* Every pr_*() format string in this file is prefixed with "DMAR: ". */
#define pr_fmt(fmt)     "DMAR: " fmt
/* dev_*() format strings reuse the same prefix. */
#define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34
/* The root table and each context table are sized as one VT-d page. */
#define ROOT_SIZE               VTD_PAGE_SIZE
#define CONTEXT_SIZE            VTD_PAGE_SIZE

/*
 * PCI device predicates. The argument is parenthesised in every macro so
 * that any pointer-valued expression can be passed safely.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42
/* Bounds of the address window named after the IOAPIC. */
#define IOAPIC_RANGE_START      (0xfee00000)
#define IOAPIC_RANGE_END        (0xfeefffff)
/* Lowest IOVA byte address; corresponds to IOVA_START_PFN with 4KiB pages. */
#define IOVA_START_ADDR         (0x1000)

/* Address width passed to the AGAW search by iommu_calculate_agaw(). */
#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

/* Upper bound on an address width, and the same bound expressed in PFN bits. */
#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

/* Largest PFN / largest byte address representable in @gaw address bits. */
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/*
 * We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
 * to match. That way, we can use 'unsigned long' for PFNs with impunity.
 */
#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN          (1)

/* Convert a byte address into a page frame number using the MM page size. */
#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)

/* page table handling: each level decodes LEVEL_STRIDE bits of the PFN */
#define LEVEL_STRIDE            (9)
#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
69
/* Translate an AGAW encoding into a page-table level count (AGAW + 2). */
static inline int agaw_to_level(int agaw)
{
        const int levels = 2 + agaw;

        return levels;
}
74
75 static inline int agaw_to_width(int agaw)
76 {
77         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
78 }
79
80 static inline int width_to_agaw(int width)
81 {
82         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
83 }
84
85 static inline unsigned int level_to_offset_bits(int level)
86 {
87         return (level - 1) * LEVEL_STRIDE;
88 }
89
90 static inline int pfn_level_offset(u64 pfn, int level)
91 {
92         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
93 }
94
95 static inline u64 level_mask(int level)
96 {
97         return -1ULL << level_to_offset_bits(level);
98 }
99
100 static inline u64 level_size(int level)
101 {
102         return 1ULL << level_to_offset_bits(level);
103 }
104
105 static inline u64 align_to_level(u64 pfn, int level)
106 {
107         return (pfn + level_size(level) - 1) & level_mask(level);
108 }
109
110 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
111 {
112         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
113 }
114
115 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
116    are never going to work. */
117 static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn)
118 {
119         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
120 }
/* Return the last VT-d PFN that falls inside MM page @mm_pfn. */
static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn)
{
        /* One before the first VT-d PFN of the following MM page. */
        return mm_to_dma_pfn_start(mm_pfn + 1) - 1;
}
/* Return the first VT-d PFN backing the MM page @pg. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
        unsigned long mm_pfn = page_to_pfn(pg);

        return mm_to_dma_pfn_start(mm_pfn);
}
/* Return the first VT-d PFN of the page containing kernel address @p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
        struct page *pg = virt_to_page(p);

        return page_to_dma_pfn(pg);
}
133
/* Forward declaration; the definition is not in this part of the file. */
static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
/* Set to 1 by the "tboot_noforce" option of intel_iommu=. */
static int intel_iommu_tboot_noforce;
/* Set to 1 by the "off" option of intel_iommu=. */
static int no_platform_optin;

/* Number of root entries that fit in one VT-d page. */
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
146
147 /*
148  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
149  * if marked present.
150  */
151 static phys_addr_t root_entry_lctp(struct root_entry *re)
152 {
153         if (!(re->lo & 1))
154                 return 0;
155
156         return re->lo & VTD_PAGE_MASK;
157 }
158
159 /*
160  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
161  * if marked present.
162  */
163 static phys_addr_t root_entry_uctp(struct root_entry *re)
164 {
165         if (!(re->hi & 1))
166                 return 0;
167
168         return re->hi & VTD_PAGE_MASK;
169 }
170
171 static inline void context_set_present(struct context_entry *context)
172 {
173         context->lo |= 1;
174 }
175
176 static inline void context_set_fault_enable(struct context_entry *context)
177 {
178         context->lo &= (((u64)-1) << 2) | 1;
179 }
180
181 static inline void context_set_translation_type(struct context_entry *context,
182                                                 unsigned long value)
183 {
184         context->lo &= (((u64)-1) << 4) | 3;
185         context->lo |= (value & 3) << 2;
186 }
187
188 static inline void context_set_address_root(struct context_entry *context,
189                                             unsigned long value)
190 {
191         context->lo &= ~VTD_PAGE_MASK;
192         context->lo |= value & VTD_PAGE_MASK;
193 }
194
195 static inline void context_set_address_width(struct context_entry *context,
196                                              unsigned long value)
197 {
198         context->hi |= value & 7;
199 }
200
201 static inline void context_set_domain_id(struct context_entry *context,
202                                          unsigned long value)
203 {
204         context->hi |= (value & ((1 << 16) - 1)) << 8;
205 }
206
207 static inline void context_set_pasid(struct context_entry *context)
208 {
209         context->lo |= CONTEXT_PASIDE;
210 }
211
212 static inline int context_domain_id(struct context_entry *c)
213 {
214         return((c->hi >> 8) & 0xffff);
215 }
216
217 static inline void context_clear_entry(struct context_entry *context)
218 {
219         context->lo = 0;
220         context->hi = 0;
221 }
222
223 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
224 {
225         if (!iommu->copied_tables)
226                 return false;
227
228         return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
229 }
230
231 static inline void
232 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
233 {
234         set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
235 }
236
237 static inline void
238 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
239 {
240         clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
241 }
242
/*
 * This domain is a static identity mapping domain.
 *      1. This domain creates a static 1:1 mapping to all usable memory.
 *      2. It maps to each iommu if successful.
 *      3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
/* Defaults to 1; nothing in this part of the file changes it. */
static int hw_pass_through = 1;
251
/* One RMRR unit; instances are linked on dmar_rmrr_units through @list. */
struct dmar_rmrr_unit {
        struct list_head list;          /* list of rmrr units   */
        struct acpi_dmar_header *hdr;   /* ACPI header          */
        u64     base_address;           /* reserved base address */
        u64     end_address;            /* reserved end address */
        struct dmar_dev_scope *devices; /* target devices */
        int     devices_cnt;            /* target device count */
};
260
/* One ATSR unit; presumably linked on dmar_atsr_units through @list. */
struct dmar_atsr_unit {
        struct list_head list;          /* list of ATSR units */
        struct acpi_dmar_header *hdr;   /* ACPI header */
        struct dmar_dev_scope *devices; /* target devices */
        int devices_cnt;                /* target device count */
        u8 include_all:1;               /* include all ports */
};
268
/* One SATC unit; presumably linked on dmar_satc_units through @list. */
struct dmar_satc_unit {
        struct list_head list;          /* list of SATC units */
        struct acpi_dmar_header *hdr;   /* ACPI header */
        struct dmar_dev_scope *devices; /* target devices */
        struct intel_iommu *iommu;      /* the corresponding iommu */
        int devices_cnt;                /* target device count */
        u8 atc_required:1;              /* ATS is required */
};
277
/* List heads for the ATSR, RMRR and SATC unit structures defined above. */
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);
static LIST_HEAD(dmar_satc_units);

/* Iterate over every entry on dmar_rmrr_units. */
#define for_each_rmrr_units(rmrr) \
        list_for_each_entry(rmrr, &dmar_rmrr_units, list)

/* Forward declaration; the definition is not in this part of the file. */
static void intel_iommu_domain_free(struct iommu_domain *domain);

/*
 * Build-time defaults; both can be overridden by the intel_iommu= boot
 * option ("on"/"off" and "sm_on"/"sm_off" respectively).
 */
int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

/* Cleared by intel_iommu=igfx_off. */
static int dmar_map_gfx = 1;
/* Cleared by intel_iommu=sp_off. */
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int iommu_skip_te_disable;

/* NOTE(review): presumably flag bits for iommu_identity_mapping -- confirm. */
#define IDENTMAP_GFX            2
#define IDENTMAP_AZALIA         4

const struct iommu_ops intel_iommu_ops;
const struct iommu_dirty_ops intel_dirty_ops;
303
304 static bool translation_pre_enabled(struct intel_iommu *iommu)
305 {
306         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
307 }
308
309 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
310 {
311         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
312 }
313
314 static void init_translation_status(struct intel_iommu *iommu)
315 {
316         u32 gsts;
317
318         gsts = readl(iommu->reg + DMAR_GSTS_REG);
319         if (gsts & DMA_GSTS_TES)
320                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
321 }
322
323 static int __init intel_iommu_setup(char *str)
324 {
325         if (!str)
326                 return -EINVAL;
327
328         while (*str) {
329                 if (!strncmp(str, "on", 2)) {
330                         dmar_disabled = 0;
331                         pr_info("IOMMU enabled\n");
332                 } else if (!strncmp(str, "off", 3)) {
333                         dmar_disabled = 1;
334                         no_platform_optin = 1;
335                         pr_info("IOMMU disabled\n");
336                 } else if (!strncmp(str, "igfx_off", 8)) {
337                         dmar_map_gfx = 0;
338                         pr_info("Disable GFX device mapping\n");
339                 } else if (!strncmp(str, "forcedac", 8)) {
340                         pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
341                         iommu_dma_forcedac = true;
342                 } else if (!strncmp(str, "strict", 6)) {
343                         pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
344                         iommu_set_dma_strict();
345                 } else if (!strncmp(str, "sp_off", 6)) {
346                         pr_info("Disable supported super page\n");
347                         intel_iommu_superpage = 0;
348                 } else if (!strncmp(str, "sm_on", 5)) {
349                         pr_info("Enable scalable mode if hardware supports\n");
350                         intel_iommu_sm = 1;
351                 } else if (!strncmp(str, "sm_off", 6)) {
352                         pr_info("Scalable mode is disallowed\n");
353                         intel_iommu_sm = 0;
354                 } else if (!strncmp(str, "tboot_noforce", 13)) {
355                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
356                         intel_iommu_tboot_noforce = 1;
357                 } else {
358                         pr_notice("Unknown option - '%s'\n", str);
359                 }
360
361                 str += strcspn(str, ",");
362                 while (*str == ',')
363                         str++;
364         }
365
366         return 1;
367 }
368 __setup("intel_iommu=", intel_iommu_setup);
369
370 void *alloc_pgtable_page(int node, gfp_t gfp)
371 {
372         struct page *page;
373         void *vaddr = NULL;
374
375         page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
376         if (page)
377                 vaddr = page_address(page);
378         return vaddr;
379 }
380
/* Release a page whose kernel virtual address is @vaddr. */
void free_pgtable_page(void *vaddr)
{
        unsigned long addr = (unsigned long)vaddr;

        free_page(addr);
}
385
386 static inline int domain_type_is_si(struct dmar_domain *domain)
387 {
388         return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
389 }
390
391 static inline int domain_pfn_supported(struct dmar_domain *domain,
392                                        unsigned long pfn)
393 {
394         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
395
396         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
397 }
398
399 /*
400  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
401  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
402  * the returned SAGAW.
403  */
404 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
405 {
406         unsigned long fl_sagaw, sl_sagaw;
407
408         fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
409         sl_sagaw = cap_sagaw(iommu->cap);
410
411         /* Second level only. */
412         if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
413                 return sl_sagaw;
414
415         /* First level only. */
416         if (!ecap_slts(iommu->ecap))
417                 return fl_sagaw;
418
419         return fl_sagaw & sl_sagaw;
420 }
421
/*
 * Find the largest supported AGAW that does not exceed the AGAW derived
 * from @max_gaw. Returns -1 when no candidate is supported.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
        unsigned long sagaw = __iommu_calculate_sagaw(iommu);
        int agaw = width_to_agaw(max_gaw);

        /* Walk downwards until a supported width is found. */
        while (agaw >= 0 && !test_bit(agaw, &sagaw))
                agaw--;

        return agaw;
}
435
436 /*
437  * Calculate max SAGAW for each iommu.
438  */
439 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
440 {
441         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
442 }
443
444 /*
445  * calculate agaw for each iommu.
446  * "SAGAW" may be different across iommus, use a default agaw, and
447  * get a supported less agaw for iommus that don't support the default agaw.
448  */
449 int iommu_calculate_agaw(struct intel_iommu *iommu)
450 {
451         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
452 }
453
454 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
455 {
456         return sm_supported(iommu) ?
457                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
458 }
459
460 static void domain_update_iommu_coherency(struct dmar_domain *domain)
461 {
462         struct iommu_domain_info *info;
463         struct dmar_drhd_unit *drhd;
464         struct intel_iommu *iommu;
465         bool found = false;
466         unsigned long i;
467
468         domain->iommu_coherency = true;
469         xa_for_each(&domain->iommu_array, i, info) {
470                 found = true;
471                 if (!iommu_paging_structure_coherency(info->iommu)) {
472                         domain->iommu_coherency = false;
473                         break;
474                 }
475         }
476         if (found)
477                 return;
478
479         /* No hardware attached; use lowest common denominator */
480         rcu_read_lock();
481         for_each_active_iommu(iommu, drhd) {
482                 if (!iommu_paging_structure_coherency(iommu)) {
483                         domain->iommu_coherency = false;
484                         break;
485                 }
486         }
487         rcu_read_unlock();
488 }
489
490 static int domain_update_iommu_superpage(struct dmar_domain *domain,
491                                          struct intel_iommu *skip)
492 {
493         struct dmar_drhd_unit *drhd;
494         struct intel_iommu *iommu;
495         int mask = 0x3;
496
497         if (!intel_iommu_superpage)
498                 return 0;
499
500         /* set iommu_superpage to the smallest common denominator */
501         rcu_read_lock();
502         for_each_active_iommu(iommu, drhd) {
503                 if (iommu != skip) {
504                         if (domain && domain->use_first_level) {
505                                 if (!cap_fl1gp_support(iommu->cap))
506                                         mask = 0x1;
507                         } else {
508                                 mask &= cap_super_page_val(iommu->cap);
509                         }
510
511                         if (!mask)
512                                 break;
513                 }
514         }
515         rcu_read_unlock();
516
517         return fls(mask);
518 }
519
520 static int domain_update_device_node(struct dmar_domain *domain)
521 {
522         struct device_domain_info *info;
523         int nid = NUMA_NO_NODE;
524         unsigned long flags;
525
526         spin_lock_irqsave(&domain->lock, flags);
527         list_for_each_entry(info, &domain->devices, link) {
528                 /*
529                  * There could possibly be multiple device numa nodes as devices
530                  * within the same domain may sit behind different IOMMUs. There
531                  * isn't perfect answer in such situation, so we select first
532                  * come first served policy.
533                  */
534                 nid = dev_to_node(info->dev);
535                 if (nid != NUMA_NO_NODE)
536                         break;
537         }
538         spin_unlock_irqrestore(&domain->lock, flags);
539
540         return nid;
541 }
542
543 static void domain_update_iotlb(struct dmar_domain *domain);
544
545 /* Return the super pagesize bitmap if supported. */
546 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
547 {
548         unsigned long bitmap = 0;
549
550         /*
551          * 1-level super page supports page size of 2MiB, 2-level super page
552          * supports page size of both 2MiB and 1GiB.
553          */
554         if (domain->iommu_superpage == 1)
555                 bitmap |= SZ_2M;
556         else if (domain->iommu_superpage == 2)
557                 bitmap |= SZ_2M | SZ_1G;
558
559         return bitmap;
560 }
561
562 /* Some capabilities may be different across iommus */
563 void domain_update_iommu_cap(struct dmar_domain *domain)
564 {
565         domain_update_iommu_coherency(domain);
566         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
567
568         /*
569          * If RHSA is missing, we should default to the device numa domain
570          * as fall back.
571          */
572         if (domain->nid == NUMA_NO_NODE)
573                 domain->nid = domain_update_device_node(domain);
574
575         /*
576          * First-level translation restricts the input-address to a
577          * canonical address (i.e., address bits 63:N have the same
578          * value as address bit [N-1], where N is 48-bits with 4-level
579          * paging and 57-bits with 5-level paging). Hence, skip bit
580          * [N-1].
581          */
582         if (domain->use_first_level)
583                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
584         else
585                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
586
587         domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
588         domain_update_iotlb(domain);
589 }
590
591 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
592                                          u8 devfn, int alloc)
593 {
594         struct root_entry *root = &iommu->root_entry[bus];
595         struct context_entry *context;
596         u64 *entry;
597
598         /*
599          * Except that the caller requested to allocate a new entry,
600          * returning a copied context entry makes no sense.
601          */
602         if (!alloc && context_copied(iommu, bus, devfn))
603                 return NULL;
604
605         entry = &root->lo;
606         if (sm_supported(iommu)) {
607                 if (devfn >= 0x80) {
608                         devfn -= 0x80;
609                         entry = &root->hi;
610                 }
611                 devfn *= 2;
612         }
613         if (*entry & 1)
614                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
615         else {
616                 unsigned long phy_addr;
617                 if (!alloc)
618                         return NULL;
619
620                 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
621                 if (!context)
622                         return NULL;
623
624                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
625                 phy_addr = virt_to_phys((void *)context);
626                 *entry = phy_addr | 1;
627                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
628         }
629         return &context[devfn];
630 }
631
632 /**
633  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
634  *                               sub-hierarchy of a candidate PCI-PCI bridge
635  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
636  * @bridge: the candidate PCI-PCI bridge
637  *
638  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
639  */
640 static bool
641 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
642 {
643         struct pci_dev *pdev, *pbridge;
644
645         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
646                 return false;
647
648         pdev = to_pci_dev(dev);
649         pbridge = to_pci_dev(bridge);
650
651         if (pbridge->subordinate &&
652             pbridge->subordinate->number <= pdev->bus->number &&
653             pbridge->subordinate->busn_res.end >= pdev->bus->number)
654                 return true;
655
656         return false;
657 }
658
659 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
660 {
661         struct dmar_drhd_unit *drhd;
662         u32 vtbar;
663         int rc;
664
665         /* We know that this device on this chipset has its own IOMMU.
666          * If we find it under a different IOMMU, then the BIOS is lying
667          * to us. Hope that the IOMMU for this device is actually
668          * disabled, and it needs no translation...
669          */
670         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
671         if (rc) {
672                 /* "can't" happen */
673                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
674                 return false;
675         }
676         vtbar &= 0xffff0000;
677
678         /* we know that the this iommu should be at offset 0xa000 from vtbar */
679         drhd = dmar_find_matched_drhd_unit(pdev);
680         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
681                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
682                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
683                 return true;
684         }
685
686         return false;
687 }
688
689 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
690 {
691         if (!iommu || iommu->drhd->ignored)
692                 return true;
693
694         if (dev_is_pci(dev)) {
695                 struct pci_dev *pdev = to_pci_dev(dev);
696
697                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
698                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
699                     quirk_ioat_snb_local_iommu(pdev))
700                         return true;
701         }
702
703         return false;
704 }
705
/*
 * Find the IOMMU that covers @dev by walking the device scope of every
 * DRHD unit.
 *
 * @dev:   device to look up; NULL yields NULL.
 * @bus:   optional out-parameter for the bus number to use for @dev.
 * @devfn: optional out-parameter for the devfn to use for @dev.
 *
 * @bus and @devfn are written only when both are non-NULL. They come from
 * the scope table on a direct scope match, and from the PCI device itself
 * for VFs, devices behind a scoped bridge, and include_all units.
 *
 * Returns the matching IOMMU, or NULL when none matches or when
 * iommu_is_dummy() rejects the match. The walk runs under rcu_read_lock().
 */
struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
        struct dmar_drhd_unit *drhd = NULL;
        struct pci_dev *pdev = NULL;
        struct intel_iommu *iommu;
        struct device *tmp;
        u16 segment = 0;
        int i;

        if (!dev)
                return NULL;

        if (dev_is_pci(dev)) {
                struct pci_dev *pf_pdev;

                pdev = pci_real_dma_dev(to_pci_dev(dev));

                /* VFs aren't listed in scope tables; we need to look up
                 * the PF instead to find the IOMMU. */
                pf_pdev = pci_physfn(pdev);
                dev = &pf_pdev->dev;
                segment = pci_domain_nr(pdev->bus);
        } else if (has_acpi_companion(dev))
                dev = &ACPI_COMPANION(dev)->dev;

        rcu_read_lock();
        for_each_iommu(iommu, drhd) {
                /* A PCI device can only match a unit in its own segment. */
                if (pdev && segment != drhd->segment)
                        continue;

                for_each_active_dev_scope(drhd->devices,
                                          drhd->devices_cnt, i, tmp) {
                        if (tmp == dev) {
                                /* For a VF use its original BDF# not that of the PF
                                 * which we used for the IOMMU lookup. Strictly speaking
                                 * we could do this for all PCI devices; we only need to
                                 * get the BDF# from the scope table for ACPI matches. */
                                if (pdev && pdev->is_virtfn)
                                        goto got_pdev;

                                if (bus && devfn) {
                                        *bus = drhd->devices[i].bus;
                                        *devfn = drhd->devices[i].devfn;
                                }
                                goto out;
                        }

                        /* A device below a scoped bridge uses its own BDF#. */
                        if (is_downstream_to_pci_bridge(dev, tmp))
                                goto got_pdev;
                }

                /*
                 * The got_pdev label sits inside this block so that the
                 * jumps above share the "report the PCI device's own
                 * BDF#" path with include_all units.
                 */
                if (pdev && drhd->include_all) {
got_pdev:
                        if (bus && devfn) {
                                *bus = pdev->bus->number;
                                *devfn = pdev->devfn;
                        }
                        goto out;
                }
        }
        /* Loop ran to completion without a match. */
        iommu = NULL;
out:
        if (iommu_is_dummy(iommu, dev))
                iommu = NULL;

        rcu_read_unlock();

        return iommu;
}
775
776 static void domain_flush_cache(struct dmar_domain *domain,
777                                void *addr, int size)
778 {
779         if (!domain->iommu_coherency)
780                 clflush_cache_range(addr, size);
781 }
782
783 static void free_context_table(struct intel_iommu *iommu)
784 {
785         struct context_entry *context;
786         int i;
787
788         if (!iommu->root_entry)
789                 return;
790
791         for (i = 0; i < ROOT_ENTRY_NR; i++) {
792                 context = iommu_context_addr(iommu, i, 0, 0);
793                 if (context)
794                         free_pgtable_page(context);
795
796                 if (!sm_supported(iommu))
797                         continue;
798
799                 context = iommu_context_addr(iommu, i, 0x80, 0);
800                 if (context)
801                         free_pgtable_page(context);
802         }
803
804         free_pgtable_page(iommu->root_entry);
805         iommu->root_entry = NULL;
806 }
807
808 #ifdef CONFIG_DMAR_DEBUG
809 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
810                          u8 bus, u8 devfn, struct dma_pte *parent, int level)
811 {
812         struct dma_pte *pte;
813         int offset;
814
815         while (1) {
816                 offset = pfn_level_offset(pfn, level);
817                 pte = &parent[offset];
818                 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
819                         pr_info("PTE not present at level %d\n", level);
820                         break;
821                 }
822
823                 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
824
825                 if (level == 1)
826                         break;
827
828                 parent = phys_to_virt(dma_pte_addr(pte));
829                 level--;
830         }
831 }
832
833 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
834                           unsigned long long addr, u32 pasid)
835 {
836         struct pasid_dir_entry *dir, *pde;
837         struct pasid_entry *entries, *pte;
838         struct context_entry *ctx_entry;
839         struct root_entry *rt_entry;
840         int i, dir_index, index, level;
841         u8 devfn = source_id & 0xff;
842         u8 bus = source_id >> 8;
843         struct dma_pte *pgtable;
844
845         pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
846
847         /* root entry dump */
848         rt_entry = &iommu->root_entry[bus];
849         if (!rt_entry) {
850                 pr_info("root table entry is not present\n");
851                 return;
852         }
853
854         if (sm_supported(iommu))
855                 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
856                         rt_entry->hi, rt_entry->lo);
857         else
858                 pr_info("root entry: 0x%016llx", rt_entry->lo);
859
860         /* context entry dump */
861         ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
862         if (!ctx_entry) {
863                 pr_info("context table entry is not present\n");
864                 return;
865         }
866
867         pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
868                 ctx_entry->hi, ctx_entry->lo);
869
870         /* legacy mode does not require PASID entries */
871         if (!sm_supported(iommu)) {
872                 level = agaw_to_level(ctx_entry->hi & 7);
873                 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
874                 goto pgtable_walk;
875         }
876
877         /* get the pointer to pasid directory entry */
878         dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
879         if (!dir) {
880                 pr_info("pasid directory entry is not present\n");
881                 return;
882         }
883         /* For request-without-pasid, get the pasid from context entry */
884         if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
885                 pasid = IOMMU_NO_PASID;
886
887         dir_index = pasid >> PASID_PDE_SHIFT;
888         pde = &dir[dir_index];
889         pr_info("pasid dir entry: 0x%016llx\n", pde->val);
890
891         /* get the pointer to the pasid table entry */
892         entries = get_pasid_table_from_pde(pde);
893         if (!entries) {
894                 pr_info("pasid table entry is not present\n");
895                 return;
896         }
897         index = pasid & PASID_PTE_MASK;
898         pte = &entries[index];
899         for (i = 0; i < ARRAY_SIZE(pte->val); i++)
900                 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
901
902         if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
903                 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
904                 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
905         } else {
906                 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
907                 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
908         }
909
910 pgtable_walk:
911         pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
912 }
913 #endif
914
915 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
916                                       unsigned long pfn, int *target_level,
917                                       gfp_t gfp)
918 {
919         struct dma_pte *parent, *pte;
920         int level = agaw_to_level(domain->agaw);
921         int offset;
922
923         if (!domain_pfn_supported(domain, pfn))
924                 /* Address beyond IOMMU's addressing capabilities. */
925                 return NULL;
926
927         parent = domain->pgd;
928
929         while (1) {
930                 void *tmp_page;
931
932                 offset = pfn_level_offset(pfn, level);
933                 pte = &parent[offset];
934                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
935                         break;
936                 if (level == *target_level)
937                         break;
938
939                 if (!dma_pte_present(pte)) {
940                         uint64_t pteval;
941
942                         tmp_page = alloc_pgtable_page(domain->nid, gfp);
943
944                         if (!tmp_page)
945                                 return NULL;
946
947                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
948                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
949                         if (domain->use_first_level)
950                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
951
952                         if (cmpxchg64(&pte->val, 0ULL, pteval))
953                                 /* Someone else set it while we were thinking; use theirs. */
954                                 free_pgtable_page(tmp_page);
955                         else
956                                 domain_flush_cache(domain, pte, sizeof(*pte));
957                 }
958                 if (level == 1)
959                         break;
960
961                 parent = phys_to_virt(dma_pte_addr(pte));
962                 level--;
963         }
964
965         if (!*target_level)
966                 *target_level = level;
967
968         return pte;
969 }
970
971 /* return address's pte at specific level */
972 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
973                                          unsigned long pfn,
974                                          int level, int *large_page)
975 {
976         struct dma_pte *parent, *pte;
977         int total = agaw_to_level(domain->agaw);
978         int offset;
979
980         parent = domain->pgd;
981         while (level <= total) {
982                 offset = pfn_level_offset(pfn, total);
983                 pte = &parent[offset];
984                 if (level == total)
985                         return pte;
986
987                 if (!dma_pte_present(pte)) {
988                         *large_page = total;
989                         break;
990                 }
991
992                 if (dma_pte_superpage(pte)) {
993                         *large_page = total;
994                         return pte;
995                 }
996
997                 parent = phys_to_virt(dma_pte_addr(pte));
998                 total--;
999         }
1000         return NULL;
1001 }
1002
1003 /* clear last level pte, a tlb flush should be followed */
1004 static void dma_pte_clear_range(struct dmar_domain *domain,
1005                                 unsigned long start_pfn,
1006                                 unsigned long last_pfn)
1007 {
1008         unsigned int large_page;
1009         struct dma_pte *first_pte, *pte;
1010
1011         if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1012             WARN_ON(start_pfn > last_pfn))
1013                 return;
1014
1015         /* we don't need lock here; nobody else touches the iova range */
1016         do {
1017                 large_page = 1;
1018                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1019                 if (!pte) {
1020                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1021                         continue;
1022                 }
1023                 do {
1024                         dma_clear_pte(pte);
1025                         start_pfn += lvl_to_nr_pages(large_page);
1026                         pte++;
1027                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1028
1029                 domain_flush_cache(domain, first_pte,
1030                                    (void *)pte - (void *)first_pte);
1031
1032         } while (start_pfn && start_pfn <= last_pfn);
1033 }
1034
1035 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1036                                int retain_level, struct dma_pte *pte,
1037                                unsigned long pfn, unsigned long start_pfn,
1038                                unsigned long last_pfn)
1039 {
1040         pfn = max(start_pfn, pfn);
1041         pte = &pte[pfn_level_offset(pfn, level)];
1042
1043         do {
1044                 unsigned long level_pfn;
1045                 struct dma_pte *level_pte;
1046
1047                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1048                         goto next;
1049
1050                 level_pfn = pfn & level_mask(level);
1051                 level_pte = phys_to_virt(dma_pte_addr(pte));
1052
1053                 if (level > 2) {
1054                         dma_pte_free_level(domain, level - 1, retain_level,
1055                                            level_pte, level_pfn, start_pfn,
1056                                            last_pfn);
1057                 }
1058
1059                 /*
1060                  * Free the page table if we're below the level we want to
1061                  * retain and the range covers the entire table.
1062                  */
1063                 if (level < retain_level && !(start_pfn > level_pfn ||
1064                       last_pfn < level_pfn + level_size(level) - 1)) {
1065                         dma_clear_pte(pte);
1066                         domain_flush_cache(domain, pte, sizeof(*pte));
1067                         free_pgtable_page(level_pte);
1068                 }
1069 next:
1070                 pfn += level_size(level);
1071         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1072 }
1073
1074 /*
1075  * clear last level (leaf) ptes and free page table pages below the
1076  * level we wish to keep intact.
1077  */
1078 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1079                                    unsigned long start_pfn,
1080                                    unsigned long last_pfn,
1081                                    int retain_level)
1082 {
1083         dma_pte_clear_range(domain, start_pfn, last_pfn);
1084
1085         /* We don't need lock here; nobody else touches the iova range */
1086         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1087                            domain->pgd, 0, start_pfn, last_pfn);
1088
1089         /* free pgd */
1090         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1091                 free_pgtable_page(domain->pgd);
1092                 domain->pgd = NULL;
1093         }
1094 }
1095
1096 /* When a page at a given level is being unlinked from its parent, we don't
1097    need to *modify* it at all. All we need to do is make a list of all the
1098    pages which can be freed just as soon as we've flushed the IOTLB and we
1099    know the hardware page-walk will no longer touch them.
1100    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1101    be freed. */
1102 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1103                                     int level, struct dma_pte *pte,
1104                                     struct list_head *freelist)
1105 {
1106         struct page *pg;
1107
1108         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1109         list_add_tail(&pg->lru, freelist);
1110
1111         if (level == 1)
1112                 return;
1113
1114         pte = page_address(pg);
1115         do {
1116                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1117                         dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1118                 pte++;
1119         } while (!first_pte_in_page(pte));
1120 }
1121
1122 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1123                                 struct dma_pte *pte, unsigned long pfn,
1124                                 unsigned long start_pfn, unsigned long last_pfn,
1125                                 struct list_head *freelist)
1126 {
1127         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1128
1129         pfn = max(start_pfn, pfn);
1130         pte = &pte[pfn_level_offset(pfn, level)];
1131
1132         do {
1133                 unsigned long level_pfn = pfn & level_mask(level);
1134
1135                 if (!dma_pte_present(pte))
1136                         goto next;
1137
1138                 /* If range covers entire pagetable, free it */
1139                 if (start_pfn <= level_pfn &&
1140                     last_pfn >= level_pfn + level_size(level) - 1) {
1141                         /* These suborbinate page tables are going away entirely. Don't
1142                            bother to clear them; we're just going to *free* them. */
1143                         if (level > 1 && !dma_pte_superpage(pte))
1144                                 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1145
1146                         dma_clear_pte(pte);
1147                         if (!first_pte)
1148                                 first_pte = pte;
1149                         last_pte = pte;
1150                 } else if (level > 1) {
1151                         /* Recurse down into a level that isn't *entirely* obsolete */
1152                         dma_pte_clear_level(domain, level - 1,
1153                                             phys_to_virt(dma_pte_addr(pte)),
1154                                             level_pfn, start_pfn, last_pfn,
1155                                             freelist);
1156                 }
1157 next:
1158                 pfn = level_pfn + level_size(level);
1159         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1160
1161         if (first_pte)
1162                 domain_flush_cache(domain, first_pte,
1163                                    (void *)++last_pte - (void *)first_pte);
1164 }
1165
1166 /* We can't just free the pages because the IOMMU may still be walking
1167    the page tables, and may have cached the intermediate levels. The
1168    pages can only be freed after the IOTLB flush has been done. */
1169 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1170                          unsigned long last_pfn, struct list_head *freelist)
1171 {
1172         if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1173             WARN_ON(start_pfn > last_pfn))
1174                 return;
1175
1176         /* we don't need lock here; nobody else touches the iova range */
1177         dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1178                             domain->pgd, 0, start_pfn, last_pfn, freelist);
1179
1180         /* free pgd */
1181         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1182                 struct page *pgd_page = virt_to_page(domain->pgd);
1183                 list_add_tail(&pgd_page->lru, freelist);
1184                 domain->pgd = NULL;
1185         }
1186 }
1187
1188 /* iommu handling */
1189 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1190 {
1191         struct root_entry *root;
1192
1193         root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1194         if (!root) {
1195                 pr_err("Allocating root entry for %s failed\n",
1196                         iommu->name);
1197                 return -ENOMEM;
1198         }
1199
1200         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1201         iommu->root_entry = root;
1202
1203         return 0;
1204 }
1205
1206 static void iommu_set_root_entry(struct intel_iommu *iommu)
1207 {
1208         u64 addr;
1209         u32 sts;
1210         unsigned long flag;
1211
1212         addr = virt_to_phys(iommu->root_entry);
1213         if (sm_supported(iommu))
1214                 addr |= DMA_RTADDR_SMT;
1215
1216         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1217         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1218
1219         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1220
1221         /* Make sure hardware complete it */
1222         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1223                       readl, (sts & DMA_GSTS_RTPS), sts);
1224
1225         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1226
1227         /*
1228          * Hardware invalidates all DMA remapping hardware translation
1229          * caches as part of SRTP flow.
1230          */
1231         if (cap_esrtps(iommu->cap))
1232                 return;
1233
1234         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1235         if (sm_supported(iommu))
1236                 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1237         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1238 }
1239
1240 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1241 {
1242         u32 val;
1243         unsigned long flag;
1244
1245         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1246                 return;
1247
1248         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1249         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1250
1251         /* Make sure hardware complete it */
1252         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1253                       readl, (!(val & DMA_GSTS_WBFS)), val);
1254
1255         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1256 }
1257
1258 /* return value determine if we need a write buffer flush */
1259 static void __iommu_flush_context(struct intel_iommu *iommu,
1260                                   u16 did, u16 source_id, u8 function_mask,
1261                                   u64 type)
1262 {
1263         u64 val = 0;
1264         unsigned long flag;
1265
1266         switch (type) {
1267         case DMA_CCMD_GLOBAL_INVL:
1268                 val = DMA_CCMD_GLOBAL_INVL;
1269                 break;
1270         case DMA_CCMD_DOMAIN_INVL:
1271                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1272                 break;
1273         case DMA_CCMD_DEVICE_INVL:
1274                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1275                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1276                 break;
1277         default:
1278                 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1279                         iommu->name, type);
1280                 return;
1281         }
1282         val |= DMA_CCMD_ICC;
1283
1284         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1285         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1286
1287         /* Make sure hardware complete it */
1288         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1289                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1290
1291         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1292 }
1293
1294 /* return value determine if we need a write buffer flush */
1295 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1296                                 u64 addr, unsigned int size_order, u64 type)
1297 {
1298         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1299         u64 val = 0, val_iva = 0;
1300         unsigned long flag;
1301
1302         switch (type) {
1303         case DMA_TLB_GLOBAL_FLUSH:
1304                 /* global flush doesn't need set IVA_REG */
1305                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1306                 break;
1307         case DMA_TLB_DSI_FLUSH:
1308                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1309                 break;
1310         case DMA_TLB_PSI_FLUSH:
1311                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1312                 /* IH bit is passed in as part of address */
1313                 val_iva = size_order | addr;
1314                 break;
1315         default:
1316                 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1317                         iommu->name, type);
1318                 return;
1319         }
1320
1321         if (cap_write_drain(iommu->cap))
1322                 val |= DMA_TLB_WRITE_DRAIN;
1323
1324         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1325         /* Note: Only uses first TLB reg currently */
1326         if (val_iva)
1327                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1328         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1329
1330         /* Make sure hardware complete it */
1331         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1332                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1333
1334         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1335
1336         /* check IOTLB invalidation granularity */
1337         if (DMA_TLB_IAIG(val) == 0)
1338                 pr_err("Flush IOTLB failed\n");
1339         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1340                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1341                         (unsigned long long)DMA_TLB_IIRG(type),
1342                         (unsigned long long)DMA_TLB_IAIG(val));
1343 }
1344
1345 static struct device_domain_info *
1346 domain_lookup_dev_info(struct dmar_domain *domain,
1347                        struct intel_iommu *iommu, u8 bus, u8 devfn)
1348 {
1349         struct device_domain_info *info;
1350         unsigned long flags;
1351
1352         spin_lock_irqsave(&domain->lock, flags);
1353         list_for_each_entry(info, &domain->devices, link) {
1354                 if (info->iommu == iommu && info->bus == bus &&
1355                     info->devfn == devfn) {
1356                         spin_unlock_irqrestore(&domain->lock, flags);
1357                         return info;
1358                 }
1359         }
1360         spin_unlock_irqrestore(&domain->lock, flags);
1361
1362         return NULL;
1363 }
1364
1365 static void domain_update_iotlb(struct dmar_domain *domain)
1366 {
1367         struct dev_pasid_info *dev_pasid;
1368         struct device_domain_info *info;
1369         bool has_iotlb_device = false;
1370         unsigned long flags;
1371
1372         spin_lock_irqsave(&domain->lock, flags);
1373         list_for_each_entry(info, &domain->devices, link) {
1374                 if (info->ats_enabled) {
1375                         has_iotlb_device = true;
1376                         break;
1377                 }
1378         }
1379
1380         list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1381                 info = dev_iommu_priv_get(dev_pasid->dev);
1382                 if (info->ats_enabled) {
1383                         has_iotlb_device = true;
1384                         break;
1385                 }
1386         }
1387         domain->has_iotlb_device = has_iotlb_device;
1388         spin_unlock_irqrestore(&domain->lock, flags);
1389 }
1390
1391 /*
1392  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1393  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1394  * check because it applies only to the built-in QAT devices and it doesn't
1395  * grant additional privileges.
1396  */
1397 #define BUGGY_QAT_DEVID_MASK 0x4940
1398 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1399 {
1400         if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1401                 return false;
1402
1403         if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1404                 return false;
1405
1406         return true;
1407 }
1408
1409 static void iommu_enable_pci_caps(struct device_domain_info *info)
1410 {
1411         struct pci_dev *pdev;
1412
1413         if (!dev_is_pci(info->dev))
1414                 return;
1415
1416         pdev = to_pci_dev(info->dev);
1417
1418         /* The PCIe spec, in its wisdom, declares that the behaviour of
1419            the device if you enable PASID support after ATS support is
1420            undefined. So always enable PASID support on devices which
1421            have it, even if we can't yet know if we're ever going to
1422            use it. */
1423         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1424                 info->pasid_enabled = 1;
1425
1426         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1427             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1428                 info->ats_enabled = 1;
1429                 domain_update_iotlb(info->domain);
1430         }
1431 }
1432
1433 static void iommu_disable_pci_caps(struct device_domain_info *info)
1434 {
1435         struct pci_dev *pdev;
1436
1437         if (!dev_is_pci(info->dev))
1438                 return;
1439
1440         pdev = to_pci_dev(info->dev);
1441
1442         if (info->ats_enabled) {
1443                 pci_disable_ats(pdev);
1444                 info->ats_enabled = 0;
1445                 domain_update_iotlb(info->domain);
1446         }
1447
1448         if (info->pasid_enabled) {
1449                 pci_disable_pasid(pdev);
1450                 info->pasid_enabled = 0;
1451         }
1452 }
1453
1454 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1455                                     u64 addr, unsigned int mask)
1456 {
1457         u16 sid, qdep;
1458
1459         if (!info || !info->ats_enabled)
1460                 return;
1461
1462         sid = info->bus << 8 | info->devfn;
1463         qdep = info->ats_qdep;
1464         qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1465                            qdep, addr, mask);
1466         quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1467 }
1468
1469 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1470                                   u64 addr, unsigned mask)
1471 {
1472         struct dev_pasid_info *dev_pasid;
1473         struct device_domain_info *info;
1474         unsigned long flags;
1475
1476         if (!domain->has_iotlb_device)
1477                 return;
1478
1479         spin_lock_irqsave(&domain->lock, flags);
1480         list_for_each_entry(info, &domain->devices, link)
1481                 __iommu_flush_dev_iotlb(info, addr, mask);
1482
1483         list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1484                 info = dev_iommu_priv_get(dev_pasid->dev);
1485
1486                 if (!info->ats_enabled)
1487                         continue;
1488
1489                 qi_flush_dev_iotlb_pasid(info->iommu,
1490                                          PCI_DEVID(info->bus, info->devfn),
1491                                          info->pfsid, dev_pasid->pasid,
1492                                          info->ats_qdep, addr,
1493                                          mask);
1494         }
1495         spin_unlock_irqrestore(&domain->lock, flags);
1496 }
1497
1498 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1499                                      struct dmar_domain *domain, u64 addr,
1500                                      unsigned long npages, bool ih)
1501 {
1502         u16 did = domain_id_iommu(domain, iommu);
1503         struct dev_pasid_info *dev_pasid;
1504         unsigned long flags;
1505
1506         spin_lock_irqsave(&domain->lock, flags);
1507         list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1508                 qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1509
1510         if (!list_empty(&domain->devices))
1511                 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1512         spin_unlock_irqrestore(&domain->lock, flags);
1513 }
1514
1515 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1516                                   struct dmar_domain *domain,
1517                                   unsigned long pfn, unsigned int pages,
1518                                   int ih, int map)
1519 {
1520         unsigned int aligned_pages = __roundup_pow_of_two(pages);
1521         unsigned int mask = ilog2(aligned_pages);
1522         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1523         u16 did = domain_id_iommu(domain, iommu);
1524
1525         if (WARN_ON(!pages))
1526                 return;
1527
1528         if (ih)
1529                 ih = 1 << 6;
1530
1531         if (domain->use_first_level) {
1532                 domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1533         } else {
1534                 unsigned long bitmask = aligned_pages - 1;
1535
1536                 /*
1537                  * PSI masks the low order bits of the base address. If the
1538                  * address isn't aligned to the mask, then compute a mask value
1539                  * needed to ensure the target range is flushed.
1540                  */
1541                 if (unlikely(bitmask & pfn)) {
1542                         unsigned long end_pfn = pfn + pages - 1, shared_bits;
1543
1544                         /*
1545                          * Since end_pfn <= pfn + bitmask, the only way bits
1546                          * higher than bitmask can differ in pfn and end_pfn is
1547                          * by carrying. This means after masking out bitmask,
1548                          * high bits starting with the first set bit in
1549                          * shared_bits are all equal in both pfn and end_pfn.
1550                          */
1551                         shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1552                         mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1553                 }
1554
1555                 /*
1556                  * Fallback to domain selective flush if no PSI support or
1557                  * the size is too big.
1558                  */
1559                 if (!cap_pgsel_inv(iommu->cap) ||
1560                     mask > cap_max_amask_val(iommu->cap))
1561                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1562                                                         DMA_TLB_DSI_FLUSH);
1563                 else
1564                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1565                                                         DMA_TLB_PSI_FLUSH);
1566         }
1567
1568         /*
1569          * In caching mode, changes of pages from non-present to present require
1570          * flush. However, device IOTLB doesn't need to be flushed in this case.
1571          */
1572         if (!cap_caching_mode(iommu->cap) || !map)
1573                 iommu_flush_dev_iotlb(domain, addr, mask);
1574 }
1575
1576 /* Notification for newly created mappings */
1577 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1578                                         struct dmar_domain *domain,
1579                                         unsigned long pfn, unsigned int pages)
1580 {
1581         /*
1582          * It's a non-present to present mapping. Only flush if caching mode
1583          * and second level.
1584          */
1585         if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1586                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1587         else
1588                 iommu_flush_write_buffer(iommu);
1589 }
1590
1591 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1592 {
1593         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1594         struct iommu_domain_info *info;
1595         unsigned long idx;
1596
1597         xa_for_each(&dmar_domain->iommu_array, idx, info) {
1598                 struct intel_iommu *iommu = info->iommu;
1599                 u16 did = domain_id_iommu(dmar_domain, iommu);
1600
1601                 if (dmar_domain->use_first_level)
1602                         domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1603                 else
1604                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1605                                                  DMA_TLB_DSI_FLUSH);
1606
1607                 if (!cap_caching_mode(iommu->cap))
1608                         iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1609         }
1610 }
1611
1612 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1613 {
1614         u32 pmen;
1615         unsigned long flags;
1616
1617         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1618                 return;
1619
1620         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1621         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1622         pmen &= ~DMA_PMEN_EPM;
1623         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1624
1625         /* wait for the protected region status bit to clear */
1626         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1627                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1628
1629         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630 }
1631
1632 static void iommu_enable_translation(struct intel_iommu *iommu)
1633 {
1634         u32 sts;
1635         unsigned long flags;
1636
1637         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1638         iommu->gcmd |= DMA_GCMD_TE;
1639         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1640
1641         /* Make sure hardware complete it */
1642         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1643                       readl, (sts & DMA_GSTS_TES), sts);
1644
1645         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1646 }
1647
1648 static void iommu_disable_translation(struct intel_iommu *iommu)
1649 {
1650         u32 sts;
1651         unsigned long flag;
1652
1653         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1654             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1655                 return;
1656
1657         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1658         iommu->gcmd &= ~DMA_GCMD_TE;
1659         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1660
1661         /* Make sure hardware complete it */
1662         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1663                       readl, (!(sts & DMA_GSTS_TES)), sts);
1664
1665         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1666 }
1667
1668 static int iommu_init_domains(struct intel_iommu *iommu)
1669 {
1670         u32 ndomains;
1671
1672         ndomains = cap_ndoms(iommu->cap);
1673         pr_debug("%s: Number of Domains supported <%d>\n",
1674                  iommu->name, ndomains);
1675
1676         spin_lock_init(&iommu->lock);
1677
1678         iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1679         if (!iommu->domain_ids)
1680                 return -ENOMEM;
1681
1682         /*
1683          * If Caching mode is set, then invalid translations are tagged
1684          * with domain-id 0, hence we need to pre-allocate it. We also
1685          * use domain-id 0 as a marker for non-allocated domain-id, so
1686          * make sure it is not used for a real domain.
1687          */
1688         set_bit(0, iommu->domain_ids);
1689
1690         /*
1691          * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1692          * entry for first-level or pass-through translation modes should
1693          * be programmed with a domain id different from those used for
1694          * second-level or nested translation. We reserve a domain id for
1695          * this purpose.
1696          */
1697         if (sm_supported(iommu))
1698                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1699
1700         return 0;
1701 }
1702
1703 static void disable_dmar_iommu(struct intel_iommu *iommu)
1704 {
1705         if (!iommu->domain_ids)
1706                 return;
1707
1708         /*
1709          * All iommu domains must have been detached from the devices,
1710          * hence there should be no domain IDs in use.
1711          */
1712         if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1713                     > NUM_RESERVED_DID))
1714                 return;
1715
1716         if (iommu->gcmd & DMA_GCMD_TE)
1717                 iommu_disable_translation(iommu);
1718 }
1719
1720 static void free_dmar_iommu(struct intel_iommu *iommu)
1721 {
1722         if (iommu->domain_ids) {
1723                 bitmap_free(iommu->domain_ids);
1724                 iommu->domain_ids = NULL;
1725         }
1726
1727         if (iommu->copied_tables) {
1728                 bitmap_free(iommu->copied_tables);
1729                 iommu->copied_tables = NULL;
1730         }
1731
1732         /* free context mapping */
1733         free_context_table(iommu);
1734
1735 #ifdef CONFIG_INTEL_IOMMU_SVM
1736         if (pasid_supported(iommu)) {
1737                 if (ecap_prs(iommu->ecap))
1738                         intel_svm_finish_prq(iommu);
1739         }
1740 #endif
1741 }
1742
1743 /*
1744  * Check and return whether first level is used by default for
1745  * DMA translation.
1746  */
1747 static bool first_level_by_default(unsigned int type)
1748 {
1749         /* Only SL is available in legacy mode */
1750         if (!scalable_mode_support())
1751                 return false;
1752
1753         /* Only level (either FL or SL) is available, just use it */
1754         if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1755                 return intel_cap_flts_sanity();
1756
1757         /* Both levels are available, decide it based on domain type */
1758         return type != IOMMU_DOMAIN_UNMANAGED;
1759 }
1760
1761 static struct dmar_domain *alloc_domain(unsigned int type)
1762 {
1763         struct dmar_domain *domain;
1764
1765         domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1766         if (!domain)
1767                 return NULL;
1768
1769         domain->nid = NUMA_NO_NODE;
1770         if (first_level_by_default(type))
1771                 domain->use_first_level = true;
1772         domain->has_iotlb_device = false;
1773         INIT_LIST_HEAD(&domain->devices);
1774         INIT_LIST_HEAD(&domain->dev_pasids);
1775         spin_lock_init(&domain->lock);
1776         xa_init(&domain->iommu_array);
1777
1778         return domain;
1779 }
1780
1781 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1782 {
1783         struct iommu_domain_info *info, *curr;
1784         unsigned long ndomains;
1785         int num, ret = -ENOSPC;
1786
1787         info = kzalloc(sizeof(*info), GFP_KERNEL);
1788         if (!info)
1789                 return -ENOMEM;
1790
1791         spin_lock(&iommu->lock);
1792         curr = xa_load(&domain->iommu_array, iommu->seq_id);
1793         if (curr) {
1794                 curr->refcnt++;
1795                 spin_unlock(&iommu->lock);
1796                 kfree(info);
1797                 return 0;
1798         }
1799
1800         ndomains = cap_ndoms(iommu->cap);
1801         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1802         if (num >= ndomains) {
1803                 pr_err("%s: No free domain ids\n", iommu->name);
1804                 goto err_unlock;
1805         }
1806
1807         set_bit(num, iommu->domain_ids);
1808         info->refcnt    = 1;
1809         info->did       = num;
1810         info->iommu     = iommu;
1811         curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1812                           NULL, info, GFP_ATOMIC);
1813         if (curr) {
1814                 ret = xa_err(curr) ? : -EBUSY;
1815                 goto err_clear;
1816         }
1817         domain_update_iommu_cap(domain);
1818
1819         spin_unlock(&iommu->lock);
1820         return 0;
1821
1822 err_clear:
1823         clear_bit(info->did, iommu->domain_ids);
1824 err_unlock:
1825         spin_unlock(&iommu->lock);
1826         kfree(info);
1827         return ret;
1828 }
1829
1830 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1831 {
1832         struct iommu_domain_info *info;
1833
1834         spin_lock(&iommu->lock);
1835         info = xa_load(&domain->iommu_array, iommu->seq_id);
1836         if (--info->refcnt == 0) {
1837                 clear_bit(info->did, iommu->domain_ids);
1838                 xa_erase(&domain->iommu_array, iommu->seq_id);
1839                 domain->nid = NUMA_NO_NODE;
1840                 domain_update_iommu_cap(domain);
1841                 kfree(info);
1842         }
1843         spin_unlock(&iommu->lock);
1844 }
1845
/*
 * Round a guest address width up to the next value of the form
 * 12 + 9 * k, capping the result at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
        int rem = (gaw - 12) % 9;
        int width = rem ? gaw + 9 - rem : gaw;

        return width > 64 ? 64 : width;
}
1859
1860 static void domain_exit(struct dmar_domain *domain)
1861 {
1862         if (domain->pgd) {
1863                 LIST_HEAD(freelist);
1864
1865                 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1866                 put_pages_list(&freelist);
1867         }
1868
1869         if (WARN_ON(!list_empty(&domain->devices)))
1870                 return;
1871
1872         kfree(domain);
1873 }
1874
1875 /*
1876  * Get the PASID directory size for scalable mode context entry.
1877  * Value of X in the PDTS field of a scalable mode context entry
1878  * indicates PASID directory with 2^(X + 7) entries.
1879  */
1880 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1881 {
1882         unsigned long pds, max_pde;
1883
1884         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1885         pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1886         if (pds < 7)
1887                 return 0;
1888
1889         return pds - 7;
1890 }
1891
1892 /*
1893  * Set the RID_PASID field of a scalable mode context entry. The
1894  * IOMMU hardware will use the PASID value set in this field for
1895  * DMA translations of DMA requests without PASID.
1896  */
1897 static inline void
1898 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1899 {
1900         context->hi |= pasid & ((1 << 20) - 1);
1901 }
1902
1903 /*
1904  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1905  * entry.
1906  */
1907 static inline void context_set_sm_dte(struct context_entry *context)
1908 {
1909         context->lo |= BIT_ULL(2);
1910 }
1911
1912 /*
1913  * Set the PRE(Page Request Enable) field of a scalable mode context
1914  * entry.
1915  */
1916 static inline void context_set_sm_pre(struct context_entry *context)
1917 {
1918         context->lo |= BIT_ULL(4);
1919 }
1920
/*
 * Convert a PASID directory size value to its context-entry field
 * coding: the low three bits of @pds are kept and shifted left by nine.
 */
#define context_pdts(pds)       (((pds) & 0x7) << 9)
1923
/*
 * Program the context entry for one bus/devfn on @iommu so that it
 * points at @domain (legacy mode) or at the PASID table @table (scalable
 * mode), then make the new entry visible to the hardware.
 *
 * An entry that is already present and was not copied (see the kdump
 * note below) is left untouched and 0 is returned.
 *
 * Return: 0 on success, -ENOMEM if the context entry cannot be obtained
 * or a page-table level to be skipped is not present.
 */
static int domain_context_mapping_one(struct dmar_domain *domain,
                                      struct intel_iommu *iommu,
                                      struct pasid_table *table,
                                      u8 bus, u8 devfn)
{
        struct device_domain_info *info =
                        domain_lookup_dev_info(domain, iommu, bus, devfn);
        u16 did = domain_id_iommu(domain, iommu);
        int translation = CONTEXT_TT_MULTI_LEVEL;
        struct context_entry *context;
        int ret;

        /* Hardware pass-through is used for the static identity domain */
        if (hw_pass_through && domain_type_is_si(domain))
                translation = CONTEXT_TT_PASS_THROUGH;

        pr_debug("Set context mapping for %02x:%02x.%d\n",
                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

        spin_lock(&iommu->lock);
        ret = -ENOMEM;
        context = iommu_context_addr(iommu, bus, devfn, 1);
        if (!context)
                goto out_unlock;

        /* Nothing to do for a present entry that we set up ourselves */
        ret = 0;
        if (context_present(context) && !context_copied(iommu, bus, devfn))
                goto out_unlock;

        /*
         * For kdump cases, old valid entries may be cached due to the
         * in-flight DMA and copied pgtable, but there is no unmapping
         * behaviour for them, thus we need an explicit cache flush for
         * the newly-mapped device. For kdump, at this point, the device
         * is supposed to finish reset at its driver probe stage, so no
         * in-flight DMA will exist, and we don't need to worry anymore
         * hereafter.
         */
        if (context_copied(iommu, bus, devfn)) {
                u16 did_old = context_domain_id(context);

                /* Only flush for a domain id the hardware can hold */
                if (did_old < cap_ndoms(iommu->cap)) {
                        iommu->flush.flush_context(iommu, did_old,
                                                   (((u16)bus) << 8) | devfn,
                                                   DMA_CCMD_MASK_NOBIT,
                                                   DMA_CCMD_DEVICE_INVL);
                        iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
                                                 DMA_TLB_DSI_FLUSH);
                }

                clear_context_copied(iommu, bus, devfn);
        }

        /* Start from a clean entry before filling in the new contents */
        context_clear_entry(context);

        if (sm_supported(iommu)) {
                unsigned long pds;

                /* Setup the PASID DIR pointer: */
                pds = context_get_sm_pds(table);
                context->lo = (u64)virt_to_phys(table->table) |
                                context_pdts(pds);

                /* Setup the RID_PASID field: */
                context_set_sm_rid2pasid(context, IOMMU_NO_PASID);

                /*
                 * Setup the Device-TLB enable bit and Page request
                 * Enable bit:
                 */
                if (info && info->ats_supported)
                        context_set_sm_dte(context);
                if (info && info->pri_supported)
                        context_set_sm_pre(context);
                if (info && info->pasid_supported)
                        context_set_pasid(context);
        } else {
                struct dma_pte *pgd = domain->pgd;
                int agaw;

                context_set_domain_id(context, did);

                if (translation != CONTEXT_TT_PASS_THROUGH) {
                        /*
                         * Skip top levels of page tables for iommu which has
                         * less agaw than default. Unnecessary for PT mode.
                         */
                        for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
                                ret = -ENOMEM;
                                pgd = phys_to_virt(dma_pte_addr(pgd));
                                if (!dma_pte_present(pgd))
                                        goto out_unlock;
                        }

                        /* Use the Device-TLB translation type when ATS is supported */
                        if (info && info->ats_supported)
                                translation = CONTEXT_TT_DEV_IOTLB;
                        else
                                translation = CONTEXT_TT_MULTI_LEVEL;

                        context_set_address_root(context, virt_to_phys(pgd));
                        context_set_address_width(context, agaw);
                } else {
                        /*
                         * In pass through mode, AW must be programmed to
                         * indicate the largest AGAW value supported by
                         * hardware. And ASR is ignored by hardware.
                         */
                        context_set_address_width(context, iommu->msagaw);
                }

                context_set_translation_type(context, translation);
        }

        context_set_fault_enable(context);
        context_set_present(context);
        /* Flush the CPU cache for the entry when the IOMMU is not coherent */
        if (!ecap_coherent(iommu->ecap))
                clflush_cache_range(context, sizeof(*context));

        /*
         * It's a non-present to present mapping. If hardware doesn't cache
         * non-present entry we only need to flush the write-buffer. If it
         * _does_ cache non-present entries, then it does so in the special
         * domain #0, which we have to flush:
         */
        if (cap_caching_mode(iommu->cap)) {
                iommu->flush.flush_context(iommu, 0,
                                           (((u16)bus) << 8) | devfn,
                                           DMA_CCMD_MASK_NOBIT,
                                           DMA_CCMD_DEVICE_INVL);
                iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
        } else {
                iommu_flush_write_buffer(iommu);
        }

        /* The agaw-skipping loop above may have left -ENOMEM in ret */
        ret = 0;

out_unlock:
        spin_unlock(&iommu->lock);

        return ret;
}
2064
/*
 * Arguments bundled for domain_context_mapping_cb(), which receives them
 * through the opaque pointer of pci_for_each_dma_alias().
 */
struct domain_context_mapping_data {
        struct dmar_domain *domain;     /* domain being attached */
        struct intel_iommu *iommu;      /* IOMMU whose context table is programmed */
        struct pasid_table *table;      /* PASID table passed on to the mapping helper */
};
2070
2071 static int domain_context_mapping_cb(struct pci_dev *pdev,
2072                                      u16 alias, void *opaque)
2073 {
2074         struct domain_context_mapping_data *data = opaque;
2075
2076         return domain_context_mapping_one(data->domain, data->iommu,
2077                                           data->table, PCI_BUS_NUM(alias),
2078                                           alias & 0xff);
2079 }
2080
2081 static int
2082 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2083 {
2084         struct domain_context_mapping_data data;
2085         struct pasid_table *table;
2086         struct intel_iommu *iommu;
2087         u8 bus, devfn;
2088
2089         iommu = device_to_iommu(dev, &bus, &devfn);
2090         if (!iommu)
2091                 return -ENODEV;
2092
2093         table = intel_pasid_get_table(dev);
2094
2095         if (!dev_is_pci(dev))
2096                 return domain_context_mapping_one(domain, iommu, table,
2097                                                   bus, devfn);
2098
2099         data.domain = domain;
2100         data.iommu = iommu;
2101         data.table = table;
2102
2103         return pci_for_each_dma_alias(to_pci_dev(dev),
2104                                       &domain_context_mapping_cb, &data);
2105 }
2106
2107 /* Returns a number of VTD pages, but aligned to MM page size */
2108 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2109                                             size_t size)
2110 {
2111         host_addr &= ~PAGE_MASK;
2112         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2113 }
2114
2115 /* Return largest possible superpage level for a given mapping */
2116 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2117                                           unsigned long iov_pfn,
2118                                           unsigned long phy_pfn,
2119                                           unsigned long pages)
2120 {
2121         int support, level = 1;
2122         unsigned long pfnmerge;
2123
2124         support = domain->iommu_superpage;
2125
2126         /* To use a large page, the virtual *and* physical addresses
2127            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2128            of them will mean we have to use smaller pages. So just
2129            merge them and check both at once. */
2130         pfnmerge = iov_pfn | phy_pfn;
2131
2132         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2133                 pages >>= VTD_STRIDE_SHIFT;
2134                 if (!pages)
2135                         break;
2136                 pfnmerge >>= VTD_STRIDE_SHIFT;
2137                 level++;
2138                 support--;
2139         }
2140         return level;
2141 }
2142
/*
 * Ensure that old small page tables are removed to make room for superpage(s).
 * We're going to add new large pages, so make sure we don't remove their parent
 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
 *
 * Walks [start_pfn, end_pfn] in steps of one superpage of the given @level.
 */
static void switch_to_super_page(struct dmar_domain *domain,
                                 unsigned long start_pfn,
                                 unsigned long end_pfn, int level)
{
        unsigned long lvl_pages = lvl_to_nr_pages(level);
        struct iommu_domain_info *info;
        struct dma_pte *pte = NULL;
        unsigned long i;

        while (start_pfn <= end_pfn) {
                /*
                 * Look the PTE up only when none is cached.
                 * NOTE(review): the result is dereferenced without a NULL
                 * check; presumably the caller has already populated this
                 * table page - confirm.
                 */
                if (!pte)
                        pte = pfn_to_dma_pte(domain, start_pfn, &level,
                                             GFP_ATOMIC);

                if (dma_pte_present(pte)) {
                        /* Free the tables below this level for the range */
                        dma_pte_free_pagetable(domain, start_pfn,
                                               start_pfn + lvl_pages - 1,
                                               level + 1);

                        /* Flush the range on every IOMMU the domain uses */
                        xa_for_each(&domain->iommu_array, i, info)
                                iommu_flush_iotlb_psi(info->iommu, domain,
                                                      start_pfn, lvl_pages,
                                                      0, 0);
                }

                /* Advance to the next entry; re-lookup after a page boundary */
                pte++;
                start_pfn += lvl_pages;
                if (first_pte_in_page(pte))
                        pte = NULL;
        }
}
2179
/*
 * Map @nr_pages VT-d pages starting at @iov_pfn to the physical pages
 * starting at @phys_pfn in @domain's page table, using superpages where
 * hardware_largepage_caps() allows.
 *
 * @prot must contain DMA_PTE_READ and/or DMA_PTE_WRITE; DMA_PTE_SNP is
 * also passed through to the PTEs. @gfp is used for page-table allocation.
 *
 * Return: 0 on success, -EINVAL for an unsupported pfn, a @prot without
 * read/write, or a read-only mapping on a nested parent domain, and
 * -ENOMEM if a PTE cannot be obtained. An already-set PTE is reported
 * and warned about, but the walk continues.
 */
static int
__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                 unsigned long phys_pfn, unsigned long nr_pages, int prot,
                 gfp_t gfp)
{
        struct dma_pte *first_pte = NULL, *pte = NULL;
        unsigned int largepage_lvl = 0;
        unsigned long lvl_pages = 0;
        phys_addr_t pteval;
        u64 attr;

        /* The last pfn of the range must be addressable by the domain */
        if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
                return -EINVAL;

        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
                return -EINVAL;

        if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
                pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
                return -EINVAL;
        }

        /* Build the attribute bits shared by every PTE of this mapping */
        attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
        attr |= DMA_FL_PTE_PRESENT;
        if (domain->use_first_level) {
                attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
                if (prot & DMA_PTE_WRITE)
                        attr |= DMA_FL_PTE_DIRTY;
        }

        pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;

        while (nr_pages > 0) {
                uint64_t tmp;

                /* (Re)compute the PTE pointer and page size for this run */
                if (!pte) {
                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
                                        phys_pfn, nr_pages);

                        pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
                                             gfp);
                        if (!pte)
                                return -ENOMEM;
                        first_pte = pte;

                        lvl_pages = lvl_to_nr_pages(largepage_lvl);

                        /* It is large page*/
                        if (largepage_lvl > 1) {
                                unsigned long end_pfn;
                                unsigned long pages_to_remove;

                                pteval |= DMA_PTE_LARGE_PAGE;
                                /*
                                 * Clear out smaller tables, but only up to
                                 * the end of the current page-table page.
                                 */
                                pages_to_remove = min_t(unsigned long, nr_pages,
                                                        nr_pte_to_next_page(pte) * lvl_pages);
                                end_pfn = iov_pfn + pages_to_remove - 1;
                                switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
                        } else {
                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
                        }

                }
                /* We don't need lock here, nobody else
                 * touches the iova range
                 */
                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
                if (tmp) {
                        /* Limit how many times the mappings are dumped */
                        static int dumps = 5;
                        pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
                                iov_pfn, tmp, (unsigned long long)pteval);
                        if (dumps) {
                                dumps--;
                                debug_dma_dump_mappings(NULL);
                        }
                        WARN_ON(1);
                }

                /* Advance all cursors by the size of the page just mapped */
                nr_pages -= lvl_pages;
                iov_pfn += lvl_pages;
                phys_pfn += lvl_pages;
                pteval += lvl_pages * VTD_PAGE_SIZE;

                /* If the next PTE would be the first in a new page, then we
                 * need to flush the cache on the entries we've just written.
                 * And then we'll need to recalculate 'pte', so clear it and
                 * let it get set again in the if (!pte) block above.
                 *
                 * If we're done (!nr_pages) we need to flush the cache too.
                 *
                 * Also if we've been setting superpages, we may need to
                 * recalculate 'pte' and switch back to smaller pages for the
                 * end of the mapping, if the trailing size is not enough to
                 * use another superpage (i.e. nr_pages < lvl_pages).
                 */
                pte++;
                if (!nr_pages || first_pte_in_page(pte) ||
                    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
                        domain_flush_cache(domain, first_pte,
                                           (void *)pte - (void *)first_pte);
                        pte = NULL;
                }
        }

        return 0;
}
2285
2286 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2287 {
2288         struct intel_iommu *iommu = info->iommu;
2289         struct context_entry *context;
2290         u16 did_old;
2291
2292         if (!iommu)
2293                 return;
2294
2295         spin_lock(&iommu->lock);
2296         context = iommu_context_addr(iommu, bus, devfn, 0);
2297         if (!context) {
2298                 spin_unlock(&iommu->lock);
2299                 return;
2300         }
2301
2302         if (sm_supported(iommu)) {
2303                 if (hw_pass_through && domain_type_is_si(info->domain))
2304                         did_old = FLPT_DEFAULT_DID;
2305                 else
2306                         did_old = domain_id_iommu(info->domain, iommu);
2307         } else {
2308                 did_old = context_domain_id(context);
2309         }
2310
2311         context_clear_entry(context);
2312         __iommu_flush_cache(iommu, context, sizeof(*context));
2313         spin_unlock(&iommu->lock);
2314         iommu->flush.flush_context(iommu,
2315                                    did_old,
2316                                    (((u16)bus) << 8) | devfn,
2317                                    DMA_CCMD_MASK_NOBIT,
2318                                    DMA_CCMD_DEVICE_INVL);
2319
2320         if (sm_supported(iommu))
2321                 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2322
2323         iommu->flush.flush_iotlb(iommu,
2324                                  did_old,
2325                                  0,
2326                                  0,
2327                                  DMA_TLB_DSI_FLUSH);
2328
2329         __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2330 }
2331
2332 static int domain_setup_first_level(struct intel_iommu *iommu,
2333                                     struct dmar_domain *domain,
2334                                     struct device *dev,
2335                                     u32 pasid)
2336 {
2337         struct dma_pte *pgd = domain->pgd;
2338         int agaw, level;
2339         int flags = 0;
2340
2341         /*
2342          * Skip top levels of page tables for iommu which has
2343          * less agaw than default. Unnecessary for PT mode.
2344          */
2345         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2346                 pgd = phys_to_virt(dma_pte_addr(pgd));
2347                 if (!dma_pte_present(pgd))
2348                         return -ENOMEM;
2349         }
2350
2351         level = agaw_to_level(agaw);
2352         if (level != 4 && level != 5)
2353                 return -EINVAL;
2354
2355         if (level == 5)
2356                 flags |= PASID_FLAG_FL5LP;
2357
2358         if (domain->force_snooping)
2359                 flags |= PASID_FLAG_PAGE_SNOOP;
2360
2361         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2362                                              domain_id_iommu(domain, iommu),
2363                                              flags);
2364 }
2365
2366 static bool dev_is_real_dma_subdevice(struct device *dev)
2367 {
2368         return dev && dev_is_pci(dev) &&
2369                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2370 }
2371
2372 static int iommu_domain_identity_map(struct dmar_domain *domain,
2373                                      unsigned long first_vpfn,
2374                                      unsigned long last_vpfn)
2375 {
2376         /*
2377          * RMRR range might have overlap with physical memory range,
2378          * clear it first
2379          */
2380         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2381
2382         return __domain_mapping(domain, first_vpfn,
2383                                 first_vpfn, last_vpfn - first_vpfn + 1,
2384                                 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2385 }
2386
2387 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2388
2389 static int __init si_domain_init(int hw)
2390 {
2391         struct dmar_rmrr_unit *rmrr;
2392         struct device *dev;
2393         int i, nid, ret;
2394
2395         si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2396         if (!si_domain)
2397                 return -EFAULT;
2398
2399         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2400                 domain_exit(si_domain);
2401                 si_domain = NULL;
2402                 return -EFAULT;
2403         }
2404
2405         if (hw)
2406                 return 0;
2407
2408         for_each_online_node(nid) {
2409                 unsigned long start_pfn, end_pfn;
2410                 int i;
2411
2412                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2413                         ret = iommu_domain_identity_map(si_domain,
2414                                         mm_to_dma_pfn_start(start_pfn),
2415                                         mm_to_dma_pfn_end(end_pfn));
2416                         if (ret)
2417                                 return ret;
2418                 }
2419         }
2420
2421         /*
2422          * Identity map the RMRRs so that devices with RMRRs could also use
2423          * the si_domain.
2424          */
2425         for_each_rmrr_units(rmrr) {
2426                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2427                                           i, dev) {
2428                         unsigned long long start = rmrr->base_address;
2429                         unsigned long long end = rmrr->end_address;
2430
2431                         if (WARN_ON(end < start ||
2432                                     end >> agaw_to_width(si_domain->agaw)))
2433                                 continue;
2434
2435                         ret = iommu_domain_identity_map(si_domain,
2436                                         mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2437                                         mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2438                         if (ret)
2439                                 return ret;
2440                 }
2441         }
2442
2443         return 0;
2444 }
2445
2446 static int dmar_domain_attach_device(struct dmar_domain *domain,
2447                                      struct device *dev)
2448 {
2449         struct device_domain_info *info = dev_iommu_priv_get(dev);
2450         struct intel_iommu *iommu;
2451         unsigned long flags;
2452         u8 bus, devfn;
2453         int ret;
2454
2455         iommu = device_to_iommu(dev, &bus, &devfn);
2456         if (!iommu)
2457                 return -ENODEV;
2458
2459         ret = domain_attach_iommu(domain, iommu);
2460         if (ret)
2461                 return ret;
2462         info->domain = domain;
2463         spin_lock_irqsave(&domain->lock, flags);
2464         list_add(&info->link, &domain->devices);
2465         spin_unlock_irqrestore(&domain->lock, flags);
2466
2467         /* PASID table is mandatory for a PCI device in scalable mode. */
2468         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2469                 /* Setup the PASID entry for requests without PASID: */
2470                 if (hw_pass_through && domain_type_is_si(domain))
2471                         ret = intel_pasid_setup_pass_through(iommu, domain,
2472                                         dev, IOMMU_NO_PASID);
2473                 else if (domain->use_first_level)
2474                         ret = domain_setup_first_level(iommu, domain, dev,
2475                                         IOMMU_NO_PASID);
2476                 else
2477                         ret = intel_pasid_setup_second_level(iommu, domain,
2478                                         dev, IOMMU_NO_PASID);
2479                 if (ret) {
2480                         dev_err(dev, "Setup RID2PASID failed\n");
2481                         device_block_translation(dev);
2482                         return ret;
2483                 }
2484         }
2485
2486         ret = domain_context_mapping(domain, dev);
2487         if (ret) {
2488                 dev_err(dev, "Domain context map failed\n");
2489                 device_block_translation(dev);
2490                 return ret;
2491         }
2492
2493         iommu_enable_pci_caps(info);
2494
2495         return 0;
2496 }
2497
2498 /**
2499  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2500  * is relaxable (ie. is allowed to be not enforced under some conditions)
2501  * @dev: device handle
2502  *
2503  * We assume that PCI USB devices with RMRRs have them largely
2504  * for historical reasons and that the RMRR space is not actively used post
2505  * boot.  This exclusion may change if vendors begin to abuse it.
2506  *
2507  * The same exception is made for graphics devices, with the requirement that
2508  * any use of the RMRR regions will be torn down before assigning the device
2509  * to a guest.
2510  *
2511  * Return: true if the RMRR is relaxable, false otherwise
2512  */
2513 static bool device_rmrr_is_relaxable(struct device *dev)
2514 {
2515         struct pci_dev *pdev;
2516
2517         if (!dev_is_pci(dev))
2518                 return false;
2519
2520         pdev = to_pci_dev(dev);
2521         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2522                 return true;
2523         else
2524                 return false;
2525 }
2526
2527 /*
2528  * Return the required default domain type for a specific device.
2529  *
2530  * @dev: the device in query
2531  * @startup: true if this is during early boot
2532  *
2533  * Returns:
2534  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2535  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2536  *  - 0: both identity and dynamic domains work for this device
2537  */
2538 static int device_def_domain_type(struct device *dev)
2539 {
2540         if (dev_is_pci(dev)) {
2541                 struct pci_dev *pdev = to_pci_dev(dev);
2542
2543                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2544                         return IOMMU_DOMAIN_IDENTITY;
2545
2546                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2547                         return IOMMU_DOMAIN_IDENTITY;
2548         }
2549
2550         return 0;
2551 }
2552
2553 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2554 {
2555         /*
2556          * Start from the sane iommu hardware state.
2557          * If the queued invalidation is already initialized by us
2558          * (for example, while enabling interrupt-remapping) then
2559          * we got the things already rolling from a sane state.
2560          */
2561         if (!iommu->qi) {
2562                 /*
2563                  * Clear any previous faults.
2564                  */
2565                 dmar_fault(-1, iommu);
2566                 /*
2567                  * Disable queued invalidation if supported and already enabled
2568                  * before OS handover.
2569                  */
2570                 dmar_disable_qi(iommu);
2571         }
2572
2573         if (dmar_enable_qi(iommu)) {
2574                 /*
2575                  * Queued Invalidate not enabled, use Register Based Invalidate
2576                  */
2577                 iommu->flush.flush_context = __iommu_flush_context;
2578                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2579                 pr_info("%s: Using Register based invalidation\n",
2580                         iommu->name);
2581         } else {
2582                 iommu->flush.flush_context = qi_flush_context;
2583                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2584                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2585         }
2586 }
2587
2588 static int copy_context_table(struct intel_iommu *iommu,
2589                               struct root_entry *old_re,
2590                               struct context_entry **tbl,
2591                               int bus, bool ext)
2592 {
2593         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2594         struct context_entry *new_ce = NULL, ce;
2595         struct context_entry *old_ce = NULL;
2596         struct root_entry re;
2597         phys_addr_t old_ce_phys;
2598
2599         tbl_idx = ext ? bus * 2 : bus;
2600         memcpy(&re, old_re, sizeof(re));
2601
2602         for (devfn = 0; devfn < 256; devfn++) {
2603                 /* First calculate the correct index */
2604                 idx = (ext ? devfn * 2 : devfn) % 256;
2605
2606                 if (idx == 0) {
2607                         /* First save what we may have and clean up */
2608                         if (new_ce) {
2609                                 tbl[tbl_idx] = new_ce;
2610                                 __iommu_flush_cache(iommu, new_ce,
2611                                                     VTD_PAGE_SIZE);
2612                                 pos = 1;
2613                         }
2614
2615                         if (old_ce)
2616                                 memunmap(old_ce);
2617
2618                         ret = 0;
2619                         if (devfn < 0x80)
2620                                 old_ce_phys = root_entry_lctp(&re);
2621                         else
2622                                 old_ce_phys = root_entry_uctp(&re);
2623
2624                         if (!old_ce_phys) {
2625                                 if (ext && devfn == 0) {
2626                                         /* No LCTP, try UCTP */
2627                                         devfn = 0x7f;
2628                                         continue;
2629                                 } else {
2630                                         goto out;
2631                                 }
2632                         }
2633
2634                         ret = -ENOMEM;
2635                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2636                                         MEMREMAP_WB);
2637                         if (!old_ce)
2638                                 goto out;
2639
2640                         new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2641                         if (!new_ce)
2642                                 goto out_unmap;
2643
2644                         ret = 0;
2645                 }
2646
2647                 /* Now copy the context entry */
2648                 memcpy(&ce, old_ce + idx, sizeof(ce));
2649
2650                 if (!context_present(&ce))
2651                         continue;
2652
2653                 did = context_domain_id(&ce);
2654                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2655                         set_bit(did, iommu->domain_ids);
2656
2657                 set_context_copied(iommu, bus, devfn);
2658                 new_ce[idx] = ce;
2659         }
2660
2661         tbl[tbl_idx + pos] = new_ce;
2662
2663         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2664
2665 out_unmap:
2666         memunmap(old_ce);
2667
2668 out:
2669         return ret;
2670 }
2671
2672 static int copy_translation_tables(struct intel_iommu *iommu)
2673 {
2674         struct context_entry **ctxt_tbls;
2675         struct root_entry *old_rt;
2676         phys_addr_t old_rt_phys;
2677         int ctxt_table_entries;
2678         u64 rtaddr_reg;
2679         int bus, ret;
2680         bool new_ext, ext;
2681
2682         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2683         ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2684         new_ext    = !!sm_supported(iommu);
2685
2686         /*
2687          * The RTT bit can only be changed when translation is disabled,
2688          * but disabling translation means to open a window for data
2689          * corruption. So bail out and don't copy anything if we would
2690          * have to change the bit.
2691          */
2692         if (new_ext != ext)
2693                 return -EINVAL;
2694
2695         iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2696         if (!iommu->copied_tables)
2697                 return -ENOMEM;
2698
2699         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2700         if (!old_rt_phys)
2701                 return -EINVAL;
2702
2703         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2704         if (!old_rt)
2705                 return -ENOMEM;
2706
2707         /* This is too big for the stack - allocate it from slab */
2708         ctxt_table_entries = ext ? 512 : 256;
2709         ret = -ENOMEM;
2710         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2711         if (!ctxt_tbls)
2712                 goto out_unmap;
2713
2714         for (bus = 0; bus < 256; bus++) {
2715                 ret = copy_context_table(iommu, &old_rt[bus],
2716                                          ctxt_tbls, bus, ext);
2717                 if (ret) {
2718                         pr_err("%s: Failed to copy context table for bus %d\n",
2719                                 iommu->name, bus);
2720                         continue;
2721                 }
2722         }
2723
2724         spin_lock(&iommu->lock);
2725
2726         /* Context tables are copied, now write them to the root_entry table */
2727         for (bus = 0; bus < 256; bus++) {
2728                 int idx = ext ? bus * 2 : bus;
2729                 u64 val;
2730
2731                 if (ctxt_tbls[idx]) {
2732                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
2733                         iommu->root_entry[bus].lo = val;
2734                 }
2735
2736                 if (!ext || !ctxt_tbls[idx + 1])
2737                         continue;
2738
2739                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2740                 iommu->root_entry[bus].hi = val;
2741         }
2742
2743         spin_unlock(&iommu->lock);
2744
2745         kfree(ctxt_tbls);
2746
2747         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2748
2749         ret = 0;
2750
2751 out_unmap:
2752         memunmap(old_rt);
2753
2754         return ret;
2755 }
2756
2757 static int __init init_dmars(void)
2758 {
2759         struct dmar_drhd_unit *drhd;
2760         struct intel_iommu *iommu;
2761         int ret;
2762
2763         ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2764         if (ret)
2765                 goto free_iommu;
2766
2767         for_each_iommu(iommu, drhd) {
2768                 if (drhd->ignored) {
2769                         iommu_disable_translation(iommu);
2770                         continue;
2771                 }
2772
2773                 /*
2774                  * Find the max pasid size of all IOMMU's in the system.
2775                  * We need to ensure the system pasid table is no bigger
2776                  * than the smallest supported.
2777                  */
2778                 if (pasid_supported(iommu)) {
2779                         u32 temp = 2 << ecap_pss(iommu->ecap);
2780
2781                         intel_pasid_max_id = min_t(u32, temp,
2782                                                    intel_pasid_max_id);
2783                 }
2784
2785                 intel_iommu_init_qi(iommu);
2786
2787                 ret = iommu_init_domains(iommu);
2788                 if (ret)
2789                         goto free_iommu;
2790
2791                 init_translation_status(iommu);
2792
2793                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2794                         iommu_disable_translation(iommu);
2795                         clear_translation_pre_enabled(iommu);
2796                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2797                                 iommu->name);
2798                 }
2799
2800                 /*
2801                  * TBD:
2802                  * we could share the same root & context tables
2803                  * among all IOMMU's. Need to Split it later.
2804                  */
2805                 ret = iommu_alloc_root_entry(iommu);
2806                 if (ret)
2807                         goto free_iommu;
2808
2809                 if (translation_pre_enabled(iommu)) {
2810                         pr_info("Translation already enabled - trying to copy translation structures\n");
2811
2812                         ret = copy_translation_tables(iommu);
2813                         if (ret) {
2814                                 /*
2815                                  * We found the IOMMU with translation
2816                                  * enabled - but failed to copy over the
2817                                  * old root-entry table. Try to proceed
2818                                  * by disabling translation now and
2819                                  * allocating a clean root-entry table.
2820                                  * This might cause DMAR faults, but
2821                                  * probably the dump will still succeed.
2822                                  */
2823                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2824                                        iommu->name);
2825                                 iommu_disable_translation(iommu);
2826                                 clear_translation_pre_enabled(iommu);
2827                         } else {
2828                                 pr_info("Copied translation tables from previous kernel for %s\n",
2829                                         iommu->name);
2830                         }
2831                 }
2832
2833                 if (!ecap_pass_through(iommu->ecap))
2834                         hw_pass_through = 0;
2835                 intel_svm_check(iommu);
2836         }
2837
2838         /*
2839          * Now that qi is enabled on all iommus, set the root entry and flush
2840          * caches. This is required on some Intel X58 chipsets, otherwise the
2841          * flush_context function will loop forever and the boot hangs.
2842          */
2843         for_each_active_iommu(iommu, drhd) {
2844                 iommu_flush_write_buffer(iommu);
2845                 iommu_set_root_entry(iommu);
2846         }
2847
2848 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2849         dmar_map_gfx = 0;
2850 #endif
2851
2852         if (!dmar_map_gfx)
2853                 iommu_identity_mapping |= IDENTMAP_GFX;
2854
2855         check_tylersburg_isoch();
2856
2857         ret = si_domain_init(hw_pass_through);
2858         if (ret)
2859                 goto free_iommu;
2860
2861         /*
2862          * for each drhd
2863          *   enable fault log
2864          *   global invalidate context cache
2865          *   global invalidate iotlb
2866          *   enable translation
2867          */
2868         for_each_iommu(iommu, drhd) {
2869                 if (drhd->ignored) {
2870                         /*
2871                          * we always have to disable PMRs or DMA may fail on
2872                          * this device
2873                          */
2874                         if (force_on)
2875                                 iommu_disable_protect_mem_regions(iommu);
2876                         continue;
2877                 }
2878
2879                 iommu_flush_write_buffer(iommu);
2880
2881 #ifdef CONFIG_INTEL_IOMMU_SVM
2882                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2883                         /*
2884                          * Call dmar_alloc_hwirq() with dmar_global_lock held,
2885                          * could cause possible lock race condition.
2886                          */
2887                         up_write(&dmar_global_lock);
2888                         ret = intel_svm_enable_prq(iommu);
2889                         down_write(&dmar_global_lock);
2890                         if (ret)
2891                                 goto free_iommu;
2892                 }
2893 #endif
2894                 ret = dmar_set_interrupt(iommu);
2895                 if (ret)
2896                         goto free_iommu;
2897         }
2898
2899         return 0;
2900
2901 free_iommu:
2902         for_each_active_iommu(iommu, drhd) {
2903                 disable_dmar_iommu(iommu);
2904                 free_dmar_iommu(iommu);
2905         }
2906         if (si_domain) {
2907                 domain_exit(si_domain);
2908                 si_domain = NULL;
2909         }
2910
2911         return ret;
2912 }
2913
2914 static void __init init_no_remapping_devices(void)
2915 {
2916         struct dmar_drhd_unit *drhd;
2917         struct device *dev;
2918         int i;
2919
2920         for_each_drhd_unit(drhd) {
2921                 if (!drhd->include_all) {
2922                         for_each_active_dev_scope(drhd->devices,
2923                                                   drhd->devices_cnt, i, dev)
2924                                 break;
2925                         /* ignore DMAR unit if no devices exist */
2926                         if (i == drhd->devices_cnt)
2927                                 drhd->ignored = 1;
2928                 }
2929         }
2930
2931         for_each_active_drhd_unit(drhd) {
2932                 if (drhd->include_all)
2933                         continue;
2934
2935                 for_each_active_dev_scope(drhd->devices,
2936                                           drhd->devices_cnt, i, dev)
2937                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2938                                 break;
2939                 if (i < drhd->devices_cnt)
2940                         continue;
2941
2942                 /* This IOMMU has *only* gfx devices. Either bypass it or
2943                    set the gfx_mapped flag, as appropriate */
2944                 drhd->gfx_dedicated = 1;
2945                 if (!dmar_map_gfx)
2946                         drhd->ignored = 1;
2947         }
2948 }
2949
2950 #ifdef CONFIG_SUSPEND
2951 static int init_iommu_hw(void)
2952 {
2953         struct dmar_drhd_unit *drhd;
2954         struct intel_iommu *iommu = NULL;
2955         int ret;
2956
2957         for_each_active_iommu(iommu, drhd) {
2958                 if (iommu->qi) {
2959                         ret = dmar_reenable_qi(iommu);
2960                         if (ret)
2961                                 return ret;
2962                 }
2963         }
2964
2965         for_each_iommu(iommu, drhd) {
2966                 if (drhd->ignored) {
2967                         /*
2968                          * we always have to disable PMRs or DMA may fail on
2969                          * this device
2970                          */
2971                         if (force_on)
2972                                 iommu_disable_protect_mem_regions(iommu);
2973                         continue;
2974                 }
2975
2976                 iommu_flush_write_buffer(iommu);
2977                 iommu_set_root_entry(iommu);
2978                 iommu_enable_translation(iommu);
2979                 iommu_disable_protect_mem_regions(iommu);
2980         }
2981
2982         return 0;
2983 }
2984
2985 static void iommu_flush_all(void)
2986 {
2987         struct dmar_drhd_unit *drhd;
2988         struct intel_iommu *iommu;
2989
2990         for_each_active_iommu(iommu, drhd) {
2991                 iommu->flush.flush_context(iommu, 0, 0, 0,
2992                                            DMA_CCMD_GLOBAL_INVL);
2993                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2994                                          DMA_TLB_GLOBAL_FLUSH);
2995         }
2996 }
2997
2998 static int iommu_suspend(void)
2999 {
3000         struct dmar_drhd_unit *drhd;
3001         struct intel_iommu *iommu = NULL;
3002         unsigned long flag;
3003
3004         iommu_flush_all();
3005
3006         for_each_active_iommu(iommu, drhd) {
3007                 iommu_disable_translation(iommu);
3008
3009                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3010
3011                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3012                         readl(iommu->reg + DMAR_FECTL_REG);
3013                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3014                         readl(iommu->reg + DMAR_FEDATA_REG);
3015                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3016                         readl(iommu->reg + DMAR_FEADDR_REG);
3017                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3018                         readl(iommu->reg + DMAR_FEUADDR_REG);
3019
3020                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3021         }
3022         return 0;
3023 }
3024
3025 static void iommu_resume(void)
3026 {
3027         struct dmar_drhd_unit *drhd;
3028         struct intel_iommu *iommu = NULL;
3029         unsigned long flag;
3030
3031         if (init_iommu_hw()) {
3032                 if (force_on)
3033                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3034                 else
3035                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3036                 return;
3037         }
3038
3039         for_each_active_iommu(iommu, drhd) {
3040
3041                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3042
3043                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3044                         iommu->reg + DMAR_FECTL_REG);
3045                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3046                         iommu->reg + DMAR_FEDATA_REG);
3047                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3048                         iommu->reg + DMAR_FEADDR_REG);
3049                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3050                         iommu->reg + DMAR_FEUADDR_REG);
3051
3052                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3053         }
3054 }
3055
3056 static struct syscore_ops iommu_syscore_ops = {
3057         .resume         = iommu_resume,
3058         .suspend        = iommu_suspend,
3059 };
3060
3061 static void __init init_iommu_pm_ops(void)
3062 {
3063         register_syscore_ops(&iommu_syscore_ops);
3064 }
3065
3066 #else
/* Nothing to register when the suspend/resume hooks are compiled out. */
static inline void init_iommu_pm_ops(void)
{
}
3068 #endif  /* CONFIG_SUSPEND */
3069
3070 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3071 {
3072         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3073             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3074             rmrr->end_address <= rmrr->base_address ||
3075             arch_rmrr_sanity_check(rmrr))
3076                 return -EINVAL;
3077
3078         return 0;
3079 }
3080
3081 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3082 {
3083         struct acpi_dmar_reserved_memory *rmrr;
3084         struct dmar_rmrr_unit *rmrru;
3085
3086         rmrr = (struct acpi_dmar_reserved_memory *)header;
3087         if (rmrr_sanity_check(rmrr)) {
3088                 pr_warn(FW_BUG
3089                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3090                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3091                            rmrr->base_address, rmrr->end_address,
3092                            dmi_get_system_info(DMI_BIOS_VENDOR),
3093                            dmi_get_system_info(DMI_BIOS_VERSION),
3094                            dmi_get_system_info(DMI_PRODUCT_VERSION));
3095                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3096         }
3097
3098         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3099         if (!rmrru)
3100                 goto out;
3101
3102         rmrru->hdr = header;
3103
3104         rmrru->base_address = rmrr->base_address;
3105         rmrru->end_address = rmrr->end_address;
3106
3107         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3108                                 ((void *)rmrr) + rmrr->header.length,
3109                                 &rmrru->devices_cnt);
3110         if (rmrru->devices_cnt && rmrru->devices == NULL)
3111                 goto free_rmrru;
3112
3113         list_add(&rmrru->list, &dmar_rmrr_units);
3114
3115         return 0;
3116 free_rmrru:
3117         kfree(rmrru);
3118 out:
3119         return -ENOMEM;
3120 }
3121
3122 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3123 {
3124         struct dmar_atsr_unit *atsru;
3125         struct acpi_dmar_atsr *tmp;
3126
3127         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3128                                 dmar_rcu_check()) {
3129                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3130                 if (atsr->segment != tmp->segment)
3131                         continue;
3132                 if (atsr->header.length != tmp->header.length)
3133                         continue;
3134                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3135                         return atsru;
3136         }
3137
3138         return NULL;
3139 }
3140
3141 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3142 {
3143         struct acpi_dmar_atsr *atsr;
3144         struct dmar_atsr_unit *atsru;
3145
3146         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3147                 return 0;
3148
3149         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3150         atsru = dmar_find_atsr(atsr);
3151         if (atsru)
3152                 return 0;
3153
3154         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3155         if (!atsru)
3156                 return -ENOMEM;
3157
3158         /*
3159          * If memory is allocated from slab by ACPI _DSM method, we need to
3160          * copy the memory content because the memory buffer will be freed
3161          * on return.
3162          */
3163         atsru->hdr = (void *)(atsru + 1);
3164         memcpy(atsru->hdr, hdr, hdr->length);
3165         atsru->include_all = atsr->flags & 0x1;
3166         if (!atsru->include_all) {
3167                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3168                                 (void *)atsr + atsr->header.length,
3169                                 &atsru->devices_cnt);
3170                 if (atsru->devices_cnt && atsru->devices == NULL) {
3171                         kfree(atsru);
3172                         return -ENOMEM;
3173                 }
3174         }
3175
3176         list_add_rcu(&atsru->list, &dmar_atsr_units);
3177
3178         return 0;
3179 }
3180
3181 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3182 {
3183         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3184         kfree(atsru);
3185 }
3186
3187 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3188 {
3189         struct acpi_dmar_atsr *atsr;
3190         struct dmar_atsr_unit *atsru;
3191
3192         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3193         atsru = dmar_find_atsr(atsr);
3194         if (atsru) {
3195                 list_del_rcu(&atsru->list);
3196                 synchronize_rcu();
3197                 intel_iommu_free_atsr(atsru);
3198         }
3199
3200         return 0;
3201 }
3202
3203 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3204 {
3205         int i;
3206         struct device *dev;
3207         struct acpi_dmar_atsr *atsr;
3208         struct dmar_atsr_unit *atsru;
3209
3210         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3211         atsru = dmar_find_atsr(atsr);
3212         if (!atsru)
3213                 return 0;
3214
3215         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3216                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3217                                           i, dev)
3218                         return -EBUSY;
3219         }
3220
3221         return 0;
3222 }
3223
3224 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3225 {
3226         struct dmar_satc_unit *satcu;
3227         struct acpi_dmar_satc *tmp;
3228
3229         list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3230                                 dmar_rcu_check()) {
3231                 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3232                 if (satc->segment != tmp->segment)
3233                         continue;
3234                 if (satc->header.length != tmp->header.length)
3235                         continue;
3236                 if (memcmp(satc, tmp, satc->header.length) == 0)
3237                         return satcu;
3238         }
3239
3240         return NULL;
3241 }
3242
3243 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3244 {
3245         struct acpi_dmar_satc *satc;
3246         struct dmar_satc_unit *satcu;
3247
3248         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3249                 return 0;
3250
3251         satc = container_of(hdr, struct acpi_dmar_satc, header);
3252         satcu = dmar_find_satc(satc);
3253         if (satcu)
3254                 return 0;
3255
3256         satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3257         if (!satcu)
3258                 return -ENOMEM;
3259
3260         satcu->hdr = (void *)(satcu + 1);
3261         memcpy(satcu->hdr, hdr, hdr->length);
3262         satcu->atc_required = satc->flags & 0x1;
3263         satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3264                                               (void *)satc + satc->header.length,
3265                                               &satcu->devices_cnt);
3266         if (satcu->devices_cnt && !satcu->devices) {
3267                 kfree(satcu);
3268                 return -ENOMEM;
3269         }
3270         list_add_rcu(&satcu->list, &dmar_satc_units);
3271
3272         return 0;
3273 }
3274
3275 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3276 {
3277         int sp, ret;
3278         struct intel_iommu *iommu = dmaru->iommu;
3279
3280         ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3281         if (ret)
3282                 goto out;
3283
3284         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3285                 pr_warn("%s: Doesn't support hardware pass through.\n",
3286                         iommu->name);
3287                 return -ENXIO;
3288         }
3289
3290         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3291         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3292                 pr_warn("%s: Doesn't support large page.\n",
3293                         iommu->name);
3294                 return -ENXIO;
3295         }
3296
3297         /*
3298          * Disable translation if already enabled prior to OS handover.
3299          */
3300         if (iommu->gcmd & DMA_GCMD_TE)
3301                 iommu_disable_translation(iommu);
3302
3303         ret = iommu_init_domains(iommu);
3304         if (ret == 0)
3305                 ret = iommu_alloc_root_entry(iommu);
3306         if (ret)
3307                 goto out;
3308
3309         intel_svm_check(iommu);
3310
3311         if (dmaru->ignored) {
3312                 /*
3313                  * we always have to disable PMRs or DMA may fail on this device
3314                  */
3315                 if (force_on)
3316                         iommu_disable_protect_mem_regions(iommu);
3317                 return 0;
3318         }
3319
3320         intel_iommu_init_qi(iommu);
3321         iommu_flush_write_buffer(iommu);
3322
3323 #ifdef CONFIG_INTEL_IOMMU_SVM
3324         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3325                 ret = intel_svm_enable_prq(iommu);
3326                 if (ret)
3327                         goto disable_iommu;
3328         }
3329 #endif
3330         ret = dmar_set_interrupt(iommu);
3331         if (ret)
3332                 goto disable_iommu;
3333
3334         iommu_set_root_entry(iommu);
3335         iommu_enable_translation(iommu);
3336
3337         iommu_disable_protect_mem_regions(iommu);
3338         return 0;
3339
3340 disable_iommu:
3341         disable_dmar_iommu(iommu);
3342 out:
3343         free_dmar_iommu(iommu);
3344         return ret;
3345 }
3346
3347 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3348 {
3349         int ret = 0;
3350         struct intel_iommu *iommu = dmaru->iommu;
3351
3352         if (!intel_iommu_enabled)
3353                 return 0;
3354         if (iommu == NULL)
3355                 return -EINVAL;
3356
3357         if (insert) {
3358                 ret = intel_iommu_add(dmaru);
3359         } else {
3360                 disable_dmar_iommu(iommu);
3361                 free_dmar_iommu(iommu);
3362         }
3363
3364         return ret;
3365 }
3366
3367 static void intel_iommu_free_dmars(void)
3368 {
3369         struct dmar_rmrr_unit *rmrru, *rmrr_n;
3370         struct dmar_atsr_unit *atsru, *atsr_n;
3371         struct dmar_satc_unit *satcu, *satc_n;
3372
3373         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3374                 list_del(&rmrru->list);
3375                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3376                 kfree(rmrru);
3377         }
3378
3379         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3380                 list_del(&atsru->list);
3381                 intel_iommu_free_atsr(atsru);
3382         }
3383         list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3384                 list_del(&satcu->list);
3385                 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3386                 kfree(satcu);
3387         }
3388 }
3389
3390 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3391 {
3392         struct dmar_satc_unit *satcu;
3393         struct acpi_dmar_satc *satc;
3394         struct device *tmp;
3395         int i;
3396
3397         dev = pci_physfn(dev);
3398         rcu_read_lock();
3399
3400         list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3401                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3402                 if (satc->segment != pci_domain_nr(dev->bus))
3403                         continue;
3404                 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3405                         if (to_pci_dev(tmp) == dev)
3406                                 goto out;
3407         }
3408         satcu = NULL;
3409 out:
3410         rcu_read_unlock();
3411         return satcu;
3412 }
3413
3414 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3415 {
3416         int i, ret = 1;
3417         struct pci_bus *bus;
3418         struct pci_dev *bridge = NULL;
3419         struct device *tmp;
3420         struct acpi_dmar_atsr *atsr;
3421         struct dmar_atsr_unit *atsru;
3422         struct dmar_satc_unit *satcu;
3423
3424         dev = pci_physfn(dev);
3425         satcu = dmar_find_matched_satc_unit(dev);
3426         if (satcu)
3427                 /*
3428                  * This device supports ATS as it is in SATC table.
3429                  * When IOMMU is in legacy mode, enabling ATS is done
3430                  * automatically by HW for the device that requires
3431                  * ATS, hence OS should not enable this device ATS
3432                  * to avoid duplicated TLB invalidation.
3433                  */
3434                 return !(satcu->atc_required && !sm_supported(iommu));
3435
3436         for (bus = dev->bus; bus; bus = bus->parent) {
3437                 bridge = bus->self;
3438                 /* If it's an integrated device, allow ATS */
3439                 if (!bridge)
3440                         return 1;
3441                 /* Connected via non-PCIe: no ATS */
3442                 if (!pci_is_pcie(bridge) ||
3443                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3444                         return 0;
3445                 /* If we found the root port, look it up in the ATSR */
3446                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3447                         break;
3448         }
3449
3450         rcu_read_lock();
3451         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3452                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3453                 if (atsr->segment != pci_domain_nr(dev->bus))
3454                         continue;
3455
3456                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3457                         if (tmp == &bridge->dev)
3458                                 goto out;
3459
3460                 if (atsru->include_all)
3461                         goto out;
3462         }
3463         ret = 0;
3464 out:
3465         rcu_read_unlock();
3466
3467         return ret;
3468 }
3469
3470 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3471 {
3472         int ret;
3473         struct dmar_rmrr_unit *rmrru;
3474         struct dmar_atsr_unit *atsru;
3475         struct dmar_satc_unit *satcu;
3476         struct acpi_dmar_atsr *atsr;
3477         struct acpi_dmar_reserved_memory *rmrr;
3478         struct acpi_dmar_satc *satc;
3479
3480         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3481                 return 0;
3482
3483         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3484                 rmrr = container_of(rmrru->hdr,
3485                                     struct acpi_dmar_reserved_memory, header);
3486                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3487                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3488                                 ((void *)rmrr) + rmrr->header.length,
3489                                 rmrr->segment, rmrru->devices,
3490                                 rmrru->devices_cnt);
3491                         if (ret < 0)
3492                                 return ret;
3493                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3494                         dmar_remove_dev_scope(info, rmrr->segment,
3495                                 rmrru->devices, rmrru->devices_cnt);
3496                 }
3497         }
3498
3499         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3500                 if (atsru->include_all)
3501                         continue;
3502
3503                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3504                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3505                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3506                                         (void *)atsr + atsr->header.length,
3507                                         atsr->segment, atsru->devices,
3508                                         atsru->devices_cnt);
3509                         if (ret > 0)
3510                                 break;
3511                         else if (ret < 0)
3512                                 return ret;
3513                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3514                         if (dmar_remove_dev_scope(info, atsr->segment,
3515                                         atsru->devices, atsru->devices_cnt))
3516                                 break;
3517                 }
3518         }
3519         list_for_each_entry(satcu, &dmar_satc_units, list) {
3520                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3521                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3522                         ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3523                                         (void *)satc + satc->header.length,
3524                                         satc->segment, satcu->devices,
3525                                         satcu->devices_cnt);
3526                         if (ret > 0)
3527                                 break;
3528                         else if (ret < 0)
3529                                 return ret;
3530                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3531                         if (dmar_remove_dev_scope(info, satc->segment,
3532                                         satcu->devices, satcu->devices_cnt))
3533                                 break;
3534                 }
3535         }
3536
3537         return 0;
3538 }
3539
3540 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3541                                        unsigned long val, void *v)
3542 {
3543         struct memory_notify *mhp = v;
3544         unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3545         unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3546                         mhp->nr_pages - 1);
3547
3548         switch (val) {
3549         case MEM_GOING_ONLINE:
3550                 if (iommu_domain_identity_map(si_domain,
3551                                               start_vpfn, last_vpfn)) {
3552                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
3553                                 start_vpfn, last_vpfn);
3554                         return NOTIFY_BAD;
3555                 }
3556                 break;
3557
3558         case MEM_OFFLINE:
3559         case MEM_CANCEL_ONLINE:
3560                 {
3561                         struct dmar_drhd_unit *drhd;
3562                         struct intel_iommu *iommu;
3563                         LIST_HEAD(freelist);
3564
3565                         domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3566
3567                         rcu_read_lock();
3568                         for_each_active_iommu(iommu, drhd)
3569                                 iommu_flush_iotlb_psi(iommu, si_domain,
3570                                         start_vpfn, mhp->nr_pages,
3571                                         list_empty(&freelist), 0);
3572                         rcu_read_unlock();
3573                         put_pages_list(&freelist);
3574                 }
3575                 break;
3576         }
3577
3578         return NOTIFY_OK;
3579 }
3580
3581 static struct notifier_block intel_iommu_memory_nb = {
3582         .notifier_call = intel_iommu_memory_notifier,
3583         .priority = 0
3584 };
3585
3586 static void intel_disable_iommus(void)
3587 {
3588         struct intel_iommu *iommu = NULL;
3589         struct dmar_drhd_unit *drhd;
3590
3591         for_each_iommu(iommu, drhd)
3592                 iommu_disable_translation(iommu);
3593 }
3594
3595 void intel_iommu_shutdown(void)
3596 {
3597         struct dmar_drhd_unit *drhd;
3598         struct intel_iommu *iommu = NULL;
3599
3600         if (no_iommu || dmar_disabled)
3601                 return;
3602
3603         down_write(&dmar_global_lock);
3604
3605         /* Disable PMRs explicitly here. */
3606         for_each_iommu(iommu, drhd)
3607                 iommu_disable_protect_mem_regions(iommu);
3608
3609         /* Make sure the IOMMUs are switched off */
3610         intel_disable_iommus();
3611
3612         up_write(&dmar_global_lock);
3613 }
3614
3615 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3616 {
3617         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3618
3619         return container_of(iommu_dev, struct intel_iommu, iommu);
3620 }
3621
3622 static ssize_t version_show(struct device *dev,
3623                             struct device_attribute *attr, char *buf)
3624 {
3625         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3626         u32 ver = readl(iommu->reg + DMAR_VER_REG);
3627         return sysfs_emit(buf, "%d:%d\n",
3628                           DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3629 }
3630 static DEVICE_ATTR_RO(version);
3631
3632 static ssize_t address_show(struct device *dev,
3633                             struct device_attribute *attr, char *buf)
3634 {
3635         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3636         return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3637 }
3638 static DEVICE_ATTR_RO(address);
3639
3640 static ssize_t cap_show(struct device *dev,
3641                         struct device_attribute *attr, char *buf)
3642 {
3643         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3644         return sysfs_emit(buf, "%llx\n", iommu->cap);
3645 }
3646 static DEVICE_ATTR_RO(cap);
3647
3648 static ssize_t ecap_show(struct device *dev,
3649                          struct device_attribute *attr, char *buf)
3650 {
3651         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3652         return sysfs_emit(buf, "%llx\n", iommu->ecap);
3653 }
3654 static DEVICE_ATTR_RO(ecap);
3655
3656 static ssize_t domains_supported_show(struct device *dev,
3657                                       struct device_attribute *attr, char *buf)
3658 {
3659         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3660         return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3661 }
3662 static DEVICE_ATTR_RO(domains_supported);
3663
3664 static ssize_t domains_used_show(struct device *dev,
3665                                  struct device_attribute *attr, char *buf)
3666 {
3667         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3668         return sysfs_emit(buf, "%d\n",
3669                           bitmap_weight(iommu->domain_ids,
3670                                         cap_ndoms(iommu->cap)));
3671 }
3672 static DEVICE_ATTR_RO(domains_used);
3673
3674 static struct attribute *intel_iommu_attrs[] = {
3675         &dev_attr_version.attr,
3676         &dev_attr_address.attr,
3677         &dev_attr_cap.attr,
3678         &dev_attr_ecap.attr,
3679         &dev_attr_domains_supported.attr,
3680         &dev_attr_domains_used.attr,
3681         NULL,
3682 };
3683
3684 static struct attribute_group intel_iommu_group = {
3685         .name = "intel-iommu",
3686         .attrs = intel_iommu_attrs,
3687 };
3688
3689 const struct attribute_group *intel_iommu_groups[] = {
3690         &intel_iommu_group,
3691         NULL,
3692 };
3693
3694 static inline bool has_external_pci(void)
3695 {
3696         struct pci_dev *pdev = NULL;
3697
3698         for_each_pci_dev(pdev)
3699                 if (pdev->external_facing) {
3700                         pci_dev_put(pdev);
3701                         return true;
3702                 }
3703
3704         return false;
3705 }
3706
3707 static int __init platform_optin_force_iommu(void)
3708 {
3709         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3710                 return 0;
3711
3712         if (no_iommu || dmar_disabled)
3713                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3714
3715         /*
3716          * If Intel-IOMMU is disabled by default, we will apply identity
3717          * map for all devices except those marked as being untrusted.
3718          */
3719         if (dmar_disabled)
3720                 iommu_set_default_passthrough(false);
3721
3722         dmar_disabled = 0;
3723         no_iommu = 0;
3724
3725         return 1;
3726 }
3727
3728 static int __init probe_acpi_namespace_devices(void)
3729 {
3730         struct dmar_drhd_unit *drhd;
3731         /* To avoid a -Wunused-but-set-variable warning. */
3732         struct intel_iommu *iommu __maybe_unused;
3733         struct device *dev;
3734         int i, ret = 0;
3735
3736         for_each_active_iommu(iommu, drhd) {
3737                 for_each_active_dev_scope(drhd->devices,
3738                                           drhd->devices_cnt, i, dev) {
3739                         struct acpi_device_physical_node *pn;
3740                         struct acpi_device *adev;
3741
3742                         if (dev->bus != &acpi_bus_type)
3743                                 continue;
3744
3745                         adev = to_acpi_device(dev);
3746                         mutex_lock(&adev->physical_node_lock);
3747                         list_for_each_entry(pn,
3748                                             &adev->physical_node_list, node) {
3749                                 ret = iommu_probe_device(pn->dev);
3750                                 if (ret)
3751                                         break;
3752                         }
3753                         mutex_unlock(&adev->physical_node_lock);
3754
3755                         if (ret)
3756                                 return ret;
3757                 }
3758         }
3759
3760         return 0;
3761 }
3762
3763 static __init int tboot_force_iommu(void)
3764 {
3765         if (!tboot_enabled())
3766                 return 0;
3767
3768         if (no_iommu || dmar_disabled)
3769                 pr_warn("Forcing Intel-IOMMU to enabled\n");
3770
3771         dmar_disabled = 0;
3772         no_iommu = 0;
3773
3774         return 1;
3775 }
3776
3777 int __init intel_iommu_init(void)
3778 {
3779         int ret = -ENODEV;
3780         struct dmar_drhd_unit *drhd;
3781         struct intel_iommu *iommu;
3782
3783         /*
3784          * Intel IOMMU is required for a TXT/tboot launch or platform
3785          * opt in, so enforce that.
3786          */
3787         force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3788                     platform_optin_force_iommu();
3789
3790         down_write(&dmar_global_lock);
3791         if (dmar_table_init()) {
3792                 if (force_on)
3793                         panic("tboot: Failed to initialize DMAR table\n");
3794                 goto out_free_dmar;
3795         }
3796
3797         if (dmar_dev_scope_init() < 0) {
3798                 if (force_on)
3799                         panic("tboot: Failed to initialize DMAR device scope\n");
3800                 goto out_free_dmar;
3801         }
3802
3803         up_write(&dmar_global_lock);
3804
3805         /*
3806          * The bus notifier takes the dmar_global_lock, so lockdep will
3807          * complain later when we register it under the lock.
3808          */
3809         dmar_register_bus_notifier();
3810
3811         down_write(&dmar_global_lock);
3812
3813         if (!no_iommu)
3814                 intel_iommu_debugfs_init();
3815
3816         if (no_iommu || dmar_disabled) {
3817                 /*
3818                  * We exit the function here to ensure IOMMU's remapping and
3819                  * mempool aren't setup, which means that the IOMMU's PMRs
3820                  * won't be disabled via the call to init_dmars(). So disable
3821                  * it explicitly here. The PMRs were setup by tboot prior to
3822                  * calling SENTER, but the kernel is expected to reset/tear
3823                  * down the PMRs.
3824                  */
3825                 if (intel_iommu_tboot_noforce) {
3826                         for_each_iommu(iommu, drhd)
3827                                 iommu_disable_protect_mem_regions(iommu);
3828                 }
3829
3830                 /*
3831                  * Make sure the IOMMUs are switched off, even when we
3832                  * boot into a kexec kernel and the previous kernel left
3833                  * them enabled
3834                  */
3835                 intel_disable_iommus();
3836                 goto out_free_dmar;
3837         }
3838
3839         if (list_empty(&dmar_rmrr_units))
3840                 pr_info("No RMRR found\n");
3841
3842         if (list_empty(&dmar_atsr_units))
3843                 pr_info("No ATSR found\n");
3844
3845         if (list_empty(&dmar_satc_units))
3846                 pr_info("No SATC found\n");
3847
3848         init_no_remapping_devices();
3849
3850         ret = init_dmars();
3851         if (ret) {
3852                 if (force_on)
3853                         panic("tboot: Failed to initialize DMARs\n");
3854                 pr_err("Initialization failed\n");
3855                 goto out_free_dmar;
3856         }
3857         up_write(&dmar_global_lock);
3858
3859         init_iommu_pm_ops();
3860
3861         down_read(&dmar_global_lock);
3862         for_each_active_iommu(iommu, drhd) {
3863                 /*
3864                  * The flush queue implementation does not perform
3865                  * page-selective invalidations that are required for efficient
3866                  * TLB flushes in virtual environments.  The benefit of batching
3867                  * is likely to be much lower than the overhead of synchronizing
3868                  * the virtual and physical IOMMU page-tables.
3869                  */
3870                 if (cap_caching_mode(iommu->cap) &&
3871                     !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3872                         pr_info_once("IOMMU batching disallowed due to virtualization\n");
3873                         iommu_set_dma_strict();
3874                 }
3875                 iommu_device_sysfs_add(&iommu->iommu, NULL,
3876                                        intel_iommu_groups,
3877                                        "%s", iommu->name);
3878                 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3879
3880                 iommu_pmu_register(iommu);
3881         }
3882         up_read(&dmar_global_lock);
3883
3884         if (si_domain && !hw_pass_through)
3885                 register_memory_notifier(&intel_iommu_memory_nb);
3886
3887         down_read(&dmar_global_lock);
3888         if (probe_acpi_namespace_devices())
3889                 pr_warn("ACPI name space devices didn't probe correctly\n");
3890
3891         /* Finally, we enable the DMA remapping hardware. */
3892         for_each_iommu(iommu, drhd) {
3893                 if (!drhd->ignored && !translation_pre_enabled(iommu))
3894                         iommu_enable_translation(iommu);
3895
3896                 iommu_disable_protect_mem_regions(iommu);
3897         }
3898         up_read(&dmar_global_lock);
3899
3900         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3901
3902         intel_iommu_enabled = 1;
3903
3904         return 0;
3905
3906 out_free_dmar:
3907         intel_iommu_free_dmars();
3908         up_write(&dmar_global_lock);
3909         return ret;
3910 }
3911
3912 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3913 {
3914         struct device_domain_info *info = opaque;
3915
3916         domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3917         return 0;
3918 }
3919
3920 /*
3921  * NB - intel-iommu lacks any sort of reference counting for the users of
3922  * dependent devices.  If multiple endpoints have intersecting dependent
3923  * devices, unbinding the driver from any one of them will possibly leave
3924  * the others unable to operate.
3925  */
3926 static void domain_context_clear(struct device_domain_info *info)
3927 {
3928         if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
3929                 return;
3930
3931         pci_for_each_dma_alias(to_pci_dev(info->dev),
3932                                &domain_context_clear_one_cb, info);
3933 }
3934
3935 static void dmar_remove_one_dev_info(struct device *dev)
3936 {
3937         struct device_domain_info *info = dev_iommu_priv_get(dev);
3938         struct dmar_domain *domain = info->domain;
3939         struct intel_iommu *iommu = info->iommu;
3940         unsigned long flags;
3941
3942         if (!dev_is_real_dma_subdevice(info->dev)) {
3943                 if (dev_is_pci(info->dev) && sm_supported(iommu))
3944                         intel_pasid_tear_down_entry(iommu, info->dev,
3945                                         IOMMU_NO_PASID, false);
3946
3947                 iommu_disable_pci_caps(info);
3948                 domain_context_clear(info);
3949         }
3950
3951         spin_lock_irqsave(&domain->lock, flags);
3952         list_del(&info->link);
3953         spin_unlock_irqrestore(&domain->lock, flags);
3954
3955         domain_detach_iommu(domain, iommu);
3956         info->domain = NULL;
3957 }
3958
3959 /*
3960  * Clear the page table pointer in context or pasid table entries so that
3961  * all DMA requests without PASID from the device are blocked. If the page
3962  * table has been set, clean up the data structures.
3963  */
3964 void device_block_translation(struct device *dev)
3965 {
3966         struct device_domain_info *info = dev_iommu_priv_get(dev);
3967         struct intel_iommu *iommu = info->iommu;
3968         unsigned long flags;
3969
3970         iommu_disable_pci_caps(info);
3971         if (!dev_is_real_dma_subdevice(dev)) {
3972                 if (sm_supported(iommu))
3973                         intel_pasid_tear_down_entry(iommu, dev,
3974                                                     IOMMU_NO_PASID, false);
3975                 else
3976                         domain_context_clear(info);
3977         }
3978
3979         if (!info->domain)
3980                 return;
3981
3982         spin_lock_irqsave(&info->domain->lock, flags);
3983         list_del(&info->link);
3984         spin_unlock_irqrestore(&info->domain->lock, flags);
3985
3986         domain_detach_iommu(info->domain, iommu);
3987         info->domain = NULL;
3988 }
3989
3990 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3991 {
3992         int adjust_width;
3993
3994         /* calculate AGAW */
3995         domain->gaw = guest_width;
3996         adjust_width = guestwidth_to_adjustwidth(guest_width);
3997         domain->agaw = width_to_agaw(adjust_width);
3998
3999         domain->iommu_coherency = false;
4000         domain->iommu_superpage = 0;
4001         domain->max_addr = 0;
4002
4003         /* always allocate the top pgd */
4004         domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4005         if (!domain->pgd)
4006                 return -ENOMEM;
4007         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4008         return 0;
4009 }
4010
/*
 * attach_dev handler of the blocking domain: block translation for @dev.
 * @domain is unused.  Always returns 0.
 */
static int blocking_domain_attach_dev(struct iommu_domain *domain,
				      struct device *dev)
{
	device_block_translation(dev);

	return 0;
}
4017
4018 static struct iommu_domain blocking_domain = {
4019         .ops = &(const struct iommu_domain_ops) {
4020                 .attach_dev     = blocking_domain_attach_dev,
4021                 .free           = intel_iommu_domain_free
4022         }
4023 };
4024
4025 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4026 {
4027         struct dmar_domain *dmar_domain;
4028         struct iommu_domain *domain;
4029
4030         switch (type) {
4031         case IOMMU_DOMAIN_BLOCKED:
4032                 return &blocking_domain;
4033         case IOMMU_DOMAIN_DMA:
4034         case IOMMU_DOMAIN_UNMANAGED:
4035                 dmar_domain = alloc_domain(type);
4036                 if (!dmar_domain) {
4037                         pr_err("Can't allocate dmar_domain\n");
4038                         return NULL;
4039                 }
4040                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4041                         pr_err("Domain initialization failed\n");
4042                         domain_exit(dmar_domain);
4043                         return NULL;
4044                 }
4045
4046                 domain = &dmar_domain->domain;
4047                 domain->geometry.aperture_start = 0;
4048                 domain->geometry.aperture_end   =
4049                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4050                 domain->geometry.force_aperture = true;
4051
4052                 return domain;
4053         case IOMMU_DOMAIN_IDENTITY:
4054                 return &si_domain->domain;
4055         case IOMMU_DOMAIN_SVA:
4056                 return intel_svm_domain_alloc();
4057         default:
4058                 return NULL;
4059         }
4060
4061         return NULL;
4062 }
4063
4064 static struct iommu_domain *
4065 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
4066                               struct iommu_domain *parent,
4067                               const struct iommu_user_data *user_data)
4068 {
4069         struct device_domain_info *info = dev_iommu_priv_get(dev);
4070         bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
4071         bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
4072         struct intel_iommu *iommu = info->iommu;
4073         struct iommu_domain *domain;
4074
4075         /* Must be NESTING domain */
4076         if (parent) {
4077                 if (!nested_supported(iommu) || flags)
4078                         return ERR_PTR(-EOPNOTSUPP);
4079                 return intel_nested_domain_alloc(parent, user_data);
4080         }
4081
4082         if (flags &
4083             (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
4084                 return ERR_PTR(-EOPNOTSUPP);
4085         if (nested_parent && !nested_supported(iommu))
4086                 return ERR_PTR(-EOPNOTSUPP);
4087         if (user_data || (dirty_tracking && !ssads_supported(iommu)))
4088                 return ERR_PTR(-EOPNOTSUPP);
4089
4090         /*
4091          * domain_alloc_user op needs to fully initialize a domain before
4092          * return, so uses iommu_domain_alloc() here for simple.
4093          */
4094         domain = iommu_domain_alloc(dev->bus);
4095         if (!domain)
4096                 return ERR_PTR(-ENOMEM);
4097
4098         if (nested_parent)
4099                 to_dmar_domain(domain)->nested_parent = true;
4100
4101         if (dirty_tracking) {
4102                 if (to_dmar_domain(domain)->use_first_level) {
4103                         iommu_domain_free(domain);
4104                         return ERR_PTR(-EOPNOTSUPP);
4105                 }
4106                 domain->dirty_ops = &intel_dirty_ops;
4107         }
4108
4109         return domain;
4110 }
4111
4112 static void intel_iommu_domain_free(struct iommu_domain *domain)
4113 {
4114         if (domain != &si_domain->domain && domain != &blocking_domain)
4115                 domain_exit(to_dmar_domain(domain));
4116 }
4117
4118 int prepare_domain_attach_device(struct iommu_domain *domain,
4119                                  struct device *dev)
4120 {
4121         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4122         struct intel_iommu *iommu;
4123         int addr_width;
4124
4125         iommu = device_to_iommu(dev, NULL, NULL);
4126         if (!iommu)
4127                 return -ENODEV;
4128
4129         if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4130                 return -EINVAL;
4131
4132         if (domain->dirty_ops && !ssads_supported(iommu))
4133                 return -EINVAL;
4134
4135         /* check if this iommu agaw is sufficient for max mapped address */
4136         addr_width = agaw_to_width(iommu->agaw);
4137         if (addr_width > cap_mgaw(iommu->cap))
4138                 addr_width = cap_mgaw(iommu->cap);
4139
4140         if (dmar_domain->max_addr > (1LL << addr_width))
4141                 return -EINVAL;
4142         dmar_domain->gaw = addr_width;
4143
4144         /*
4145          * Knock out extra levels of page tables if necessary
4146          */
4147         while (iommu->agaw < dmar_domain->agaw) {
4148                 struct dma_pte *pte;
4149
4150                 pte = dmar_domain->pgd;
4151                 if (dma_pte_present(pte)) {
4152                         dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4153                         free_pgtable_page(pte);
4154                 }
4155                 dmar_domain->agaw--;
4156         }
4157
4158         return 0;
4159 }
4160
4161 static int intel_iommu_attach_device(struct iommu_domain *domain,
4162                                      struct device *dev)
4163 {
4164         struct device_domain_info *info = dev_iommu_priv_get(dev);
4165         int ret;
4166
4167         if (info->domain)
4168                 device_block_translation(dev);
4169
4170         ret = prepare_domain_attach_device(domain, dev);
4171         if (ret)
4172                 return ret;
4173
4174         return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4175 }
4176
4177 static int intel_iommu_map(struct iommu_domain *domain,
4178                            unsigned long iova, phys_addr_t hpa,
4179                            size_t size, int iommu_prot, gfp_t gfp)
4180 {
4181         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4182         u64 max_addr;
4183         int prot = 0;
4184
4185         if (iommu_prot & IOMMU_READ)
4186                 prot |= DMA_PTE_READ;
4187         if (iommu_prot & IOMMU_WRITE)
4188                 prot |= DMA_PTE_WRITE;
4189         if (dmar_domain->set_pte_snp)
4190                 prot |= DMA_PTE_SNP;
4191
4192         max_addr = iova + size;
4193         if (dmar_domain->max_addr < max_addr) {
4194                 u64 end;
4195
4196                 /* check if minimum agaw is sufficient for mapped address */
4197                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4198                 if (end < max_addr) {
4199                         pr_err("%s: iommu width (%d) is not "
4200                                "sufficient for the mapped address (%llx)\n",
4201                                __func__, dmar_domain->gaw, max_addr);
4202                         return -EFAULT;
4203                 }
4204                 dmar_domain->max_addr = max_addr;
4205         }
4206         /* Round up size to next multiple of PAGE_SIZE, if it and
4207            the low bits of hpa would take us onto the next page */
4208         size = aligned_nrpages(hpa, size);
4209         return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4210                                 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4211 }
4212
4213 static int intel_iommu_map_pages(struct iommu_domain *domain,
4214                                  unsigned long iova, phys_addr_t paddr,
4215                                  size_t pgsize, size_t pgcount,
4216                                  int prot, gfp_t gfp, size_t *mapped)
4217 {
4218         unsigned long pgshift = __ffs(pgsize);
4219         size_t size = pgcount << pgshift;
4220         int ret;
4221
4222         if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4223                 return -EINVAL;
4224
4225         if (!IS_ALIGNED(iova | paddr, pgsize))
4226                 return -EINVAL;
4227
4228         ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4229         if (!ret && mapped)
4230                 *mapped = size;
4231
4232         return ret;
4233 }
4234
4235 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4236                                 unsigned long iova, size_t size,
4237                                 struct iommu_iotlb_gather *gather)
4238 {
4239         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4240         unsigned long start_pfn, last_pfn;
4241         int level = 0;
4242
4243         /* Cope with horrid API which requires us to unmap more than the
4244            size argument if it happens to be a large-page mapping. */
4245         if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4246                                      &level, GFP_ATOMIC)))
4247                 return 0;
4248
4249         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4250                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4251
4252         start_pfn = iova >> VTD_PAGE_SHIFT;
4253         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4254
4255         domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4256
4257         if (dmar_domain->max_addr == iova + size)
4258                 dmar_domain->max_addr = iova;
4259
4260         /*
4261          * We do not use page-selective IOTLB invalidation in flush queue,
4262          * so there is no need to track page and sync iotlb.
4263          */
4264         if (!iommu_iotlb_gather_queued(gather))
4265                 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4266
4267         return size;
4268 }
4269
4270 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4271                                       unsigned long iova,
4272                                       size_t pgsize, size_t pgcount,
4273                                       struct iommu_iotlb_gather *gather)
4274 {
4275         unsigned long pgshift = __ffs(pgsize);
4276         size_t size = pgcount << pgshift;
4277
4278         return intel_iommu_unmap(domain, iova, size, gather);
4279 }
4280
4281 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4282                                  struct iommu_iotlb_gather *gather)
4283 {
4284         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4285         unsigned long iova_pfn = IOVA_PFN(gather->start);
4286         size_t size = gather->end - gather->start;
4287         struct iommu_domain_info *info;
4288         unsigned long start_pfn;
4289         unsigned long nrpages;
4290         unsigned long i;
4291
4292         nrpages = aligned_nrpages(gather->start, size);
4293         start_pfn = mm_to_dma_pfn_start(iova_pfn);
4294
4295         xa_for_each(&dmar_domain->iommu_array, i, info)
4296                 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4297                                       start_pfn, nrpages,
4298                                       list_empty(&gather->freelist), 0);
4299
4300         put_pages_list(&gather->freelist);
4301 }
4302
4303 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4304                                             dma_addr_t iova)
4305 {
4306         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4307         struct dma_pte *pte;
4308         int level = 0;
4309         u64 phys = 0;
4310
4311         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4312                              GFP_ATOMIC);
4313         if (pte && dma_pte_present(pte))
4314                 phys = dma_pte_addr(pte) +
4315                         (iova & (BIT_MASK(level_to_offset_bits(level) +
4316                                                 VTD_PAGE_SHIFT) - 1));
4317
4318         return phys;
4319 }
4320
4321 static bool domain_support_force_snooping(struct dmar_domain *domain)
4322 {
4323         struct device_domain_info *info;
4324         bool support = true;
4325
4326         assert_spin_locked(&domain->lock);
4327         list_for_each_entry(info, &domain->devices, link) {
4328                 if (!ecap_sc_support(info->iommu->ecap)) {
4329                         support = false;
4330                         break;
4331                 }
4332         }
4333
4334         return support;
4335 }
4336
4337 static void domain_set_force_snooping(struct dmar_domain *domain)
4338 {
4339         struct device_domain_info *info;
4340
4341         assert_spin_locked(&domain->lock);
4342         /*
4343          * Second level page table supports per-PTE snoop control. The
4344          * iommu_map() interface will handle this by setting SNP bit.
4345          */
4346         if (!domain->use_first_level) {
4347                 domain->set_pte_snp = true;
4348                 return;
4349         }
4350
4351         list_for_each_entry(info, &domain->devices, link)
4352                 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4353                                                      IOMMU_NO_PASID);
4354 }
4355
4356 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4357 {
4358         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4359         unsigned long flags;
4360
4361         if (dmar_domain->force_snooping)
4362                 return true;
4363
4364         spin_lock_irqsave(&dmar_domain->lock, flags);
4365         if (!domain_support_force_snooping(dmar_domain)) {
4366                 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4367                 return false;
4368         }
4369
4370         domain_set_force_snooping(dmar_domain);
4371         dmar_domain->force_snooping = true;
4372         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4373
4374         return true;
4375 }
4376
4377 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4378 {
4379         struct device_domain_info *info = dev_iommu_priv_get(dev);
4380
4381         switch (cap) {
4382         case IOMMU_CAP_CACHE_COHERENCY:
4383         case IOMMU_CAP_DEFERRED_FLUSH:
4384                 return true;
4385         case IOMMU_CAP_PRE_BOOT_PROTECTION:
4386                 return dmar_platform_optin();
4387         case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4388                 return ecap_sc_support(info->iommu->ecap);
4389         case IOMMU_CAP_DIRTY_TRACKING:
4390                 return ssads_supported(info->iommu);
4391         default:
4392                 return false;
4393         }
4394 }
4395
4396 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4397 {
4398         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4399         struct device_domain_info *info;
4400         struct intel_iommu *iommu;
4401         u8 bus, devfn;
4402         int ret;
4403
4404         iommu = device_to_iommu(dev, &bus, &devfn);
4405         if (!iommu || !iommu->iommu.ops)
4406                 return ERR_PTR(-ENODEV);
4407
4408         info = kzalloc(sizeof(*info), GFP_KERNEL);
4409         if (!info)
4410                 return ERR_PTR(-ENOMEM);
4411
4412         if (dev_is_real_dma_subdevice(dev)) {
4413                 info->bus = pdev->bus->number;
4414                 info->devfn = pdev->devfn;
4415                 info->segment = pci_domain_nr(pdev->bus);
4416         } else {
4417                 info->bus = bus;
4418                 info->devfn = devfn;
4419                 info->segment = iommu->segment;
4420         }
4421
4422         info->dev = dev;
4423         info->iommu = iommu;
4424         if (dev_is_pci(dev)) {
4425                 if (ecap_dev_iotlb_support(iommu->ecap) &&
4426                     pci_ats_supported(pdev) &&
4427                     dmar_ats_supported(pdev, iommu)) {
4428                         info->ats_supported = 1;
4429                         info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4430
4431                         /*
4432                          * For IOMMU that supports device IOTLB throttling
4433                          * (DIT), we assign PFSID to the invalidation desc
4434                          * of a VF such that IOMMU HW can gauge queue depth
4435                          * at PF level. If DIT is not set, PFSID will be
4436                          * treated as reserved, which should be set to 0.
4437                          */
4438                         if (ecap_dit(iommu->ecap))
4439                                 info->pfsid = pci_dev_id(pci_physfn(pdev));
4440                         info->ats_qdep = pci_ats_queue_depth(pdev);
4441                 }
4442                 if (sm_supported(iommu)) {
4443                         if (pasid_supported(iommu)) {
4444                                 int features = pci_pasid_features(pdev);
4445
4446                                 if (features >= 0)
4447                                         info->pasid_supported = features | 1;
4448                         }
4449
4450                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
4451                             pci_pri_supported(pdev))
4452                                 info->pri_supported = 1;
4453                 }
4454         }
4455
4456         dev_iommu_priv_set(dev, info);
4457
4458         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4459                 ret = intel_pasid_alloc_table(dev);
4460                 if (ret) {
4461                         dev_err(dev, "PASID table allocation failed\n");
4462                         dev_iommu_priv_set(dev, NULL);
4463                         kfree(info);
4464                         return ERR_PTR(ret);
4465                 }
4466         }
4467
4468         return &iommu->iommu;
4469 }
4470
4471 static void intel_iommu_release_device(struct device *dev)
4472 {
4473         struct device_domain_info *info = dev_iommu_priv_get(dev);
4474
4475         dmar_remove_one_dev_info(dev);
4476         intel_pasid_free_table(dev);
4477         dev_iommu_priv_set(dev, NULL);
4478         kfree(info);
4479         set_dma_ops(dev, NULL);
4480 }
4481
4482 static void intel_iommu_probe_finalize(struct device *dev)
4483 {
4484         set_dma_ops(dev, NULL);
4485         iommu_setup_dma_ops(dev, 0, U64_MAX);
4486 }
4487
4488 static void intel_iommu_get_resv_regions(struct device *device,
4489                                          struct list_head *head)
4490 {
4491         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4492         struct iommu_resv_region *reg;
4493         struct dmar_rmrr_unit *rmrr;
4494         struct device *i_dev;
4495         int i;
4496
4497         rcu_read_lock();
4498         for_each_rmrr_units(rmrr) {
4499                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4500                                           i, i_dev) {
4501                         struct iommu_resv_region *resv;
4502                         enum iommu_resv_type type;
4503                         size_t length;
4504
4505                         if (i_dev != device &&
4506                             !is_downstream_to_pci_bridge(device, i_dev))
4507                                 continue;
4508
4509                         length = rmrr->end_address - rmrr->base_address + 1;
4510
4511                         type = device_rmrr_is_relaxable(device) ?
4512                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4513
4514                         resv = iommu_alloc_resv_region(rmrr->base_address,
4515                                                        length, prot, type,
4516                                                        GFP_ATOMIC);
4517                         if (!resv)
4518                                 break;
4519
4520                         list_add_tail(&resv->list, head);
4521                 }
4522         }
4523         rcu_read_unlock();
4524
4525 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4526         if (dev_is_pci(device)) {
4527                 struct pci_dev *pdev = to_pci_dev(device);
4528
4529                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4530                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4531                                         IOMMU_RESV_DIRECT_RELAXABLE,
4532                                         GFP_KERNEL);
4533                         if (reg)
4534                                 list_add_tail(&reg->list, head);
4535                 }
4536         }
4537 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4538
4539         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4540                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4541                                       0, IOMMU_RESV_MSI, GFP_KERNEL);
4542         if (!reg)
4543                 return;
4544         list_add_tail(&reg->list, head);
4545 }
4546
/*
 * device_group callback: PCI devices are grouped by pci_device_group(),
 * all other devices by generic_device_group().
 */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev)
			       : generic_device_group(dev);
}
4553
4554 static int intel_iommu_enable_sva(struct device *dev)
4555 {
4556         struct device_domain_info *info = dev_iommu_priv_get(dev);
4557         struct intel_iommu *iommu;
4558
4559         if (!info || dmar_disabled)
4560                 return -EINVAL;
4561
4562         iommu = info->iommu;
4563         if (!iommu)
4564                 return -EINVAL;
4565
4566         if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4567                 return -ENODEV;
4568
4569         if (!info->pasid_enabled || !info->ats_enabled)
4570                 return -EINVAL;
4571
4572         /*
4573          * Devices having device-specific I/O fault handling should not
4574          * support PCI/PRI. The IOMMU side has no means to check the
4575          * capability of device-specific IOPF.  Therefore, IOMMU can only
4576          * default that if the device driver enables SVA on a non-PRI
4577          * device, it will handle IOPF in its own way.
4578          */
4579         if (!info->pri_supported)
4580                 return 0;
4581
4582         /* Devices supporting PRI should have it enabled. */
4583         if (!info->pri_enabled)
4584                 return -EINVAL;
4585
4586         return 0;
4587 }
4588
4589 static int intel_iommu_enable_iopf(struct device *dev)
4590 {
4591         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4592         struct device_domain_info *info = dev_iommu_priv_get(dev);
4593         struct intel_iommu *iommu;
4594         int ret;
4595
4596         if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4597                 return -ENODEV;
4598
4599         if (info->pri_enabled)
4600                 return -EBUSY;
4601
4602         iommu = info->iommu;
4603         if (!iommu)
4604                 return -EINVAL;
4605
4606         /* PASID is required in PRG Response Message. */
4607         if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4608                 return -EINVAL;
4609
4610         ret = pci_reset_pri(pdev);
4611         if (ret)
4612                 return ret;
4613
4614         ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4615         if (ret)
4616                 return ret;
4617
4618         ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4619         if (ret)
4620                 goto iopf_remove_device;
4621
4622         ret = pci_enable_pri(pdev, PRQ_DEPTH);
4623         if (ret)
4624                 goto iopf_unregister_handler;
4625         info->pri_enabled = 1;
4626
4627         return 0;
4628
4629 iopf_unregister_handler:
4630         iommu_unregister_device_fault_handler(dev);
4631 iopf_remove_device:
4632         iopf_queue_remove_device(iommu->iopf_queue, dev);
4633
4634         return ret;
4635 }
4636
4637 static int intel_iommu_disable_iopf(struct device *dev)
4638 {
4639         struct device_domain_info *info = dev_iommu_priv_get(dev);
4640         struct intel_iommu *iommu = info->iommu;
4641
4642         if (!info->pri_enabled)
4643                 return -EINVAL;
4644
4645         /*
4646          * PCIe spec states that by clearing PRI enable bit, the Page
4647          * Request Interface will not issue new page requests, but has
4648          * outstanding page requests that have been transmitted or are
4649          * queued for transmission. This is supposed to be called after
4650          * the device driver has stopped DMA, all PASIDs have been
4651          * unbound and the outstanding PRQs have been drained.
4652          */
4653         pci_disable_pri(to_pci_dev(dev));
4654         info->pri_enabled = 0;
4655
4656         /*
4657          * With PRI disabled and outstanding PRQs drained, unregistering
4658          * fault handler and removing device from iopf queue should never
4659          * fail.
4660          */
4661         WARN_ON(iommu_unregister_device_fault_handler(dev));
4662         WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4663
4664         return 0;
4665 }
4666
4667 static int
4668 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4669 {
4670         switch (feat) {
4671         case IOMMU_DEV_FEAT_IOPF:
4672                 return intel_iommu_enable_iopf(dev);
4673
4674         case IOMMU_DEV_FEAT_SVA:
4675                 return intel_iommu_enable_sva(dev);
4676
4677         default:
4678                 return -ENODEV;
4679         }
4680 }
4681
4682 static int
4683 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4684 {
4685         switch (feat) {
4686         case IOMMU_DEV_FEAT_IOPF:
4687                 return intel_iommu_disable_iopf(dev);
4688
4689         case IOMMU_DEV_FEAT_SVA:
4690                 return 0;
4691
4692         default:
4693                 return -ENODEV;
4694         }
4695 }
4696
4697 static bool intel_iommu_is_attach_deferred(struct device *dev)
4698 {
4699         struct device_domain_info *info = dev_iommu_priv_get(dev);
4700
4701         return translation_pre_enabled(info->iommu) && !info->domain;
4702 }
4703
4704 /*
4705  * Check that the device does not live on an external facing PCI port that is
4706  * marked as untrusted. Such devices should not be able to apply quirks and
4707  * thus not be able to bypass the IOMMU restrictions.
4708  */
4709 static bool risky_device(struct pci_dev *pdev)
4710 {
4711         if (pdev->untrusted) {
4712                 pci_info(pdev,
4713                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4714                          pdev->vendor, pdev->device);
4715                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4716                 return true;
4717         }
4718         return false;
4719 }
4720
4721 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4722                                        unsigned long iova, size_t size)
4723 {
4724         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4725         unsigned long pages = aligned_nrpages(iova, size);
4726         unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4727         struct iommu_domain_info *info;
4728         unsigned long i;
4729
4730         xa_for_each(&dmar_domain->iommu_array, i, info)
4731                 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4732 }
4733
4734 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4735 {
4736         struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4737         struct dev_pasid_info *curr, *dev_pasid = NULL;
4738         struct dmar_domain *dmar_domain;
4739         struct iommu_domain *domain;
4740         unsigned long flags;
4741
4742         domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4743         if (WARN_ON_ONCE(!domain))
4744                 goto out_tear_down;
4745
4746         /*
4747          * The SVA implementation needs to handle its own stuffs like the mm
4748          * notification. Before consolidating that code into iommu core, let
4749          * the intel sva code handle it.
4750          */
4751         if (domain->type == IOMMU_DOMAIN_SVA) {
4752                 intel_svm_remove_dev_pasid(dev, pasid);
4753                 goto out_tear_down;
4754         }
4755
4756         dmar_domain = to_dmar_domain(domain);
4757         spin_lock_irqsave(&dmar_domain->lock, flags);
4758         list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4759                 if (curr->dev == dev && curr->pasid == pasid) {
4760                         list_del(&curr->link_domain);
4761                         dev_pasid = curr;
4762                         break;
4763                 }
4764         }
4765         WARN_ON_ONCE(!dev_pasid);
4766         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4767
4768         domain_detach_iommu(dmar_domain, iommu);
4769         kfree(dev_pasid);
4770 out_tear_down:
4771         intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4772         intel_drain_pasid_prq(dev, pasid);
4773 }
4774
4775 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4776                                      struct device *dev, ioasid_t pasid)
4777 {
4778         struct device_domain_info *info = dev_iommu_priv_get(dev);
4779         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4780         struct intel_iommu *iommu = info->iommu;
4781         struct dev_pasid_info *dev_pasid;
4782         unsigned long flags;
4783         int ret;
4784
4785         if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4786                 return -EOPNOTSUPP;
4787
4788         if (domain->dirty_ops)
4789                 return -EINVAL;
4790
4791         if (context_copied(iommu, info->bus, info->devfn))
4792                 return -EBUSY;
4793
4794         ret = prepare_domain_attach_device(domain, dev);
4795         if (ret)
4796                 return ret;
4797
4798         dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4799         if (!dev_pasid)
4800                 return -ENOMEM;
4801
4802         ret = domain_attach_iommu(dmar_domain, iommu);
4803         if (ret)
4804                 goto out_free;
4805
4806         if (domain_type_is_si(dmar_domain))
4807                 ret = intel_pasid_setup_pass_through(iommu, dmar_domain,
4808                                                      dev, pasid);
4809         else if (dmar_domain->use_first_level)
4810                 ret = domain_setup_first_level(iommu, dmar_domain,
4811                                                dev, pasid);
4812         else
4813                 ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4814                                                      dev, pasid);
4815         if (ret)
4816                 goto out_detach_iommu;
4817
4818         dev_pasid->dev = dev;
4819         dev_pasid->pasid = pasid;
4820         spin_lock_irqsave(&dmar_domain->lock, flags);
4821         list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4822         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4823
4824         return 0;
4825 out_detach_iommu:
4826         domain_detach_iommu(dmar_domain, iommu);
4827 out_free:
4828         kfree(dev_pasid);
4829         return ret;
4830 }
4831
4832 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4833 {
4834         struct device_domain_info *info = dev_iommu_priv_get(dev);
4835         struct intel_iommu *iommu = info->iommu;
4836         struct iommu_hw_info_vtd *vtd;
4837
4838         vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4839         if (!vtd)
4840                 return ERR_PTR(-ENOMEM);
4841
4842         vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4843         vtd->cap_reg = iommu->cap;
4844         vtd->ecap_reg = iommu->ecap;
4845         *length = sizeof(*vtd);
4846         *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4847         return vtd;
4848 }
4849
4850 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4851                                           bool enable)
4852 {
4853         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4854         struct device_domain_info *info;
4855         int ret;
4856
4857         spin_lock(&dmar_domain->lock);
4858         if (dmar_domain->dirty_tracking == enable)
4859                 goto out_unlock;
4860
4861         list_for_each_entry(info, &dmar_domain->devices, link) {
4862                 ret = intel_pasid_setup_dirty_tracking(info->iommu,
4863                                                        info->domain, info->dev,
4864                                                        IOMMU_NO_PASID, enable);
4865                 if (ret)
4866                         goto err_unwind;
4867         }
4868
4869         dmar_domain->dirty_tracking = enable;
4870 out_unlock:
4871         spin_unlock(&dmar_domain->lock);
4872
4873         return 0;
4874
4875 err_unwind:
4876         list_for_each_entry(info, &dmar_domain->devices, link)
4877                 intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
4878                                                  info->dev, IOMMU_NO_PASID,
4879                                                  dmar_domain->dirty_tracking);
4880         spin_unlock(&dmar_domain->lock);
4881         return ret;
4882 }
4883
4884 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4885                                             unsigned long iova, size_t size,
4886                                             unsigned long flags,
4887                                             struct iommu_dirty_bitmap *dirty)
4888 {
4889         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4890         unsigned long end = iova + size - 1;
4891         unsigned long pgsize;
4892
4893         /*
4894          * IOMMUFD core calls into a dirty tracking disabled domain without an
4895          * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4896          * have occurred when we stopped dirty tracking. This ensures that we
4897          * never inherit dirtied bits from a previous cycle.
4898          */
4899         if (!dmar_domain->dirty_tracking && dirty->bitmap)
4900                 return -EINVAL;
4901
4902         do {
4903                 struct dma_pte *pte;
4904                 int lvl = 0;
4905
4906                 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4907                                      GFP_ATOMIC);
4908                 pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4909                 if (!pte || !dma_pte_present(pte)) {
4910                         iova += pgsize;
4911                         continue;
4912                 }
4913
4914                 if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4915                         iommu_dirty_bitmap_record(dirty, iova, pgsize);
4916                 iova += pgsize;
4917         } while (iova < end);
4918
4919         return 0;
4920 }
4921
4922 const struct iommu_dirty_ops intel_dirty_ops = {
4923         .set_dirty_tracking = intel_iommu_set_dirty_tracking,
4924         .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4925 };
4926
4927 const struct iommu_ops intel_iommu_ops = {
4928         .capable                = intel_iommu_capable,
4929         .hw_info                = intel_iommu_hw_info,
4930         .domain_alloc           = intel_iommu_domain_alloc,
4931         .domain_alloc_user      = intel_iommu_domain_alloc_user,
4932         .probe_device           = intel_iommu_probe_device,
4933         .probe_finalize         = intel_iommu_probe_finalize,
4934         .release_device         = intel_iommu_release_device,
4935         .get_resv_regions       = intel_iommu_get_resv_regions,
4936         .device_group           = intel_iommu_device_group,
4937         .dev_enable_feat        = intel_iommu_dev_enable_feat,
4938         .dev_disable_feat       = intel_iommu_dev_disable_feat,
4939         .is_attach_deferred     = intel_iommu_is_attach_deferred,
4940         .def_domain_type        = device_def_domain_type,
4941         .remove_dev_pasid       = intel_iommu_remove_dev_pasid,
4942         .pgsize_bitmap          = SZ_4K,
4943 #ifdef CONFIG_INTEL_IOMMU_SVM
4944         .page_response          = intel_svm_page_response,
4945 #endif
4946         .default_domain_ops = &(const struct iommu_domain_ops) {
4947                 .attach_dev             = intel_iommu_attach_device,
4948                 .set_dev_pasid          = intel_iommu_set_dev_pasid,
4949                 .map_pages              = intel_iommu_map_pages,
4950                 .unmap_pages            = intel_iommu_unmap_pages,
4951                 .iotlb_sync_map         = intel_iommu_iotlb_sync_map,
4952                 .flush_iotlb_all        = intel_flush_iotlb_all,
4953                 .iotlb_sync             = intel_iommu_tlb_sync,
4954                 .iova_to_phys           = intel_iommu_iova_to_phys,
4955                 .free                   = intel_iommu_domain_free,
4956                 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4957         }
4958 };
4959
4960 static void quirk_iommu_igfx(struct pci_dev *dev)
4961 {
4962         if (risky_device(dev))
4963                 return;
4964
4965         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4966         dmar_map_gfx = 0;
4967 }
4968
4969 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4970 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4971 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4972 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4973 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4974 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4975 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4976 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4977
4978 /* Broadwell igfx malfunctions with dmar */
4979 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4980 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4981 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4982 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4983 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4984 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4985 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4986 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4987 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4988 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4989 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4990 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4991 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4992 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4993 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4994 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4995 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4996 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4997 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4998 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4999 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5000 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5001 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5002 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5003
5004 static void quirk_iommu_rwbf(struct pci_dev *dev)
5005 {
5006         if (risky_device(dev))
5007                 return;
5008
5009         /*
5010          * Mobile 4 Series Chipset neglects to set RWBF capability,
5011          * but needs it. Same seems to hold for the desktop versions.
5012          */
5013         pci_info(dev, "Forcing write-buffer flush capability\n");
5014         rwbf_quirk = 1;
5015 }
5016
5017 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5018 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5019 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5020 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5021 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5022 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5023 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5024
5025 #define GGC 0x52
5026 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5027 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5028 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5029 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5030 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5031 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5032 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5033 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5034
5035 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5036 {
5037         unsigned short ggc;
5038
5039         if (risky_device(dev))
5040                 return;
5041
5042         if (pci_read_config_word(dev, GGC, &ggc))
5043                 return;
5044
5045         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5046                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5047                 dmar_map_gfx = 0;
5048         } else if (dmar_map_gfx) {
5049                 /* we have to ensure the gfx device is idle before we flush */
5050                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5051                 iommu_set_dma_strict();
5052         }
5053 }
5054 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5055 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5056 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5057 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5058
5059 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5060 {
5061         unsigned short ver;
5062
5063         if (!IS_GFX_DEVICE(dev))
5064                 return;
5065
5066         ver = (dev->device >> 8) & 0xff;
5067         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5068             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5069             ver != 0x9a && ver != 0xa7)
5070                 return;
5071
5072         if (risky_device(dev))
5073                 return;
5074
5075         pci_info(dev, "Skip IOMMU disabling for graphics\n");
5076         iommu_skip_te_disable = 1;
5077 }
5078 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5079
5080 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5081    ISOCH DMAR unit for the Azalia sound device, but not give it any
5082    TLB entries, which causes it to deadlock. Check for that.  We do
5083    this in a function called from init_dmars(), instead of in a PCI
5084    quirk, because we don't want to print the obnoxious "BIOS broken"
5085    message if VT-d is actually disabled.
5086 */
5087 static void __init check_tylersburg_isoch(void)
5088 {
5089         struct pci_dev *pdev;
5090         uint32_t vtisochctrl;
5091
5092         /* If there's no Azalia in the system anyway, forget it. */
5093         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5094         if (!pdev)
5095                 return;
5096
5097         if (risky_device(pdev)) {
5098                 pci_dev_put(pdev);
5099                 return;
5100         }
5101
5102         pci_dev_put(pdev);
5103
5104         /* System Management Registers. Might be hidden, in which case
5105            we can't do the sanity check. But that's OK, because the
5106            known-broken BIOSes _don't_ actually hide it, so far. */
5107         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5108         if (!pdev)
5109                 return;
5110
5111         if (risky_device(pdev)) {
5112                 pci_dev_put(pdev);
5113                 return;
5114         }
5115
5116         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5117                 pci_dev_put(pdev);
5118                 return;
5119         }
5120
5121         pci_dev_put(pdev);
5122
5123         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5124         if (vtisochctrl & 1)
5125                 return;
5126
5127         /* Drop all bits other than the number of TLB entries */
5128         vtisochctrl &= 0x1c;
5129
5130         /* If we have the recommended number of TLB entries (16), fine. */
5131         if (vtisochctrl == 0x10)
5132                 return;
5133
5134         /* Zero TLB entries? You get to ride the short bus to school. */
5135         if (!vtisochctrl) {
5136                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5137                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5138                      dmi_get_system_info(DMI_BIOS_VENDOR),
5139                      dmi_get_system_info(DMI_BIOS_VERSION),
5140                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5141                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5142                 return;
5143         }
5144
5145         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5146                vtisochctrl);
5147 }
5148
5149 /*
5150  * Here we deal with a device TLB defect where device may inadvertently issue ATS
5151  * invalidation completion before posted writes initiated with translated address
5152  * that utilized translations matching the invalidation address range, violating
5153  * the invalidation completion ordering.
5154  * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
5155  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
5156  * under the control of the trusted/privileged host device driver must use this
5157  * quirk.
5158  * Device TLBs are invalidated under the following six conditions:
5159  * 1. Device driver does DMA API unmap IOVA
5160  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
5161  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5162  *    exit_mmap() due to crash
5163  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5164  *    VM has to free pages that were unmapped
5165  * 5. Userspace driver unmaps a DMA buffer
5166  * 6. Cache invalidation in vSVA usage (upcoming)
5167  *
5168  * For #1 and #2, device drivers are responsible for stopping DMA traffic
5169  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
5170  * invalidate TLB the same way as normal user unmap which will use this quirk.
5171  * The dTLB invalidation after PASID cache flush does not need this quirk.
5172  *
5173  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5174  */
5175 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5176                                unsigned long address, unsigned long mask,
5177                                u32 pasid, u16 qdep)
5178 {
5179         u16 sid;
5180
5181         if (likely(!info->dtlb_extra_inval))
5182                 return;
5183
5184         sid = PCI_DEVID(info->bus, info->devfn);
5185         if (pasid == IOMMU_NO_PASID) {
5186                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5187                                    qdep, address, mask);
5188         } else {
5189                 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5190                                          pasid, qdep, address, mask);
5191         }
5192 }
5193
5194 #define ecmd_get_status_code(res)       (((res) & 0xff) >> 1)
5195
5196 /*
5197  * Function to submit a command to the enhanced command interface. The
5198  * valid enhanced command descriptions are defined in Table 47 of the
5199  * VT-d spec. The VT-d hardware implementation may support some but not
5200  * all commands, which can be determined by checking the Enhanced
5201  * Command Capability Register.
5202  *
5203  * Return values:
5204  *  - 0: Command successful without any error;
5205  *  - Negative: software error value;
5206  *  - Nonzero positive: failure status code defined in Table 48.
5207  */
5208 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5209 {
5210         unsigned long flags;
5211         u64 res;
5212         int ret;
5213
5214         if (!cap_ecmds(iommu->cap))
5215                 return -ENODEV;
5216
5217         raw_spin_lock_irqsave(&iommu->register_lock, flags);
5218
5219         res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5220         if (res & DMA_ECMD_ECRSP_IP) {
5221                 ret = -EBUSY;
5222                 goto err;
5223         }
5224
5225         /*
5226          * Unconditionally write the operand B, because
5227          * - There is no side effect if an ecmd doesn't require an
5228          *   operand B, but we set the register to some value.
5229          * - It's not invoked in any critical path. The extra MMIO
5230          *   write doesn't bring any performance concerns.
5231          */
5232         dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5233         dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5234
5235         IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5236                       !(res & DMA_ECMD_ECRSP_IP), res);
5237
5238         if (res & DMA_ECMD_ECRSP_IP) {
5239                 ret = -ETIMEDOUT;
5240                 goto err;
5241         }
5242
5243         ret = ecmd_get_status_code(res);
5244 err:
5245         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5246
5247         return ret;
5248 }
This page took 0.343769 seconds and 4 git commands to generate.