1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
31 #include "cap_audit.h"
34 #define ROOT_SIZE VTD_PAGE_SIZE
35 #define CONTEXT_SIZE VTD_PAGE_SIZE
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42 #define IOAPIC_RANGE_START (0xfee00000)
43 #define IOAPIC_RANGE_END (0xfeefffff)
44 #define IOVA_START_ADDR (0x1000)
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
49 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
51 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
52 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
53 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
54 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
55 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
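/*
 * For illustration: with gaw == 48 (4-level paging), __DOMAIN_MAX_PFN(48)
 * is (1ULL << 36) - 1, i.e. the last 4KiB page frame of a 48-bit IOVA
 * space, and DOMAIN_MAX_ADDR(48) is that PFN shifted back into an address.
 */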
57 /* IO virtual address start page frame number */
58 #define IOVA_START_PFN (1)
60 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
62 static void __init check_tylersburg_isoch(void);
63 static int rwbf_quirk;
 * set to 1 to panic the kernel if VT-d cannot be successfully enabled
 * (used when the kernel is launched w/ TXT)
69 static int force_on = 0;
70 static int intel_iommu_tboot_noforce;
71 static int no_platform_optin;
73 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
76 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
79 static phys_addr_t root_entry_lctp(struct root_entry *re)
84 return re->lo & VTD_PAGE_MASK;
88 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
91 static phys_addr_t root_entry_uctp(struct root_entry *re)
96 return re->hi & VTD_PAGE_MASK;
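/*
 * Device rbtree helpers: each IOMMU tracks the devices it has probed in
 * iommu->device_rbtree, keyed by PCI requester ID (bus:devfn), so that a
 * device can be looked up from a source ID (e.g. when handling faults).
 */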
99 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
101 struct device_domain_info *info =
102 rb_entry(node, struct device_domain_info, node);
103 const u16 *rid_lhs = key;
105 if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
108 if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
114 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
116 struct device_domain_info *info =
117 rb_entry(lhs, struct device_domain_info, node);
118 u16 key = PCI_DEVID(info->bus, info->devfn);
120 return device_rid_cmp_key(&key, rhs);
 * Looks up an IOMMU-probed device using its source ID.
 *
 * Returns the pointer to the device if there is a match. Otherwise,
 * returns NULL.
 *
 * Note that this helper doesn't guarantee that the device won't be
 * released by the iommu subsystem after being returned. The caller
 * should use its own synchronization mechanism to avoid the device
 * being released during its use if that is a possibility.
134 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
136 struct device_domain_info *info = NULL;
137 struct rb_node *node;
140 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
141 node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
143 info = rb_entry(node, struct device_domain_info, node);
144 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
146 return info ? info->dev : NULL;
149 static int device_rbtree_insert(struct intel_iommu *iommu,
150 struct device_domain_info *info)
152 struct rb_node *curr;
155 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
156 curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
157 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
164 static void device_rbtree_remove(struct device_domain_info *info)
166 struct intel_iommu *iommu = info->iommu;
169 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
170 rb_erase(&info->node, &iommu->device_rbtree);
171 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
180 static struct dmar_domain *si_domain;
181 static int hw_pass_through = 1;
183 struct dmar_rmrr_unit {
184 struct list_head list; /* list of rmrr units */
185 struct acpi_dmar_header *hdr; /* ACPI header */
186 u64 base_address; /* reserved base address*/
187 u64 end_address; /* reserved end address */
188 struct dmar_dev_scope *devices; /* target devices */
189 int devices_cnt; /* target device count */
192 struct dmar_atsr_unit {
193 struct list_head list; /* list of ATSR units */
194 struct acpi_dmar_header *hdr; /* ACPI header */
195 struct dmar_dev_scope *devices; /* target devices */
196 int devices_cnt; /* target device count */
197 u8 include_all:1; /* include all ports */
200 struct dmar_satc_unit {
201 struct list_head list; /* list of SATC units */
202 struct acpi_dmar_header *hdr; /* ACPI header */
203 struct dmar_dev_scope *devices; /* target devices */
204 struct intel_iommu *iommu; /* the corresponding iommu */
205 int devices_cnt; /* target device count */
206 u8 atc_required:1; /* ATS is required */
209 static LIST_HEAD(dmar_atsr_units);
210 static LIST_HEAD(dmar_rmrr_units);
211 static LIST_HEAD(dmar_satc_units);
213 #define for_each_rmrr_units(rmrr) \
214 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
216 static void intel_iommu_domain_free(struct iommu_domain *domain);
218 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
219 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
221 int intel_iommu_enabled = 0;
222 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
224 static int dmar_map_gfx = 1;
225 static int intel_iommu_superpage = 1;
226 static int iommu_identity_mapping;
227 static int iommu_skip_te_disable;
229 #define IDENTMAP_GFX 2
230 #define IDENTMAP_AZALIA 4
232 const struct iommu_ops intel_iommu_ops;
233 static const struct iommu_dirty_ops intel_dirty_ops;
235 static bool translation_pre_enabled(struct intel_iommu *iommu)
237 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
240 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
242 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
245 static void init_translation_status(struct intel_iommu *iommu)
249 gsts = readl(iommu->reg + DMAR_GSTS_REG);
250 if (gsts & DMA_GSTS_TES)
251 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
254 static int __init intel_iommu_setup(char *str)
260 if (!strncmp(str, "on", 2)) {
262 pr_info("IOMMU enabled\n");
263 } else if (!strncmp(str, "off", 3)) {
265 no_platform_optin = 1;
266 pr_info("IOMMU disabled\n");
267 } else if (!strncmp(str, "igfx_off", 8)) {
269 pr_info("Disable GFX device mapping\n");
270 } else if (!strncmp(str, "forcedac", 8)) {
271 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
272 iommu_dma_forcedac = true;
273 } else if (!strncmp(str, "strict", 6)) {
274 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
275 iommu_set_dma_strict();
276 } else if (!strncmp(str, "sp_off", 6)) {
277 pr_info("Disable supported super page\n");
278 intel_iommu_superpage = 0;
279 } else if (!strncmp(str, "sm_on", 5)) {
280 pr_info("Enable scalable mode if hardware supports\n");
282 } else if (!strncmp(str, "sm_off", 6)) {
283 pr_info("Scalable mode is disallowed\n");
285 } else if (!strncmp(str, "tboot_noforce", 13)) {
286 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
287 intel_iommu_tboot_noforce = 1;
289 pr_notice("Unknown option - '%s'\n", str);
292 str += strcspn(str, ",");
299 __setup("intel_iommu=", intel_iommu_setup);
301 void *alloc_pgtable_page(int node, gfp_t gfp)
306 page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
308 vaddr = page_address(page);
312 void free_pgtable_page(void *vaddr)
314 free_page((unsigned long)vaddr);
317 static int domain_type_is_si(struct dmar_domain *domain)
319 return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
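/* Return true if @pfn fits within the domain's address width. */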
322 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
324 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
326 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
330 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
331 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
332 * the returned SAGAW.
334 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
336 unsigned long fl_sagaw, sl_sagaw;
338 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
339 sl_sagaw = cap_sagaw(iommu->cap);
341 /* Second level only. */
342 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
345 /* First level only. */
346 if (!ecap_slts(iommu->ecap))
349 return fl_sagaw & sl_sagaw;
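/*
 * In the SAGAW encoding used above, BIT(2) corresponds to 4-level (48-bit)
 * and BIT(3) to 5-level (57-bit) paging; hence first-level support is
 * expressed as BIT(2), plus BIT(3) when 5-level first-level paging is
 * supported.
 */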
352 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
357 sagaw = __iommu_calculate_sagaw(iommu);
358 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
359 if (test_bit(agaw, &sagaw))
367 * Calculate max SAGAW for each iommu.
369 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
371 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 * Calculate the agaw for each iommu.
 * "SAGAW" may be different across iommus; use a default agaw, and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default agaw.
379 int iommu_calculate_agaw(struct intel_iommu *iommu)
381 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
384 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
386 return sm_supported(iommu) ?
387 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
390 static void domain_update_iommu_coherency(struct dmar_domain *domain)
392 struct iommu_domain_info *info;
393 struct dmar_drhd_unit *drhd;
394 struct intel_iommu *iommu;
398 domain->iommu_coherency = true;
399 xa_for_each(&domain->iommu_array, i, info) {
401 if (!iommu_paging_structure_coherency(info->iommu)) {
402 domain->iommu_coherency = false;
409 /* No hardware attached; use lowest common denominator */
411 for_each_active_iommu(iommu, drhd) {
412 if (!iommu_paging_structure_coherency(iommu)) {
413 domain->iommu_coherency = false;
420 static int domain_update_iommu_superpage(struct dmar_domain *domain,
421 struct intel_iommu *skip)
423 struct dmar_drhd_unit *drhd;
424 struct intel_iommu *iommu;
427 if (!intel_iommu_superpage)
430 /* set iommu_superpage to the smallest common denominator */
432 for_each_active_iommu(iommu, drhd) {
434 if (domain && domain->use_first_level) {
435 if (!cap_fl1gp_support(iommu->cap))
438 mask &= cap_super_page_val(iommu->cap);
450 static int domain_update_device_node(struct dmar_domain *domain)
452 struct device_domain_info *info;
453 int nid = NUMA_NO_NODE;
456 spin_lock_irqsave(&domain->lock, flags);
457 list_for_each_entry(info, &domain->devices, link) {
 * There could possibly be multiple device numa nodes as devices
 * within the same domain may sit behind different IOMMUs. There
 * is no perfect answer in such a situation, so we select a
 * first come, first served policy.
464 nid = dev_to_node(info->dev);
465 if (nid != NUMA_NO_NODE)
468 spin_unlock_irqrestore(&domain->lock, flags);
473 /* Return the super pagesize bitmap if supported. */
474 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
476 unsigned long bitmap = 0;
479 * 1-level super page supports page size of 2MiB, 2-level super page
480 * supports page size of both 2MiB and 1GiB.
482 if (domain->iommu_superpage == 1)
484 else if (domain->iommu_superpage == 2)
485 bitmap |= SZ_2M | SZ_1G;
490 /* Some capabilities may be different across iommus */
491 void domain_update_iommu_cap(struct dmar_domain *domain)
493 domain_update_iommu_coherency(domain);
494 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
 * If RHSA is missing, we should default to the device numa domain
 * as well.
500 if (domain->nid == NUMA_NO_NODE)
501 domain->nid = domain_update_device_node(domain);
 * First-level translation restricts the input address to a
 * canonical address (i.e., address bits 63:N have the same
 * value as address bit [N-1], where N is 48 bits with 4-level
 * paging and 57 bits with 5-level paging). Hence, skip bit
 * [N-1].
510 if (domain->use_first_level)
511 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
513 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
515 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
516 domain_update_iotlb(domain);
519 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
522 struct root_entry *root = &iommu->root_entry[bus];
523 struct context_entry *context;
 * Unless the caller requested the allocation of a new entry,
 * returning a copied context entry makes no sense.
530 if (!alloc && context_copied(iommu, bus, devfn))
534 if (sm_supported(iommu)) {
542 context = phys_to_virt(*entry & VTD_PAGE_MASK);
544 unsigned long phy_addr;
548 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
552 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
553 phy_addr = virt_to_phys((void *)context);
554 *entry = phy_addr | 1;
555 __iommu_flush_cache(iommu, entry, sizeof(*entry));
557 return &context[devfn];
561 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
562 * sub-hierarchy of a candidate PCI-PCI bridge
563 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
564 * @bridge: the candidate PCI-PCI bridge
566 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
569 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
571 struct pci_dev *pdev, *pbridge;
573 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
576 pdev = to_pci_dev(dev);
577 pbridge = to_pci_dev(bridge);
579 if (pbridge->subordinate &&
580 pbridge->subordinate->number <= pdev->bus->number &&
581 pbridge->subordinate->busn_res.end >= pdev->bus->number)
587 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
589 struct dmar_drhd_unit *drhd;
593 /* We know that this device on this chipset has its own IOMMU.
594 * If we find it under a different IOMMU, then the BIOS is lying
595 * to us. Hope that the IOMMU for this device is actually
596 * disabled, and it needs no translation...
598 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
601 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
/* we know that this iommu should be at offset 0xa000 from vtbar */
607 drhd = dmar_find_matched_drhd_unit(pdev);
608 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
609 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
610 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
617 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
619 if (!iommu || iommu->drhd->ignored)
622 if (dev_is_pci(dev)) {
623 struct pci_dev *pdev = to_pci_dev(dev);
625 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
626 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
627 quirk_ioat_snb_local_iommu(pdev))
634 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
636 struct dmar_drhd_unit *drhd = NULL;
637 struct pci_dev *pdev = NULL;
638 struct intel_iommu *iommu;
646 if (dev_is_pci(dev)) {
647 struct pci_dev *pf_pdev;
649 pdev = pci_real_dma_dev(to_pci_dev(dev));
651 /* VFs aren't listed in scope tables; we need to look up
652 * the PF instead to find the IOMMU. */
653 pf_pdev = pci_physfn(pdev);
655 segment = pci_domain_nr(pdev->bus);
656 } else if (has_acpi_companion(dev))
657 dev = &ACPI_COMPANION(dev)->dev;
660 for_each_iommu(iommu, drhd) {
661 if (pdev && segment != drhd->segment)
664 for_each_active_dev_scope(drhd->devices,
665 drhd->devices_cnt, i, tmp) {
667 /* For a VF use its original BDF# not that of the PF
668 * which we used for the IOMMU lookup. Strictly speaking
669 * we could do this for all PCI devices; we only need to
670 * get the BDF# from the scope table for ACPI matches. */
671 if (pdev && pdev->is_virtfn)
675 *bus = drhd->devices[i].bus;
676 *devfn = drhd->devices[i].devfn;
681 if (is_downstream_to_pci_bridge(dev, tmp))
685 if (pdev && drhd->include_all) {
688 *bus = pdev->bus->number;
689 *devfn = pdev->devfn;
696 if (iommu_is_dummy(iommu, dev))
704 static void domain_flush_cache(struct dmar_domain *domain,
705 void *addr, int size)
707 if (!domain->iommu_coherency)
708 clflush_cache_range(addr, size);
711 static void free_context_table(struct intel_iommu *iommu)
713 struct context_entry *context;
716 if (!iommu->root_entry)
719 for (i = 0; i < ROOT_ENTRY_NR; i++) {
720 context = iommu_context_addr(iommu, i, 0, 0);
722 free_pgtable_page(context);
724 if (!sm_supported(iommu))
727 context = iommu_context_addr(iommu, i, 0x80, 0);
729 free_pgtable_page(context);
732 free_pgtable_page(iommu->root_entry);
733 iommu->root_entry = NULL;
736 #ifdef CONFIG_DMAR_DEBUG
737 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
738 u8 bus, u8 devfn, struct dma_pte *parent, int level)
744 offset = pfn_level_offset(pfn, level);
745 pte = &parent[offset];
746 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
747 pr_info("PTE not present at level %d\n", level);
751 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
756 parent = phys_to_virt(dma_pte_addr(pte));
761 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
762 unsigned long long addr, u32 pasid)
764 struct pasid_dir_entry *dir, *pde;
765 struct pasid_entry *entries, *pte;
766 struct context_entry *ctx_entry;
767 struct root_entry *rt_entry;
768 int i, dir_index, index, level;
769 u8 devfn = source_id & 0xff;
770 u8 bus = source_id >> 8;
771 struct dma_pte *pgtable;
773 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
775 /* root entry dump */
776 rt_entry = &iommu->root_entry[bus];
778 pr_info("root table entry is not present\n");
782 if (sm_supported(iommu))
783 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
784 rt_entry->hi, rt_entry->lo);
786 pr_info("root entry: 0x%016llx", rt_entry->lo);
788 /* context entry dump */
789 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
791 pr_info("context table entry is not present\n");
795 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
796 ctx_entry->hi, ctx_entry->lo);
798 /* legacy mode does not require PASID entries */
799 if (!sm_supported(iommu)) {
800 level = agaw_to_level(ctx_entry->hi & 7);
801 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
805 /* get the pointer to pasid directory entry */
806 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
808 pr_info("pasid directory entry is not present\n");
811 /* For request-without-pasid, get the pasid from context entry */
812 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
813 pasid = IOMMU_NO_PASID;
815 dir_index = pasid >> PASID_PDE_SHIFT;
816 pde = &dir[dir_index];
817 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
819 /* get the pointer to the pasid table entry */
820 entries = get_pasid_table_from_pde(pde);
822 pr_info("pasid table entry is not present\n");
825 index = pasid & PASID_PTE_MASK;
826 pte = &entries[index];
827 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
828 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
830 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
831 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
832 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
834 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
835 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
839 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
843 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
844 unsigned long pfn, int *target_level,
847 struct dma_pte *parent, *pte;
848 int level = agaw_to_level(domain->agaw);
851 if (!domain_pfn_supported(domain, pfn))
852 /* Address beyond IOMMU's addressing capabilities. */
855 parent = domain->pgd;
860 offset = pfn_level_offset(pfn, level);
861 pte = &parent[offset];
862 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
864 if (level == *target_level)
867 if (!dma_pte_present(pte)) {
870 tmp_page = alloc_pgtable_page(domain->nid, gfp);
875 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
876 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
877 if (domain->use_first_level)
878 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
880 if (cmpxchg64(&pte->val, 0ULL, pteval))
881 /* Someone else set it while we were thinking; use theirs. */
882 free_pgtable_page(tmp_page);
884 domain_flush_cache(domain, pte, sizeof(*pte));
889 parent = phys_to_virt(dma_pte_addr(pte));
894 *target_level = level;
899 /* return address's pte at specific level */
900 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
902 int level, int *large_page)
904 struct dma_pte *parent, *pte;
905 int total = agaw_to_level(domain->agaw);
908 parent = domain->pgd;
909 while (level <= total) {
910 offset = pfn_level_offset(pfn, total);
911 pte = &parent[offset];
915 if (!dma_pte_present(pte)) {
920 if (dma_pte_superpage(pte)) {
925 parent = phys_to_virt(dma_pte_addr(pte));
/* clear last level pte; should be followed by a TLB flush */
932 static void dma_pte_clear_range(struct dmar_domain *domain,
933 unsigned long start_pfn,
934 unsigned long last_pfn)
936 unsigned int large_page;
937 struct dma_pte *first_pte, *pte;
939 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
940 WARN_ON(start_pfn > last_pfn))
943 /* we don't need lock here; nobody else touches the iova range */
946 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
948 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
953 start_pfn += lvl_to_nr_pages(large_page);
955 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
957 domain_flush_cache(domain, first_pte,
958 (void *)pte - (void *)first_pte);
960 } while (start_pfn && start_pfn <= last_pfn);
963 static void dma_pte_free_level(struct dmar_domain *domain, int level,
964 int retain_level, struct dma_pte *pte,
965 unsigned long pfn, unsigned long start_pfn,
966 unsigned long last_pfn)
968 pfn = max(start_pfn, pfn);
969 pte = &pte[pfn_level_offset(pfn, level)];
972 unsigned long level_pfn;
973 struct dma_pte *level_pte;
975 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
978 level_pfn = pfn & level_mask(level);
979 level_pte = phys_to_virt(dma_pte_addr(pte));
982 dma_pte_free_level(domain, level - 1, retain_level,
983 level_pte, level_pfn, start_pfn,
988 * Free the page table if we're below the level we want to
989 * retain and the range covers the entire table.
991 if (level < retain_level && !(start_pfn > level_pfn ||
992 last_pfn < level_pfn + level_size(level) - 1)) {
994 domain_flush_cache(domain, pte, sizeof(*pte));
995 free_pgtable_page(level_pte);
998 pfn += level_size(level);
999 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1003 * clear last level (leaf) ptes and free page table pages below the
1004 * level we wish to keep intact.
1006 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1007 unsigned long start_pfn,
1008 unsigned long last_pfn,
1011 dma_pte_clear_range(domain, start_pfn, last_pfn);
1013 /* We don't need lock here; nobody else touches the iova range */
1014 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1015 domain->pgd, 0, start_pfn, last_pfn);
1018 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1019 free_pgtable_page(domain->pgd);
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
1030 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1031 int level, struct dma_pte *pte,
1032 struct list_head *freelist)
1036 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1037 list_add_tail(&pg->lru, freelist);
1042 pte = page_address(pg);
1044 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1045 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1047 } while (!first_pte_in_page(pte));
1050 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1051 struct dma_pte *pte, unsigned long pfn,
1052 unsigned long start_pfn, unsigned long last_pfn,
1053 struct list_head *freelist)
1055 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1057 pfn = max(start_pfn, pfn);
1058 pte = &pte[pfn_level_offset(pfn, level)];
1061 unsigned long level_pfn = pfn & level_mask(level);
1063 if (!dma_pte_present(pte))
1066 /* If range covers entire pagetable, free it */
1067 if (start_pfn <= level_pfn &&
1068 last_pfn >= level_pfn + level_size(level) - 1) {
/* These subordinate page tables are going away entirely. Don't
   bother to clear them; we're just going to *free* them. */
1071 if (level > 1 && !dma_pte_superpage(pte))
1072 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1078 } else if (level > 1) {
1079 /* Recurse down into a level that isn't *entirely* obsolete */
1080 dma_pte_clear_level(domain, level - 1,
1081 phys_to_virt(dma_pte_addr(pte)),
1082 level_pfn, start_pfn, last_pfn,
1086 pfn = level_pfn + level_size(level);
1087 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1090 domain_flush_cache(domain, first_pte,
1091 (void *)++last_pte - (void *)first_pte);
1094 /* We can't just free the pages because the IOMMU may still be walking
1095 the page tables, and may have cached the intermediate levels. The
1096 pages can only be freed after the IOTLB flush has been done. */
1097 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1098 unsigned long last_pfn, struct list_head *freelist)
1100 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1101 WARN_ON(start_pfn > last_pfn))
1104 /* we don't need lock here; nobody else touches the iova range */
1105 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1106 domain->pgd, 0, start_pfn, last_pfn, freelist);
1109 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1110 struct page *pgd_page = virt_to_page(domain->pgd);
1111 list_add_tail(&pgd_page->lru, freelist);
1116 /* iommu handling */
1117 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1119 struct root_entry *root;
1121 root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1123 pr_err("Allocating root entry for %s failed\n",
1128 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1129 iommu->root_entry = root;
1134 static void iommu_set_root_entry(struct intel_iommu *iommu)
1140 addr = virt_to_phys(iommu->root_entry);
1141 if (sm_supported(iommu))
1142 addr |= DMA_RTADDR_SMT;
1144 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1145 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1147 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
/* Make sure hardware completes it */
1150 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1151 readl, (sts & DMA_GSTS_RTPS), sts);
1153 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1156 * Hardware invalidates all DMA remapping hardware translation
1157 * caches as part of SRTP flow.
1159 if (cap_esrtps(iommu->cap))
1162 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1163 if (sm_supported(iommu))
1164 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1165 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1168 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1173 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1176 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1177 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
/* Make sure hardware completes it */
1180 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1181 readl, (!(val & DMA_GSTS_WBFS)), val);
1183 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* return value determines if we need a write buffer flush */
1187 static void __iommu_flush_context(struct intel_iommu *iommu,
1188 u16 did, u16 source_id, u8 function_mask,
1195 case DMA_CCMD_GLOBAL_INVL:
1196 val = DMA_CCMD_GLOBAL_INVL;
1198 case DMA_CCMD_DOMAIN_INVL:
1199 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1201 case DMA_CCMD_DEVICE_INVL:
1202 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1203 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1206 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1210 val |= DMA_CCMD_ICC;
1212 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1213 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
/* Make sure hardware completes it */
1216 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1217 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1219 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* return value determines if we need a write buffer flush */
1223 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1224 u64 addr, unsigned int size_order, u64 type)
1226 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1227 u64 val = 0, val_iva = 0;
1231 case DMA_TLB_GLOBAL_FLUSH:
/* global flush doesn't need to set IVA_REG */
1233 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1235 case DMA_TLB_DSI_FLUSH:
1236 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1238 case DMA_TLB_PSI_FLUSH:
1239 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1240 /* IH bit is passed in as part of address */
1241 val_iva = size_order | addr;
1244 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1249 if (cap_write_drain(iommu->cap))
1250 val |= DMA_TLB_WRITE_DRAIN;
1252 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1253 /* Note: Only uses first TLB reg currently */
1255 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1256 dmar_writeq(iommu->reg + tlb_offset + 8, val);
/* Make sure hardware completes it */
1259 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1260 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1262 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1264 /* check IOTLB invalidation granularity */
1265 if (DMA_TLB_IAIG(val) == 0)
1266 pr_err("Flush IOTLB failed\n");
1267 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1268 pr_debug("TLB flush request %Lx, actual %Lx\n",
1269 (unsigned long long)DMA_TLB_IIRG(type),
1270 (unsigned long long)DMA_TLB_IAIG(val));
1273 static struct device_domain_info *
1274 domain_lookup_dev_info(struct dmar_domain *domain,
1275 struct intel_iommu *iommu, u8 bus, u8 devfn)
1277 struct device_domain_info *info;
1278 unsigned long flags;
1280 spin_lock_irqsave(&domain->lock, flags);
1281 list_for_each_entry(info, &domain->devices, link) {
1282 if (info->iommu == iommu && info->bus == bus &&
1283 info->devfn == devfn) {
1284 spin_unlock_irqrestore(&domain->lock, flags);
1288 spin_unlock_irqrestore(&domain->lock, flags);
1293 void domain_update_iotlb(struct dmar_domain *domain)
1295 struct dev_pasid_info *dev_pasid;
1296 struct device_domain_info *info;
1297 bool has_iotlb_device = false;
1298 unsigned long flags;
1300 spin_lock_irqsave(&domain->lock, flags);
1301 list_for_each_entry(info, &domain->devices, link) {
1302 if (info->ats_enabled) {
1303 has_iotlb_device = true;
1308 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1309 info = dev_iommu_priv_get(dev_pasid->dev);
1310 if (info->ats_enabled) {
1311 has_iotlb_device = true;
1315 domain->has_iotlb_device = has_iotlb_device;
1316 spin_unlock_irqrestore(&domain->lock, flags);
 * The extra devTLB flush quirk impacts those QAT devices with PCI device
 * IDs ranging from 0x4940 to 0x4943. It is exempted from the risky_device()
 * check because it applies only to the built-in QAT devices and it doesn't
 * grant additional privileges.
1325 #define BUGGY_QAT_DEVID_MASK 0x4940
1326 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1328 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1331 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1337 static void iommu_enable_pci_caps(struct device_domain_info *info)
1339 struct pci_dev *pdev;
1341 if (!dev_is_pci(info->dev))
1344 pdev = to_pci_dev(info->dev);
/* The PCIe spec, in its wisdom, declares that the behaviour of
   the device if you enable PASID support after ATS support is
   undefined. So always enable PASID support on devices which
   have it, even if we can't yet know if we're ever going to
   use it. */
1351 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1352 info->pasid_enabled = 1;
1354 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1355 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1356 info->ats_enabled = 1;
1357 domain_update_iotlb(info->domain);
1361 static void iommu_disable_pci_caps(struct device_domain_info *info)
1363 struct pci_dev *pdev;
1365 if (!dev_is_pci(info->dev))
1368 pdev = to_pci_dev(info->dev);
1370 if (info->ats_enabled) {
1371 pci_disable_ats(pdev);
1372 info->ats_enabled = 0;
1373 domain_update_iotlb(info->domain);
1376 if (info->pasid_enabled) {
1377 pci_disable_pasid(pdev);
1378 info->pasid_enabled = 0;
1382 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1383 u64 addr, unsigned int mask)
1387 if (!info || !info->ats_enabled)
1390 sid = info->bus << 8 | info->devfn;
1391 qdep = info->ats_qdep;
1392 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1394 quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1397 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1398 u64 addr, unsigned mask)
1400 struct dev_pasid_info *dev_pasid;
1401 struct device_domain_info *info;
1402 unsigned long flags;
1404 if (!domain->has_iotlb_device)
1407 spin_lock_irqsave(&domain->lock, flags);
1408 list_for_each_entry(info, &domain->devices, link)
1409 __iommu_flush_dev_iotlb(info, addr, mask);
1411 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1412 info = dev_iommu_priv_get(dev_pasid->dev);
1414 if (!info->ats_enabled)
1417 qi_flush_dev_iotlb_pasid(info->iommu,
1418 PCI_DEVID(info->bus, info->devfn),
1419 info->pfsid, dev_pasid->pasid,
1420 info->ats_qdep, addr,
1423 spin_unlock_irqrestore(&domain->lock, flags);
1426 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1427 struct dmar_domain *domain, u64 addr,
1428 unsigned long npages, bool ih)
1430 u16 did = domain_id_iommu(domain, iommu);
1431 struct dev_pasid_info *dev_pasid;
1432 unsigned long flags;
1434 spin_lock_irqsave(&domain->lock, flags);
1435 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1436 qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1438 if (!list_empty(&domain->devices))
1439 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1440 spin_unlock_irqrestore(&domain->lock, flags);
1443 static void __iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1444 unsigned long pfn, unsigned int pages,
1447 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1448 unsigned long bitmask = aligned_pages - 1;
1449 unsigned int mask = ilog2(aligned_pages);
1450 u64 addr = (u64)pfn << VTD_PAGE_SHIFT;
1453 * PSI masks the low order bits of the base address. If the
1454 * address isn't aligned to the mask, then compute a mask value
1455 * needed to ensure the target range is flushed.
1457 if (unlikely(bitmask & pfn)) {
1458 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1461 * Since end_pfn <= pfn + bitmask, the only way bits
1462 * higher than bitmask can differ in pfn and end_pfn is
1463 * by carrying. This means after masking out bitmask,
1464 * high bits starting with the first set bit in
1465 * shared_bits are all equal in both pfn and end_pfn.
1467 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1468 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
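/*
 * Worked example: pfn == 0x1009 and pages == 2 gives aligned_pages == 2
 * and bitmask == 1, so the base is unaligned; end_pfn == 0x100a,
 * shared_bits == ~(0x1009 ^ 0x100a) & ~1 has its lowest set bit at 2,
 * and mask == 2 flushes pfns 0x1008-0x100b, covering the whole range.
 */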
1472 * Fallback to domain selective flush if no PSI support or
1473 * the size is too big.
1475 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1476 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1479 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1483 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1484 struct dmar_domain *domain,
1485 unsigned long pfn, unsigned int pages,
1488 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1489 unsigned int mask = ilog2(aligned_pages);
1490 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1491 u16 did = domain_id_iommu(domain, iommu);
1493 if (WARN_ON(!pages))
1499 if (domain->use_first_level)
1500 domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1502 __iommu_flush_iotlb_psi(iommu, did, pfn, pages, ih);
1505 * In caching mode, changes of pages from non-present to present require
1506 * flush. However, device IOTLB doesn't need to be flushed in this case.
1508 if (!cap_caching_mode(iommu->cap) || !map)
1509 iommu_flush_dev_iotlb(domain, addr, mask);
1512 /* Notification for newly created mappings */
1513 static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain,
1514 unsigned long pfn, unsigned int pages)
 * It's a non-present to present mapping. Only flush if caching mode
 * and second level.
1520 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1521 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1523 iommu_flush_write_buffer(iommu);
1527 * Flush the relevant caches in nested translation if the domain
1528 * also serves as a parent
1530 static void parent_domain_flush(struct dmar_domain *domain,
1532 unsigned long pages, int ih)
1534 struct dmar_domain *s1_domain;
1536 spin_lock(&domain->s1_lock);
1537 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
1538 struct device_domain_info *device_info;
1539 struct iommu_domain_info *info;
1540 unsigned long flags;
1543 xa_for_each(&s1_domain->iommu_array, i, info)
1544 __iommu_flush_iotlb_psi(info->iommu, info->did,
1547 if (!s1_domain->has_iotlb_device)
1550 spin_lock_irqsave(&s1_domain->lock, flags);
1551 list_for_each_entry(device_info, &s1_domain->devices, link)
 * The address translation cache on the device side caches the
 * result of nested translation. There is no easy way
 * to identify the exact set of nested translations
 * affected by a change in S2. So just flush the entire
 * device cache.
1559 __iommu_flush_dev_iotlb(device_info, 0,
1560 MAX_AGAW_PFN_WIDTH);
1561 spin_unlock_irqrestore(&s1_domain->lock, flags);
1563 spin_unlock(&domain->s1_lock);
1566 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1568 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1569 struct iommu_domain_info *info;
1572 xa_for_each(&dmar_domain->iommu_array, idx, info) {
1573 struct intel_iommu *iommu = info->iommu;
1574 u16 did = domain_id_iommu(dmar_domain, iommu);
1576 if (dmar_domain->use_first_level)
1577 domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1579 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1582 if (!cap_caching_mode(iommu->cap))
1583 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1586 if (dmar_domain->nested_parent)
1587 parent_domain_flush(dmar_domain, 0, -1, 0);
1590 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1593 unsigned long flags;
1595 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1598 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1599 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1600 pmen &= ~DMA_PMEN_EPM;
1601 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1603 /* wait for the protected region status bit to clear */
1604 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1605 readl, !(pmen & DMA_PMEN_PRS), pmen);
1607 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1610 static void iommu_enable_translation(struct intel_iommu *iommu)
1613 unsigned long flags;
1615 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1616 iommu->gcmd |= DMA_GCMD_TE;
1617 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
/* Make sure hardware completes it */
1620 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1621 readl, (sts & DMA_GSTS_TES), sts);
1623 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1626 static void iommu_disable_translation(struct intel_iommu *iommu)
1631 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1632 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1635 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1636 iommu->gcmd &= ~DMA_GCMD_TE;
1637 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
/* Make sure hardware completes it */
1640 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1641 readl, (!(sts & DMA_GSTS_TES)), sts);
1643 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1646 static int iommu_init_domains(struct intel_iommu *iommu)
1650 ndomains = cap_ndoms(iommu->cap);
1651 pr_debug("%s: Number of Domains supported <%d>\n",
1652 iommu->name, ndomains);
1654 spin_lock_init(&iommu->lock);
1656 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1657 if (!iommu->domain_ids)
1661 * If Caching mode is set, then invalid translations are tagged
1662 * with domain-id 0, hence we need to pre-allocate it. We also
1663 * use domain-id 0 as a marker for non-allocated domain-id, so
1664 * make sure it is not used for a real domain.
1666 set_bit(0, iommu->domain_ids);
 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
 * entry for first-level or pass-through translation modes should
 * be programmed with a domain id different from those used for
 * second-level or nested translation. We reserve a domain id for
 * this purpose.
1675 if (sm_supported(iommu))
1676 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1681 static void disable_dmar_iommu(struct intel_iommu *iommu)
1683 if (!iommu->domain_ids)
1687 * All iommu domains must have been detached from the devices,
1688 * hence there should be no domain IDs in use.
1690 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1691 > NUM_RESERVED_DID))
1694 if (iommu->gcmd & DMA_GCMD_TE)
1695 iommu_disable_translation(iommu);
1698 static void free_dmar_iommu(struct intel_iommu *iommu)
1700 if (iommu->domain_ids) {
1701 bitmap_free(iommu->domain_ids);
1702 iommu->domain_ids = NULL;
1705 if (iommu->copied_tables) {
1706 bitmap_free(iommu->copied_tables);
1707 iommu->copied_tables = NULL;
1710 /* free context mapping */
1711 free_context_table(iommu);
1713 #ifdef CONFIG_INTEL_IOMMU_SVM
1714 if (pasid_supported(iommu)) {
1715 if (ecap_prs(iommu->ecap))
1716 intel_svm_finish_prq(iommu);
 * Check and return whether first level is used by default for
 * DMA translation.
1725 static bool first_level_by_default(unsigned int type)
1727 /* Only SL is available in legacy mode */
1728 if (!scalable_mode_support())
/* Only one level (either FL or SL) is available, just use it */
1732 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1733 return intel_cap_flts_sanity();
1735 /* Both levels are available, decide it based on domain type */
1736 return type != IOMMU_DOMAIN_UNMANAGED;
1739 static struct dmar_domain *alloc_domain(unsigned int type)
1741 struct dmar_domain *domain;
1743 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1747 domain->nid = NUMA_NO_NODE;
1748 if (first_level_by_default(type))
1749 domain->use_first_level = true;
1750 domain->has_iotlb_device = false;
1751 INIT_LIST_HEAD(&domain->devices);
1752 INIT_LIST_HEAD(&domain->dev_pasids);
1753 spin_lock_init(&domain->lock);
1754 xa_init(&domain->iommu_array);
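/*
 * Attach @domain to @iommu: allocate a domain id on that iommu (or reuse
 * the one already allocated for this domain) and record the per-iommu
 * info in domain->iommu_array, indexed by the iommu's sequence id.
 */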
1759 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1761 struct iommu_domain_info *info, *curr;
1762 unsigned long ndomains;
1763 int num, ret = -ENOSPC;
1765 info = kzalloc(sizeof(*info), GFP_KERNEL);
1769 spin_lock(&iommu->lock);
1770 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1773 spin_unlock(&iommu->lock);
1778 ndomains = cap_ndoms(iommu->cap);
1779 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1780 if (num >= ndomains) {
1781 pr_err("%s: No free domain ids\n", iommu->name);
1785 set_bit(num, iommu->domain_ids);
1788 info->iommu = iommu;
1789 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1790 NULL, info, GFP_ATOMIC);
1792 ret = xa_err(curr) ? : -EBUSY;
1795 domain_update_iommu_cap(domain);
1797 spin_unlock(&iommu->lock);
1801 clear_bit(info->did, iommu->domain_ids);
1803 spin_unlock(&iommu->lock);
1808 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1810 struct iommu_domain_info *info;
1812 spin_lock(&iommu->lock);
1813 info = xa_load(&domain->iommu_array, iommu->seq_id);
1814 if (--info->refcnt == 0) {
1815 clear_bit(info->did, iommu->domain_ids);
1816 xa_erase(&domain->iommu_array, iommu->seq_id);
1817 domain->nid = NUMA_NO_NODE;
1818 domain_update_iommu_cap(domain);
1821 spin_unlock(&iommu->lock);
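/*
 * Round a guest address width up to the nearest width the page-table
 * hierarchy can represent: an adjusted width is 12 bits of page offset
 * plus a multiple of the 9-bit stride per page-table level.
 */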
1824 static int guestwidth_to_adjustwidth(int gaw)
1827 int r = (gaw - 12) % 9;
1838 static void domain_exit(struct dmar_domain *domain)
1841 LIST_HEAD(freelist);
1843 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1844 put_pages_list(&freelist);
1847 if (WARN_ON(!list_empty(&domain->devices)))
1853 static int domain_context_mapping_one(struct dmar_domain *domain,
1854 struct intel_iommu *iommu,
1857 struct device_domain_info *info =
1858 domain_lookup_dev_info(domain, iommu, bus, devfn);
1859 u16 did = domain_id_iommu(domain, iommu);
1860 int translation = CONTEXT_TT_MULTI_LEVEL;
1861 struct dma_pte *pgd = domain->pgd;
1862 struct context_entry *context;
1865 if (hw_pass_through && domain_type_is_si(domain))
1866 translation = CONTEXT_TT_PASS_THROUGH;
1868 pr_debug("Set context mapping for %02x:%02x.%d\n",
1869 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1871 spin_lock(&iommu->lock);
1873 context = iommu_context_addr(iommu, bus, devfn, 1);
1878 if (context_present(context) && !context_copied(iommu, bus, devfn))
 * For kdump cases, old valid entries may be cached due to the
 * in-flight DMA and copied pgtable, but there is no unmapping
 * behaviour for them, thus we need an explicit cache flush for
 * the newly-mapped device. For kdump, at this point, the device
 * is supposed to have finished reset at its driver probe stage, so no
 * in-flight DMA will exist, and we don't need to worry anymore
 * hereafter.
1890 if (context_copied(iommu, bus, devfn)) {
1891 u16 did_old = context_domain_id(context);
1893 if (did_old < cap_ndoms(iommu->cap)) {
1894 iommu->flush.flush_context(iommu, did_old,
1895 (((u16)bus) << 8) | devfn,
1896 DMA_CCMD_MASK_NOBIT,
1897 DMA_CCMD_DEVICE_INVL);
1898 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1902 clear_context_copied(iommu, bus, devfn);
1905 context_clear_entry(context);
1906 context_set_domain_id(context, did);
1908 if (translation != CONTEXT_TT_PASS_THROUGH) {
1910 * Skip top levels of page tables for iommu which has
1911 * less agaw than default. Unnecessary for PT mode.
1913 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1915 pgd = phys_to_virt(dma_pte_addr(pgd));
1916 if (!dma_pte_present(pgd))
1920 if (info && info->ats_supported)
1921 translation = CONTEXT_TT_DEV_IOTLB;
1923 translation = CONTEXT_TT_MULTI_LEVEL;
1925 context_set_address_root(context, virt_to_phys(pgd));
1926 context_set_address_width(context, agaw);
1929 * In pass through mode, AW must be programmed to
1930 * indicate the largest AGAW value supported by
1931 * hardware. And ASR is ignored by hardware.
1933 context_set_address_width(context, iommu->msagaw);
1936 context_set_translation_type(context, translation);
1937 context_set_fault_enable(context);
1938 context_set_present(context);
1939 if (!ecap_coherent(iommu->ecap))
1940 clflush_cache_range(context, sizeof(*context));
 * It's a non-present to present mapping. If hardware doesn't cache
 * non-present entries we only need to flush the write-buffer. If it
 * _does_ cache non-present entries, then it does so in the special
 * domain #0, which we have to flush:
1948 if (cap_caching_mode(iommu->cap)) {
1949 iommu->flush.flush_context(iommu, 0,
1950 (((u16)bus) << 8) | devfn,
1951 DMA_CCMD_MASK_NOBIT,
1952 DMA_CCMD_DEVICE_INVL);
1953 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1955 iommu_flush_write_buffer(iommu);
1961 spin_unlock(&iommu->lock);
1966 static int domain_context_mapping_cb(struct pci_dev *pdev,
1967 u16 alias, void *opaque)
1969 struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1970 struct intel_iommu *iommu = info->iommu;
1971 struct dmar_domain *domain = opaque;
1973 return domain_context_mapping_one(domain, iommu,
1974 PCI_BUS_NUM(alias), alias & 0xff);
1978 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1980 struct device_domain_info *info = dev_iommu_priv_get(dev);
1981 struct intel_iommu *iommu = info->iommu;
1982 u8 bus = info->bus, devfn = info->devfn;
1984 if (!dev_is_pci(dev))
1985 return domain_context_mapping_one(domain, iommu, bus, devfn);
1987 return pci_for_each_dma_alias(to_pci_dev(dev),
1988 domain_context_mapping_cb, domain);
1991 /* Returns a number of VTD pages, but aligned to MM page size */
1992 static unsigned long aligned_nrpages(unsigned long host_addr, size_t size)
1994 host_addr &= ~PAGE_MASK;
1995 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
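/*
 * Example: host_addr == 0x1234 and size == 0x100 leaves an in-page offset
 * of 0x234, so PAGE_ALIGN(0x234 + 0x100) >> VTD_PAGE_SHIFT == 1, i.e. the
 * mapping touches a single 4KiB VT-d page.
 */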
1998 /* Return largest possible superpage level for a given mapping */
1999 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
2000 unsigned long phy_pfn, unsigned long pages)
2002 int support, level = 1;
2003 unsigned long pfnmerge;
2005 support = domain->iommu_superpage;
2007 /* To use a large page, the virtual *and* physical addresses
2008 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2009 of them will mean we have to use smaller pages. So just
2010 merge them and check both at once. */
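/*
 * Example: if iov_pfn and phy_pfn are both 2MiB aligned (multiples of 512
 * 4KiB pages) and at least 512 pages are mapped, the loop below reaches
 * level 2 and a 2MiB superpage can be used; otherwise level stays at 1.
 */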
2011 pfnmerge = iov_pfn | phy_pfn;
2013 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2014 pages >>= VTD_STRIDE_SHIFT;
2017 pfnmerge >>= VTD_STRIDE_SHIFT;
2025 * Ensure that old small page tables are removed to make room for superpage(s).
2026 * We're going to add new large pages, so make sure we don't remove their parent
2027 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2029 static void switch_to_super_page(struct dmar_domain *domain,
2030 unsigned long start_pfn,
2031 unsigned long end_pfn, int level)
2033 unsigned long lvl_pages = lvl_to_nr_pages(level);
2034 struct iommu_domain_info *info;
2035 struct dma_pte *pte = NULL;
2038 while (start_pfn <= end_pfn) {
2040 pte = pfn_to_dma_pte(domain, start_pfn, &level,
2043 if (dma_pte_present(pte)) {
2044 dma_pte_free_pagetable(domain, start_pfn,
2045 start_pfn + lvl_pages - 1,
2048 xa_for_each(&domain->iommu_array, i, info)
2049 iommu_flush_iotlb_psi(info->iommu, domain,
2050 start_pfn, lvl_pages,
2052 if (domain->nested_parent)
2053 parent_domain_flush(domain, start_pfn,
2058 start_pfn += lvl_pages;
2059 if (first_pte_in_page(pte))
2065 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2066 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2069 struct dma_pte *first_pte = NULL, *pte = NULL;
2070 unsigned int largepage_lvl = 0;
2071 unsigned long lvl_pages = 0;
2075 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2078 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2081 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
2082 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
2086 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2087 attr |= DMA_FL_PTE_PRESENT;
2088 if (domain->use_first_level) {
2089 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2090 if (prot & DMA_PTE_WRITE)
2091 attr |= DMA_FL_PTE_DIRTY;
2094 domain->has_mappings = true;
2096 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2098 while (nr_pages > 0) {
2102 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2103 phys_pfn, nr_pages);
2105 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2111 lvl_pages = lvl_to_nr_pages(largepage_lvl);
/* It is a large page */
2114 if (largepage_lvl > 1) {
2115 unsigned long end_pfn;
2116 unsigned long pages_to_remove;
2118 pteval |= DMA_PTE_LARGE_PAGE;
2119 pages_to_remove = min_t(unsigned long, nr_pages,
2120 nr_pte_to_next_page(pte) * lvl_pages);
2121 end_pfn = iov_pfn + pages_to_remove - 1;
2122 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2124 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2128 /* We don't need lock here, nobody else
2129 * touches the iova range
2131 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2133 static int dumps = 5;
2134 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2135 iov_pfn, tmp, (unsigned long long)pteval);
2138 debug_dma_dump_mappings(NULL);
2143 nr_pages -= lvl_pages;
2144 iov_pfn += lvl_pages;
2145 phys_pfn += lvl_pages;
2146 pteval += lvl_pages * VTD_PAGE_SIZE;
2148 /* If the next PTE would be the first in a new page, then we
2149 * need to flush the cache on the entries we've just written.
2150 * And then we'll need to recalculate 'pte', so clear it and
2151 * let it get set again in the if (!pte) block above.
2153 * If we're done (!nr_pages) we need to flush the cache too.
2155 * Also if we've been setting superpages, we may need to
2156 * recalculate 'pte' and switch back to smaller pages for the
2157 * end of the mapping, if the trailing size is not enough to
2158 * use another superpage (i.e. nr_pages < lvl_pages).
2161 if (!nr_pages || first_pte_in_page(pte) ||
2162 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2163 domain_flush_cache(domain, first_pte,
2164 (void *)pte - (void *)first_pte);
2172 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2174 struct intel_iommu *iommu = info->iommu;
2175 struct context_entry *context;
2178 spin_lock(&iommu->lock);
2179 context = iommu_context_addr(iommu, bus, devfn, 0);
2181 spin_unlock(&iommu->lock);
2185 did_old = context_domain_id(context);
2187 context_clear_entry(context);
2188 __iommu_flush_cache(iommu, context, sizeof(*context));
2189 spin_unlock(&iommu->lock);
2190 iommu->flush.flush_context(iommu,
2192 (((u16)bus) << 8) | devfn,
2193 DMA_CCMD_MASK_NOBIT,
2194 DMA_CCMD_DEVICE_INVL);
2196 iommu->flush.flush_iotlb(iommu,
2202 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2205 static int domain_setup_first_level(struct intel_iommu *iommu,
2206 struct dmar_domain *domain,
2210 struct dma_pte *pgd = domain->pgd;
2215 * Skip top levels of page tables for iommu which has
2216 * less agaw than default. Unnecessary for PT mode.
2218 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2219 pgd = phys_to_virt(dma_pte_addr(pgd));
2220 if (!dma_pte_present(pgd))
2224 level = agaw_to_level(agaw);
2225 if (level != 4 && level != 5)
2229 flags |= PASID_FLAG_FL5LP;
2231 if (domain->force_snooping)
2232 flags |= PASID_FLAG_PAGE_SNOOP;
2234 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2235 domain_id_iommu(domain, iommu),
2239 static bool dev_is_real_dma_subdevice(struct device *dev)
2241 return dev && dev_is_pci(dev) &&
2242 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2245 static int iommu_domain_identity_map(struct dmar_domain *domain,
2246 unsigned long first_vpfn,
2247 unsigned long last_vpfn)
 * RMRR range might have overlap with physical memory range,
 * clear it first
2253 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2255 return __domain_mapping(domain, first_vpfn,
2256 first_vpfn, last_vpfn - first_vpfn + 1,
2257 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2260 static int md_domain_init(struct dmar_domain *domain, int guest_width);
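/*
 * Set up the static identity (si) domain: create a 1:1 mapping for every
 * usable memory range and for the RMRR regions reported by the platform,
 * so devices attached to it see a 1:1 IOVA-to-physical mapping.
 */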
2262 static int __init si_domain_init(int hw)
2264 struct dmar_rmrr_unit *rmrr;
2268 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2272 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2273 domain_exit(si_domain);
2281 for_each_online_node(nid) {
2282 unsigned long start_pfn, end_pfn;
2285 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2286 ret = iommu_domain_identity_map(si_domain,
2287 mm_to_dma_pfn_start(start_pfn),
2288 mm_to_dma_pfn_end(end_pfn));
 * Identity map the RMRRs so that devices with RMRRs could also use
 * the si_domain.
2298 for_each_rmrr_units(rmrr) {
2299 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2301 unsigned long long start = rmrr->base_address;
2302 unsigned long long end = rmrr->end_address;
2304 if (WARN_ON(end < start ||
2305 end >> agaw_to_width(si_domain->agaw)))
2308 ret = iommu_domain_identity_map(si_domain,
2309 mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2310 mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2319 static int dmar_domain_attach_device(struct dmar_domain *domain,
2322 struct device_domain_info *info = dev_iommu_priv_get(dev);
2323 struct intel_iommu *iommu = info->iommu;
2324 unsigned long flags;
2327 ret = domain_attach_iommu(domain, iommu);
2330 info->domain = domain;
2331 spin_lock_irqsave(&domain->lock, flags);
2332 list_add(&info->link, &domain->devices);
2333 spin_unlock_irqrestore(&domain->lock, flags);
2335 if (dev_is_real_dma_subdevice(dev))
2338 if (!sm_supported(iommu))
2339 ret = domain_context_mapping(domain, dev);
2340 else if (hw_pass_through && domain_type_is_si(domain))
2341 ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
2342 else if (domain->use_first_level)
2343 ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID);
2345 ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID);
2348 device_block_translation(dev);
2352 if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2353 iommu_enable_pci_caps(info);
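/*
 * The attach path above selects one of the translation setups below (a
 * sketch of the visible branches, not an exhaustive matrix):
 *
 *	legacy mode (!sm_supported)                 -> context-entry mapping
 *	scalable mode, identity domain, HW PT cap   -> pass-through PASID entry
 *	scalable mode, first-level domain           -> first-level PASID entry
 *	scalable mode, otherwise                    -> second-level PASID entry
 *
 * Real DMA sub-devices only join the domain's device list and skip this
 * programming; on failure the device is put back into blocking translation.
 */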
2359 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2360 * is relaxable (ie. is allowed to be not enforced under some conditions)
2361 * @dev: device handle
2363 * We assume that PCI USB devices with RMRRs have them largely
2364 * for historical reasons and that the RMRR space is not actively used post
2365 * boot. This exclusion may change if vendors begin to abuse it.
2367 * The same exception is made for graphics devices, with the requirement that
2368 * any use of the RMRR regions will be torn down before assigning the device
2371 * Return: true if the RMRR is relaxable, false otherwise
2373 static bool device_rmrr_is_relaxable(struct device *dev)
2375 struct pci_dev *pdev;
2377 if (!dev_is_pci(dev))
2380 pdev = to_pci_dev(dev);
2381 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2388 * Return the required default domain type for a specific device.
2390 * @dev: the device in query
2394 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2395 - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2396 * - 0: both identity and dynamic domains work for this device
2398 static int device_def_domain_type(struct device *dev)
2400 if (dev_is_pci(dev)) {
2401 struct pci_dev *pdev = to_pci_dev(dev);
2403 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2404 return IOMMU_DOMAIN_IDENTITY;
2406 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2407 return IOMMU_DOMAIN_IDENTITY;
2413 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2416 * Start from the sane iommu hardware state.
2417 * If the queued invalidation is already initialized by us
2418 * (for example, while enabling interrupt-remapping) then
2419 * things are already rolling from a sane state.
2423 * Clear any previous faults.
2425 dmar_fault(-1, iommu);
2427 * Disable queued invalidation if supported and already enabled
2428 * before OS handover.
2430 dmar_disable_qi(iommu);
2433 if (dmar_enable_qi(iommu)) {
2435 * Queued invalidation not enabled, use register-based invalidation
2437 iommu->flush.flush_context = __iommu_flush_context;
2438 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2439 pr_info("%s: Using Register based invalidation\n",
2442 iommu->flush.flush_context = qi_flush_context;
2443 iommu->flush.flush_iotlb = qi_flush_iotlb;
2444 pr_info("%s: Using Queued invalidation\n", iommu->name);
2448 static int copy_context_table(struct intel_iommu *iommu,
2449 struct root_entry *old_re,
2450 struct context_entry **tbl,
2453 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2454 struct context_entry *new_ce = NULL, ce;
2455 struct context_entry *old_ce = NULL;
2456 struct root_entry re;
2457 phys_addr_t old_ce_phys;
2459 tbl_idx = ext ? bus * 2 : bus;
2460 memcpy(&re, old_re, sizeof(re));
2462 for (devfn = 0; devfn < 256; devfn++) {
2463 /* First calculate the correct index */
2464 idx = (ext ? devfn * 2 : devfn) % 256;
2467 /* First save what we may have and clean up */
2469 tbl[tbl_idx] = new_ce;
2470 __iommu_flush_cache(iommu, new_ce,
2480 old_ce_phys = root_entry_lctp(&re);
2482 old_ce_phys = root_entry_uctp(&re);
2485 if (ext && devfn == 0) {
2486 /* No LCTP, try UCTP */
2495 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2500 new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2507 /* Now copy the context entry */
2508 memcpy(&ce, old_ce + idx, sizeof(ce));
2510 if (!context_present(&ce))
2513 did = context_domain_id(&ce);
2514 if (did >= 0 && did < cap_ndoms(iommu->cap))
2515 set_bit(did, iommu->domain_ids);
2517 set_context_copied(iommu, bus, devfn);
2521 tbl[tbl_idx + pos] = new_ce;
2523 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2532 static int copy_translation_tables(struct intel_iommu *iommu)
2534 struct context_entry **ctxt_tbls;
2535 struct root_entry *old_rt;
2536 phys_addr_t old_rt_phys;
2537 int ctxt_table_entries;
2542 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2543 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2544 new_ext = !!sm_supported(iommu);
2547 * The RTT bit can only be changed when translation is disabled,
2548 * but disabling translation means to open a window for data
2549 * corruption. So bail out and don't copy anything if we would
2550 * have to change the bit.
2555 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2556 if (!iommu->copied_tables)
2559 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2563 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2567 /* This is too big for the stack - allocate it from slab */
2568 ctxt_table_entries = ext ? 512 : 256;
2570 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2574 for (bus = 0; bus < 256; bus++) {
2575 ret = copy_context_table(iommu, &old_rt[bus],
2576 ctxt_tbls, bus, ext);
2578 pr_err("%s: Failed to copy context table for bus %d\n",
2584 spin_lock(&iommu->lock);
2586 /* Context tables are copied, now write them to the root_entry table */
2587 for (bus = 0; bus < 256; bus++) {
2588 int idx = ext ? bus * 2 : bus;
2591 if (ctxt_tbls[idx]) {
2592 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2593 iommu->root_entry[bus].lo = val;
2596 if (!ext || !ctxt_tbls[idx + 1])
2599 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2600 iommu->root_entry[bus].hi = val;
2603 spin_unlock(&iommu->lock);
2607 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
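/*
 * Index arithmetic used by the copy above, for illustration: in extended
 * (scalable) mode each bus owns two copied context tables, ctxt_tbls[2 * bus]
 * (devfn 0-127, referenced by the root entry's lo pointer) and
 * ctxt_tbls[2 * bus + 1] (devfn 128-255, hi pointer); legacy mode only uses
 * ctxt_tbls[bus] and root_entry[bus].lo.  E.g. bus 3, devfn 130 in extended
 * mode is copied into ctxt_tbls[7] at index (130 * 2) % 256 == 4.
 */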
2617 static int __init init_dmars(void)
2619 struct dmar_drhd_unit *drhd;
2620 struct intel_iommu *iommu;
2623 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2627 for_each_iommu(iommu, drhd) {
2628 if (drhd->ignored) {
2629 iommu_disable_translation(iommu);
2634 * Find the max pasid size of all IOMMU's in the system.
2635 * We need to ensure the system pasid table is no bigger
2636 * than the smallest supported.
2638 if (pasid_supported(iommu)) {
2639 u32 temp = 2 << ecap_pss(iommu->ecap);
2641 intel_pasid_max_id = min_t(u32, temp,
2642 intel_pasid_max_id);
2645 intel_iommu_init_qi(iommu);
2647 ret = iommu_init_domains(iommu);
2651 init_translation_status(iommu);
2653 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2654 iommu_disable_translation(iommu);
2655 clear_translation_pre_enabled(iommu);
2656 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2662 * we could share the same root & context tables
2663 * among all IOMMUs. Need to split it later.
2665 ret = iommu_alloc_root_entry(iommu);
2669 if (translation_pre_enabled(iommu)) {
2670 pr_info("Translation already enabled - trying to copy translation structures\n");
2672 ret = copy_translation_tables(iommu);
2675 * We found the IOMMU with translation
2676 * enabled - but failed to copy over the
2677 * old root-entry table. Try to proceed
2678 * by disabling translation now and
2679 * allocating a clean root-entry table.
2680 * This might cause DMAR faults, but
2681 * probably the dump will still succeed.
2683 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2685 iommu_disable_translation(iommu);
2686 clear_translation_pre_enabled(iommu);
2688 pr_info("Copied translation tables from previous kernel for %s\n",
2693 if (!ecap_pass_through(iommu->ecap))
2694 hw_pass_through = 0;
2695 intel_svm_check(iommu);
2699 * Now that qi is enabled on all iommus, set the root entry and flush
2700 * caches. This is required on some Intel X58 chipsets, otherwise the
2701 * flush_context function will loop forever and the boot hangs.
2703 for_each_active_iommu(iommu, drhd) {
2704 iommu_flush_write_buffer(iommu);
2705 iommu_set_root_entry(iommu);
2709 iommu_identity_mapping |= IDENTMAP_GFX;
2711 check_tylersburg_isoch();
2713 ret = si_domain_init(hw_pass_through);
2720 * global invalidate context cache
2721 * global invalidate iotlb
2722 * enable translation
2724 for_each_iommu(iommu, drhd) {
2725 if (drhd->ignored) {
2727 * we always have to disable PMRs or DMA may fail on this device
2731 iommu_disable_protect_mem_regions(iommu);
2735 iommu_flush_write_buffer(iommu);
2737 #ifdef CONFIG_INTEL_IOMMU_SVM
2738 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2740 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2741 * could cause a lock race condition, so drop the lock around it.
2743 up_write(&dmar_global_lock);
2744 ret = intel_svm_enable_prq(iommu);
2745 down_write(&dmar_global_lock);
2750 ret = dmar_set_interrupt(iommu);
2758 for_each_active_iommu(iommu, drhd) {
2759 disable_dmar_iommu(iommu);
2760 free_dmar_iommu(iommu);
2763 domain_exit(si_domain);
2770 static void __init init_no_remapping_devices(void)
2772 struct dmar_drhd_unit *drhd;
2776 for_each_drhd_unit(drhd) {
2777 if (!drhd->include_all) {
2778 for_each_active_dev_scope(drhd->devices,
2779 drhd->devices_cnt, i, dev)
2781 /* ignore DMAR unit if no devices exist */
2782 if (i == drhd->devices_cnt)
2787 for_each_active_drhd_unit(drhd) {
2788 if (drhd->include_all)
2791 for_each_active_dev_scope(drhd->devices,
2792 drhd->devices_cnt, i, dev)
2793 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2795 if (i < drhd->devices_cnt)
2798 /* This IOMMU has *only* gfx devices. Either bypass it or
2799 set the gfx_mapped flag, as appropriate */
2800 drhd->gfx_dedicated = 1;
2806 #ifdef CONFIG_SUSPEND
2807 static int init_iommu_hw(void)
2809 struct dmar_drhd_unit *drhd;
2810 struct intel_iommu *iommu = NULL;
2813 for_each_active_iommu(iommu, drhd) {
2815 ret = dmar_reenable_qi(iommu);
2821 for_each_iommu(iommu, drhd) {
2822 if (drhd->ignored) {
2824 * we always have to disable PMRs or DMA may fail on this device
2828 iommu_disable_protect_mem_regions(iommu);
2832 iommu_flush_write_buffer(iommu);
2833 iommu_set_root_entry(iommu);
2834 iommu_enable_translation(iommu);
2835 iommu_disable_protect_mem_regions(iommu);
2841 static void iommu_flush_all(void)
2843 struct dmar_drhd_unit *drhd;
2844 struct intel_iommu *iommu;
2846 for_each_active_iommu(iommu, drhd) {
2847 iommu->flush.flush_context(iommu, 0, 0, 0,
2848 DMA_CCMD_GLOBAL_INVL);
2849 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2850 DMA_TLB_GLOBAL_FLUSH);
2854 static int iommu_suspend(void)
2856 struct dmar_drhd_unit *drhd;
2857 struct intel_iommu *iommu = NULL;
2862 for_each_active_iommu(iommu, drhd) {
2863 iommu_disable_translation(iommu);
2865 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2867 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2868 readl(iommu->reg + DMAR_FECTL_REG);
2869 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2870 readl(iommu->reg + DMAR_FEDATA_REG);
2871 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2872 readl(iommu->reg + DMAR_FEADDR_REG);
2873 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2874 readl(iommu->reg + DMAR_FEUADDR_REG);
2876 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2881 static void iommu_resume(void)
2883 struct dmar_drhd_unit *drhd;
2884 struct intel_iommu *iommu = NULL;
2887 if (init_iommu_hw()) {
2889 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2891 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2895 for_each_active_iommu(iommu, drhd) {
2897 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2899 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2900 iommu->reg + DMAR_FECTL_REG);
2901 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2902 iommu->reg + DMAR_FEDATA_REG);
2903 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2904 iommu->reg + DMAR_FEADDR_REG);
2905 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2906 iommu->reg + DMAR_FEUADDR_REG);
2908 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2912 static struct syscore_ops iommu_syscore_ops = {
2913 .resume = iommu_resume,
2914 .suspend = iommu_suspend,
2917 static void __init init_iommu_pm_ops(void)
2919 register_syscore_ops(&iommu_syscore_ops);
2923 static inline void init_iommu_pm_ops(void) {}
2924 #endif /* CONFIG_PM */
2926 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2928 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2929 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2930 rmrr->end_address <= rmrr->base_address ||
2931 arch_rmrr_sanity_check(rmrr))
2932 return -EINVAL;
2934 return 0;
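/*
 * For illustration, the checks above accept a page-aligned, non-empty RMRR
 * such as base_address 0x7c000000 with end_address 0x7c7fffff (end + 1 is
 * 4KiB aligned), and reject, for example, a region whose base is not page
 * aligned or whose end_address lies at or below its base_address, in
 * addition to whatever arch_rmrr_sanity_check() enforces.
 */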
2937 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2939 struct acpi_dmar_reserved_memory *rmrr;
2940 struct dmar_rmrr_unit *rmrru;
2942 rmrr = (struct acpi_dmar_reserved_memory *)header;
2943 if (rmrr_sanity_check(rmrr)) {
2945 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2946 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2947 rmrr->base_address, rmrr->end_address,
2948 dmi_get_system_info(DMI_BIOS_VENDOR),
2949 dmi_get_system_info(DMI_BIOS_VERSION),
2950 dmi_get_system_info(DMI_PRODUCT_VERSION));
2951 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2954 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2958 rmrru->hdr = header;
2960 rmrru->base_address = rmrr->base_address;
2961 rmrru->end_address = rmrr->end_address;
2963 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2964 ((void *)rmrr) + rmrr->header.length,
2965 &rmrru->devices_cnt);
2966 if (rmrru->devices_cnt && rmrru->devices == NULL)
2969 list_add(&rmrru->list, &dmar_rmrr_units);
2978 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2980 struct dmar_atsr_unit *atsru;
2981 struct acpi_dmar_atsr *tmp;
2983 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2985 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2986 if (atsr->segment != tmp->segment)
2988 if (atsr->header.length != tmp->header.length)
2990 if (memcmp(atsr, tmp, atsr->header.length) == 0)
2997 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2999 struct acpi_dmar_atsr *atsr;
3000 struct dmar_atsr_unit *atsru;
3002 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3005 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3006 atsru = dmar_find_atsr(atsr);
3010 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3015 * If memory is allocated from slab by ACPI _DSM method, we need to
3016 * copy the memory content because the memory buffer will be freed on exit.
3019 atsru->hdr = (void *)(atsru + 1);
3020 memcpy(atsru->hdr, hdr, hdr->length);
3021 atsru->include_all = atsr->flags & 0x1;
3022 if (!atsru->include_all) {
3023 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3024 (void *)atsr + atsr->header.length,
3025 &atsru->devices_cnt);
3026 if (atsru->devices_cnt && atsru->devices == NULL) {
3032 list_add_rcu(&atsru->list, &dmar_atsr_units);
3037 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3039 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3043 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3045 struct acpi_dmar_atsr *atsr;
3046 struct dmar_atsr_unit *atsru;
3048 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3049 atsru = dmar_find_atsr(atsr);
3051 list_del_rcu(&atsru->list);
3053 intel_iommu_free_atsr(atsru);
3059 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3063 struct acpi_dmar_atsr *atsr;
3064 struct dmar_atsr_unit *atsru;
3066 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3067 atsru = dmar_find_atsr(atsr);
3071 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3072 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3080 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3082 struct dmar_satc_unit *satcu;
3083 struct acpi_dmar_satc *tmp;
3085 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3087 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3088 if (satc->segment != tmp->segment)
3090 if (satc->header.length != tmp->header.length)
3092 if (memcmp(satc, tmp, satc->header.length) == 0)
3099 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3101 struct acpi_dmar_satc *satc;
3102 struct dmar_satc_unit *satcu;
3104 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3107 satc = container_of(hdr, struct acpi_dmar_satc, header);
3108 satcu = dmar_find_satc(satc);
3112 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3116 satcu->hdr = (void *)(satcu + 1);
3117 memcpy(satcu->hdr, hdr, hdr->length);
3118 satcu->atc_required = satc->flags & 0x1;
3119 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3120 (void *)satc + satc->header.length,
3121 &satcu->devices_cnt);
3122 if (satcu->devices_cnt && !satcu->devices) {
3126 list_add_rcu(&satcu->list, &dmar_satc_units);
3131 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3134 struct intel_iommu *iommu = dmaru->iommu;
3136 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3140 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3141 pr_warn("%s: Doesn't support hardware pass through.\n",
3146 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3147 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3148 pr_warn("%s: Doesn't support large page.\n",
3154 * Disable translation if already enabled prior to OS handover.
3156 if (iommu->gcmd & DMA_GCMD_TE)
3157 iommu_disable_translation(iommu);
3159 ret = iommu_init_domains(iommu);
3161 ret = iommu_alloc_root_entry(iommu);
3165 intel_svm_check(iommu);
3167 if (dmaru->ignored) {
3169 * we always have to disable PMRs or DMA may fail on this device
3172 iommu_disable_protect_mem_regions(iommu);
3176 intel_iommu_init_qi(iommu);
3177 iommu_flush_write_buffer(iommu);
3179 #ifdef CONFIG_INTEL_IOMMU_SVM
3180 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3181 ret = intel_svm_enable_prq(iommu);
3186 ret = dmar_set_interrupt(iommu);
3190 iommu_set_root_entry(iommu);
3191 iommu_enable_translation(iommu);
3193 iommu_disable_protect_mem_regions(iommu);
3197 disable_dmar_iommu(iommu);
3199 free_dmar_iommu(iommu);
3203 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3206 struct intel_iommu *iommu = dmaru->iommu;
3208 if (!intel_iommu_enabled)
3214 ret = intel_iommu_add(dmaru);
3216 disable_dmar_iommu(iommu);
3217 free_dmar_iommu(iommu);
3223 static void intel_iommu_free_dmars(void)
3225 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3226 struct dmar_atsr_unit *atsru, *atsr_n;
3227 struct dmar_satc_unit *satcu, *satc_n;
3229 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3230 list_del(&rmrru->list);
3231 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3235 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3236 list_del(&atsru->list);
3237 intel_iommu_free_atsr(atsru);
3239 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3240 list_del(&satcu->list);
3241 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3246 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3248 struct dmar_satc_unit *satcu;
3249 struct acpi_dmar_satc *satc;
3253 dev = pci_physfn(dev);
3256 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3257 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3258 if (satc->segment != pci_domain_nr(dev->bus))
3260 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3261 if (to_pci_dev(tmp) == dev)
3270 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3273 struct pci_bus *bus;
3274 struct pci_dev *bridge = NULL;
3276 struct acpi_dmar_atsr *atsr;
3277 struct dmar_atsr_unit *atsru;
3278 struct dmar_satc_unit *satcu;
3280 dev = pci_physfn(dev);
3281 satcu = dmar_find_matched_satc_unit(dev);
3284 * This device supports ATS as it is in SATC table.
3285 * When the IOMMU is in legacy mode, ATS is enabled
3286 * automatically by hardware for any device that requires
3287 * it, hence the OS should not enable ATS for this device,
3288 * to avoid duplicated TLB invalidation.
3290 return !(satcu->atc_required && !sm_supported(iommu));
3292 for (bus = dev->bus; bus; bus = bus->parent) {
3294 /* If it's an integrated device, allow ATS */
3297 /* Connected via non-PCIe: no ATS */
3298 if (!pci_is_pcie(bridge) ||
3299 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3301 /* If we found the root port, look it up in the ATSR */
3302 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3307 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3308 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3309 if (atsr->segment != pci_domain_nr(dev->bus))
3312 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3313 if (tmp == &bridge->dev)
3316 if (atsru->include_all)
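/*
 * Summary of the SATC handling at the top of this function, as a sketch: a
 * device listed in a SATC unit supports ATS by definition; the only case in
 * which the OS must not enable it is atc_required set while the IOMMU runs
 * in legacy (non-scalable) mode, since hardware already enables ATS there:
 *
 *	atc_required	sm_supported(iommu)	OS may enable ATS
 *	     0			any			yes
 *	     1			 1			yes
 *	     1			 0			no
 */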
3326 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3329 struct dmar_rmrr_unit *rmrru;
3330 struct dmar_atsr_unit *atsru;
3331 struct dmar_satc_unit *satcu;
3332 struct acpi_dmar_atsr *atsr;
3333 struct acpi_dmar_reserved_memory *rmrr;
3334 struct acpi_dmar_satc *satc;
3336 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3339 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3340 rmrr = container_of(rmrru->hdr,
3341 struct acpi_dmar_reserved_memory, header);
3342 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3343 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3344 ((void *)rmrr) + rmrr->header.length,
3345 rmrr->segment, rmrru->devices,
3346 rmrru->devices_cnt);
3349 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3350 dmar_remove_dev_scope(info, rmrr->segment,
3351 rmrru->devices, rmrru->devices_cnt);
3355 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3356 if (atsru->include_all)
3359 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3360 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3361 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3362 (void *)atsr + atsr->header.length,
3363 atsr->segment, atsru->devices,
3364 atsru->devices_cnt);
3369 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3370 if (dmar_remove_dev_scope(info, atsr->segment,
3371 atsru->devices, atsru->devices_cnt))
3375 list_for_each_entry(satcu, &dmar_satc_units, list) {
3376 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3377 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3378 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3379 (void *)satc + satc->header.length,
3380 satc->segment, satcu->devices,
3381 satcu->devices_cnt);
3386 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3387 if (dmar_remove_dev_scope(info, satc->segment,
3388 satcu->devices, satcu->devices_cnt))
3396 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3397 unsigned long val, void *v)
3399 struct memory_notify *mhp = v;
3400 unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3401 unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3405 case MEM_GOING_ONLINE:
3406 if (iommu_domain_identity_map(si_domain,
3407 start_vpfn, last_vpfn)) {
3408 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3409 start_vpfn, last_vpfn);
3415 case MEM_CANCEL_ONLINE:
3417 struct dmar_drhd_unit *drhd;
3418 struct intel_iommu *iommu;
3419 LIST_HEAD(freelist);
3421 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3424 for_each_active_iommu(iommu, drhd)
3425 iommu_flush_iotlb_psi(iommu, si_domain,
3426 start_vpfn, mhp->nr_pages,
3427 list_empty(&freelist), 0);
3429 put_pages_list(&freelist);
3437 static struct notifier_block intel_iommu_memory_nb = {
3438 .notifier_call = intel_iommu_memory_notifier,
3442 static void intel_disable_iommus(void)
3444 struct intel_iommu *iommu = NULL;
3445 struct dmar_drhd_unit *drhd;
3447 for_each_iommu(iommu, drhd)
3448 iommu_disable_translation(iommu);
3451 void intel_iommu_shutdown(void)
3453 struct dmar_drhd_unit *drhd;
3454 struct intel_iommu *iommu = NULL;
3456 if (no_iommu || dmar_disabled)
3459 down_write(&dmar_global_lock);
3461 /* Disable PMRs explicitly here. */
3462 for_each_iommu(iommu, drhd)
3463 iommu_disable_protect_mem_regions(iommu);
3465 /* Make sure the IOMMUs are switched off */
3466 intel_disable_iommus();
3468 up_write(&dmar_global_lock);
3471 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3473 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3475 return container_of(iommu_dev, struct intel_iommu, iommu);
3478 static ssize_t version_show(struct device *dev,
3479 struct device_attribute *attr, char *buf)
3481 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3482 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3483 return sysfs_emit(buf, "%d:%d\n",
3484 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3486 static DEVICE_ATTR_RO(version);
3488 static ssize_t address_show(struct device *dev,
3489 struct device_attribute *attr, char *buf)
3491 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3492 return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3494 static DEVICE_ATTR_RO(address);
3496 static ssize_t cap_show(struct device *dev,
3497 struct device_attribute *attr, char *buf)
3499 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3500 return sysfs_emit(buf, "%llx\n", iommu->cap);
3502 static DEVICE_ATTR_RO(cap);
3504 static ssize_t ecap_show(struct device *dev,
3505 struct device_attribute *attr, char *buf)
3507 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3508 return sysfs_emit(buf, "%llx\n", iommu->ecap);
3510 static DEVICE_ATTR_RO(ecap);
3512 static ssize_t domains_supported_show(struct device *dev,
3513 struct device_attribute *attr, char *buf)
3515 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3516 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3518 static DEVICE_ATTR_RO(domains_supported);
3520 static ssize_t domains_used_show(struct device *dev,
3521 struct device_attribute *attr, char *buf)
3523 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3524 return sysfs_emit(buf, "%d\n",
3525 bitmap_weight(iommu->domain_ids,
3526 cap_ndoms(iommu->cap)));
3528 static DEVICE_ATTR_RO(domains_used);
3530 static struct attribute *intel_iommu_attrs[] = {
3531 &dev_attr_version.attr,
3532 &dev_attr_address.attr,
3534 &dev_attr_ecap.attr,
3535 &dev_attr_domains_supported.attr,
3536 &dev_attr_domains_used.attr,
3540 static struct attribute_group intel_iommu_group = {
3541 .name = "intel-iommu",
3542 .attrs = intel_iommu_attrs,
3545 const struct attribute_group *intel_iommu_groups[] = {
3546 &intel_iommu_group,
3547 NULL,
3548 };
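/*
 * The attributes above are exported per IOMMU through sysfs; a typical way
 * to inspect them (the paths are illustrative and depend on the registered
 * iommu device name, e.g. dmar0) is:
 *
 *	cat /sys/class/iommu/dmar0/intel-iommu/cap
 *	cat /sys/class/iommu/dmar0/intel-iommu/domains_used
 *
 * where the "intel-iommu" directory name comes from intel_iommu_group.
 */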
3550 static bool has_external_pci(void)
3552 struct pci_dev *pdev = NULL;
3554 for_each_pci_dev(pdev)
3555 if (pdev->external_facing) {
3563 static int __init platform_optin_force_iommu(void)
3565 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3568 if (no_iommu || dmar_disabled)
3569 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3572 * If Intel-IOMMU is disabled by default, we will apply identity
3573 * map for all devices except those marked as being untrusted.
3576 iommu_set_default_passthrough(false);
3584 static int __init probe_acpi_namespace_devices(void)
3586 struct dmar_drhd_unit *drhd;
3587 /* To avoid a -Wunused-but-set-variable warning. */
3588 struct intel_iommu *iommu __maybe_unused;
3592 for_each_active_iommu(iommu, drhd) {
3593 for_each_active_dev_scope(drhd->devices,
3594 drhd->devices_cnt, i, dev) {
3595 struct acpi_device_physical_node *pn;
3596 struct acpi_device *adev;
3598 if (dev->bus != &acpi_bus_type)
3601 adev = to_acpi_device(dev);
3602 mutex_lock(&adev->physical_node_lock);
3603 list_for_each_entry(pn,
3604 &adev->physical_node_list, node) {
3605 ret = iommu_probe_device(pn->dev);
3609 mutex_unlock(&adev->physical_node_lock);
3619 static __init int tboot_force_iommu(void)
3621 if (!tboot_enabled())
3624 if (no_iommu || dmar_disabled)
3625 pr_warn("Forcing Intel-IOMMU to enabled\n");
3633 int __init intel_iommu_init(void)
3636 struct dmar_drhd_unit *drhd;
3637 struct intel_iommu *iommu;
3640 * Intel IOMMU is required for a TXT/tboot launch or platform
3641 * opt in, so enforce that.
3643 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3644 platform_optin_force_iommu();
3646 down_write(&dmar_global_lock);
3647 if (dmar_table_init()) {
3649 panic("tboot: Failed to initialize DMAR table\n");
3653 if (dmar_dev_scope_init() < 0) {
3655 panic("tboot: Failed to initialize DMAR device scope\n");
3659 up_write(&dmar_global_lock);
3662 * The bus notifier takes the dmar_global_lock, so lockdep will
3663 * complain later when we register it under the lock.
3665 dmar_register_bus_notifier();
3667 down_write(&dmar_global_lock);
3670 intel_iommu_debugfs_init();
3672 if (no_iommu || dmar_disabled) {
3674 * We exit the function here to ensure IOMMU's remapping and
3675 * mempool aren't setup, which means that the IOMMU's PMRs
3676 * won't be disabled via the call to init_dmars(). So disable
3677 * it explicitly here. The PMRs were setup by tboot prior to
3678 * calling SENTER, but the kernel is expected to reset/tear
3681 if (intel_iommu_tboot_noforce) {
3682 for_each_iommu(iommu, drhd)
3683 iommu_disable_protect_mem_regions(iommu);
3687 * Make sure the IOMMUs are switched off, even when we
3688 * boot into a kexec kernel and the previous kernel left
3691 intel_disable_iommus();
3695 if (list_empty(&dmar_rmrr_units))
3696 pr_info("No RMRR found\n");
3698 if (list_empty(&dmar_atsr_units))
3699 pr_info("No ATSR found\n");
3701 if (list_empty(&dmar_satc_units))
3702 pr_info("No SATC found\n");
3704 init_no_remapping_devices();
3709 panic("tboot: Failed to initialize DMARs\n");
3710 pr_err("Initialization failed\n");
3713 up_write(&dmar_global_lock);
3715 init_iommu_pm_ops();
3717 down_read(&dmar_global_lock);
3718 for_each_active_iommu(iommu, drhd) {
3720 * The flush queue implementation does not perform
3721 * page-selective invalidations that are required for efficient
3722 * TLB flushes in virtual environments. The benefit of batching
3723 * is likely to be much lower than the overhead of synchronizing
3724 * the virtual and physical IOMMU page-tables.
3726 if (cap_caching_mode(iommu->cap) &&
3727 !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3728 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3729 iommu_set_dma_strict();
3731 iommu_device_sysfs_add(&iommu->iommu, NULL,
3734 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3736 iommu_pmu_register(iommu);
3738 up_read(&dmar_global_lock);
3740 if (si_domain && !hw_pass_through)
3741 register_memory_notifier(&intel_iommu_memory_nb);
3743 down_read(&dmar_global_lock);
3744 if (probe_acpi_namespace_devices())
3745 pr_warn("ACPI name space devices didn't probe correctly\n");
3747 /* Finally, we enable the DMA remapping hardware. */
3748 for_each_iommu(iommu, drhd) {
3749 if (!drhd->ignored && !translation_pre_enabled(iommu))
3750 iommu_enable_translation(iommu);
3752 iommu_disable_protect_mem_regions(iommu);
3754 up_read(&dmar_global_lock);
3756 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3758 intel_iommu_enabled = 1;
3763 intel_iommu_free_dmars();
3764 up_write(&dmar_global_lock);
3768 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3770 struct device_domain_info *info = opaque;
3772 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3777 * NB - intel-iommu lacks any sort of reference counting for the users of
3778 * dependent devices. If multiple endpoints have intersecting dependent
3779 * devices, unbinding the driver from any one of them will possibly leave
3780 * the others unable to operate.
3782 static void domain_context_clear(struct device_domain_info *info)
3784 if (!dev_is_pci(info->dev))
3785 domain_context_clear_one(info, info->bus, info->devfn);
3787 pci_for_each_dma_alias(to_pci_dev(info->dev),
3788 &domain_context_clear_one_cb, info);
3792 * Clear the page table pointer in context or pasid table entries so that
3793 * all DMA requests without PASID from the device are blocked. If the page
3794 * table has been set, clean up the data structures.
3796 void device_block_translation(struct device *dev)
3798 struct device_domain_info *info = dev_iommu_priv_get(dev);
3799 struct intel_iommu *iommu = info->iommu;
3800 unsigned long flags;
3802 iommu_disable_pci_caps(info);
3803 if (!dev_is_real_dma_subdevice(dev)) {
3804 if (sm_supported(iommu))
3805 intel_pasid_tear_down_entry(iommu, dev,
3806 IOMMU_NO_PASID, false);
3808 domain_context_clear(info);
3814 spin_lock_irqsave(&info->domain->lock, flags);
3815 list_del(&info->link);
3816 spin_unlock_irqrestore(&info->domain->lock, flags);
3818 domain_detach_iommu(info->domain, iommu);
3819 info->domain = NULL;
3822 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3826 /* calculate AGAW */
3827 domain->gaw = guest_width;
3828 adjust_width = guestwidth_to_adjustwidth(guest_width);
3829 domain->agaw = width_to_agaw(adjust_width);
3831 domain->iommu_coherency = false;
3832 domain->iommu_superpage = 0;
3833 domain->max_addr = 0;
3835 /* always allocate the top pgd */
3836 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
3839 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3843 static int blocking_domain_attach_dev(struct iommu_domain *domain,
3846 device_block_translation(dev);
3850 static struct iommu_domain blocking_domain = {
3851 .type = IOMMU_DOMAIN_BLOCKED,
3852 .ops = &(const struct iommu_domain_ops) {
3853 .attach_dev = blocking_domain_attach_dev,
3857 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3859 struct dmar_domain *dmar_domain;
3860 struct iommu_domain *domain;
3863 case IOMMU_DOMAIN_DMA:
3864 case IOMMU_DOMAIN_UNMANAGED:
3865 dmar_domain = alloc_domain(type);
3867 pr_err("Can't allocate dmar_domain\n");
3870 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3871 pr_err("Domain initialization failed\n");
3872 domain_exit(dmar_domain);
3876 domain = &dmar_domain->domain;
3877 domain->geometry.aperture_start = 0;
3878 domain->geometry.aperture_end =
3879 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3880 domain->geometry.force_aperture = true;
3883 case IOMMU_DOMAIN_IDENTITY:
3884 return &si_domain->domain;
3885 case IOMMU_DOMAIN_SVA:
3886 return intel_svm_domain_alloc();
3894 static struct iommu_domain *
3895 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3896 struct iommu_domain *parent,
3897 const struct iommu_user_data *user_data)
3899 struct device_domain_info *info = dev_iommu_priv_get(dev);
3900 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3901 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3902 struct intel_iommu *iommu = info->iommu;
3903 struct dmar_domain *dmar_domain;
3904 struct iommu_domain *domain;
3906 /* Must be NESTING domain */
3908 if (!nested_supported(iommu) || flags)
3909 return ERR_PTR(-EOPNOTSUPP);
3910 return intel_nested_domain_alloc(parent, user_data);
3914 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3915 return ERR_PTR(-EOPNOTSUPP);
3916 if (nested_parent && !nested_supported(iommu))
3917 return ERR_PTR(-EOPNOTSUPP);
3918 if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3919 return ERR_PTR(-EOPNOTSUPP);
3922 * The domain_alloc_user op needs to fully initialize a domain before
3923 * returning, so use iommu_domain_alloc() here for simplicity.
3925 domain = iommu_domain_alloc(dev->bus);
3927 return ERR_PTR(-ENOMEM);
3929 dmar_domain = to_dmar_domain(domain);
3931 if (nested_parent) {
3932 dmar_domain->nested_parent = true;
3933 INIT_LIST_HEAD(&dmar_domain->s1_domains);
3934 spin_lock_init(&dmar_domain->s1_lock);
3937 if (dirty_tracking) {
3938 if (dmar_domain->use_first_level) {
3939 iommu_domain_free(domain);
3940 return ERR_PTR(-EOPNOTSUPP);
3942 domain->dirty_ops = &intel_dirty_ops;
3948 static void intel_iommu_domain_free(struct iommu_domain *domain)
3950 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3952 WARN_ON(dmar_domain->nested_parent &&
3953 !list_empty(&dmar_domain->s1_domains));
3954 if (domain != &si_domain->domain)
3955 domain_exit(dmar_domain);
3958 int prepare_domain_attach_device(struct iommu_domain *domain,
3961 struct device_domain_info *info = dev_iommu_priv_get(dev);
3962 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3963 struct intel_iommu *iommu = info->iommu;
3966 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3969 if (domain->dirty_ops && !ssads_supported(iommu))
3972 /* check if this iommu agaw is sufficient for max mapped address */
3973 addr_width = agaw_to_width(iommu->agaw);
3974 if (addr_width > cap_mgaw(iommu->cap))
3975 addr_width = cap_mgaw(iommu->cap);
3977 if (dmar_domain->max_addr > (1LL << addr_width))
3979 dmar_domain->gaw = addr_width;
3982 * Knock out extra levels of page tables if necessary
3984 while (iommu->agaw < dmar_domain->agaw) {
3985 struct dma_pte *pte;
3987 pte = dmar_domain->pgd;
3988 if (dma_pte_present(pte)) {
3989 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
3990 free_pgtable_page(pte);
3992 dmar_domain->agaw--;
3995 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3996 context_copied(iommu, info->bus, info->devfn))
3997 return intel_pasid_setup_sm_context(dev);
4002 static int intel_iommu_attach_device(struct iommu_domain *domain,
4005 struct device_domain_info *info = dev_iommu_priv_get(dev);
4009 device_block_translation(dev);
4011 ret = prepare_domain_attach_device(domain, dev);
4015 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4018 static int intel_iommu_map(struct iommu_domain *domain,
4019 unsigned long iova, phys_addr_t hpa,
4020 size_t size, int iommu_prot, gfp_t gfp)
4022 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4026 if (iommu_prot & IOMMU_READ)
4027 prot |= DMA_PTE_READ;
4028 if (iommu_prot & IOMMU_WRITE)
4029 prot |= DMA_PTE_WRITE;
4030 if (dmar_domain->set_pte_snp)
4031 prot |= DMA_PTE_SNP;
4033 max_addr = iova + size;
4034 if (dmar_domain->max_addr < max_addr) {
4037 /* check if minimum agaw is sufficient for mapped address */
4038 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4039 if (end < max_addr) {
4040 pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4041 __func__, dmar_domain->gaw, max_addr);
4045 dmar_domain->max_addr = max_addr;
4047 /* Round up size to next multiple of PAGE_SIZE, if it and
4048 the low bits of hpa would take us onto the next page */
4049 size = aligned_nrpages(hpa, size);
4050 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4051 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
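/*
 * Illustration of the rounding above (assuming aligned_nrpages() counts the
 * VTD pages spanned by hpa .. hpa + size): a mapping of size 0x2000 whose
 * hpa has low bits 0x800 touches three 4KiB pages, so 'size' becomes three
 * page frames even though the byte count alone covers exactly two pages.
 */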
4054 static int intel_iommu_map_pages(struct iommu_domain *domain,
4055 unsigned long iova, phys_addr_t paddr,
4056 size_t pgsize, size_t pgcount,
4057 int prot, gfp_t gfp, size_t *mapped)
4059 unsigned long pgshift = __ffs(pgsize);
4060 size_t size = pgcount << pgshift;
4063 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4066 if (!IS_ALIGNED(iova | paddr, pgsize))
4069 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4076 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4077 unsigned long iova, size_t size,
4078 struct iommu_iotlb_gather *gather)
4080 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4081 unsigned long start_pfn, last_pfn;
4084 /* Cope with horrid API which requires us to unmap more than the
4085 size argument if it happens to be a large-page mapping. */
4086 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4087 &level, GFP_ATOMIC)))
4090 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4091 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4093 start_pfn = iova >> VTD_PAGE_SHIFT;
4094 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4096 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4098 if (dmar_domain->max_addr == iova + size)
4099 dmar_domain->max_addr = iova;
4102 * We do not use page-selective IOTLB invalidation in flush queue,
4103 * so there is no need to track page and sync iotlb.
4105 if (!iommu_iotlb_gather_queued(gather))
4106 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4111 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4113 size_t pgsize, size_t pgcount,
4114 struct iommu_iotlb_gather *gather)
4116 unsigned long pgshift = __ffs(pgsize);
4117 size_t size = pgcount << pgshift;
4119 return intel_iommu_unmap(domain, iova, size, gather);
4122 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4123 struct iommu_iotlb_gather *gather)
4125 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4126 unsigned long iova_pfn = IOVA_PFN(gather->start);
4127 size_t size = gather->end - gather->start;
4128 struct iommu_domain_info *info;
4129 unsigned long start_pfn;
4130 unsigned long nrpages;
4133 nrpages = aligned_nrpages(gather->start, size);
4134 start_pfn = mm_to_dma_pfn_start(iova_pfn);
4136 xa_for_each(&dmar_domain->iommu_array, i, info)
4137 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4139 list_empty(&gather->freelist), 0);
4141 if (dmar_domain->nested_parent)
4142 parent_domain_flush(dmar_domain, start_pfn, nrpages,
4143 list_empty(&gather->freelist));
4144 put_pages_list(&gather->freelist);
4147 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4150 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4151 struct dma_pte *pte;
4155 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4157 if (pte && dma_pte_present(pte))
4158 phys = dma_pte_addr(pte) +
4159 (iova & (BIT_MASK(level_to_offset_bits(level) +
4160 VTD_PAGE_SHIFT) - 1));
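/*
 * Worked example of the address math above (a sketch, assuming the usual
 * 9-bit stride where level_to_offset_bits(level) == (level - 1) * 9): a
 * 2MiB superpage is found at level 2, the mask becomes
 * (1UL << (9 + 12)) - 1 == 0x1fffff, and the result is the superpage base
 * taken from the PTE plus the low 21 bits of the IOVA.
 */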
4165 static bool domain_support_force_snooping(struct dmar_domain *domain)
4167 struct device_domain_info *info;
4168 bool support = true;
4170 assert_spin_locked(&domain->lock);
4171 list_for_each_entry(info, &domain->devices, link) {
4172 if (!ecap_sc_support(info->iommu->ecap)) {
4181 static void domain_set_force_snooping(struct dmar_domain *domain)
4183 struct device_domain_info *info;
4185 assert_spin_locked(&domain->lock);
4187 * Second level page table supports per-PTE snoop control. The
4188 * iommu_map() interface will handle this by setting the SNP bit.
4190 if (!domain->use_first_level) {
4191 domain->set_pte_snp = true;
4195 list_for_each_entry(info, &domain->devices, link)
4196 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4200 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4202 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4203 unsigned long flags;
4205 if (dmar_domain->force_snooping)
4208 spin_lock_irqsave(&dmar_domain->lock, flags);
4209 if (!domain_support_force_snooping(dmar_domain) ||
4210 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4211 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4215 domain_set_force_snooping(dmar_domain);
4216 dmar_domain->force_snooping = true;
4217 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4222 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4224 struct device_domain_info *info = dev_iommu_priv_get(dev);
4227 case IOMMU_CAP_CACHE_COHERENCY:
4228 case IOMMU_CAP_DEFERRED_FLUSH:
4230 case IOMMU_CAP_PRE_BOOT_PROTECTION:
4231 return dmar_platform_optin();
4232 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4233 return ecap_sc_support(info->iommu->ecap);
4234 case IOMMU_CAP_DIRTY_TRACKING:
4235 return ssads_supported(info->iommu);
4241 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4243 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4244 struct device_domain_info *info;
4245 struct intel_iommu *iommu;
4249 iommu = device_lookup_iommu(dev, &bus, &devfn);
4250 if (!iommu || !iommu->iommu.ops)
4251 return ERR_PTR(-ENODEV);
4253 info = kzalloc(sizeof(*info), GFP_KERNEL);
4255 return ERR_PTR(-ENOMEM);
4257 if (dev_is_real_dma_subdevice(dev)) {
4258 info->bus = pdev->bus->number;
4259 info->devfn = pdev->devfn;
4260 info->segment = pci_domain_nr(pdev->bus);
4263 info->devfn = devfn;
4264 info->segment = iommu->segment;
4268 info->iommu = iommu;
4269 if (dev_is_pci(dev)) {
4270 if (ecap_dev_iotlb_support(iommu->ecap) &&
4271 pci_ats_supported(pdev) &&
4272 dmar_ats_supported(pdev, iommu)) {
4273 info->ats_supported = 1;
4274 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4277 * For an IOMMU that supports device IOTLB throttling
4278 * (DIT), we assign the PFSID to the invalidation desc
4279 * of a VF so that IOMMU HW can gauge queue depth
4280 * at PF level. If DIT is not set, the PFSID field is
4281 * treated as reserved and should be set to 0.
4283 if (ecap_dit(iommu->ecap))
4284 info->pfsid = pci_dev_id(pci_physfn(pdev));
4285 info->ats_qdep = pci_ats_queue_depth(pdev);
4287 if (sm_supported(iommu)) {
4288 if (pasid_supported(iommu)) {
4289 int features = pci_pasid_features(pdev);
4292 info->pasid_supported = features | 1;
4295 if (info->ats_supported && ecap_prs(iommu->ecap) &&
4296 pci_pri_supported(pdev))
4297 info->pri_supported = 1;
4301 dev_iommu_priv_set(dev, info);
4302 if (pdev && pci_ats_supported(pdev)) {
4303 ret = device_rbtree_insert(iommu, info);
4308 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4309 ret = intel_pasid_alloc_table(dev);
4311 dev_err(dev, "PASID table allocation failed\n");
4315 if (!context_copied(iommu, info->bus, info->devfn)) {
4316 ret = intel_pasid_setup_sm_context(dev);
4322 intel_iommu_debugfs_create_dev(info);
4324 return &iommu->iommu;
4326 intel_pasid_free_table(dev);
4328 device_rbtree_remove(info);
4332 return ERR_PTR(ret);
4335 static void intel_iommu_release_device(struct device *dev)
4337 struct device_domain_info *info = dev_iommu_priv_get(dev);
4338 struct intel_iommu *iommu = info->iommu;
4340 mutex_lock(&iommu->iopf_lock);
4341 if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
4342 device_rbtree_remove(info);
4343 mutex_unlock(&iommu->iopf_lock);
4345 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
4346 !context_copied(iommu, info->bus, info->devfn))
4347 intel_pasid_teardown_sm_context(dev);
4349 intel_pasid_free_table(dev);
4350 intel_iommu_debugfs_remove_dev(info);
4352 set_dma_ops(dev, NULL);
4355 static void intel_iommu_probe_finalize(struct device *dev)
4357 set_dma_ops(dev, NULL);
4358 iommu_setup_dma_ops(dev, 0, U64_MAX);
4361 static void intel_iommu_get_resv_regions(struct device *device,
4362 struct list_head *head)
4364 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4365 struct iommu_resv_region *reg;
4366 struct dmar_rmrr_unit *rmrr;
4367 struct device *i_dev;
4371 for_each_rmrr_units(rmrr) {
4372 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4374 struct iommu_resv_region *resv;
4375 enum iommu_resv_type type;
4378 if (i_dev != device &&
4379 !is_downstream_to_pci_bridge(device, i_dev))
4382 length = rmrr->end_address - rmrr->base_address + 1;
4384 type = device_rmrr_is_relaxable(device) ?
4385 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4387 resv = iommu_alloc_resv_region(rmrr->base_address,
4393 list_add_tail(&resv->list, head);
4398 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4399 if (dev_is_pci(device)) {
4400 struct pci_dev *pdev = to_pci_dev(device);
4402 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4403 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4404 IOMMU_RESV_DIRECT_RELAXABLE,
4407 list_add_tail(®->list, head);
4410 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4412 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4413 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4414 0, IOMMU_RESV_MSI, GFP_KERNEL);
4417 list_add_tail(®->list, head);
4420 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4422 if (dev_is_pci(dev))
4423 return pci_device_group(dev);
4424 return generic_device_group(dev);
4427 static int intel_iommu_enable_sva(struct device *dev)
4429 struct device_domain_info *info = dev_iommu_priv_get(dev);
4430 struct intel_iommu *iommu;
4432 if (!info || dmar_disabled)
4435 iommu = info->iommu;
4439 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4442 if (!info->pasid_enabled || !info->ats_enabled)
4446 * Devices having device-specific I/O fault handling should not
4447 * support PCI/PRI. The IOMMU side has no means to check the
4448 * capability of device-specific IOPF. Therefore, the IOMMU can only
4449 * assume that if the device driver enables SVA on a non-PRI
4450 * device, it will handle IOPF in its own way.
4452 if (!info->pri_supported)
4455 /* Devices supporting PRI should have it enabled. */
4456 if (!info->pri_enabled)
4462 static int intel_iommu_enable_iopf(struct device *dev)
4464 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4465 struct device_domain_info *info = dev_iommu_priv_get(dev);
4466 struct intel_iommu *iommu;
4469 if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4472 if (info->pri_enabled)
4475 iommu = info->iommu;
4479 /* PASID is required in PRG Response Message. */
4480 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4483 ret = pci_reset_pri(pdev);
4487 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4491 ret = pci_enable_pri(pdev, PRQ_DEPTH);
4493 iopf_queue_remove_device(iommu->iopf_queue, dev);
4497 info->pri_enabled = 1;
4502 static int intel_iommu_disable_iopf(struct device *dev)
4504 struct device_domain_info *info = dev_iommu_priv_get(dev);
4505 struct intel_iommu *iommu = info->iommu;
4507 if (!info->pri_enabled)
4511 * PCIe spec states that by clearing PRI enable bit, the Page
4512 * Request Interface will not issue new page requests, but may still have
4513 * outstanding page requests that have been transmitted or are
4514 * queued for transmission. This is supposed to be called after
4515 * the device driver has stopped DMA, all PASIDs have been
4516 * unbound and the outstanding PRQs have been drained.
4518 pci_disable_pri(to_pci_dev(dev));
4519 info->pri_enabled = 0;
4520 iopf_queue_remove_device(iommu->iopf_queue, dev);
4526 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4529 case IOMMU_DEV_FEAT_IOPF:
4530 return intel_iommu_enable_iopf(dev);
4532 case IOMMU_DEV_FEAT_SVA:
4533 return intel_iommu_enable_sva(dev);
4541 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4544 case IOMMU_DEV_FEAT_IOPF:
4545 return intel_iommu_disable_iopf(dev);
4547 case IOMMU_DEV_FEAT_SVA:
4555 static bool intel_iommu_is_attach_deferred(struct device *dev)
4557 struct device_domain_info *info = dev_iommu_priv_get(dev);
4559 return translation_pre_enabled(info->iommu) && !info->domain;
4563 * Check that the device does not live on an external facing PCI port that is
4564 * marked as untrusted. Such devices should not be able to apply quirks and
4565 * thus not be able to bypass the IOMMU restrictions.
4567 static bool risky_device(struct pci_dev *pdev)
4569 if (pdev->untrusted) {
4571 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4572 pdev->vendor, pdev->device);
4573 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4579 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4580 unsigned long iova, size_t size)
4582 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4583 unsigned long pages = aligned_nrpages(iova, size);
4584 unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4585 struct iommu_domain_info *info;
4588 xa_for_each(&dmar_domain->iommu_array, i, info)
4589 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4593 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4595 struct device_domain_info *info = dev_iommu_priv_get(dev);
4596 struct dev_pasid_info *curr, *dev_pasid = NULL;
4597 struct intel_iommu *iommu = info->iommu;
4598 struct dmar_domain *dmar_domain;
4599 struct iommu_domain *domain;
4600 unsigned long flags;
4602 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4603 if (WARN_ON_ONCE(!domain))
4607 * The SVA implementation needs to handle its own bookkeeping, such as
4608 * the mm notification. Before consolidating that code into the iommu
4609 * core, let the intel sva code handle it.
4611 if (domain->type == IOMMU_DOMAIN_SVA) {
4612 intel_svm_remove_dev_pasid(dev, pasid);
4616 dmar_domain = to_dmar_domain(domain);
4617 spin_lock_irqsave(&dmar_domain->lock, flags);
4618 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4619 if (curr->dev == dev && curr->pasid == pasid) {
4620 list_del(&curr->link_domain);
4625 WARN_ON_ONCE(!dev_pasid);
4626 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4628 domain_detach_iommu(dmar_domain, iommu);
4629 intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4632 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4633 intel_drain_pasid_prq(dev, pasid);
4636 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4637 struct device *dev, ioasid_t pasid)
4639 struct device_domain_info *info = dev_iommu_priv_get(dev);
4640 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4641 struct intel_iommu *iommu = info->iommu;
4642 struct dev_pasid_info *dev_pasid;
4643 unsigned long flags;
4646 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4649 if (domain->dirty_ops)
4652 if (context_copied(iommu, info->bus, info->devfn))
4655 ret = prepare_domain_attach_device(domain, dev);
4659 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4663 ret = domain_attach_iommu(dmar_domain, iommu);
4667 if (domain_type_is_si(dmar_domain))
4668 ret = intel_pasid_setup_pass_through(iommu, dev, pasid);
4669 else if (dmar_domain->use_first_level)
4670 ret = domain_setup_first_level(iommu, dmar_domain,
4673 ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4676 goto out_detach_iommu;
4678 dev_pasid->dev = dev;
4679 dev_pasid->pasid = pasid;
4680 spin_lock_irqsave(&dmar_domain->lock, flags);
4681 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4682 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4684 if (domain->type & __IOMMU_DOMAIN_PAGING)
4685 intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4689 domain_detach_iommu(dmar_domain, iommu);
4695 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4697 struct device_domain_info *info = dev_iommu_priv_get(dev);
4698 struct intel_iommu *iommu = info->iommu;
4699 struct iommu_hw_info_vtd *vtd;
4701 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4703 return ERR_PTR(-ENOMEM);
4705 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4706 vtd->cap_reg = iommu->cap;
4707 vtd->ecap_reg = iommu->ecap;
4708 *length = sizeof(*vtd);
4709 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4714 * Set dirty tracking for the device list of a domain. The caller must
4715 * hold the domain->lock when calling it.
4717 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4719 struct device_domain_info *info;
4722 list_for_each_entry(info, devices, link) {
4723 ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4724 IOMMU_NO_PASID, enable);
4732 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4735 struct dmar_domain *s1_domain;
4736 unsigned long flags;
4739 spin_lock(&domain->s1_lock);
4740 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4741 spin_lock_irqsave(&s1_domain->lock, flags);
4742 ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4743 spin_unlock_irqrestore(&s1_domain->lock, flags);
4747 spin_unlock(&domain->s1_lock);
4751 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4752 spin_lock_irqsave(&s1_domain->lock, flags);
4753 device_set_dirty_tracking(&s1_domain->devices,
4754 domain->dirty_tracking);
4755 spin_unlock_irqrestore(&s1_domain->lock, flags);
4757 spin_unlock(&domain->s1_lock);
4761 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4764 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4767 spin_lock(&dmar_domain->lock);
4768 if (dmar_domain->dirty_tracking == enable)
4771 ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4775 if (dmar_domain->nested_parent) {
4776 ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4781 dmar_domain->dirty_tracking = enable;
4783 spin_unlock(&dmar_domain->lock);
4788 device_set_dirty_tracking(&dmar_domain->devices,
4789 dmar_domain->dirty_tracking);
4790 spin_unlock(&dmar_domain->lock);
4794 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4795 unsigned long iova, size_t size,
4796 unsigned long flags,
4797 struct iommu_dirty_bitmap *dirty)
4799 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4800 unsigned long end = iova + size - 1;
4801 unsigned long pgsize;
4804 * IOMMUFD core calls into a dirty tracking disabled domain without an
4805 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4806 * have occurred when we stopped dirty tracking. This ensures that we
4807 * never inherit dirtied bits from a previous cycle.
4809 if (!dmar_domain->dirty_tracking && dirty->bitmap)
4813 struct dma_pte *pte;
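		/* Find the leaf PTE covering this IOVA and the size it maps. */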
4816 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4818 pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4819 if (!pte || !dma_pte_present(pte)) {
4824 if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4825 iommu_dirty_bitmap_record(dirty, iova, pgsize);
4827 } while (iova < end);
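/* Dirty-tracking callbacks exposed to the IOMMU core for this driver. */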
4832 static const struct iommu_dirty_ops intel_dirty_ops = {
4833 .set_dirty_tracking = intel_iommu_set_dirty_tracking,
4834 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4837 const struct iommu_ops intel_iommu_ops = {
4838 .blocked_domain = &blocking_domain,
4839 .release_domain = &blocking_domain,
4840 .capable = intel_iommu_capable,
4841 .hw_info = intel_iommu_hw_info,
4842 .domain_alloc = intel_iommu_domain_alloc,
4843 .domain_alloc_user = intel_iommu_domain_alloc_user,
4844 .probe_device = intel_iommu_probe_device,
4845 .probe_finalize = intel_iommu_probe_finalize,
4846 .release_device = intel_iommu_release_device,
4847 .get_resv_regions = intel_iommu_get_resv_regions,
4848 .device_group = intel_iommu_device_group,
4849 .dev_enable_feat = intel_iommu_dev_enable_feat,
4850 .dev_disable_feat = intel_iommu_dev_disable_feat,
4851 .is_attach_deferred = intel_iommu_is_attach_deferred,
4852 .def_domain_type = device_def_domain_type,
4853 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
4854 .pgsize_bitmap = SZ_4K,
4855 #ifdef CONFIG_INTEL_IOMMU_SVM
4856 .page_response = intel_svm_page_response,
#endif
4858 .default_domain_ops = &(const struct iommu_domain_ops) {
4859 .attach_dev = intel_iommu_attach_device,
4860 .set_dev_pasid = intel_iommu_set_dev_pasid,
4861 .map_pages = intel_iommu_map_pages,
4862 .unmap_pages = intel_iommu_unmap_pages,
4863 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4864 .flush_iotlb_all = intel_flush_iotlb_all,
4865 .iotlb_sync = intel_iommu_tlb_sync,
4866 .iova_to_phys = intel_iommu_iova_to_phys,
4867 .free = intel_iommu_domain_free,
4868 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
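/*
 * Integrated graphics on the chipsets below cannot be safely translated by
 * the IOMMU; turn off DMA remapping for the graphics device (dmar_map_gfx).
 */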
4872 static void quirk_iommu_igfx(struct pci_dev *dev)
4874 if (risky_device(dev))
4877 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4881 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4882 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4883 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4884 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4885 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4886 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4887 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4888 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4890 /* Broadwell igfx malfunctions with dmar */
4891 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4892 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4893 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4894 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4895 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4896 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4897 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4898 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4899 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4900 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4901 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4902 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4903 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4904 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4905 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4906 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4907 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4908 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4909 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4910 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4911 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4912 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4913 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4914 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4916 static void quirk_iommu_rwbf(struct pci_dev *dev)
4918 if (risky_device(dev))
4922 * Mobile 4 Series Chipset neglects to set RWBF capability,
4923 * but needs it. Same seems to hold for the desktop versions.
4925 pci_info(dev, "Forcing write-buffer flush capability\n");
4929 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4930 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4931 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4932 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4933 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4934 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4935 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
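/* Fields of the GGC (graphics control) register read by the quirk below. */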
4938 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4939 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4940 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4941 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4942 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4943 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4944 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4945 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
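/*
 * Ironlake/Calpella: if the BIOS allocated no VT-enabled (shadow) GTT, the
 * integrated graphics cannot be put behind the IOMMU, so translation is
 * disabled for it. Otherwise force strict IOTLB flushing, since the GPU
 * must be idle before its mappings are flushed.
 */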
4947 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4951 if (risky_device(dev))
4954 if (pci_read_config_word(dev, GGC, &ggc))
4957 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4958 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4960 } else if (dmar_map_gfx) {
4961 /* we have to ensure the gfx device is idle before we flush */
4962 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4963 iommu_set_dma_strict();
4966 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4967 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4968 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4969 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
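/*
 * For the integrated graphics device IDs matched below, leave translation
 * enabled on the graphics-dedicated DMAR unit when translation is being
 * torn down (iommu_skip_te_disable); clearing TE there while graphics DMA
 * is still active can hang the system.
 */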
4971 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4975 if (!IS_GFX_DEVICE(dev))
4978 ver = (dev->device >> 8) & 0xff;
4979 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4980 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4981 ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4984 if (risky_device(dev))
4987 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4988 iommu_skip_te_disable = 1;
4990 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4992 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4993 ISOCH DMAR unit for the Azalia sound device, but not give it any
4994 TLB entries, which causes it to deadlock. Check for that. We do
4995 this in a function called from init_dmars(), instead of in a PCI
4996 quirk, because we don't want to print the obnoxious "BIOS broken"
4997 message if VT-d is actually disabled.
4999 static void __init check_tylersburg_isoch(void)
5001 struct pci_dev *pdev;
5002 uint32_t vtisochctrl;
5004 /* If there's no Azalia in the system anyway, forget it. */
5005 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5009 if (risky_device(pdev)) {
5016 /* System Management Registers. Might be hidden, in which case
5017 we can't do the sanity check. But that's OK, because the
5018 known-broken BIOSes _don't_ actually hide it, so far. */
5019 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5023 if (risky_device(pdev)) {
5028 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5035 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5036 if (vtisochctrl & 1)
5039 /* Drop all bits other than the number of TLB entries */
5040 vtisochctrl &= 0x1c;
5042 /* If we have the recommended number of TLB entries (16), fine. */
5043 if (vtisochctrl == 0x10)
5046 /* Zero TLB entries? You get to ride the short bus to school. */
5048 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5049 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5050 dmi_get_system_info(DMI_BIOS_VENDOR),
5051 dmi_get_system_info(DMI_BIOS_VERSION),
5052 dmi_get_system_info(DMI_PRODUCT_VERSION));
5053 iommu_identity_mapping |= IDENTMAP_AZALIA;
5057 pr_warn("Recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
5062 * Here we deal with a device TLB defect where the device may inadvertently issue ATS
5063 * invalidation completions before posted writes that were initiated with a translated
5064 * address and used translations matching the invalidation address range, violating
5065 * the invalidation completion ordering.
5066 * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
5067 * vulnerable to this defect. In other words, any dTLB invalidation initiated not
5068 * under the control of the trusted/privileged host device driver must use this quirk.
5070 * Device TLBs are invalidated under the following six conditions:
5071 * 1. Device driver unmaps an IOVA via the DMA API
5072 * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
5073 * 3. PASID is torn down, after the PASID cache is flushed, e.g. on process
5074 *    exit_mmap() due to a crash
5075 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where the
5076 *    VM has to free pages that were unmapped
5077 * 5. Userspace driver unmaps a DMA buffer
5078 * 6. Cache invalidation in vSVA usage (upcoming)
5080 * For #1 and #2, device drivers are responsible for stopping DMA traffic
5081 * before unmap/unbind. For #3, the iommu driver is invoked via an mmu_notifier
5082 * to invalidate the TLB the same way as a normal user unmap, which will use this quirk.
5083 * The dTLB invalidation after the PASID cache flush does not need this quirk.
5085 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5087 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5088 unsigned long address, unsigned long mask,
5089 u32 pasid, u16 qdep)
5093 if (likely(!info->dtlb_extra_inval))
5096 sid = PCI_DEVID(info->bus, info->devfn);
5097 if (pasid == IOMMU_NO_PASID) {
5098 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5099 qdep, address, mask);
5101 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5102 pasid, qdep, address, mask);
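/* The ecmd status code lives in bits 7:1 of the ECRSP register value. */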
5106 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
5109 * Function to submit a command to the enhanced command interface. The
5110 * valid enhanced command descriptions are defined in Table 47 of the
5111 * VT-d spec. The VT-d hardware implementation may support some but not
5112 * all commands, which can be determined by checking the Enhanced
5113 * Command Capability Register.
 * Return values:
5116 * - 0: Command successful without any error;
5117 * - Negative: software error value;
5118 * - Nonzero positive: failure status code defined in Table 48.
5120 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5122 unsigned long flags;
5126 if (!cap_ecmds(iommu->cap))
5129 raw_spin_lock_irqsave(&iommu->register_lock, flags);
5131 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5132 if (res & DMA_ECMD_ECRSP_IP) {
5138 * Unconditionally write operand B, because:
5139 * - There is no side effect if an ecmd doesn't require an
5140 *   operand B, even though the register is written anyway.
5141 * - This is not invoked in any critical path, so the extra MMIO
5142 *   write raises no performance concern.
5144 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5145 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5147 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5148 !(res & DMA_ECMD_ECRSP_IP), res);
5150 if (res & DMA_ECMD_ECRSP_IP) {
5155 ret = ecmd_get_status_code(res);
5157 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);