1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva.h"
32 #include "cap_audit.h"
35 #define ROOT_SIZE VTD_PAGE_SIZE
36 #define CONTEXT_SIZE VTD_PAGE_SIZE
38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
43 #define IOAPIC_RANGE_START (0xfee00000)
44 #define IOAPIC_RANGE_END (0xfeefffff)
45 #define IOVA_START_ADDR (0x1000)
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
49 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
50 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
52 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
53 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
54 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
55 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
56 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
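/*
 * Worked example, assuming VTD_PAGE_SHIFT == 12: for gaw == 48,
 * __DOMAIN_MAX_PFN(48) == 2^36 - 1 and __DOMAIN_MAX_ADDR(48) == 2^48 - 1.
 * On a 64-bit build DOMAIN_MAX_PFN(48) is therefore also 2^36 - 1; with a
 * 32-bit unsigned long it would clamp to ULONG_MAX instead.
 */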
58 /* IO virtual address start page frame number */
59 #define IOVA_START_PFN (1)
61 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
63 static void __init check_tylersburg_isoch(void);
64 static int rwbf_quirk;
67 * set to 1 to panic the kernel if VT-d cannot be enabled successfully
68 * (used when the kernel is launched with TXT)
70 static int force_on = 0;
71 static int intel_iommu_tboot_noforce;
72 static int no_platform_optin;
74 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
77 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
80 static phys_addr_t root_entry_lctp(struct root_entry *re)
85 return re->lo & VTD_PAGE_MASK;
89 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
92 static phys_addr_t root_entry_uctp(struct root_entry *re)
97 return re->hi & VTD_PAGE_MASK;
101 * This domain is a static identity mapping domain.
102 * 1. This domain creates a static 1:1 mapping to all usable memory.
103 * 2. It maps to each iommu if successful.
104 * 3. Each iommu maps to this domain if successful.
106 static struct dmar_domain *si_domain;
107 static int hw_pass_through = 1;
109 struct dmar_rmrr_unit {
110 struct list_head list; /* list of rmrr units */
111 struct acpi_dmar_header *hdr; /* ACPI header */
112 u64 base_address; /* reserved base address*/
113 u64 end_address; /* reserved end address */
114 struct dmar_dev_scope *devices; /* target devices */
115 int devices_cnt; /* target device count */
118 struct dmar_atsr_unit {
119 struct list_head list; /* list of ATSR units */
120 struct acpi_dmar_header *hdr; /* ACPI header */
121 struct dmar_dev_scope *devices; /* target devices */
122 int devices_cnt; /* target device count */
123 u8 include_all:1; /* include all ports */
126 struct dmar_satc_unit {
127 struct list_head list; /* list of SATC units */
128 struct acpi_dmar_header *hdr; /* ACPI header */
129 struct dmar_dev_scope *devices; /* target devices */
130 struct intel_iommu *iommu; /* the corresponding iommu */
131 int devices_cnt; /* target device count */
132 u8 atc_required:1; /* ATS is required */
135 static LIST_HEAD(dmar_atsr_units);
136 static LIST_HEAD(dmar_rmrr_units);
137 static LIST_HEAD(dmar_satc_units);
139 #define for_each_rmrr_units(rmrr) \
140 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
142 static void intel_iommu_domain_free(struct iommu_domain *domain);
144 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
145 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
147 int intel_iommu_enabled = 0;
148 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
150 static int dmar_map_gfx = 1;
151 static int intel_iommu_superpage = 1;
152 static int iommu_identity_mapping;
153 static int iommu_skip_te_disable;
155 #define IDENTMAP_GFX 2
156 #define IDENTMAP_AZALIA 4
158 const struct iommu_ops intel_iommu_ops;
159 static const struct iommu_dirty_ops intel_dirty_ops;
161 static bool translation_pre_enabled(struct intel_iommu *iommu)
163 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
166 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
168 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
171 static void init_translation_status(struct intel_iommu *iommu)
175 gsts = readl(iommu->reg + DMAR_GSTS_REG);
176 if (gsts & DMA_GSTS_TES)
177 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
180 static int __init intel_iommu_setup(char *str)
186 if (!strncmp(str, "on", 2)) {
188 pr_info("IOMMU enabled\n");
189 } else if (!strncmp(str, "off", 3)) {
191 no_platform_optin = 1;
192 pr_info("IOMMU disabled\n");
193 } else if (!strncmp(str, "igfx_off", 8)) {
195 pr_info("Disable GFX device mapping\n");
196 } else if (!strncmp(str, "forcedac", 8)) {
197 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
198 iommu_dma_forcedac = true;
199 } else if (!strncmp(str, "strict", 6)) {
200 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
201 iommu_set_dma_strict();
202 } else if (!strncmp(str, "sp_off", 6)) {
203 pr_info("Disable supported super page\n");
204 intel_iommu_superpage = 0;
205 } else if (!strncmp(str, "sm_on", 5)) {
206 pr_info("Enable scalable mode if hardware supports\n");
208 } else if (!strncmp(str, "sm_off", 6)) {
209 pr_info("Scalable mode is disallowed\n");
211 } else if (!strncmp(str, "tboot_noforce", 13)) {
212 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
213 intel_iommu_tboot_noforce = 1;
215 pr_notice("Unknown option - '%s'\n", str);
218 str += strcspn(str, ",");
225 __setup("intel_iommu=", intel_iommu_setup);
227 void *alloc_pgtable_page(int node, gfp_t gfp)
232 page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
234 vaddr = page_address(page);
238 void free_pgtable_page(void *vaddr)
240 free_page((unsigned long)vaddr);
243 static int domain_type_is_si(struct dmar_domain *domain)
245 return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
248 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
250 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
252 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
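/*
 * For example, an agaw corresponding to a 48-bit address width gives
 * addr_width == 36, so any pfn with bits set at or above bit 36 (an IOVA
 * at or above 1 << 48) is reported as unsupported.
 */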
256 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
257 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
258 * the returned SAGAW.
260 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
262 unsigned long fl_sagaw, sl_sagaw;
264 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
265 sl_sagaw = cap_sagaw(iommu->cap);
267 /* Second level only. */
268 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
271 /* First level only. */
272 if (!ecap_slts(iommu->ecap))
275 return fl_sagaw & sl_sagaw;
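/*
 * Per the SAGAW encoding referenced above, BIT(2) is the 4-level (48-bit)
 * width and BIT(3) the 5-level (57-bit) width. As an illustration: with
 * scalable mode, first-level 5-level paging support and cap_sagaw() == 0x4,
 * the intersection is 0x4, i.e. only the 4-level width is usable by both
 * first- and second-level translation.
 */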
278 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
283 sagaw = __iommu_calculate_sagaw(iommu);
284 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
285 if (test_bit(agaw, &sagaw))
293 * Calculate max SAGAW for each iommu.
295 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
297 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
301 * calculate agaw for each iommu.
302 * "SAGAW" may differ across iommus, so use a default agaw and fall back
303 * to a smaller supported agaw for iommus that don't support the default.
305 int iommu_calculate_agaw(struct intel_iommu *iommu)
307 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
310 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
312 return sm_supported(iommu) ?
313 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
316 static void domain_update_iommu_coherency(struct dmar_domain *domain)
318 struct iommu_domain_info *info;
319 struct dmar_drhd_unit *drhd;
320 struct intel_iommu *iommu;
324 domain->iommu_coherency = true;
325 xa_for_each(&domain->iommu_array, i, info) {
327 if (!iommu_paging_structure_coherency(info->iommu)) {
328 domain->iommu_coherency = false;
335 /* No hardware attached; use lowest common denominator */
337 for_each_active_iommu(iommu, drhd) {
338 if (!iommu_paging_structure_coherency(iommu)) {
339 domain->iommu_coherency = false;
346 static int domain_update_iommu_superpage(struct dmar_domain *domain,
347 struct intel_iommu *skip)
349 struct dmar_drhd_unit *drhd;
350 struct intel_iommu *iommu;
353 if (!intel_iommu_superpage)
356 /* set iommu_superpage to the smallest common denominator */
358 for_each_active_iommu(iommu, drhd) {
360 if (domain && domain->use_first_level) {
361 if (!cap_fl1gp_support(iommu->cap))
364 mask &= cap_super_page_val(iommu->cap);
376 static int domain_update_device_node(struct dmar_domain *domain)
378 struct device_domain_info *info;
379 int nid = NUMA_NO_NODE;
382 spin_lock_irqsave(&domain->lock, flags);
383 list_for_each_entry(info, &domain->devices, link) {
385 * There could be multiple device NUMA nodes, as devices within
386 * the same domain may sit behind different IOMMUs. There is no
387 * perfect answer in such a situation, so we use a first-come,
388 * first-served policy.
390 nid = dev_to_node(info->dev);
391 if (nid != NUMA_NO_NODE)
394 spin_unlock_irqrestore(&domain->lock, flags);
399 static void domain_update_iotlb(struct dmar_domain *domain);
401 /* Return the super pagesize bitmap if supported. */
402 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
404 unsigned long bitmap = 0;
407 * A 1-level super page supports a page size of 2MiB; a 2-level super
408 * page supports page sizes of both 2MiB and 1GiB.
410 if (domain->iommu_superpage == 1)
412 else if (domain->iommu_superpage == 2)
413 bitmap |= SZ_2M | SZ_1G;
418 /* Some capabilities may be different across iommus */
419 void domain_update_iommu_cap(struct dmar_domain *domain)
421 domain_update_iommu_coherency(domain);
422 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
425 * If RHSA is missing, we should default to the device numa domain
428 if (domain->nid == NUMA_NO_NODE)
429 domain->nid = domain_update_device_node(domain);
432 * First-level translation restricts the input-address to a
433 * canonical address (i.e., address bits 63:N have the same
434 * value as address bit [N-1], where N is 48-bits with 4-level
435 * paging and 57-bits with 5-level paging). Hence, skip bit
438 if (domain->use_first_level)
439 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
441 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
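/*
 * Example: with gaw == 48, second-level translation exposes an aperture of
 * [0, 2^48 - 1], while first-level translation drops the top bit and
 * exposes [0, 2^47 - 1] so that all IOVAs stay canonical.
 */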
443 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
444 domain_update_iotlb(domain);
447 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
450 struct root_entry *root = &iommu->root_entry[bus];
451 struct context_entry *context;
455 * Unless the caller requested that a new entry be allocated,
456 * returning a copied context entry makes no sense.
458 if (!alloc && context_copied(iommu, bus, devfn))
462 if (sm_supported(iommu)) {
470 context = phys_to_virt(*entry & VTD_PAGE_MASK);
472 unsigned long phy_addr;
476 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
480 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
481 phy_addr = virt_to_phys((void *)context);
482 *entry = phy_addr | 1;
483 __iommu_flush_cache(iommu, entry, sizeof(*entry));
485 return &context[devfn];
489 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
490 * sub-hierarchy of a candidate PCI-PCI bridge
491 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
492 * @bridge: the candidate PCI-PCI bridge
494 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
497 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
499 struct pci_dev *pdev, *pbridge;
501 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
504 pdev = to_pci_dev(dev);
505 pbridge = to_pci_dev(bridge);
507 if (pbridge->subordinate &&
508 pbridge->subordinate->number <= pdev->bus->number &&
509 pbridge->subordinate->busn_res.end >= pdev->bus->number)
515 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
517 struct dmar_drhd_unit *drhd;
521 /* We know that this device on this chipset has its own IOMMU.
522 * If we find it under a different IOMMU, then the BIOS is lying
523 * to us. Hope that the IOMMU for this device is actually
524 * disabled, and it needs no translation...
526 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
529 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
534 /* we know that this iommu should be at offset 0xa000 from vtbar */
535 drhd = dmar_find_matched_drhd_unit(pdev);
536 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
537 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
538 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
545 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
547 if (!iommu || iommu->drhd->ignored)
550 if (dev_is_pci(dev)) {
551 struct pci_dev *pdev = to_pci_dev(dev);
553 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
554 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
555 quirk_ioat_snb_local_iommu(pdev))
562 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
564 struct dmar_drhd_unit *drhd = NULL;
565 struct pci_dev *pdev = NULL;
566 struct intel_iommu *iommu;
574 if (dev_is_pci(dev)) {
575 struct pci_dev *pf_pdev;
577 pdev = pci_real_dma_dev(to_pci_dev(dev));
579 /* VFs aren't listed in scope tables; we need to look up
580 * the PF instead to find the IOMMU. */
581 pf_pdev = pci_physfn(pdev);
583 segment = pci_domain_nr(pdev->bus);
584 } else if (has_acpi_companion(dev))
585 dev = &ACPI_COMPANION(dev)->dev;
588 for_each_iommu(iommu, drhd) {
589 if (pdev && segment != drhd->segment)
592 for_each_active_dev_scope(drhd->devices,
593 drhd->devices_cnt, i, tmp) {
595 /* For a VF use its original BDF# not that of the PF
596 * which we used for the IOMMU lookup. Strictly speaking
597 * we could do this for all PCI devices; we only need to
598 * get the BDF# from the scope table for ACPI matches. */
599 if (pdev && pdev->is_virtfn)
603 *bus = drhd->devices[i].bus;
604 *devfn = drhd->devices[i].devfn;
609 if (is_downstream_to_pci_bridge(dev, tmp))
613 if (pdev && drhd->include_all) {
616 *bus = pdev->bus->number;
617 *devfn = pdev->devfn;
624 if (iommu_is_dummy(iommu, dev))
632 static void domain_flush_cache(struct dmar_domain *domain,
633 void *addr, int size)
635 if (!domain->iommu_coherency)
636 clflush_cache_range(addr, size);
639 static void free_context_table(struct intel_iommu *iommu)
641 struct context_entry *context;
644 if (!iommu->root_entry)
647 for (i = 0; i < ROOT_ENTRY_NR; i++) {
648 context = iommu_context_addr(iommu, i, 0, 0);
650 free_pgtable_page(context);
652 if (!sm_supported(iommu))
655 context = iommu_context_addr(iommu, i, 0x80, 0);
657 free_pgtable_page(context);
660 free_pgtable_page(iommu->root_entry);
661 iommu->root_entry = NULL;
664 #ifdef CONFIG_DMAR_DEBUG
665 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
666 u8 bus, u8 devfn, struct dma_pte *parent, int level)
672 offset = pfn_level_offset(pfn, level);
673 pte = &parent[offset];
674 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
675 pr_info("PTE not present at level %d\n", level);
679 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
684 parent = phys_to_virt(dma_pte_addr(pte));
689 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
690 unsigned long long addr, u32 pasid)
692 struct pasid_dir_entry *dir, *pde;
693 struct pasid_entry *entries, *pte;
694 struct context_entry *ctx_entry;
695 struct root_entry *rt_entry;
696 int i, dir_index, index, level;
697 u8 devfn = source_id & 0xff;
698 u8 bus = source_id >> 8;
699 struct dma_pte *pgtable;
701 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
703 /* root entry dump */
704 rt_entry = &iommu->root_entry[bus];
706 pr_info("root table entry is not present\n");
710 if (sm_supported(iommu))
711 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
712 rt_entry->hi, rt_entry->lo);
714 pr_info("root entry: 0x%016llx", rt_entry->lo);
716 /* context entry dump */
717 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
719 pr_info("context table entry is not present\n");
723 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
724 ctx_entry->hi, ctx_entry->lo);
726 /* legacy mode does not require PASID entries */
727 if (!sm_supported(iommu)) {
728 level = agaw_to_level(ctx_entry->hi & 7);
729 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
733 /* get the pointer to pasid directory entry */
734 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
736 pr_info("pasid directory entry is not present\n");
739 /* For request-without-pasid, get the pasid from context entry */
740 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
741 pasid = IOMMU_NO_PASID;
743 dir_index = pasid >> PASID_PDE_SHIFT;
744 pde = &dir[dir_index];
745 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
747 /* get the pointer to the pasid table entry */
748 entries = get_pasid_table_from_pde(pde);
750 pr_info("pasid table entry is not present\n");
753 index = pasid & PASID_PTE_MASK;
754 pte = &entries[index];
755 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
756 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
758 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
759 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
760 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
762 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
763 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
767 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
771 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
772 unsigned long pfn, int *target_level,
775 struct dma_pte *parent, *pte;
776 int level = agaw_to_level(domain->agaw);
779 if (!domain_pfn_supported(domain, pfn))
780 /* Address beyond IOMMU's addressing capabilities. */
783 parent = domain->pgd;
788 offset = pfn_level_offset(pfn, level);
789 pte = &parent[offset];
790 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
792 if (level == *target_level)
795 if (!dma_pte_present(pte)) {
798 tmp_page = alloc_pgtable_page(domain->nid, gfp);
803 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
804 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
805 if (domain->use_first_level)
806 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
808 if (cmpxchg64(&pte->val, 0ULL, pteval))
809 /* Someone else set it while we were thinking; use theirs. */
810 free_pgtable_page(tmp_page);
812 domain_flush_cache(domain, pte, sizeof(*pte));
817 parent = phys_to_virt(dma_pte_addr(pte));
822 *target_level = level;
827 /* return the pte of an address at a specific level */
828 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
830 int level, int *large_page)
832 struct dma_pte *parent, *pte;
833 int total = agaw_to_level(domain->agaw);
836 parent = domain->pgd;
837 while (level <= total) {
838 offset = pfn_level_offset(pfn, total);
839 pte = &parent[offset];
843 if (!dma_pte_present(pte)) {
848 if (dma_pte_superpage(pte)) {
853 parent = phys_to_virt(dma_pte_addr(pte));
859 /* clear last level pte; a tlb flush must follow */
860 static void dma_pte_clear_range(struct dmar_domain *domain,
861 unsigned long start_pfn,
862 unsigned long last_pfn)
864 unsigned int large_page;
865 struct dma_pte *first_pte, *pte;
867 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
868 WARN_ON(start_pfn > last_pfn))
871 /* we don't need lock here; nobody else touches the iova range */
874 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
876 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
881 start_pfn += lvl_to_nr_pages(large_page);
883 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
885 domain_flush_cache(domain, first_pte,
886 (void *)pte - (void *)first_pte);
888 } while (start_pfn && start_pfn <= last_pfn);
891 static void dma_pte_free_level(struct dmar_domain *domain, int level,
892 int retain_level, struct dma_pte *pte,
893 unsigned long pfn, unsigned long start_pfn,
894 unsigned long last_pfn)
896 pfn = max(start_pfn, pfn);
897 pte = &pte[pfn_level_offset(pfn, level)];
900 unsigned long level_pfn;
901 struct dma_pte *level_pte;
903 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
906 level_pfn = pfn & level_mask(level);
907 level_pte = phys_to_virt(dma_pte_addr(pte));
910 dma_pte_free_level(domain, level - 1, retain_level,
911 level_pte, level_pfn, start_pfn,
916 * Free the page table if we're below the level we want to
917 * retain and the range covers the entire table.
919 if (level < retain_level && !(start_pfn > level_pfn ||
920 last_pfn < level_pfn + level_size(level) - 1)) {
922 domain_flush_cache(domain, pte, sizeof(*pte));
923 free_pgtable_page(level_pte);
926 pfn += level_size(level);
927 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
931 * clear last level (leaf) ptes and free page table pages below the
932 * level we wish to keep intact.
934 static void dma_pte_free_pagetable(struct dmar_domain *domain,
935 unsigned long start_pfn,
936 unsigned long last_pfn,
939 dma_pte_clear_range(domain, start_pfn, last_pfn);
941 /* We don't need lock here; nobody else touches the iova range */
942 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
943 domain->pgd, 0, start_pfn, last_pfn);
946 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
947 free_pgtable_page(domain->pgd);
952 /* When a page at a given level is being unlinked from its parent, we don't
953 need to *modify* it at all. All we need to do is make a list of all the
954 pages which can be freed just as soon as we've flushed the IOTLB and we
955 know the hardware page-walk will no longer touch them.
956 The 'pte' argument is the *parent* PTE, pointing to the page that is to be freed. */
958 static void dma_pte_list_pagetables(struct dmar_domain *domain,
959 int level, struct dma_pte *pte,
960 struct list_head *freelist)
964 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
965 list_add_tail(&pg->lru, freelist);
970 pte = page_address(pg);
972 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
973 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
975 } while (!first_pte_in_page(pte));
978 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
979 struct dma_pte *pte, unsigned long pfn,
980 unsigned long start_pfn, unsigned long last_pfn,
981 struct list_head *freelist)
983 struct dma_pte *first_pte = NULL, *last_pte = NULL;
985 pfn = max(start_pfn, pfn);
986 pte = &pte[pfn_level_offset(pfn, level)];
989 unsigned long level_pfn = pfn & level_mask(level);
991 if (!dma_pte_present(pte))
994 /* If range covers entire pagetable, free it */
995 if (start_pfn <= level_pfn &&
996 last_pfn >= level_pfn + level_size(level) - 1) {
997 /* These subordinate page tables are going away entirely. Don't
998 bother to clear them; we're just going to *free* them. */
999 if (level > 1 && !dma_pte_superpage(pte))
1000 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1006 } else if (level > 1) {
1007 /* Recurse down into a level that isn't *entirely* obsolete */
1008 dma_pte_clear_level(domain, level - 1,
1009 phys_to_virt(dma_pte_addr(pte)),
1010 level_pfn, start_pfn, last_pfn,
1014 pfn = level_pfn + level_size(level);
1015 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1018 domain_flush_cache(domain, first_pte,
1019 (void *)++last_pte - (void *)first_pte);
1022 /* We can't just free the pages because the IOMMU may still be walking
1023 the page tables, and may have cached the intermediate levels. The
1024 pages can only be freed after the IOTLB flush has been done. */
1025 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1026 unsigned long last_pfn, struct list_head *freelist)
1028 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1029 WARN_ON(start_pfn > last_pfn))
1032 /* we don't need lock here; nobody else touches the iova range */
1033 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1034 domain->pgd, 0, start_pfn, last_pfn, freelist);
1037 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1038 struct page *pgd_page = virt_to_page(domain->pgd);
1039 list_add_tail(&pgd_page->lru, freelist);
1044 /* iommu handling */
1045 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1047 struct root_entry *root;
1049 root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1051 pr_err("Allocating root entry for %s failed\n",
1056 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1057 iommu->root_entry = root;
1062 static void iommu_set_root_entry(struct intel_iommu *iommu)
1068 addr = virt_to_phys(iommu->root_entry);
1069 if (sm_supported(iommu))
1070 addr |= DMA_RTADDR_SMT;
1072 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1073 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1075 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1077 /* Make sure hardware completes it */
1078 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1079 readl, (sts & DMA_GSTS_RTPS), sts);
1081 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1084 * Hardware invalidates all DMA remapping hardware translation
1085 * caches as part of SRTP flow.
1087 if (cap_esrtps(iommu->cap))
1090 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1091 if (sm_supported(iommu))
1092 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1093 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1096 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1101 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1104 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1105 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1107 /* Make sure hardware completes it */
1108 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1109 readl, (!(val & DMA_GSTS_WBFS)), val);
1111 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1114 /* return value determines whether we need a write buffer flush */
1115 static void __iommu_flush_context(struct intel_iommu *iommu,
1116 u16 did, u16 source_id, u8 function_mask,
1123 case DMA_CCMD_GLOBAL_INVL:
1124 val = DMA_CCMD_GLOBAL_INVL;
1126 case DMA_CCMD_DOMAIN_INVL:
1127 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1129 case DMA_CCMD_DEVICE_INVL:
1130 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1131 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1134 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1138 val |= DMA_CCMD_ICC;
1140 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1141 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1143 /* Make sure hardware completes it */
1144 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1145 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1147 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1150 /* return value determines whether we need a write buffer flush */
1151 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1152 u64 addr, unsigned int size_order, u64 type)
1154 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1155 u64 val = 0, val_iva = 0;
1159 case DMA_TLB_GLOBAL_FLUSH:
1160 /* global flush doesn't need to set IVA_REG */
1161 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1163 case DMA_TLB_DSI_FLUSH:
1164 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1166 case DMA_TLB_PSI_FLUSH:
1167 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1168 /* IH bit is passed in as part of address */
1169 val_iva = size_order | addr;
1172 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1177 if (cap_write_drain(iommu->cap))
1178 val |= DMA_TLB_WRITE_DRAIN;
1180 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1181 /* Note: Only uses first TLB reg currently */
1183 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1184 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1186 /* Make sure hardware completes it */
1187 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1188 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1190 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1192 /* check IOTLB invalidation granularity */
1193 if (DMA_TLB_IAIG(val) == 0)
1194 pr_err("Flush IOTLB failed\n");
1195 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1196 pr_debug("TLB flush request %Lx, actual %Lx\n",
1197 (unsigned long long)DMA_TLB_IIRG(type),
1198 (unsigned long long)DMA_TLB_IAIG(val));
1201 static struct device_domain_info *
1202 domain_lookup_dev_info(struct dmar_domain *domain,
1203 struct intel_iommu *iommu, u8 bus, u8 devfn)
1205 struct device_domain_info *info;
1206 unsigned long flags;
1208 spin_lock_irqsave(&domain->lock, flags);
1209 list_for_each_entry(info, &domain->devices, link) {
1210 if (info->iommu == iommu && info->bus == bus &&
1211 info->devfn == devfn) {
1212 spin_unlock_irqrestore(&domain->lock, flags);
1216 spin_unlock_irqrestore(&domain->lock, flags);
1221 static void domain_update_iotlb(struct dmar_domain *domain)
1223 struct dev_pasid_info *dev_pasid;
1224 struct device_domain_info *info;
1225 bool has_iotlb_device = false;
1226 unsigned long flags;
1228 spin_lock_irqsave(&domain->lock, flags);
1229 list_for_each_entry(info, &domain->devices, link) {
1230 if (info->ats_enabled) {
1231 has_iotlb_device = true;
1236 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1237 info = dev_iommu_priv_get(dev_pasid->dev);
1238 if (info->ats_enabled) {
1239 has_iotlb_device = true;
1243 domain->has_iotlb_device = has_iotlb_device;
1244 spin_unlock_irqrestore(&domain->lock, flags);
1248 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1249 * IDs ranging from 0x4940 to 0x4943. It is exempted from the risky_device()
1250 * check because it applies only to the built-in QAT devices and it doesn't
1251 * grant additional privileges.
1253 #define BUGGY_QAT_DEVID_MASK 0x4940
1254 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1256 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1259 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1265 static void iommu_enable_pci_caps(struct device_domain_info *info)
1267 struct pci_dev *pdev;
1269 if (!dev_is_pci(info->dev))
1272 pdev = to_pci_dev(info->dev);
1274 /* The PCIe spec, in its wisdom, declares that the behaviour of
1275 the device if you enable PASID support after ATS support is
1276 undefined. So always enable PASID support on devices which
1277 have it, even if we can't yet know if we're ever going to use it. */
1279 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1280 info->pasid_enabled = 1;
1282 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1283 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1284 info->ats_enabled = 1;
1285 domain_update_iotlb(info->domain);
1289 static void iommu_disable_pci_caps(struct device_domain_info *info)
1291 struct pci_dev *pdev;
1293 if (!dev_is_pci(info->dev))
1296 pdev = to_pci_dev(info->dev);
1298 if (info->ats_enabled) {
1299 pci_disable_ats(pdev);
1300 info->ats_enabled = 0;
1301 domain_update_iotlb(info->domain);
1304 if (info->pasid_enabled) {
1305 pci_disable_pasid(pdev);
1306 info->pasid_enabled = 0;
1310 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1311 u64 addr, unsigned int mask)
1315 if (!info || !info->ats_enabled)
1318 sid = info->bus << 8 | info->devfn;
1319 qdep = info->ats_qdep;
1320 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1322 quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
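/* The sid used above is the PCI requester ID, e.g. bus 0x3a, devfn 0x10 encode to 0x3a10. */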
1325 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1326 u64 addr, unsigned mask)
1328 struct dev_pasid_info *dev_pasid;
1329 struct device_domain_info *info;
1330 unsigned long flags;
1332 if (!domain->has_iotlb_device)
1335 spin_lock_irqsave(&domain->lock, flags);
1336 list_for_each_entry(info, &domain->devices, link)
1337 __iommu_flush_dev_iotlb(info, addr, mask);
1339 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1340 info = dev_iommu_priv_get(dev_pasid->dev);
1342 if (!info->ats_enabled)
1345 qi_flush_dev_iotlb_pasid(info->iommu,
1346 PCI_DEVID(info->bus, info->devfn),
1347 info->pfsid, dev_pasid->pasid,
1348 info->ats_qdep, addr,
1351 spin_unlock_irqrestore(&domain->lock, flags);
1354 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1355 struct dmar_domain *domain, u64 addr,
1356 unsigned long npages, bool ih)
1358 u16 did = domain_id_iommu(domain, iommu);
1359 struct dev_pasid_info *dev_pasid;
1360 unsigned long flags;
1362 spin_lock_irqsave(&domain->lock, flags);
1363 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1364 qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1366 if (!list_empty(&domain->devices))
1367 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1368 spin_unlock_irqrestore(&domain->lock, flags);
1371 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1372 struct dmar_domain *domain,
1373 unsigned long pfn, unsigned int pages,
1376 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1377 unsigned int mask = ilog2(aligned_pages);
1378 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1379 u16 did = domain_id_iommu(domain, iommu);
1381 if (WARN_ON(!pages))
1387 if (domain->use_first_level) {
1388 domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1390 unsigned long bitmask = aligned_pages - 1;
1393 * PSI masks the low order bits of the base address. If the
1394 * address isn't aligned to the mask, then compute a mask value
1395 * needed to ensure the target range is flushed.
1397 if (unlikely(bitmask & pfn)) {
1398 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1401 * Since end_pfn <= pfn + bitmask, the only way bits
1402 * higher than bitmask can differ in pfn and end_pfn is
1403 * by carrying. This means after masking out bitmask,
1404 * high bits starting with the first set bit in
1405 * shared_bits are all equal in both pfn and end_pfn.
1407 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1408 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
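/*
 * Worked example: pfn == 0x1005 and pages == 4 give aligned_pages == 4
 * and bitmask == 3. The pfn is not 4-page aligned, so end_pfn == 0x1008,
 * pfn ^ end_pfn == 0xd and shared_bits == ~0xd & ~0x3, whose lowest set
 * bit is bit 4. The resulting mask of 4 flushes the aligned 16-page block
 * 0x1000-0x100f, which covers 0x1005-0x1008.
 */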
1412 * Fall back to domain selective flush if there is no PSI support or
1413 * the size is too big.
1415 if (!cap_pgsel_inv(iommu->cap) ||
1416 mask > cap_max_amask_val(iommu->cap))
1417 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1420 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1425 * In caching mode, changes of pages from non-present to present require
1426 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1428 if (!cap_caching_mode(iommu->cap) || !map)
1429 iommu_flush_dev_iotlb(domain, addr, mask);
1432 /* Notification for newly created mappings */
1433 static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain,
1434 unsigned long pfn, unsigned int pages)
1437 * It's a non-present to present mapping. Only flush if caching mode
1440 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1441 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1443 iommu_flush_write_buffer(iommu);
1446 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1448 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1449 struct iommu_domain_info *info;
1452 xa_for_each(&dmar_domain->iommu_array, idx, info) {
1453 struct intel_iommu *iommu = info->iommu;
1454 u16 did = domain_id_iommu(dmar_domain, iommu);
1456 if (dmar_domain->use_first_level)
1457 domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1459 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1462 if (!cap_caching_mode(iommu->cap))
1463 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1467 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1470 unsigned long flags;
1472 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1475 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1476 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1477 pmen &= ~DMA_PMEN_EPM;
1478 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1480 /* wait for the protected region status bit to clear */
1481 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1482 readl, !(pmen & DMA_PMEN_PRS), pmen);
1484 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1487 static void iommu_enable_translation(struct intel_iommu *iommu)
1490 unsigned long flags;
1492 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1493 iommu->gcmd |= DMA_GCMD_TE;
1494 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1496 /* Make sure hardware completes it */
1497 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1498 readl, (sts & DMA_GSTS_TES), sts);
1500 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1503 static void iommu_disable_translation(struct intel_iommu *iommu)
1508 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1509 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1512 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1513 iommu->gcmd &= ~DMA_GCMD_TE;
1514 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1516 /* Make sure hardware completes it */
1517 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1518 readl, (!(sts & DMA_GSTS_TES)), sts);
1520 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1523 static int iommu_init_domains(struct intel_iommu *iommu)
1527 ndomains = cap_ndoms(iommu->cap);
1528 pr_debug("%s: Number of Domains supported <%d>\n",
1529 iommu->name, ndomains);
1531 spin_lock_init(&iommu->lock);
1533 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1534 if (!iommu->domain_ids)
1538 * If Caching mode is set, then invalid translations are tagged
1539 * with domain-id 0, hence we need to pre-allocate it. We also
1540 * use domain-id 0 as a marker for non-allocated domain-id, so
1541 * make sure it is not used for a real domain.
1543 set_bit(0, iommu->domain_ids);
1546 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1547 * entry for first-level or pass-through translation modes should
1548 * be programmed with a domain id different from those used for
1549 * second-level or nested translation. We reserve a domain id for
1552 if (sm_supported(iommu))
1553 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1558 static void disable_dmar_iommu(struct intel_iommu *iommu)
1560 if (!iommu->domain_ids)
1564 * All iommu domains must have been detached from the devices,
1565 * hence there should be no domain IDs in use.
1567 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1568 > NUM_RESERVED_DID))
1571 if (iommu->gcmd & DMA_GCMD_TE)
1572 iommu_disable_translation(iommu);
1575 static void free_dmar_iommu(struct intel_iommu *iommu)
1577 if (iommu->domain_ids) {
1578 bitmap_free(iommu->domain_ids);
1579 iommu->domain_ids = NULL;
1582 if (iommu->copied_tables) {
1583 bitmap_free(iommu->copied_tables);
1584 iommu->copied_tables = NULL;
1587 /* free context mapping */
1588 free_context_table(iommu);
1590 #ifdef CONFIG_INTEL_IOMMU_SVM
1591 if (pasid_supported(iommu)) {
1592 if (ecap_prs(iommu->ecap))
1593 intel_svm_finish_prq(iommu);
1599 * Check and return whether first level is used by default for
1602 static bool first_level_by_default(unsigned int type)
1604 /* Only SL is available in legacy mode */
1605 if (!scalable_mode_support())
1608 /* Only one level (either FL or SL) is available, just use it */
1609 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1610 return intel_cap_flts_sanity();
1612 /* Both levels are available, decide it based on domain type */
1613 return type != IOMMU_DOMAIN_UNMANAGED;
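/*
 * In other words, when both levels are available, kernel-managed DMA and
 * identity domains default to first-level translation, while
 * IOMMU_DOMAIN_UNMANAGED domains (typically used for device assignment)
 * stay on second-level translation.
 */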
1616 static struct dmar_domain *alloc_domain(unsigned int type)
1618 struct dmar_domain *domain;
1620 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1624 domain->nid = NUMA_NO_NODE;
1625 if (first_level_by_default(type))
1626 domain->use_first_level = true;
1627 domain->has_iotlb_device = false;
1628 INIT_LIST_HEAD(&domain->devices);
1629 INIT_LIST_HEAD(&domain->dev_pasids);
1630 spin_lock_init(&domain->lock);
1631 xa_init(&domain->iommu_array);
1636 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1638 struct iommu_domain_info *info, *curr;
1639 unsigned long ndomains;
1640 int num, ret = -ENOSPC;
1642 info = kzalloc(sizeof(*info), GFP_KERNEL);
1646 spin_lock(&iommu->lock);
1647 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1650 spin_unlock(&iommu->lock);
1655 ndomains = cap_ndoms(iommu->cap);
1656 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1657 if (num >= ndomains) {
1658 pr_err("%s: No free domain ids\n", iommu->name);
1662 set_bit(num, iommu->domain_ids);
1665 info->iommu = iommu;
1666 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1667 NULL, info, GFP_ATOMIC);
1669 ret = xa_err(curr) ? : -EBUSY;
1672 domain_update_iommu_cap(domain);
1674 spin_unlock(&iommu->lock);
1678 clear_bit(info->did, iommu->domain_ids);
1680 spin_unlock(&iommu->lock);
1685 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1687 struct iommu_domain_info *info;
1689 spin_lock(&iommu->lock);
1690 info = xa_load(&domain->iommu_array, iommu->seq_id);
1691 if (--info->refcnt == 0) {
1692 clear_bit(info->did, iommu->domain_ids);
1693 xa_erase(&domain->iommu_array, iommu->seq_id);
1694 domain->nid = NUMA_NO_NODE;
1695 domain_update_iommu_cap(domain);
1698 spin_unlock(&iommu->lock);
1701 static int guestwidth_to_adjustwidth(int gaw)
1704 int r = (gaw - 12) % 9;
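/*
 * Here r measures how far (gaw - 12) is from a multiple of 9, i.e. from a
 * whole number of 9-bit page-table levels: a guest width of 39 or 48 gives
 * r == 0, while a guest width of 40 gives r == 1.
 */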
1715 static void domain_exit(struct dmar_domain *domain)
1718 LIST_HEAD(freelist);
1720 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1721 put_pages_list(&freelist);
1724 if (WARN_ON(!list_empty(&domain->devices)))
1731 * Get the PASID directory size for scalable mode context entry.
1732 * Value of X in the PDTS field of a scalable mode context entry
1733 * indicates PASID directory with 2^(X + 7) entries.
1735 static unsigned long context_get_sm_pds(struct pasid_table *table)
1737 unsigned long pds, max_pde;
1739 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1740 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
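/*
 * Example, assuming PASID_PDE_SHIFT is 6 (64 PASID entries per directory
 * entry): max_pasid == 1 << 20 gives max_pde == 1 << 14, find_first_bit()
 * returns 14, and per the 2^(X + 7) rule above the PDTS field ends up
 * encoding X == 7.
 */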
1747 static int domain_context_mapping_one(struct dmar_domain *domain,
1748 struct intel_iommu *iommu,
1749 struct pasid_table *table,
1752 struct device_domain_info *info =
1753 domain_lookup_dev_info(domain, iommu, bus, devfn);
1754 u16 did = domain_id_iommu(domain, iommu);
1755 int translation = CONTEXT_TT_MULTI_LEVEL;
1756 struct context_entry *context;
1759 if (hw_pass_through && domain_type_is_si(domain))
1760 translation = CONTEXT_TT_PASS_THROUGH;
1762 pr_debug("Set context mapping for %02x:%02x.%d\n",
1763 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1765 spin_lock(&iommu->lock);
1767 context = iommu_context_addr(iommu, bus, devfn, 1);
1772 if (context_present(context) && !context_copied(iommu, bus, devfn))
1776 * For kdump cases, old valid entries may be cached due to the
1777 * in-flight DMA and copied pgtable, but there is no unmapping
1778 * behaviour for them, thus we need an explicit cache flush for
1779 * the newly-mapped device. For kdump, at this point, the device
1780 * is supposed to finish reset at its driver probe stage, so no
1781 * in-flight DMA will exist, and we don't need to worry anymore
1784 if (context_copied(iommu, bus, devfn)) {
1785 u16 did_old = context_domain_id(context);
1787 if (did_old < cap_ndoms(iommu->cap)) {
1788 iommu->flush.flush_context(iommu, did_old,
1789 (((u16)bus) << 8) | devfn,
1790 DMA_CCMD_MASK_NOBIT,
1791 DMA_CCMD_DEVICE_INVL);
1792 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1796 clear_context_copied(iommu, bus, devfn);
1799 context_clear_entry(context);
1801 if (sm_supported(iommu)) {
1804 /* Setup the PASID DIR pointer: */
1805 pds = context_get_sm_pds(table);
1806 context->lo = (u64)virt_to_phys(table->table) |
1809 /* Setup the RID_PASID field: */
1810 context_set_sm_rid2pasid(context, IOMMU_NO_PASID);
1813 * Setup the Device-TLB enable bit and Page request
1816 if (info && info->ats_supported)
1817 context_set_sm_dte(context);
1818 if (info && info->pri_supported)
1819 context_set_sm_pre(context);
1820 if (info && info->pasid_supported)
1821 context_set_pasid(context);
1823 struct dma_pte *pgd = domain->pgd;
1826 context_set_domain_id(context, did);
1828 if (translation != CONTEXT_TT_PASS_THROUGH) {
1830 * Skip top levels of page tables for an iommu which has
1831 * a smaller agaw than the default. Unnecessary for PT mode.
1833 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1835 pgd = phys_to_virt(dma_pte_addr(pgd));
1836 if (!dma_pte_present(pgd))
1840 if (info && info->ats_supported)
1841 translation = CONTEXT_TT_DEV_IOTLB;
1843 translation = CONTEXT_TT_MULTI_LEVEL;
1845 context_set_address_root(context, virt_to_phys(pgd));
1846 context_set_address_width(context, agaw);
1849 * In pass through mode, AW must be programmed to
1850 * indicate the largest AGAW value supported by
1851 * hardware. And ASR is ignored by hardware.
1853 context_set_address_width(context, iommu->msagaw);
1856 context_set_translation_type(context, translation);
1859 context_set_fault_enable(context);
1860 context_set_present(context);
1861 if (!ecap_coherent(iommu->ecap))
1862 clflush_cache_range(context, sizeof(*context));
1865 * It's a non-present to present mapping. If hardware doesn't cache
1866 * non-present entries we only need to flush the write-buffer. If it
1867 * _does_ cache non-present entries, then it does so in the special
1868 * domain #0, which we have to flush:
1870 if (cap_caching_mode(iommu->cap)) {
1871 iommu->flush.flush_context(iommu, 0,
1872 (((u16)bus) << 8) | devfn,
1873 DMA_CCMD_MASK_NOBIT,
1874 DMA_CCMD_DEVICE_INVL);
1875 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1877 iommu_flush_write_buffer(iommu);
1883 spin_unlock(&iommu->lock);
1888 struct domain_context_mapping_data {
1889 struct dmar_domain *domain;
1890 struct intel_iommu *iommu;
1891 struct pasid_table *table;
1894 static int domain_context_mapping_cb(struct pci_dev *pdev,
1895 u16 alias, void *opaque)
1897 struct domain_context_mapping_data *data = opaque;
1899 return domain_context_mapping_one(data->domain, data->iommu,
1900 data->table, PCI_BUS_NUM(alias),
1905 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1907 struct device_domain_info *info = dev_iommu_priv_get(dev);
1908 struct domain_context_mapping_data data;
1909 struct intel_iommu *iommu = info->iommu;
1910 u8 bus = info->bus, devfn = info->devfn;
1911 struct pasid_table *table;
1913 table = intel_pasid_get_table(dev);
1915 if (!dev_is_pci(dev))
1916 return domain_context_mapping_one(domain, iommu, table,
1919 data.domain = domain;
1923 return pci_for_each_dma_alias(to_pci_dev(dev),
1924 &domain_context_mapping_cb, &data);
1927 /* Returns a number of VTD pages, but aligned to MM page size */
1928 static unsigned long aligned_nrpages(unsigned long host_addr, size_t size)
1930 host_addr &= ~PAGE_MASK;
1931 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
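/*
 * Example with 4KiB pages: host_addr == 0x1234 and size == 0x2000 leave an
 * offset of 0x234 within the first page, so PAGE_ALIGN(0x2234) == 0x3000
 * and the function returns 3 VT-d pages.
 */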
1934 /* Return largest possible superpage level for a given mapping */
1935 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1936 unsigned long phy_pfn, unsigned long pages)
1938 int support, level = 1;
1939 unsigned long pfnmerge;
1941 support = domain->iommu_superpage;
1943 /* To use a large page, the virtual *and* physical addresses
1944 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1945 of them will mean we have to use smaller pages. So just
1946 merge them and check both at once. */
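/*
 * For instance, with a 9-bit stride, iov_pfn == 0x40200 and
 * phy_pfn == 0x80200 merge to 0xc0200: the low 9 bits are clear, so a
 * 2MiB page is possible if enough pages are being mapped, but the next
 * 9 bits are not clear, so 1GiB pages are ruled out.
 */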
1947 pfnmerge = iov_pfn | phy_pfn;
1949 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1950 pages >>= VTD_STRIDE_SHIFT;
1953 pfnmerge >>= VTD_STRIDE_SHIFT;
1961 * Ensure that old small page tables are removed to make room for superpage(s).
1962 * We're going to add new large pages, so make sure we don't remove their parent
1963 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1965 static void switch_to_super_page(struct dmar_domain *domain,
1966 unsigned long start_pfn,
1967 unsigned long end_pfn, int level)
1969 unsigned long lvl_pages = lvl_to_nr_pages(level);
1970 struct iommu_domain_info *info;
1971 struct dma_pte *pte = NULL;
1974 while (start_pfn <= end_pfn) {
1976 pte = pfn_to_dma_pte(domain, start_pfn, &level,
1979 if (dma_pte_present(pte)) {
1980 dma_pte_free_pagetable(domain, start_pfn,
1981 start_pfn + lvl_pages - 1,
1984 xa_for_each(&domain->iommu_array, i, info)
1985 iommu_flush_iotlb_psi(info->iommu, domain,
1986 start_pfn, lvl_pages,
1991 start_pfn += lvl_pages;
1992 if (first_pte_in_page(pte))
1998 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1999 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2002 struct dma_pte *first_pte = NULL, *pte = NULL;
2003 unsigned int largepage_lvl = 0;
2004 unsigned long lvl_pages = 0;
2008 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2011 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2014 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
2015 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
2019 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2020 attr |= DMA_FL_PTE_PRESENT;
2021 if (domain->use_first_level) {
2022 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2023 if (prot & DMA_PTE_WRITE)
2024 attr |= DMA_FL_PTE_DIRTY;
2027 domain->has_mappings = true;
2029 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2031 while (nr_pages > 0) {
2035 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2036 phys_pfn, nr_pages);
2038 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2044 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2046 /* It is a large page */
2047 if (largepage_lvl > 1) {
2048 unsigned long end_pfn;
2049 unsigned long pages_to_remove;
2051 pteval |= DMA_PTE_LARGE_PAGE;
2052 pages_to_remove = min_t(unsigned long, nr_pages,
2053 nr_pte_to_next_page(pte) * lvl_pages);
2054 end_pfn = iov_pfn + pages_to_remove - 1;
2055 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2057 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2061 /* We don't need lock here, nobody else
2062 * touches the iova range
2064 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2066 static int dumps = 5;
2067 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2068 iov_pfn, tmp, (unsigned long long)pteval);
2071 debug_dma_dump_mappings(NULL);
2076 nr_pages -= lvl_pages;
2077 iov_pfn += lvl_pages;
2078 phys_pfn += lvl_pages;
2079 pteval += lvl_pages * VTD_PAGE_SIZE;
2081 /* If the next PTE would be the first in a new page, then we
2082 * need to flush the cache on the entries we've just written.
2083 * And then we'll need to recalculate 'pte', so clear it and
2084 * let it get set again in the if (!pte) block above.
2086 * If we're done (!nr_pages) we need to flush the cache too.
2088 * Also if we've been setting superpages, we may need to
2089 * recalculate 'pte' and switch back to smaller pages for the
2090 * end of the mapping, if the trailing size is not enough to
2091 * use another superpage (i.e. nr_pages < lvl_pages).
2094 if (!nr_pages || first_pte_in_page(pte) ||
2095 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2096 domain_flush_cache(domain, first_pte,
2097 (void *)pte - (void *)first_pte);
2105 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2107 struct intel_iommu *iommu = info->iommu;
2108 struct context_entry *context;
2114 spin_lock(&iommu->lock);
2115 context = iommu_context_addr(iommu, bus, devfn, 0);
2117 spin_unlock(&iommu->lock);
2121 if (sm_supported(iommu)) {
2122 if (hw_pass_through && domain_type_is_si(info->domain))
2123 did_old = FLPT_DEFAULT_DID;
2125 did_old = domain_id_iommu(info->domain, iommu);
2127 did_old = context_domain_id(context);
2130 context_clear_entry(context);
2131 __iommu_flush_cache(iommu, context, sizeof(*context));
2132 spin_unlock(&iommu->lock);
2133 iommu->flush.flush_context(iommu,
2135 (((u16)bus) << 8) | devfn,
2136 DMA_CCMD_MASK_NOBIT,
2137 DMA_CCMD_DEVICE_INVL);
2139 if (sm_supported(iommu))
2140 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2142 iommu->flush.flush_iotlb(iommu,
2148 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2151 static int domain_setup_first_level(struct intel_iommu *iommu,
2152 struct dmar_domain *domain,
2156 struct dma_pte *pgd = domain->pgd;
2161 * Skip top levels of page tables for an iommu which has
2162 * a smaller agaw than the default. Unnecessary for PT mode.
2164 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2165 pgd = phys_to_virt(dma_pte_addr(pgd));
2166 if (!dma_pte_present(pgd))
2170 level = agaw_to_level(agaw);
2171 if (level != 4 && level != 5)
2175 flags |= PASID_FLAG_FL5LP;
2177 if (domain->force_snooping)
2178 flags |= PASID_FLAG_PAGE_SNOOP;
2180 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2181 domain_id_iommu(domain, iommu),
2185 static bool dev_is_real_dma_subdevice(struct device *dev)
2187 return dev && dev_is_pci(dev) &&
2188 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2191 static int iommu_domain_identity_map(struct dmar_domain *domain,
2192 unsigned long first_vpfn,
2193 unsigned long last_vpfn)
2196 * RMRR range might have overlap with physical memory range,
2199 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2201 return __domain_mapping(domain, first_vpfn,
2202 first_vpfn, last_vpfn - first_vpfn + 1,
2203 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
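/*
 * Note that first_vpfn is passed as both the IOVA pfn and the physical
 * pfn, which is what makes this a 1:1 (identity) mapping of the range.
 */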
2206 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2208 static int __init si_domain_init(int hw)
2210 struct dmar_rmrr_unit *rmrr;
2214 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2218 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2219 domain_exit(si_domain);
2227 for_each_online_node(nid) {
2228 unsigned long start_pfn, end_pfn;
2231 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2232 ret = iommu_domain_identity_map(si_domain,
2233 mm_to_dma_pfn_start(start_pfn),
2234 mm_to_dma_pfn_end(end_pfn));
2241 * Identity map the RMRRs so that devices with RMRRs could also use
2244 for_each_rmrr_units(rmrr) {
2245 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2247 unsigned long long start = rmrr->base_address;
2248 unsigned long long end = rmrr->end_address;
2250 if (WARN_ON(end < start ||
2251 end >> agaw_to_width(si_domain->agaw)))
2254 ret = iommu_domain_identity_map(si_domain,
2255 mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2256 mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2265 static int dmar_domain_attach_device(struct dmar_domain *domain,
2268 struct device_domain_info *info = dev_iommu_priv_get(dev);
2269 struct intel_iommu *iommu = info->iommu;
2270 unsigned long flags;
2273 ret = domain_attach_iommu(domain, iommu);
2276 info->domain = domain;
2277 spin_lock_irqsave(&domain->lock, flags);
2278 list_add(&info->link, &domain->devices);
2279 spin_unlock_irqrestore(&domain->lock, flags);
2281 /* PASID table is mandatory for a PCI device in scalable mode. */
2282 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2283 /* Setup the PASID entry for requests without PASID: */
2284 if (hw_pass_through && domain_type_is_si(domain))
2285 ret = intel_pasid_setup_pass_through(iommu,
2286 dev, IOMMU_NO_PASID);
2287 else if (domain->use_first_level)
2288 ret = domain_setup_first_level(iommu, domain, dev,
2291 ret = intel_pasid_setup_second_level(iommu, domain,
2292 dev, IOMMU_NO_PASID);
2294 dev_err(dev, "Setup RID2PASID failed\n");
2295 device_block_translation(dev);
2300 ret = domain_context_mapping(domain, dev);
2302 dev_err(dev, "Domain context map failed\n");
2303 device_block_translation(dev);
2307 if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2308 iommu_enable_pci_caps(info);
2314 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2315 * is relaxable (i.e. is allowed not to be enforced under some conditions)
2316 * @dev: device handle
2318 * We assume that PCI USB devices with RMRRs have them largely
2319 * for historical reasons and that the RMRR space is not actively used post
2320 * boot. This exclusion may change if vendors begin to abuse it.
2322 * The same exception is made for graphics devices, with the requirement that
2323 * any use of the RMRR regions will be torn down before assigning the device
2326 * Return: true if the RMRR is relaxable, false otherwise
2328 static bool device_rmrr_is_relaxable(struct device *dev)
2330 struct pci_dev *pdev;
2332 if (!dev_is_pci(dev))
2335 pdev = to_pci_dev(dev);
2336 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2343 * Return the required default domain type for a specific device.
2345 * @dev: the device in query
2346 * @startup: true if this is during early boot
2349 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2350 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2351 * - 0: both identity and dynamic domains work for this device
2353 static int device_def_domain_type(struct device *dev)
2355 if (dev_is_pci(dev)) {
2356 struct pci_dev *pdev = to_pci_dev(dev);
2358 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2359 return IOMMU_DOMAIN_IDENTITY;
2361 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2362 return IOMMU_DOMAIN_IDENTITY;
2368 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2371 * Start from a sane IOMMU hardware state.
2372 * If queued invalidation was already initialized by us
2373 * (for example, while enabling interrupt remapping), then
2374 * things are already rolling from a sane state.
2378 * Clear any previous faults.
2380 dmar_fault(-1, iommu);
2382 * Disable queued invalidation if supported and already enabled
2383 * before OS handover.
2385 dmar_disable_qi(iommu);
2388 if (dmar_enable_qi(iommu)) {
2390 * Queued Invalidate not enabled, use Register Based Invalidate
2392 iommu->flush.flush_context = __iommu_flush_context;
2393 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2394 pr_info("%s: Using Register based invalidation\n",
2397 iommu->flush.flush_context = qi_flush_context;
2398 iommu->flush.flush_iotlb = qi_flush_iotlb;
2399 pr_info("%s: Using Queued invalidation\n", iommu->name);
2403 static int copy_context_table(struct intel_iommu *iommu,
2404 struct root_entry *old_re,
2405 struct context_entry **tbl,
2408 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2409 struct context_entry *new_ce = NULL, ce;
2410 struct context_entry *old_ce = NULL;
2411 struct root_entry re;
2412 phys_addr_t old_ce_phys;
2414 tbl_idx = ext ? bus * 2 : bus;
2415 memcpy(&re, old_re, sizeof(re));
2417 for (devfn = 0; devfn < 256; devfn++) {
2418 /* First calculate the correct index */
2419 idx = (ext ? devfn * 2 : devfn) % 256;
2422 /* First save what we may have and clean up */
2424 tbl[tbl_idx] = new_ce;
2425 __iommu_flush_cache(iommu, new_ce,
2435 old_ce_phys = root_entry_lctp(&re);
2437 old_ce_phys = root_entry_uctp(&re);
2440 if (ext && devfn == 0) {
2441 /* No LCTP, try UCTP */
2450 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2455 new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2462 /* Now copy the context entry */
2463 memcpy(&ce, old_ce + idx, sizeof(ce));
2465 if (!context_present(&ce))
2468 did = context_domain_id(&ce);
2469 if (did >= 0 && did < cap_ndoms(iommu->cap))
2470 set_bit(did, iommu->domain_ids);
2472 set_context_copied(iommu, bus, devfn);
2476 tbl[tbl_idx + pos] = new_ce;
2478 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2487 static int copy_translation_tables(struct intel_iommu *iommu)
2489 struct context_entry **ctxt_tbls;
2490 struct root_entry *old_rt;
2491 phys_addr_t old_rt_phys;
2492 int ctxt_table_entries;
2497 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2498 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2499 new_ext = !!sm_supported(iommu);
2502 * The RTT bit can only be changed when translation is disabled,
2503 * but disabling translation opens a window for data corruption.
2504 * So bail out and don't copy anything if we would have to change
2505 * the bit.
2510 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2511 if (!iommu->copied_tables)
2514 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2518 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2522 /* This is too big for the stack - allocate it from slab */
2523 ctxt_table_entries = ext ? 512 : 256;
2525 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2529 for (bus = 0; bus < 256; bus++) {
2530 ret = copy_context_table(iommu, &old_rt[bus],
2531 ctxt_tbls, bus, ext);
2533 pr_err("%s: Failed to copy context table for bus %d\n",
2539 spin_lock(&iommu->lock);
2541 /* Context tables are copied, now write them to the root_entry table */
2542 for (bus = 0; bus < 256; bus++) {
2543 int idx = ext ? bus * 2 : bus;
2546 if (ctxt_tbls[idx]) {
2547 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2548 iommu->root_entry[bus].lo = val;
2551 if (!ext || !ctxt_tbls[idx + 1])
2554 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2555 iommu->root_entry[bus].hi = val;
2558 spin_unlock(&iommu->lock);
2562 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
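/*
 * Illustrative sketch only (hypothetical helper): the index arithmetic used
 * by copy_context_table() above.  In extended (scalable) mode each bus owns
 * two context-table pages, so the table slot is bus * 2 and the per-devfn
 * entry index wraps at 256.
 */
static inline void __maybe_unused
example_ctxt_index(bool ext, int bus, int devfn, int *tbl_idx, int *idx)
{
	*tbl_idx = ext ? bus * 2 : bus;		/* which copied table slot */
	*idx = (ext ? devfn * 2 : devfn) % 256;	/* entry within that page  */
}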
2572 static int __init init_dmars(void)
2574 struct dmar_drhd_unit *drhd;
2575 struct intel_iommu *iommu;
2578 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2582 for_each_iommu(iommu, drhd) {
2583 if (drhd->ignored) {
2584 iommu_disable_translation(iommu);
2589 * Find the max PASID size of all IOMMUs in the system.
2590 * We need to ensure the system PASID table is no bigger
2591 * than the smallest supported size.
2593 if (pasid_supported(iommu)) {
2594 u32 temp = 2 << ecap_pss(iommu->ecap);
2596 intel_pasid_max_id = min_t(u32, temp,
2597 intel_pasid_max_id);
2600 intel_iommu_init_qi(iommu);
2602 ret = iommu_init_domains(iommu);
2606 init_translation_status(iommu);
2608 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2609 iommu_disable_translation(iommu);
2610 clear_translation_pre_enabled(iommu);
2611 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2617 * we could share the same root & context tables
2618 * among all IOMMUs. Need to split them later.
2620 ret = iommu_alloc_root_entry(iommu);
2624 if (translation_pre_enabled(iommu)) {
2625 pr_info("Translation already enabled - trying to copy translation structures\n");
2627 ret = copy_translation_tables(iommu);
2630 * We found the IOMMU with translation
2631 * enabled - but failed to copy over the
2632 * old root-entry table. Try to proceed
2633 * by disabling translation now and
2634 * allocating a clean root-entry table.
2635 * This might cause DMAR faults, but
2636 * probably the dump will still succeed.
2638 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2640 iommu_disable_translation(iommu);
2641 clear_translation_pre_enabled(iommu);
2643 pr_info("Copied translation tables from previous kernel for %s\n",
2648 if (!ecap_pass_through(iommu->ecap))
2649 hw_pass_through = 0;
2650 intel_svm_check(iommu);
2654 * Now that qi is enabled on all iommus, set the root entry and flush
2655 * caches. This is required on some Intel X58 chipsets, otherwise the
2656 * flush_context function will loop forever and the boot hangs.
2658 for_each_active_iommu(iommu, drhd) {
2659 iommu_flush_write_buffer(iommu);
2660 iommu_set_root_entry(iommu);
2663 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2668 iommu_identity_mapping |= IDENTMAP_GFX;
2670 check_tylersburg_isoch();
2672 ret = si_domain_init(hw_pass_through);
2679 * global invalidate context cache
2680 * global invalidate iotlb
2681 * enable translation
2683 for_each_iommu(iommu, drhd) {
2684 if (drhd->ignored) {
2686 * we always have to disable PMRs or DMA may fail on this device.
2690 iommu_disable_protect_mem_regions(iommu);
2694 iommu_flush_write_buffer(iommu);
2696 #ifdef CONFIG_INTEL_IOMMU_SVM
2697 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2699 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2700 * could cause a lock race, so temporarily drop the lock here.
2702 up_write(&dmar_global_lock);
2703 ret = intel_svm_enable_prq(iommu);
2704 down_write(&dmar_global_lock);
2709 ret = dmar_set_interrupt(iommu);
2717 for_each_active_iommu(iommu, drhd) {
2718 disable_dmar_iommu(iommu);
2719 free_dmar_iommu(iommu);
2722 domain_exit(si_domain);
2729 static void __init init_no_remapping_devices(void)
2731 struct dmar_drhd_unit *drhd;
2735 for_each_drhd_unit(drhd) {
2736 if (!drhd->include_all) {
2737 for_each_active_dev_scope(drhd->devices,
2738 drhd->devices_cnt, i, dev)
2740 /* ignore DMAR unit if no devices exist */
2741 if (i == drhd->devices_cnt)
2746 for_each_active_drhd_unit(drhd) {
2747 if (drhd->include_all)
2750 for_each_active_dev_scope(drhd->devices,
2751 drhd->devices_cnt, i, dev)
2752 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2754 if (i < drhd->devices_cnt)
2757 /* This IOMMU has *only* gfx devices. Either bypass it or
2758 mark it as dedicated to graphics (gfx_dedicated), as appropriate. */
2759 drhd->gfx_dedicated = 1;
2765 #ifdef CONFIG_SUSPEND
2766 static int init_iommu_hw(void)
2768 struct dmar_drhd_unit *drhd;
2769 struct intel_iommu *iommu = NULL;
2772 for_each_active_iommu(iommu, drhd) {
2774 ret = dmar_reenable_qi(iommu);
2780 for_each_iommu(iommu, drhd) {
2781 if (drhd->ignored) {
2783 * we always have to disable PMRs or DMA may fail on this device.
2787 iommu_disable_protect_mem_regions(iommu);
2791 iommu_flush_write_buffer(iommu);
2792 iommu_set_root_entry(iommu);
2793 iommu_enable_translation(iommu);
2794 iommu_disable_protect_mem_regions(iommu);
2800 static void iommu_flush_all(void)
2802 struct dmar_drhd_unit *drhd;
2803 struct intel_iommu *iommu;
2805 for_each_active_iommu(iommu, drhd) {
2806 iommu->flush.flush_context(iommu, 0, 0, 0,
2807 DMA_CCMD_GLOBAL_INVL);
2808 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2809 DMA_TLB_GLOBAL_FLUSH);
2813 static int iommu_suspend(void)
2815 struct dmar_drhd_unit *drhd;
2816 struct intel_iommu *iommu = NULL;
2821 for_each_active_iommu(iommu, drhd) {
2822 iommu_disable_translation(iommu);
2824 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2826 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2827 readl(iommu->reg + DMAR_FECTL_REG);
2828 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2829 readl(iommu->reg + DMAR_FEDATA_REG);
2830 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2831 readl(iommu->reg + DMAR_FEADDR_REG);
2832 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2833 readl(iommu->reg + DMAR_FEUADDR_REG);
2835 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2840 static void iommu_resume(void)
2842 struct dmar_drhd_unit *drhd;
2843 struct intel_iommu *iommu = NULL;
2846 if (init_iommu_hw()) {
2848 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2850 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2854 for_each_active_iommu(iommu, drhd) {
2856 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2858 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2859 iommu->reg + DMAR_FECTL_REG);
2860 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2861 iommu->reg + DMAR_FEDATA_REG);
2862 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2863 iommu->reg + DMAR_FEADDR_REG);
2864 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2865 iommu->reg + DMAR_FEUADDR_REG);
2867 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2871 static struct syscore_ops iommu_syscore_ops = {
2872 .resume = iommu_resume,
2873 .suspend = iommu_suspend,
2876 static void __init init_iommu_pm_ops(void)
2878 register_syscore_ops(&iommu_syscore_ops);
2882 static inline void init_iommu_pm_ops(void) {}
2883 #endif /* CONFIG_PM */
2885 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2887 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2888 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2889 rmrr->end_address <= rmrr->base_address ||
2890 arch_rmrr_sanity_check(rmrr))
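/*
 * Illustrative sketch only (hypothetical helper): the alignment and ordering
 * rules enforced by rmrr_sanity_check() above (arch-specific checks aside).
 * Both the base and the byte following the inclusive end must fall on page
 * boundaries, and the end must lie above the base; e.g.
 * [0x7f000000, 0x7f7fffff] passes while [0x7f000800, 0x7f7fffff] does not.
 */
static inline bool __maybe_unused example_rmrr_range_ok(u64 base, u64 end)
{
	return IS_ALIGNED(base, PAGE_SIZE) &&
	       IS_ALIGNED(end + 1, PAGE_SIZE) &&
	       end > base;
}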
2896 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2898 struct acpi_dmar_reserved_memory *rmrr;
2899 struct dmar_rmrr_unit *rmrru;
2901 rmrr = (struct acpi_dmar_reserved_memory *)header;
2902 if (rmrr_sanity_check(rmrr)) {
2904 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2905 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2906 rmrr->base_address, rmrr->end_address,
2907 dmi_get_system_info(DMI_BIOS_VENDOR),
2908 dmi_get_system_info(DMI_BIOS_VERSION),
2909 dmi_get_system_info(DMI_PRODUCT_VERSION));
2910 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2913 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2917 rmrru->hdr = header;
2919 rmrru->base_address = rmrr->base_address;
2920 rmrru->end_address = rmrr->end_address;
2922 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2923 ((void *)rmrr) + rmrr->header.length,
2924 &rmrru->devices_cnt);
2925 if (rmrru->devices_cnt && rmrru->devices == NULL)
2928 list_add(&rmrru->list, &dmar_rmrr_units);
2937 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2939 struct dmar_atsr_unit *atsru;
2940 struct acpi_dmar_atsr *tmp;
2942 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2944 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2945 if (atsr->segment != tmp->segment)
2947 if (atsr->header.length != tmp->header.length)
2949 if (memcmp(atsr, tmp, atsr->header.length) == 0)
2956 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2958 struct acpi_dmar_atsr *atsr;
2959 struct dmar_atsr_unit *atsru;
2961 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2964 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2965 atsru = dmar_find_atsr(atsr);
2969 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2974 * If memory is allocated from slab by ACPI _DSM method, we need to
2975 * copy the memory content because the memory buffer will be freed on exit.
2978 atsru->hdr = (void *)(atsru + 1);
2979 memcpy(atsru->hdr, hdr, hdr->length);
2980 atsru->include_all = atsr->flags & 0x1;
2981 if (!atsru->include_all) {
2982 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2983 (void *)atsr + atsr->header.length,
2984 &atsru->devices_cnt);
2985 if (atsru->devices_cnt && atsru->devices == NULL) {
2991 list_add_rcu(&atsru->list, &dmar_atsr_units);
2996 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2998 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3002 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3004 struct acpi_dmar_atsr *atsr;
3005 struct dmar_atsr_unit *atsru;
3007 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3008 atsru = dmar_find_atsr(atsr);
3010 list_del_rcu(&atsru->list);
3012 intel_iommu_free_atsr(atsru);
3018 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3022 struct acpi_dmar_atsr *atsr;
3023 struct dmar_atsr_unit *atsru;
3025 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3026 atsru = dmar_find_atsr(atsr);
3030 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3031 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3039 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3041 struct dmar_satc_unit *satcu;
3042 struct acpi_dmar_satc *tmp;
3044 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3046 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3047 if (satc->segment != tmp->segment)
3049 if (satc->header.length != tmp->header.length)
3051 if (memcmp(satc, tmp, satc->header.length) == 0)
3058 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3060 struct acpi_dmar_satc *satc;
3061 struct dmar_satc_unit *satcu;
3063 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3066 satc = container_of(hdr, struct acpi_dmar_satc, header);
3067 satcu = dmar_find_satc(satc);
3071 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3075 satcu->hdr = (void *)(satcu + 1);
3076 memcpy(satcu->hdr, hdr, hdr->length);
3077 satcu->atc_required = satc->flags & 0x1;
3078 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3079 (void *)satc + satc->header.length,
3080 &satcu->devices_cnt);
3081 if (satcu->devices_cnt && !satcu->devices) {
3085 list_add_rcu(&satcu->list, &dmar_satc_units);
3090 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3093 struct intel_iommu *iommu = dmaru->iommu;
3095 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3099 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3100 pr_warn("%s: Doesn't support hardware pass through.\n",
3105 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3106 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3107 pr_warn("%s: Doesn't support large page.\n",
3113 * Disable translation if already enabled prior to OS handover.
3115 if (iommu->gcmd & DMA_GCMD_TE)
3116 iommu_disable_translation(iommu);
3118 ret = iommu_init_domains(iommu);
3120 ret = iommu_alloc_root_entry(iommu);
3124 intel_svm_check(iommu);
3126 if (dmaru->ignored) {
3128 * we always have to disable PMRs or DMA may fail on this device
3131 iommu_disable_protect_mem_regions(iommu);
3135 intel_iommu_init_qi(iommu);
3136 iommu_flush_write_buffer(iommu);
3138 #ifdef CONFIG_INTEL_IOMMU_SVM
3139 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3140 ret = intel_svm_enable_prq(iommu);
3145 ret = dmar_set_interrupt(iommu);
3149 iommu_set_root_entry(iommu);
3150 iommu_enable_translation(iommu);
3152 iommu_disable_protect_mem_regions(iommu);
3156 disable_dmar_iommu(iommu);
3158 free_dmar_iommu(iommu);
3162 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3165 struct intel_iommu *iommu = dmaru->iommu;
3167 if (!intel_iommu_enabled)
3173 ret = intel_iommu_add(dmaru);
3175 disable_dmar_iommu(iommu);
3176 free_dmar_iommu(iommu);
3182 static void intel_iommu_free_dmars(void)
3184 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3185 struct dmar_atsr_unit *atsru, *atsr_n;
3186 struct dmar_satc_unit *satcu, *satc_n;
3188 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3189 list_del(&rmrru->list);
3190 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3194 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3195 list_del(&atsru->list);
3196 intel_iommu_free_atsr(atsru);
3198 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3199 list_del(&satcu->list);
3200 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3205 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3207 struct dmar_satc_unit *satcu;
3208 struct acpi_dmar_satc *satc;
3212 dev = pci_physfn(dev);
3215 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3216 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3217 if (satc->segment != pci_domain_nr(dev->bus))
3219 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3220 if (to_pci_dev(tmp) == dev)
3229 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3232 struct pci_bus *bus;
3233 struct pci_dev *bridge = NULL;
3235 struct acpi_dmar_atsr *atsr;
3236 struct dmar_atsr_unit *atsru;
3237 struct dmar_satc_unit *satcu;
3239 dev = pci_physfn(dev);
3240 satcu = dmar_find_matched_satc_unit(dev);
3243 * This device supports ATS, as it is listed in a SATC table.
3244 * When the IOMMU is in legacy mode, enabling ATS is done
3245 * automatically by HW for devices that require ATS, hence
3246 * the OS should not enable ATS for this device, to avoid
3247 * duplicated TLB invalidations.
3249 return !(satcu->atc_required && !sm_supported(iommu));
3251 for (bus = dev->bus; bus; bus = bus->parent) {
3253 /* If it's an integrated device, allow ATS */
3256 /* Connected via non-PCIe: no ATS */
3257 if (!pci_is_pcie(bridge) ||
3258 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3260 /* If we found the root port, look it up in the ATSR */
3261 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3266 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3267 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3268 if (atsr->segment != pci_domain_nr(dev->bus))
3271 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3272 if (tmp == &bridge->dev)
3275 if (atsru->include_all)
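/*
 * Illustrative sketch only (hypothetical helper): the SATC decision taken at
 * the top of dmar_ats_supported() above.  The OS may enable ATS unless the
 * SATC table marks ATS as required *and* the IOMMU runs in legacy
 * (non-scalable) mode, in which case hardware enables it on its own.
 */
static inline bool __maybe_unused
example_satc_ats_allowed(bool atc_required, bool scalable_mode)
{
	return !(atc_required && !scalable_mode);
}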
3285 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3288 struct dmar_rmrr_unit *rmrru;
3289 struct dmar_atsr_unit *atsru;
3290 struct dmar_satc_unit *satcu;
3291 struct acpi_dmar_atsr *atsr;
3292 struct acpi_dmar_reserved_memory *rmrr;
3293 struct acpi_dmar_satc *satc;
3295 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3298 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3299 rmrr = container_of(rmrru->hdr,
3300 struct acpi_dmar_reserved_memory, header);
3301 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3302 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3303 ((void *)rmrr) + rmrr->header.length,
3304 rmrr->segment, rmrru->devices,
3305 rmrru->devices_cnt);
3308 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3309 dmar_remove_dev_scope(info, rmrr->segment,
3310 rmrru->devices, rmrru->devices_cnt);
3314 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3315 if (atsru->include_all)
3318 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3319 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3320 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3321 (void *)atsr + atsr->header.length,
3322 atsr->segment, atsru->devices,
3323 atsru->devices_cnt);
3328 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3329 if (dmar_remove_dev_scope(info, atsr->segment,
3330 atsru->devices, atsru->devices_cnt))
3334 list_for_each_entry(satcu, &dmar_satc_units, list) {
3335 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3336 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3337 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3338 (void *)satc + satc->header.length,
3339 satc->segment, satcu->devices,
3340 satcu->devices_cnt);
3345 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3346 if (dmar_remove_dev_scope(info, satc->segment,
3347 satcu->devices, satcu->devices_cnt))
3355 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3356 unsigned long val, void *v)
3358 struct memory_notify *mhp = v;
3359 unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3360 unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3364 case MEM_GOING_ONLINE:
3365 if (iommu_domain_identity_map(si_domain,
3366 start_vpfn, last_vpfn)) {
3367 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3368 start_vpfn, last_vpfn);
3374 case MEM_CANCEL_ONLINE:
3376 struct dmar_drhd_unit *drhd;
3377 struct intel_iommu *iommu;
3378 LIST_HEAD(freelist);
3380 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3383 for_each_active_iommu(iommu, drhd)
3384 iommu_flush_iotlb_psi(iommu, si_domain,
3385 start_vpfn, mhp->nr_pages,
3386 list_empty(&freelist), 0);
3388 put_pages_list(&freelist);
3396 static struct notifier_block intel_iommu_memory_nb = {
3397 .notifier_call = intel_iommu_memory_notifier,
3401 static void intel_disable_iommus(void)
3403 struct intel_iommu *iommu = NULL;
3404 struct dmar_drhd_unit *drhd;
3406 for_each_iommu(iommu, drhd)
3407 iommu_disable_translation(iommu);
3410 void intel_iommu_shutdown(void)
3412 struct dmar_drhd_unit *drhd;
3413 struct intel_iommu *iommu = NULL;
3415 if (no_iommu || dmar_disabled)
3418 down_write(&dmar_global_lock);
3420 /* Disable PMRs explicitly here. */
3421 for_each_iommu(iommu, drhd)
3422 iommu_disable_protect_mem_regions(iommu);
3424 /* Make sure the IOMMUs are switched off */
3425 intel_disable_iommus();
3427 up_write(&dmar_global_lock);
3430 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3432 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3434 return container_of(iommu_dev, struct intel_iommu, iommu);
3437 static ssize_t version_show(struct device *dev,
3438 struct device_attribute *attr, char *buf)
3440 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3441 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3442 return sysfs_emit(buf, "%d:%d\n",
3443 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3445 static DEVICE_ATTR_RO(version);
3447 static ssize_t address_show(struct device *dev,
3448 struct device_attribute *attr, char *buf)
3450 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3451 return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3453 static DEVICE_ATTR_RO(address);
3455 static ssize_t cap_show(struct device *dev,
3456 struct device_attribute *attr, char *buf)
3458 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3459 return sysfs_emit(buf, "%llx\n", iommu->cap);
3461 static DEVICE_ATTR_RO(cap);
3463 static ssize_t ecap_show(struct device *dev,
3464 struct device_attribute *attr, char *buf)
3466 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3467 return sysfs_emit(buf, "%llx\n", iommu->ecap);
3469 static DEVICE_ATTR_RO(ecap);
3471 static ssize_t domains_supported_show(struct device *dev,
3472 struct device_attribute *attr, char *buf)
3474 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3475 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3477 static DEVICE_ATTR_RO(domains_supported);
3479 static ssize_t domains_used_show(struct device *dev,
3480 struct device_attribute *attr, char *buf)
3482 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3483 return sysfs_emit(buf, "%d\n",
3484 bitmap_weight(iommu->domain_ids,
3485 cap_ndoms(iommu->cap)));
3487 static DEVICE_ATTR_RO(domains_used);
3489 static struct attribute *intel_iommu_attrs[] = {
3490 &dev_attr_version.attr,
3491 &dev_attr_address.attr,
3493 &dev_attr_ecap.attr,
3494 &dev_attr_domains_supported.attr,
3495 &dev_attr_domains_used.attr,
3499 static struct attribute_group intel_iommu_group = {
3500 .name = "intel-iommu",
3501 .attrs = intel_iommu_attrs,
3504 const struct attribute_group *intel_iommu_groups[] = {
3509 static bool has_external_pci(void)
3511 struct pci_dev *pdev = NULL;
3513 for_each_pci_dev(pdev)
3514 if (pdev->external_facing) {
3522 static int __init platform_optin_force_iommu(void)
3524 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3527 if (no_iommu || dmar_disabled)
3528 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3531 * If Intel-IOMMU is disabled by default, we will apply identity
3532 * map for all devices except those marked as being untrusted.
3535 iommu_set_default_passthrough(false);
3543 static int __init probe_acpi_namespace_devices(void)
3545 struct dmar_drhd_unit *drhd;
3546 /* To avoid a -Wunused-but-set-variable warning. */
3547 struct intel_iommu *iommu __maybe_unused;
3551 for_each_active_iommu(iommu, drhd) {
3552 for_each_active_dev_scope(drhd->devices,
3553 drhd->devices_cnt, i, dev) {
3554 struct acpi_device_physical_node *pn;
3555 struct acpi_device *adev;
3557 if (dev->bus != &acpi_bus_type)
3560 adev = to_acpi_device(dev);
3561 mutex_lock(&adev->physical_node_lock);
3562 list_for_each_entry(pn,
3563 &adev->physical_node_list, node) {
3564 ret = iommu_probe_device(pn->dev);
3568 mutex_unlock(&adev->physical_node_lock);
3578 static __init int tboot_force_iommu(void)
3580 if (!tboot_enabled())
3583 if (no_iommu || dmar_disabled)
3584 pr_warn("Forcing Intel-IOMMU to enabled\n");
3592 int __init intel_iommu_init(void)
3595 struct dmar_drhd_unit *drhd;
3596 struct intel_iommu *iommu;
3599 * Intel IOMMU is required for a TXT/tboot launch or platform
3600 * opt in, so enforce that.
3602 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3603 platform_optin_force_iommu();
3605 down_write(&dmar_global_lock);
3606 if (dmar_table_init()) {
3608 panic("tboot: Failed to initialize DMAR table\n");
3612 if (dmar_dev_scope_init() < 0) {
3614 panic("tboot: Failed to initialize DMAR device scope\n");
3618 up_write(&dmar_global_lock);
3621 * The bus notifier takes the dmar_global_lock, so lockdep will
3622 * complain later when we register it under the lock.
3624 dmar_register_bus_notifier();
3626 down_write(&dmar_global_lock);
3629 intel_iommu_debugfs_init();
3631 if (no_iommu || dmar_disabled) {
3633 * We exit the function here to ensure the IOMMU's remapping and
3634 * mempool aren't set up, which means that the IOMMU's PMRs
3635 * won't be disabled via the call to init_dmars(). So disable
3636 * them explicitly here. The PMRs were set up by tboot prior to
3637 * calling SENTER, but the kernel is expected to reset/tear them down.
3640 if (intel_iommu_tboot_noforce) {
3641 for_each_iommu(iommu, drhd)
3642 iommu_disable_protect_mem_regions(iommu);
3646 * Make sure the IOMMUs are switched off, even when we
3647 * boot into a kexec kernel and the previous kernel left
3650 intel_disable_iommus();
3654 if (list_empty(&dmar_rmrr_units))
3655 pr_info("No RMRR found\n");
3657 if (list_empty(&dmar_atsr_units))
3658 pr_info("No ATSR found\n");
3660 if (list_empty(&dmar_satc_units))
3661 pr_info("No SATC found\n");
3663 init_no_remapping_devices();
3668 panic("tboot: Failed to initialize DMARs\n");
3669 pr_err("Initialization failed\n");
3672 up_write(&dmar_global_lock);
3674 init_iommu_pm_ops();
3676 down_read(&dmar_global_lock);
3677 for_each_active_iommu(iommu, drhd) {
3679 * The flush queue implementation does not perform
3680 * page-selective invalidations that are required for efficient
3681 * TLB flushes in virtual environments. The benefit of batching
3682 * is likely to be much lower than the overhead of synchronizing
3683 * the virtual and physical IOMMU page-tables.
3685 if (cap_caching_mode(iommu->cap) &&
3686 !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3687 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3688 iommu_set_dma_strict();
3690 iommu_device_sysfs_add(&iommu->iommu, NULL,
3693 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3695 iommu_pmu_register(iommu);
3697 up_read(&dmar_global_lock);
3699 if (si_domain && !hw_pass_through)
3700 register_memory_notifier(&intel_iommu_memory_nb);
3702 down_read(&dmar_global_lock);
3703 if (probe_acpi_namespace_devices())
3704 pr_warn("ACPI name space devices didn't probe correctly\n");
3706 /* Finally, we enable the DMA remapping hardware. */
3707 for_each_iommu(iommu, drhd) {
3708 if (!drhd->ignored && !translation_pre_enabled(iommu))
3709 iommu_enable_translation(iommu);
3711 iommu_disable_protect_mem_regions(iommu);
3713 up_read(&dmar_global_lock);
3715 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3717 intel_iommu_enabled = 1;
3722 intel_iommu_free_dmars();
3723 up_write(&dmar_global_lock);
3727 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3729 struct device_domain_info *info = opaque;
3731 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3736 * NB - intel-iommu lacks any sort of reference counting for the users of
3737 * dependent devices. If multiple endpoints have intersecting dependent
3738 * devices, unbinding the driver from any one of them will possibly leave
3739 * the others unable to operate.
3741 static void domain_context_clear(struct device_domain_info *info)
3743 if (!dev_is_pci(info->dev))
3744 domain_context_clear_one(info, info->bus, info->devfn);
3746 pci_for_each_dma_alias(to_pci_dev(info->dev),
3747 &domain_context_clear_one_cb, info);
3750 static void dmar_remove_one_dev_info(struct device *dev)
3752 struct device_domain_info *info = dev_iommu_priv_get(dev);
3753 struct dmar_domain *domain = info->domain;
3754 struct intel_iommu *iommu = info->iommu;
3755 unsigned long flags;
3757 if (!dev_is_real_dma_subdevice(info->dev)) {
3758 if (dev_is_pci(info->dev) && sm_supported(iommu))
3759 intel_pasid_tear_down_entry(iommu, info->dev,
3760 IOMMU_NO_PASID, false);
3762 iommu_disable_pci_caps(info);
3763 domain_context_clear(info);
3766 spin_lock_irqsave(&domain->lock, flags);
3767 list_del(&info->link);
3768 spin_unlock_irqrestore(&domain->lock, flags);
3770 domain_detach_iommu(domain, iommu);
3771 info->domain = NULL;
3775 * Clear the page table pointer in context or pasid table entries so that
3776 * all DMA requests without PASID from the device are blocked. If the page
3777 * table has been set, clean up the data structures.
3779 void device_block_translation(struct device *dev)
3781 struct device_domain_info *info = dev_iommu_priv_get(dev);
3782 struct intel_iommu *iommu = info->iommu;
3783 unsigned long flags;
3785 iommu_disable_pci_caps(info);
3786 if (!dev_is_real_dma_subdevice(dev)) {
3787 if (sm_supported(iommu))
3788 intel_pasid_tear_down_entry(iommu, dev,
3789 IOMMU_NO_PASID, false);
3791 domain_context_clear(info);
3797 spin_lock_irqsave(&info->domain->lock, flags);
3798 list_del(&info->link);
3799 spin_unlock_irqrestore(&info->domain->lock, flags);
3801 domain_detach_iommu(info->domain, iommu);
3802 info->domain = NULL;
3805 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3809 /* calculate AGAW */
3810 domain->gaw = guest_width;
3811 adjust_width = guestwidth_to_adjustwidth(guest_width);
3812 domain->agaw = width_to_agaw(adjust_width);
3814 domain->iommu_coherency = false;
3815 domain->iommu_superpage = 0;
3816 domain->max_addr = 0;
3818 /* always allocate the top pgd */
3819 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
3822 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3826 static int blocking_domain_attach_dev(struct iommu_domain *domain,
3829 device_block_translation(dev);
3833 static struct iommu_domain blocking_domain = {
3834 .type = IOMMU_DOMAIN_BLOCKED,
3835 .ops = &(const struct iommu_domain_ops) {
3836 .attach_dev = blocking_domain_attach_dev,
3840 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3842 struct dmar_domain *dmar_domain;
3843 struct iommu_domain *domain;
3846 case IOMMU_DOMAIN_DMA:
3847 case IOMMU_DOMAIN_UNMANAGED:
3848 dmar_domain = alloc_domain(type);
3850 pr_err("Can't allocate dmar_domain\n");
3853 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3854 pr_err("Domain initialization failed\n");
3855 domain_exit(dmar_domain);
3859 domain = &dmar_domain->domain;
3860 domain->geometry.aperture_start = 0;
3861 domain->geometry.aperture_end =
3862 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3863 domain->geometry.force_aperture = true;
3866 case IOMMU_DOMAIN_IDENTITY:
3867 return &si_domain->domain;
3868 case IOMMU_DOMAIN_SVA:
3869 return intel_svm_domain_alloc();
3877 static struct iommu_domain *
3878 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3879 struct iommu_domain *parent,
3880 const struct iommu_user_data *user_data)
3882 struct device_domain_info *info = dev_iommu_priv_get(dev);
3883 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3884 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3885 struct intel_iommu *iommu = info->iommu;
3886 struct iommu_domain *domain;
3888 /* Must be NESTING domain */
3890 if (!nested_supported(iommu) || flags)
3891 return ERR_PTR(-EOPNOTSUPP);
3892 return intel_nested_domain_alloc(parent, user_data);
3896 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3897 return ERR_PTR(-EOPNOTSUPP);
3898 if (nested_parent && !nested_supported(iommu))
3899 return ERR_PTR(-EOPNOTSUPP);
3900 if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3901 return ERR_PTR(-EOPNOTSUPP);
3904 * The domain_alloc_user op needs to fully initialize a domain before
3905 * returning, so use iommu_domain_alloc() here for simplicity.
3907 domain = iommu_domain_alloc(dev->bus);
3909 return ERR_PTR(-ENOMEM);
3912 to_dmar_domain(domain)->nested_parent = true;
3914 if (dirty_tracking) {
3915 if (to_dmar_domain(domain)->use_first_level) {
3916 iommu_domain_free(domain);
3917 return ERR_PTR(-EOPNOTSUPP);
3919 domain->dirty_ops = &intel_dirty_ops;
3925 static void intel_iommu_domain_free(struct iommu_domain *domain)
3927 if (domain != &si_domain->domain)
3928 domain_exit(to_dmar_domain(domain));
3931 int prepare_domain_attach_device(struct iommu_domain *domain,
3934 struct device_domain_info *info = dev_iommu_priv_get(dev);
3935 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3936 struct intel_iommu *iommu = info->iommu;
3939 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3942 if (domain->dirty_ops && !ssads_supported(iommu))
3945 /* check if this iommu agaw is sufficient for max mapped address */
3946 addr_width = agaw_to_width(iommu->agaw);
3947 if (addr_width > cap_mgaw(iommu->cap))
3948 addr_width = cap_mgaw(iommu->cap);
3950 if (dmar_domain->max_addr > (1LL << addr_width))
3952 dmar_domain->gaw = addr_width;
3955 * Knock out extra levels of page tables if necessary
3957 while (iommu->agaw < dmar_domain->agaw) {
3958 struct dma_pte *pte;
3960 pte = dmar_domain->pgd;
3961 if (dma_pte_present(pte)) {
3962 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
3963 free_pgtable_page(pte);
3965 dmar_domain->agaw--;
3971 static int intel_iommu_attach_device(struct iommu_domain *domain,
3974 struct device_domain_info *info = dev_iommu_priv_get(dev);
3978 device_block_translation(dev);
3980 ret = prepare_domain_attach_device(domain, dev);
3984 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
3987 static int intel_iommu_map(struct iommu_domain *domain,
3988 unsigned long iova, phys_addr_t hpa,
3989 size_t size, int iommu_prot, gfp_t gfp)
3991 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3995 if (iommu_prot & IOMMU_READ)
3996 prot |= DMA_PTE_READ;
3997 if (iommu_prot & IOMMU_WRITE)
3998 prot |= DMA_PTE_WRITE;
3999 if (dmar_domain->set_pte_snp)
4000 prot |= DMA_PTE_SNP;
4002 max_addr = iova + size;
4003 if (dmar_domain->max_addr < max_addr) {
4006 /* check if minimum agaw is sufficient for mapped address */
4007 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4008 if (end < max_addr) {
4009 pr_err("%s: iommu width (%d) is not "
4010 "sufficient for the mapped address (%llx)\n",
4011 __func__, dmar_domain->gaw, max_addr);
4014 dmar_domain->max_addr = max_addr;
4016 /* Round up size to next multiple of PAGE_SIZE, if it and
4017 the low bits of hpa would take us onto the next page */
4018 size = aligned_nrpages(hpa, size);
4019 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4020 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
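/*
 * Illustrative sketch only (hypothetical helper): the page-count round-up
 * performed by aligned_nrpages() above.  A buffer whose low address bits and
 * size together cross a page boundary needs an extra page frame; e.g.
 * hpa = 0x1800 with size = 0x1000 spans two 4 KiB pages.
 */
static inline unsigned long __maybe_unused
example_nrpages(phys_addr_t hpa, size_t size)
{
	return PAGE_ALIGN((hpa & ~PAGE_MASK) + size) >> VTD_PAGE_SHIFT;
}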
4023 static int intel_iommu_map_pages(struct iommu_domain *domain,
4024 unsigned long iova, phys_addr_t paddr,
4025 size_t pgsize, size_t pgcount,
4026 int prot, gfp_t gfp, size_t *mapped)
4028 unsigned long pgshift = __ffs(pgsize);
4029 size_t size = pgcount << pgshift;
4032 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4035 if (!IS_ALIGNED(iova | paddr, pgsize))
4038 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4045 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4046 unsigned long iova, size_t size,
4047 struct iommu_iotlb_gather *gather)
4049 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4050 unsigned long start_pfn, last_pfn;
4053 /* Cope with horrid API which requires us to unmap more than the
4054 size argument if it happens to be a large-page mapping. */
4055 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4056 &level, GFP_ATOMIC)))
4059 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4060 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4062 start_pfn = iova >> VTD_PAGE_SHIFT;
4063 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4065 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4067 if (dmar_domain->max_addr == iova + size)
4068 dmar_domain->max_addr = iova;
4071 * We do not use page-selective IOTLB invalidation in the flush queue,
4072 * so there is no need to track the pages or sync the IOTLB here.
4074 if (!iommu_iotlb_gather_queued(gather))
4075 iommu_iotlb_gather_add_page(domain, gather, iova, size);
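/*
 * Illustrative sketch only (hypothetical helper): the size round-up that
 * intel_iommu_unmap() performs above when the IOVA is covered by a
 * large-page PTE.  Assuming the usual 9-bit stride per level and 4 KiB base
 * pages, level 1 maps 4 KiB, level 2 maps 2 MiB and level 3 maps 1 GiB.
 */
static inline size_t __maybe_unused example_unmap_granule(int level)
{
	return (size_t)VTD_PAGE_SIZE << ((level - 1) * 9);
}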
4080 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4082 size_t pgsize, size_t pgcount,
4083 struct iommu_iotlb_gather *gather)
4085 unsigned long pgshift = __ffs(pgsize);
4086 size_t size = pgcount << pgshift;
4088 return intel_iommu_unmap(domain, iova, size, gather);
4091 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4092 struct iommu_iotlb_gather *gather)
4094 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4095 unsigned long iova_pfn = IOVA_PFN(gather->start);
4096 size_t size = gather->end - gather->start;
4097 struct iommu_domain_info *info;
4098 unsigned long start_pfn;
4099 unsigned long nrpages;
4102 nrpages = aligned_nrpages(gather->start, size);
4103 start_pfn = mm_to_dma_pfn_start(iova_pfn);
4105 xa_for_each(&dmar_domain->iommu_array, i, info)
4106 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4108 list_empty(&gather->freelist), 0);
4110 put_pages_list(&gather->freelist);
4113 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4116 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4117 struct dma_pte *pte;
4121 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4123 if (pte && dma_pte_present(pte))
4124 phys = dma_pte_addr(pte) +
4125 (iova & (BIT_MASK(level_to_offset_bits(level) +
4126 VTD_PAGE_SHIFT) - 1));
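/*
 * Illustrative sketch only (hypothetical helper): the offset mask used by
 * intel_iommu_iova_to_phys() above.  For a PTE found at level 2 (a 2 MiB
 * superpage, assuming the usual 9-bit stride), the low 21 bits of the IOVA
 * are kept as the offset added to the PTE's physical address.
 */
static inline u64 __maybe_unused example_page_offset_mask(int level)
{
	return BIT_MASK((level - 1) * 9 + VTD_PAGE_SHIFT) - 1;
}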
4131 static bool domain_support_force_snooping(struct dmar_domain *domain)
4133 struct device_domain_info *info;
4134 bool support = true;
4136 assert_spin_locked(&domain->lock);
4137 list_for_each_entry(info, &domain->devices, link) {
4138 if (!ecap_sc_support(info->iommu->ecap)) {
4147 static void domain_set_force_snooping(struct dmar_domain *domain)
4149 struct device_domain_info *info;
4151 assert_spin_locked(&domain->lock);
4153 * The second-level page table supports per-PTE snoop control. The
4154 * iommu_map() interface will handle this by setting the SNP bit.
4156 if (!domain->use_first_level) {
4157 domain->set_pte_snp = true;
4161 list_for_each_entry(info, &domain->devices, link)
4162 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4166 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4168 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4169 unsigned long flags;
4171 if (dmar_domain->force_snooping)
4174 spin_lock_irqsave(&dmar_domain->lock, flags);
4175 if (!domain_support_force_snooping(dmar_domain) ||
4176 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4177 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4181 domain_set_force_snooping(dmar_domain);
4182 dmar_domain->force_snooping = true;
4183 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4188 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4190 struct device_domain_info *info = dev_iommu_priv_get(dev);
4193 case IOMMU_CAP_CACHE_COHERENCY:
4194 case IOMMU_CAP_DEFERRED_FLUSH:
4196 case IOMMU_CAP_PRE_BOOT_PROTECTION:
4197 return dmar_platform_optin();
4198 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4199 return ecap_sc_support(info->iommu->ecap);
4200 case IOMMU_CAP_DIRTY_TRACKING:
4201 return ssads_supported(info->iommu);
4207 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4209 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4210 struct device_domain_info *info;
4211 struct intel_iommu *iommu;
4215 iommu = device_lookup_iommu(dev, &bus, &devfn);
4216 if (!iommu || !iommu->iommu.ops)
4217 return ERR_PTR(-ENODEV);
4219 info = kzalloc(sizeof(*info), GFP_KERNEL);
4221 return ERR_PTR(-ENOMEM);
4223 if (dev_is_real_dma_subdevice(dev)) {
4224 info->bus = pdev->bus->number;
4225 info->devfn = pdev->devfn;
4226 info->segment = pci_domain_nr(pdev->bus);
4229 info->devfn = devfn;
4230 info->segment = iommu->segment;
4234 info->iommu = iommu;
4235 if (dev_is_pci(dev)) {
4236 if (ecap_dev_iotlb_support(iommu->ecap) &&
4237 pci_ats_supported(pdev) &&
4238 dmar_ats_supported(pdev, iommu)) {
4239 info->ats_supported = 1;
4240 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4243 * For IOMMUs that support device IOTLB throttling
4244 * (DIT), we assign the PFSID in the invalidation desc
4245 * of a VF so that the IOMMU HW can gauge queue depth
4246 * at the PF level. If DIT is not set, PFSID is
4247 * treated as reserved and should be set to 0.
4249 if (ecap_dit(iommu->ecap))
4250 info->pfsid = pci_dev_id(pci_physfn(pdev));
4251 info->ats_qdep = pci_ats_queue_depth(pdev);
4253 if (sm_supported(iommu)) {
4254 if (pasid_supported(iommu)) {
4255 int features = pci_pasid_features(pdev);
4258 info->pasid_supported = features | 1;
4261 if (info->ats_supported && ecap_prs(iommu->ecap) &&
4262 pci_pri_supported(pdev))
4263 info->pri_supported = 1;
4267 dev_iommu_priv_set(dev, info);
4269 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4270 ret = intel_pasid_alloc_table(dev);
4272 dev_err(dev, "PASID table allocation failed\n");
4274 return ERR_PTR(ret);
4278 intel_iommu_debugfs_create_dev(info);
4280 return &iommu->iommu;
4283 static void intel_iommu_release_device(struct device *dev)
4285 struct device_domain_info *info = dev_iommu_priv_get(dev);
4287 dmar_remove_one_dev_info(dev);
4288 intel_pasid_free_table(dev);
4289 intel_iommu_debugfs_remove_dev(info);
4291 set_dma_ops(dev, NULL);
4294 static void intel_iommu_probe_finalize(struct device *dev)
4296 set_dma_ops(dev, NULL);
4297 iommu_setup_dma_ops(dev, 0, U64_MAX);
4300 static void intel_iommu_get_resv_regions(struct device *device,
4301 struct list_head *head)
4303 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4304 struct iommu_resv_region *reg;
4305 struct dmar_rmrr_unit *rmrr;
4306 struct device *i_dev;
4310 for_each_rmrr_units(rmrr) {
4311 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4313 struct iommu_resv_region *resv;
4314 enum iommu_resv_type type;
4317 if (i_dev != device &&
4318 !is_downstream_to_pci_bridge(device, i_dev))
4321 length = rmrr->end_address - rmrr->base_address + 1;
4323 type = device_rmrr_is_relaxable(device) ?
4324 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4326 resv = iommu_alloc_resv_region(rmrr->base_address,
4332 list_add_tail(&resv->list, head);
4337 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4338 if (dev_is_pci(device)) {
4339 struct pci_dev *pdev = to_pci_dev(device);
4341 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4342 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4343 IOMMU_RESV_DIRECT_RELAXABLE,
4346 list_add_tail(&reg->list, head);
4349 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4351 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4352 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4353 0, IOMMU_RESV_MSI, GFP_KERNEL);
4356 list_add_tail(&reg->list, head);
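/*
 * Illustrative sketch only (hypothetical function): how a caller outside the
 * driver would consume the reserved regions built above, via the generic
 * IOMMU API.
 */
static void __maybe_unused example_dump_resv_regions(struct device *dev)
{
	struct iommu_resv_region *region;
	LIST_HEAD(resv);

	iommu_get_resv_regions(dev, &resv);
	list_for_each_entry(region, &resv, list)
		dev_info(dev, "resv [%pa + %zx] type %d\n",
			 &region->start, region->length, region->type);
	iommu_put_resv_regions(dev, &resv);
}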
4359 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4361 if (dev_is_pci(dev))
4362 return pci_device_group(dev);
4363 return generic_device_group(dev);
4366 static int intel_iommu_enable_sva(struct device *dev)
4368 struct device_domain_info *info = dev_iommu_priv_get(dev);
4369 struct intel_iommu *iommu;
4371 if (!info || dmar_disabled)
4374 iommu = info->iommu;
4378 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4381 if (!info->pasid_enabled || !info->ats_enabled)
4385 * Devices with device-specific I/O fault handling should not
4386 * support PCI/PRI. The IOMMU side has no means to check the
4387 * capability of device-specific IOPF. Therefore, the IOMMU can only
4388 * assume that if the device driver enables SVA on a non-PRI
4389 * device, the driver will handle IOPF in its own way.
4391 if (!info->pri_supported)
4394 /* Devices supporting PRI should have it enabled. */
4395 if (!info->pri_enabled)
4401 static int intel_iommu_enable_iopf(struct device *dev)
4403 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4404 struct device_domain_info *info = dev_iommu_priv_get(dev);
4405 struct intel_iommu *iommu;
4408 if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4411 if (info->pri_enabled)
4414 iommu = info->iommu;
4418 /* PASID is required in PRG Response Message. */
4419 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4422 ret = pci_reset_pri(pdev);
4426 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4430 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4432 goto iopf_remove_device;
4434 ret = pci_enable_pri(pdev, PRQ_DEPTH);
4436 goto iopf_unregister_handler;
4437 info->pri_enabled = 1;
4441 iopf_unregister_handler:
4442 iommu_unregister_device_fault_handler(dev);
4444 iopf_queue_remove_device(iommu->iopf_queue, dev);
4449 static int intel_iommu_disable_iopf(struct device *dev)
4451 struct device_domain_info *info = dev_iommu_priv_get(dev);
4452 struct intel_iommu *iommu = info->iommu;
4454 if (!info->pri_enabled)
4458 * PCIe spec states that by clearing PRI enable bit, the Page
4459 * Request Interface will not issue new page requests, but has
4460 * outstanding page requests that have been transmitted or are
4461 * queued for transmission. This is supposed to be called after
4462 * the device driver has stopped DMA, all PASIDs have been
4463 * unbound and the outstanding PRQs have been drained.
4465 pci_disable_pri(to_pci_dev(dev));
4466 info->pri_enabled = 0;
4469 * With PRI disabled and outstanding PRQs drained, unregistering the
4470 * fault handler and removing the device from the iopf queue should never fail.
4473 WARN_ON(iommu_unregister_device_fault_handler(dev));
4474 WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4480 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4483 case IOMMU_DEV_FEAT_IOPF:
4484 return intel_iommu_enable_iopf(dev);
4486 case IOMMU_DEV_FEAT_SVA:
4487 return intel_iommu_enable_sva(dev);
4495 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4498 case IOMMU_DEV_FEAT_IOPF:
4499 return intel_iommu_disable_iopf(dev);
4501 case IOMMU_DEV_FEAT_SVA:
4509 static bool intel_iommu_is_attach_deferred(struct device *dev)
4511 struct device_domain_info *info = dev_iommu_priv_get(dev);
4513 return translation_pre_enabled(info->iommu) && !info->domain;
4517 * Check that the device does not live on an external facing PCI port that is
4518 * marked as untrusted. Such devices should not be able to apply quirks and
4519 * thus not be able to bypass the IOMMU restrictions.
4521 static bool risky_device(struct pci_dev *pdev)
4523 if (pdev->untrusted) {
4525 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4526 pdev->vendor, pdev->device);
4527 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4533 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4534 unsigned long iova, size_t size)
4536 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4537 unsigned long pages = aligned_nrpages(iova, size);
4538 unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4539 struct iommu_domain_info *info;
4542 xa_for_each(&dmar_domain->iommu_array, i, info)
4543 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4547 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4549 struct device_domain_info *info = dev_iommu_priv_get(dev);
4550 struct dev_pasid_info *curr, *dev_pasid = NULL;
4551 struct intel_iommu *iommu = info->iommu;
4552 struct dmar_domain *dmar_domain;
4553 struct iommu_domain *domain;
4554 unsigned long flags;
4556 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4557 if (WARN_ON_ONCE(!domain))
4561 * The SVA implementation needs to handle its own stuff, such as mm
4562 * notifications. Before consolidating that code into the iommu core, let
4563 * the intel sva code handle it.
4565 if (domain->type == IOMMU_DOMAIN_SVA) {
4566 intel_svm_remove_dev_pasid(dev, pasid);
4570 dmar_domain = to_dmar_domain(domain);
4571 spin_lock_irqsave(&dmar_domain->lock, flags);
4572 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4573 if (curr->dev == dev && curr->pasid == pasid) {
4574 list_del(&curr->link_domain);
4579 WARN_ON_ONCE(!dev_pasid);
4580 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4582 domain_detach_iommu(dmar_domain, iommu);
4583 intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4586 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4587 intel_drain_pasid_prq(dev, pasid);
4590 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4591 struct device *dev, ioasid_t pasid)
4593 struct device_domain_info *info = dev_iommu_priv_get(dev);
4594 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4595 struct intel_iommu *iommu = info->iommu;
4596 struct dev_pasid_info *dev_pasid;
4597 unsigned long flags;
4600 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4603 if (domain->dirty_ops)
4606 if (context_copied(iommu, info->bus, info->devfn))
4609 ret = prepare_domain_attach_device(domain, dev);
4613 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4617 ret = domain_attach_iommu(dmar_domain, iommu);
4621 if (domain_type_is_si(dmar_domain))
4622 ret = intel_pasid_setup_pass_through(iommu, dev, pasid);
4623 else if (dmar_domain->use_first_level)
4624 ret = domain_setup_first_level(iommu, dmar_domain,
4627 ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4630 goto out_detach_iommu;
4632 dev_pasid->dev = dev;
4633 dev_pasid->pasid = pasid;
4634 spin_lock_irqsave(&dmar_domain->lock, flags);
4635 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4636 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4638 if (domain->type & __IOMMU_DOMAIN_PAGING)
4639 intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4643 domain_detach_iommu(dmar_domain, iommu);
4649 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4651 struct device_domain_info *info = dev_iommu_priv_get(dev);
4652 struct intel_iommu *iommu = info->iommu;
4653 struct iommu_hw_info_vtd *vtd;
4655 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4657 return ERR_PTR(-ENOMEM);
4659 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4660 vtd->cap_reg = iommu->cap;
4661 vtd->ecap_reg = iommu->ecap;
4662 *length = sizeof(*vtd);
4663 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4667 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4670 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4671 struct device_domain_info *info;
4674 spin_lock(&dmar_domain->lock);
4675 if (dmar_domain->dirty_tracking == enable)
4678 list_for_each_entry(info, &dmar_domain->devices, link) {
4679 ret = intel_pasid_setup_dirty_tracking(info->iommu,
4680 info->domain, info->dev,
4681 IOMMU_NO_PASID, enable);
4686 dmar_domain->dirty_tracking = enable;
4688 spin_unlock(&dmar_domain->lock);
4693 list_for_each_entry(info, &dmar_domain->devices, link)
4694 intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
4695 info->dev, IOMMU_NO_PASID,
4696 dmar_domain->dirty_tracking);
4697 spin_unlock(&dmar_domain->lock);
4701 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4702 unsigned long iova, size_t size,
4703 unsigned long flags,
4704 struct iommu_dirty_bitmap *dirty)
4706 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4707 unsigned long end = iova + size - 1;
4708 unsigned long pgsize;
4711 * The IOMMUFD core calls into a dirty-tracking-disabled domain without an
4712 * IOVA bitmap set in order to clear any dirty bits left in the PTEs when
4713 * dirty tracking was stopped. This ensures that we never inherit dirtied
4714 * bits from a previous cycle.
4716 if (!dmar_domain->dirty_tracking && dirty->bitmap)
4720 struct dma_pte *pte;
4723 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4725 pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4726 if (!pte || !dma_pte_present(pte)) {
4731 if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4732 iommu_dirty_bitmap_record(dirty, iova, pgsize);
4734 } while (iova < end);
4739 static const struct iommu_dirty_ops intel_dirty_ops = {
4740 .set_dirty_tracking = intel_iommu_set_dirty_tracking,
4741 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4744 const struct iommu_ops intel_iommu_ops = {
4745 .blocked_domain = &blocking_domain,
4746 .capable = intel_iommu_capable,
4747 .hw_info = intel_iommu_hw_info,
4748 .domain_alloc = intel_iommu_domain_alloc,
4749 .domain_alloc_user = intel_iommu_domain_alloc_user,
4750 .probe_device = intel_iommu_probe_device,
4751 .probe_finalize = intel_iommu_probe_finalize,
4752 .release_device = intel_iommu_release_device,
4753 .get_resv_regions = intel_iommu_get_resv_regions,
4754 .device_group = intel_iommu_device_group,
4755 .dev_enable_feat = intel_iommu_dev_enable_feat,
4756 .dev_disable_feat = intel_iommu_dev_disable_feat,
4757 .is_attach_deferred = intel_iommu_is_attach_deferred,
4758 .def_domain_type = device_def_domain_type,
4759 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
4760 .pgsize_bitmap = SZ_4K,
4761 #ifdef CONFIG_INTEL_IOMMU_SVM
4762 .page_response = intel_svm_page_response,
4764 .default_domain_ops = &(const struct iommu_domain_ops) {
4765 .attach_dev = intel_iommu_attach_device,
4766 .set_dev_pasid = intel_iommu_set_dev_pasid,
4767 .map_pages = intel_iommu_map_pages,
4768 .unmap_pages = intel_iommu_unmap_pages,
4769 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4770 .flush_iotlb_all = intel_flush_iotlb_all,
4771 .iotlb_sync = intel_iommu_tlb_sync,
4772 .iova_to_phys = intel_iommu_iova_to_phys,
4773 .free = intel_iommu_domain_free,
4774 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4778 static void quirk_iommu_igfx(struct pci_dev *dev)
4780 if (risky_device(dev))
4783 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4787 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4788 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4789 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4790 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4791 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4792 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4793 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4794 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4796 /* Broadwell igfx malfunctions with dmar */
4797 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4798 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4799 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4800 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4801 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4802 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4803 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4805 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4806 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4807 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4808 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4809 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4810 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4811 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4812 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4813 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4814 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4815 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4816 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4817 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4818 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4819 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4820 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4822 static void quirk_iommu_rwbf(struct pci_dev *dev)
4824 if (risky_device(dev))
4828 * Mobile 4 Series Chipset neglects to set RWBF capability,
4829 * but needs it. Same seems to hold for the desktop versions.
4831 pci_info(dev, "Forcing write-buffer flush capability\n");
4835 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4836 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4837 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4838 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4839 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4840 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4841 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
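/*
 * Illustrative sketch (hypothetical helper, not the driver's own flush
 * routine): how the rwbf_quirk flag set by the fixups above is typically
 * consumed, by forcing a write-buffer flush even when the capability
 * register does not advertise RWBF.
 */
static void __maybe_unused example_flush_write_buffer(struct intel_iommu *iommu)
{
	unsigned long flag;
	u32 val;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Wait until hardware clears the Write Buffer Flush Status bit. */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}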
4844 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4845 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4846 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4847 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4848 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4849 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4850 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4851 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4853 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4857 if (risky_device(dev))
4860 if (pci_read_config_word(dev, GGC, &ggc))
4863 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4864 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4866 } else if (dmar_map_gfx) {
4867 /* we have to ensure the gfx device is idle before we flush */
4868 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4869 iommu_set_dma_strict();
4872 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4873 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4874 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4875 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4877 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4881 if (!IS_GFX_DEVICE(dev))
4884 ver = (dev->device >> 8) & 0xff;
4885 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4886 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4887 ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4890 if (risky_device(dev))
4893 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4894 iommu_skip_te_disable = 1;
4896 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4898 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4899 ISOCH DMAR unit for the Azalia sound device, but not give it any
4900 TLB entries, which causes it to deadlock. Check for that. We do
4901 this in a function called from init_dmars(), instead of in a PCI
4902 quirk, because we don't want to print the obnoxious "BIOS broken"
4903 message if VT-d is actually disabled.
4905 static void __init check_tylersburg_isoch(void)
4907 struct pci_dev *pdev;
4908 uint32_t vtisochctrl;
4910 /* If there's no Azalia in the system anyway, forget it. */
4911 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4915 if (risky_device(pdev)) {
4922 /* System Management Registers. Might be hidden, in which case
4923 we can't do the sanity check. But that's OK, because the
4924 known-broken BIOSes _don't_ actually hide it, so far. */
4925 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4929 if (risky_device(pdev)) {
4934 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4941 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4942 if (vtisochctrl & 1)
4945 /* Drop all bits other than the number of TLB entries */
4946 vtisochctrl &= 0x1c;
4948 /* If we have the recommended number of TLB entries (16), fine. */
4949 if (vtisochctrl == 0x10)
4952 /* Zero TLB entries? You get to ride the short bus to school. */
4954 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4955 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4956 dmi_get_system_info(DMI_BIOS_VENDOR),
4957 dmi_get_system_info(DMI_BIOS_VERSION),
4958 dmi_get_system_info(DMI_PRODUCT_VERSION));
4959 iommu_identity_mapping |= IDENTMAP_AZALIA;
4963 	pr_warn("The recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
4968  * Here we deal with a device TLB defect where a device may inadvertently issue
4969  * an ATS invalidation completion before posted writes that were initiated with
4970  * translated addresses matching the invalidated address range have completed,
4971  * violating the invalidation completion ordering.
4972  * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
4973  * vulnerable to this defect. In other words, any dTLB invalidation that is not
4974  * initiated under the control of the trusted/privileged host device driver must
4975  * use this quirk (see the illustrative flush sketch below).
4976 * Device TLBs are invalidated under the following six conditions:
4977 * 1. Device driver does DMA API unmap IOVA
4978  * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
4979  * 3. A PASID is torn down after the PASID cache is flushed, e.g. process
4980  *    exit_mmap() due to a crash
4981  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where the
4982  *    VM has to free pages that were unmapped
4983 * 5. Userspace driver unmaps a DMA buffer
4984 * 6. Cache invalidation in vSVA usage (upcoming)
4986 * For #1 and #2, device drivers are responsible for stopping DMA traffic
4987  * before unmap/unbind. For #3, the iommu driver relies on the mmu_notifier to
4988  * invalidate the TLB the same way as a normal user unmap, which will use this quirk.
4989 * The dTLB invalidation after PASID cache flush does not need this quirk.
4991 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4993 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4994 unsigned long address, unsigned long mask,
4995 u32 pasid, u16 qdep)
4999 if (likely(!info->dtlb_extra_inval))
5002 sid = PCI_DEVID(info->bus, info->devfn);
5003 if (pasid == IOMMU_NO_PASID) {
5004 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5005 qdep, address, mask);
5007 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5008 pasid, qdep, address, mask);
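/*
 * Illustrative sketch (hypothetical helper, not part of this file): a dTLB
 * invalidation path that cannot guarantee DMA has already been stopped first
 * issues the regular device-IOTLB flush and then the extra flush described
 * above. The field names follow struct device_domain_info.
 */
static void __maybe_unused example_flush_dev_iotlb(struct device_domain_info *info,
						   u64 addr, unsigned int mask)
{
	u16 sid, qdep;

	if (!info->ats_enabled)
		return;

	sid = PCI_DEVID(info->bus, info->devfn);
	qdep = info->ats_qdep;
	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, qdep, addr, mask);
	/* Extra flush to cover the premature invalidation completion defect. */
	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
}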
5012 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
5015 * Function to submit a command to the enhanced command interface. The
5016 * valid enhanced command descriptions are defined in Table 47 of the
5017 * VT-d spec. The VT-d hardware implementation may support some but not
5018 * all commands, which can be determined by checking the Enhanced
5019  * Command Capability Register. Return values:
5022 * - 0: Command successful without any error;
5023 * - Negative: software error value;
5024 * - Nonzero positive: failure status code defined in Table 48.
5026 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5028 unsigned long flags;
5032 if (!cap_ecmds(iommu->cap))
5035 raw_spin_lock_irqsave(&iommu->register_lock, flags);
5037 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5038 if (res & DMA_ECMD_ECRSP_IP) {
5044 * Unconditionally write the operand B, because
5045 * - There is no side effect if an ecmd doesn't require an
5046 * operand B, but we set the register to some value.
5047 * - It's not invoked in any critical path. The extra MMIO
5048 * write doesn't bring any performance concerns.
5050 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5051 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5053 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5054 !(res & DMA_ECMD_ECRSP_IP), res);
5056 if (res & DMA_ECMD_ECRSP_IP) {
5061 ret = ecmd_get_status_code(res);
5063 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
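/*
 * Illustrative sketch (hypothetical caller, not part of this file): how a
 * user of ecmd_submit_sync() is expected to interpret its return value as
 * documented above. The command code and operand A are placeholders chosen
 * by the caller from Table 47 of the VT-d spec.
 */
static int __maybe_unused example_issue_ecmd(struct intel_iommu *iommu, u8 ecmd, u64 oa)
{
	int ret;

	ret = ecmd_submit_sync(iommu, ecmd, oa, 0);
	if (ret < 0)		/* software error, e.g. -ENODEV or -EBUSY */
		return ret;
	if (ret > 0) {		/* hardware failure status code from Table 48 */
		pr_err("ecmd 0x%x failed, status 0x%x\n", ecmd, ret);
		return -EIO;
	}

	return 0;		/* command completed successfully */
}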