1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
27 #include "../dma-iommu.h"
28 #include "../irq_remapping.h"
29 #include "../iommu-sva.h"
31 #include "cap_audit.h"
34 #define ROOT_SIZE VTD_PAGE_SIZE
35 #define CONTEXT_SIZE VTD_PAGE_SIZE
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42 #define IOAPIC_RANGE_START (0xfee00000)
43 #define IOAPIC_RANGE_END (0xfeefffff)
44 #define IOVA_START_ADDR (0x1000)
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
51 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
57 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
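/*
 * Illustrative example (assumes VTD_PAGE_SHIFT == 12): for gaw = 48,
 * __DOMAIN_MAX_PFN(48) = 2^36 - 1 and __DOMAIN_MAX_ADDR(48) = 2^48 - 1.
 * DOMAIN_MAX_PFN(48) is the same 2^36 - 1 on a 64-bit kernel but clamps
 * to ULONG_MAX on 32-bit, while DOMAIN_MAX_ADDR(48) is the max PFN
 * shifted back up: (2^36 - 1) << 12 = 0xfffffffff000.
 */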
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN (1)
63 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
65 /* page table handling */
66 #define LEVEL_STRIDE (9)
67 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
69 static inline int agaw_to_level(int agaw)
74 static inline int agaw_to_width(int agaw)
76 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
79 static inline int width_to_agaw(int width)
81 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
84 static inline unsigned int level_to_offset_bits(int level)
86 return (level - 1) * LEVEL_STRIDE;
89 static inline int pfn_level_offset(u64 pfn, int level)
91 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
94 static inline u64 level_mask(int level)
96 return -1ULL << level_to_offset_bits(level);
99 static inline u64 level_size(int level)
101 return 1ULL << level_to_offset_bits(level);
104 static inline u64 align_to_level(u64 pfn, int level)
106 return (pfn + level_size(level) - 1) & level_mask(level);
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
111 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
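/*
 * Illustrative example of the 9-bit-per-level arithmetic above (assumes
 * VTD_PAGE_SHIFT == 12): each level indexes 512 entries, so a level-1
 * entry maps 4KiB, a level-2 entry 2MiB and a level-3 entry 1GiB.
 * For pfn 0x12345:
 *   pfn_level_offset(0x12345, 1) = 0x12345 & 0x1ff         = 0x145
 *   pfn_level_offset(0x12345, 2) = (0x12345 >> 9) & 0x1ff  = 0x091
 *   pfn_level_offset(0x12345, 3) = (0x12345 >> 18) & 0x1ff = 0x000
 *   align_to_level(0x12345, 2)   = 0x12400
 *   lvl_to_nr_pages(2)           = 512
 */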
114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
115 are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
118 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
122 return mm_to_dma_pfn(page_to_pfn(pg));
124 static inline unsigned long virt_to_dma_pfn(void *p)
126 return page_to_dma_pfn(virt_to_page(p));
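/*
 * Illustrative note: on x86 both PAGE_SHIFT and VTD_PAGE_SHIFT are 12,
 * so mm_to_dma_pfn() shifts by zero and MM PFNs map 1:1 onto VT-d PFNs;
 * the conversion only matters when MM pages are larger than the 4KiB
 * VT-d page.
 */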
129 static void __init check_tylersburg_isoch(void);
130 static int rwbf_quirk;
133 * set to 1 to panic the kernel if VT-d cannot be successfully enabled
134 * (used when the kernel is launched with TXT)
136 static int force_on = 0;
137 static int intel_iommu_tboot_noforce;
138 static int no_platform_optin;
140 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
143 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
146 static phys_addr_t root_entry_lctp(struct root_entry *re)
151 return re->lo & VTD_PAGE_MASK;
155 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
158 static phys_addr_t root_entry_uctp(struct root_entry *re)
163 return re->hi & VTD_PAGE_MASK;
166 static inline void context_set_present(struct context_entry *context)
171 static inline void context_set_fault_enable(struct context_entry *context)
173 context->lo &= (((u64)-1) << 2) | 1;
176 static inline void context_set_translation_type(struct context_entry *context,
179 context->lo &= (((u64)-1) << 4) | 3;
180 context->lo |= (value & 3) << 2;
183 static inline void context_set_address_root(struct context_entry *context,
186 context->lo &= ~VTD_PAGE_MASK;
187 context->lo |= value & VTD_PAGE_MASK;
190 static inline void context_set_address_width(struct context_entry *context,
193 context->hi |= value & 7;
196 static inline void context_set_domain_id(struct context_entry *context,
199 context->hi |= (value & ((1 << 16) - 1)) << 8;
202 static inline void context_set_pasid(struct context_entry *context)
204 context->lo |= CONTEXT_PASIDE;
207 static inline int context_domain_id(struct context_entry *c)
209 return((c->hi >> 8) & 0xffff);
212 static inline void context_clear_entry(struct context_entry *context)
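/*
 * Summary of the legacy context-entry bits manipulated by the helpers
 * above, derived from the shifts and masks they use (see the VT-d spec
 * for the authoritative layout):
 *   lo[0]      present
 *   lo[1]      fault processing disable (cleared by context_set_fault_enable)
 *   lo[3:2]    translation type
 *   lo[63:12]  address root (page-table pointer)
 *   hi[2:0]    address width (AGAW)
 *   hi[23:8]   domain id
 */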
218 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
220 if (!iommu->copied_tables)
223 return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
227 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
229 set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
233 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
235 clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
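/*
 * Illustrative note: the copied_tables bitmap is indexed by the 16-bit
 * requester ID, (bus << 8) | devfn, so it can track all 65536 possible
 * source-ids per IOMMU; e.g. bus 0x3a, devfn 0x10 uses bit 0x3a10.
 */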
239 * This domain is a static identity mapping domain.
240 *	1. This domain creates a static 1:1 mapping to all usable memory.
241 *	2. It maps to each iommu if successful.
242 *	3. Each iommu maps to this domain if successful.
244 static struct dmar_domain *si_domain;
245 static int hw_pass_through = 1;
247 struct dmar_rmrr_unit {
248 struct list_head list; /* list of rmrr units */
249 struct acpi_dmar_header *hdr; /* ACPI header */
250 u64 base_address; /* reserved base address*/
251 u64 end_address; /* reserved end address */
252 struct dmar_dev_scope *devices; /* target devices */
253 int devices_cnt; /* target device count */
256 struct dmar_atsr_unit {
257 struct list_head list; /* list of ATSR units */
258 struct acpi_dmar_header *hdr; /* ACPI header */
259 struct dmar_dev_scope *devices; /* target devices */
260 int devices_cnt; /* target device count */
261 u8 include_all:1; /* include all ports */
264 struct dmar_satc_unit {
265 struct list_head list; /* list of SATC units */
266 struct acpi_dmar_header *hdr; /* ACPI header */
267 struct dmar_dev_scope *devices; /* target devices */
268 struct intel_iommu *iommu; /* the corresponding iommu */
269 int devices_cnt; /* target device count */
270 u8 atc_required:1; /* ATS is required */
273 static LIST_HEAD(dmar_atsr_units);
274 static LIST_HEAD(dmar_rmrr_units);
275 static LIST_HEAD(dmar_satc_units);
277 #define for_each_rmrr_units(rmrr) \
278 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
280 static void device_block_translation(struct device *dev);
281 static void intel_iommu_domain_free(struct iommu_domain *domain);
283 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
284 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
286 int intel_iommu_enabled = 0;
287 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
289 static int dmar_map_gfx = 1;
290 static int intel_iommu_superpage = 1;
291 static int iommu_identity_mapping;
292 static int iommu_skip_te_disable;
294 #define IDENTMAP_GFX 2
295 #define IDENTMAP_AZALIA 4
297 const struct iommu_ops intel_iommu_ops;
299 static bool translation_pre_enabled(struct intel_iommu *iommu)
301 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
304 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
306 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
309 static void init_translation_status(struct intel_iommu *iommu)
313 gsts = readl(iommu->reg + DMAR_GSTS_REG);
314 if (gsts & DMA_GSTS_TES)
315 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
318 static int __init intel_iommu_setup(char *str)
324 if (!strncmp(str, "on", 2)) {
326 pr_info("IOMMU enabled\n");
327 } else if (!strncmp(str, "off", 3)) {
329 no_platform_optin = 1;
330 pr_info("IOMMU disabled\n");
331 } else if (!strncmp(str, "igfx_off", 8)) {
333 pr_info("Disable GFX device mapping\n");
334 } else if (!strncmp(str, "forcedac", 8)) {
335 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
336 iommu_dma_forcedac = true;
337 } else if (!strncmp(str, "strict", 6)) {
338 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
339 iommu_set_dma_strict();
340 } else if (!strncmp(str, "sp_off", 6)) {
341 pr_info("Disable supported super page\n");
342 intel_iommu_superpage = 0;
343 } else if (!strncmp(str, "sm_on", 5)) {
344 pr_info("Enable scalable mode if hardware supports\n");
346 } else if (!strncmp(str, "sm_off", 6)) {
347 pr_info("Scalable mode is disallowed\n");
349 } else if (!strncmp(str, "tboot_noforce", 13)) {
350 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
351 intel_iommu_tboot_noforce = 1;
353 pr_notice("Unknown option - '%s'\n", str);
356 str += strcspn(str, ",");
363 __setup("intel_iommu=", intel_iommu_setup);
365 void *alloc_pgtable_page(int node, gfp_t gfp)
370 page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
372 vaddr = page_address(page);
376 void free_pgtable_page(void *vaddr)
378 free_page((unsigned long)vaddr);
381 static inline int domain_type_is_si(struct dmar_domain *domain)
383 return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
386 static inline int domain_pfn_supported(struct dmar_domain *domain,
389 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
391 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
395 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
396 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
397 * the returned SAGAW.
399 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
401 unsigned long fl_sagaw, sl_sagaw;
403 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
404 sl_sagaw = cap_sagaw(iommu->cap);
406 /* Second level only. */
407 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
410 /* First level only. */
411 if (!ecap_slts(iommu->ecap))
414 return fl_sagaw & sl_sagaw;
417 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
422 sagaw = __iommu_calculate_sagaw(iommu);
423 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
424 if (test_bit(agaw, &sagaw))
432 * Calculate max SAGAW for each iommu.
434 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
436 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
440 * Calculate the agaw for each iommu.
441 * "SAGAW" may be different across iommus; use a default agaw, and
442 * fall back to a smaller supported agaw for iommus that don't support the default agaw.
444 int iommu_calculate_agaw(struct intel_iommu *iommu)
446 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
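/*
 * Illustrative example: agaw_to_width() gives 30 + 9 * agaw, so agaw
 * 1/2/3 correspond to 39/48/57-bit widths and width_to_agaw(48) =
 * DIV_ROUND_UP(18, 9) = 2. Per VT-d spec 11.4.2, SAGAW bits 1/2/3
 * advertise 3/4/5-level (39/48/57-bit) second-level tables. An IOMMU
 * reporting cap_sagaw = 0b0110 (39- and 48-bit) with 4-level-only
 * first-level support yields a combined SAGAW of BIT(2), so
 * iommu_calculate_agaw() walks down from width_to_agaw(57) = 3 and
 * settles on agaw 2, i.e. a 48-bit domain address width.
 */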
449 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
451 return sm_supported(iommu) ?
452 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
455 static void domain_update_iommu_coherency(struct dmar_domain *domain)
457 struct iommu_domain_info *info;
458 struct dmar_drhd_unit *drhd;
459 struct intel_iommu *iommu;
463 domain->iommu_coherency = true;
464 xa_for_each(&domain->iommu_array, i, info) {
466 if (!iommu_paging_structure_coherency(info->iommu)) {
467 domain->iommu_coherency = false;
474 /* No hardware attached; use lowest common denominator */
476 for_each_active_iommu(iommu, drhd) {
477 if (!iommu_paging_structure_coherency(iommu)) {
478 domain->iommu_coherency = false;
485 static int domain_update_iommu_superpage(struct dmar_domain *domain,
486 struct intel_iommu *skip)
488 struct dmar_drhd_unit *drhd;
489 struct intel_iommu *iommu;
492 if (!intel_iommu_superpage)
495 /* set iommu_superpage to the smallest common denominator */
497 for_each_active_iommu(iommu, drhd) {
499 if (domain && domain->use_first_level) {
500 if (!cap_fl1gp_support(iommu->cap))
503 mask &= cap_super_page_val(iommu->cap);
515 static int domain_update_device_node(struct dmar_domain *domain)
517 struct device_domain_info *info;
518 int nid = NUMA_NO_NODE;
521 spin_lock_irqsave(&domain->lock, flags);
522 list_for_each_entry(info, &domain->devices, link) {
524 * There could be multiple device NUMA nodes, as devices within
525 * the same domain may sit behind different IOMMUs. There is no
526 * perfect answer in such a situation, so we select a first come,
527 * first served policy.
529 nid = dev_to_node(info->dev);
530 if (nid != NUMA_NO_NODE)
533 spin_unlock_irqrestore(&domain->lock, flags);
538 static void domain_update_iotlb(struct dmar_domain *domain);
540 /* Return the super pagesize bitmap if supported. */
541 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
543 unsigned long bitmap = 0;
546 * A 1-level super page supports a page size of 2MiB; a 2-level super
547 * page supports page sizes of both 2MiB and 1GiB.
549 if (domain->iommu_superpage == 1)
551 else if (domain->iommu_superpage == 2)
552 bitmap |= SZ_2M | SZ_1G;
557 /* Some capabilities may be different across iommus */
558 static void domain_update_iommu_cap(struct dmar_domain *domain)
560 domain_update_iommu_coherency(domain);
561 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
564 * If RHSA is missing, we should default to the device numa domain
567 if (domain->nid == NUMA_NO_NODE)
568 domain->nid = domain_update_device_node(domain);
571 * First-level translation restricts the input address to a
572 * canonical address (i.e., address bits 63:N have the same
573 * value as address bit [N-1], where N is 48 bits with 4-level
574 * paging and 57 bits with 5-level paging). Hence, skip bit [N-1].
577 if (domain->use_first_level)
578 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
580 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
582 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
583 domain_update_iotlb(domain);
586 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
589 struct root_entry *root = &iommu->root_entry[bus];
590 struct context_entry *context;
594 * Unless the caller requested to allocate a new entry,
595 * returning a copied context entry makes no sense.
597 if (!alloc && context_copied(iommu, bus, devfn))
601 if (sm_supported(iommu)) {
609 context = phys_to_virt(*entry & VTD_PAGE_MASK);
611 unsigned long phy_addr;
615 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
619 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
620 phy_addr = virt_to_phys((void *)context);
621 *entry = phy_addr | 1;
622 __iommu_flush_cache(iommu, entry, sizeof(*entry));
624 return &context[devfn];
628 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
629 * sub-hierarchy of a candidate PCI-PCI bridge
630 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
631 * @bridge: the candidate PCI-PCI bridge
633 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
636 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
638 struct pci_dev *pdev, *pbridge;
640 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
643 pdev = to_pci_dev(dev);
644 pbridge = to_pci_dev(bridge);
646 if (pbridge->subordinate &&
647 pbridge->subordinate->number <= pdev->bus->number &&
648 pbridge->subordinate->busn_res.end >= pdev->bus->number)
654 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
656 struct dmar_drhd_unit *drhd;
660 /* We know that this device on this chipset has its own IOMMU.
661 * If we find it under a different IOMMU, then the BIOS is lying
662 * to us. Hope that the IOMMU for this device is actually
663 * disabled, and it needs no translation...
665 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
668 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
673 /* we know that this iommu should be at offset 0xa000 from vtbar */
674 drhd = dmar_find_matched_drhd_unit(pdev);
675 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
676 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
677 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
684 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
686 if (!iommu || iommu->drhd->ignored)
689 if (dev_is_pci(dev)) {
690 struct pci_dev *pdev = to_pci_dev(dev);
692 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
693 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
694 quirk_ioat_snb_local_iommu(pdev))
701 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
703 struct dmar_drhd_unit *drhd = NULL;
704 struct pci_dev *pdev = NULL;
705 struct intel_iommu *iommu;
713 if (dev_is_pci(dev)) {
714 struct pci_dev *pf_pdev;
716 pdev = pci_real_dma_dev(to_pci_dev(dev));
718 /* VFs aren't listed in scope tables; we need to look up
719 * the PF instead to find the IOMMU. */
720 pf_pdev = pci_physfn(pdev);
722 segment = pci_domain_nr(pdev->bus);
723 } else if (has_acpi_companion(dev))
724 dev = &ACPI_COMPANION(dev)->dev;
727 for_each_iommu(iommu, drhd) {
728 if (pdev && segment != drhd->segment)
731 for_each_active_dev_scope(drhd->devices,
732 drhd->devices_cnt, i, tmp) {
734 /* For a VF use its original BDF# not that of the PF
735 * which we used for the IOMMU lookup. Strictly speaking
736 * we could do this for all PCI devices; we only need to
737 * get the BDF# from the scope table for ACPI matches. */
738 if (pdev && pdev->is_virtfn)
742 *bus = drhd->devices[i].bus;
743 *devfn = drhd->devices[i].devfn;
748 if (is_downstream_to_pci_bridge(dev, tmp))
752 if (pdev && drhd->include_all) {
755 *bus = pdev->bus->number;
756 *devfn = pdev->devfn;
763 if (iommu_is_dummy(iommu, dev))
771 static void domain_flush_cache(struct dmar_domain *domain,
772 void *addr, int size)
774 if (!domain->iommu_coherency)
775 clflush_cache_range(addr, size);
778 static void free_context_table(struct intel_iommu *iommu)
780 struct context_entry *context;
783 if (!iommu->root_entry)
786 for (i = 0; i < ROOT_ENTRY_NR; i++) {
787 context = iommu_context_addr(iommu, i, 0, 0);
789 free_pgtable_page(context);
791 if (!sm_supported(iommu))
794 context = iommu_context_addr(iommu, i, 0x80, 0);
796 free_pgtable_page(context);
799 free_pgtable_page(iommu->root_entry);
800 iommu->root_entry = NULL;
803 #ifdef CONFIG_DMAR_DEBUG
804 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
805 u8 bus, u8 devfn, struct dma_pte *parent, int level)
811 offset = pfn_level_offset(pfn, level);
812 pte = &parent[offset];
813 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
814 pr_info("PTE not present at level %d\n", level);
818 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
823 parent = phys_to_virt(dma_pte_addr(pte));
828 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
829 unsigned long long addr, u32 pasid)
831 struct pasid_dir_entry *dir, *pde;
832 struct pasid_entry *entries, *pte;
833 struct context_entry *ctx_entry;
834 struct root_entry *rt_entry;
835 int i, dir_index, index, level;
836 u8 devfn = source_id & 0xff;
837 u8 bus = source_id >> 8;
838 struct dma_pte *pgtable;
840 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
842 /* root entry dump */
843 rt_entry = &iommu->root_entry[bus];
845 pr_info("root table entry is not present\n");
849 if (sm_supported(iommu))
850 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
851 rt_entry->hi, rt_entry->lo);
853 pr_info("root entry: 0x%016llx", rt_entry->lo);
855 /* context entry dump */
856 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
858 pr_info("context table entry is not present\n");
862 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
863 ctx_entry->hi, ctx_entry->lo);
865 /* legacy mode does not require PASID entries */
866 if (!sm_supported(iommu)) {
867 level = agaw_to_level(ctx_entry->hi & 7);
868 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
872 /* get the pointer to pasid directory entry */
873 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
875 pr_info("pasid directory entry is not present\n");
878 /* For request-without-pasid, get the pasid from context entry */
879 if (intel_iommu_sm && pasid == INVALID_IOASID)
880 pasid = PASID_RID2PASID;
882 dir_index = pasid >> PASID_PDE_SHIFT;
883 pde = &dir[dir_index];
884 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
886 /* get the pointer to the pasid table entry */
887 entries = get_pasid_table_from_pde(pde);
889 pr_info("pasid table entry is not present\n");
892 index = pasid & PASID_PTE_MASK;
893 pte = &entries[index];
894 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
895 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
897 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
898 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
899 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
901 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
902 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
906 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
910 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
911 unsigned long pfn, int *target_level,
914 struct dma_pte *parent, *pte;
915 int level = agaw_to_level(domain->agaw);
918 BUG_ON(!domain->pgd);
920 if (!domain_pfn_supported(domain, pfn))
921 /* Address beyond IOMMU's addressing capabilities. */
924 parent = domain->pgd;
929 offset = pfn_level_offset(pfn, level);
930 pte = &parent[offset];
931 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
933 if (level == *target_level)
936 if (!dma_pte_present(pte)) {
939 tmp_page = alloc_pgtable_page(domain->nid, gfp);
944 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
945 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
946 if (domain->use_first_level)
947 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
949 if (cmpxchg64(&pte->val, 0ULL, pteval))
950 /* Someone else set it while we were thinking; use theirs. */
951 free_pgtable_page(tmp_page);
953 domain_flush_cache(domain, pte, sizeof(*pte));
958 parent = phys_to_virt(dma_pte_addr(pte));
963 *target_level = level;
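/*
 * Note on the walk above: called with *target_level == 0 it descends
 * until it reaches an existing leaf (a present superpage) or a hole and
 * reports the level it stopped at back through *target_level; called
 * with a specific level, e.g. 2 for a 2MiB mapping, it allocates any
 * missing intermediate tables via alloc_pgtable_page() and returns the
 * PTE slot at that level.
 */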
968 /* return the address's pte at a specific level */
969 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
971 int level, int *large_page)
973 struct dma_pte *parent, *pte;
974 int total = agaw_to_level(domain->agaw);
977 parent = domain->pgd;
978 while (level <= total) {
979 offset = pfn_level_offset(pfn, total);
980 pte = &parent[offset];
984 if (!dma_pte_present(pte)) {
989 if (dma_pte_superpage(pte)) {
994 parent = phys_to_virt(dma_pte_addr(pte));
1000 /* clear last level pte; a tlb flush should follow */
1001 static void dma_pte_clear_range(struct dmar_domain *domain,
1002 unsigned long start_pfn,
1003 unsigned long last_pfn)
1005 unsigned int large_page;
1006 struct dma_pte *first_pte, *pte;
1008 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1009 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1010 BUG_ON(start_pfn > last_pfn);
1012 /* we don't need lock here; nobody else touches the iova range */
1015 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1017 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1022 start_pfn += lvl_to_nr_pages(large_page);
1024 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1026 domain_flush_cache(domain, first_pte,
1027 (void *)pte - (void *)first_pte);
1029 } while (start_pfn && start_pfn <= last_pfn);
1032 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1033 int retain_level, struct dma_pte *pte,
1034 unsigned long pfn, unsigned long start_pfn,
1035 unsigned long last_pfn)
1037 pfn = max(start_pfn, pfn);
1038 pte = &pte[pfn_level_offset(pfn, level)];
1041 unsigned long level_pfn;
1042 struct dma_pte *level_pte;
1044 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1047 level_pfn = pfn & level_mask(level);
1048 level_pte = phys_to_virt(dma_pte_addr(pte));
1051 dma_pte_free_level(domain, level - 1, retain_level,
1052 level_pte, level_pfn, start_pfn,
1057 * Free the page table if we're below the level we want to
1058 * retain and the range covers the entire table.
1060 if (level < retain_level && !(start_pfn > level_pfn ||
1061 last_pfn < level_pfn + level_size(level) - 1)) {
1063 domain_flush_cache(domain, pte, sizeof(*pte));
1064 free_pgtable_page(level_pte);
1067 pfn += level_size(level);
1068 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1072 * clear last level (leaf) ptes and free page table pages below the
1073 * level we wish to keep intact.
1075 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1076 unsigned long start_pfn,
1077 unsigned long last_pfn,
1080 dma_pte_clear_range(domain, start_pfn, last_pfn);
1082 /* We don't need lock here; nobody else touches the iova range */
1083 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1084 domain->pgd, 0, start_pfn, last_pfn);
1087 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1088 free_pgtable_page(domain->pgd);
1093 /* When a page at a given level is being unlinked from its parent, we don't
1094 need to *modify* it at all. All we need to do is make a list of all the
1095 pages which can be freed just as soon as we've flushed the IOTLB and we
1096 know the hardware page-walk will no longer touch them.
1097 The 'pte' argument is the *parent* PTE, pointing to the page that is to be freed. */
1099 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1100 int level, struct dma_pte *pte,
1101 struct list_head *freelist)
1105 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1106 list_add_tail(&pg->lru, freelist);
1111 pte = page_address(pg);
1113 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1114 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1116 } while (!first_pte_in_page(pte));
1119 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1120 struct dma_pte *pte, unsigned long pfn,
1121 unsigned long start_pfn, unsigned long last_pfn,
1122 struct list_head *freelist)
1124 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1126 pfn = max(start_pfn, pfn);
1127 pte = &pte[pfn_level_offset(pfn, level)];
1130 unsigned long level_pfn = pfn & level_mask(level);
1132 if (!dma_pte_present(pte))
1135 /* If range covers entire pagetable, free it */
1136 if (start_pfn <= level_pfn &&
1137 last_pfn >= level_pfn + level_size(level) - 1) {
1138 /* These subordinate page tables are going away entirely. Don't
1139 bother to clear them; we're just going to *free* them. */
1140 if (level > 1 && !dma_pte_superpage(pte))
1141 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1147 } else if (level > 1) {
1148 /* Recurse down into a level that isn't *entirely* obsolete */
1149 dma_pte_clear_level(domain, level - 1,
1150 phys_to_virt(dma_pte_addr(pte)),
1151 level_pfn, start_pfn, last_pfn,
1155 pfn = level_pfn + level_size(level);
1156 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1159 domain_flush_cache(domain, first_pte,
1160 (void *)++last_pte - (void *)first_pte);
1163 /* We can't just free the pages because the IOMMU may still be walking
1164 the page tables, and may have cached the intermediate levels. The
1165 pages can only be freed after the IOTLB flush has been done. */
1166 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1167 unsigned long last_pfn, struct list_head *freelist)
1169 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1170 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1171 BUG_ON(start_pfn > last_pfn);
1173 /* we don't need lock here; nobody else touches the iova range */
1174 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1175 domain->pgd, 0, start_pfn, last_pfn, freelist);
1178 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1179 struct page *pgd_page = virt_to_page(domain->pgd);
1180 list_add_tail(&pgd_page->lru, freelist);
1185 /* iommu handling */
1186 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1188 struct root_entry *root;
1190 root = (struct root_entry *)alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1192 pr_err("Allocating root entry for %s failed\n",
1197 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1198 iommu->root_entry = root;
1203 static void iommu_set_root_entry(struct intel_iommu *iommu)
1209 addr = virt_to_phys(iommu->root_entry);
1210 if (sm_supported(iommu))
1211 addr |= DMA_RTADDR_SMT;
1213 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1214 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1216 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1218 /* Make sure the hardware completes it */
1219 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1220 readl, (sts & DMA_GSTS_RTPS), sts);
1222 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1225 * Hardware invalidates all DMA remapping hardware translation
1226 * caches as part of SRTP flow.
1228 if (cap_esrtps(iommu->cap))
1231 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1232 if (sm_supported(iommu))
1233 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1234 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1237 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1242 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1245 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1246 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1248 /* Make sure the hardware completes it */
1249 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1250 readl, (!(val & DMA_GSTS_WBFS)), val);
1252 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1255 /* the return value determines whether we need a write buffer flush */
1256 static void __iommu_flush_context(struct intel_iommu *iommu,
1257 u16 did, u16 source_id, u8 function_mask,
1264 case DMA_CCMD_GLOBAL_INVL:
1265 val = DMA_CCMD_GLOBAL_INVL;
1267 case DMA_CCMD_DOMAIN_INVL:
1268 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1270 case DMA_CCMD_DEVICE_INVL:
1271 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1272 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1277 val |= DMA_CCMD_ICC;
1279 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1280 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1282 /* Make sure the hardware completes it */
1283 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1284 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1286 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1289 /* the return value determines whether we need a write buffer flush */
1290 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1291 u64 addr, unsigned int size_order, u64 type)
1293 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1294 u64 val = 0, val_iva = 0;
1298 case DMA_TLB_GLOBAL_FLUSH:
1299 /* a global flush doesn't need to set IVA_REG */
1300 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1302 case DMA_TLB_DSI_FLUSH:
1303 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1305 case DMA_TLB_PSI_FLUSH:
1306 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1307 /* IH bit is passed in as part of address */
1308 val_iva = size_order | addr;
1313 /* Note: set drain read/write */
1316 * This is probably meant to be super secure. It looks like we can
1317 * ignore it without any impact.
1319 if (cap_read_drain(iommu->cap))
1320 val |= DMA_TLB_READ_DRAIN;
1322 if (cap_write_drain(iommu->cap))
1323 val |= DMA_TLB_WRITE_DRAIN;
1325 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1326 /* Note: Only uses first TLB reg currently */
1328 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1329 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1331 /* Make sure the hardware completes it */
1332 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1333 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1335 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1337 /* check IOTLB invalidation granularity */
1338 if (DMA_TLB_IAIG(val) == 0)
1339 pr_err("Flush IOTLB failed\n");
1340 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1341 pr_debug("TLB flush request %Lx, actual %Lx\n",
1342 (unsigned long long)DMA_TLB_IIRG(type),
1343 (unsigned long long)DMA_TLB_IAIG(val));
1346 static struct device_domain_info *
1347 domain_lookup_dev_info(struct dmar_domain *domain,
1348 struct intel_iommu *iommu, u8 bus, u8 devfn)
1350 struct device_domain_info *info;
1351 unsigned long flags;
1353 spin_lock_irqsave(&domain->lock, flags);
1354 list_for_each_entry(info, &domain->devices, link) {
1355 if (info->iommu == iommu && info->bus == bus &&
1356 info->devfn == devfn) {
1357 spin_unlock_irqrestore(&domain->lock, flags);
1361 spin_unlock_irqrestore(&domain->lock, flags);
1366 static void domain_update_iotlb(struct dmar_domain *domain)
1368 struct device_domain_info *info;
1369 bool has_iotlb_device = false;
1370 unsigned long flags;
1372 spin_lock_irqsave(&domain->lock, flags);
1373 list_for_each_entry(info, &domain->devices, link) {
1374 if (info->ats_enabled) {
1375 has_iotlb_device = true;
1379 domain->has_iotlb_device = has_iotlb_device;
1380 spin_unlock_irqrestore(&domain->lock, flags);
1384 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1385 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1386 * check because it applies only to the built-in QAT devices and it doesn't
1387 * grant additional privileges.
1389 #define BUGGY_QAT_DEVID_MASK 0x4940
1390 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1392 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1395 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1401 static void iommu_enable_pci_caps(struct device_domain_info *info)
1403 struct pci_dev *pdev;
1405 if (!dev_is_pci(info->dev))
1408 pdev = to_pci_dev(info->dev);
1409 /* For an IOMMU that supports device IOTLB throttling (DIT), we assign a
1410  * PFSID to the invalidation desc of a VF such that the IOMMU HW can gauge
1411  * queue depth at the PF level. If DIT is not set, PFSID will be treated as
1412  * reserved, which should be set to 0.
1414 if (!ecap_dit(info->iommu->ecap))
1417 struct pci_dev *pf_pdev;
1419 /* pdev will be returned if device is not a vf */
1420 pf_pdev = pci_physfn(pdev);
1421 info->pfsid = pci_dev_id(pf_pdev);
1424 /* The PCIe spec, in its wisdom, declares that the behaviour of
1425    the device is undefined if you enable PASID support after ATS
1426    support. So always enable PASID support on devices which have
1427    it, even if we can't yet know if we're ever going to use it. */
1429 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1430 info->pasid_enabled = 1;
1432 if (info->pri_supported &&
1433 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1434 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1435 info->pri_enabled = 1;
1437 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1438 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1439 info->ats_enabled = 1;
1440 domain_update_iotlb(info->domain);
1441 info->ats_qdep = pci_ats_queue_depth(pdev);
1445 static void iommu_disable_pci_caps(struct device_domain_info *info)
1447 struct pci_dev *pdev;
1449 if (!dev_is_pci(info->dev))
1452 pdev = to_pci_dev(info->dev);
1454 if (info->ats_enabled) {
1455 pci_disable_ats(pdev);
1456 info->ats_enabled = 0;
1457 domain_update_iotlb(info->domain);
1460 if (info->pri_enabled) {
1461 pci_disable_pri(pdev);
1462 info->pri_enabled = 0;
1465 if (info->pasid_enabled) {
1466 pci_disable_pasid(pdev);
1467 info->pasid_enabled = 0;
1471 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1472 u64 addr, unsigned int mask)
1476 if (!info || !info->ats_enabled)
1479 sid = info->bus << 8 | info->devfn;
1480 qdep = info->ats_qdep;
1481 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1483 quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep);
1486 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1487 u64 addr, unsigned mask)
1489 struct device_domain_info *info;
1490 unsigned long flags;
1492 if (!domain->has_iotlb_device)
1495 spin_lock_irqsave(&domain->lock, flags);
1496 list_for_each_entry(info, &domain->devices, link)
1497 __iommu_flush_dev_iotlb(info, addr, mask);
1498 spin_unlock_irqrestore(&domain->lock, flags);
1501 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1502 struct dmar_domain *domain,
1503 unsigned long pfn, unsigned int pages,
1506 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1507 unsigned int mask = ilog2(aligned_pages);
1508 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1509 u16 did = domain_id_iommu(domain, iommu);
1516 if (domain->use_first_level) {
1517 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1519 unsigned long bitmask = aligned_pages - 1;
1522 * PSI masks the low order bits of the base address. If the
1523 * address isn't aligned to the mask, then compute a mask value
1524 * needed to ensure the target range is flushed.
1526 if (unlikely(bitmask & pfn)) {
1527 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1530 * Since end_pfn <= pfn + bitmask, the only way bits
1531 * higher than bitmask can differ in pfn and end_pfn is
1532 * by carrying. This means after masking out bitmask,
1533 * high bits starting with the first set bit in
1534 * shared_bits are all equal in both pfn and end_pfn.
1536 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1537 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
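/*
 * Illustrative example: pfn = 0x1003, pages = 2 gives aligned_pages = 2,
 * bitmask = 1 and an unaligned base. end_pfn = 0x1004, pfn ^ end_pfn =
 * 0x7, so shared_bits = ~0x7 & ~0x1 and mask = __ffs(shared_bits) = 3:
 * the PSI then covers pfns 0x1000-0x1007, which contains the whole
 * 0x1003-0x1004 target range.
 */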
1541 * Fall back to domain selective flush if there is no PSI support or
1542 * the size is too big.
1544 if (!cap_pgsel_inv(iommu->cap) ||
1545 mask > cap_max_amask_val(iommu->cap))
1546 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1549 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1554 * In caching mode, changes of pages from non-present to present require
1555 * a flush. However, the device IOTLB does not need to be flushed in this case.
1557 if (!cap_caching_mode(iommu->cap) || !map)
1558 iommu_flush_dev_iotlb(domain, addr, mask);
1561 /* Notification for newly created mappings */
1562 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1563 struct dmar_domain *domain,
1564 unsigned long pfn, unsigned int pages)
1567 * It's a non-present to present mapping. Only flush if caching mode and second level.
1570 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1571 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1573 iommu_flush_write_buffer(iommu);
1576 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1578 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1579 struct iommu_domain_info *info;
1582 xa_for_each(&dmar_domain->iommu_array, idx, info) {
1583 struct intel_iommu *iommu = info->iommu;
1584 u16 did = domain_id_iommu(dmar_domain, iommu);
1586 if (dmar_domain->use_first_level)
1587 qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1589 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1592 if (!cap_caching_mode(iommu->cap))
1593 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1597 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1600 unsigned long flags;
1602 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1605 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1606 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1607 pmen &= ~DMA_PMEN_EPM;
1608 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1610 /* wait for the protected region status bit to clear */
1611 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1612 readl, !(pmen & DMA_PMEN_PRS), pmen);
1614 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1617 static void iommu_enable_translation(struct intel_iommu *iommu)
1620 unsigned long flags;
1622 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1623 iommu->gcmd |= DMA_GCMD_TE;
1624 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1626 /* Make sure the hardware completes it */
1627 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1628 readl, (sts & DMA_GSTS_TES), sts);
1630 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1633 static void iommu_disable_translation(struct intel_iommu *iommu)
1638 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1639 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1642 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1643 iommu->gcmd &= ~DMA_GCMD_TE;
1644 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1646 /* Make sure the hardware completes it */
1647 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1648 readl, (!(sts & DMA_GSTS_TES)), sts);
1650 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1653 static int iommu_init_domains(struct intel_iommu *iommu)
1657 ndomains = cap_ndoms(iommu->cap);
1658 pr_debug("%s: Number of Domains supported <%d>\n",
1659 iommu->name, ndomains);
1661 spin_lock_init(&iommu->lock);
1663 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1664 if (!iommu->domain_ids)
1668 * If Caching mode is set, then invalid translations are tagged
1669 * with domain-id 0, hence we need to pre-allocate it. We also
1670 * use domain-id 0 as a marker for non-allocated domain-id, so
1671 * make sure it is not used for a real domain.
1673 set_bit(0, iommu->domain_ids);
1676 * The VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1677 * entry for first-level or pass-through translation modes should
1678 * be programmed with a domain id different from those used for
1679 * second-level or nested translation. We reserve a domain id for this purpose.
1682 if (sm_supported(iommu))
1683 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1688 static void disable_dmar_iommu(struct intel_iommu *iommu)
1690 if (!iommu->domain_ids)
1694 * All iommu domains must have been detached from the devices,
1695 * hence there should be no domain IDs in use.
1697 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1698 > NUM_RESERVED_DID))
1701 if (iommu->gcmd & DMA_GCMD_TE)
1702 iommu_disable_translation(iommu);
1705 static void free_dmar_iommu(struct intel_iommu *iommu)
1707 if (iommu->domain_ids) {
1708 bitmap_free(iommu->domain_ids);
1709 iommu->domain_ids = NULL;
1712 if (iommu->copied_tables) {
1713 bitmap_free(iommu->copied_tables);
1714 iommu->copied_tables = NULL;
1717 /* free context mapping */
1718 free_context_table(iommu);
1720 #ifdef CONFIG_INTEL_IOMMU_SVM
1721 if (pasid_supported(iommu)) {
1722 if (ecap_prs(iommu->ecap))
1723 intel_svm_finish_prq(iommu);
1725 if (vccap_pasid(iommu->vccap))
1726 ioasid_unregister_allocator(&iommu->pasid_allocator);
1732 * Check and return whether first level is used by default for DMA translation.
1735 static bool first_level_by_default(unsigned int type)
1737 /* Only SL is available in legacy mode */
1738 if (!scalable_mode_support())
1741 /* Only level (either FL or SL) is available, just use it */
1742 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1743 return intel_cap_flts_sanity();
1745 /* Both levels are available, decide it based on domain type */
1746 return type != IOMMU_DOMAIN_UNMANAGED;
1749 static struct dmar_domain *alloc_domain(unsigned int type)
1751 struct dmar_domain *domain;
1753 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1757 domain->nid = NUMA_NO_NODE;
1758 if (first_level_by_default(type))
1759 domain->use_first_level = true;
1760 domain->has_iotlb_device = false;
1761 INIT_LIST_HEAD(&domain->devices);
1762 spin_lock_init(&domain->lock);
1763 xa_init(&domain->iommu_array);
1768 static int domain_attach_iommu(struct dmar_domain *domain,
1769 struct intel_iommu *iommu)
1771 struct iommu_domain_info *info, *curr;
1772 unsigned long ndomains;
1773 int num, ret = -ENOSPC;
1775 info = kzalloc(sizeof(*info), GFP_KERNEL);
1779 spin_lock(&iommu->lock);
1780 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1783 spin_unlock(&iommu->lock);
1788 ndomains = cap_ndoms(iommu->cap);
1789 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1790 if (num >= ndomains) {
1791 pr_err("%s: No free domain ids\n", iommu->name);
1795 set_bit(num, iommu->domain_ids);
1798 info->iommu = iommu;
1799 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1800 NULL, info, GFP_ATOMIC);
1802 ret = xa_err(curr) ? : -EBUSY;
1805 domain_update_iommu_cap(domain);
1807 spin_unlock(&iommu->lock);
1811 clear_bit(info->did, iommu->domain_ids);
1813 spin_unlock(&iommu->lock);
1818 static void domain_detach_iommu(struct dmar_domain *domain,
1819 struct intel_iommu *iommu)
1821 struct iommu_domain_info *info;
1823 spin_lock(&iommu->lock);
1824 info = xa_load(&domain->iommu_array, iommu->seq_id);
1825 if (--info->refcnt == 0) {
1826 clear_bit(info->did, iommu->domain_ids);
1827 xa_erase(&domain->iommu_array, iommu->seq_id);
1828 domain->nid = NUMA_NO_NODE;
1829 domain_update_iommu_cap(domain);
1832 spin_unlock(&iommu->lock);
1835 static inline int guestwidth_to_adjustwidth(int gaw)
1838 int r = (gaw - 12) % 9;
1849 static void domain_exit(struct dmar_domain *domain)
1852 LIST_HEAD(freelist);
1854 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1855 put_pages_list(&freelist);
1858 if (WARN_ON(!list_empty(&domain->devices)))
1865 * Get the PASID directory size for a scalable mode context entry.
1866 * The value X in the PDTS field of a scalable mode context entry
1867 * indicates a PASID directory with 2^(X + 7) entries.
1869 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1871 unsigned long pds, max_pde;
1873 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1874 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
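/*
 * Illustrative example (assumes PASID_PDE_SHIFT == 6, i.e. 64 PASID
 * table entries per directory entry): for a 20-bit PASID space,
 * max_pasid = 1 << 20, so max_pde = 1 << 14 and find_first_bit()
 * returns 14. A directory of 2^14 entries is encoded in the PDTS
 * field as X = 14 - 7 = 7, following the 2^(X + 7) rule above.
 */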
1882 * Set the RID_PASID field of a scalable mode context entry. The
1883 * IOMMU hardware will use the PASID value set in this field for
1884 * DMA translations of DMA requests without PASID.
1887 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1889 context->hi |= pasid & ((1 << 20) - 1);
1893 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1896 static inline void context_set_sm_dte(struct context_entry *context)
1898 context->lo |= (1 << 2);
1902 * Set the PRE(Page Request Enable) field of a scalable mode context
1905 static inline void context_set_sm_pre(struct context_entry *context)
1907 context->lo |= (1 << 4);
1910 /* Convert value to context PASID directory size field coding. */
1911 #define context_pdts(pds) (((pds) & 0x7) << 9)
1913 static int domain_context_mapping_one(struct dmar_domain *domain,
1914 struct intel_iommu *iommu,
1915 struct pasid_table *table,
1918 struct device_domain_info *info =
1919 domain_lookup_dev_info(domain, iommu, bus, devfn);
1920 u16 did = domain_id_iommu(domain, iommu);
1921 int translation = CONTEXT_TT_MULTI_LEVEL;
1922 struct context_entry *context;
1927 if (hw_pass_through && domain_type_is_si(domain))
1928 translation = CONTEXT_TT_PASS_THROUGH;
1930 pr_debug("Set context mapping for %02x:%02x.%d\n",
1931 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1933 BUG_ON(!domain->pgd);
1935 spin_lock(&iommu->lock);
1937 context = iommu_context_addr(iommu, bus, devfn, 1);
1942 if (context_present(context) && !context_copied(iommu, bus, devfn))
1946 * For kdump cases, old valid entries may be cached due to the
1947 * in-flight DMA and copied pgtable, but there is no unmapping
1948 * behaviour for them, thus we need an explicit cache flush for
1949 * the newly-mapped device. For kdump, at this point, the device
1950 * is supposed to have finished reset at its driver probe stage, so no
1951 * in-flight DMA will exist, and we don't need to worry about it anymore.
1954 if (context_copied(iommu, bus, devfn)) {
1955 u16 did_old = context_domain_id(context);
1957 if (did_old < cap_ndoms(iommu->cap)) {
1958 iommu->flush.flush_context(iommu, did_old,
1959 (((u16)bus) << 8) | devfn,
1960 DMA_CCMD_MASK_NOBIT,
1961 DMA_CCMD_DEVICE_INVL);
1962 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1966 clear_context_copied(iommu, bus, devfn);
1969 context_clear_entry(context);
1971 if (sm_supported(iommu)) {
1976 /* Setup the PASID DIR pointer: */
1977 pds = context_get_sm_pds(table);
1978 context->lo = (u64)virt_to_phys(table->table) |
1981 /* Setup the RID_PASID field: */
1982 context_set_sm_rid2pasid(context, PASID_RID2PASID);
1985 * Setup the Device-TLB enable bit and Page request
1988 if (info && info->ats_supported)
1989 context_set_sm_dte(context);
1990 if (info && info->pri_supported)
1991 context_set_sm_pre(context);
1992 if (info && info->pasid_supported)
1993 context_set_pasid(context);
1995 struct dma_pte *pgd = domain->pgd;
1998 context_set_domain_id(context, did);
2000 if (translation != CONTEXT_TT_PASS_THROUGH) {
2002 * Skip top levels of page tables for iommu which has
2003 * less agaw than default. Unnecessary for PT mode.
2005 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2007 pgd = phys_to_virt(dma_pte_addr(pgd));
2008 if (!dma_pte_present(pgd))
2012 if (info && info->ats_supported)
2013 translation = CONTEXT_TT_DEV_IOTLB;
2015 translation = CONTEXT_TT_MULTI_LEVEL;
2017 context_set_address_root(context, virt_to_phys(pgd));
2018 context_set_address_width(context, agaw);
2021 * In pass through mode, AW must be programmed to
2022 * indicate the largest AGAW value supported by
2023 * hardware. And ASR is ignored by hardware.
2025 context_set_address_width(context, iommu->msagaw);
2028 context_set_translation_type(context, translation);
2031 context_set_fault_enable(context);
2032 context_set_present(context);
2033 if (!ecap_coherent(iommu->ecap))
2034 clflush_cache_range(context, sizeof(*context));
2037 * It's a non-present to present mapping. If the hardware doesn't cache
2038 * non-present entries we only need to flush the write-buffer. If it
2039 * _does_ cache non-present entries, then it does so in the special
2040 * domain #0, which we have to flush:
2042 if (cap_caching_mode(iommu->cap)) {
2043 iommu->flush.flush_context(iommu, 0,
2044 (((u16)bus) << 8) | devfn,
2045 DMA_CCMD_MASK_NOBIT,
2046 DMA_CCMD_DEVICE_INVL);
2047 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2049 iommu_flush_write_buffer(iommu);
2055 spin_unlock(&iommu->lock);
2060 struct domain_context_mapping_data {
2061 struct dmar_domain *domain;
2062 struct intel_iommu *iommu;
2063 struct pasid_table *table;
2066 static int domain_context_mapping_cb(struct pci_dev *pdev,
2067 u16 alias, void *opaque)
2069 struct domain_context_mapping_data *data = opaque;
2071 return domain_context_mapping_one(data->domain, data->iommu,
2072 data->table, PCI_BUS_NUM(alias),
2077 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2079 struct domain_context_mapping_data data;
2080 struct pasid_table *table;
2081 struct intel_iommu *iommu;
2084 iommu = device_to_iommu(dev, &bus, &devfn);
2088 table = intel_pasid_get_table(dev);
2090 if (!dev_is_pci(dev))
2091 return domain_context_mapping_one(domain, iommu, table,
2094 data.domain = domain;
2098 return pci_for_each_dma_alias(to_pci_dev(dev),
2099 &domain_context_mapping_cb, &data);
2102 /* Returns the number of VT-d pages, but aligned to the MM page size */
2103 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2106 host_addr &= ~PAGE_MASK;
2107 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
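/*
 * Illustrative example (4KiB pages): host_addr = 0xffe, size = 4 spans
 * a page boundary, so PAGE_ALIGN(0xffe + 4) = 0x2000 and two VT-d pages
 * are needed; host_addr = 0x1000, size = 0x1000 needs exactly one.
 */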
2110 /* Return largest possible superpage level for a given mapping */
2111 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2112 unsigned long iov_pfn,
2113 unsigned long phy_pfn,
2114 unsigned long pages)
2116 int support, level = 1;
2117 unsigned long pfnmerge;
2119 support = domain->iommu_superpage;
2121 /* To use a large page, the virtual *and* physical addresses
2122 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2123 of them will mean we have to use smaller pages. So just
2124 merge them and check both at once. */
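/*
 * Illustrative example: iov_pfn = 0x200 and phy_pfn = 0x400 are both
 * 2MiB aligned (low 9 bits clear), so pfnmerge = 0x600 is too; with at
 * least 512 pages to map and domain->iommu_superpage >= 1 the loop
 * below settles on level 2, i.e. a 2MiB superpage. iov_pfn = 0x201
 * would leave it at level 1 (4KiB pages).
 */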
2125 pfnmerge = iov_pfn | phy_pfn;
2127 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2128 pages >>= VTD_STRIDE_SHIFT;
2131 pfnmerge >>= VTD_STRIDE_SHIFT;
2139 * Ensure that old small page tables are removed to make room for superpage(s).
2140 * We're going to add new large pages, so make sure we don't remove their parent
2141 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2143 static void switch_to_super_page(struct dmar_domain *domain,
2144 unsigned long start_pfn,
2145 unsigned long end_pfn, int level)
2147 unsigned long lvl_pages = lvl_to_nr_pages(level);
2148 struct iommu_domain_info *info;
2149 struct dma_pte *pte = NULL;
2152 while (start_pfn <= end_pfn) {
2154 pte = pfn_to_dma_pte(domain, start_pfn, &level,
2157 if (dma_pte_present(pte)) {
2158 dma_pte_free_pagetable(domain, start_pfn,
2159 start_pfn + lvl_pages - 1,
2162 xa_for_each(&domain->iommu_array, i, info)
2163 iommu_flush_iotlb_psi(info->iommu, domain,
2164 start_pfn, lvl_pages,
2169 start_pfn += lvl_pages;
2170 if (first_pte_in_page(pte))
2176 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2177 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2180 struct dma_pte *first_pte = NULL, *pte = NULL;
2181 unsigned int largepage_lvl = 0;
2182 unsigned long lvl_pages = 0;
2186 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2188 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2191 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2192 attr |= DMA_FL_PTE_PRESENT;
2193 if (domain->use_first_level) {
2194 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2195 if (prot & DMA_PTE_WRITE)
2196 attr |= DMA_FL_PTE_DIRTY;
2199 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2201 while (nr_pages > 0) {
2205 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2206 phys_pfn, nr_pages);
2208 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2214 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2216 /* It is a large page */
2217 if (largepage_lvl > 1) {
2218 unsigned long end_pfn;
2219 unsigned long pages_to_remove;
2221 pteval |= DMA_PTE_LARGE_PAGE;
2222 pages_to_remove = min_t(unsigned long, nr_pages,
2223 nr_pte_to_next_page(pte) * lvl_pages);
2224 end_pfn = iov_pfn + pages_to_remove - 1;
2225 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2227 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2231 /* We don't need lock here, nobody else
2232 * touches the iova range
2234 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2236 static int dumps = 5;
2237 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2238 iov_pfn, tmp, (unsigned long long)pteval);
2241 debug_dma_dump_mappings(NULL);
2246 nr_pages -= lvl_pages;
2247 iov_pfn += lvl_pages;
2248 phys_pfn += lvl_pages;
2249 pteval += lvl_pages * VTD_PAGE_SIZE;
2251 /* If the next PTE would be the first in a new page, then we
2252 * need to flush the cache on the entries we've just written.
2253 * And then we'll need to recalculate 'pte', so clear it and
2254 * let it get set again in the if (!pte) block above.
2256 * If we're done (!nr_pages) we need to flush the cache too.
2258 * Also if we've been setting superpages, we may need to
2259 * recalculate 'pte' and switch back to smaller pages for the
2260 * end of the mapping, if the trailing size is not enough to
2261 * use another superpage (i.e. nr_pages < lvl_pages).
2264 if (!nr_pages || first_pte_in_page(pte) ||
2265 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2266 domain_flush_cache(domain, first_pte,
2267 (void *)pte - (void *)first_pte);
2275 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2277 struct intel_iommu *iommu = info->iommu;
2278 struct context_entry *context;
2284 spin_lock(&iommu->lock);
2285 context = iommu_context_addr(iommu, bus, devfn, 0);
2287 spin_unlock(&iommu->lock);
2291 if (sm_supported(iommu)) {
2292 if (hw_pass_through && domain_type_is_si(info->domain))
2293 did_old = FLPT_DEFAULT_DID;
2295 did_old = domain_id_iommu(info->domain, iommu);
2297 did_old = context_domain_id(context);
2300 context_clear_entry(context);
2301 __iommu_flush_cache(iommu, context, sizeof(*context));
2302 spin_unlock(&iommu->lock);
2303 iommu->flush.flush_context(iommu,
2305 (((u16)bus) << 8) | devfn,
2306 DMA_CCMD_MASK_NOBIT,
2307 DMA_CCMD_DEVICE_INVL);
2309 if (sm_supported(iommu))
2310 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2312 iommu->flush.flush_iotlb(iommu,
2318 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2321 static int domain_setup_first_level(struct intel_iommu *iommu,
2322 struct dmar_domain *domain,
2326 struct dma_pte *pgd = domain->pgd;
2331 * Skip top levels of page tables for iommu which has
2332 * less agaw than default. Unnecessary for PT mode.
2334 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2335 pgd = phys_to_virt(dma_pte_addr(pgd));
2336 if (!dma_pte_present(pgd))
2340 level = agaw_to_level(agaw);
2341 if (level != 4 && level != 5)
2344 if (pasid != PASID_RID2PASID)
2345 flags |= PASID_FLAG_SUPERVISOR_MODE;
2347 flags |= PASID_FLAG_FL5LP;
2349 if (domain->force_snooping)
2350 flags |= PASID_FLAG_PAGE_SNOOP;
2352 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2353 domain_id_iommu(domain, iommu),
2357 static bool dev_is_real_dma_subdevice(struct device *dev)
2359 return dev && dev_is_pci(dev) &&
2360 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
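/*
 * Identity-map [first_vpfn, last_vpfn]: clear any stale PTEs in the range
 * and then map each IOVA PFN to the physical PFN of the same value.
 */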
2363 static int iommu_domain_identity_map(struct dmar_domain *domain,
2364 unsigned long first_vpfn,
2365 unsigned long last_vpfn)
/*
 * The RMRR range might overlap with a physical memory range,
 * so clear it first.
 */
2371 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2373 return __domain_mapping(domain, first_vpfn,
2374 first_vpfn, last_vpfn - first_vpfn + 1,
2375 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2378 static int md_domain_init(struct dmar_domain *domain, int guest_width);
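/*
 * Build the static identity (si) domain, identity-mapping all online
 * memory ranges as well as the RMRR regions.
 */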
2380 static int __init si_domain_init(int hw)
2382 struct dmar_rmrr_unit *rmrr;
2386 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2390 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2391 domain_exit(si_domain);
2399 for_each_online_node(nid) {
2400 unsigned long start_pfn, end_pfn;
2403 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2404 ret = iommu_domain_identity_map(si_domain,
2405 mm_to_dma_pfn(start_pfn),
2406 mm_to_dma_pfn(end_pfn));
/*
 * Identity map the RMRRs so that devices with RMRRs can also use
 * the si_domain.
 */
2416 for_each_rmrr_units(rmrr) {
2417 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2419 unsigned long long start = rmrr->base_address;
2420 unsigned long long end = rmrr->end_address;
2422 if (WARN_ON(end < start ||
2423 end >> agaw_to_width(si_domain->agaw)))
2426 ret = iommu_domain_identity_map(si_domain,
2427 mm_to_dma_pfn(start >> PAGE_SHIFT),
2428 mm_to_dma_pfn(end >> PAGE_SHIFT));
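/*
 * Attach @dev to @domain: link the device into the domain's device list
 * and set up the RID2PASID entry (scalable mode) or the context entry
 * (legacy mode) for DMA requests without PASID.
 */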
2437 static int dmar_domain_attach_device(struct dmar_domain *domain,
2440 struct device_domain_info *info = dev_iommu_priv_get(dev);
2441 struct intel_iommu *iommu;
2442 unsigned long flags;
2446 iommu = device_to_iommu(dev, &bus, &devfn);
2450 ret = domain_attach_iommu(domain, iommu);
2453 info->domain = domain;
2454 spin_lock_irqsave(&domain->lock, flags);
2455 list_add(&info->link, &domain->devices);
2456 spin_unlock_irqrestore(&domain->lock, flags);
2458 /* PASID table is mandatory for a PCI device in scalable mode. */
2459 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2460 /* Setup the PASID entry for requests without PASID: */
2461 if (hw_pass_through && domain_type_is_si(domain))
2462 ret = intel_pasid_setup_pass_through(iommu, domain,
2463 dev, PASID_RID2PASID);
2464 else if (domain->use_first_level)
2465 ret = domain_setup_first_level(iommu, domain, dev,
2468 ret = intel_pasid_setup_second_level(iommu, domain,
2469 dev, PASID_RID2PASID);
2471 dev_err(dev, "Setup RID2PASID failed\n");
2472 device_block_translation(dev);
2477 ret = domain_context_mapping(domain, dev);
2479 dev_err(dev, "Domain context map failed\n");
2480 device_block_translation(dev);
2484 iommu_enable_pci_caps(info);
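/* Return true if any RMRR unit's device scope contains @dev. */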
2489 static bool device_has_rmrr(struct device *dev)
2491 struct dmar_rmrr_unit *rmrr;
2496 for_each_rmrr_units(rmrr) {
/*
 * Return true if this RMRR contains the device that
 * we are looking for.
 */
2501 for_each_active_dev_scope(rmrr->devices,
2502 rmrr->devices_cnt, i, tmp)
2504 is_downstream_to_pci_bridge(dev, tmp)) {
/**
 * device_rmrr_is_relaxable - Test whether the RMRR of this device
 * is relaxable (i.e. is allowed to be not enforced under some conditions)
 * @dev: device handle
 *
 * We assume that PCI USB devices with RMRRs have them largely
 * for historical reasons and that the RMRR space is not actively used post
 * boot. This exclusion may change if vendors begin to abuse it.
 *
 * The same exception is made for graphics devices, with the requirement that
 * any use of the RMRR regions will be torn down before assigning the device
 * to a guest.
 *
 * Return: true if the RMRR is relaxable, false otherwise
 */
2528 static bool device_rmrr_is_relaxable(struct device *dev)
2530 struct pci_dev *pdev;
2532 if (!dev_is_pci(dev))
2535 pdev = to_pci_dev(dev);
2536 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
/*
 * There are a couple cases where we need to restrict the functionality of
 * devices associated with RMRRs. The first is when evaluating a device for
 * identity mapping because problems exist when devices are moved in and out
 * of domains and their respective RMRR information is lost. This means that
 * a device with associated RMRRs will never be in a "passthrough" domain.
 * The second is use of the device through the IOMMU API. This interface
 * expects to have full control of the IOVA space for the device. We cannot
 * satisfy both the requirement that RMRR access is maintained and have an
 * unencumbered IOVA space. We also have no ability to quiesce the device's
 * use of the RMRR space or even inform the IOMMU API user of the restriction.
 * We therefore prevent devices associated with an RMRR from participating in
 * the IOMMU API, which eliminates them from device assignment.
 *
 * In both cases, devices which have relaxable RMRRs are not concerned by this
 * restriction. See device_rmrr_is_relaxable comment.
 */
2559 static bool device_is_rmrr_locked(struct device *dev)
2561 if (!device_has_rmrr(dev))
2564 if (device_rmrr_is_relaxable(dev))
/*
 * Return the required default domain type for a specific device.
 *
 * @dev: the device in question
 *
 * Returns:
 *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
 *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
 *  - 0: both identity and dynamic domains work for this device
 */
2581 static int device_def_domain_type(struct device *dev)
2583 if (dev_is_pci(dev)) {
2584 struct pci_dev *pdev = to_pci_dev(dev);
2586 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2587 return IOMMU_DOMAIN_IDENTITY;
2589 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2590 return IOMMU_DOMAIN_IDENTITY;
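/*
 * Select the invalidation backend for @iommu: prefer queued invalidation,
 * falling back to register-based invalidation if it cannot be enabled.
 */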
2596 static void intel_iommu_init_qi(struct intel_iommu *iommu)
/*
 * Start from a sane IOMMU hardware state.
 * If queued invalidation was already initialized by us (for
 * example, while enabling interrupt remapping), then things
 * are already rolling from a sane state.
 */
/*
 * Clear any previous faults.
 */
2608 dmar_fault(-1, iommu);
/*
 * Disable queued invalidation if supported and already enabled
 * before OS handover.
 */
2613 dmar_disable_qi(iommu);
2616 if (dmar_enable_qi(iommu)) {
/*
 * Queued invalidation is not enabled; use register-based
 * invalidation instead.
 */
2620 iommu->flush.flush_context = __iommu_flush_context;
2621 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2622 pr_info("%s: Using Register based invalidation\n",
2625 iommu->flush.flush_context = qi_flush_context;
2626 iommu->flush.flush_iotlb = qi_flush_iotlb;
2627 pr_info("%s: Using Queued invalidation\n", iommu->name);
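/*
 * Copy one bus worth of context entries from the previous kernel's tables
 * (kdump handover), marking the referenced domain IDs and copied entries
 * as in use.
 */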
2631 static int copy_context_table(struct intel_iommu *iommu,
2632 struct root_entry *old_re,
2633 struct context_entry **tbl,
2636 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2637 struct context_entry *new_ce = NULL, ce;
2638 struct context_entry *old_ce = NULL;
2639 struct root_entry re;
2640 phys_addr_t old_ce_phys;
2642 tbl_idx = ext ? bus * 2 : bus;
2643 memcpy(&re, old_re, sizeof(re));
2645 for (devfn = 0; devfn < 256; devfn++) {
2646 /* First calculate the correct index */
2647 idx = (ext ? devfn * 2 : devfn) % 256;
2650 /* First save what we may have and clean up */
2652 tbl[tbl_idx] = new_ce;
2653 __iommu_flush_cache(iommu, new_ce,
2663 old_ce_phys = root_entry_lctp(&re);
2665 old_ce_phys = root_entry_uctp(&re);
2668 if (ext && devfn == 0) {
2669 /* No LCTP, try UCTP */
2678 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2683 new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2690 /* Now copy the context entry */
2691 memcpy(&ce, old_ce + idx, sizeof(ce));
2693 if (!context_present(&ce))
2696 did = context_domain_id(&ce);
2697 if (did >= 0 && did < cap_ndoms(iommu->cap))
2698 set_bit(did, iommu->domain_ids);
2700 set_context_copied(iommu, bus, devfn);
2704 tbl[tbl_idx + pos] = new_ce;
2706 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
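/*
 * Copy the root and context tables programmed by the previous kernel so
 * that in-flight DMA keeps working until new tables are installed.
 */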
2715 static int copy_translation_tables(struct intel_iommu *iommu)
2717 struct context_entry **ctxt_tbls;
2718 struct root_entry *old_rt;
2719 phys_addr_t old_rt_phys;
2720 int ctxt_table_entries;
2725 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2726 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2727 new_ext = !!sm_supported(iommu);
/*
 * The RTT bit can only be changed when translation is disabled,
 * but disabling translation would open a window for data
 * corruption. So bail out and don't copy anything if we would
 * have to change the bit.
 */
2738 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2739 if (!iommu->copied_tables)
2742 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2746 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2750 /* This is too big for the stack - allocate it from slab */
2751 ctxt_table_entries = ext ? 512 : 256;
2753 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2757 for (bus = 0; bus < 256; bus++) {
2758 ret = copy_context_table(iommu, &old_rt[bus],
2759 ctxt_tbls, bus, ext);
2761 pr_err("%s: Failed to copy context table for bus %d\n",
2767 spin_lock(&iommu->lock);
2769 /* Context tables are copied, now write them to the root_entry table */
2770 for (bus = 0; bus < 256; bus++) {
2771 int idx = ext ? bus * 2 : bus;
2774 if (ctxt_tbls[idx]) {
2775 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2776 iommu->root_entry[bus].lo = val;
2779 if (!ext || !ctxt_tbls[idx + 1])
2782 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2783 iommu->root_entry[bus].hi = val;
2786 spin_unlock(&iommu->lock);
2790 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2800 #ifdef CONFIG_INTEL_IOMMU_SVM
2801 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2803 struct intel_iommu *iommu = data;
2807 return INVALID_IOASID;
/*
 * The VT-d virtual command interface always uses the full 20-bit
 * PASID range. The host can partition the guest PASID range based
 * on policies, but that is out of the guest's control.
 */
2813 if (min < PASID_MIN || max > intel_pasid_max_id)
2814 return INVALID_IOASID;
2816 if (vcmd_alloc_pasid(iommu, &ioasid))
2817 return INVALID_IOASID;
2822 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2824 struct intel_iommu *iommu = data;
/*
 * The sanity check of the IOASID owner is done at the upper layer,
 * e.g. VFIO. We can only free the PASID when all the devices are
 * unbound.
 */
2832 if (ioasid_find(NULL, ioasid, NULL)) {
2833 pr_alert("Cannot free active IOASID %d\n", ioasid);
2836 vcmd_free_pasid(iommu, ioasid);
2839 static void register_pasid_allocator(struct intel_iommu *iommu)
/*
 * If we are running in the host, there is no need for a custom
 * allocator because PASIDs are allocated from the host system-wide.
 */
2845 if (!cap_caching_mode(iommu->cap))
2848 if (!sm_supported(iommu)) {
2849 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
/*
 * Register a custom PASID allocator if we are running in a guest;
 * guest PASIDs must be obtained via the virtual command interface.
 * There can be multiple vIOMMUs in each guest but only one allocator
 * is active. All vIOMMU allocators will eventually call the same
 * host allocator.
 */
2860 if (!vccap_pasid(iommu->vccap))
2863 pr_info("Register custom PASID allocator\n");
2864 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2865 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2866 iommu->pasid_allocator.pdata = (void *)iommu;
2867 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2868 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
/*
 * Disable scalable mode on this IOMMU if there is no custom
 * allocator. Mixing SM-capable vIOMMUs and non-SM vIOMMUs is
 * not supported.
 */
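/*
 * One-time DMAR bring-up: initialize invalidation, domain IDs and root
 * entries on every IOMMU, build the si_domain when needed, and finally
 * enable translation.
 */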
2879 static int __init init_dmars(void)
2881 struct dmar_drhd_unit *drhd;
2882 struct intel_iommu *iommu;
2885 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2889 for_each_iommu(iommu, drhd) {
2890 if (drhd->ignored) {
2891 iommu_disable_translation(iommu);
/*
 * Find the max PASID size of all IOMMUs in the system.
 * We need to ensure the system PASID table is no bigger
 * than the smallest supported size.
 */
2900 if (pasid_supported(iommu)) {
2901 u32 temp = 2 << ecap_pss(iommu->ecap);
2903 intel_pasid_max_id = min_t(u32, temp,
2904 intel_pasid_max_id);
2907 intel_iommu_init_qi(iommu);
2909 ret = iommu_init_domains(iommu);
2913 init_translation_status(iommu);
2915 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2916 iommu_disable_translation(iommu);
2917 clear_translation_pre_enabled(iommu);
2918 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
/*
 * TBD: we could share the same root & context tables
 * among all IOMMUs. Need to split it later.
 */
2927 ret = iommu_alloc_root_entry(iommu);
2931 if (translation_pre_enabled(iommu)) {
2932 pr_info("Translation already enabled - trying to copy translation structures\n");
2934 ret = copy_translation_tables(iommu);
/*
 * We found the IOMMU with translation
 * enabled - but failed to copy over the
 * old root-entry table. Try to proceed
 * by disabling translation now and
 * allocating a clean root-entry table.
 * This might cause DMAR faults, but
 * probably the dump will still succeed.
 */
2945 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2947 iommu_disable_translation(iommu);
2948 clear_translation_pre_enabled(iommu);
2950 pr_info("Copied translation tables from previous kernel for %s\n",
2955 if (!ecap_pass_through(iommu->ecap))
2956 hw_pass_through = 0;
2957 intel_svm_check(iommu);
/*
 * Now that qi is enabled on all iommus, set the root entry and flush
 * caches. This is required on some Intel X58 chipsets, otherwise the
 * flush_context function will loop forever and the boot hangs.
 */
2965 for_each_active_iommu(iommu, drhd) {
2966 iommu_flush_write_buffer(iommu);
2967 #ifdef CONFIG_INTEL_IOMMU_SVM
2968 register_pasid_allocator(iommu);
2970 iommu_set_root_entry(iommu);
2973 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2978 iommu_identity_mapping |= IDENTMAP_GFX;
2980 check_tylersburg_isoch();
2982 ret = si_domain_init(hw_pass_through);
/*
 * For each DRHD unit:
 *   global invalidate context cache
 *   global invalidate iotlb
 *   enable translation
 */
2993 for_each_iommu(iommu, drhd) {
2994 if (drhd->ignored) {
/*
 * We always have to disable PMRs or DMA may fail on this device.
 */
iommu_disable_protect_mem_regions(iommu);
3004 iommu_flush_write_buffer(iommu);
3006 #ifdef CONFIG_INTEL_IOMMU_SVM
3007 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
/*
 * Calling dmar_alloc_hwirq() with dmar_global_lock held could
 * cause a lock race condition, so drop the lock around
 * intel_svm_enable_prq().
 */
3012 up_write(&dmar_global_lock);
3013 ret = intel_svm_enable_prq(iommu);
3014 down_write(&dmar_global_lock);
3019 ret = dmar_set_interrupt(iommu);
3027 for_each_active_iommu(iommu, drhd) {
3028 disable_dmar_iommu(iommu);
3029 free_dmar_iommu(iommu);
3032 domain_exit(si_domain);
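/*
 * Mark DMAR units that have no usable devices so they can be ignored, and
 * flag units that cover only graphics devices as gfx-dedicated.
 */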
3039 static void __init init_no_remapping_devices(void)
3041 struct dmar_drhd_unit *drhd;
3045 for_each_drhd_unit(drhd) {
3046 if (!drhd->include_all) {
3047 for_each_active_dev_scope(drhd->devices,
3048 drhd->devices_cnt, i, dev)
3050 /* ignore DMAR unit if no devices exist */
3051 if (i == drhd->devices_cnt)
3056 for_each_active_drhd_unit(drhd) {
3057 if (drhd->include_all)
3060 for_each_active_dev_scope(drhd->devices,
3061 drhd->devices_cnt, i, dev)
3062 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3064 if (i < drhd->devices_cnt)
/*
 * This IOMMU has *only* gfx devices. Either bypass it or
 * set the gfx_dedicated flag, as appropriate.
 */
3069 drhd->gfx_dedicated = 1;
3075 #ifdef CONFIG_SUSPEND
3076 static int init_iommu_hw(void)
3078 struct dmar_drhd_unit *drhd;
3079 struct intel_iommu *iommu = NULL;
3081 for_each_active_iommu(iommu, drhd)
3083 dmar_reenable_qi(iommu);
3085 for_each_iommu(iommu, drhd) {
3086 if (drhd->ignored) {
/*
 * We always have to disable PMRs or DMA may fail on this device.
 */
iommu_disable_protect_mem_regions(iommu);
3096 iommu_flush_write_buffer(iommu);
3097 iommu_set_root_entry(iommu);
3098 iommu_enable_translation(iommu);
3099 iommu_disable_protect_mem_regions(iommu);
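/* Globally invalidate the context caches and IOTLBs of all active IOMMUs. */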
3105 static void iommu_flush_all(void)
3107 struct dmar_drhd_unit *drhd;
3108 struct intel_iommu *iommu;
3110 for_each_active_iommu(iommu, drhd) {
3111 iommu->flush.flush_context(iommu, 0, 0, 0,
3112 DMA_CCMD_GLOBAL_INVL);
3113 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3114 DMA_TLB_GLOBAL_FLUSH);
3118 static int iommu_suspend(void)
3120 struct dmar_drhd_unit *drhd;
3121 struct intel_iommu *iommu = NULL;
3124 for_each_active_iommu(iommu, drhd) {
3125 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3127 if (!iommu->iommu_state)
3133 for_each_active_iommu(iommu, drhd) {
3134 iommu_disable_translation(iommu);
3136 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3138 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3139 readl(iommu->reg + DMAR_FECTL_REG);
3140 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3141 readl(iommu->reg + DMAR_FEDATA_REG);
3142 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3143 readl(iommu->reg + DMAR_FEADDR_REG);
3144 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3145 readl(iommu->reg + DMAR_FEUADDR_REG);
3147 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3152 for_each_active_iommu(iommu, drhd)
3153 kfree(iommu->iommu_state);
3158 static void iommu_resume(void)
3160 struct dmar_drhd_unit *drhd;
3161 struct intel_iommu *iommu = NULL;
3164 if (init_iommu_hw()) {
3166 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3168 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3172 for_each_active_iommu(iommu, drhd) {
3174 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3176 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3177 iommu->reg + DMAR_FECTL_REG);
3178 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3179 iommu->reg + DMAR_FEDATA_REG);
3180 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3181 iommu->reg + DMAR_FEADDR_REG);
3182 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3183 iommu->reg + DMAR_FEUADDR_REG);
3185 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3188 for_each_active_iommu(iommu, drhd)
3189 kfree(iommu->iommu_state);
3192 static struct syscore_ops iommu_syscore_ops = {
3193 .resume = iommu_resume,
3194 .suspend = iommu_suspend,
3197 static void __init init_iommu_pm_ops(void)
3199 register_syscore_ops(&iommu_syscore_ops);
3203 static inline void init_iommu_pm_ops(void) {}
3204 #endif /* CONFIG_PM */
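/*
 * An RMRR must be page aligned and describe a non-empty range that also
 * passes the architecture-specific sanity check.
 */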
3206 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3208 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3209 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3210 rmrr->end_address <= rmrr->base_address ||
3211 arch_rmrr_sanity_check(rmrr))
3217 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3219 struct acpi_dmar_reserved_memory *rmrr;
3220 struct dmar_rmrr_unit *rmrru;
3222 rmrr = (struct acpi_dmar_reserved_memory *)header;
3223 if (rmrr_sanity_check(rmrr)) {
3225 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3226 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3227 rmrr->base_address, rmrr->end_address,
3228 dmi_get_system_info(DMI_BIOS_VENDOR),
3229 dmi_get_system_info(DMI_BIOS_VERSION),
3230 dmi_get_system_info(DMI_PRODUCT_VERSION));
3231 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3234 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3238 rmrru->hdr = header;
3240 rmrru->base_address = rmrr->base_address;
3241 rmrru->end_address = rmrr->end_address;
3243 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3244 ((void *)rmrr) + rmrr->header.length,
3245 &rmrru->devices_cnt);
3246 if (rmrru->devices_cnt && rmrru->devices == NULL)
3249 list_add(&rmrru->list, &dmar_rmrr_units);
3258 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3260 struct dmar_atsr_unit *atsru;
3261 struct acpi_dmar_atsr *tmp;
3263 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3265 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3266 if (atsr->segment != tmp->segment)
3268 if (atsr->header.length != tmp->header.length)
3270 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3277 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3279 struct acpi_dmar_atsr *atsr;
3280 struct dmar_atsr_unit *atsru;
3282 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3285 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3286 atsru = dmar_find_atsr(atsr);
3290 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
/*
 * If memory is allocated from slab by the ACPI _DSM method, we need to
 * copy the memory content because the memory buffer will be freed
 * on release.
 */
3299 atsru->hdr = (void *)(atsru + 1);
3300 memcpy(atsru->hdr, hdr, hdr->length);
3301 atsru->include_all = atsr->flags & 0x1;
3302 if (!atsru->include_all) {
3303 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3304 (void *)atsr + atsr->header.length,
3305 &atsru->devices_cnt);
3306 if (atsru->devices_cnt && atsru->devices == NULL) {
3312 list_add_rcu(&atsru->list, &dmar_atsr_units);
3317 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3319 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3323 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3325 struct acpi_dmar_atsr *atsr;
3326 struct dmar_atsr_unit *atsru;
3328 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3329 atsru = dmar_find_atsr(atsr);
3331 list_del_rcu(&atsru->list);
3333 intel_iommu_free_atsr(atsru);
3339 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3343 struct acpi_dmar_atsr *atsr;
3344 struct dmar_atsr_unit *atsru;
3346 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3347 atsru = dmar_find_atsr(atsr);
3351 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3352 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3360 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3362 struct dmar_satc_unit *satcu;
3363 struct acpi_dmar_satc *tmp;
3365 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3367 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3368 if (satc->segment != tmp->segment)
3370 if (satc->header.length != tmp->header.length)
3372 if (memcmp(satc, tmp, satc->header.length) == 0)
3379 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3381 struct acpi_dmar_satc *satc;
3382 struct dmar_satc_unit *satcu;
3384 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3387 satc = container_of(hdr, struct acpi_dmar_satc, header);
3388 satcu = dmar_find_satc(satc);
3392 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3396 satcu->hdr = (void *)(satcu + 1);
3397 memcpy(satcu->hdr, hdr, hdr->length);
3398 satcu->atc_required = satc->flags & 0x1;
3399 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3400 (void *)satc + satc->header.length,
3401 &satcu->devices_cnt);
3402 if (satcu->devices_cnt && !satcu->devices) {
3406 list_add_rcu(&satcu->list, &dmar_satc_units);
3411 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3414 struct intel_iommu *iommu = dmaru->iommu;
3416 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3420 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3421 pr_warn("%s: Doesn't support hardware pass through.\n",
3426 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3427 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3428 pr_warn("%s: Doesn't support large page.\n",
/*
 * Disable translation if already enabled prior to OS handover.
 */
3436 if (iommu->gcmd & DMA_GCMD_TE)
3437 iommu_disable_translation(iommu);
3439 ret = iommu_init_domains(iommu);
3441 ret = iommu_alloc_root_entry(iommu);
3445 intel_svm_check(iommu);
3447 if (dmaru->ignored) {
/*
 * We always have to disable PMRs or DMA may fail on this device.
 */
3452 iommu_disable_protect_mem_regions(iommu);
3456 intel_iommu_init_qi(iommu);
3457 iommu_flush_write_buffer(iommu);
3459 #ifdef CONFIG_INTEL_IOMMU_SVM
3460 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3461 ret = intel_svm_enable_prq(iommu);
3466 ret = dmar_set_interrupt(iommu);
3470 iommu_set_root_entry(iommu);
3471 iommu_enable_translation(iommu);
3473 iommu_disable_protect_mem_regions(iommu);
3477 disable_dmar_iommu(iommu);
3479 free_dmar_iommu(iommu);
3483 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3486 struct intel_iommu *iommu = dmaru->iommu;
3488 if (!intel_iommu_enabled)
3494 ret = intel_iommu_add(dmaru);
3496 disable_dmar_iommu(iommu);
3497 free_dmar_iommu(iommu);
3503 static void intel_iommu_free_dmars(void)
3505 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3506 struct dmar_atsr_unit *atsru, *atsr_n;
3507 struct dmar_satc_unit *satcu, *satc_n;
3509 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3510 list_del(&rmrru->list);
3511 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3515 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3516 list_del(&atsru->list);
3517 intel_iommu_free_atsr(atsru);
3519 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3520 list_del(&satcu->list);
3521 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
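/* Find the SATC unit whose device scope contains the physical function of @dev. */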
3526 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3528 struct dmar_satc_unit *satcu;
3529 struct acpi_dmar_satc *satc;
3533 dev = pci_physfn(dev);
3536 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3537 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3538 if (satc->segment != pci_domain_nr(dev->bus))
3540 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3541 if (to_pci_dev(tmp) == dev)
3550 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3553 struct pci_bus *bus;
3554 struct pci_dev *bridge = NULL;
3556 struct acpi_dmar_atsr *atsr;
3557 struct dmar_atsr_unit *atsru;
3558 struct dmar_satc_unit *satcu;
3560 dev = pci_physfn(dev);
3561 satcu = dmar_find_matched_satc_unit(dev);
/*
 * This device supports ATS because it is listed in the SATC table.
 * When the IOMMU is in legacy mode, ATS is enabled automatically by
 * the hardware for devices that require it, so the OS should not
 * enable ATS on this device, to avoid duplicated TLB invalidations.
 */
3570 return !(satcu->atc_required && !sm_supported(iommu));
3572 for (bus = dev->bus; bus; bus = bus->parent) {
3574 /* If it's an integrated device, allow ATS */
3577 /* Connected via non-PCIe: no ATS */
3578 if (!pci_is_pcie(bridge) ||
3579 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3581 /* If we found the root port, look it up in the ATSR */
3582 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3587 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3588 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3589 if (atsr->segment != pci_domain_nr(dev->bus))
3592 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3593 if (tmp == &bridge->dev)
3596 if (atsru->include_all)
3606 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3609 struct dmar_rmrr_unit *rmrru;
3610 struct dmar_atsr_unit *atsru;
3611 struct dmar_satc_unit *satcu;
3612 struct acpi_dmar_atsr *atsr;
3613 struct acpi_dmar_reserved_memory *rmrr;
3614 struct acpi_dmar_satc *satc;
3616 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3619 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3620 rmrr = container_of(rmrru->hdr,
3621 struct acpi_dmar_reserved_memory, header);
3622 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3623 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3624 ((void *)rmrr) + rmrr->header.length,
3625 rmrr->segment, rmrru->devices,
3626 rmrru->devices_cnt);
3629 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3630 dmar_remove_dev_scope(info, rmrr->segment,
3631 rmrru->devices, rmrru->devices_cnt);
3635 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3636 if (atsru->include_all)
3639 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3640 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3641 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3642 (void *)atsr + atsr->header.length,
3643 atsr->segment, atsru->devices,
3644 atsru->devices_cnt);
3649 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3650 if (dmar_remove_dev_scope(info, atsr->segment,
3651 atsru->devices, atsru->devices_cnt))
3655 list_for_each_entry(satcu, &dmar_satc_units, list) {
3656 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3657 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3658 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3659 (void *)satc + satc->header.length,
3660 satc->segment, satcu->devices,
3661 satcu->devices_cnt);
3666 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3667 if (dmar_remove_dev_scope(info, satc->segment,
3668 satcu->devices, satcu->devices_cnt))
3676 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3677 unsigned long val, void *v)
3679 struct memory_notify *mhp = v;
3680 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3681 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3685 case MEM_GOING_ONLINE:
3686 if (iommu_domain_identity_map(si_domain,
3687 start_vpfn, last_vpfn)) {
3688 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3689 start_vpfn, last_vpfn);
3695 case MEM_CANCEL_ONLINE:
3697 struct dmar_drhd_unit *drhd;
3698 struct intel_iommu *iommu;
3699 LIST_HEAD(freelist);
3701 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3704 for_each_active_iommu(iommu, drhd)
3705 iommu_flush_iotlb_psi(iommu, si_domain,
3706 start_vpfn, mhp->nr_pages,
3707 list_empty(&freelist), 0);
3709 put_pages_list(&freelist);
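/* Keep the si_domain's identity map in sync with memory hotplug events. */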
3717 static struct notifier_block intel_iommu_memory_nb = {
3718 .notifier_call = intel_iommu_memory_notifier,
3722 static void intel_disable_iommus(void)
3724 struct intel_iommu *iommu = NULL;
3725 struct dmar_drhd_unit *drhd;
3727 for_each_iommu(iommu, drhd)
3728 iommu_disable_translation(iommu);
3731 void intel_iommu_shutdown(void)
3733 struct dmar_drhd_unit *drhd;
3734 struct intel_iommu *iommu = NULL;
3736 if (no_iommu || dmar_disabled)
3739 down_write(&dmar_global_lock);
3741 /* Disable PMRs explicitly here. */
3742 for_each_iommu(iommu, drhd)
3743 iommu_disable_protect_mem_regions(iommu);
3745 /* Make sure the IOMMUs are switched off */
3746 intel_disable_iommus();
3748 up_write(&dmar_global_lock);
3751 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3753 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3755 return container_of(iommu_dev, struct intel_iommu, iommu);
3758 static ssize_t version_show(struct device *dev,
3759 struct device_attribute *attr, char *buf)
3761 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3762 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3763 return sprintf(buf, "%d:%d\n",
3764 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3766 static DEVICE_ATTR_RO(version);
3768 static ssize_t address_show(struct device *dev,
3769 struct device_attribute *attr, char *buf)
3771 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3772 return sprintf(buf, "%llx\n", iommu->reg_phys);
3774 static DEVICE_ATTR_RO(address);
3776 static ssize_t cap_show(struct device *dev,
3777 struct device_attribute *attr, char *buf)
3779 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3780 return sprintf(buf, "%llx\n", iommu->cap);
3782 static DEVICE_ATTR_RO(cap);
3784 static ssize_t ecap_show(struct device *dev,
3785 struct device_attribute *attr, char *buf)
3787 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3788 return sprintf(buf, "%llx\n", iommu->ecap);
3790 static DEVICE_ATTR_RO(ecap);
3792 static ssize_t domains_supported_show(struct device *dev,
3793 struct device_attribute *attr, char *buf)
3795 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3796 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3798 static DEVICE_ATTR_RO(domains_supported);
3800 static ssize_t domains_used_show(struct device *dev,
3801 struct device_attribute *attr, char *buf)
3803 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3804 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3805 cap_ndoms(iommu->cap)));
3807 static DEVICE_ATTR_RO(domains_used);
3809 static struct attribute *intel_iommu_attrs[] = {
3810 &dev_attr_version.attr,
3811 &dev_attr_address.attr,
3813 &dev_attr_ecap.attr,
3814 &dev_attr_domains_supported.attr,
3815 &dev_attr_domains_used.attr,
3819 static struct attribute_group intel_iommu_group = {
3820 .name = "intel-iommu",
3821 .attrs = intel_iommu_attrs,
3824 const struct attribute_group *intel_iommu_groups[] = {
3829 static inline bool has_external_pci(void)
3831 struct pci_dev *pdev = NULL;
3833 for_each_pci_dev(pdev)
3834 if (pdev->external_facing) {
3842 static int __init platform_optin_force_iommu(void)
3844 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3847 if (no_iommu || dmar_disabled)
3848 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3851 * If Intel-IOMMU is disabled by default, we will apply identity
3852 * map for all devices except those marked as being untrusted.
3855 iommu_set_default_passthrough(false);
3863 static int __init probe_acpi_namespace_devices(void)
3865 struct dmar_drhd_unit *drhd;
3866 /* To avoid a -Wunused-but-set-variable warning. */
3867 struct intel_iommu *iommu __maybe_unused;
3871 for_each_active_iommu(iommu, drhd) {
3872 for_each_active_dev_scope(drhd->devices,
3873 drhd->devices_cnt, i, dev) {
3874 struct acpi_device_physical_node *pn;
3875 struct iommu_group *group;
3876 struct acpi_device *adev;
3878 if (dev->bus != &acpi_bus_type)
3881 adev = to_acpi_device(dev);
3882 mutex_lock(&adev->physical_node_lock);
3883 list_for_each_entry(pn,
3884 &adev->physical_node_list, node) {
3885 group = iommu_group_get(pn->dev);
3887 iommu_group_put(group);
3891 ret = iommu_probe_device(pn->dev);
3895 mutex_unlock(&adev->physical_node_lock);
3905 static __init int tboot_force_iommu(void)
3907 if (!tboot_enabled())
3910 if (no_iommu || dmar_disabled)
3911 pr_warn("Forcing Intel-IOMMU to enabled\n");
3919 int __init intel_iommu_init(void)
3922 struct dmar_drhd_unit *drhd;
3923 struct intel_iommu *iommu;
/*
 * Intel IOMMU is required for a TXT/tboot launch or platform
 * opt in, so enforce that.
 */
3929 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3930 platform_optin_force_iommu();
3932 down_write(&dmar_global_lock);
3933 if (dmar_table_init()) {
3935 panic("tboot: Failed to initialize DMAR table\n");
3939 if (dmar_dev_scope_init() < 0) {
3941 panic("tboot: Failed to initialize DMAR device scope\n");
3945 up_write(&dmar_global_lock);
/*
 * The bus notifier takes the dmar_global_lock, so lockdep will
 * complain later when we register it under the lock.
 */
3951 dmar_register_bus_notifier();
3953 down_write(&dmar_global_lock);
3956 intel_iommu_debugfs_init();
3958 if (no_iommu || dmar_disabled) {
/*
 * We exit the function here to ensure IOMMU's remapping and
 * mempool aren't setup, which means that the IOMMU's PMRs
 * won't be disabled via the call to init_dmars(). So disable
 * it explicitly here. The PMRs were setup by tboot prior to
 * calling SENTER, but the kernel is expected to reset/tear
 * down the PMRs.
 */
3967 if (intel_iommu_tboot_noforce) {
3968 for_each_iommu(iommu, drhd)
3969 iommu_disable_protect_mem_regions(iommu);
/*
 * Make sure the IOMMUs are switched off, even when we
 * boot into a kexec kernel and the previous kernel left
 * them enabled.
 */
3977 intel_disable_iommus();
3981 if (list_empty(&dmar_rmrr_units))
3982 pr_info("No RMRR found\n");
3984 if (list_empty(&dmar_atsr_units))
3985 pr_info("No ATSR found\n");
3987 if (list_empty(&dmar_satc_units))
3988 pr_info("No SATC found\n");
3990 init_no_remapping_devices();
3995 panic("tboot: Failed to initialize DMARs\n");
3996 pr_err("Initialization failed\n");
3999 up_write(&dmar_global_lock);
4001 init_iommu_pm_ops();
4003 down_read(&dmar_global_lock);
4004 for_each_active_iommu(iommu, drhd) {
/*
 * The flush queue implementation does not perform
 * page-selective invalidations that are required for efficient
 * TLB flushes in virtual environments. The benefit of batching
 * is likely to be much lower than the overhead of synchronizing
 * the virtual and physical IOMMU page-tables.
 */
4012 if (cap_caching_mode(iommu->cap) &&
4013 !first_level_by_default(IOMMU_DOMAIN_DMA)) {
4014 pr_info_once("IOMMU batching disallowed due to virtualization\n");
4015 iommu_set_dma_strict();
4017 iommu_device_sysfs_add(&iommu->iommu, NULL,
4020 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4022 iommu_pmu_register(iommu);
4024 up_read(&dmar_global_lock);
4026 if (si_domain && !hw_pass_through)
4027 register_memory_notifier(&intel_iommu_memory_nb);
4029 down_read(&dmar_global_lock);
4030 if (probe_acpi_namespace_devices())
4031 pr_warn("ACPI name space devices didn't probe correctly\n");
4033 /* Finally, we enable the DMA remapping hardware. */
4034 for_each_iommu(iommu, drhd) {
4035 if (!drhd->ignored && !translation_pre_enabled(iommu))
4036 iommu_enable_translation(iommu);
4038 iommu_disable_protect_mem_regions(iommu);
4040 up_read(&dmar_global_lock);
4042 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4044 intel_iommu_enabled = 1;
4049 intel_iommu_free_dmars();
4050 up_write(&dmar_global_lock);
4054 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4056 struct device_domain_info *info = opaque;
4058 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
/*
 * NB - intel-iommu lacks any sort of reference counting for the users of
 * dependent devices. If multiple endpoints have intersecting dependent
 * devices, unbinding the driver from any one of them will possibly leave
 * the others unable to operate.
 */
4068 static void domain_context_clear(struct device_domain_info *info)
4070 if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4073 pci_for_each_dma_alias(to_pci_dev(info->dev),
4074 &domain_context_clear_one_cb, info);
4077 static void dmar_remove_one_dev_info(struct device *dev)
4079 struct device_domain_info *info = dev_iommu_priv_get(dev);
4080 struct dmar_domain *domain = info->domain;
4081 struct intel_iommu *iommu = info->iommu;
4082 unsigned long flags;
4084 if (!dev_is_real_dma_subdevice(info->dev)) {
4085 if (dev_is_pci(info->dev) && sm_supported(iommu))
4086 intel_pasid_tear_down_entry(iommu, info->dev,
4087 PASID_RID2PASID, false);
4089 iommu_disable_pci_caps(info);
4090 domain_context_clear(info);
4093 spin_lock_irqsave(&domain->lock, flags);
4094 list_del(&info->link);
4095 spin_unlock_irqrestore(&domain->lock, flags);
4097 domain_detach_iommu(domain, iommu);
4098 info->domain = NULL;
/*
 * Clear the page table pointer in context or pasid table entries so that
 * all DMA requests without PASID from the device are blocked. If the page
 * table has been set, clean up the data structures.
 */
4106 static void device_block_translation(struct device *dev)
4108 struct device_domain_info *info = dev_iommu_priv_get(dev);
4109 struct intel_iommu *iommu = info->iommu;
4110 unsigned long flags;
4112 iommu_disable_pci_caps(info);
4113 if (!dev_is_real_dma_subdevice(dev)) {
4114 if (sm_supported(iommu))
4115 intel_pasid_tear_down_entry(iommu, dev,
4116 PASID_RID2PASID, false);
4118 domain_context_clear(info);
4124 spin_lock_irqsave(&info->domain->lock, flags);
4125 list_del(&info->link);
4126 spin_unlock_irqrestore(&info->domain->lock, flags);
4128 domain_detach_iommu(info->domain, iommu);
4129 info->domain = NULL;
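/*
 * Initialize a domain allocated through the IOMMU API: compute the
 * adjusted address width and allocate the top-level page directory.
 */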
4132 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4136 /* calculate AGAW */
4137 domain->gaw = guest_width;
4138 adjust_width = guestwidth_to_adjustwidth(guest_width);
4139 domain->agaw = width_to_agaw(adjust_width);
4141 domain->iommu_coherency = false;
4142 domain->iommu_superpage = 0;
4143 domain->max_addr = 0;
4145 /* always allocate the top pgd */
4146 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4149 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4153 static int blocking_domain_attach_dev(struct iommu_domain *domain,
4156 device_block_translation(dev);
4160 static struct iommu_domain blocking_domain = {
4161 .ops = &(const struct iommu_domain_ops) {
4162 .attach_dev = blocking_domain_attach_dev,
4163 .free = intel_iommu_domain_free
4167 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4169 struct dmar_domain *dmar_domain;
4170 struct iommu_domain *domain;
4173 case IOMMU_DOMAIN_BLOCKED:
4174 return &blocking_domain;
4175 case IOMMU_DOMAIN_DMA:
4176 case IOMMU_DOMAIN_DMA_FQ:
4177 case IOMMU_DOMAIN_UNMANAGED:
4178 dmar_domain = alloc_domain(type);
4180 pr_err("Can't allocate dmar_domain\n");
4183 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4184 pr_err("Domain initialization failed\n");
4185 domain_exit(dmar_domain);
4189 domain = &dmar_domain->domain;
4190 domain->geometry.aperture_start = 0;
4191 domain->geometry.aperture_end =
4192 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4193 domain->geometry.force_aperture = true;
4196 case IOMMU_DOMAIN_IDENTITY:
4197 return &si_domain->domain;
4198 case IOMMU_DOMAIN_SVA:
4199 return intel_svm_domain_alloc();
4207 static void intel_iommu_domain_free(struct iommu_domain *domain)
4209 if (domain != &si_domain->domain && domain != &blocking_domain)
4210 domain_exit(to_dmar_domain(domain));
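/*
 * Check that @dev's IOMMU can back @domain and trim the domain's
 * page-table levels to what the IOMMU's agaw supports.
 */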
4213 static int prepare_domain_attach_device(struct iommu_domain *domain,
4216 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4217 struct intel_iommu *iommu;
4220 iommu = device_to_iommu(dev, NULL, NULL);
4224 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4227 /* check if this iommu agaw is sufficient for max mapped address */
4228 addr_width = agaw_to_width(iommu->agaw);
4229 if (addr_width > cap_mgaw(iommu->cap))
4230 addr_width = cap_mgaw(iommu->cap);
4232 if (dmar_domain->max_addr > (1LL << addr_width))
4234 dmar_domain->gaw = addr_width;
/*
 * Knock out extra levels of page tables if necessary.
 */
4239 while (iommu->agaw < dmar_domain->agaw) {
4240 struct dma_pte *pte;
4242 pte = dmar_domain->pgd;
4243 if (dma_pte_present(pte)) {
4244 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4245 free_pgtable_page(pte);
4247 dmar_domain->agaw--;
4253 static int intel_iommu_attach_device(struct iommu_domain *domain,
4256 struct device_domain_info *info = dev_iommu_priv_get(dev);
4259 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4260 device_is_rmrr_locked(dev)) {
4261 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
4266 device_block_translation(dev);
4268 ret = prepare_domain_attach_device(domain, dev);
4272 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4275 static int intel_iommu_map(struct iommu_domain *domain,
4276 unsigned long iova, phys_addr_t hpa,
4277 size_t size, int iommu_prot, gfp_t gfp)
4279 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4283 if (iommu_prot & IOMMU_READ)
4284 prot |= DMA_PTE_READ;
4285 if (iommu_prot & IOMMU_WRITE)
4286 prot |= DMA_PTE_WRITE;
4287 if (dmar_domain->set_pte_snp)
4288 prot |= DMA_PTE_SNP;
4290 max_addr = iova + size;
4291 if (dmar_domain->max_addr < max_addr) {
4294 /* check if minimum agaw is sufficient for mapped address */
4295 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4296 if (end < max_addr) {
4297 pr_err("%s: iommu width (%d) is not "
4298 "sufficient for the mapped address (%llx)\n",
4299 __func__, dmar_domain->gaw, max_addr);
4302 dmar_domain->max_addr = max_addr;
4304 /* Round up size to next multiple of PAGE_SIZE, if it and
4305 the low bits of hpa would take us onto the next page */
4306 size = aligned_nrpages(hpa, size);
4307 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4308 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4311 static int intel_iommu_map_pages(struct iommu_domain *domain,
4312 unsigned long iova, phys_addr_t paddr,
4313 size_t pgsize, size_t pgcount,
4314 int prot, gfp_t gfp, size_t *mapped)
4316 unsigned long pgshift = __ffs(pgsize);
4317 size_t size = pgcount << pgshift;
4320 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4323 if (!IS_ALIGNED(iova | paddr, pgsize))
4326 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4333 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4334 unsigned long iova, size_t size,
4335 struct iommu_iotlb_gather *gather)
4337 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4338 unsigned long start_pfn, last_pfn;
4341 /* Cope with horrid API which requires us to unmap more than the
4342 size argument if it happens to be a large-page mapping. */
4343 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4346 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4347 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4349 start_pfn = iova >> VTD_PAGE_SHIFT;
4350 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4352 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4354 if (dmar_domain->max_addr == iova + size)
4355 dmar_domain->max_addr = iova;
/*
 * We do not use page-selective IOTLB invalidation in the flush queue,
 * so there is no need to track the page or sync the IOTLB here.
 */
4361 if (!iommu_iotlb_gather_queued(gather))
4362 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4367 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4369 size_t pgsize, size_t pgcount,
4370 struct iommu_iotlb_gather *gather)
4372 unsigned long pgshift = __ffs(pgsize);
4373 size_t size = pgcount << pgshift;
4375 return intel_iommu_unmap(domain, iova, size, gather);
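/*
 * Flush the IOTLB for the range gathered during unmap on every IOMMU
 * serving this domain, then release the freed page-table pages.
 */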
4378 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4379 struct iommu_iotlb_gather *gather)
4381 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4382 unsigned long iova_pfn = IOVA_PFN(gather->start);
4383 size_t size = gather->end - gather->start;
4384 struct iommu_domain_info *info;
4385 unsigned long start_pfn;
4386 unsigned long nrpages;
4389 nrpages = aligned_nrpages(gather->start, size);
4390 start_pfn = mm_to_dma_pfn(iova_pfn);
4392 xa_for_each(&dmar_domain->iommu_array, i, info)
4393 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4395 list_empty(&gather->freelist), 0);
4397 put_pages_list(&gather->freelist);
4400 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4403 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4404 struct dma_pte *pte;
4408 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4410 if (pte && dma_pte_present(pte))
4411 phys = dma_pte_addr(pte) +
4412 (iova & (BIT_MASK(level_to_offset_bits(level) +
4413 VTD_PAGE_SHIFT) - 1));
4418 static bool domain_support_force_snooping(struct dmar_domain *domain)
4420 struct device_domain_info *info;
4421 bool support = true;
4423 assert_spin_locked(&domain->lock);
4424 list_for_each_entry(info, &domain->devices, link) {
4425 if (!ecap_sc_support(info->iommu->ecap)) {
4434 static void domain_set_force_snooping(struct dmar_domain *domain)
4436 struct device_domain_info *info;
4438 assert_spin_locked(&domain->lock);
/*
 * The second-level page table supports per-PTE snoop control. The
 * iommu_map() interface will handle this by setting the SNP bit.
 */
4443 if (!domain->use_first_level) {
4444 domain->set_pte_snp = true;
4448 list_for_each_entry(info, &domain->devices, link)
4449 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4453 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4455 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4456 unsigned long flags;
4458 if (dmar_domain->force_snooping)
4461 spin_lock_irqsave(&dmar_domain->lock, flags);
4462 if (!domain_support_force_snooping(dmar_domain)) {
4463 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4467 domain_set_force_snooping(dmar_domain);
4468 dmar_domain->force_snooping = true;
4469 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4474 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4476 struct device_domain_info *info = dev_iommu_priv_get(dev);
4479 case IOMMU_CAP_CACHE_COHERENCY:
4481 case IOMMU_CAP_INTR_REMAP:
4482 return irq_remapping_enabled == 1;
4483 case IOMMU_CAP_PRE_BOOT_PROTECTION:
4484 return dmar_platform_optin();
4485 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4486 return ecap_sc_support(info->iommu->ecap);
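/*
 * Per-device probe: look up the owning IOMMU, record bus/devfn/segment and
 * the ATS/PASID/PRI capabilities, and allocate a PASID table when scalable
 * mode is in use.
 */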
4492 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4494 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4495 struct device_domain_info *info;
4496 struct intel_iommu *iommu;
4500 iommu = device_to_iommu(dev, &bus, &devfn);
4501 if (!iommu || !iommu->iommu.ops)
4502 return ERR_PTR(-ENODEV);
4504 info = kzalloc(sizeof(*info), GFP_KERNEL);
4506 return ERR_PTR(-ENOMEM);
4508 if (dev_is_real_dma_subdevice(dev)) {
4509 info->bus = pdev->bus->number;
4510 info->devfn = pdev->devfn;
4511 info->segment = pci_domain_nr(pdev->bus);
4514 info->devfn = devfn;
4515 info->segment = iommu->segment;
4519 info->iommu = iommu;
4520 if (dev_is_pci(dev)) {
4521 if (ecap_dev_iotlb_support(iommu->ecap) &&
4522 pci_ats_supported(pdev) &&
4523 dmar_ats_supported(pdev, iommu)) {
4524 info->ats_supported = 1;
4525 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4527 if (sm_supported(iommu)) {
4528 if (pasid_supported(iommu)) {
4529 int features = pci_pasid_features(pdev);
4532 info->pasid_supported = features | 1;
4535 if (info->ats_supported && ecap_prs(iommu->ecap) &&
4536 pci_pri_supported(pdev))
4537 info->pri_supported = 1;
4541 dev_iommu_priv_set(dev, info);
4543 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4544 ret = intel_pasid_alloc_table(dev);
4546 dev_err(dev, "PASID table allocation failed\n");
4547 dev_iommu_priv_set(dev, NULL);
4549 return ERR_PTR(ret);
4553 return &iommu->iommu;
4556 static void intel_iommu_release_device(struct device *dev)
4558 struct device_domain_info *info = dev_iommu_priv_get(dev);
4560 dmar_remove_one_dev_info(dev);
4561 intel_pasid_free_table(dev);
4562 dev_iommu_priv_set(dev, NULL);
4564 set_dma_ops(dev, NULL);
4567 static void intel_iommu_probe_finalize(struct device *dev)
4569 set_dma_ops(dev, NULL);
4570 iommu_setup_dma_ops(dev, 0, U64_MAX);
4573 static void intel_iommu_get_resv_regions(struct device *device,
4574 struct list_head *head)
4576 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4577 struct iommu_resv_region *reg;
4578 struct dmar_rmrr_unit *rmrr;
4579 struct device *i_dev;
4583 for_each_rmrr_units(rmrr) {
4584 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4586 struct iommu_resv_region *resv;
4587 enum iommu_resv_type type;
4590 if (i_dev != device &&
4591 !is_downstream_to_pci_bridge(device, i_dev))
4594 length = rmrr->end_address - rmrr->base_address + 1;
4596 type = device_rmrr_is_relaxable(device) ?
4597 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4599 resv = iommu_alloc_resv_region(rmrr->base_address,
4605 list_add_tail(&resv->list, head);
4610 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4611 if (dev_is_pci(device)) {
4612 struct pci_dev *pdev = to_pci_dev(device);
4614 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4615 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4616 IOMMU_RESV_DIRECT_RELAXABLE,
4619 list_add_tail(®->list, head);
4622 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4624 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4625 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4626 0, IOMMU_RESV_MSI, GFP_KERNEL);
4629 list_add_tail(®->list, head);
4632 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4634 if (dev_is_pci(dev))
4635 return pci_device_group(dev);
4636 return generic_device_group(dev);
4639 static int intel_iommu_enable_sva(struct device *dev)
4641 struct device_domain_info *info = dev_iommu_priv_get(dev);
4642 struct intel_iommu *iommu;
4645 if (!info || dmar_disabled)
4648 iommu = info->iommu;
4652 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4655 if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4658 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4662 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4664 iopf_queue_remove_device(iommu->iopf_queue, dev);
4669 static int intel_iommu_disable_sva(struct device *dev)
4671 struct device_domain_info *info = dev_iommu_priv_get(dev);
4672 struct intel_iommu *iommu = info->iommu;
4675 ret = iommu_unregister_device_fault_handler(dev);
4679 ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4681 iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4686 static int intel_iommu_enable_iopf(struct device *dev)
4688 struct device_domain_info *info = dev_iommu_priv_get(dev);
4690 if (info && info->pri_supported)
4697 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4700 case IOMMU_DEV_FEAT_IOPF:
4701 return intel_iommu_enable_iopf(dev);
4703 case IOMMU_DEV_FEAT_SVA:
4704 return intel_iommu_enable_sva(dev);
4712 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4715 case IOMMU_DEV_FEAT_IOPF:
4718 case IOMMU_DEV_FEAT_SVA:
4719 return intel_iommu_disable_sva(dev);
4726 static bool intel_iommu_is_attach_deferred(struct device *dev)
4728 struct device_domain_info *info = dev_iommu_priv_get(dev);
4730 return translation_pre_enabled(info->iommu) && !info->domain;
/*
 * Check that the device does not live on an external facing PCI port that is
 * marked as untrusted. Such devices should not be able to apply quirks and
 * thus not be able to bypass the IOMMU restrictions.
 */
4738 static bool risky_device(struct pci_dev *pdev)
4740 if (pdev->untrusted) {
4742 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4743 pdev->vendor, pdev->device);
4744 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4750 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4751 unsigned long iova, size_t size)
4753 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4754 unsigned long pages = aligned_nrpages(iova, size);
4755 unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4756 struct iommu_domain_info *info;
4759 xa_for_each(&dmar_domain->iommu_array, i, info)
4760 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4763 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4765 struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4766 struct iommu_domain *domain;
4768 /* Domain type specific cleanup: */
4769 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4771 switch (domain->type) {
4772 case IOMMU_DOMAIN_SVA:
4773 intel_svm_remove_dev_pasid(dev, pasid);
4776 /* should never reach here */
4782 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4785 const struct iommu_ops intel_iommu_ops = {
4786 .capable = intel_iommu_capable,
4787 .domain_alloc = intel_iommu_domain_alloc,
4788 .probe_device = intel_iommu_probe_device,
4789 .probe_finalize = intel_iommu_probe_finalize,
4790 .release_device = intel_iommu_release_device,
4791 .get_resv_regions = intel_iommu_get_resv_regions,
4792 .device_group = intel_iommu_device_group,
4793 .dev_enable_feat = intel_iommu_dev_enable_feat,
4794 .dev_disable_feat = intel_iommu_dev_disable_feat,
4795 .is_attach_deferred = intel_iommu_is_attach_deferred,
4796 .def_domain_type = device_def_domain_type,
4797 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
4798 .pgsize_bitmap = SZ_4K,
4799 #ifdef CONFIG_INTEL_IOMMU_SVM
4800 .page_response = intel_svm_page_response,
4802 .default_domain_ops = &(const struct iommu_domain_ops) {
4803 .attach_dev = intel_iommu_attach_device,
4804 .map_pages = intel_iommu_map_pages,
4805 .unmap_pages = intel_iommu_unmap_pages,
4806 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4807 .flush_iotlb_all = intel_flush_iotlb_all,
4808 .iotlb_sync = intel_iommu_tlb_sync,
4809 .iova_to_phys = intel_iommu_iova_to_phys,
4810 .free = intel_iommu_domain_free,
4811 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4815 static void quirk_iommu_igfx(struct pci_dev *dev)
4817 if (risky_device(dev))
4820 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4824 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4825 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4826 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4827 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4828 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4829 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4830 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4831 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);

static void quirk_iommu_rwbf(struct pci_dev *dev)
{
        if (risky_device(dev))
                return;

        /*
         * Mobile 4 Series Chipset neglects to set RWBF capability,
         * but needs it. Same seems to hold for the desktop versions.
         */
        pci_info(dev, "Forcing write-buffer flush capability\n");
        rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);

#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
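
/*
 * Worked example (illustrative, the raw value is made up): a GGC word of
 * 0x0b10 decodes as (0x0b10 & GGC_MEMORY_SIZE_MASK) == GGC_MEMORY_SIZE_4M_VT
 * with the GGC_MEMORY_VT_ENABLED bit (0x800) set, so the Calpella quirk
 * below keeps the graphics IOMMU enabled and only forces strict IOTLB
 * flushing.
 */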

static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
        unsigned short ggc;

        if (risky_device(dev))
                return;

        if (pci_read_config_word(dev, GGC, &ggc))
                return;

        if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
                pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
                dmar_map_gfx = 0;
        } else if (dmar_map_gfx) {
                /* we have to ensure the gfx device is idle before we flush */
                pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
                iommu_set_dma_strict();
        }
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);

static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
{
        u16 ver;

        if (!IS_GFX_DEVICE(dev))
                return;

        ver = (dev->device >> 8) & 0xff;
        if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
            ver != 0x4e && ver != 0x8a && ver != 0x98 &&
            ver != 0x9a && ver != 0xa7)
                return;

        if (risky_device(dev))
                return;

        pci_info(dev, "Skip IOMMU disabling for graphics\n");
        iommu_skip_te_disable = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
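
/*
 * Worked example (illustrative): for a graphics device ID such as 0x46a6,
 * (0x46a6 >> 8) & 0xff == 0x46, which is in the version list above, so the
 * quirk sets iommu_skip_te_disable provided the function is a GFX device
 * and is not rejected by risky_device().
 */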

/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
        struct pci_dev *pdev;
        uint32_t vtisochctrl;

        /* If there's no Azalia in the system anyway, forget it. */
        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
        if (!pdev)
                return;

        if (risky_device(pdev)) {
                pci_dev_put(pdev);
                return;
        }

        pci_dev_put(pdev);

        /* System Management Registers. Might be hidden, in which case
           we can't do the sanity check. But that's OK, because the
           known-broken BIOSes _don't_ actually hide it, so far. */
        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
        if (!pdev)
                return;

        if (risky_device(pdev)) {
                pci_dev_put(pdev);
                return;
        }

        if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
                pci_dev_put(pdev);
                return;
        }

        pci_dev_put(pdev);

        /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
        if (vtisochctrl & 1)
                return;

        /* Drop all bits other than the number of TLB entries. */
        vtisochctrl &= 0x1c;

        /* If we have the recommended number of TLB entries (16), fine. */
        if (vtisochctrl == 0x10)
                return;

        /* Zero TLB entries? You get to ride the short bus to school. */
        if (!vtisochctrl) {
                WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
                     dmi_get_system_info(DMI_BIOS_VENDOR),
                     dmi_get_system_info(DMI_BIOS_VERSION),
                     dmi_get_system_info(DMI_PRODUCT_VERSION));
                iommu_identity_mapping |= IDENTMAP_AZALIA;
                return;
        }

        pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
                vtisochctrl);
}
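
/*
 * Worked example (illustrative, the readout is made up): a vtisochctrl value
 * of 0x00000010 has bit 0 clear (Azalia DMA routed to the ISOCH DMAR unit),
 * and masking with 0x1c leaves 0x10, i.e. the recommended 16 TLB entries, so
 * the check above returns quietly without warning.
 */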

/*
 * Here we deal with a device TLB defect where a device may inadvertently
 * issue an ATS invalidation completion before posted writes that were
 * initiated with a translated address and used translations matching the
 * invalidation address range, violating the invalidation completion ordering.
 * Therefore, any use case that cannot guarantee DMA is stopped before unmap
 * is vulnerable to this defect. In other words, any dTLB invalidation that is
 * not initiated under the control of the trusted/privileged host device
 * driver must use this quirk.
 *
 * Device TLBs are invalidated under the following six conditions:
 * 1. Device driver does DMA API unmap IOVA
 * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
 * 3. PASID is torn down, after the PASID cache is flushed. e.g. process
 *    exit_mmap() due to a crash
 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where the
 *    VM has to free pages that were unmapped
 * 5. Userspace driver unmaps a DMA buffer
 * 6. Cache invalidation in vSVA usage (upcoming)
 *
 * For #1 and #2, device drivers are responsible for stopping DMA traffic
 * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier to
 * invalidate the TLB the same way as a normal user unmap, which will use
 * this quirk. The dTLB invalidation after a PASID cache flush does not need
 * this quirk.
 *
 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
 */
void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
                               unsigned long address, unsigned long mask,
                               u32 pasid, u16 qdep)
{
        u16 sid;

        if (likely(!info->dtlb_extra_inval))
                return;

        sid = PCI_DEVID(info->bus, info->devfn);
        if (pasid == PASID_RID2PASID) {
                qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
                                   qdep, address, mask);
        } else {
                qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
                                         pasid, qdep, address, mask);
        }
}
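
/*
 * Illustrative sketch, not part of the driver: a hypothetical call site for
 * the quirk above. Real users issue the primary dev-IOTLB invalidation and
 * then call quirk_extra_dev_tlb_flush() with the same parameters; on
 * unaffected hardware (dtlb_extra_inval clear) the quirk is a no-op.
 */
#if 0
static void example_flush_dev_iotlb(struct device_domain_info *info,
                                    unsigned long addr, unsigned long mask,
                                    u16 qdep)
{
        u16 sid = PCI_DEVID(info->bus, info->devfn);

        /* Primary invalidation for requests without PASID (RID2PASID). */
        qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, qdep, addr, mask);

        /* Extra invalidation needed on parts with the ordering defect. */
        quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep);
}
#endif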

#define ecmd_get_status_code(res)       (((res) & 0xff) >> 1)

/*
 * Function to submit a command to the enhanced command interface. The
 * valid enhanced command descriptions are defined in Table 47 of the
 * VT-d spec. The VT-d hardware implementation may support some but not
 * all commands, which can be determined by checking the Enhanced
 * Command Capability Register.
 *
 * Return values:
 *  - 0: Command successful without any error;
 *  - Negative: software error value;
 *  - Nonzero positive: failure status code defined in Table 48.
 */
int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
{
        unsigned long flags;
        u64 res;
        int ret;

        if (!cap_ecmds(iommu->cap))
                return -ENODEV;

        raw_spin_lock_irqsave(&iommu->register_lock, flags);

        res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
        if (res & DMA_ECMD_ECRSP_IP) {
                ret = -EBUSY;
                goto err;
        }

        /*
         * Unconditionally write the operand B, because
         * - There is no side effect if an ecmd doesn't require an
         *   operand B, but we set the register to some value.
         * - It's not invoked in any critical path. The extra MMIO
         *   write doesn't bring any performance concerns.
         */
        dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
        dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));

        IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
                      !(res & DMA_ECMD_ECRSP_IP), res);

        if (res & DMA_ECMD_ECRSP_IP) {
                ret = -ETIMEDOUT;
                goto err;
        }

        ret = ecmd_get_status_code(res);
err:
        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);

        return ret;
}
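
/*
 * Illustrative sketch, not part of the driver: a hypothetical caller showing
 * how the three-way return convention documented above is meant to be
 * consumed. The command value is whatever Table 47 operation the caller
 * needs; it is not validated here.
 */
#if 0
static void example_ecmd_submit(struct intel_iommu *iommu, u8 cmd, u64 oa)
{
        int ret = ecmd_submit_sync(iommu, cmd, oa, 0);

        if (ret < 0)
                pr_err("ecmd %u not issued, software error %d\n", cmd, ret);
        else if (ret > 0)
                pr_err("ecmd %u failed, status code %d (see VT-d spec Table 48)\n",
                       cmd, ret);
        /* ret == 0: the command completed successfully. */
}
#endif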