]> Git Repo - linux.git/blob - drivers/gpu/drm/i915/gt/intel_gtt.c
Merge remote-tracking branch 'spi/for-5.14' into spi-linus
[linux.git] / drivers / gpu / drm / i915 / gt / intel_gtt.c
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2020 Intel Corporation
4  */
5
6 #include <linux/slab.h> /* fault-inject.h is not standalone! */
7
8 #include <linux/fault-inject.h>
9
10 #include "gem/i915_gem_lmem.h"
11 #include "i915_trace.h"
12 #include "intel_gt.h"
13 #include "intel_gtt.h"
14
15 struct drm_i915_gem_object *alloc_pt_lmem(struct i915_address_space *vm, int sz)
16 {
17         struct drm_i915_gem_object *obj;
18
19         obj = i915_gem_object_create_lmem(vm->i915, sz, 0);
20         /*
21          * Ensure all paging structures for this vm share the same dma-resv
22          * object underneath, with the idea that one object_lock() will lock
23          * them all at once.
24          */
25         if (!IS_ERR(obj)) {
26                 obj->base.resv = i915_vm_resv_get(vm);
27                 obj->shares_resv_from = vm;
28         }
29
30         return obj;
31 }
32
33 struct drm_i915_gem_object *alloc_pt_dma(struct i915_address_space *vm, int sz)
34 {
35         struct drm_i915_gem_object *obj;
36
37         if (I915_SELFTEST_ONLY(should_fail(&vm->fault_attr, 1)))
38                 i915_gem_shrink_all(vm->i915);
39
40         obj = i915_gem_object_create_internal(vm->i915, sz);
41         /*
42          * Ensure all paging structures for this vm share the same dma-resv
43          * object underneath, with the idea that one object_lock() will lock
44          * them all at once.
45          */
46         if (!IS_ERR(obj)) {
47                 obj->base.resv = i915_vm_resv_get(vm);
48                 obj->shares_resv_from = vm;
49         }
50
51         return obj;
52 }
53
54 int map_pt_dma(struct i915_address_space *vm, struct drm_i915_gem_object *obj)
55 {
56         enum i915_map_type type;
57         void *vaddr;
58
59         type = i915_coherent_map_type(vm->i915, obj, true);
60         vaddr = i915_gem_object_pin_map_unlocked(obj, type);
61         if (IS_ERR(vaddr))
62                 return PTR_ERR(vaddr);
63
64         i915_gem_object_make_unshrinkable(obj);
65         return 0;
66 }
67
68 int map_pt_dma_locked(struct i915_address_space *vm, struct drm_i915_gem_object *obj)
69 {
70         enum i915_map_type type;
71         void *vaddr;
72
73         type = i915_coherent_map_type(vm->i915, obj, true);
74         vaddr = i915_gem_object_pin_map(obj, type);
75         if (IS_ERR(vaddr))
76                 return PTR_ERR(vaddr);
77
78         i915_gem_object_make_unshrinkable(obj);
79         return 0;
80 }
81
void __i915_vm_close(struct i915_address_space *vm)
{
	struct i915_vma *vma, *vn;

	/*
	 * Only tear down when the last open reference is dropped; on success
	 * atomic_dec_and_mutex_lock() leaves us holding vm->mutex.
	 */
	if (!atomic_dec_and_mutex_lock(&vm->open, &vm->mutex))
		return;

	list_for_each_entry_safe(vma, vn, &vm->bound_list, vm_link) {
		struct drm_i915_gem_object *obj = vma->obj;

		/* Keep the obj (and hence the vma) alive as _we_ destroy it */
		if (!kref_get_unless_zero(&obj->base.refcount))
			continue;

		/* Strip any pin bits so the unbind below cannot be refused */
		atomic_and(~I915_VMA_PIN_MASK, &vma->flags);
		WARN_ON(__i915_vma_unbind(vma));
		__i915_vma_put(vma);

		i915_gem_object_put(obj);
	}
	GEM_BUG_ON(!list_empty(&vm->bound_list));

	mutex_unlock(&vm->mutex);
}
106
107 /* lock the vm into the current ww, if we lock one, we lock all */
108 int i915_vm_lock_objects(struct i915_address_space *vm,
109                          struct i915_gem_ww_ctx *ww)
110 {
111         if (vm->scratch[0]->base.resv == &vm->_resv) {
112                 return i915_gem_object_lock(vm->scratch[0], ww);
113         } else {
114                 struct i915_ppgtt *ppgtt = i915_vm_to_ppgtt(vm);
115
116                 /* We borrowed the scratch page from ggtt, take the top level object */
117                 return i915_gem_object_lock(ppgtt->pd->pt.base, ww);
118         }
119 }
120
/* Release the drm_mm range manager and the vm mutex set up at init time. */
void i915_address_space_fini(struct i915_address_space *vm)
{
	drm_mm_takedown(&vm->mm);
	mutex_destroy(&vm->mutex);
}
126
/**
 * i915_vm_resv_release - Final struct i915_address_space destructor
 * @kref: Pointer to the &i915_address_space.resv_ref member.
 *
 * This function is called when the last lock sharer no longer shares the
 * &i915_address_space._resv lock. It frees the vm backing memory itself,
 * so it must run strictly after all other vm teardown.
 */
void i915_vm_resv_release(struct kref *kref)
{
	struct i915_address_space *vm =
		container_of(kref, typeof(*vm), resv_ref);

	dma_resv_fini(&vm->_resv);
	kfree(vm);
}
142
/*
 * Deferred vm destruction, run from the rcu work queued by i915_vm_release().
 * The resv reference is dropped last as i915_vm_resv_release() frees the vm.
 */
static void __i915_vm_release(struct work_struct *work)
{
	struct i915_address_space *vm =
		container_of(work, struct i915_address_space, rcu.work);

	vm->cleanup(vm);
	i915_address_space_fini(vm);

	i915_vm_resv_put(vm);
}
153
/*
 * Last-reference destructor for a ppgtt address space. The actual teardown
 * is deferred to __i915_vm_release() via rcu work rather than done inline.
 */
void i915_vm_release(struct kref *kref)
{
	struct i915_address_space *vm =
		container_of(kref, struct i915_address_space, ref);

	/* The ggtt is never released through this path */
	GEM_BUG_ON(i915_is_ggtt(vm));
	trace_i915_ppgtt_release(vm);

	queue_rcu_work(vm->i915->wq, &vm->rcu);
}
164
/*
 * Common initialisation for all address spaces (ggtt and ppgtt):
 * refcounts, the vm mutex with its lockdep annotations, the shared
 * dma-resv and the drm_mm range manager covering [0, vm->total).
 */
void i915_address_space_init(struct i915_address_space *vm, int subclass)
{
	kref_init(&vm->ref);

	/*
	 * Special case for GGTT that has already done an early
	 * kref_init here.
	 */
	if (!kref_read(&vm->resv_ref))
		kref_init(&vm->resv_ref);

	INIT_RCU_WORK(&vm->rcu, __i915_vm_release);
	atomic_set(&vm->open, 1);

	/*
	 * The vm->mutex must be reclaim safe (for use in the shrinker).
	 * Do a dummy acquire now under fs_reclaim so that any allocation
	 * attempt holding the lock is immediately reported by lockdep.
	 */
	mutex_init(&vm->mutex);
	lockdep_set_subclass(&vm->mutex, subclass);

	if (!intel_vm_no_concurrent_access_wa(vm->i915)) {
		i915_gem_shrinker_taints_mutex(vm->i915, &vm->mutex);
	} else {
		/*
		 * CHV + BXT VTD workaround use stop_machine(),
		 * which is allowed to allocate memory. This means &vm->mutex
		 * is the outer lock, and in theory we can allocate memory inside
		 * it through stop_machine().
		 *
		 * Add the annotation for this, we use trylock in shrinker.
		 */
		mutex_acquire(&vm->mutex.dep_map, 0, 0, _THIS_IP_);
		might_alloc(GFP_KERNEL);
		mutex_release(&vm->mutex.dep_map, _THIS_IP_);
	}
	dma_resv_init(&vm->_resv);

	GEM_BUG_ON(!vm->total);
	drm_mm_init(&vm->mm, 0, vm->total);
	/* Mark the drm_mm head node so eviction never considers it */
	vm->mm.head_node.color = I915_COLOR_UNEVICTABLE;

	INIT_LIST_HEAD(&vm->bound_list);
}
210
211 void clear_pages(struct i915_vma *vma)
212 {
213         GEM_BUG_ON(!vma->pages);
214
215         if (vma->pages != vma->obj->mm.pages) {
216                 sg_free_table(vma->pages);
217                 kfree(vma->pages);
218         }
219         vma->pages = NULL;
220
221         memset(&vma->page_sizes, 0, sizeof(vma->page_sizes));
222 }
223
/*
 * Return the pinned CPU address of a page-table object.
 * The map type unpacked alongside the pointer is discarded here;
 * page_unpack_bits() requires the out-parameter regardless.
 */
void *__px_vaddr(struct drm_i915_gem_object *p)
{
	enum i915_map_type type;

	GEM_BUG_ON(!i915_gem_object_has_pages(p));
	return page_unpack_bits(p->mm.mapping, &type);
}
231
/* Return the DMA address of the first segment of a page-table object. */
dma_addr_t __px_dma(struct drm_i915_gem_object *p)
{
	GEM_BUG_ON(!i915_gem_object_has_pages(p));
	return sg_dma_address(p->mm.pages->sgl);
}
237
/* Return the struct page backing the first segment of a page-table object. */
struct page *__px_page(struct drm_i915_gem_object *p)
{
	GEM_BUG_ON(!i915_gem_object_has_pages(p));
	return sg_page(p->mm.pages->sgl);
}
243
/*
 * Fill a page-table object with @count copies of the 64-bit entry @val,
 * then flush the CPU cache so the GPU observes the new entries.
 */
void
fill_page_dma(struct drm_i915_gem_object *p, const u64 val, unsigned int count)
{
	void *vaddr = __px_vaddr(p);

	memset64(vaddr, val, count);
	clflush_cache_range(vaddr, PAGE_SIZE);
}
252
253 static void poison_scratch_page(struct drm_i915_gem_object *scratch)
254 {
255         void *vaddr = __px_vaddr(scratch);
256         u8 val;
257
258         val = 0;
259         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
260                 val = POISON_FREE;
261
262         memset(vaddr, val, scratch->base.size);
263 }
264
int setup_scratch_page(struct i915_address_space *vm)
{
	unsigned long size;

	/*
	 * In order to utilize 64K pages for an object with a size < 2M, we will
	 * need to support a 64K scratch page, given that every 16th entry for a
	 * page-table operating in 64K mode must point to a properly aligned 64K
	 * region, including any PTEs which happen to point to scratch.
	 *
	 * This is only relevant for the 48b PPGTT where we support
	 * huge-gtt-pages, see also i915_vma_insert(). However, as we share the
	 * scratch (read-only) between all vm, we create one 64k scratch page
	 * for all.
	 */
	size = I915_GTT_PAGE_SIZE_4K;
	if (i915_vm_is_4lvl(vm) &&
	    HAS_PAGE_SIZES(vm->i915, I915_GTT_PAGE_SIZE_64K))
		size = I915_GTT_PAGE_SIZE_64K;

	/*
	 * Retry loop: try the preferred (possibly 64K) size first; on any
	 * failure fall back to 4K, and only then report -ENOMEM.
	 */
	do {
		struct drm_i915_gem_object *obj;

		obj = vm->alloc_pt_dma(vm, size);
		if (IS_ERR(obj))
			goto skip;

		if (map_pt_dma(vm, obj))
			goto skip_obj;

		/* We need a single contiguous page for our scratch */
		if (obj->mm.page_sizes.sg < size)
			goto skip_obj;

		/* And it needs to be correspondingly aligned */
		if (__px_dma(obj) & (size - 1))
			goto skip_obj;

		/*
		 * Use a non-zero scratch page for debugging.
		 *
		 * We want a value that should be reasonably obvious
		 * to spot in the error state, while also causing a GPU hang
		 * if executed. We prefer using a clear page in production, so
		 * should it ever be accidentally used, the effect should be
		 * fairly benign.
		 */
		poison_scratch_page(obj);

		vm->scratch[0] = obj;
		vm->scratch_order = get_order(size);
		return 0;

skip_obj:
		i915_gem_object_put(obj);
skip:
		/* Already at the smallest size: give up */
		if (size == I915_GTT_PAGE_SIZE_4K)
			return -ENOMEM;

		size = I915_GTT_PAGE_SIZE_4K;
	} while (1);
}
327
328 void free_scratch(struct i915_address_space *vm)
329 {
330         int i;
331
332         for (i = 0; i <= vm->top; i++)
333                 i915_gem_object_put(vm->scratch[i]);
334 }
335
/* Program GTT-related hardware workarounds; safe to re-run after reset. */
void gtt_write_workarounds(struct intel_gt *gt)
{
	struct drm_i915_private *i915 = gt->i915;
	struct intel_uncore *uncore = gt->uncore;

	/*
	 * This function is for gtt related workarounds. This function is
	 * called on driver load and after a GPU reset, so you can place
	 * workarounds here even if they get overwritten by GPU reset.
	 */
	/* WaIncreaseDefaultTLBEntries:chv,bdw,skl,bxt,kbl,glk,cfl,cnl,icl */
	if (IS_BROADWELL(i915))
		intel_uncore_write(uncore,
				   GEN8_L3_LRA_1_GPGPU,
				   GEN8_L3_LRA_1_GPGPU_DEFAULT_VALUE_BDW);
	else if (IS_CHERRYVIEW(i915))
		intel_uncore_write(uncore,
				   GEN8_L3_LRA_1_GPGPU,
				   GEN8_L3_LRA_1_GPGPU_DEFAULT_VALUE_CHV);
	else if (IS_GEN9_LP(i915))
		intel_uncore_write(uncore,
				   GEN8_L3_LRA_1_GPGPU,
				   GEN9_L3_LRA_1_GPGPU_DEFAULT_VALUE_BXT);
	else if (GRAPHICS_VER(i915) >= 9 && GRAPHICS_VER(i915) <= 11)
		intel_uncore_write(uncore,
				   GEN8_L3_LRA_1_GPGPU,
				   GEN9_L3_LRA_1_GPGPU_DEFAULT_VALUE_SKL);

	/*
	 * To support 64K PTEs we need to first enable the use of the
	 * Intermediate-Page-Size(IPS) bit of the PDE field via some magical
	 * mmio, otherwise the page-walker will simply ignore the IPS bit. This
	 * shouldn't be needed after GEN10.
	 *
	 * 64K pages were first introduced from BDW+, although technically they
	 * only *work* from gen9+. For pre-BDW we instead have the option for
	 * 32K pages, but we don't currently have any support for it in our
	 * driver.
	 */
	if (HAS_PAGE_SIZES(i915, I915_GTT_PAGE_SIZE_64K) &&
	    GRAPHICS_VER(i915) <= 10)
		intel_uncore_rmw(uncore,
				 GEN8_GAMW_ECO_DEV_RW_IA,
				 0,
				 GAMW_ECO_ENABLE_64K_IPS_FIELD);

	if (IS_GRAPHICS_VER(i915, 8, 11)) {
		bool can_use_gtt_cache = true;

		/*
		 * According to the BSpec if we use 2M/1G pages then we also
		 * need to disable the GTT cache. At least on BDW we can see
		 * visual corruption when using 2M pages, and not disabling the
		 * GTT cache.
		 */
		if (HAS_PAGE_SIZES(i915, I915_GTT_PAGE_SIZE_2M))
			can_use_gtt_cache = false;

		/* WaGttCachingOffByDefault */
		intel_uncore_write(uncore,
				   HSW_GTT_CACHE_EN,
				   can_use_gtt_cache ? GTT_CACHE_EN_ALL : 0);
		/* Warn if the write did not stick when we expected caching on */
		drm_WARN_ON_ONCE(&i915->drm, can_use_gtt_cache &&
				 intel_uncore_read(uncore,
						   HSW_GTT_CACHE_EN) == 0);
	}
}
403
/* Program the gen12 private PAT: WB/WC/WT/UC in the low indices, WB rest. */
static void tgl_setup_private_ppat(struct intel_uncore *uncore)
{
	/* TGL doesn't support LLC or AGE settings */
	intel_uncore_write(uncore, GEN12_PAT_INDEX(0), GEN8_PPAT_WB);
	intel_uncore_write(uncore, GEN12_PAT_INDEX(1), GEN8_PPAT_WC);
	intel_uncore_write(uncore, GEN12_PAT_INDEX(2), GEN8_PPAT_WT);
	intel_uncore_write(uncore, GEN12_PAT_INDEX(3), GEN8_PPAT_UC);
	intel_uncore_write(uncore, GEN12_PAT_INDEX(4), GEN8_PPAT_WB);
	intel_uncore_write(uncore, GEN12_PAT_INDEX(5), GEN8_PPAT_WB);
	intel_uncore_write(uncore, GEN12_PAT_INDEX(6), GEN8_PPAT_WB);
	intel_uncore_write(uncore, GEN12_PAT_INDEX(7), GEN8_PPAT_WB);
}
416
/* Program the gen10 private PAT, one mmio register per index. */
static void cnl_setup_private_ppat(struct intel_uncore *uncore)
{
	intel_uncore_write(uncore,
			   GEN10_PAT_INDEX(0),
			   GEN8_PPAT_WB | GEN8_PPAT_LLC);
	intel_uncore_write(uncore,
			   GEN10_PAT_INDEX(1),
			   GEN8_PPAT_WC | GEN8_PPAT_LLCELLC);
	intel_uncore_write(uncore,
			   GEN10_PAT_INDEX(2),
			   GEN8_PPAT_WB | GEN8_PPAT_ELLC_OVERRIDE);
	intel_uncore_write(uncore,
			   GEN10_PAT_INDEX(3),
			   GEN8_PPAT_UC);
	/* Indices 4-7: WB with increasing age hints */
	intel_uncore_write(uncore,
			   GEN10_PAT_INDEX(4),
			   GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0));
	intel_uncore_write(uncore,
			   GEN10_PAT_INDEX(5),
			   GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(1));
	intel_uncore_write(uncore,
			   GEN10_PAT_INDEX(6),
			   GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(2));
	intel_uncore_write(uncore,
			   GEN10_PAT_INDEX(7),
			   GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3));
}
444
/*
 * The GGTT and PPGTT need a private PPAT setup in order to handle cacheability
 * bits. When using advanced contexts each context stores its own PAT, but
 * writing this data shouldn't be harmful even in those cases.
 */
static void bdw_setup_private_ppat(struct intel_uncore *uncore)
{
	struct drm_i915_private *i915 = uncore->i915;
	u64 pat;

	pat = GEN8_PPAT(0, GEN8_PPAT_WB | GEN8_PPAT_LLC) |	/* for normal objects, no eLLC */
	      GEN8_PPAT(1, GEN8_PPAT_WC | GEN8_PPAT_LLCELLC) |	/* for something pointing to ptes? */
	      GEN8_PPAT(3, GEN8_PPAT_UC) |			/* Uncached objects, mostly for scanout */
	      GEN8_PPAT(4, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0)) |
	      GEN8_PPAT(5, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(1)) |
	      GEN8_PPAT(6, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(2)) |
	      GEN8_PPAT(7, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3));

	/* for scanout with eLLC */
	if (GRAPHICS_VER(i915) >= 9)
		pat |= GEN8_PPAT(2, GEN8_PPAT_WB | GEN8_PPAT_ELLC_OVERRIDE);
	else
		pat |= GEN8_PPAT(2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC);

	/* The whole 8-entry PAT is packed into one 64-bit value, split LO/HI */
	intel_uncore_write(uncore, GEN8_PRIVATE_PAT_LO, lower_32_bits(pat));
	intel_uncore_write(uncore, GEN8_PRIVATE_PAT_HI, upper_32_bits(pat));
}
472
/* CHV variant of the private PAT setup: only the snoop bit is meaningful. */
static void chv_setup_private_ppat(struct intel_uncore *uncore)
{
	u64 pat;

	/*
	 * Map WB on BDW to snooped on CHV.
	 *
	 * Only the snoop bit has meaning for CHV, the rest is
	 * ignored.
	 *
	 * The hardware will never snoop for certain types of accesses:
	 * - CPU GTT (GMADR->GGTT->no snoop->memory)
	 * - PPGTT page tables
	 * - some other special cycles
	 *
	 * As with BDW, we also need to consider the following for GT accesses:
	 * "For GGTT, there is NO pat_sel[2:0] from the entry,
	 * so RTL will always use the value corresponding to
	 * pat_sel = 000".
	 * Which means we must set the snoop bit in PAT entry 0
	 * in order to keep the global status page working.
	 */

	pat = GEN8_PPAT(0, CHV_PPAT_SNOOP) |
	      GEN8_PPAT(1, 0) |
	      GEN8_PPAT(2, 0) |
	      GEN8_PPAT(3, 0) |
	      GEN8_PPAT(4, CHV_PPAT_SNOOP) |
	      GEN8_PPAT(5, CHV_PPAT_SNOOP) |
	      GEN8_PPAT(6, CHV_PPAT_SNOOP) |
	      GEN8_PPAT(7, CHV_PPAT_SNOOP);

	intel_uncore_write(uncore, GEN8_PRIVATE_PAT_LO, lower_32_bits(pat));
	intel_uncore_write(uncore, GEN8_PRIVATE_PAT_HI, upper_32_bits(pat));
}
508
/*
 * Select and program the per-platform private PAT. Ordering matters:
 * CHV/BXT must be matched before falling through to the bdw default.
 */
void setup_private_pat(struct intel_uncore *uncore)
{
	struct drm_i915_private *i915 = uncore->i915;

	/* The private PAT only exists on gen8+ */
	GEM_BUG_ON(GRAPHICS_VER(i915) < 8);

	if (GRAPHICS_VER(i915) >= 12)
		tgl_setup_private_ppat(uncore);
	else if (GRAPHICS_VER(i915) >= 10)
		cnl_setup_private_ppat(uncore);
	else if (IS_CHERRYVIEW(i915) || IS_GEN9_LP(i915))
		chv_setup_private_ppat(uncore);
	else
		bdw_setup_private_ppat(uncore);
}
524
525 struct i915_vma *
526 __vm_create_scratch_for_read(struct i915_address_space *vm, unsigned long size)
527 {
528         struct drm_i915_gem_object *obj;
529         struct i915_vma *vma;
530
531         obj = i915_gem_object_create_internal(vm->i915, PAGE_ALIGN(size));
532         if (IS_ERR(obj))
533                 return ERR_CAST(obj);
534
535         i915_gem_object_set_cache_coherency(obj, I915_CACHING_CACHED);
536
537         vma = i915_vma_instance(obj, vm, NULL);
538         if (IS_ERR(vma)) {
539                 i915_gem_object_put(obj);
540                 return vma;
541         }
542
543         return vma;
544 }
545
546 struct i915_vma *
547 __vm_create_scratch_for_read_pinned(struct i915_address_space *vm, unsigned long size)
548 {
549         struct i915_vma *vma;
550         int err;
551
552         vma = __vm_create_scratch_for_read(vm, size);
553         if (IS_ERR(vma))
554                 return vma;
555
556         err = i915_vma_pin(vma, 0, 0,
557                            i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER);
558         if (err) {
559                 i915_vma_put(vma);
560                 return ERR_PTR(err);
561         }
562
563         return vma;
564 }
565
566 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
567 #include "selftests/mock_gtt.c"
568 #endif
This page took 0.067553 seconds and 4 git commands to generate.