drivers/gpu/drm/i915/gt/intel_lrc.c

   1 // SPDX-License-Identifier: MIT
   2 /*
   3  * Copyright © 2014 Intel Corporation
   4  */
   5
   6 #include "gem/i915_gem_lmem.h"
   7
   8 #include "gen8_engine_cs.h"
   9 #include "i915_drv.h"
  10 #include "i915_perf.h"
  11 #include "i915_reg.h"
  12 #include "intel_context.h"
  13 #include "intel_engine.h"
  14 #include "intel_engine_regs.h"
  15 #include "intel_gpu_commands.h"
  16 #include "intel_gt.h"
  17 #include "intel_gt_regs.h"
  18 #include "intel_lrc.h"
  19 #include "intel_lrc_reg.h"
  20 #include "intel_ring.h"
  21 #include "shmem_utils.h"
  22
  23 /*
  24  * The per-platform tables are u8-encoded in @data. Decode @data and set the
  25  * addresses' offset and commands in @regs. The following encoding is used
  26  * for each byte. There are 2 steps: decoding commands and decoding addresses.
  27  *
  28  * Commands:
  29  * [7]: create NOPs - number of NOPs are set in lower bits
  30  * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
  31  *      MI_LRI_FORCE_POSTED
  32  * [5:0]: Number of NOPs or registers to set values to in case of
  33  *        MI_LOAD_REGISTER_IMM
  34  *
  35  * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
  36  * number of registers. They are set by using the REG/REG16 macros: the former
  37  * is used for offsets smaller than 0x200 while the latter is for values bigger
  38  * than that. Those macros already set all the bits documented below correctly:
  39  *
  40  * [7]: When a register offset needs more than 6 bits, use additional bytes, to
  41  *      follow, for the lower bits
  42  * [6:0]: Register offset, without considering the engine base.
  43  *
  44  * This function only tweaks the commands and register offsets. Values are not
  45  * filled out.
  46  */
  47 static void set_offsets(u32 *regs,
  48                         const u8 *data,
  49                         const struct intel_engine_cs *engine,
  50                         bool close)
  51 #define NOP(x) (BIT(7) | (x))
  52 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
  53 #define POSTED BIT(0)
  54 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
  55 #define REG16(x) \
  56         (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
  57         (((x) >> 2) & 0x7f)
  58 #define END 0
  59 {
  60         const u32 base = engine->mmio_base;
  61
  62         while (*data) {
  63                 u8 count, flags;
  64
  65                 if (*data & BIT(7)) { /* skip */
  66                         count = *data++ & ~BIT(7);
  67                         regs += count;
  68                         continue;
  69                 }
  70
  71                 count = *data & 0x3f;
  72                 flags = *data >> 6;
  73                 data++;
  74
  75                 *regs = MI_LOAD_REGISTER_IMM(count);
  76                 if (flags & POSTED)
  77                         *regs |= MI_LRI_FORCE_POSTED;
  78                 if (GRAPHICS_VER(engine->i915) >= 11)
  79                         *regs |= MI_LRI_LRM_CS_MMIO;
  80                 regs++;
  81
  82                 GEM_BUG_ON(!count);
  83                 do {
  84                         u32 offset = 0;
  85                         u8 v;
  86
  87                         do {
  88                                 v = *data++;
  89                                 offset <<= 7;
  90                                 offset |= v & ~BIT(7);
  91                         } while (v & BIT(7));
  92
  93                         regs[0] = base + (offset << 2);
  94                         regs += 2;
  95                 } while (--count);
  96         }
  97
  98         if (close) {
  99                 /* Close the batch; used mainly by live_lrc_layout() */
 100                 *regs = MI_BATCH_BUFFER_END;
 101                 if (GRAPHICS_VER(engine->i915) >= 11)
 102                         *regs |= BIT(0);
 103         }
 104 }
 105
 106 static const u8 gen8_xcs_offsets[] = {
 107         NOP(1),
 108         LRI(11, 0),
 109         REG16(0x244),
 110         REG(0x034),
 111         REG(0x030),
 112         REG(0x038),
 113         REG(0x03c),
 114         REG(0x168),
 115         REG(0x140),
 116         REG(0x110),
 117         REG(0x11c),
 118         REG(0x114),
 119         REG(0x118),
 120
 121         NOP(9),
 122         LRI(9, 0),
 123         REG16(0x3a8),
 124         REG16(0x28c),
 125         REG16(0x288),
 126         REG16(0x284),
 127         REG16(0x280),
 128         REG16(0x27c),
 129         REG16(0x278),
 130         REG16(0x274),
 131         REG16(0x270),
 132
 133         NOP(13),
 134         LRI(2, 0),
 135         REG16(0x200),
 136         REG(0x028),
 137
 138         END
 139 };
 140
 141 static const u8 gen9_xcs_offsets[] = {
 142         NOP(1),
 143         LRI(14, POSTED),
 144         REG16(0x244),
 145         REG(0x034),
 146         REG(0x030),
 147         REG(0x038),
 148         REG(0x03c),
 149         REG(0x168),
 150         REG(0x140),
 151         REG(0x110),
 152         REG(0x11c),
 153         REG(0x114),
 154         REG(0x118),
 155         REG(0x1c0),
 156         REG(0x1c4),
 157         REG(0x1c8),
 158
 159         NOP(3),
 160         LRI(9, POSTED),
 161         REG16(0x3a8),
 162         REG16(0x28c),
 163         REG16(0x288),
 164         REG16(0x284),
 165         REG16(0x280),
 166         REG16(0x27c),
 167         REG16(0x278),
 168         REG16(0x274),
 169         REG16(0x270),
 170
 171         NOP(13),
 172         LRI(1, POSTED),
 173         REG16(0x200),
 174
 175         NOP(13),
 176         LRI(44, POSTED),
 177         REG(0x028),
 178         REG(0x09c),
 179         REG(0x0c0),
 180         REG(0x178),
 181         REG(0x17c),
 182         REG16(0x358),
 183         REG(0x170),
 184         REG(0x150),
 185         REG(0x154),
 186         REG(0x158),
 187         REG16(0x41c),
 188         REG16(0x600),
 189         REG16(0x604),
 190         REG16(0x608),
 191         REG16(0x60c),
 192         REG16(0x610),
 193         REG16(0x614),
 194         REG16(0x618),
 195         REG16(0x61c),
 196         REG16(0x620),
 197         REG16(0x624),
 198         REG16(0x628),
 199         REG16(0x62c),
 200         REG16(0x630),
 201         REG16(0x634),
 202         REG16(0x638),
 203         REG16(0x63c),
 204         REG16(0x640),
 205         REG16(0x644),
 206         REG16(0x648),
 207         REG16(0x64c),
 208         REG16(0x650),
 209         REG16(0x654),
 210         REG16(0x658),
 211         REG16(0x65c),
 212         REG16(0x660),
 213         REG16(0x664),
 214         REG16(0x668),
 215         REG16(0x66c),
 216         REG16(0x670),
 217         REG16(0x674),
 218         REG16(0x678),
 219         REG16(0x67c),
 220         REG(0x068),
 221
 222         END
 223 };
 224
 225 static const u8 gen12_xcs_offsets[] = {
 226         NOP(1),
 227         LRI(13, POSTED),
 228         REG16(0x244),
 229         REG(0x034),
 230         REG(0x030),
 231         REG(0x038),
 232         REG(0x03c),
 233         REG(0x168),
 234         REG(0x140),
 235         REG(0x110),
 236         REG(0x1c0),
 237         REG(0x1c4),
 238         REG(0x1c8),
 239         REG(0x180),
 240         REG16(0x2b4),
 241
 242         NOP(5),
 243         LRI(9, POSTED),
 244         REG16(0x3a8),
 245         REG16(0x28c),
 246         REG16(0x288),
 247         REG16(0x284),
 248         REG16(0x280),
 249         REG16(0x27c),
 250         REG16(0x278),
 251         REG16(0x274),
 252         REG16(0x270),
 253
 254         END
 255 };
 256
 257 static const u8 dg2_xcs_offsets[] = {
 258         NOP(1),
 259         LRI(15, POSTED),
 260         REG16(0x244),
 261         REG(0x034),
 262         REG(0x030),
 263         REG(0x038),
 264         REG(0x03c),
 265         REG(0x168),
 266         REG(0x140),
 267         REG(0x110),
 268         REG(0x1c0),
 269         REG(0x1c4),
 270         REG(0x1c8),
 271         REG(0x180),
 272         REG16(0x2b4),
 273         REG(0x120),
 274         REG(0x124),
 275
 276         NOP(1),
 277         LRI(9, POSTED),
 278         REG16(0x3a8),
 279         REG16(0x28c),
 280         REG16(0x288),
 281         REG16(0x284),
 282         REG16(0x280),
 283         REG16(0x27c),
 284         REG16(0x278),
 285         REG16(0x274),
 286         REG16(0x270),
 287
 288         END
 289 };
 290
 291 static const u8 gen8_rcs_offsets[] = {
 292         NOP(1),
 293         LRI(14, POSTED),
 294         REG16(0x244),
 295         REG(0x034),
 296         REG(0x030),
 297         REG(0x038),
 298         REG(0x03c),
 299         REG(0x168),
 300         REG(0x140),
 301         REG(0x110),
 302         REG(0x11c),
 303         REG(0x114),
 304         REG(0x118),
 305         REG(0x1c0),
 306         REG(0x1c4),
 307         REG(0x1c8),
 308
 309         NOP(3),
 310         LRI(9, POSTED),
 311         REG16(0x3a8),
 312         REG16(0x28c),
 313         REG16(0x288),
 314         REG16(0x284),
 315         REG16(0x280),
 316         REG16(0x27c),
 317         REG16(0x278),
 318         REG16(0x274),
 319         REG16(0x270),
 320
 321         NOP(13),
 322         LRI(1, 0),
 323         REG(0x0c8),
 324
 325         END
 326 };
 327
 328 static const u8 gen9_rcs_offsets[] = {
 329         NOP(1),
 330         LRI(14, POSTED),
 331         REG16(0x244),
 332         REG(0x34),
 333         REG(0x30),
 334         REG(0x38),
 335         REG(0x3c),
 336         REG(0x168),
 337         REG(0x140),
 338         REG(0x110),
 339         REG(0x11c),
 340         REG(0x114),
 341         REG(0x118),
 342         REG(0x1c0),
 343         REG(0x1c4),
 344         REG(0x1c8),
 345
 346         NOP(3),
 347         LRI(9, POSTED),
 348         REG16(0x3a8),
 349         REG16(0x28c),
 350         REG16(0x288),
 351         REG16(0x284),
 352         REG16(0x280),
 353         REG16(0x27c),
 354         REG16(0x278),
 355         REG16(0x274),
 356         REG16(0x270),
 357
 358         NOP(13),
 359         LRI(1, 0),
 360         REG(0xc8),
 361
 362         NOP(13),
 363         LRI(44, POSTED),
 364         REG(0x28),
 365         REG(0x9c),
 366         REG(0xc0),
 367         REG(0x178),
 368         REG(0x17c),
 369         REG16(0x358),
 370         REG(0x170),
 371         REG(0x150),
 372         REG(0x154),
 373         REG(0x158),
 374         REG16(0x41c),
 375         REG16(0x600),
 376         REG16(0x604),
 377         REG16(0x608),
 378         REG16(0x60c),
 379         REG16(0x610),
 380         REG16(0x614),
 381         REG16(0x618),
 382         REG16(0x61c),
 383         REG16(0x620),
 384         REG16(0x624),
 385         REG16(0x628),
 386         REG16(0x62c),
 387         REG16(0x630),
 388         REG16(0x634),
 389         REG16(0x638),
 390         REG16(0x63c),
 391         REG16(0x640),
 392         REG16(0x644),
 393         REG16(0x648),
 394         REG16(0x64c),
 395         REG16(0x650),
 396         REG16(0x654),
 397         REG16(0x658),
 398         REG16(0x65c),
 399         REG16(0x660),
 400         REG16(0x664),
 401         REG16(0x668),
 402         REG16(0x66c),
 403         REG16(0x670),
 404         REG16(0x674),
 405         REG16(0x678),
 406         REG16(0x67c),
 407         REG(0x68),
 408
 409         END
 410 };
 411
 412 static const u8 gen11_rcs_offsets[] = {
 413         NOP(1),
 414         LRI(15, POSTED),
 415         REG16(0x244),
 416         REG(0x034),
 417         REG(0x030),
 418         REG(0x038),
 419         REG(0x03c),
 420         REG(0x168),
 421         REG(0x140),
 422         REG(0x110),
 423         REG(0x11c),
 424         REG(0x114),
 425         REG(0x118),
 426         REG(0x1c0),
 427         REG(0x1c4),
 428         REG(0x1c8),
 429         REG(0x180),
 430
 431         NOP(1),
 432         LRI(9, POSTED),
 433         REG16(0x3a8),
 434         REG16(0x28c),
 435         REG16(0x288),
 436         REG16(0x284),
 437         REG16(0x280),
 438         REG16(0x27c),
 439         REG16(0x278),
 440         REG16(0x274),
 441         REG16(0x270),
 442
 443         LRI(1, POSTED),
 444         REG(0x1b0),
 445
 446         NOP(10),
 447         LRI(1, 0),
 448         REG(0x0c8),
 449
 450         END
 451 };
 452
 453 static const u8 gen12_rcs_offsets[] = {
 454         NOP(1),
 455         LRI(13, POSTED),
 456         REG16(0x244),
 457         REG(0x034),
 458         REG(0x030),
 459         REG(0x038),
 460         REG(0x03c),
 461         REG(0x168),
 462         REG(0x140),
 463         REG(0x110),
 464         REG(0x1c0),
 465         REG(0x1c4),
 466         REG(0x1c8),
 467         REG(0x180),
 468         REG16(0x2b4),
 469
 470         NOP(5),
 471         LRI(9, POSTED),
 472         REG16(0x3a8),
 473         REG16(0x28c),
 474         REG16(0x288),
 475         REG16(0x284),
 476         REG16(0x280),
 477         REG16(0x27c),
 478         REG16(0x278),
 479         REG16(0x274),
 480         REG16(0x270),
 481
 482         LRI(3, POSTED),
 483         REG(0x1b0),
 484         REG16(0x5a8),
 485         REG16(0x5ac),
 486
 487         NOP(6),
 488         LRI(1, 0),
 489         REG(0x0c8),
 490         NOP(3 + 9 + 1),
 491
 492         LRI(51, POSTED),
 493         REG16(0x588),
 494         REG16(0x588),
 495         REG16(0x588),
 496         REG16(0x588),
 497         REG16(0x588),
 498         REG16(0x588),
 499         REG(0x028),
 500         REG(0x09c),
 501         REG(0x0c0),
 502         REG(0x178),
 503         REG(0x17c),
 504         REG16(0x358),
 505         REG(0x170),
 506         REG(0x150),
 507         REG(0x154),
 508         REG(0x158),
 509         REG16(0x41c),
 510         REG16(0x600),
 511         REG16(0x604),
 512         REG16(0x608),
 513         REG16(0x60c),
 514         REG16(0x610),
 515         REG16(0x614),
 516         REG16(0x618),
 517         REG16(0x61c),
 518         REG16(0x620),
 519         REG16(0x624),
 520         REG16(0x628),
 521         REG16(0x62c),
 522         REG16(0x630),
 523         REG16(0x634),
 524         REG16(0x638),
 525         REG16(0x63c),
 526         REG16(0x640),
 527         REG16(0x644),
 528         REG16(0x648),
 529         REG16(0x64c),
 530         REG16(0x650),
 531         REG16(0x654),
 532         REG16(0x658),
 533         REG16(0x65c),
 534         REG16(0x660),
 535         REG16(0x664),
 536         REG16(0x668),
 537         REG16(0x66c),
 538         REG16(0x670),
 539         REG16(0x674),
 540         REG16(0x678),
 541         REG16(0x67c),
 542         REG(0x068),
 543         REG(0x084),
 544         NOP(1),
 545
 546         END
 547 };
 548
 549 static const u8 dg2_rcs_offsets[] = {
 550         NOP(1),
 551         LRI(15, POSTED),
 552         REG16(0x244),
 553         REG(0x034),
 554         REG(0x030),
 555         REG(0x038),
 556         REG(0x03c),
 557         REG(0x168),
 558         REG(0x140),
 559         REG(0x110),
 560         REG(0x1c0),
 561         REG(0x1c4),
 562         REG(0x1c8),
 563         REG(0x180),
 564         REG16(0x2b4),
 565         REG(0x120),
 566         REG(0x124),
 567
 568         NOP(1),
 569         LRI(9, POSTED),
 570         REG16(0x3a8),
 571         REG16(0x28c),
 572         REG16(0x288),
 573         REG16(0x284),
 574         REG16(0x280),
 575         REG16(0x27c),
 576         REG16(0x278),
 577         REG16(0x274),
 578         REG16(0x270),
 579
 580         LRI(3, POSTED),
 581         REG(0x1b0),
 582         REG16(0x5a8),
 583         REG16(0x5ac),
 584
 585         NOP(6),
 586         LRI(1, 0),
 587         REG(0x0c8),
 588
 589         END
 590 };
 591
 592 static const u8 mtl_rcs_offsets[] = {
 593         NOP(1),
 594         LRI(15, POSTED),
 595         REG16(0x244),
 596         REG(0x034),
 597         REG(0x030),
 598         REG(0x038),
 599         REG(0x03c),
 600         REG(0x168),
 601         REG(0x140),
 602         REG(0x110),
 603         REG(0x1c0),
 604         REG(0x1c4),
 605         REG(0x1c8),
 606         REG(0x180),
 607         REG16(0x2b4),
 608         REG(0x120),
 609         REG(0x124),
 610
 611         NOP(1),
 612         LRI(9, POSTED),
 613         REG16(0x3a8),
 614         REG16(0x28c),
 615         REG16(0x288),
 616         REG16(0x284),
 617         REG16(0x280),
 618         REG16(0x27c),
 619         REG16(0x278),
 620         REG16(0x274),
 621         REG16(0x270),
 622
 623         NOP(2),
 624         LRI(2, POSTED),
 625         REG16(0x5a8),
 626         REG16(0x5ac),
 627
 628         NOP(6),
 629         LRI(1, 0),
 630         REG(0x0c8),
 631
 632         END
 633 };
 634
 635 #undef END
 636 #undef REG16
 637 #undef REG
 638 #undef LRI
 639 #undef NOP
 640
 641 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
 642 {
 643         /*
 644          * The gen12+ lists only have the registers we program in the basic
 645          * default state. We rely on the context image using relative
 646          * addressing to automatic fixup the register state between the
 647          * physical engines for virtual engine.
 648          */
 649         GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
 650                    !intel_engine_has_relative_mmio(engine));
 651
 652         if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
 653                 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70))
 654                         return mtl_rcs_offsets;
 655                 else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
 656                         return dg2_rcs_offsets;
 657                 else if (GRAPHICS_VER(engine->i915) >= 12)
 658                         return gen12_rcs_offsets;
 659                 else if (GRAPHICS_VER(engine->i915) >= 11)
 660                         return gen11_rcs_offsets;
 661                 else if (GRAPHICS_VER(engine->i915) >= 9)
 662                         return gen9_rcs_offsets;
 663                 else
 664                         return gen8_rcs_offsets;
 665         } else {
 666                 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
 667                         return dg2_xcs_offsets;
 668                 else if (GRAPHICS_VER(engine->i915) >= 12)
 669                         return gen12_xcs_offsets;
 670                 else if (GRAPHICS_VER(engine->i915) >= 9)
 671                         return gen9_xcs_offsets;
 672                 else
 673                         return gen8_xcs_offsets;
 674         }
 675 }
 676
 677 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
 678 {
 679         if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
 680                 return 0x70;
 681         else if (GRAPHICS_VER(engine->i915) >= 12)
 682                 return 0x60;
 683         else if (GRAPHICS_VER(engine->i915) >= 9)
 684                 return 0x54;
 685         else if (engine->class == RENDER_CLASS)
 686                 return 0x58;
 687         else
 688                 return -1;
 689 }
 690
 691 static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
 692 {
 693         if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
 694                 return 0x80;
 695         else if (GRAPHICS_VER(engine->i915) >= 12)
 696                 return 0x70;
 697         else if (GRAPHICS_VER(engine->i915) >= 9)
 698                 return 0x64;
 699         else if (GRAPHICS_VER(engine->i915) >= 8 &&
 700                  engine->class == RENDER_CLASS)
 701                 return 0xc4;
 702         else
 703                 return -1;
 704 }
 705
 706 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
 707 {
 708         if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
 709                 return 0x84;
 710         else if (GRAPHICS_VER(engine->i915) >= 12)
 711                 return 0x74;
 712         else if (GRAPHICS_VER(engine->i915) >= 9)
 713                 return 0x68;
 714         else if (engine->class == RENDER_CLASS)
 715                 return 0xd8;
 716         else
 717                 return -1;
 718 }
 719
 720 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
 721 {
 722         if (GRAPHICS_VER(engine->i915) >= 12)
 723                 return 0x12;
 724         else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
 725                 return 0x18;
 726         else
 727                 return -1;
 728 }
 729
 730 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
 731 {
 732         int x;
 733
 734         x = lrc_ring_wa_bb_per_ctx(engine);
 735         if (x < 0)
 736                 return x;
 737
 738         return x + 2;
 739 }
 740
 741 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
 742 {
 743         int x;
 744
 745         x = lrc_ring_indirect_ptr(engine);
 746         if (x < 0)
 747                 return x;
 748
 749         return x + 2;
 750 }
 751
 752 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
 753 {
 754
 755         if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
 756                 /*
 757                  * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
 758                  * simply to match the RCS context image layout.
 759                  */
 760                 return 0xc6;
 761         else if (engine->class != RENDER_CLASS)
 762                 return -1;
 763         else if (GRAPHICS_VER(engine->i915) >= 12)
 764                 return 0xb6;
 765         else if (GRAPHICS_VER(engine->i915) >= 11)
 766                 return 0xaa;
 767         else
 768                 return -1;
 769 }
 770
 771 static u32
 772 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
 773 {
 774         if (GRAPHICS_VER(engine->i915) >= 12)
 775                 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 776         else if (GRAPHICS_VER(engine->i915) >= 11)
 777                 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 778         else if (GRAPHICS_VER(engine->i915) >= 9)
 779                 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 780         else if (GRAPHICS_VER(engine->i915) >= 8)
 781                 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 782
 783         GEM_BUG_ON(GRAPHICS_VER(engine->i915) < 8);
 784
 785         return 0;
 786 }
 787
 788 static void
 789 lrc_setup_bb_per_ctx(u32 *regs,
 790                      const struct intel_engine_cs *engine,
 791                      u32 ctx_bb_ggtt_addr)
 792 {
 793         GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
 794         regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
 795                 ctx_bb_ggtt_addr |
 796                 PER_CTX_BB_FORCE |
 797                 PER_CTX_BB_VALID;
 798 }
 799
 800 static void
 801 lrc_setup_indirect_ctx(u32 *regs,
 802                        const struct intel_engine_cs *engine,
 803                        u32 ctx_bb_ggtt_addr,
 804                        u32 size)
 805 {
 806         GEM_BUG_ON(!size);
 807         GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
 808         GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
 809         regs[lrc_ring_indirect_ptr(engine) + 1] =
 810                 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
 811
 812         GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
 813         regs[lrc_ring_indirect_offset(engine) + 1] =
 814                 lrc_ring_indirect_offset_default(engine) << 6;
 815 }
 816
 817 static bool ctx_needs_runalone(const struct intel_context *ce)
 818 {
 819         struct i915_gem_context *gem_ctx;
 820         bool ctx_is_protected = false;
 821
 822         /*
 823          * Wa_14019159160 - Case 2.
 824          * On some platforms, protected contexts require setting
 825          * the LRC run-alone bit or else the encryption/decryption will not happen.
 826          * NOTE: Case 2 only applies to PXP use-case of said workaround.
 827          */
 828         if (GRAPHICS_VER_FULL(ce->engine->i915) >= IP_VER(12, 70) &&
 829             (ce->engine->class == COMPUTE_CLASS || ce->engine->class == RENDER_CLASS)) {
 830                 rcu_read_lock();
 831                 gem_ctx = rcu_dereference(ce->gem_context);
 832                 if (gem_ctx)
 833                         ctx_is_protected = gem_ctx->uses_protected_content;
 834                 rcu_read_unlock();
 835         }
 836
 837         return ctx_is_protected;
 838 }
 839
 840 static void init_common_regs(u32 * const regs,
 841                              const struct intel_context *ce,
 842                              const struct intel_engine_cs *engine,
 843                              bool inhibit)
 844 {
 845         u32 ctl;
 846         int loc;
 847
 848         ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
 849         ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
 850         if (inhibit)
 851                 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
 852         if (GRAPHICS_VER(engine->i915) < 11)
 853                 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
 854                                            CTX_CTRL_RS_CTX_ENABLE);
 855         /* Wa_14019159160 - Case 2.*/
 856         if (ctx_needs_runalone(ce))
 857                 ctl |= _MASKED_BIT_ENABLE(GEN12_CTX_CTRL_RUNALONE_MODE);
 858         regs[CTX_CONTEXT_CONTROL] = ctl;
 859
 860         regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
 861
 862         loc = lrc_ring_bb_offset(engine);
 863         if (loc != -1)
 864                 regs[loc + 1] = 0;
 865 }
 866
 867 static void init_wa_bb_regs(u32 * const regs,
 868                             const struct intel_engine_cs *engine)
 869 {
 870         const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
 871
 872         if (wa_ctx->per_ctx.size) {
 873                 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
 874
 875                 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
 876                 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
 877                         (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
 878         }
 879
 880         if (wa_ctx->indirect_ctx.size) {
 881                 lrc_setup_indirect_ctx(regs, engine,
 882                                        i915_ggtt_offset(wa_ctx->vma) +
 883                                        wa_ctx->indirect_ctx.offset,
 884                                        wa_ctx->indirect_ctx.size);
 885         }
 886 }
 887
 888 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
 889 {
 890         if (i915_vm_is_4lvl(&ppgtt->vm)) {
 891                 /* 64b PPGTT (48bit canonical)
 892                  * PDP0_DESCRIPTOR contains the base address to PML4 and
 893                  * other PDP Descriptors are ignored.
 894                  */
 895                 ASSIGN_CTX_PML4(ppgtt, regs);
 896         } else {
 897                 ASSIGN_CTX_PDP(ppgtt, regs, 3);
 898                 ASSIGN_CTX_PDP(ppgtt, regs, 2);
 899                 ASSIGN_CTX_PDP(ppgtt, regs, 1);
 900                 ASSIGN_CTX_PDP(ppgtt, regs, 0);
 901         }
 902 }
 903
 904 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
 905 {
 906         if (i915_is_ggtt(vm))
 907                 return i915_vm_to_ggtt(vm)->alias;
 908         else
 909                 return i915_vm_to_ppgtt(vm);
 910 }
 911
 912 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
 913 {
 914         int x;
 915
 916         x = lrc_ring_mi_mode(engine);
 917         if (x != -1) {
 918                 regs[x + 1] &= ~STOP_RING;
 919                 regs[x + 1] |= STOP_RING << 16;
 920         }
 921 }
 922
 923 static void __lrc_init_regs(u32 *regs,
 924                             const struct intel_context *ce,
 925                             const struct intel_engine_cs *engine,
 926                             bool inhibit)
 927 {
 928         /*
 929          * A context is actually a big batch buffer with several
 930          * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
 931          * values we are setting here are only for the first context restore:
 932          * on a subsequent save, the GPU will recreate this batchbuffer with new
 933          * values (including all the missing MI_LOAD_REGISTER_IMM commands that
 934          * we are not initializing here).
 935          *
 936          * Must keep consistent with virtual_update_register_offsets().
 937          */
 938
 939         if (inhibit)
 940                 memset(regs, 0, PAGE_SIZE);
 941
 942         set_offsets(regs, reg_offsets(engine), engine, inhibit);
 943
 944         init_common_regs(regs, ce, engine, inhibit);
 945         init_ppgtt_regs(regs, vm_alias(ce->vm));
 946
 947         init_wa_bb_regs(regs, engine);
 948
 949         __reset_stop_ring(regs, engine);
 950 }
 951
 952 void lrc_init_regs(const struct intel_context *ce,
 953                    const struct intel_engine_cs *engine,
 954                    bool inhibit)
 955 {
 956         __lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
 957 }
 958
 959 void lrc_reset_regs(const struct intel_context *ce,
 960                     const struct intel_engine_cs *engine)
 961 {
 962         __reset_stop_ring(ce->lrc_reg_state, engine);
 963 }
 964
 965 static void
 966 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
 967 {
 968         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
 969                 return;
 970
 971         vaddr += engine->context_size;
 972
 973         memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
 974 }
 975
 976 static void
 977 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
 978 {
 979         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
 980                 return;
 981
 982         vaddr += engine->context_size;
 983
 984         if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
 985                 drm_err_once(&engine->i915->drm,
 986                              "%s context redzone overwritten!\n",
 987                              engine->name);
 988 }
 989
 990 static u32 context_wa_bb_offset(const struct intel_context *ce)
 991 {
 992         return PAGE_SIZE * ce->wa_bb_page;
 993 }
 994
 995 /*
 996  * per_ctx below determines which WABB section is used.
 997  * When true, the function returns the location of the
 998  * PER_CTX_BB.  When false, the function returns the
 999  * location of the INDIRECT_CTX.
1000  */
1001 static u32 *context_wabb(const struct intel_context *ce, bool per_ctx)
1002 {
1003         void *ptr;
1004
1005         GEM_BUG_ON(!ce->wa_bb_page);
1006
1007         ptr = ce->lrc_reg_state;
1008         ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1009         ptr += context_wa_bb_offset(ce);
1010         ptr += per_ctx ? PAGE_SIZE : 0;
1011
1012         return ptr;
1013 }
1014
1015 void lrc_init_state(struct intel_context *ce,
1016                     struct intel_engine_cs *engine,
1017                     void *state)
1018 {
1019         bool inhibit = true;
1020
1021         set_redzone(state, engine);
1022
1023         if (ce->default_state) {
1024                 shmem_read(ce->default_state, 0, state, engine->context_size);
1025                 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
1026                 inhibit = false;
1027         }
1028
1029         /* Clear the ppHWSP (inc. per-context counters) */
1030         memset(state, 0, PAGE_SIZE);
1031
1032         /* Clear the indirect wa and storage */
1033         if (ce->wa_bb_page)
1034                 memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);
1035
1036         /*
1037          * The second page of the context object contains some registers which
1038          * must be set up prior to the first execution.
1039          */
1040         __lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
1041 }
1042
1043 u32 lrc_indirect_bb(const struct intel_context *ce)
1044 {
1045         return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
1046 }
1047
1048 static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
1049 {
1050         /* If predication is active, this will be noop'ed */
1051         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1052         *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1053         *cs++ = 0;
1054         *cs++ = 0; /* No predication */
1055
1056         /* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
1057         *cs++ = MI_BATCH_BUFFER_END | BIT(15);
1058         *cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;
1059
1060         /* Instructions are no longer predicated (disabled), we can proceed */
1061         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1062         *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1063         *cs++ = 0;
1064         *cs++ = 1; /* enable predication before the next BB */
1065
1066         *cs++ = MI_BATCH_BUFFER_END;
1067         GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);
1068
1069         return cs;
1070 }
1071
1072 static struct i915_vma *
1073 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
1074 {
1075         struct drm_i915_gem_object *obj;
1076         struct i915_vma *vma;
1077         u32 context_size;
1078
1079         context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
1080
1081         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1082                 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
1083
1084         if (GRAPHICS_VER(engine->i915) >= 12) {
1085                 ce->wa_bb_page = context_size / PAGE_SIZE;
1086                 /* INDIRECT_CTX and PER_CTX_BB need separate pages. */
1087                 context_size += PAGE_SIZE * 2;
1088         }
1089
1090         if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
1091                 ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
1092                 context_size += PARENT_SCRATCH_SIZE;
1093         }
1094
1095         obj = i915_gem_object_create_lmem(engine->i915, context_size,
1096                                           I915_BO_ALLOC_PM_VOLATILE);
1097         if (IS_ERR(obj)) {
1098                 obj = i915_gem_object_create_shmem(engine->i915, context_size);
1099                 if (IS_ERR(obj))
1100                         return ERR_CAST(obj);
1101
1102                 /*
1103                  * Wa_22016122933: For Media version 13.0, all Media GT shared
1104                  * memory needs to be mapped as WC on CPU side and UC (PAT
1105                  * index 2) on GPU side.
1106                  */
1107                 if (intel_gt_needs_wa_22016122933(engine->gt))
1108                         i915_gem_object_set_cache_coherency(obj, I915_CACHE_NONE);
1109         }
1110
1111         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1112         if (IS_ERR(vma)) {
1113                 i915_gem_object_put(obj);
1114                 return vma;
1115         }
1116
1117         return vma;
1118 }
1119
1120 static struct intel_timeline *
1121 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
1122 {
1123         struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
1124
1125         return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
1126 }
1127
1128 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
1129 {
1130         struct intel_ring *ring;
1131         struct i915_vma *vma;
1132         int err;
1133
1134         GEM_BUG_ON(ce->state);
1135
1136         if (!intel_context_has_own_state(ce))
1137                 ce->default_state = engine->default_state;
1138
1139         vma = __lrc_alloc_state(ce, engine);
1140         if (IS_ERR(vma))
1141                 return PTR_ERR(vma);
1142
1143         ring = intel_engine_create_ring(engine, ce->ring_size);
1144         if (IS_ERR(ring)) {
1145                 err = PTR_ERR(ring);
1146                 goto err_vma;
1147         }
1148
1149         if (!page_mask_bits(ce->timeline)) {
1150                 struct intel_timeline *tl;
1151
1152                 /*
1153                  * Use the static global HWSP for the kernel context, and
1154                  * a dynamically allocated cacheline for everyone else.
1155                  */
1156                 if (unlikely(ce->timeline))
1157                         tl = pinned_timeline(ce, engine);
1158                 else
1159                         tl = intel_timeline_create(engine->gt);
1160                 if (IS_ERR(tl)) {
1161                         err = PTR_ERR(tl);
1162                         goto err_ring;
1163                 }
1164
1165                 ce->timeline = tl;
1166         }
1167
1168         ce->ring = ring;
1169         ce->state = vma;
1170
1171         return 0;
1172
1173 err_ring:
1174         intel_ring_put(ring);
1175 err_vma:
1176         i915_vma_put(vma);
1177         return err;
1178 }
1179
1180 void lrc_reset(struct intel_context *ce)
1181 {
1182         GEM_BUG_ON(!intel_context_is_pinned(ce));
1183
1184         intel_ring_reset(ce->ring, ce->ring->emit);
1185
1186         /* Scrub away the garbage */
1187         lrc_init_regs(ce, ce->engine, true);
1188         ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
1189 }
1190
1191 int
1192 lrc_pre_pin(struct intel_context *ce,
1193             struct intel_engine_cs *engine,
1194             struct i915_gem_ww_ctx *ww,
1195             void **vaddr)
1196 {
1197         GEM_BUG_ON(!ce->state);
1198         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1199
1200         *vaddr = i915_gem_object_pin_map(ce->state->obj,
1201                                          intel_gt_coherent_map_type(ce->engine->gt,
1202                                                                     ce->state->obj,
1203                                                                     false) |
1204                                          I915_MAP_OVERRIDE);
1205
1206         return PTR_ERR_OR_ZERO(*vaddr);
1207 }
1208
1209 int
1210 lrc_pin(struct intel_context *ce,
1211         struct intel_engine_cs *engine,
1212         void *vaddr)
1213 {
1214         ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
1215
1216         if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
1217                 lrc_init_state(ce, engine, vaddr);
1218
1219         ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
1220         return 0;
1221 }
1222
1223 void lrc_unpin(struct intel_context *ce)
1224 {
1225         if (unlikely(ce->parallel.last_rq)) {
1226                 i915_request_put(ce->parallel.last_rq);
1227                 ce->parallel.last_rq = NULL;
1228         }
1229         check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
1230                       ce->engine);
1231 }
1232
1233 void lrc_post_unpin(struct intel_context *ce)
1234 {
1235         i915_gem_object_unpin_map(ce->state->obj);
1236 }
1237
1238 void lrc_fini(struct intel_context *ce)
1239 {
1240         if (!ce->state)
1241                 return;
1242
1243         intel_ring_put(fetch_and_zero(&ce->ring));
1244         i915_vma_put(fetch_and_zero(&ce->state));
1245 }
1246
1247 void lrc_destroy(struct kref *kref)
1248 {
1249         struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1250
1251         GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1252         GEM_BUG_ON(intel_context_is_pinned(ce));
1253
1254         lrc_fini(ce);
1255
1256         intel_context_fini(ce);
1257         intel_context_free(ce);
1258 }
1259
1260 static u32 *
1261 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1262 {
1263         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1264                 MI_SRM_LRM_GLOBAL_GTT |
1265                 MI_LRI_LRM_CS_MMIO;
1266         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1267         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1268                 CTX_TIMESTAMP * sizeof(u32);
1269         *cs++ = 0;
1270
1271         *cs++ = MI_LOAD_REGISTER_REG |
1272                 MI_LRR_SOURCE_CS_MMIO |
1273                 MI_LRI_LRM_CS_MMIO;
1274         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1275         *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1276
1277         *cs++ = MI_LOAD_REGISTER_REG |
1278                 MI_LRR_SOURCE_CS_MMIO |
1279                 MI_LRI_LRM_CS_MMIO;
1280         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1281         *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1282
1283         return cs;
1284 }
1285
1286 static u32 *
1287 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1288 {
1289         GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1290
1291         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1292                 MI_SRM_LRM_GLOBAL_GTT |
1293                 MI_LRI_LRM_CS_MMIO;
1294         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1295         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1296                 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1297         *cs++ = 0;
1298
1299         return cs;
1300 }
1301
1302 static u32 *
1303 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1304 {
1305         GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1306
1307         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1308                 MI_SRM_LRM_GLOBAL_GTT |
1309                 MI_LRI_LRM_CS_MMIO;
1310         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1311         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1312                 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1313         *cs++ = 0;
1314
1315         *cs++ = MI_LOAD_REGISTER_REG |
1316                 MI_LRR_SOURCE_CS_MMIO |
1317                 MI_LRI_LRM_CS_MMIO;
1318         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1319         *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1320
1321         return cs;
1322 }
1323
1324 /*
1325  * The bspec's tuning guide asks us to program a vertical watermark value of
1326  * 0x3FF.  However this register is not saved/restored properly by the
1327  * hardware, so we're required to apply the desired value via INDIRECT_CTX
1328  * batch buffer to ensure the value takes effect properly.  All other bits
1329  * in this register should remain at 0 (the hardware default).
1330  */
1331 static u32 *
1332 dg2_emit_draw_watermark_setting(u32 *cs)
1333 {
1334         *cs++ = MI_LOAD_REGISTER_IMM(1);
1335         *cs++ = i915_mmio_reg_offset(DRAW_WATERMARK);
1336         *cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF);
1337
1338         return cs;
1339 }
1340
1341 static u32 *
1342 gen12_invalidate_state_cache(u32 *cs)
1343 {
1344         *cs++ = MI_LOAD_REGISTER_IMM(1);
1345         *cs++ = i915_mmio_reg_offset(GEN12_CS_DEBUG_MODE2);
1346         *cs++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
1347         return cs;
1348 }
1349
1350 static u32 *
1351 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1352 {
1353         cs = gen12_emit_timestamp_wa(ce, cs);
1354         cs = gen12_emit_cmd_buf_wa(ce, cs);
1355         cs = gen12_emit_restore_scratch(ce, cs);
1356
1357         /* Wa_16013000631:dg2 */
1358         if (IS_DG2_G11(ce->engine->i915))
1359                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
1360
1361         cs = gen12_emit_aux_table_inv(ce->engine, cs);
1362
1363         /* Wa_18022495364 */
1364         if (IS_GFX_GT_IP_RANGE(ce->engine->gt, IP_VER(12, 0), IP_VER(12, 10)))
1365                 cs = gen12_invalidate_state_cache(cs);
1366
1367         /* Wa_16014892111 */
1368         if (IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
1369             IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
1370             IS_DG2(ce->engine->i915))
1371                 cs = dg2_emit_draw_watermark_setting(cs);
1372
1373         return cs;
1374 }
1375
1376 static u32 *
1377 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1378 {
1379         cs = gen12_emit_timestamp_wa(ce, cs);
1380         cs = gen12_emit_restore_scratch(ce, cs);
1381
1382         /* Wa_16013000631:dg2 */
1383         if (IS_DG2_G11(ce->engine->i915))
1384                 if (ce->engine->class == COMPUTE_CLASS)
1385                         cs = gen8_emit_pipe_control(cs,
1386                                                     PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
1387                                                     0);
1388
1389         return gen12_emit_aux_table_inv(ce->engine, cs);
1390 }
1391
1392 static u32 *xehp_emit_fastcolor_blt_wabb(const struct intel_context *ce, u32 *cs)
1393 {
1394         struct intel_gt *gt = ce->engine->gt;
1395         int mocs = gt->mocs.uc_index << 1;
1396
1397         /**
1398          * Wa_16018031267 / Wa_16018063123 requires that SW forces the
1399          * main copy engine arbitration into round robin mode.  We
1400          * additionally need to submit the following WABB blt command
1401          * to produce 4 subblits with each subblit generating 0 byte
1402          * write requests as WABB:
1403          *
1404          * XY_FASTCOLOR_BLT
1405          *  BG0    -> 5100000E
1406          *  BG1    -> 0000003F (Dest pitch)
1407          *  BG2    -> 00000000 (X1, Y1) = (0, 0)
1408          *  BG3    -> 00040001 (X2, Y2) = (1, 4)
1409          *  BG4    -> scratch
1410          *  BG5    -> scratch
1411          *  BG6-12 -> 00000000
1412          *  BG13   -> 20004004 (Surf. Width= 2,Surf. Height = 5 )
1413          *  BG14   -> 00000010 (Qpitch = 4)
1414          *  BG15   -> 00000000
1415          */
1416         *cs++ = XY_FAST_COLOR_BLT_CMD | (16 - 2);
1417         *cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) | 0x3f;
1418         *cs++ = 0;
1419         *cs++ = 4 << 16 | 1;
1420         *cs++ = lower_32_bits(i915_vma_offset(ce->vm->rsvd.vma));
1421         *cs++ = upper_32_bits(i915_vma_offset(ce->vm->rsvd.vma));
1422         *cs++ = 0;
1423         *cs++ = 0;
1424         *cs++ = 0;
1425         *cs++ = 0;
1426         *cs++ = 0;
1427         *cs++ = 0;
1428         *cs++ = 0;
1429         *cs++ = 0x20004004;
1430         *cs++ = 0x10;
1431         *cs++ = 0;
1432
1433         return cs;
1434 }
1435
1436 static u32 *
1437 xehp_emit_per_ctx_bb(const struct intel_context *ce, u32 *cs)
1438 {
1439         /* Wa_16018031267, Wa_16018063123 */
1440         if (NEEDS_FASTCOLOR_BLT_WABB(ce->engine))
1441                 cs = xehp_emit_fastcolor_blt_wabb(ce, cs);
1442
1443         return cs;
1444 }
1445
1446 static void
1447 setup_per_ctx_bb(const struct intel_context *ce,
1448                  const struct intel_engine_cs *engine,
1449                  u32 *(*emit)(const struct intel_context *, u32 *))
1450 {
1451         /* Place PER_CTX_BB on next page after INDIRECT_CTX */
1452         u32 * const start = context_wabb(ce, true);
1453         u32 *cs;
1454
1455         cs = emit(ce, start);
1456
1457         /* PER_CTX_BB must manually terminate */
1458         *cs++ = MI_BATCH_BUFFER_END;
1459
1460         GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1461         lrc_setup_bb_per_ctx(ce->lrc_reg_state, engine,
1462                              lrc_indirect_bb(ce) + PAGE_SIZE);
1463 }
1464
1465 static void
1466 setup_indirect_ctx_bb(const struct intel_context *ce,
1467                       const struct intel_engine_cs *engine,
1468                       u32 *(*emit)(const struct intel_context *, u32 *))
1469 {
1470         u32 * const start = context_wabb(ce, false);
1471         u32 *cs;
1472
1473         cs = emit(ce, start);
1474         GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1475         while ((unsigned long)cs % CACHELINE_BYTES)
1476                 *cs++ = MI_NOOP;
1477
1478         GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
1479         setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));
1480
1481         lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1482                                lrc_indirect_bb(ce),
1483                                (cs - start) * sizeof(*cs));
1484 }
1485
1486 /*
1487  * The context descriptor encodes various attributes of a context,
1488  * including its GTT address and some flags. Because it's fairly
1489  * expensive to calculate, we'll just do it once and cache the result,
1490  * which remains valid until the context is unpinned.
1491  *
1492  * This is what a descriptor looks like, from LSB to MSB::
1493  *
1494  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1495  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1496  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1497  *      bits 53-54:    mbz, reserved for use by hardware
1498  *      bits 55-63:    group ID, currently unused and set to 0
1499  *
1500  * Starting from Gen11, the upper dword of the descriptor has a new format:
1501  *
1502  *      bits 32-36:    reserved
1503  *      bits 37-47:    SW context ID
1504  *      bits 48:53:    engine instance
1505  *      bit 54:        mbz, reserved for use by hardware
1506  *      bits 55-60:    SW counter
1507  *      bits 61-63:    engine class
1508  *
1509  * On Xe_HP, the upper dword of the descriptor has a new format:
1510  *
1511  *      bits 32-37:    virtual function number
1512  *      bit 38:        mbz, reserved for use by hardware
1513  *      bits 39-54:    SW context ID
1514  *      bits 55-57:    reserved
1515  *      bits 58-63:    SW counter
1516  *
1517  * engine info, SW context ID and SW counter need to form a unique number
1518  * (Context ID) per lrc.
1519  */
1520 static u32 lrc_descriptor(const struct intel_context *ce)
1521 {
1522         u32 desc;
1523
1524         desc = INTEL_LEGACY_32B_CONTEXT;
1525         if (i915_vm_is_4lvl(ce->vm))
1526                 desc = INTEL_LEGACY_64B_CONTEXT;
1527         desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1528
1529         desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1530         if (GRAPHICS_VER(ce->vm->i915) == 8)
1531                 desc |= GEN8_CTX_L3LLC_COHERENT;
1532
1533         return i915_ggtt_offset(ce->state) | desc;
1534 }
1535
1536 u32 lrc_update_regs(const struct intel_context *ce,
1537                     const struct intel_engine_cs *engine,
1538                     u32 head)
1539 {
1540         struct intel_ring *ring = ce->ring;
1541         u32 *regs = ce->lrc_reg_state;
1542
1543         GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1544         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1545
1546         regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1547         regs[CTX_RING_HEAD] = head;
1548         regs[CTX_RING_TAIL] = ring->tail;
1549         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1550
1551         /* RPCS */
1552         if (engine->class == RENDER_CLASS) {
1553                 regs[CTX_R_PWR_CLK_STATE] =
1554                         intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1555
1556                 i915_oa_init_reg_state(ce, engine);
1557         }
1558
1559         if (ce->wa_bb_page) {
1560                 u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1561
1562                 fn = gen12_emit_indirect_ctx_xcs;
1563                 if (ce->engine->class == RENDER_CLASS)
1564                         fn = gen12_emit_indirect_ctx_rcs;
1565
1566                 /* Mutually exclusive wrt to global indirect bb */
1567                 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1568                 setup_indirect_ctx_bb(ce, engine, fn);
1569                 setup_per_ctx_bb(ce, engine, xehp_emit_per_ctx_bb);
1570         }
1571
1572         return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1573 }
1574
1575 void lrc_update_offsets(struct intel_context *ce,
1576                         struct intel_engine_cs *engine)
1577 {
1578         set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1579 }
1580
1581 void lrc_check_regs(const struct intel_context *ce,
1582                     const struct intel_engine_cs *engine,
1583                     const char *when)
1584 {
1585         const struct intel_ring *ring = ce->ring;
1586         u32 *regs = ce->lrc_reg_state;
1587         bool valid = true;
1588         int x;
1589
1590         if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1591                 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1592                        engine->name,
1593                        regs[CTX_RING_START],
1594                        i915_ggtt_offset(ring->vma));
1595                 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1596                 valid = false;
1597         }
1598
1599         if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1600             (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1601                 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1602                        engine->name,
1603                        regs[CTX_RING_CTL],
1604                        (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1605                 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1606                 valid = false;
1607         }
1608
1609         x = lrc_ring_mi_mode(engine);
1610         if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1611                 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1612                        engine->name, regs[x + 1]);
1613                 regs[x + 1] &= ~STOP_RING;
1614                 regs[x + 1] |= STOP_RING << 16;
1615                 valid = false;
1616         }
1617
1618         WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1619 }
1620
1621 /*
1622  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1623  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1624  * but there is a slight complication as this is applied in WA batch where the
1625  * values are only initialized once so we cannot take register value at the
1626  * beginning and reuse it further; hence we save its value to memory, upload a
1627  * constant value with bit21 set and then we restore it back with the saved value.
1628  * To simplify the WA, a constant value is formed by using the default value
1629  * of this register. This shouldn't be a problem because we are only modifying
1630  * it for a short period and this batch in non-premptible. We can ofcourse
1631  * use additional instructions that read the actual value of the register
1632  * at that time and set our bit of interest but it makes the WA complicated.
1633  *
1634  * This WA is also required for Gen9 so extracting as a function avoids
1635  * code duplication.
1636  */
1637 static u32 *
1638 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1639 {
1640         /* NB no one else is allowed to scribble over scratch + 256! */
1641         *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1642         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1643         *batch++ = intel_gt_scratch_offset(engine->gt,
1644                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1645         *batch++ = 0;
1646
1647         *batch++ = MI_LOAD_REGISTER_IMM(1);
1648         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1649         *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1650
1651         batch = gen8_emit_pipe_control(batch,
1652                                        PIPE_CONTROL_CS_STALL |
1653                                        PIPE_CONTROL_DC_FLUSH_ENABLE,
1654                                        0);
1655
1656         *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1657         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1658         *batch++ = intel_gt_scratch_offset(engine->gt,
1659                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1660         *batch++ = 0;
1661
1662         return batch;
1663 }
1664
1665 /*
1666  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1667  * initialized at the beginning and shared across all contexts but this field
1668  * helps us to have multiple batches at different offsets and select them based
1669  * on a criteria. At the moment this batch always start at the beginning of the page
1670  * and at this point we don't have multiple wa_ctx batch buffers.
1671  *
1672  * The number of WA applied are not known at the beginning; we use this field
1673  * to return the no of DWORDS written.
1674  *
1675  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1676  * so it adds NOOPs as padding to make it cacheline aligned.
1677  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
1678  * makes a complete batch buffer.
1679  */
1680 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1681 {
1682         /* WaDisableCtxRestoreArbitration:bdw,chv */
1683         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1684
1685         /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1686         if (IS_BROADWELL(engine->i915))
1687                 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1688
1689         /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1690         /* Actual scratch location is at 128 bytes offset */
1691         batch = gen8_emit_pipe_control(batch,
1692                                        PIPE_CONTROL_FLUSH_L3 |
1693                                        PIPE_CONTROL_STORE_DATA_INDEX |
1694                                        PIPE_CONTROL_CS_STALL |
1695                                        PIPE_CONTROL_QW_WRITE,
1696                                        LRC_PPHWSP_SCRATCH_ADDR);
1697
1698         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1699
1700         /* Pad to end of cacheline */
1701         while ((unsigned long)batch % CACHELINE_BYTES)
1702                 *batch++ = MI_NOOP;
1703
1704         /*
1705          * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1706          * execution depends on the length specified in terms of cache lines
1707          * in the register CTX_RCS_INDIRECT_CTX
1708          */
1709
1710         return batch;
1711 }
1712
1713 struct lri {
1714         i915_reg_t reg;
1715         u32 value;
1716 };
1717
1718 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1719 {
1720         GEM_BUG_ON(!count || count > 63);
1721
1722         *batch++ = MI_LOAD_REGISTER_IMM(count);
1723         do {
1724                 *batch++ = i915_mmio_reg_offset(lri->reg);
1725                 *batch++ = lri->value;
1726         } while (lri++, --count);
1727         *batch++ = MI_NOOP;
1728
1729         return batch;
1730 }
1731
1732 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1733 {
1734         static const struct lri lri[] = {
1735                 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1736                 {
1737                         COMMON_SLICE_CHICKEN2,
1738                         __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1739                                        0),
1740                 },
1741
1742                 /* BSpec: 11391 */
1743                 {
1744                         FF_SLICE_CHICKEN,
1745                         __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1746                                        FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1747                 },
1748
1749                 /* BSpec: 11299 */
1750                 {
1751                         _3D_CHICKEN3,
1752                         __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1753                                        _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1754                 }
1755         };
1756
1757         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1758
1759         /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1760         batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1761
1762         /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1763         batch = gen8_emit_pipe_control(batch,
1764                                        PIPE_CONTROL_FLUSH_L3 |
1765                                        PIPE_CONTROL_STORE_DATA_INDEX |
1766                                        PIPE_CONTROL_CS_STALL |
1767                                        PIPE_CONTROL_QW_WRITE,
1768                                        LRC_PPHWSP_SCRATCH_ADDR);
1769
1770         batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1771
1772         /* WaMediaPoolStateCmdInWABB:bxt,glk */
1773         if (HAS_POOLED_EU(engine->i915)) {
1774                 /*
1775                  * EU pool configuration is setup along with golden context
1776                  * during context initialization. This value depends on
1777                  * device type (2x6 or 3x6) and needs to be updated based
1778                  * on which subslice is disabled especially for 2x6
1779                  * devices, however it is safe to load default
1780                  * configuration of 3x6 device instead of masking off
1781                  * corresponding bits because HW ignores bits of a disabled
1782                  * subslice and drops down to appropriate config. Please
1783                  * see render_state_setup() in i915_gem_render_state.c for
1784                  * possible configurations, to avoid duplication they are
1785                  * not shown here again.
1786                  */
1787                 *batch++ = GEN9_MEDIA_POOL_STATE;
1788                 *batch++ = GEN9_MEDIA_POOL_ENABLE;
1789                 *batch++ = 0x00777000;
1790                 *batch++ = 0;
1791                 *batch++ = 0;
1792                 *batch++ = 0;
1793         }
1794
1795         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1796
1797         /* Pad to end of cacheline */
1798         while ((unsigned long)batch % CACHELINE_BYTES)
1799                 *batch++ = MI_NOOP;
1800
1801         return batch;
1802 }
1803
1804 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1805
1806 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1807 {
1808         struct drm_i915_gem_object *obj;
1809         struct i915_vma *vma;
1810         int err;
1811
1812         obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1813         if (IS_ERR(obj))
1814                 return PTR_ERR(obj);
1815
1816         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1817         if (IS_ERR(vma)) {
1818                 err = PTR_ERR(vma);
1819                 goto err;
1820         }
1821
1822         engine->wa_ctx.vma = vma;
1823         return 0;
1824
1825 err:
1826         i915_gem_object_put(obj);
1827         return err;
1828 }
1829
1830 void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1831 {
1832         i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1833 }
1834
1835 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1836
1837 void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1838 {
1839         struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1840         struct i915_wa_ctx_bb *wa_bb[] = {
1841                 &wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1842         };
1843         wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1844         struct i915_gem_ww_ctx ww;
1845         void *batch, *batch_ptr;
1846         unsigned int i;
1847         int err;
1848
1849         if (GRAPHICS_VER(engine->i915) >= 11 ||
1850             !(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
1851                 return;
1852
1853         if (GRAPHICS_VER(engine->i915) == 9) {
1854                 wa_bb_fn[0] = gen9_init_indirectctx_bb;
1855                 wa_bb_fn[1] = NULL;
1856         } else if (GRAPHICS_VER(engine->i915) == 8) {
1857                 wa_bb_fn[0] = gen8_init_indirectctx_bb;
1858                 wa_bb_fn[1] = NULL;
1859         }
1860
1861         err = lrc_create_wa_ctx(engine);
1862         if (err) {
1863                 /*
1864                  * We continue even if we fail to initialize WA batch
1865                  * because we only expect rare glitches but nothing
1866                  * critical to prevent us from using GPU
1867                  */
1868                 drm_err(&engine->i915->drm,
1869                         "Ignoring context switch w/a allocation error:%d\n",
1870                         err);
1871                 return;
1872         }
1873
1874         if (!engine->wa_ctx.vma)
1875                 return;
1876
1877         i915_gem_ww_ctx_init(&ww, true);
1878 retry:
1879         err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1880         if (!err)
1881                 err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1882         if (err)
1883                 goto err;
1884
1885         batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1886         if (IS_ERR(batch)) {
1887                 err = PTR_ERR(batch);
1888                 goto err_unpin;
1889         }
1890
1891         /*
1892          * Emit the two workaround batch buffers, recording the offset from the
1893          * start of the workaround batch buffer object for each and their
1894          * respective sizes.
1895          */
1896         batch_ptr = batch;
1897         for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1898                 wa_bb[i]->offset = batch_ptr - batch;
1899                 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1900                                                   CACHELINE_BYTES))) {
1901                         err = -EINVAL;
1902                         break;
1903                 }
1904                 if (wa_bb_fn[i])
1905                         batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1906                 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1907         }
1908         GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1909
1910         __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1911         __i915_gem_object_release_map(wa_ctx->vma->obj);
1912
1913         /* Verify that we can handle failure to setup the wa_ctx */
1914         if (!err)
1915                 err = i915_inject_probe_error(engine->i915, -ENODEV);
1916
1917 err_unpin:
1918         if (err)
1919                 i915_vma_unpin(wa_ctx->vma);
1920 err:
1921         if (err == -EDEADLK) {
1922                 err = i915_gem_ww_ctx_backoff(&ww);
1923                 if (!err)
1924                         goto retry;
1925         }
1926         i915_gem_ww_ctx_fini(&ww);
1927
1928         if (err) {
1929                 i915_vma_put(engine->wa_ctx.vma);
1930
1931                 /* Clear all flags to prevent further use */
1932                 memset(wa_ctx, 0, sizeof(*wa_ctx));
1933         }
1934 }
1935
1936 static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
1937 {
1938 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1939         stats->runtime.num_underflow++;
1940         stats->runtime.max_underflow =
1941                 max_t(u32, stats->runtime.max_underflow, -dt);
1942 #endif
1943 }
1944
1945 static u32 lrc_get_runtime(const struct intel_context *ce)
1946 {
1947         /*
1948          * We can use either ppHWSP[16] which is recorded before the context
1949          * switch (and so excludes the cost of context switches) or use the
1950          * value from the context image itself, which is saved/restored earlier
1951          * and so includes the cost of the save.
1952          */
1953         return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
1954 }
1955
1956 void lrc_update_runtime(struct intel_context *ce)
1957 {
1958         struct intel_context_stats *stats = &ce->stats;
1959         u32 old;
1960         s32 dt;
1961
1962         old = stats->runtime.last;
1963         stats->runtime.last = lrc_get_runtime(ce);
1964         dt = stats->runtime.last - old;
1965         if (!dt)
1966                 return;
1967
1968         if (unlikely(dt < 0)) {
1969                 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1970                          old, stats->runtime.last, dt);
1971                 st_runtime_underflow(stats, dt);
1972                 return;
1973         }
1974
1975         ewma_runtime_add(&stats->runtime.avg, dt);
1976         stats->runtime.total += dt;
1977 }
1978
1979 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1980 #include "selftest_lrc.c"
1981 #endif