drivers/gpu/drm/i915/gt/intel_lrc.c

   1 // SPDX-License-Identifier: MIT
   2 /*
   3  * Copyright © 2014 Intel Corporation
   4  */
   5
   6 #include "gem/i915_gem_lmem.h"
   7
   8 #include "gen8_engine_cs.h"
   9 #include "i915_drv.h"
  10 #include "i915_perf.h"
  11 #include "i915_reg.h"
  12 #include "intel_context.h"
  13 #include "intel_engine.h"
  14 #include "intel_engine_regs.h"
  15 #include "intel_gpu_commands.h"
  16 #include "intel_gt.h"
  17 #include "intel_gt_regs.h"
  18 #include "intel_lrc.h"
  19 #include "intel_lrc_reg.h"
  20 #include "intel_ring.h"
  21 #include "shmem_utils.h"
  22
  23 static void set_offsets(u32 *regs,
  24                         const u8 *data,
  25                         const struct intel_engine_cs *engine,
  26                         bool close)
  27 #define NOP(x) (BIT(7) | (x))
  28 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
  29 #define POSTED BIT(0)
  30 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
  31 #define REG16(x) \
  32         (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
  33         (((x) >> 2) & 0x7f)
  34 #define END 0
  35 {
  36         const u32 base = engine->mmio_base;
  37
  38         while (*data) {
  39                 u8 count, flags;
  40
  41                 if (*data & BIT(7)) { /* skip */
  42                         count = *data++ & ~BIT(7);
  43                         regs += count;
  44                         continue;
  45                 }
  46
  47                 count = *data & 0x3f;
  48                 flags = *data >> 6;
  49                 data++;
  50
  51                 *regs = MI_LOAD_REGISTER_IMM(count);
  52                 if (flags & POSTED)
  53                         *regs |= MI_LRI_FORCE_POSTED;
  54                 if (GRAPHICS_VER(engine->i915) >= 11)
  55                         *regs |= MI_LRI_LRM_CS_MMIO;
  56                 regs++;
  57
  58                 GEM_BUG_ON(!count);
  59                 do {
  60                         u32 offset = 0;
  61                         u8 v;
  62
  63                         do {
  64                                 v = *data++;
  65                                 offset <<= 7;
  66                                 offset |= v & ~BIT(7);
  67                         } while (v & BIT(7));
  68
  69                         regs[0] = base + (offset << 2);
  70                         regs += 2;
  71                 } while (--count);
  72         }
  73
  74         if (close) {
  75                 /* Close the batch; used mainly by live_lrc_layout() */
  76                 *regs = MI_BATCH_BUFFER_END;
  77                 if (GRAPHICS_VER(engine->i915) >= 11)
  78                         *regs |= BIT(0);
  79         }
  80 }
  81
  82 static const u8 gen8_xcs_offsets[] = {
  83         NOP(1),
  84         LRI(11, 0),
  85         REG16(0x244),
  86         REG(0x034),
  87         REG(0x030),
  88         REG(0x038),
  89         REG(0x03c),
  90         REG(0x168),
  91         REG(0x140),
  92         REG(0x110),
  93         REG(0x11c),
  94         REG(0x114),
  95         REG(0x118),
  96
  97         NOP(9),
  98         LRI(9, 0),
  99         REG16(0x3a8),
 100         REG16(0x28c),
 101         REG16(0x288),
 102         REG16(0x284),
 103         REG16(0x280),
 104         REG16(0x27c),
 105         REG16(0x278),
 106         REG16(0x274),
 107         REG16(0x270),
 108
 109         NOP(13),
 110         LRI(2, 0),
 111         REG16(0x200),
 112         REG(0x028),
 113
 114         END
 115 };
 116
 117 static const u8 gen9_xcs_offsets[] = {
 118         NOP(1),
 119         LRI(14, POSTED),
 120         REG16(0x244),
 121         REG(0x034),
 122         REG(0x030),
 123         REG(0x038),
 124         REG(0x03c),
 125         REG(0x168),
 126         REG(0x140),
 127         REG(0x110),
 128         REG(0x11c),
 129         REG(0x114),
 130         REG(0x118),
 131         REG(0x1c0),
 132         REG(0x1c4),
 133         REG(0x1c8),
 134
 135         NOP(3),
 136         LRI(9, POSTED),
 137         REG16(0x3a8),
 138         REG16(0x28c),
 139         REG16(0x288),
 140         REG16(0x284),
 141         REG16(0x280),
 142         REG16(0x27c),
 143         REG16(0x278),
 144         REG16(0x274),
 145         REG16(0x270),
 146
 147         NOP(13),
 148         LRI(1, POSTED),
 149         REG16(0x200),
 150
 151         NOP(13),
 152         LRI(44, POSTED),
 153         REG(0x028),
 154         REG(0x09c),
 155         REG(0x0c0),
 156         REG(0x178),
 157         REG(0x17c),
 158         REG16(0x358),
 159         REG(0x170),
 160         REG(0x150),
 161         REG(0x154),
 162         REG(0x158),
 163         REG16(0x41c),
 164         REG16(0x600),
 165         REG16(0x604),
 166         REG16(0x608),
 167         REG16(0x60c),
 168         REG16(0x610),
 169         REG16(0x614),
 170         REG16(0x618),
 171         REG16(0x61c),
 172         REG16(0x620),
 173         REG16(0x624),
 174         REG16(0x628),
 175         REG16(0x62c),
 176         REG16(0x630),
 177         REG16(0x634),
 178         REG16(0x638),
 179         REG16(0x63c),
 180         REG16(0x640),
 181         REG16(0x644),
 182         REG16(0x648),
 183         REG16(0x64c),
 184         REG16(0x650),
 185         REG16(0x654),
 186         REG16(0x658),
 187         REG16(0x65c),
 188         REG16(0x660),
 189         REG16(0x664),
 190         REG16(0x668),
 191         REG16(0x66c),
 192         REG16(0x670),
 193         REG16(0x674),
 194         REG16(0x678),
 195         REG16(0x67c),
 196         REG(0x068),
 197
 198         END
 199 };
 200
 201 static const u8 gen12_xcs_offsets[] = {
 202         NOP(1),
 203         LRI(13, POSTED),
 204         REG16(0x244),
 205         REG(0x034),
 206         REG(0x030),
 207         REG(0x038),
 208         REG(0x03c),
 209         REG(0x168),
 210         REG(0x140),
 211         REG(0x110),
 212         REG(0x1c0),
 213         REG(0x1c4),
 214         REG(0x1c8),
 215         REG(0x180),
 216         REG16(0x2b4),
 217
 218         NOP(5),
 219         LRI(9, POSTED),
 220         REG16(0x3a8),
 221         REG16(0x28c),
 222         REG16(0x288),
 223         REG16(0x284),
 224         REG16(0x280),
 225         REG16(0x27c),
 226         REG16(0x278),
 227         REG16(0x274),
 228         REG16(0x270),
 229
 230         END
 231 };
 232
 233 static const u8 dg2_xcs_offsets[] = {
 234         NOP(1),
 235         LRI(15, POSTED),
 236         REG16(0x244),
 237         REG(0x034),
 238         REG(0x030),
 239         REG(0x038),
 240         REG(0x03c),
 241         REG(0x168),
 242         REG(0x140),
 243         REG(0x110),
 244         REG(0x1c0),
 245         REG(0x1c4),
 246         REG(0x1c8),
 247         REG(0x180),
 248         REG16(0x2b4),
 249         REG(0x120),
 250         REG(0x124),
 251
 252         NOP(1),
 253         LRI(9, POSTED),
 254         REG16(0x3a8),
 255         REG16(0x28c),
 256         REG16(0x288),
 257         REG16(0x284),
 258         REG16(0x280),
 259         REG16(0x27c),
 260         REG16(0x278),
 261         REG16(0x274),
 262         REG16(0x270),
 263
 264         END
 265 };
 266
 267 static const u8 gen8_rcs_offsets[] = {
 268         NOP(1),
 269         LRI(14, POSTED),
 270         REG16(0x244),
 271         REG(0x034),
 272         REG(0x030),
 273         REG(0x038),
 274         REG(0x03c),
 275         REG(0x168),
 276         REG(0x140),
 277         REG(0x110),
 278         REG(0x11c),
 279         REG(0x114),
 280         REG(0x118),
 281         REG(0x1c0),
 282         REG(0x1c4),
 283         REG(0x1c8),
 284
 285         NOP(3),
 286         LRI(9, POSTED),
 287         REG16(0x3a8),
 288         REG16(0x28c),
 289         REG16(0x288),
 290         REG16(0x284),
 291         REG16(0x280),
 292         REG16(0x27c),
 293         REG16(0x278),
 294         REG16(0x274),
 295         REG16(0x270),
 296
 297         NOP(13),
 298         LRI(1, 0),
 299         REG(0x0c8),
 300
 301         END
 302 };
 303
 304 static const u8 gen9_rcs_offsets[] = {
 305         NOP(1),
 306         LRI(14, POSTED),
 307         REG16(0x244),
 308         REG(0x34),
 309         REG(0x30),
 310         REG(0x38),
 311         REG(0x3c),
 312         REG(0x168),
 313         REG(0x140),
 314         REG(0x110),
 315         REG(0x11c),
 316         REG(0x114),
 317         REG(0x118),
 318         REG(0x1c0),
 319         REG(0x1c4),
 320         REG(0x1c8),
 321
 322         NOP(3),
 323         LRI(9, POSTED),
 324         REG16(0x3a8),
 325         REG16(0x28c),
 326         REG16(0x288),
 327         REG16(0x284),
 328         REG16(0x280),
 329         REG16(0x27c),
 330         REG16(0x278),
 331         REG16(0x274),
 332         REG16(0x270),
 333
 334         NOP(13),
 335         LRI(1, 0),
 336         REG(0xc8),
 337
 338         NOP(13),
 339         LRI(44, POSTED),
 340         REG(0x28),
 341         REG(0x9c),
 342         REG(0xc0),
 343         REG(0x178),
 344         REG(0x17c),
 345         REG16(0x358),
 346         REG(0x170),
 347         REG(0x150),
 348         REG(0x154),
 349         REG(0x158),
 350         REG16(0x41c),
 351         REG16(0x600),
 352         REG16(0x604),
 353         REG16(0x608),
 354         REG16(0x60c),
 355         REG16(0x610),
 356         REG16(0x614),
 357         REG16(0x618),
 358         REG16(0x61c),
 359         REG16(0x620),
 360         REG16(0x624),
 361         REG16(0x628),
 362         REG16(0x62c),
 363         REG16(0x630),
 364         REG16(0x634),
 365         REG16(0x638),
 366         REG16(0x63c),
 367         REG16(0x640),
 368         REG16(0x644),
 369         REG16(0x648),
 370         REG16(0x64c),
 371         REG16(0x650),
 372         REG16(0x654),
 373         REG16(0x658),
 374         REG16(0x65c),
 375         REG16(0x660),
 376         REG16(0x664),
 377         REG16(0x668),
 378         REG16(0x66c),
 379         REG16(0x670),
 380         REG16(0x674),
 381         REG16(0x678),
 382         REG16(0x67c),
 383         REG(0x68),
 384
 385         END
 386 };
 387
 388 static const u8 gen11_rcs_offsets[] = {
 389         NOP(1),
 390         LRI(15, POSTED),
 391         REG16(0x244),
 392         REG(0x034),
 393         REG(0x030),
 394         REG(0x038),
 395         REG(0x03c),
 396         REG(0x168),
 397         REG(0x140),
 398         REG(0x110),
 399         REG(0x11c),
 400         REG(0x114),
 401         REG(0x118),
 402         REG(0x1c0),
 403         REG(0x1c4),
 404         REG(0x1c8),
 405         REG(0x180),
 406
 407         NOP(1),
 408         LRI(9, POSTED),
 409         REG16(0x3a8),
 410         REG16(0x28c),
 411         REG16(0x288),
 412         REG16(0x284),
 413         REG16(0x280),
 414         REG16(0x27c),
 415         REG16(0x278),
 416         REG16(0x274),
 417         REG16(0x270),
 418
 419         LRI(1, POSTED),
 420         REG(0x1b0),
 421
 422         NOP(10),
 423         LRI(1, 0),
 424         REG(0x0c8),
 425
 426         END
 427 };
 428
 429 static const u8 gen12_rcs_offsets[] = {
 430         NOP(1),
 431         LRI(13, POSTED),
 432         REG16(0x244),
 433         REG(0x034),
 434         REG(0x030),
 435         REG(0x038),
 436         REG(0x03c),
 437         REG(0x168),
 438         REG(0x140),
 439         REG(0x110),
 440         REG(0x1c0),
 441         REG(0x1c4),
 442         REG(0x1c8),
 443         REG(0x180),
 444         REG16(0x2b4),
 445
 446         NOP(5),
 447         LRI(9, POSTED),
 448         REG16(0x3a8),
 449         REG16(0x28c),
 450         REG16(0x288),
 451         REG16(0x284),
 452         REG16(0x280),
 453         REG16(0x27c),
 454         REG16(0x278),
 455         REG16(0x274),
 456         REG16(0x270),
 457
 458         LRI(3, POSTED),
 459         REG(0x1b0),
 460         REG16(0x5a8),
 461         REG16(0x5ac),
 462
 463         NOP(6),
 464         LRI(1, 0),
 465         REG(0x0c8),
 466         NOP(3 + 9 + 1),
 467
 468         LRI(51, POSTED),
 469         REG16(0x588),
 470         REG16(0x588),
 471         REG16(0x588),
 472         REG16(0x588),
 473         REG16(0x588),
 474         REG16(0x588),
 475         REG(0x028),
 476         REG(0x09c),
 477         REG(0x0c0),
 478         REG(0x178),
 479         REG(0x17c),
 480         REG16(0x358),
 481         REG(0x170),
 482         REG(0x150),
 483         REG(0x154),
 484         REG(0x158),
 485         REG16(0x41c),
 486         REG16(0x600),
 487         REG16(0x604),
 488         REG16(0x608),
 489         REG16(0x60c),
 490         REG16(0x610),
 491         REG16(0x614),
 492         REG16(0x618),
 493         REG16(0x61c),
 494         REG16(0x620),
 495         REG16(0x624),
 496         REG16(0x628),
 497         REG16(0x62c),
 498         REG16(0x630),
 499         REG16(0x634),
 500         REG16(0x638),
 501         REG16(0x63c),
 502         REG16(0x640),
 503         REG16(0x644),
 504         REG16(0x648),
 505         REG16(0x64c),
 506         REG16(0x650),
 507         REG16(0x654),
 508         REG16(0x658),
 509         REG16(0x65c),
 510         REG16(0x660),
 511         REG16(0x664),
 512         REG16(0x668),
 513         REG16(0x66c),
 514         REG16(0x670),
 515         REG16(0x674),
 516         REG16(0x678),
 517         REG16(0x67c),
 518         REG(0x068),
 519         REG(0x084),
 520         NOP(1),
 521
 522         END
 523 };
 524
 525 static const u8 xehp_rcs_offsets[] = {
 526         NOP(1),
 527         LRI(13, POSTED),
 528         REG16(0x244),
 529         REG(0x034),
 530         REG(0x030),
 531         REG(0x038),
 532         REG(0x03c),
 533         REG(0x168),
 534         REG(0x140),
 535         REG(0x110),
 536         REG(0x1c0),
 537         REG(0x1c4),
 538         REG(0x1c8),
 539         REG(0x180),
 540         REG16(0x2b4),
 541
 542         NOP(5),
 543         LRI(9, POSTED),
 544         REG16(0x3a8),
 545         REG16(0x28c),
 546         REG16(0x288),
 547         REG16(0x284),
 548         REG16(0x280),
 549         REG16(0x27c),
 550         REG16(0x278),
 551         REG16(0x274),
 552         REG16(0x270),
 553
 554         LRI(3, POSTED),
 555         REG(0x1b0),
 556         REG16(0x5a8),
 557         REG16(0x5ac),
 558
 559         NOP(6),
 560         LRI(1, 0),
 561         REG(0x0c8),
 562
 563         END
 564 };
 565
 566 static const u8 dg2_rcs_offsets[] = {
 567         NOP(1),
 568         LRI(15, POSTED),
 569         REG16(0x244),
 570         REG(0x034),
 571         REG(0x030),
 572         REG(0x038),
 573         REG(0x03c),
 574         REG(0x168),
 575         REG(0x140),
 576         REG(0x110),
 577         REG(0x1c0),
 578         REG(0x1c4),
 579         REG(0x1c8),
 580         REG(0x180),
 581         REG16(0x2b4),
 582         REG(0x120),
 583         REG(0x124),
 584
 585         NOP(1),
 586         LRI(9, POSTED),
 587         REG16(0x3a8),
 588         REG16(0x28c),
 589         REG16(0x288),
 590         REG16(0x284),
 591         REG16(0x280),
 592         REG16(0x27c),
 593         REG16(0x278),
 594         REG16(0x274),
 595         REG16(0x270),
 596
 597         LRI(3, POSTED),
 598         REG(0x1b0),
 599         REG16(0x5a8),
 600         REG16(0x5ac),
 601
 602         NOP(6),
 603         LRI(1, 0),
 604         REG(0x0c8),
 605
 606         END
 607 };
 608
 609 #undef END
 610 #undef REG16
 611 #undef REG
 612 #undef LRI
 613 #undef NOP
 614
 615 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
 616 {
 617         /*
 618          * The gen12+ lists only have the registers we program in the basic
 619          * default state. We rely on the context image using relative
 620          * addressing to automatic fixup the register state between the
 621          * physical engines for virtual engine.
 622          */
 623         GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
 624                    !intel_engine_has_relative_mmio(engine));
 625
 626         if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
 627                 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
 628                         return dg2_rcs_offsets;
 629                 else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
 630                         return xehp_rcs_offsets;
 631                 else if (GRAPHICS_VER(engine->i915) >= 12)
 632                         return gen12_rcs_offsets;
 633                 else if (GRAPHICS_VER(engine->i915) >= 11)
 634                         return gen11_rcs_offsets;
 635                 else if (GRAPHICS_VER(engine->i915) >= 9)
 636                         return gen9_rcs_offsets;
 637                 else
 638                         return gen8_rcs_offsets;
 639         } else {
 640                 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
 641                         return dg2_xcs_offsets;
 642                 else if (GRAPHICS_VER(engine->i915) >= 12)
 643                         return gen12_xcs_offsets;
 644                 else if (GRAPHICS_VER(engine->i915) >= 9)
 645                         return gen9_xcs_offsets;
 646                 else
 647                         return gen8_xcs_offsets;
 648         }
 649 }
 650
 651 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
 652 {
 653         if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
 654                 return 0x70;
 655         else if (GRAPHICS_VER(engine->i915) >= 12)
 656                 return 0x60;
 657         else if (GRAPHICS_VER(engine->i915) >= 9)
 658                 return 0x54;
 659         else if (engine->class == RENDER_CLASS)
 660                 return 0x58;
 661         else
 662                 return -1;
 663 }
 664
 665 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
 666 {
 667         if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
 668                 return 0x84;
 669         else if (GRAPHICS_VER(engine->i915) >= 12)
 670                 return 0x74;
 671         else if (GRAPHICS_VER(engine->i915) >= 9)
 672                 return 0x68;
 673         else if (engine->class == RENDER_CLASS)
 674                 return 0xd8;
 675         else
 676                 return -1;
 677 }
 678
 679 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
 680 {
 681         if (GRAPHICS_VER(engine->i915) >= 12)
 682                 return 0x12;
 683         else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
 684                 return 0x18;
 685         else
 686                 return -1;
 687 }
 688
 689 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
 690 {
 691         int x;
 692
 693         x = lrc_ring_wa_bb_per_ctx(engine);
 694         if (x < 0)
 695                 return x;
 696
 697         return x + 2;
 698 }
 699
 700 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
 701 {
 702         int x;
 703
 704         x = lrc_ring_indirect_ptr(engine);
 705         if (x < 0)
 706                 return x;
 707
 708         return x + 2;
 709 }
 710
 711 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
 712 {
 713
 714         if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
 715                 /*
 716                  * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
 717                  * simply to match the RCS context image layout.
 718                  */
 719                 return 0xc6;
 720         else if (engine->class != RENDER_CLASS)
 721                 return -1;
 722         else if (GRAPHICS_VER(engine->i915) >= 12)
 723                 return 0xb6;
 724         else if (GRAPHICS_VER(engine->i915) >= 11)
 725                 return 0xaa;
 726         else
 727                 return -1;
 728 }
 729
 730 static u32
 731 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
 732 {
 733         switch (GRAPHICS_VER(engine->i915)) {
 734         default:
 735                 MISSING_CASE(GRAPHICS_VER(engine->i915));
 736                 fallthrough;
 737         case 12:
 738                 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 739         case 11:
 740                 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 741         case 9:
 742                 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 743         case 8:
 744                 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 745         }
 746 }
 747
 748 static void
 749 lrc_setup_indirect_ctx(u32 *regs,
 750                        const struct intel_engine_cs *engine,
 751                        u32 ctx_bb_ggtt_addr,
 752                        u32 size)
 753 {
 754         GEM_BUG_ON(!size);
 755         GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
 756         GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
 757         regs[lrc_ring_indirect_ptr(engine) + 1] =
 758                 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
 759
 760         GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
 761         regs[lrc_ring_indirect_offset(engine) + 1] =
 762                 lrc_ring_indirect_offset_default(engine) << 6;
 763 }
 764
 765 static void init_common_regs(u32 * const regs,
 766                              const struct intel_context *ce,
 767                              const struct intel_engine_cs *engine,
 768                              bool inhibit)
 769 {
 770         u32 ctl;
 771
 772         ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
 773         ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
 774         if (inhibit)
 775                 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
 776         if (GRAPHICS_VER(engine->i915) < 11)
 777                 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
 778                                            CTX_CTRL_RS_CTX_ENABLE);
 779         regs[CTX_CONTEXT_CONTROL] = ctl;
 780
 781         regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
 782 }
 783
 784 static void init_wa_bb_regs(u32 * const regs,
 785                             const struct intel_engine_cs *engine)
 786 {
 787         const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
 788
 789         if (wa_ctx->per_ctx.size) {
 790                 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
 791
 792                 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
 793                 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
 794                         (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
 795         }
 796
 797         if (wa_ctx->indirect_ctx.size) {
 798                 lrc_setup_indirect_ctx(regs, engine,
 799                                        i915_ggtt_offset(wa_ctx->vma) +
 800                                        wa_ctx->indirect_ctx.offset,
 801                                        wa_ctx->indirect_ctx.size);
 802         }
 803 }
 804
 805 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
 806 {
 807         if (i915_vm_is_4lvl(&ppgtt->vm)) {
 808                 /* 64b PPGTT (48bit canonical)
 809                  * PDP0_DESCRIPTOR contains the base address to PML4 and
 810                  * other PDP Descriptors are ignored.
 811                  */
 812                 ASSIGN_CTX_PML4(ppgtt, regs);
 813         } else {
 814                 ASSIGN_CTX_PDP(ppgtt, regs, 3);
 815                 ASSIGN_CTX_PDP(ppgtt, regs, 2);
 816                 ASSIGN_CTX_PDP(ppgtt, regs, 1);
 817                 ASSIGN_CTX_PDP(ppgtt, regs, 0);
 818         }
 819 }
 820
 821 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
 822 {
 823         if (i915_is_ggtt(vm))
 824                 return i915_vm_to_ggtt(vm)->alias;
 825         else
 826                 return i915_vm_to_ppgtt(vm);
 827 }
 828
 829 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
 830 {
 831         int x;
 832
 833         x = lrc_ring_mi_mode(engine);
 834         if (x != -1) {
 835                 regs[x + 1] &= ~STOP_RING;
 836                 regs[x + 1] |= STOP_RING << 16;
 837         }
 838 }
 839
 840 static void __lrc_init_regs(u32 *regs,
 841                             const struct intel_context *ce,
 842                             const struct intel_engine_cs *engine,
 843                             bool inhibit)
 844 {
 845         /*
 846          * A context is actually a big batch buffer with several
 847          * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
 848          * values we are setting here are only for the first context restore:
 849          * on a subsequent save, the GPU will recreate this batchbuffer with new
 850          * values (including all the missing MI_LOAD_REGISTER_IMM commands that
 851          * we are not initializing here).
 852          *
 853          * Must keep consistent with virtual_update_register_offsets().
 854          */
 855
 856         if (inhibit)
 857                 memset(regs, 0, PAGE_SIZE);
 858
 859         set_offsets(regs, reg_offsets(engine), engine, inhibit);
 860
 861         init_common_regs(regs, ce, engine, inhibit);
 862         init_ppgtt_regs(regs, vm_alias(ce->vm));
 863
 864         init_wa_bb_regs(regs, engine);
 865
 866         __reset_stop_ring(regs, engine);
 867 }
 868
 869 void lrc_init_regs(const struct intel_context *ce,
 870                    const struct intel_engine_cs *engine,
 871                    bool inhibit)
 872 {
 873         __lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
 874 }
 875
 876 void lrc_reset_regs(const struct intel_context *ce,
 877                     const struct intel_engine_cs *engine)
 878 {
 879         __reset_stop_ring(ce->lrc_reg_state, engine);
 880 }
 881
 882 static void
 883 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
 884 {
 885         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
 886                 return;
 887
 888         vaddr += engine->context_size;
 889
 890         memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
 891 }
 892
 893 static void
 894 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
 895 {
 896         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
 897                 return;
 898
 899         vaddr += engine->context_size;
 900
 901         if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
 902                 drm_err_once(&engine->i915->drm,
 903                              "%s context redzone overwritten!\n",
 904                              engine->name);
 905 }
 906
 907 static u32 context_wa_bb_offset(const struct intel_context *ce)
 908 {
 909         return PAGE_SIZE * ce->wa_bb_page;
 910 }
 911
 912 static u32 *context_indirect_bb(const struct intel_context *ce)
 913 {
 914         void *ptr;
 915
 916         GEM_BUG_ON(!ce->wa_bb_page);
 917
 918         ptr = ce->lrc_reg_state;
 919         ptr -= LRC_STATE_OFFSET; /* back to start of context image */
 920         ptr += context_wa_bb_offset(ce);
 921
 922         return ptr;
 923 }
 924
 925 void lrc_init_state(struct intel_context *ce,
 926                     struct intel_engine_cs *engine,
 927                     void *state)
 928 {
 929         bool inhibit = true;
 930
 931         set_redzone(state, engine);
 932
 933         if (engine->default_state) {
 934                 shmem_read(engine->default_state, 0,
 935                            state, engine->context_size);
 936                 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
 937                 inhibit = false;
 938         }
 939
 940         /* Clear the ppHWSP (inc. per-context counters) */
 941         memset(state, 0, PAGE_SIZE);
 942
 943         /* Clear the indirect wa and storage */
 944         if (ce->wa_bb_page)
 945                 memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);
 946
 947         /*
 948          * The second page of the context object contains some registers which
 949          * must be set up prior to the first execution.
 950          */
 951         __lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
 952 }
 953
 954 u32 lrc_indirect_bb(const struct intel_context *ce)
 955 {
 956         return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
 957 }
 958
 959 static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
 960 {
 961         /* If predication is active, this will be noop'ed */
 962         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
 963         *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
 964         *cs++ = 0;
 965         *cs++ = 0; /* No predication */
 966
 967         /* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
 968         *cs++ = MI_BATCH_BUFFER_END | BIT(15);
 969         *cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;
 970
 971         /* Instructions are no longer predicated (disabled), we can proceed */
 972         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
 973         *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
 974         *cs++ = 0;
 975         *cs++ = 1; /* enable predication before the next BB */
 976
 977         *cs++ = MI_BATCH_BUFFER_END;
 978         GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);
 979
 980         return cs;
 981 }
 982
 983 static struct i915_vma *
 984 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
 985 {
 986         struct drm_i915_gem_object *obj;
 987         struct i915_vma *vma;
 988         u32 context_size;
 989
 990         context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
 991
 992         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
 993                 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
 994
 995         if (GRAPHICS_VER(engine->i915) == 12) {
 996                 ce->wa_bb_page = context_size / PAGE_SIZE;
 997                 context_size += PAGE_SIZE;
 998         }
 999
1000         if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
1001                 ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
1002                 context_size += PARENT_SCRATCH_SIZE;
1003         }
1004
1005         obj = i915_gem_object_create_lmem(engine->i915, context_size,
1006                                           I915_BO_ALLOC_PM_VOLATILE);
1007         if (IS_ERR(obj))
1008                 obj = i915_gem_object_create_shmem(engine->i915, context_size);
1009         if (IS_ERR(obj))
1010                 return ERR_CAST(obj);
1011
1012         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1013         if (IS_ERR(vma)) {
1014                 i915_gem_object_put(obj);
1015                 return vma;
1016         }
1017
1018         return vma;
1019 }
1020
1021 static struct intel_timeline *
1022 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
1023 {
1024         struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
1025
1026         return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
1027 }
1028
1029 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
1030 {
1031         struct intel_ring *ring;
1032         struct i915_vma *vma;
1033         int err;
1034
1035         GEM_BUG_ON(ce->state);
1036
1037         vma = __lrc_alloc_state(ce, engine);
1038         if (IS_ERR(vma))
1039                 return PTR_ERR(vma);
1040
1041         ring = intel_engine_create_ring(engine, ce->ring_size);
1042         if (IS_ERR(ring)) {
1043                 err = PTR_ERR(ring);
1044                 goto err_vma;
1045         }
1046
1047         if (!page_mask_bits(ce->timeline)) {
1048                 struct intel_timeline *tl;
1049
1050                 /*
1051                  * Use the static global HWSP for the kernel context, and
1052                  * a dynamically allocated cacheline for everyone else.
1053                  */
1054                 if (unlikely(ce->timeline))
1055                         tl = pinned_timeline(ce, engine);
1056                 else
1057                         tl = intel_timeline_create(engine->gt);
1058                 if (IS_ERR(tl)) {
1059                         err = PTR_ERR(tl);
1060                         goto err_ring;
1061                 }
1062
1063                 ce->timeline = tl;
1064         }
1065
1066         ce->ring = ring;
1067         ce->state = vma;
1068
1069         return 0;
1070
1071 err_ring:
1072         intel_ring_put(ring);
1073 err_vma:
1074         i915_vma_put(vma);
1075         return err;
1076 }
1077
1078 void lrc_reset(struct intel_context *ce)
1079 {
1080         GEM_BUG_ON(!intel_context_is_pinned(ce));
1081
1082         intel_ring_reset(ce->ring, ce->ring->emit);
1083
1084         /* Scrub away the garbage */
1085         lrc_init_regs(ce, ce->engine, true);
1086         ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
1087 }
1088
1089 int
1090 lrc_pre_pin(struct intel_context *ce,
1091             struct intel_engine_cs *engine,
1092             struct i915_gem_ww_ctx *ww,
1093             void **vaddr)
1094 {
1095         GEM_BUG_ON(!ce->state);
1096         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1097
1098         *vaddr = i915_gem_object_pin_map(ce->state->obj,
1099                                          i915_coherent_map_type(ce->engine->i915,
1100                                                                 ce->state->obj,
1101                                                                 false) |
1102                                          I915_MAP_OVERRIDE);
1103
1104         return PTR_ERR_OR_ZERO(*vaddr);
1105 }
1106
1107 int
1108 lrc_pin(struct intel_context *ce,
1109         struct intel_engine_cs *engine,
1110         void *vaddr)
1111 {
1112         ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
1113
1114         if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
1115                 lrc_init_state(ce, engine, vaddr);
1116
1117         ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
1118         return 0;
1119 }
1120
1121 void lrc_unpin(struct intel_context *ce)
1122 {
1123         if (unlikely(ce->parallel.last_rq)) {
1124                 i915_request_put(ce->parallel.last_rq);
1125                 ce->parallel.last_rq = NULL;
1126         }
1127         check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
1128                       ce->engine);
1129 }
1130
1131 void lrc_post_unpin(struct intel_context *ce)
1132 {
1133         i915_gem_object_unpin_map(ce->state->obj);
1134 }
1135
1136 void lrc_fini(struct intel_context *ce)
1137 {
1138         if (!ce->state)
1139                 return;
1140
1141         intel_ring_put(fetch_and_zero(&ce->ring));
1142         i915_vma_put(fetch_and_zero(&ce->state));
1143 }
1144
1145 void lrc_destroy(struct kref *kref)
1146 {
1147         struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1148
1149         GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1150         GEM_BUG_ON(intel_context_is_pinned(ce));
1151
1152         lrc_fini(ce);
1153
1154         intel_context_fini(ce);
1155         intel_context_free(ce);
1156 }
1157
1158 static u32 *
1159 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1160 {
1161         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1162                 MI_SRM_LRM_GLOBAL_GTT |
1163                 MI_LRI_LRM_CS_MMIO;
1164         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1165         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1166                 CTX_TIMESTAMP * sizeof(u32);
1167         *cs++ = 0;
1168
1169         *cs++ = MI_LOAD_REGISTER_REG |
1170                 MI_LRR_SOURCE_CS_MMIO |
1171                 MI_LRI_LRM_CS_MMIO;
1172         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1173         *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1174
1175         *cs++ = MI_LOAD_REGISTER_REG |
1176                 MI_LRR_SOURCE_CS_MMIO |
1177                 MI_LRI_LRM_CS_MMIO;
1178         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1179         *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1180
1181         return cs;
1182 }
1183
1184 static u32 *
1185 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1186 {
1187         GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1188
1189         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1190                 MI_SRM_LRM_GLOBAL_GTT |
1191                 MI_LRI_LRM_CS_MMIO;
1192         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1193         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1194                 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1195         *cs++ = 0;
1196
1197         return cs;
1198 }
1199
1200 static u32 *
1201 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1202 {
1203         GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1204
1205         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1206                 MI_SRM_LRM_GLOBAL_GTT |
1207                 MI_LRI_LRM_CS_MMIO;
1208         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1209         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1210                 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1211         *cs++ = 0;
1212
1213         *cs++ = MI_LOAD_REGISTER_REG |
1214                 MI_LRR_SOURCE_CS_MMIO |
1215                 MI_LRI_LRM_CS_MMIO;
1216         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1217         *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1218
1219         return cs;
1220 }
1221
1222 /*
1223  * On DG2 during context restore of a preempted context in GPGPU mode,
1224  * RCS restore hang is detected. This is extremely timing dependent.
1225  * To address this below sw wabb is implemented for DG2 A steppings.
1226  */
1227 static u32 *
1228 dg2_emit_rcs_hang_wabb(const struct intel_context *ce, u32 *cs)
1229 {
1230         *cs++ = MI_LOAD_REGISTER_IMM(1);
1231         *cs++ = i915_mmio_reg_offset(GEN12_STATE_ACK_DEBUG);
1232         *cs++ = 0x21;
1233
1234         *cs++ = MI_LOAD_REGISTER_REG;
1235         *cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
1236         *cs++ = i915_mmio_reg_offset(GEN12_CULLBIT1);
1237
1238         *cs++ = MI_LOAD_REGISTER_REG;
1239         *cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
1240         *cs++ = i915_mmio_reg_offset(GEN12_CULLBIT2);
1241
1242         return cs;
1243 }
1244
1245 static u32 *
1246 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1247 {
1248         cs = gen12_emit_timestamp_wa(ce, cs);
1249         cs = gen12_emit_cmd_buf_wa(ce, cs);
1250         cs = gen12_emit_restore_scratch(ce, cs);
1251
1252         /* Wa_22011450934:dg2 */
1253         if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_A0, STEP_B0) ||
1254             IS_DG2_GRAPHICS_STEP(ce->engine->i915, G11, STEP_A0, STEP_B0))
1255                 cs = dg2_emit_rcs_hang_wabb(ce, cs);
1256
1257         /* Wa_16013000631:dg2 */
1258         if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
1259             IS_DG2_G11(ce->engine->i915))
1260                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
1261
1262         /* hsdes: 1809175790 */
1263         if (!HAS_FLAT_CCS(ce->engine->i915))
1264                 cs = gen12_emit_aux_table_inv(cs, GEN12_GFX_CCS_AUX_NV);
1265
1266         return cs;
1267 }
1268
1269 static u32 *
1270 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1271 {
1272         cs = gen12_emit_timestamp_wa(ce, cs);
1273         cs = gen12_emit_restore_scratch(ce, cs);
1274
1275         /* Wa_16013000631:dg2 */
1276         if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
1277             IS_DG2_G11(ce->engine->i915))
1278                 if (ce->engine->class == COMPUTE_CLASS)
1279                         cs = gen8_emit_pipe_control(cs,
1280                                                     PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
1281                                                     0);
1282
1283         /* hsdes: 1809175790 */
1284         if (!HAS_FLAT_CCS(ce->engine->i915)) {
1285                 if (ce->engine->class == VIDEO_DECODE_CLASS)
1286                         cs = gen12_emit_aux_table_inv(cs, GEN12_VD0_AUX_NV);
1287                 else if (ce->engine->class == VIDEO_ENHANCEMENT_CLASS)
1288                         cs = gen12_emit_aux_table_inv(cs, GEN12_VE0_AUX_NV);
1289         }
1290
1291         return cs;
1292 }
1293
1294 static void
1295 setup_indirect_ctx_bb(const struct intel_context *ce,
1296                       const struct intel_engine_cs *engine,
1297                       u32 *(*emit)(const struct intel_context *, u32 *))
1298 {
1299         u32 * const start = context_indirect_bb(ce);
1300         u32 *cs;
1301
1302         cs = emit(ce, start);
1303         GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1304         while ((unsigned long)cs % CACHELINE_BYTES)
1305                 *cs++ = MI_NOOP;
1306
1307         GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
1308         setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));
1309
1310         lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1311                                lrc_indirect_bb(ce),
1312                                (cs - start) * sizeof(*cs));
1313 }
1314
1315 /*
1316  * The context descriptor encodes various attributes of a context,
1317  * including its GTT address and some flags. Because it's fairly
1318  * expensive to calculate, we'll just do it once and cache the result,
1319  * which remains valid until the context is unpinned.
1320  *
1321  * This is what a descriptor looks like, from LSB to MSB::
1322  *
1323  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1324  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1325  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1326  *      bits 53-54:    mbz, reserved for use by hardware
1327  *      bits 55-63:    group ID, currently unused and set to 0
1328  *
1329  * Starting from Gen11, the upper dword of the descriptor has a new format:
1330  *
1331  *      bits 32-36:    reserved
1332  *      bits 37-47:    SW context ID
1333  *      bits 48:53:    engine instance
1334  *      bit 54:        mbz, reserved for use by hardware
1335  *      bits 55-60:    SW counter
1336  *      bits 61-63:    engine class
1337  *
1338  * On Xe_HP, the upper dword of the descriptor has a new format:
1339  *
1340  *      bits 32-37:    virtual function number
1341  *      bit 38:        mbz, reserved for use by hardware
1342  *      bits 39-54:    SW context ID
1343  *      bits 55-57:    reserved
1344  *      bits 58-63:    SW counter
1345  *
1346  * engine info, SW context ID and SW counter need to form a unique number
1347  * (Context ID) per lrc.
1348  */
1349 static u32 lrc_descriptor(const struct intel_context *ce)
1350 {
1351         u32 desc;
1352
1353         desc = INTEL_LEGACY_32B_CONTEXT;
1354         if (i915_vm_is_4lvl(ce->vm))
1355                 desc = INTEL_LEGACY_64B_CONTEXT;
1356         desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1357
1358         desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1359         if (GRAPHICS_VER(ce->vm->i915) == 8)
1360                 desc |= GEN8_CTX_L3LLC_COHERENT;
1361
1362         return i915_ggtt_offset(ce->state) | desc;
1363 }
1364
1365 u32 lrc_update_regs(const struct intel_context *ce,
1366                     const struct intel_engine_cs *engine,
1367                     u32 head)
1368 {
1369         struct intel_ring *ring = ce->ring;
1370         u32 *regs = ce->lrc_reg_state;
1371
1372         GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1373         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1374
1375         regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1376         regs[CTX_RING_HEAD] = head;
1377         regs[CTX_RING_TAIL] = ring->tail;
1378         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1379
1380         /* RPCS */
1381         if (engine->class == RENDER_CLASS) {
1382                 regs[CTX_R_PWR_CLK_STATE] =
1383                         intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1384
1385                 i915_oa_init_reg_state(ce, engine);
1386         }
1387
1388         if (ce->wa_bb_page) {
1389                 u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1390
1391                 fn = gen12_emit_indirect_ctx_xcs;
1392                 if (ce->engine->class == RENDER_CLASS)
1393                         fn = gen12_emit_indirect_ctx_rcs;
1394
1395                 /* Mutually exclusive wrt to global indirect bb */
1396                 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1397                 setup_indirect_ctx_bb(ce, engine, fn);
1398         }
1399
1400         return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1401 }
1402
1403 void lrc_update_offsets(struct intel_context *ce,
1404                         struct intel_engine_cs *engine)
1405 {
1406         set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1407 }
1408
1409 void lrc_check_regs(const struct intel_context *ce,
1410                     const struct intel_engine_cs *engine,
1411                     const char *when)
1412 {
1413         const struct intel_ring *ring = ce->ring;
1414         u32 *regs = ce->lrc_reg_state;
1415         bool valid = true;
1416         int x;
1417
1418         if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1419                 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1420                        engine->name,
1421                        regs[CTX_RING_START],
1422                        i915_ggtt_offset(ring->vma));
1423                 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1424                 valid = false;
1425         }
1426
1427         if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1428             (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1429                 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1430                        engine->name,
1431                        regs[CTX_RING_CTL],
1432                        (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1433                 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1434                 valid = false;
1435         }
1436
1437         x = lrc_ring_mi_mode(engine);
1438         if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1439                 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1440                        engine->name, regs[x + 1]);
1441                 regs[x + 1] &= ~STOP_RING;
1442                 regs[x + 1] |= STOP_RING << 16;
1443                 valid = false;
1444         }
1445
1446         WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1447 }
1448
1449 /*
1450  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1451  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1452  * but there is a slight complication as this is applied in WA batch where the
1453  * values are only initialized once so we cannot take register value at the
1454  * beginning and reuse it further; hence we save its value to memory, upload a
1455  * constant value with bit21 set and then we restore it back with the saved value.
1456  * To simplify the WA, a constant value is formed by using the default value
1457  * of this register. This shouldn't be a problem because we are only modifying
1458  * it for a short period and this batch in non-premptible. We can ofcourse
1459  * use additional instructions that read the actual value of the register
1460  * at that time and set our bit of interest but it makes the WA complicated.
1461  *
1462  * This WA is also required for Gen9 so extracting as a function avoids
1463  * code duplication.
1464  */
1465 static u32 *
1466 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1467 {
1468         /* NB no one else is allowed to scribble over scratch + 256! */
1469         *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1470         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1471         *batch++ = intel_gt_scratch_offset(engine->gt,
1472                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1473         *batch++ = 0;
1474
1475         *batch++ = MI_LOAD_REGISTER_IMM(1);
1476         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1477         *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1478
1479         batch = gen8_emit_pipe_control(batch,
1480                                        PIPE_CONTROL_CS_STALL |
1481                                        PIPE_CONTROL_DC_FLUSH_ENABLE,
1482                                        0);
1483
1484         *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1485         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1486         *batch++ = intel_gt_scratch_offset(engine->gt,
1487                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1488         *batch++ = 0;
1489
1490         return batch;
1491 }
1492
1493 /*
1494  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1495  * initialized at the beginning and shared across all contexts but this field
1496  * helps us to have multiple batches at different offsets and select them based
1497  * on a criteria. At the moment this batch always start at the beginning of the page
1498  * and at this point we don't have multiple wa_ctx batch buffers.
1499  *
1500  * The number of WA applied are not known at the beginning; we use this field
1501  * to return the no of DWORDS written.
1502  *
1503  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1504  * so it adds NOOPs as padding to make it cacheline aligned.
1505  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
1506  * makes a complete batch buffer.
1507  */
1508 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1509 {
1510         /* WaDisableCtxRestoreArbitration:bdw,chv */
1511         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1512
1513         /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1514         if (IS_BROADWELL(engine->i915))
1515                 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1516
1517         /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1518         /* Actual scratch location is at 128 bytes offset */
1519         batch = gen8_emit_pipe_control(batch,
1520                                        PIPE_CONTROL_FLUSH_L3 |
1521                                        PIPE_CONTROL_STORE_DATA_INDEX |
1522                                        PIPE_CONTROL_CS_STALL |
1523                                        PIPE_CONTROL_QW_WRITE,
1524                                        LRC_PPHWSP_SCRATCH_ADDR);
1525
1526         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1527
1528         /* Pad to end of cacheline */
1529         while ((unsigned long)batch % CACHELINE_BYTES)
1530                 *batch++ = MI_NOOP;
1531
1532         /*
1533          * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1534          * execution depends on the length specified in terms of cache lines
1535          * in the register CTX_RCS_INDIRECT_CTX
1536          */
1537
1538         return batch;
1539 }
1540
1541 struct lri {
1542         i915_reg_t reg;
1543         u32 value;
1544 };
1545
1546 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1547 {
1548         GEM_BUG_ON(!count || count > 63);
1549
1550         *batch++ = MI_LOAD_REGISTER_IMM(count);
1551         do {
1552                 *batch++ = i915_mmio_reg_offset(lri->reg);
1553                 *batch++ = lri->value;
1554         } while (lri++, --count);
1555         *batch++ = MI_NOOP;
1556
1557         return batch;
1558 }
1559
1560 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1561 {
1562         static const struct lri lri[] = {
1563                 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1564                 {
1565                         COMMON_SLICE_CHICKEN2,
1566                         __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1567                                        0),
1568                 },
1569
1570                 /* BSpec: 11391 */
1571                 {
1572                         FF_SLICE_CHICKEN,
1573                         __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1574                                        FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1575                 },
1576
1577                 /* BSpec: 11299 */
1578                 {
1579                         _3D_CHICKEN3,
1580                         __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1581                                        _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1582                 }
1583         };
1584
1585         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1586
1587         /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1588         batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1589
1590         /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1591         batch = gen8_emit_pipe_control(batch,
1592                                        PIPE_CONTROL_FLUSH_L3 |
1593                                        PIPE_CONTROL_STORE_DATA_INDEX |
1594                                        PIPE_CONTROL_CS_STALL |
1595                                        PIPE_CONTROL_QW_WRITE,
1596                                        LRC_PPHWSP_SCRATCH_ADDR);
1597
1598         batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1599
1600         /* WaMediaPoolStateCmdInWABB:bxt,glk */
1601         if (HAS_POOLED_EU(engine->i915)) {
1602                 /*
1603                  * EU pool configuration is setup along with golden context
1604                  * during context initialization. This value depends on
1605                  * device type (2x6 or 3x6) and needs to be updated based
1606                  * on which subslice is disabled especially for 2x6
1607                  * devices, however it is safe to load default
1608                  * configuration of 3x6 device instead of masking off
1609                  * corresponding bits because HW ignores bits of a disabled
1610                  * subslice and drops down to appropriate config. Please
1611                  * see render_state_setup() in i915_gem_render_state.c for
1612                  * possible configurations, to avoid duplication they are
1613                  * not shown here again.
1614                  */
1615                 *batch++ = GEN9_MEDIA_POOL_STATE;
1616                 *batch++ = GEN9_MEDIA_POOL_ENABLE;
1617                 *batch++ = 0x00777000;
1618                 *batch++ = 0;
1619                 *batch++ = 0;
1620                 *batch++ = 0;
1621         }
1622
1623         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1624
1625         /* Pad to end of cacheline */
1626         while ((unsigned long)batch % CACHELINE_BYTES)
1627                 *batch++ = MI_NOOP;
1628
1629         return batch;
1630 }
1631
1632 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1633
1634 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1635 {
1636         struct drm_i915_gem_object *obj;
1637         struct i915_vma *vma;
1638         int err;
1639
1640         obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1641         if (IS_ERR(obj))
1642                 return PTR_ERR(obj);
1643
1644         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1645         if (IS_ERR(vma)) {
1646                 err = PTR_ERR(vma);
1647                 goto err;
1648         }
1649
1650         engine->wa_ctx.vma = vma;
1651         return 0;
1652
1653 err:
1654         i915_gem_object_put(obj);
1655         return err;
1656 }
1657
1658 void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1659 {
1660         i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1661 }
1662
1663 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1664
1665 void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1666 {
1667         struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1668         struct i915_wa_ctx_bb *wa_bb[] = {
1669                 &wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1670         };
1671         wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1672         struct i915_gem_ww_ctx ww;
1673         void *batch, *batch_ptr;
1674         unsigned int i;
1675         int err;
1676
1677         if (!(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
1678                 return;
1679
1680         switch (GRAPHICS_VER(engine->i915)) {
1681         case 12:
1682         case 11:
1683                 return;
1684         case 9:
1685                 wa_bb_fn[0] = gen9_init_indirectctx_bb;
1686                 wa_bb_fn[1] = NULL;
1687                 break;
1688         case 8:
1689                 wa_bb_fn[0] = gen8_init_indirectctx_bb;
1690                 wa_bb_fn[1] = NULL;
1691                 break;
1692         default:
1693                 MISSING_CASE(GRAPHICS_VER(engine->i915));
1694                 return;
1695         }
1696
1697         err = lrc_create_wa_ctx(engine);
1698         if (err) {
1699                 /*
1700                  * We continue even if we fail to initialize WA batch
1701                  * because we only expect rare glitches but nothing
1702                  * critical to prevent us from using GPU
1703                  */
1704                 drm_err(&engine->i915->drm,
1705                         "Ignoring context switch w/a allocation error:%d\n",
1706                         err);
1707                 return;
1708         }
1709
1710         if (!engine->wa_ctx.vma)
1711                 return;
1712
1713         i915_gem_ww_ctx_init(&ww, true);
1714 retry:
1715         err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1716         if (!err)
1717                 err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1718         if (err)
1719                 goto err;
1720
1721         batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1722         if (IS_ERR(batch)) {
1723                 err = PTR_ERR(batch);
1724                 goto err_unpin;
1725         }
1726
1727         /*
1728          * Emit the two workaround batch buffers, recording the offset from the
1729          * start of the workaround batch buffer object for each and their
1730          * respective sizes.
1731          */
1732         batch_ptr = batch;
1733         for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1734                 wa_bb[i]->offset = batch_ptr - batch;
1735                 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1736                                                   CACHELINE_BYTES))) {
1737                         err = -EINVAL;
1738                         break;
1739                 }
1740                 if (wa_bb_fn[i])
1741                         batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1742                 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1743         }
1744         GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1745
1746         __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1747         __i915_gem_object_release_map(wa_ctx->vma->obj);
1748
1749         /* Verify that we can handle failure to setup the wa_ctx */
1750         if (!err)
1751                 err = i915_inject_probe_error(engine->i915, -ENODEV);
1752
1753 err_unpin:
1754         if (err)
1755                 i915_vma_unpin(wa_ctx->vma);
1756 err:
1757         if (err == -EDEADLK) {
1758                 err = i915_gem_ww_ctx_backoff(&ww);
1759                 if (!err)
1760                         goto retry;
1761         }
1762         i915_gem_ww_ctx_fini(&ww);
1763
1764         if (err) {
1765                 i915_vma_put(engine->wa_ctx.vma);
1766
1767                 /* Clear all flags to prevent further use */
1768                 memset(wa_ctx, 0, sizeof(*wa_ctx));
1769         }
1770 }
1771
1772 static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
1773 {
1774 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1775         stats->runtime.num_underflow++;
1776         stats->runtime.max_underflow =
1777                 max_t(u32, stats->runtime.max_underflow, -dt);
1778 #endif
1779 }
1780
1781 static u32 lrc_get_runtime(const struct intel_context *ce)
1782 {
1783         /*
1784          * We can use either ppHWSP[16] which is recorded before the context
1785          * switch (and so excludes the cost of context switches) or use the
1786          * value from the context image itself, which is saved/restored earlier
1787          * and so includes the cost of the save.
1788          */
1789         return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
1790 }
1791
1792 void lrc_update_runtime(struct intel_context *ce)
1793 {
1794         struct intel_context_stats *stats = &ce->stats;
1795         u32 old;
1796         s32 dt;
1797
1798         old = stats->runtime.last;
1799         stats->runtime.last = lrc_get_runtime(ce);
1800         dt = stats->runtime.last - old;
1801         if (!dt)
1802                 return;
1803
1804         if (unlikely(dt < 0)) {
1805                 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1806                          old, stats->runtime.last, dt);
1807                 st_runtime_underflow(stats, dt);
1808                 return;
1809         }
1810
1811         ewma_runtime_add(&stats->runtime.avg, dt);
1812         stats->runtime.total += dt;
1813 }
1814
1815 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1816 #include "selftest_lrc.c"
1817 #endif