// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include "gen8_engine_cs.h"
#include "i915_drv.h"
#include "intel_lrc.h"
#include "intel_gpu_commands.h"
#include "intel_ring.h"

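/*
 * gen8_emit_flush_rcs - emit a PIPE_CONTROL based flush/invalidate on the
 * render engine. EMIT_FLUSH writes the render caches back to memory;
 * EMIT_INVALIDATE drops stale TLB and read-only cache contents before the
 * next batch. Two workarounds apply below: gen9 requires a null
 * PIPE_CONTROL ahead of a VF cache invalidate, and pre-C0 KBL needs an
 * extra DC flush before and a CS stall after the main packet
 * (WaForGAMHang).
 */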
int gen8_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
        bool vf_flush_wa = false, dc_flush_wa = false;
        u32 *cs, flags = 0;
        int len;

        flags |= PIPE_CONTROL_CS_STALL;

        if (mode & EMIT_FLUSH) {
                flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
                flags |= PIPE_CONTROL_FLUSH_ENABLE;
        }

        if (mode & EMIT_INVALIDATE) {
                flags |= PIPE_CONTROL_TLB_INVALIDATE;
                flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_QW_WRITE;
                flags |= PIPE_CONTROL_STORE_DATA_INDEX;

                /*
                 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
                 * pipe control.
                 */
                if (GRAPHICS_VER(rq->engine->i915) == 9)
                        vf_flush_wa = true;

                /* WaForGAMHang:kbl */
                if (IS_KBL_GT_STEP(rq->engine->i915, 0, STEP_C0))
                        dc_flush_wa = true;
        }

        len = 6;

        if (vf_flush_wa)
                len += 6;

        if (dc_flush_wa)
                len += 12;

        cs = intel_ring_begin(rq, len);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        if (vf_flush_wa)
                cs = gen8_emit_pipe_control(cs, 0, 0);

        if (dc_flush_wa)
                cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
                                            0);

        cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);

        if (dc_flush_wa)
                cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);

        intel_ring_advance(rq, cs);

        return 0;
}

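/*
 * gen8_emit_flush_xcs - flush for the non-render (xcs) engines, which use
 * MI_FLUSH_DW rather than PIPE_CONTROL. The post-sync write to the
 * per-context PPHWSP scratch slot provides the command barrier described
 * below.
 */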
int gen8_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
        u32 cmd, *cs;

        cs = intel_ring_begin(rq, 4);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        cmd = MI_FLUSH_DW + 1;

        /*
         * We always require a command barrier so that subsequent
         * commands, such as breadcrumb interrupts, are strictly ordered
         * wrt the contents of the write cache being flushed to memory
         * (and thus being coherent from the CPU).
         */
        cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

        if (mode & EMIT_INVALIDATE) {
                cmd |= MI_INVALIDATE_TLB;
                if (rq->engine->class == VIDEO_DECODE_CLASS)
                        cmd |= MI_INVALIDATE_BSD;
        }

        *cs++ = cmd;
        *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
        *cs++ = 0; /* upper addr */
        *cs++ = 0; /* value */
        intel_ring_advance(rq, cs);

        return 0;
}

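/*
 * gen11_emit_flush_rcs - as gen8_emit_flush_rcs, but the flush and the
 * invalidation are emitted as two separate PIPE_CONTROL packets rather
 * than one combined packet, and the flush additionally covers the tile
 * cache while the invalidate adds the command cache.
 */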
int gen11_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
        if (mode & EMIT_FLUSH) {
                u32 *cs;
                u32 flags = 0;

                flags |= PIPE_CONTROL_CS_STALL;

                flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
                flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
                flags |= PIPE_CONTROL_FLUSH_ENABLE;
                flags |= PIPE_CONTROL_QW_WRITE;
                flags |= PIPE_CONTROL_STORE_DATA_INDEX;

                cs = intel_ring_begin(rq, 6);
                if (IS_ERR(cs))
                        return PTR_ERR(cs);

                cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
                intel_ring_advance(rq, cs);
        }

        if (mode & EMIT_INVALIDATE) {
                u32 *cs;
                u32 flags = 0;

                flags |= PIPE_CONTROL_CS_STALL;

                flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_TLB_INVALIDATE;
                flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_QW_WRITE;
                flags |= PIPE_CONTROL_STORE_DATA_INDEX;

                cs = intel_ring_begin(rq, 6);
                if (IS_ERR(cs))
                        return PTR_ERR(cs);

                cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
                intel_ring_advance(rq, cs);
        }

        return 0;
}

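/*
 * On gen12, MI_ARB_CHECK doubles as the pre-parser control: bit 0 carries
 * the new pre-fetch-disable state and bit 8 appears to act as the
 * write-enable mask for that field (this encoding is our reading of the
 * gen12 command programming notes).
 */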
static u32 preparser_disable(bool state)
{
        return MI_ARB_CHECK | 1 << 8 | state;
}

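/*
 * Each media engine instance has its own AUX table invalidation register;
 * writing AUX_INV to it drops the engine's cached auxiliary (compression
 * metadata) table entries. See the hsdes: 1809175790 references below.
 */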
static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
{
        static const i915_reg_t vd[] = {
                GEN12_VD0_AUX_NV,
                GEN12_VD1_AUX_NV,
                GEN12_VD2_AUX_NV,
                GEN12_VD3_AUX_NV,
        };

        static const i915_reg_t ve[] = {
                GEN12_VE0_AUX_NV,
                GEN12_VE1_AUX_NV,
        };

        if (engine->class == VIDEO_DECODE_CLASS)
                return vd[engine->instance];

        if (engine->class == VIDEO_ENHANCEMENT_CLASS)
                return ve[engine->instance];

        GEM_BUG_ON("unknown aux_inv reg\n");
        return INVALID_MMIO_REG;
}

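/* Emit an LRI poking AUX_INV into @inv_reg, padded to even length. */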
static u32 *gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
{
        *cs++ = MI_LOAD_REGISTER_IMM(1);
        *cs++ = i915_mmio_reg_offset(inv_reg);
        *cs++ = AUX_INV;
        *cs++ = MI_NOOP;

        return cs;
}

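/*
 * gen12_emit_flush_rcs - the gen12 render flush adds an HDC pipeline flush
 * (via the extended PIPE_CONTROL dword 0) and an L3 flush, while the
 * invalidation path is bracketed by a pre-parser disable/enable pair and
 * followed by an invalidation of the render engine's AUX table.
 */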
int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
        if (mode & EMIT_FLUSH) {
                u32 flags = 0;
                u32 *cs;

                flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
                flags |= PIPE_CONTROL_FLUSH_L3;
                flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
                /* Wa_1409600907:tgl,adl-p */
                flags |= PIPE_CONTROL_DEPTH_STALL;
                flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
                flags |= PIPE_CONTROL_FLUSH_ENABLE;

                flags |= PIPE_CONTROL_STORE_DATA_INDEX;
                flags |= PIPE_CONTROL_QW_WRITE;

                flags |= PIPE_CONTROL_CS_STALL;

                cs = intel_ring_begin(rq, 6);
                if (IS_ERR(cs))
                        return PTR_ERR(cs);

                cs = gen12_emit_pipe_control(cs,
                                             PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
                                             flags, LRC_PPHWSP_SCRATCH_ADDR);
                intel_ring_advance(rq, cs);
        }

        if (mode & EMIT_INVALIDATE) {
                u32 flags = 0;
                u32 *cs;

                flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_TLB_INVALIDATE;
                flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;

                flags |= PIPE_CONTROL_STORE_DATA_INDEX;
                flags |= PIPE_CONTROL_QW_WRITE;

                flags |= PIPE_CONTROL_CS_STALL;

                cs = intel_ring_begin(rq, 8 + 4);
                if (IS_ERR(cs))
                        return PTR_ERR(cs);

                /*
                 * Prevent the pre-parser from skipping past the TLB
                 * invalidate and loading a stale page for the batch
                 * buffer / request payload.
                 */
                *cs++ = preparser_disable(true);

                cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);

                /* hsdes: 1809175790 */
                cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);

                *cs++ = preparser_disable(false);
                intel_ring_advance(rq, cs);
        }

        return 0;
}

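/*
 * gen12_emit_flush_xcs - as gen8_emit_flush_xcs, plus the gen12 pre-parser
 * bracket around the invalidation and, for the engines that have an
 * AUX-NV register (see aux_inv_reg()), an AUX table invalidate. BCS0 is
 * masked out of the aux invalidation.
 */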
int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
        intel_engine_mask_t aux_inv = 0;
        u32 cmd, *cs;

        cmd = 4;
        if (mode & EMIT_INVALIDATE)
                cmd += 2;
        if (mode & EMIT_INVALIDATE)
                aux_inv = rq->engine->mask & ~BIT(BCS0);
        if (aux_inv)
                cmd += 2 * hweight32(aux_inv) + 2;

        cs = intel_ring_begin(rq, cmd);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        if (mode & EMIT_INVALIDATE)
                *cs++ = preparser_disable(true);

        cmd = MI_FLUSH_DW + 1;

        /*
         * We always require a command barrier so that subsequent
         * commands, such as breadcrumb interrupts, are strictly ordered
         * wrt the contents of the write cache being flushed to memory
         * (and thus being coherent from the CPU).
         */
        cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

        if (mode & EMIT_INVALIDATE) {
                cmd |= MI_INVALIDATE_TLB;
                if (rq->engine->class == VIDEO_DECODE_CLASS)
                        cmd |= MI_INVALIDATE_BSD;
        }

        *cs++ = cmd;
        *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
        *cs++ = 0; /* upper addr */
        *cs++ = 0; /* value */

        if (aux_inv) { /* hsdes: 1809175790 */
                struct intel_engine_cs *engine;
                unsigned int tmp;

                *cs++ = MI_LOAD_REGISTER_IMM(hweight32(aux_inv));
                for_each_engine_masked(engine, rq->engine->gt, aux_inv, tmp) {
                        *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
                        *cs++ = AUX_INV;
                }
                *cs++ = MI_NOOP;
        }

        if (mode & EMIT_INVALIDATE)
                *cs++ = preparser_disable(false);

        intel_ring_advance(rq, cs);

        return 0;
}

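/* GGTT address of the preemption semaphore slot in the engine's HWSP. */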
static u32 preempt_address(struct intel_engine_cs *engine)
{
        return (i915_ggtt_offset(engine->status_page.vma) +
                I915_GEM_HWS_PREEMPT_ADDR);
}

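/* GGTT offset of this request's seqno slot within its timeline's HWSP. */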
static u32 hwsp_offset(const struct i915_request *rq)
{
        const struct intel_timeline *tl;

        /* Before the request is executed, the timeline is fixed */
        tl = rcu_dereference_protected(rq->timeline,
                                       !i915_request_signaled(rq));

        /* See the comment in i915_request_active_seqno(). */
        return page_mask_bits(tl->hwsp_offset) + offset_in_page(rq->hwsp_seqno);
}

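/*
 * gen8_emit_init_breadcrumb - mark the start of the user payload by
 * writing seqno - 1 (the "initial breadcrumb") to the timeline HWSP;
 * after this point i915_request_started() reports true.
 */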
int gen8_emit_init_breadcrumb(struct i915_request *rq)
{
        u32 *cs;

        GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
        if (!i915_request_timeline(rq)->has_initial_breadcrumb)
                return 0;

        cs = intel_ring_begin(rq, 6);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
        *cs++ = hwsp_offset(rq);
        *cs++ = 0;
        *cs++ = rq->fence.seqno - 1;

        /*
         * Check if we have been preempted before we even get started.
         *
         * After this point i915_request_started() reports true, even if
         * we get preempted and so are no longer running.
         *
         * i915_request_started() is used during preemption processing
         * to decide if the request is currently inside the user payload
         * or spinning on a kernel semaphore (or earlier). For no-preemption
         * requests, we do allow preemption on the semaphore before the user
         * payload, but do not allow preemption once the request is started.
         *
         * i915_request_started() is similarly used during GPU hangs to
         * determine if the user's payload was guilty, and if so, the
         * request is banned. Before the request is started, it is assumed
         * to be unharmed and an innocent victim of another's hang.
         */
        *cs++ = MI_NOOP;
        *cs++ = MI_ARB_CHECK;

        intel_ring_advance(rq, cs);

        /* Record the updated position of the request's payload */
        rq->infix = intel_ring_offset(rq, cs);

        __set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);

        return 0;
}

int gen8_emit_bb_start_noarb(struct i915_request *rq,
                             u64 offset, u32 len,
                             const unsigned int flags)
{
        u32 *cs;

        cs = intel_ring_begin(rq, 4);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        /*
         * WaDisableCtxRestoreArbitration:bdw,chv
         *
         * We don't need to perform MI_ARB_ENABLE as often as we do (in
         * particular all the gen that do not need the w/a at all!), if we
         * took care to make sure that on every switch into this context
         * (both ordinary and for preemption) arbitration was enabled we
         * would be fine.  However, for gen8 there is another w/a that
         * requires us to not preempt inside GPGPU execution, so we keep
         * arbitration disabled for gen8 batches. Arbitration will be
         * re-enabled before we close the request
         * (engine->emit_fini_breadcrumb).
         */
        *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

        /* FIXME(BDW+): Address space and security selectors. */
        *cs++ = MI_BATCH_BUFFER_START_GEN8 |
                (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
        *cs++ = lower_32_bits(offset);
        *cs++ = upper_32_bits(offset);

        intel_ring_advance(rq, cs);

        return 0;
}

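/*
 * gen8_emit_bb_start - start the batch with arbitration enabled, so the
 * user payload may be preempted, then disable arbitration again on return
 * so the context only switches at known points. No-preempt requests fall
 * back to the _noarb variant above.
 */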
int gen8_emit_bb_start(struct i915_request *rq,
                       u64 offset, u32 len,
                       const unsigned int flags)
{
        u32 *cs;

        if (unlikely(i915_request_has_nopreempt(rq)))
                return gen8_emit_bb_start_noarb(rq, offset, len, flags);

        cs = intel_ring_begin(rq, 6);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

        *cs++ = MI_BATCH_BUFFER_START_GEN8 |
                (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
        *cs++ = lower_32_bits(offset);
        *cs++ = upper_32_bits(offset);

        *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
        *cs++ = MI_NOOP;

        intel_ring_advance(rq, cs);

        return 0;
}

static void assert_request_valid(struct i915_request *rq)
{
        struct intel_ring *ring __maybe_unused = rq->ring;

        /* Can we unwind this request without appearing to go forwards? */
        GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
}

/*
 * Reserve space for 2 NOOPs at the end of each request to be
 * used as a workaround for not being allowed to do lite
 * restore with HEAD==TAIL (WaIdleLiteRestore).
 */
static u32 *gen8_emit_wa_tail(struct i915_request *rq, u32 *cs)
{
        /* Ensure there's always at least one preemption point per-request. */
        *cs++ = MI_ARB_CHECK;
        *cs++ = MI_NOOP;
        rq->wa_tail = intel_ring_offset(rq, cs);

        /* Check that entire request is less than half the ring */
        assert_request_valid(rq);

        return cs;
}

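/*
 * emit_preempt_busywait - poll the preemption semaphore in the HWSP. The
 * execlists backend (not GuC) arms this semaphore to stall the engine at
 * the end of a request while a preemption is being processed, giving the
 * scheduler a stable point at which to switch contexts.
 */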
static u32 *emit_preempt_busywait(struct i915_request *rq, u32 *cs)
{
        *cs++ = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first */
        *cs++ = MI_SEMAPHORE_WAIT |
                MI_SEMAPHORE_GLOBAL_GTT |
                MI_SEMAPHORE_POLL |
                MI_SEMAPHORE_SAD_EQ_SDD;
        *cs++ = 0;
        *cs++ = preempt_address(rq->engine);
        *cs++ = 0;
        *cs++ = MI_NOOP;

        return cs;
}

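/*
 * Common tail for the fini breadcrumbs: raise the user interrupt,
 * re-enable arbitration (closing the no-preempt region opened by
 * emit_bb_start), optionally emit the preemption busywait, then record
 * the tail and append the WaIdleLiteRestore padding.
 */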
static __always_inline u32*
gen8_emit_fini_breadcrumb_tail(struct i915_request *rq, u32 *cs)
{
        *cs++ = MI_USER_INTERRUPT;

        *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
        if (intel_engine_has_semaphores(rq->engine) &&
            !intel_uc_uses_guc_submission(&rq->engine->gt->uc))
                cs = emit_preempt_busywait(rq, cs);

        rq->tail = intel_ring_offset(rq, cs);
        assert_ring_tail_valid(rq->ring, rq->tail);

        return gen8_emit_wa_tail(rq, cs);
}

static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs)
{
        return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0);
}

u32 *gen8_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
        return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
}

u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
        cs = gen8_emit_pipe_control(cs,
                                    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
                                    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                                    PIPE_CONTROL_DC_FLUSH_ENABLE,
                                    0);

        /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
        cs = gen8_emit_ggtt_write_rcs(cs,
                                      rq->fence.seqno,
                                      hwsp_offset(rq),
                                      PIPE_CONTROL_FLUSH_ENABLE |
                                      PIPE_CONTROL_CS_STALL);

        return gen8_emit_fini_breadcrumb_tail(rq, cs);
}

u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
        cs = gen8_emit_ggtt_write_rcs(cs,
                                      rq->fence.seqno,
                                      hwsp_offset(rq),
                                      PIPE_CONTROL_CS_STALL |
                                      PIPE_CONTROL_TILE_CACHE_FLUSH |
                                      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
                                      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                                      PIPE_CONTROL_DC_FLUSH_ENABLE |
                                      PIPE_CONTROL_FLUSH_ENABLE);

        return gen8_emit_fini_breadcrumb_tail(rq, cs);
}

/*
 * Note that the CS instruction pre-parser will not stall on the breadcrumb
 * flush and will continue pre-fetching the instructions after it before the
 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
 * of the next request before the memory has been flushed, we're guaranteed that
 * we won't access the batch itself too early.
 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
 * so, if the current request is modifying an instruction in the next request on
 * the same intel_context, we might pre-fetch and then execute the pre-update
 * instruction. To avoid this, the users of self-modifying code should either
 * disable the parser around the code emitting the memory writes, via a new flag
 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
 * the in-kernel use-cases we've opted to use a separate context, see
 * reloc_gpu() as an example.
 * All the above applies only to the instructions themselves. Non-inline data
 * used by the instructions is not pre-fetched.
 */

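/*
 * An illustrative sketch, not part of the driver flow: a gen12 request
 * that rewrites instructions later consumed on the same intel_context
 * could bracket its writes with the pre-parser toggle from
 * preparser_disable() above, along the lines of:
 *
 *	*cs++ = preparser_disable(true);
 *	... emit the self-modifying memory writes ...
 *	*cs++ = preparser_disable(false);
 *
 * The in-kernel users instead emit such writes from a separate context
 * (see reloc_gpu()), which avoids the problem entirely.
 */
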
static u32 *gen12_emit_preempt_busywait(struct i915_request *rq, u32 *cs)
{
        *cs++ = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first */
        *cs++ = MI_SEMAPHORE_WAIT_TOKEN |
                MI_SEMAPHORE_GLOBAL_GTT |
                MI_SEMAPHORE_POLL |
                MI_SEMAPHORE_SAD_EQ_SDD;
        *cs++ = 0;
        *cs++ = preempt_address(rq->engine);
        *cs++ = 0;
        *cs++ = 0;

        return cs;
}

static __always_inline u32*
gen12_emit_fini_breadcrumb_tail(struct i915_request *rq, u32 *cs)
{
        *cs++ = MI_USER_INTERRUPT;

        *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
        if (intel_engine_has_semaphores(rq->engine) &&
            !intel_uc_uses_guc_submission(&rq->engine->gt->uc))
                cs = gen12_emit_preempt_busywait(rq, cs);

        rq->tail = intel_ring_offset(rq, cs);
        assert_ring_tail_valid(rq->ring, rq->tail);

        return gen8_emit_wa_tail(rq, cs);
}

u32 *gen12_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
        /* XXX Stalling flush before seqno write; post-sync not */
        cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0));
        return gen12_emit_fini_breadcrumb_tail(rq, cs);
}

u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
        cs = gen12_emit_ggtt_write_rcs(cs,
                                       rq->fence.seqno,
                                       hwsp_offset(rq),
                                       PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
                                       PIPE_CONTROL_CS_STALL |
                                       PIPE_CONTROL_TILE_CACHE_FLUSH |
                                       PIPE_CONTROL_FLUSH_L3 |
                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                                       /* Wa_1409600907:tgl */
                                       PIPE_CONTROL_DEPTH_STALL |
                                       PIPE_CONTROL_DC_FLUSH_ENABLE |
                                       PIPE_CONTROL_FLUSH_ENABLE);

        return gen12_emit_fini_breadcrumb_tail(rq, cs);
}