// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include "gen8_engine_cs.h"
#include "i915_drv.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_lrc.h"
#include "intel_ring.h"
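
/*
 * Render-engine flush for gen8: translate the EMIT_FLUSH/EMIT_INVALIDATE
 * request into a PIPE_CONTROL flags word, then emit it together with the
 * extra dummy/DC-flush PIPE_CONTROLs demanded by the gen9 and KBL
 * workarounds below.
 */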
int gen8_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	bool vf_flush_wa = false, dc_flush_wa = false;
	u32 *cs, flags = 0;
	int len;

	flags |= PIPE_CONTROL_CS_STALL;

	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}

	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_STORE_DATA_INDEX;

		/*
		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
		 * pipe control.
		 */
		if (GRAPHICS_VER(rq->engine->i915) == 9)
			vf_flush_wa = true;

		/* WaForGAMHang:kbl */
		if (IS_KBL_GRAPHICS_STEP(rq->engine->i915, 0, STEP_C0))
			dc_flush_wa = true;
	}

	len = 6;

	if (vf_flush_wa)
		len += 6;

	if (dc_flush_wa)
		len += 12;

	cs = intel_ring_begin(rq, len);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	if (vf_flush_wa)
		cs = gen8_emit_pipe_control(cs, 0, 0);

	if (dc_flush_wa)
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
					    0);

	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);

	if (dc_flush_wa)
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);

	intel_ring_advance(rq, cs);

	return 0;
}
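
/*
 * Non-render engines flush via MI_FLUSH_DW instead of PIPE_CONTROL, with a
 * post-sync write into the per-context scratch slot so that later commands
 * (e.g. the breadcrumb) are ordered behind the flush.
 */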
int gen8_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	u32 cmd, *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cmd = MI_FLUSH_DW + 1;

	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	if (mode & EMIT_INVALIDATE) {
		cmd |= MI_INVALIDATE_TLB;
		if (rq->engine->class == VIDEO_DECODE_CLASS)
			cmd |= MI_INVALIDATE_BSD;
	}

	*cs++ = cmd;
	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
	*cs++ = 0; /* upper addr */
	*cs++ = 0; /* value */
	intel_ring_advance(rq, cs);

	return 0;
}
int gen11_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	if (mode & EMIT_FLUSH) {
		u32 *cs;
		u32 flags = 0;

		flags |= PIPE_CONTROL_CS_STALL;

		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_STORE_DATA_INDEX;

		cs = intel_ring_begin(rq, 6);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
		intel_ring_advance(rq, cs);
	}

	if (mode & EMIT_INVALIDATE) {
		u32 *cs;
		u32 flags = 0;

		flags |= PIPE_CONTROL_CS_STALL;

		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_STORE_DATA_INDEX;

		cs = intel_ring_begin(rq, 6);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
		intel_ring_advance(rq, cs);
	}

	return 0;
}
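
/*
 * Gen12+ MI_ARB_CHECK carries an inline pre-parser control: bit 8 selects
 * the pre-fetch-disable encoding and bit 0 the new state. Used below to
 * keep the pre-parser from racing ahead of TLB invalidations.
 */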
static u32 preparser_disable(bool state)
{
	return MI_ARB_CHECK | 1 << 8 | state;
}
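
/*
 * Invalidate the AUX table by writing the engine's *_AUX_NV register with an
 * LRI; gsi_offset relocates the MMIO address on GTs with a non-zero GSI base
 * (standalone media).
 */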
u32 *gen12_emit_aux_table_inv(struct intel_gt *gt, u32 *cs, const i915_reg_t inv_reg)
{
	u32 gsi_offset = gt->uncore->gsi_offset;

	*cs++ = MI_LOAD_REGISTER_IMM(1) | MI_LRI_MMIO_REMAP_EN;
	*cs++ = i915_mmio_reg_offset(inv_reg) + gsi_offset;
	*cs++ = AUX_INV;
	*cs++ = MI_NOOP;

	return cs;
}
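
/*
 * Early MTL steppings want a dummy PIPE_CONTROL with a depth-cache flush
 * emitted ahead of the real flush/invalidate; a no-op everywhere else.
 */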
static int mtl_dummy_pipe_control(struct i915_request *rq)
{
	if (IS_MTL_GRAPHICS_STEP(rq->engine->i915, M, STEP_A0, STEP_B0) ||
	    IS_MTL_GRAPHICS_STEP(rq->engine->i915, P, STEP_A0, STEP_B0)) {
		u32 *cs;

		/* dummy PIPE_CONTROL + depth flush */
		cs = intel_ring_begin(rq, 6);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		cs = gen12_emit_pipe_control(cs,
					     0,
					     PIPE_CONTROL_DEPTH_CACHE_FLUSH,
					     LRC_PPHWSP_SCRATCH_ADDR);
		intel_ring_advance(rq, cs);
	}

	return 0;
}
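
/*
 * Gen12 render/compute flush: same idea as gen8, but using the extended
 * PIPE_CONTROL dword0 (HDC pipeline flush), masking out 3D-only flags on
 * parts without a 3D pipeline or on compute engines, and bracketing the
 * invalidation with a pre-parser disable plus an AUX table invalidation on
 * non flat-CCS platforms.
 */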
int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	struct intel_engine_cs *engine = rq->engine;

	if (mode & EMIT_FLUSH) {
		u32 flags = 0;
		u32 *cs;
		int err;

		err = mtl_dummy_pipe_control(rq);
		if (err)
			return err;

		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
		flags |= PIPE_CONTROL_FLUSH_L3;
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		/* Wa_1409600907:tgl,adl-p */
		flags |= PIPE_CONTROL_DEPTH_STALL;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;

		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
		flags |= PIPE_CONTROL_QW_WRITE;

		flags |= PIPE_CONTROL_CS_STALL;

		if (!HAS_3D_PIPELINE(engine->i915))
			flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
		else if (engine->class == COMPUTE_CLASS)
			flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;

		cs = intel_ring_begin(rq, 6);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		cs = gen12_emit_pipe_control(cs,
					     PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
					     flags, LRC_PPHWSP_SCRATCH_ADDR);
		intel_ring_advance(rq, cs);
	}

	if (mode & EMIT_INVALIDATE) {
		u32 flags = 0;
		u32 *cs, count;
		int err;

		err = mtl_dummy_pipe_control(rq);
		if (err)
			return err;

		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;

		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
		flags |= PIPE_CONTROL_QW_WRITE;

		flags |= PIPE_CONTROL_CS_STALL;

		if (!HAS_3D_PIPELINE(engine->i915))
			flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
		else if (engine->class == COMPUTE_CLASS)
			flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;

		if (!HAS_FLAT_CCS(rq->engine->i915))
			count = 8 + 4;
		else
			count = 8;

		cs = intel_ring_begin(rq, count);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		/*
		 * Prevent the pre-parser from skipping past the TLB
		 * invalidate and loading a stale page for the batch
		 * buffer / request payload.
		 */
		*cs++ = preparser_disable(true);

		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);

		if (!HAS_FLAT_CCS(rq->engine->i915)) {
			/* hsdes: 1809175790 */
			cs = gen12_emit_aux_table_inv(rq->engine->gt,
						      cs, GEN12_GFX_CCS_AUX_NV);
		}

		*cs++ = preparser_disable(false);
		intel_ring_advance(rq, cs);
	}

	return 0;
}
int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	intel_engine_mask_t aux_inv = 0;
	u32 cmd, *cs;

	cmd = 4;
	if (mode & EMIT_INVALIDATE) {
		cmd += 2;

		if (!HAS_FLAT_CCS(rq->engine->i915) &&
		    (rq->engine->class == VIDEO_DECODE_CLASS ||
		     rq->engine->class == VIDEO_ENHANCEMENT_CLASS)) {
			aux_inv = rq->engine->mask &
				~GENMASK(_BCS(I915_MAX_BCS - 1), BCS0);
			if (aux_inv)
				cmd += 4;
		}
	}

	cs = intel_ring_begin(rq, cmd);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	if (mode & EMIT_INVALIDATE)
		*cs++ = preparser_disable(true);

	cmd = MI_FLUSH_DW + 1;

	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	if (mode & EMIT_INVALIDATE) {
		cmd |= MI_INVALIDATE_TLB;
		if (rq->engine->class == VIDEO_DECODE_CLASS)
			cmd |= MI_INVALIDATE_BSD;
	}

	*cs++ = cmd;
	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
	*cs++ = 0; /* upper addr */
	*cs++ = 0; /* value */

	if (aux_inv) { /* hsdes: 1809175790 */
		if (rq->engine->class == VIDEO_DECODE_CLASS)
			cs = gen12_emit_aux_table_inv(rq->engine->gt,
						      cs, GEN12_VD0_AUX_NV);
		else
			cs = gen12_emit_aux_table_inv(rq->engine->gt,
						      cs, GEN12_VE0_AUX_NV);
	}

	if (mode & EMIT_INVALIDATE)
		*cs++ = preparser_disable(false);

	intel_ring_advance(rq, cs);

	return 0;
}
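
/* GGTT address of the preemption semaphore in the engine's status page. */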
static u32 preempt_address(struct intel_engine_cs *engine)
{
	return (i915_ggtt_offset(engine->status_page.vma) +
		I915_GEM_HWS_PREEMPT_ADDR);
}
static u32 hwsp_offset(const struct i915_request *rq)
{
	const struct intel_timeline *tl;

	/* Before the request is executed, the timeline is fixed */
	tl = rcu_dereference_protected(rq->timeline,
				       !i915_request_signaled(rq));

	/* See the comment in i915_request_active_seqno(). */
	return page_mask_bits(tl->hwsp_offset) + offset_in_page(rq->hwsp_seqno);
}
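
/*
 * Emit the initial breadcrumb (seqno - 1) and an arbitration point ahead of
 * the user payload, marking the boundary at which the request counts as
 * started for preemption and hang handling (see the comment below).
 */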
int gen8_emit_init_breadcrumb(struct i915_request *rq)
{
	u32 *cs;

	GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
		return 0;

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
	*cs++ = hwsp_offset(rq);
	*cs++ = 0;
	*cs++ = rq->fence.seqno - 1;

	/*
	 * Check if we have been preempted before we even get started.
	 *
	 * After this point i915_request_started() reports true, even if
	 * we get preempted and so are no longer running.
	 *
	 * i915_request_started() is used during preemption processing
	 * to decide if the request is currently inside the user payload
	 * or spinning on a kernel semaphore (or earlier). For no-preemption
	 * requests, we do allow preemption on the semaphore before the user
	 * payload, but do not allow preemption once the request is started.
	 *
	 * i915_request_started() is similarly used during GPU hangs to
	 * determine if the user's payload was guilty, and if so, the
	 * request is banned. Before the request is started, it is assumed
	 * to be unharmed and an innocent victim of another's hang.
	 */
	*cs++ = MI_NOOP;
	*cs++ = MI_ARB_CHECK;

	intel_ring_advance(rq, cs);

	/* Record the updated position of the request's payload */
	rq->infix = intel_ring_offset(rq, cs);

	__set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);

	return 0;
}
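
/*
 * XeHP batch-buffer start: reload RING_PREDICATE_RESULT from the context's
 * indirect workaround page around the user batch and chain through a small
 * fixup batch (see the inline comment) so a stray MI_SET_PREDICATE left
 * behind cannot stall subsequent ring execution.
 */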
static int __xehp_emit_bb_start(struct i915_request *rq,
				u64 offset, u32 len,
				const unsigned int flags,
				u32 arb)
{
	struct intel_context *ce = rq->context;
	u32 wa_offset = lrc_indirect_bb(ce);
	u32 *cs;

	GEM_BUG_ON(!ce->wa_bb_page);

	cs = intel_ring_begin(rq, 12);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_ARB_ON_OFF | arb;

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(RING_PREDICATE_RESULT(0));
	*cs++ = wa_offset + DG2_PREDICATE_RESULT_WA;
	*cs++ = 0;

	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
	*cs++ = lower_32_bits(offset);
	*cs++ = upper_32_bits(offset);

	/* Fixup stray MI_SET_PREDICATE as it prevents us executing the ring */
	*cs++ = MI_BATCH_BUFFER_START_GEN8;
	*cs++ = wa_offset + DG2_PREDICATE_RESULT_BB;
	*cs++ = 0;

	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	intel_ring_advance(rq, cs);

	return 0;
}
int xehp_emit_bb_start_noarb(struct i915_request *rq,
			     u64 offset, u32 len,
			     const unsigned int flags)
{
	return __xehp_emit_bb_start(rq, offset, len, flags, MI_ARB_DISABLE);
}
int xehp_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       const unsigned int flags)
{
	return __xehp_emit_bb_start(rq, offset, len, flags, MI_ARB_ENABLE);
}
int gen8_emit_bb_start_noarb(struct i915_request *rq,
			     u64 offset, u32 len,
			     const unsigned int flags)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/*
	 * WaDisableCtxRestoreArbitration:bdw,chv
	 *
	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
	 * particular all the gen that do not need the w/a at all!), if we
	 * took care to make sure that on every switch into this context
	 * (both ordinary and for preemption) that arbitration was enabled
	 * we would be fine. However, for gen8 there is another w/a that
	 * requires us to not preempt inside GPGPU execution, so we keep
	 * arbitration disabled for gen8 batches. Arbitration will be
	 * re-enabled before we close the request
	 * (engine->emit_fini_breadcrumb).
	 */
	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* FIXME(BDW+): Address space and security selectors. */
	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
	*cs++ = lower_32_bits(offset);
	*cs++ = upper_32_bits(offset);

	intel_ring_advance(rq, cs);

	return 0;
}
int gen8_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       const unsigned int flags)
{
	u32 *cs;

	if (unlikely(i915_request_has_nopreempt(rq)))
		return gen8_emit_bb_start_noarb(rq, offset, len, flags);

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
	*cs++ = lower_32_bits(offset);
	*cs++ = upper_32_bits(offset);

	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
	*cs++ = MI_NOOP;

	intel_ring_advance(rq, cs);

	return 0;
}
static void assert_request_valid(struct i915_request *rq)
{
	struct intel_ring *ring __maybe_unused = rq->ring;

	/* Can we unwind this request without appearing to go forwards? */
	GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
}
/*
 * Reserve space for 2 NOOPs at the end of each request to be
 * used as a workaround for not being allowed to do lite
 * restore with HEAD==TAIL (WaIdleLiteRestore).
 */
static u32 *gen8_emit_wa_tail(struct i915_request *rq, u32 *cs)
{
	/* Ensure there's always at least one preemption point per-request. */
	*cs++ = MI_ARB_CHECK;
	*cs++ = MI_NOOP;
	rq->wa_tail = intel_ring_offset(rq, cs);

	/* Check that entire request is less than half the ring */
	assert_request_valid(rq);

	return cs;
}
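
/*
 * Busy-wait on the preemption semaphore in the status page; only emitted for
 * execlists submission (the GuC manages preemption itself).
 */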
static u32 *emit_preempt_busywait(struct i915_request *rq, u32 *cs)
{
	*cs++ = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first */
	*cs++ = MI_SEMAPHORE_WAIT |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		MI_SEMAPHORE_SAD_EQ_SDD;
	*cs++ = 0;
	*cs++ = preempt_address(rq->engine);
	*cs++ = 0;
	*cs++ = MI_NOOP;

	return cs;
}
static __always_inline u32*
gen8_emit_fini_breadcrumb_tail(struct i915_request *rq, u32 *cs)
{
	*cs++ = MI_USER_INTERRUPT;

	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	if (intel_engine_has_semaphores(rq->engine) &&
	    !intel_uc_uses_guc_submission(&rq->engine->gt->uc))
		cs = emit_preempt_busywait(rq, cs);

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return gen8_emit_wa_tail(rq, cs);
}
static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs)
{
	return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0);
}
u32 *gen8_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
}
u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	cs = gen8_emit_pipe_control(cs,
				    PIPE_CONTROL_CS_STALL |
				    PIPE_CONTROL_TLB_INVALIDATE |
				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
				    PIPE_CONTROL_DC_FLUSH_ENABLE,
				    0);

	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
	cs = gen8_emit_ggtt_write_rcs(cs,
				      rq->fence.seqno,
				      hwsp_offset(rq),
				      PIPE_CONTROL_FLUSH_ENABLE |
				      PIPE_CONTROL_CS_STALL);

	return gen8_emit_fini_breadcrumb_tail(rq, cs);
}
u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	cs = gen8_emit_pipe_control(cs,
				    PIPE_CONTROL_CS_STALL |
				    PIPE_CONTROL_TLB_INVALIDATE |
				    PIPE_CONTROL_TILE_CACHE_FLUSH |
				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
				    PIPE_CONTROL_DC_FLUSH_ENABLE,
				    0);

	/* XXX: Look at gen8_emit_fini_breadcrumb_rcs */
	cs = gen8_emit_ggtt_write_rcs(cs,
				      rq->fence.seqno,
				      hwsp_offset(rq),
				      PIPE_CONTROL_FLUSH_ENABLE |
				      PIPE_CONTROL_CS_STALL);

	return gen8_emit_fini_breadcrumb_tail(rq, cs);
}
/*
 * Note that the CS instruction pre-parser will not stall on the breadcrumb
 * flush and will continue pre-fetching the instructions after it before the
 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
 * of the next request before the memory has been flushed, we're guaranteed that
 * we won't access the batch itself too early.
 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
 * so, if the current request is modifying an instruction in the next request on
 * the same intel_context, we might pre-fetch and then execute the pre-update
 * instruction. To avoid this, the users of self-modifying code should either
 * disable the parser around the code emitting the memory writes, via a new flag
 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
 * the in-kernel use-cases we've opted to use a separate context, see
 * reloc_gpu() as an example.
 * All the above applies only to the instructions themselves. Non-inline data
 * used by the instructions is not pre-fetched.
 */
static u32 *gen12_emit_preempt_busywait(struct i915_request *rq, u32 *cs)
{
	*cs++ = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first */
	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		MI_SEMAPHORE_SAD_EQ_SDD;
	*cs++ = 0;
	*cs++ = preempt_address(rq->engine);
	*cs++ = 0;
	*cs++ = 0;

	return cs;
}
/* Wa_14014475959:dg2 */
#define CCS_SEMAPHORE_PPHWSP_OFFSET	0x540
static u32 ccs_semaphore_offset(struct i915_request *rq)
{
	return i915_ggtt_offset(rq->context->state) +
		(LRC_PPHWSP_PN * PAGE_SIZE) + CCS_SEMAPHORE_PPHWSP_OFFSET;
}
/* Wa_14014475959:dg2 */
static u32 *ccs_emit_wa_busywait(struct i915_request *rq, u32 *cs)
{
	int i;

	*cs++ = MI_ATOMIC_INLINE | MI_ATOMIC_GLOBAL_GTT | MI_ATOMIC_CS_STALL |
		MI_ATOMIC_MOVE;
	*cs++ = ccs_semaphore_offset(rq);
	*cs++ = 0;
	*cs++ = 1;

	/*
	 * When MI_ATOMIC_INLINE_DATA set this command must be 11 DW + (1 NOP)
	 * to align. 4 DWs above + 8 filler DWs here.
	 */
	for (i = 0; i < 8; ++i)
		*cs++ = 0;

	*cs++ = MI_SEMAPHORE_WAIT |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		MI_SEMAPHORE_SAD_EQ_SDD;
	*cs++ = 0;
	*cs++ = ccs_semaphore_offset(rq);
	*cs++ = 0;

	return cs;
}
static __always_inline u32*
gen12_emit_fini_breadcrumb_tail(struct i915_request *rq, u32 *cs)
{
	*cs++ = MI_USER_INTERRUPT;

	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	if (intel_engine_has_semaphores(rq->engine) &&
	    !intel_uc_uses_guc_submission(&rq->engine->gt->uc))
		cs = gen12_emit_preempt_busywait(rq, cs);

	/* Wa_14014475959:dg2 */
	if (intel_engine_uses_wa_hold_ccs_switchout(rq->engine))
		cs = ccs_emit_wa_busywait(rq, cs);

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return gen8_emit_wa_tail(rq, cs);
}
u32 *gen12_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	/* XXX Stalling flush before seqno write; post-sync not */
	cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0));
	return gen12_emit_fini_breadcrumb_tail(rq, cs);
}
u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	struct drm_i915_private *i915 = rq->engine->i915;
	u32 flags = (PIPE_CONTROL_CS_STALL |
		     PIPE_CONTROL_TLB_INVALIDATE |
		     PIPE_CONTROL_TILE_CACHE_FLUSH |
		     PIPE_CONTROL_FLUSH_L3 |
		     PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		     PIPE_CONTROL_DC_FLUSH_ENABLE |
		     PIPE_CONTROL_FLUSH_ENABLE);

	if (IS_MTL_GRAPHICS_STEP(i915, M, STEP_A0, STEP_B0) ||
	    IS_MTL_GRAPHICS_STEP(i915, P, STEP_A0, STEP_B0))
		/* dummy PIPE_CONTROL + depth flush */
		cs = gen12_emit_pipe_control(cs, 0,
					     PIPE_CONTROL_DEPTH_CACHE_FLUSH, 0);

	if (GRAPHICS_VER(i915) == 12 && GRAPHICS_VER_FULL(i915) < IP_VER(12, 50))
		flags |= PIPE_CONTROL_DEPTH_STALL;

	if (!HAS_3D_PIPELINE(rq->engine->i915))
		flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
	else if (rq->engine->class == COMPUTE_CLASS)
		flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;

	cs = gen12_emit_pipe_control(cs, PIPE_CONTROL0_HDC_PIPELINE_FLUSH, flags, 0);

	/* XXX: Look at gen8_emit_fini_breadcrumb_rcs */
	cs = gen12_emit_ggtt_write_rcs(cs,
				       rq->fence.seqno,
				       hwsp_offset(rq),
				       0,
				       PIPE_CONTROL_FLUSH_ENABLE |
				       PIPE_CONTROL_CS_STALL);

	return gen12_emit_fini_breadcrumb_tail(rq, cs);
}