// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include "gen8_engine_cs.h"
#include "i915_drv.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_lrc.h"
#include "intel_ring.h"

int gen8_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
        bool vf_flush_wa = false, dc_flush_wa = false;
        u32 *cs, flags = 0;
        int len;

        flags |= PIPE_CONTROL_CS_STALL;

        if (mode & EMIT_FLUSH) {
                flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
                flags |= PIPE_CONTROL_FLUSH_ENABLE;
        }

        if (mode & EMIT_INVALIDATE) {
                flags |= PIPE_CONTROL_TLB_INVALIDATE;
                flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_QW_WRITE;
                flags |= PIPE_CONTROL_STORE_DATA_INDEX;

                /*
                 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
                 * pipe control.
                 */
                if (GRAPHICS_VER(rq->engine->i915) == 9)
                        vf_flush_wa = true;

                /* WaForGAMHang:kbl */
                if (IS_KBL_GRAPHICS_STEP(rq->engine->i915, 0, STEP_C0))
                        dc_flush_wa = true;
        }

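        /*
         * Each gen8_emit_pipe_control() below is a 6-dword packet: 6 for
         * the main flush, 6 more for the preceding null PIPE_CONTROL of
         * the VF-cache w/a, and 12 more when the GAM-hang w/a brackets
         * the flush with an extra DC flush before and a CS stall after.
         */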
        len = 6;

        if (vf_flush_wa)
                len += 6;

        if (dc_flush_wa)
                len += 12;

        cs = intel_ring_begin(rq, len);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        if (vf_flush_wa)
                cs = gen8_emit_pipe_control(cs, 0, 0);

        if (dc_flush_wa)
                cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
                                            0);

        cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);

        if (dc_flush_wa)
                cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);

        intel_ring_advance(rq, cs);

        return 0;
}

int gen8_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
        u32 cmd, *cs;

        cs = intel_ring_begin(rq, 4);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

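        /*
         * The low bits of an MI command encode its dword length, so the
         * +1 below extends MI_FLUSH_DW by one dword to cover the full
         * post-sync write built at the end of this function (address
         * low/high plus immediate value).
         */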
        cmd = MI_FLUSH_DW + 1;

        /*
         * We always require a command barrier so that subsequent
         * commands, such as breadcrumb interrupts, are strictly ordered
         * wrt the contents of the write cache being flushed to memory
         * (and thus being coherent from the CPU).
         */
        cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

        if (mode & EMIT_INVALIDATE) {
                cmd |= MI_INVALIDATE_TLB;
                if (rq->engine->class == VIDEO_DECODE_CLASS)
                        cmd |= MI_INVALIDATE_BSD;
        }

        *cs++ = cmd;
        *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
        *cs++ = 0; /* upper addr */
        *cs++ = 0; /* value */
        intel_ring_advance(rq, cs);

        return 0;
}

int gen11_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
        if (mode & EMIT_FLUSH) {
                u32 *cs;
                u32 flags = 0;

                flags |= PIPE_CONTROL_CS_STALL;

                flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
                flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
                flags |= PIPE_CONTROL_FLUSH_ENABLE;
                flags |= PIPE_CONTROL_QW_WRITE;
                flags |= PIPE_CONTROL_STORE_DATA_INDEX;

                cs = intel_ring_begin(rq, 6);
                if (IS_ERR(cs))
                        return PTR_ERR(cs);

                cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
                intel_ring_advance(rq, cs);
        }

        if (mode & EMIT_INVALIDATE) {
                u32 *cs;
                u32 flags = 0;

                flags |= PIPE_CONTROL_CS_STALL;

                flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_TLB_INVALIDATE;
                flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_QW_WRITE;
                flags |= PIPE_CONTROL_STORE_DATA_INDEX;

                cs = intel_ring_begin(rq, 6);
                if (IS_ERR(cs))
                        return PTR_ERR(cs);

                cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
                intel_ring_advance(rq, cs);
        }

        return 0;
}

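/*
 * On gen12 the MI_ARB_CHECK instruction doubles as the pre-parser control:
 * bit 8 is the mask that latches the new value and bit 0 requests the
 * pre-fetch disable, so this builds a single dword that switches the
 * command pre-parser off (state == true) or back on (state == false).
 */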
static u32 preparser_disable(bool state)
{
        return MI_ARB_CHECK | 1 << 8 | state;
}

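/*
 * Invalidate an engine's AUX table (compression metadata) TLB by writing
 * AUX_INV into its *_AUX_NV register with a single MI_LOAD_REGISTER_IMM;
 * the trailing MI_NOOP keeps the emitted dword count even.
 */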
u32 *gen12_emit_aux_table_inv(u32 *cs, const i915_reg_t inv_reg)
{
        *cs++ = MI_LOAD_REGISTER_IMM(1) | MI_LRI_MMIO_REMAP_EN;
        *cs++ = i915_mmio_reg_offset(inv_reg);
        *cs++ = AUX_INV;
        *cs++ = MI_NOOP;

        return cs;
}

int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
        struct intel_engine_cs *engine = rq->engine;

        if (mode & EMIT_FLUSH) {
                u32 flags = 0;
                u32 *cs;

                flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
                flags |= PIPE_CONTROL_FLUSH_L3;
                flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
                /* Wa_1409600907:tgl,adl-p */
                flags |= PIPE_CONTROL_DEPTH_STALL;
                flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
                flags |= PIPE_CONTROL_FLUSH_ENABLE;

                flags |= PIPE_CONTROL_STORE_DATA_INDEX;
                flags |= PIPE_CONTROL_QW_WRITE;

                flags |= PIPE_CONTROL_CS_STALL;

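                /*
                 * Not every PIPE_CONTROL bit exists on every part: with
                 * no 3D pipeline at all, drop the 3D-architecture flags;
                 * on a compute engine, drop the flags that only the
                 * render (3D) engine implements.
                 */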
                if (!HAS_3D_PIPELINE(engine->i915))
                        flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
                else if (engine->class == COMPUTE_CLASS)
                        flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;

                cs = intel_ring_begin(rq, 6);
                if (IS_ERR(cs))
                        return PTR_ERR(cs);

                cs = gen12_emit_pipe_control(cs,
                                             PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
                                             flags, LRC_PPHWSP_SCRATCH_ADDR);
                intel_ring_advance(rq, cs);
        }

        if (mode & EMIT_INVALIDATE) {
                u32 flags = 0;
                u32 *cs, count;

                flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_TLB_INVALIDATE;
                flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;

                flags |= PIPE_CONTROL_STORE_DATA_INDEX;
                flags |= PIPE_CONTROL_QW_WRITE;

                flags |= PIPE_CONTROL_CS_STALL;

                if (!HAS_3D_PIPELINE(engine->i915))
                        flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
                else if (engine->class == COMPUTE_CLASS)
                        flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;

                if (!HAS_FLAT_CCS(rq->engine->i915))
                        count = 8 + 4;
                else
                        count = 8;

                cs = intel_ring_begin(rq, count);
                if (IS_ERR(cs))
                        return PTR_ERR(cs);

                /*
                 * Prevent the pre-parser from skipping past the TLB
                 * invalidate and loading a stale page for the batch
                 * buffer / request payload.
                 */
                *cs++ = preparser_disable(true);

                cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);

                if (!HAS_FLAT_CCS(rq->engine->i915)) {
                        /* hsdes: 1809175790 */
                        cs = gen12_emit_aux_table_inv(cs, GEN12_GFX_CCS_AUX_NV);
                }

                *cs++ = preparser_disable(false);
                intel_ring_advance(rq, cs);
        }

        return 0;
}

int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
        intel_engine_mask_t aux_inv = 0;
        u32 cmd, *cs;

        cmd = 4;
        if (mode & EMIT_INVALIDATE) {
                cmd += 2;

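                /*
                 * Without flat CCS, the video decode/enhancement engines
                 * also need their AUX table invalidated; the LRI sequence
                 * emitted by gen12_emit_aux_table_inv() adds 4 dwords.
                 */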
                if (!HAS_FLAT_CCS(rq->engine->i915) &&
                    (rq->engine->class == VIDEO_DECODE_CLASS ||
                     rq->engine->class == VIDEO_ENHANCEMENT_CLASS)) {
                        aux_inv = rq->engine->mask &
                                ~GENMASK(_BCS(I915_MAX_BCS - 1), BCS0);
                        if (aux_inv)
                                cmd += 4;
                }
        }

        cs = intel_ring_begin(rq, cmd);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        if (mode & EMIT_INVALIDATE)
                *cs++ = preparser_disable(true);

        cmd = MI_FLUSH_DW + 1;

        /*
         * We always require a command barrier so that subsequent
         * commands, such as breadcrumb interrupts, are strictly ordered
         * wrt the contents of the write cache being flushed to memory
         * (and thus being coherent from the CPU).
         */
        cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

        if (mode & EMIT_INVALIDATE) {
                cmd |= MI_INVALIDATE_TLB;
                if (rq->engine->class == VIDEO_DECODE_CLASS)
                        cmd |= MI_INVALIDATE_BSD;
        }

        *cs++ = cmd;
        *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
        *cs++ = 0; /* upper addr */
        *cs++ = 0; /* value */

        if (aux_inv) { /* hsdes: 1809175790 */
                if (rq->engine->class == VIDEO_DECODE_CLASS)
                        cs = gen12_emit_aux_table_inv(cs, GEN12_VD0_AUX_NV);
                else
                        cs = gen12_emit_aux_table_inv(cs, GEN12_VE0_AUX_NV);
        }

        if (mode & EMIT_INVALIDATE)
                *cs++ = preparser_disable(false);

        intel_ring_advance(rq, cs);

        return 0;
}

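/*
 * GGTT address of the preemption semaphore slot in the engine's status
 * page; the busywait breadcrumbs below poll it.
 */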
static u32 preempt_address(struct intel_engine_cs *engine)
{
        return (i915_ggtt_offset(engine->status_page.vma) +
                I915_GEM_HWS_PREEMPT_ADDR);
}

static u32 hwsp_offset(const struct i915_request *rq)
{
        const struct intel_timeline *tl;

        /* Before the request is executed, the timeline is fixed */
        tl = rcu_dereference_protected(rq->timeline,
                                       !i915_request_signaled(rq));

        /* See the comment in i915_request_active_seqno(). */
        return page_mask_bits(tl->hwsp_offset) + offset_in_page(rq->hwsp_seqno);
}

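/*
 * Emit the "initial breadcrumb" that marks the boundary between a
 * request's semaphore/wait preamble and its user payload: write
 * seqno - 1 into the timeline's HWSP (after which i915_request_started()
 * reports true) and follow it with an arbitration point before the
 * payload begins.
 */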
int gen8_emit_init_breadcrumb(struct i915_request *rq)
{
        u32 *cs;

        GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
        if (!i915_request_timeline(rq)->has_initial_breadcrumb)
                return 0;

        cs = intel_ring_begin(rq, 6);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
        *cs++ = hwsp_offset(rq);
        *cs++ = 0;
        *cs++ = rq->fence.seqno - 1;

        /*
         * Check if we have been preempted before we even get started.
         *
         * After this point i915_request_started() reports true, even if
         * we get preempted and so are no longer running.
         *
         * i915_request_started() is used during preemption processing
         * to decide if the request is currently inside the user payload
         * or spinning on a kernel semaphore (or earlier). For no-preemption
         * requests, we do allow preemption on the semaphore before the user
         * payload, but do not allow preemption once the request is started.
         *
         * i915_request_started() is similarly used during GPU hangs to
         * determine if the user's payload was guilty, and if so, the
         * request is banned. Before the request is started, it is assumed
         * to be unharmed and an innocent victim of another's hang.
         */
        *cs++ = MI_NOOP;
        *cs++ = MI_ARB_CHECK;

        intel_ring_advance(rq, cs);

        /* Record the updated position of the request's payload */
        rq->infix = intel_ring_offset(rq, cs);

        __set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);

        return 0;
}

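/*
 * Gen12.50+ batch buffer start: reload RING_PREDICATE_RESULT from a
 * scratch slot in the context's indirect w/a batch, jump to the user
 * batch, then chain into the small w/a batch that fixes up a stray
 * MI_SET_PREDICATE which would otherwise stop the ring.
 */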
static int __gen125_emit_bb_start(struct i915_request *rq,
                                  u64 offset, u32 len,
                                  const unsigned int flags,
                                  u32 arb)
{
        struct intel_context *ce = rq->context;
        u32 wa_offset = lrc_indirect_bb(ce);
        u32 *cs;

        cs = intel_ring_begin(rq, 12);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        *cs++ = MI_ARB_ON_OFF | arb;

        *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
                MI_SRM_LRM_GLOBAL_GTT |
                MI_LRI_LRM_CS_MMIO;
        *cs++ = i915_mmio_reg_offset(RING_PREDICATE_RESULT(0));
        *cs++ = wa_offset + DG2_PREDICATE_RESULT_WA;
        *cs++ = 0;

        *cs++ = MI_BATCH_BUFFER_START_GEN8 |
                (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
        *cs++ = lower_32_bits(offset);
        *cs++ = upper_32_bits(offset);

        /* Fixup stray MI_SET_PREDICATE as it prevents us executing the ring */
        *cs++ = MI_BATCH_BUFFER_START_GEN8;
        *cs++ = wa_offset + DG2_PREDICATE_RESULT_BB;
        *cs++ = 0;

        *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

        intel_ring_advance(rq, cs);

        return 0;
}

int gen125_emit_bb_start_noarb(struct i915_request *rq,
                               u64 offset, u32 len,
                               const unsigned int flags)
{
        return __gen125_emit_bb_start(rq, offset, len, flags, MI_ARB_DISABLE);
}

int gen125_emit_bb_start(struct i915_request *rq,
                         u64 offset, u32 len,
                         const unsigned int flags)
{
        return __gen125_emit_bb_start(rq, offset, len, flags, MI_ARB_ENABLE);
}

int gen8_emit_bb_start_noarb(struct i915_request *rq,
                             u64 offset, u32 len,
                             const unsigned int flags)
{
        u32 *cs;

        cs = intel_ring_begin(rq, 4);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        /*
         * WaDisableCtxRestoreArbitration:bdw,chv
         *
         * We would not need to emit MI_ARB_ENABLE as often as we do (in
         * particular on all the gens that do not need the w/a at all!) if
         * we made sure that arbitration was enabled on every switch into
         * this context, both ordinary and for preemption. However, for
         * gen8 there is another w/a that requires us to not preempt inside
         * GPGPU execution, so we keep arbitration disabled for gen8
         * batches. Arbitration will be re-enabled before we close the
         * request (engine->emit_fini_breadcrumb).
         */
        *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

        /* FIXME(BDW+): Address space and security selectors. */
        *cs++ = MI_BATCH_BUFFER_START_GEN8 |
                (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
        *cs++ = lower_32_bits(offset);
        *cs++ = upper_32_bits(offset);

        intel_ring_advance(rq, cs);

        return 0;
}

int gen8_emit_bb_start(struct i915_request *rq,
                       u64 offset, u32 len,
                       const unsigned int flags)
{
        u32 *cs;

        if (unlikely(i915_request_has_nopreempt(rq)))
                return gen8_emit_bb_start_noarb(rq, offset, len, flags);

        cs = intel_ring_begin(rq, 6);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

        *cs++ = MI_BATCH_BUFFER_START_GEN8 |
                (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
        *cs++ = lower_32_bits(offset);
        *cs++ = upper_32_bits(offset);

        *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
        *cs++ = MI_NOOP;

        intel_ring_advance(rq, cs);

        return 0;
}

static void assert_request_valid(struct i915_request *rq)
{
        struct intel_ring *ring __maybe_unused = rq->ring;

        /* Can we unwind this request without appearing to go forwards? */
        GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
}

/*
 * Reserve space for 2 NOOPs at the end of each request to be
 * used as a workaround for not being allowed to do lite
 * restore with HEAD==TAIL (WaIdleLiteRestore).
 */
static u32 *gen8_emit_wa_tail(struct i915_request *rq, u32 *cs)
{
        /* Ensure there's always at least one preemption point per-request. */
        *cs++ = MI_ARB_CHECK;
        *cs++ = MI_NOOP;
        rq->wa_tail = intel_ring_offset(rq, cs);

        /* Check that entire request is less than half the ring */
        assert_request_valid(rq);

        return cs;
}

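/*
 * Spin on the engine's preemption semaphore: MI_SEMAPHORE_WAIT polls the
 * dword at preempt_address() until it reads zero, so the submission side
 * can hold the CS here simply by leaving a non-zero value in that slot.
 * The leading MI_ARB_CHECK kicks the engine from IDLE to ACTIVE so the
 * poll takes effect immediately.
 */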
static u32 *emit_preempt_busywait(struct i915_request *rq, u32 *cs)
{
        *cs++ = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first */
        *cs++ = MI_SEMAPHORE_WAIT |
                MI_SEMAPHORE_GLOBAL_GTT |
                MI_SEMAPHORE_POLL |
                MI_SEMAPHORE_SAD_EQ_SDD;
        *cs++ = 0;
        *cs++ = preempt_address(rq->engine);
        *cs++ = 0;
        *cs++ = MI_NOOP;

        return cs;
}

static __always_inline u32*
gen8_emit_fini_breadcrumb_tail(struct i915_request *rq, u32 *cs)
{
        *cs++ = MI_USER_INTERRUPT;

        *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
        if (intel_engine_has_semaphores(rq->engine) &&
            !intel_uc_uses_guc_submission(&rq->engine->gt->uc))
                cs = emit_preempt_busywait(rq, cs);

        rq->tail = intel_ring_offset(rq, cs);
        assert_ring_tail_valid(rq->ring, rq->tail);

        return gen8_emit_wa_tail(rq, cs);
}

static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs)
{
        return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0);
}

u32 *gen8_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
        return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
}

u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
        cs = gen8_emit_pipe_control(cs,
                                    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
                                    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                                    PIPE_CONTROL_DC_FLUSH_ENABLE,
                                    0);

        /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
        cs = gen8_emit_ggtt_write_rcs(cs,
                                      rq->fence.seqno,
                                      hwsp_offset(rq),
                                      PIPE_CONTROL_FLUSH_ENABLE |
                                      PIPE_CONTROL_CS_STALL);

        return gen8_emit_fini_breadcrumb_tail(rq, cs);
}

u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
        cs = gen8_emit_ggtt_write_rcs(cs,
                                      rq->fence.seqno,
                                      hwsp_offset(rq),
                                      PIPE_CONTROL_CS_STALL |
                                      PIPE_CONTROL_TILE_CACHE_FLUSH |
                                      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
                                      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                                      PIPE_CONTROL_DC_FLUSH_ENABLE |
                                      PIPE_CONTROL_FLUSH_ENABLE);

        return gen8_emit_fini_breadcrumb_tail(rq, cs);
}

/*
 * Note that the CS instruction pre-parser will not stall on the breadcrumb
 * flush and will continue pre-fetching the instructions after it before the
 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
 * of the next request before the memory has been flushed, we're guaranteed that
 * we won't access the batch itself too early.
 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
 * so, if the current request is modifying an instruction in the next request on
 * the same intel_context, we might pre-fetch and then execute the pre-update
 * instruction. To avoid this, the users of self-modifying code should either
 * disable the parser around the code emitting the memory writes, via a new flag
 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
 * the in-kernel use-cases we've opted to use a separate context, see
 * reloc_gpu() as an example.
 * All the above applies only to the instructions themselves. Non-inline data
 * used by the instructions is not pre-fetched.
 */

static u32 *gen12_emit_preempt_busywait(struct i915_request *rq, u32 *cs)
{
        *cs++ = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first */
        *cs++ = MI_SEMAPHORE_WAIT_TOKEN |
                MI_SEMAPHORE_GLOBAL_GTT |
                MI_SEMAPHORE_POLL |
                MI_SEMAPHORE_SAD_EQ_SDD;
        *cs++ = 0;
        *cs++ = preempt_address(rq->engine);
        *cs++ = 0;
        *cs++ = 0;

        return cs;
}

/* Wa_14014475959:dg2 */
#define CCS_SEMAPHORE_PPHWSP_OFFSET     0x540
static u32 ccs_semaphore_offset(struct i915_request *rq)
{
        return i915_ggtt_offset(rq->context->state) +
                (LRC_PPHWSP_PN * PAGE_SIZE) + CCS_SEMAPHORE_PPHWSP_OFFSET;
}

/* Wa_14014475959:dg2 */
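/*
 * Hold the context on the CCS engine at the end of the request: the
 * MI_ATOMIC below writes 1 into a semaphore slot in the context's own
 * PPHWSP and the following MI_SEMAPHORE_WAIT spins until that slot reads
 * zero again, so the context cannot be switched out until the other side
 * of the workaround clears the semaphore.
 */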
static u32 *ccs_emit_wa_busywait(struct i915_request *rq, u32 *cs)
{
        int i;

        *cs++ = MI_ATOMIC_INLINE | MI_ATOMIC_GLOBAL_GTT | MI_ATOMIC_CS_STALL |
                MI_ATOMIC_MOVE;
        *cs++ = ccs_semaphore_offset(rq);
        *cs++ = 0;
        *cs++ = 1;

        /*
         * When MI_ATOMIC_INLINE_DATA is set, this command must be 11 DW
         * (+ 1 NOP to align): the 4 DWs above plus the 8 filler DWs here.
         */
        for (i = 0; i < 8; ++i)
                *cs++ = 0;

        *cs++ = MI_SEMAPHORE_WAIT |
                MI_SEMAPHORE_GLOBAL_GTT |
                MI_SEMAPHORE_POLL |
                MI_SEMAPHORE_SAD_EQ_SDD;
        *cs++ = 0;
        *cs++ = ccs_semaphore_offset(rq);
        *cs++ = 0;

        return cs;
}

static __always_inline u32*
gen12_emit_fini_breadcrumb_tail(struct i915_request *rq, u32 *cs)
{
        *cs++ = MI_USER_INTERRUPT;

        *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
        if (intel_engine_has_semaphores(rq->engine) &&
            !intel_uc_uses_guc_submission(&rq->engine->gt->uc))
                cs = gen12_emit_preempt_busywait(rq, cs);

        /* Wa_14014475959:dg2 */
        if (intel_engine_uses_wa_hold_ccs_switchout(rq->engine))
                cs = ccs_emit_wa_busywait(rq, cs);

        rq->tail = intel_ring_offset(rq, cs);
        assert_ring_tail_valid(rq->ring, rq->tail);

        return gen8_emit_wa_tail(rq, cs);
}

u32 *gen12_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
        /* XXX Stalling flush before seqno write; post-sync not */
        cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0));
        return gen12_emit_fini_breadcrumb_tail(rq, cs);
}

u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
        struct drm_i915_private *i915 = rq->engine->i915;
        u32 flags = (PIPE_CONTROL_CS_STALL |
                     PIPE_CONTROL_TILE_CACHE_FLUSH |
                     PIPE_CONTROL_FLUSH_L3 |
                     PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                     PIPE_CONTROL_DC_FLUSH_ENABLE |
                     PIPE_CONTROL_FLUSH_ENABLE);

        if (GRAPHICS_VER(i915) == 12 && GRAPHICS_VER_FULL(i915) < IP_VER(12, 50))
                /* Wa_1409600907 */
                flags |= PIPE_CONTROL_DEPTH_STALL;

        if (!HAS_3D_PIPELINE(rq->engine->i915))
                flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
        else if (rq->engine->class == COMPUTE_CLASS)
                flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;

        cs = gen12_emit_ggtt_write_rcs(cs,
                                       rq->fence.seqno,
                                       hwsp_offset(rq),
                                       PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
                                       flags);

        return gen12_emit_fini_breadcrumb_tail(rq, cs);
}