// SPDX-License-Identifier: MIT
/*
 * Copyright © 2019 Intel Corporation
 */

#include "gen7_renderclear.h"
#include "i915_drv.h"
#include "intel_gpu_commands.h"
#include "intel_gt_regs.h"

#define GT3_INLINE_DATA_DELAYS 0x1E00
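/* Check that the writer advanced exactly to the end of its allocation */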
#define batch_advance(Y, CS) GEM_BUG_ON((Y)->end != (CS))

struct cb_kernel {
	const void *data;
	u32 size;
};

#define CB_KERNEL(name) { .data = (name), .size = sizeof(name) }

#include "ivb_clear_kernel.c"
static const struct cb_kernel cb_kernel_ivb = CB_KERNEL(ivb_clear_kernel);

#include "hsw_clear_kernel.c"
static const struct cb_kernel cb_kernel_hsw = CB_KERNEL(hsw_clear_kernel);

struct batch_chunk {
	struct i915_vma *vma;
	u32 offset;
	u32 *start;
	u32 *end;
	u32 max_items;
};

struct batch_vals {
	u32 max_threads;
	u32 state_start;
	u32 surface_start;
	u32 surface_height;
	u32 surface_width;
	u32 size;
};

static int num_primitives(const struct batch_vals *bv)
{
	/*
	 * We need to saturate the GPU with work in order to dispatch
	 * a shader on every HW thread, and clear the thread-local registers.
	 * In short, we have to dispatch work faster than the shaders can
	 * run in order to fill the EU and occupy each HW thread.
	 */
	return bv->max_threads;
}

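/*
 * Per-SKU defaults: max_threads sets how many MEDIA_OBJECTs we dispatch
 * (see num_primitives()), which in turn sizes the command chunk, the
 * state chunk and the scratch surface.
 */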
static void
batch_get_defaults(struct drm_i915_private *i915, struct batch_vals *bv)
{
	if (IS_HASWELL(i915)) {
		switch (INTEL_INFO(i915)->gt) {
		default:
		case 1:
			bv->max_threads = 70;
			break;
		case 2:
			bv->max_threads = 140;
			break;
		case 3:
			bv->max_threads = 280;
			break;
		}
		bv->surface_height = 16 * 16;
		bv->surface_width = 32 * 2 * 16;
	} else {
		switch (INTEL_INFO(i915)->gt) {
		default:
		case 1: /* including vlv */
			bv->max_threads = 36;
			break;
		case 2:
			bv->max_threads = 128;
			break;
		}
		bv->surface_height = 16 * 8;
		bv->surface_width = 32 * 16;
	}
	bv->state_start = round_up(SZ_1K + num_primitives(bv) * 64, SZ_4K);
	bv->surface_start = bv->state_start + SZ_4K;
	bv->size = bv->surface_start + bv->surface_height * bv->surface_width;
}

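/* Carve out a chunk of the batch for writing dwords into */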
static void batch_init(struct batch_chunk *bc,
		       struct i915_vma *vma,
		       u32 *start, u32 offset, u32 max_bytes)
{
	bc->vma = vma;
	bc->offset = offset;
	bc->start = start + bc->offset / sizeof(*bc->start);
	bc->end = bc->start;
	bc->max_items = max_bytes / sizeof(*bc->start);
}

static u32 batch_offset(const struct batch_chunk *bc, u32 *cs)
{
	return (cs - bc->start) * sizeof(*bc->start) + bc->offset;
}

static u32 batch_addr(const struct batch_chunk *bc)
{
	return i915_vma_offset(bc->vma);
}

static void batch_add(struct batch_chunk *bc, const u32 d)
{
	GEM_BUG_ON((bc->end - bc->start) >= bc->max_items);
	*bc->end++ = d;
}

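/*
 * Reserve space for @items dwords, first zero-padding the chunk up to
 * @align bytes (alignment is skipped when @align is 0).
 */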
static u32 *batch_alloc_items(struct batch_chunk *bc, u32 align, u32 items)
{
	u32 *map;

	if (align) {
		u32 *end = PTR_ALIGN(bc->end, align);

		memset32(bc->end, 0, end - bc->end);
		bc->end = end;
	}

	map = bc->end;
	bc->end += items;

	return map;
}

static u32 *batch_alloc_bytes(struct batch_chunk *bc, u32 align, u32 bytes)
{
	GEM_BUG_ON(!IS_ALIGNED(bytes, sizeof(*bc->start)));
	return batch_alloc_items(bc, align, bytes / sizeof(*bc->start));
}

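/*
 * Write a SURFACE_STATE for the B8G8R8A8_UNORM scratch surface that the
 * clear kernel renders into, returning its offset within the state chunk.
 */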
static u32
gen7_fill_surface_state(struct batch_chunk *state,
			const u32 dst_offset,
			const struct batch_vals *bv)
{
	u32 surface_h = bv->surface_height;
	u32 surface_w = bv->surface_width;
	u32 *cs = batch_alloc_items(state, 32, 8);
	u32 offset = batch_offset(state, cs);

#define SURFACE_2D 1
#define SURFACEFORMAT_B8G8R8A8_UNORM 0x0C0
#define RENDER_CACHE_READ_WRITE 1

	*cs++ = SURFACE_2D << 29 |
		(SURFACEFORMAT_B8G8R8A8_UNORM << 18) |
		(RENDER_CACHE_READ_WRITE << 8);

	*cs++ = batch_addr(state) + dst_offset;

	*cs++ = ((surface_h / 4 - 1) << 16) | (surface_w / 4 - 1);
	*cs++ = surface_w;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
#define SHADER_CHANNELS(r, g, b, a) \
	(((r) << 25) | ((g) << 22) | ((b) << 19) | ((a) << 16))
	*cs++ = SHADER_CHANNELS(4, 5, 6, 7);
	batch_advance(state, cs);

	return offset;
}

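/* A single binding table entry, pointing at the surface state above */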
static u32
gen7_fill_binding_table(struct batch_chunk *state,
			const struct batch_vals *bv)
{
	u32 surface_start =
		gen7_fill_surface_state(state, bv->surface_start, bv);
	u32 *cs = batch_alloc_items(state, 32, 8);
	u32 offset = batch_offset(state, cs);

	*cs++ = surface_start - state->offset;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	batch_advance(state, cs);

	return offset;
}

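/* Copy the clear kernel into the state chunk at 64-byte alignment */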
static u32
gen7_fill_kernel_data(struct batch_chunk *state,
		      const u32 *data,
		      const u32 size)
{
	return batch_offset(state,
			    memcpy(batch_alloc_bytes(state, 64, size),
				   data, size));
}

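/*
 * Build the interface descriptor: kernel entry point plus its binding
 * table; the remaining (count - 1) descriptor slots are zeroed.
 */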
static u32
gen7_fill_interface_descriptor(struct batch_chunk *state,
			       const struct batch_vals *bv,
			       const struct cb_kernel *kernel,
			       unsigned int count)
{
	u32 kernel_offset =
		gen7_fill_kernel_data(state, kernel->data, kernel->size);
	u32 binding_table = gen7_fill_binding_table(state, bv);
	u32 *cs = batch_alloc_items(state, 32, 8 * count);
	u32 offset = batch_offset(state, cs);

	*cs++ = kernel_offset;
	*cs++ = (1 << 7) | (1 << 13);
	*cs++ = 0;
	*cs++ = (binding_table - state->offset) | 1;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;

	/* 1 - 63 dummy idds */
	memset32(cs, 0x00, (count - 1) * 8);
	batch_advance(state, cs + (count - 1) * 8);

	return offset;
}

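/* All state bases point back into our own batch (see batch_addr()) */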
static void
gen7_emit_state_base_address(struct batch_chunk *batch,
			     u32 surface_state_base)
{
	u32 *cs = batch_alloc_items(batch, 0, 10);

	*cs++ = STATE_BASE_ADDRESS | (10 - 2);
	/* general */
	*cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY;
	/* surface */
	*cs++ = (batch_addr(batch) + surface_state_base) | BASE_ADDRESS_MODIFY;
	/* dynamic */
	*cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY;
	/* indirect */
	*cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY;
	/* instruction */
	*cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY;

	/* general/dynamic/indirect/instruction access bounds */
	*cs++ = 0;
	*cs++ = BASE_ADDRESS_MODIFY;
	*cs++ = 0;
	*cs++ = BASE_ADDRESS_MODIFY;
	batch_advance(batch, cs);
}

static void
gen7_emit_vfe_state(struct batch_chunk *batch,
		    const struct batch_vals *bv,
		    u32 urb_size, u32 curbe_size,
		    u32 mode)
{
	u32 threads = bv->max_threads - 1;
	u32 *cs = batch_alloc_items(batch, 32, 8);

	*cs++ = MEDIA_VFE_STATE | (8 - 2);

	/* scratch buffer */
	*cs++ = 0;

	/* number of threads & urb entries for GPGPU vs Media Mode */
	*cs++ = threads << 16 | 1 << 8 | mode << 2;

	*cs++ = 0;

	/* urb entry size & curbe size in 256-bit units */
	*cs++ = urb_size << 16 | curbe_size;

	/* scoreboard */
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	batch_advance(batch, cs);
}

static void
gen7_emit_interface_descriptor_load(struct batch_chunk *batch,
				    const u32 interface_descriptor,
				    unsigned int count)
{
	u32 *cs = batch_alloc_items(batch, 8, 4);

	*cs++ = MEDIA_INTERFACE_DESCRIPTOR_LOAD | (4 - 2);
	*cs++ = 0;
	*cs++ = count * 8 * sizeof(*cs);

	/*
	 * interface descriptor address - it is relative to the dynamic state
	 * base address
	 */
	*cs++ = interface_descriptor;
	batch_advance(batch, cs);
}

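/*
 * One MEDIA_OBJECT spawns one HW thread; the inline payload gives each
 * thread a distinct (x, y) coordinate for the clear kernel to consume.
 */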
static void
gen7_emit_media_object(struct batch_chunk *batch,
		       unsigned int media_object_index)
{
	unsigned int x_offset = (media_object_index % 16) * 64;
	unsigned int y_offset = (media_object_index / 16) * 16;
	unsigned int pkt = 6 + 3;
	u32 *cs;

	cs = batch_alloc_items(batch, 8, pkt);

	*cs++ = MEDIA_OBJECT | (pkt - 2);

	/* interface descriptor offset */
	*cs++ = 0;

	/* without indirect data */
	*cs++ = 0;
	*cs++ = 0;

	/* scoreboard */
	*cs++ = 0;
	*cs++ = 0;

	/* inline */
	*cs++ = y_offset << 16 | x_offset;
	*cs++ = 0;
	*cs++ = GT3_INLINE_DATA_DELAYS;

	batch_advance(batch, cs);
}

static void gen7_emit_pipeline_flush(struct batch_chunk *batch)
{
	u32 *cs = batch_alloc_items(batch, 0, 4);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		PIPE_CONTROL_DC_FLUSH_ENABLE |
		PIPE_CONTROL_CS_STALL;
	*cs++ = 0;
	*cs++ = 0;

	batch_advance(batch, cs);
}

static void gen7_emit_pipeline_invalidate(struct batch_chunk *batch)
{
	u32 *cs = batch_alloc_items(batch, 0, 10);

	/* ivb: Stall before STATE_CACHE_INVALIDATE */
	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_STALL_AT_SCOREBOARD |
		PIPE_CONTROL_CS_STALL;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;

	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;

	batch_advance(batch, cs);
}

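/*
 * Assemble the whole batch: commands at offset 0, kernel and state in a
 * separate chunk at bv->state_start, scratch surface following it.
 */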
static void emit_batch(struct i915_vma * const vma,
		       u32 *start,
		       const struct batch_vals *bv)
{
	struct drm_i915_private *i915 = vma->vm->i915;
	const unsigned int desc_count = 1;
	const unsigned int urb_size = 1;
	struct batch_chunk cmds, state;
	u32 descriptors;
	unsigned int i;

	batch_init(&cmds, vma, start, 0, bv->state_start);
	batch_init(&state, vma, start, bv->state_start, SZ_4K);

	descriptors = gen7_fill_interface_descriptor(&state, bv,
						     IS_HASWELL(i915) ?
						     &cb_kernel_hsw :
						     &cb_kernel_ivb,
						     desc_count);

	/* Reset inherited context registers */
	gen7_emit_pipeline_flush(&cmds);
	gen7_emit_pipeline_invalidate(&cmds);
	batch_add(&cmds, MI_LOAD_REGISTER_IMM(2));
	batch_add(&cmds, i915_mmio_reg_offset(CACHE_MODE_0_GEN7));
	batch_add(&cmds, 0xffff0000 |
		  (((IS_IVYBRIDGE(i915) && INTEL_INFO(i915)->gt == 1) ||
		    IS_VALLEYVIEW(i915)) ?
		   HIZ_RAW_STALL_OPT_DISABLE :
		   0));
	batch_add(&cmds, i915_mmio_reg_offset(CACHE_MODE_1));
	batch_add(&cmds, 0xffff0000 | PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
	gen7_emit_pipeline_invalidate(&cmds);
	gen7_emit_pipeline_flush(&cmds);

	/* Switch to the media pipeline and our base address */
	gen7_emit_pipeline_invalidate(&cmds);
	batch_add(&cmds, PIPELINE_SELECT | PIPELINE_SELECT_MEDIA);
	batch_add(&cmds, MI_NOOP);
	gen7_emit_pipeline_invalidate(&cmds);

	gen7_emit_pipeline_flush(&cmds);
	gen7_emit_state_base_address(&cmds, descriptors);
	gen7_emit_pipeline_invalidate(&cmds);

	/* Set the clear-residual kernel state */
	gen7_emit_vfe_state(&cmds, bv, urb_size - 1, 0, 0);
	gen7_emit_interface_descriptor_load(&cmds, descriptors, desc_count);

	/* Execute the kernel on all HW threads */
	for (i = 0; i < num_primitives(bv); i++)
		gen7_emit_media_object(&cmds, i);

	batch_add(&cmds, MI_BATCH_BUFFER_END);
}

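/*
 * Write the clear-residuals batch into @vma; call with @vma == NULL to
 * query the buffer size required.
 */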
int gen7_setup_clear_gpr_bb(struct intel_engine_cs * const engine,
			    struct i915_vma * const vma)
{
	struct batch_vals bv;
	u32 *batch;

	batch_get_defaults(engine->i915, &bv);
	if (!vma)
		return bv.size;

	GEM_BUG_ON(vma->obj->base.size < bv.size);

	batch = i915_gem_object_pin_map(vma->obj, I915_MAP_WC);
	if (IS_ERR(batch))
		return PTR_ERR(batch);

	emit_batch(vma, memset(batch, 0, bv.size), &bv);

	i915_gem_object_flush_map(vma->obj);
	__i915_gem_object_release_map(vma->obj);

	return 0;
}