Git Repo - linux.git/blob - drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
Merge tag 'amd-drm-next-6.5-2023-06-09' of https://gitlab.freedesktop.org/agd5f/linux...
[linux.git] / drivers / gpu / drm / i915 / gt / intel_engine_heartbeat.c
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2019 Intel Corporation
4  */
5
6 #include "i915_drv.h"
7 #include "i915_request.h"
8
9 #include "intel_context.h"
10 #include "intel_engine_heartbeat.h"
11 #include "intel_engine_pm.h"
12 #include "intel_engine.h"
13 #include "intel_gt.h"
14 #include "intel_reset.h"
15
16 /*
17  * While the engine is active, we send a periodic pulse along the engine
18  * to check on its health and to flush any idle-barriers. If that request
19  * is stuck, and we fail to preempt it, we declare the engine hung and
20  * issue a reset -- in the hope that restores progress.
21  */
22
23 static bool next_heartbeat(struct intel_engine_cs *engine)
24 {
25         struct i915_request *rq;
26         long delay;
27
28         delay = READ_ONCE(engine->props.heartbeat_interval_ms);
29
30         rq = engine->heartbeat.systole;
31
32         /*
33          * FIXME: The final period extension is disabled if the period has been
34          * modified from the default. This is to prevent issues with certain
35          * selftests which override the value and expect specific behaviour.
36          * Once the selftests have been updated to either cope with variable
37          * heartbeat periods (or to override the pre-emption timeout as well,
38          * or just to add a selftest specific override of the extension), the
39          * generic override can be removed.
40          */
41         if (rq && rq->sched.attr.priority >= I915_PRIORITY_BARRIER &&
42             delay == engine->defaults.heartbeat_interval_ms) {
43                 long longer;
44
45                 /*
46                  * The final try is at the highest priority possible. Up until now
47                  * a pre-emption might not even have been attempted. So make sure
48                  * this last attempt allows enough time for a pre-emption to occur.
49                  */
50                 longer = READ_ONCE(engine->props.preempt_timeout_ms) * 2;
51                 longer = intel_clamp_heartbeat_interval_ms(engine, longer);
52                 if (longer > delay)
53                         delay = longer;
54         }
55
56         if (!delay)
57                 return false;
58
59         delay = msecs_to_jiffies_timeout(delay);
60         if (delay >= HZ)
61                 delay = round_jiffies_up_relative(delay);
62         mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay + 1);
63
64         return true;
65 }
66
67 static struct i915_request *
68 heartbeat_create(struct intel_context *ce, gfp_t gfp)
69 {
70         struct i915_request *rq;
71
72         intel_context_enter(ce);
73         rq = __i915_request_create(ce, gfp);
74         intel_context_exit(ce);
75
76         return rq;
77 }
78
79 static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
80 {
81         engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
82         i915_request_add_active_barriers(rq);
83         if (!engine->heartbeat.systole && intel_engine_has_heartbeat(engine))
84                 engine->heartbeat.systole = i915_request_get(rq);
85 }
86
87 static void heartbeat_commit(struct i915_request *rq,
88                              const struct i915_sched_attr *attr)
89 {
90         idle_pulse(rq->engine, rq);
91
92         __i915_request_commit(rq);
93         __i915_request_queue(rq, attr);
94 }
95
96 static void show_heartbeat(const struct i915_request *rq,
97                            struct intel_engine_cs *engine)
98 {
99         struct drm_printer p = drm_debug_printer("heartbeat");
100
101         if (!rq) {
102                 intel_engine_dump(engine, &p,
103                                   "%s heartbeat not ticking\n",
104                                   engine->name);
105         } else {
106                 intel_engine_dump(engine, &p,
107                                   "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n",
108                                   engine->name,
109                                   rq->fence.context,
110                                   rq->fence.seqno,
111                                   rq->sched.attr.priority);
112         }
113 }
114
115 static void
116 reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
117 {
118         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
119                 show_heartbeat(rq, engine);
120
121         if (intel_engine_uses_guc(engine))
122                 /*
123                  * GuC itself is toast or GuC's hang detection
124                  * is disabled. Either way, need to find the
125                  * hang culprit manually.
126                  */
127                 intel_guc_find_hung_context(engine);
128
129         intel_gt_handle_error(engine->gt, engine->mask,
130                               I915_ERROR_CAPTURE,
131                               "stopped heartbeat on %s",
132                               engine->name);
133 }
134
/*
 * Delayed-work handler for engine->heartbeat.work.
 *
 * On each run, after dropping a heartbeat request (systole) that has
 * completed, the worker does one of the following:
 *  - returns immediately if the engine is not awake;
 *  - skips straight to re-arming if the GT is wedged;
 *  - calls reset_engine() if the engine's scheduler is disabled;
 *  - if a heartbeat request is still outstanding and a full interval has
 *    passed since it was emitted: leaves it alone if it has not been
 *    submitted yet, raises its priority by one step if possible, and
 *    otherwise calls reset_engine();
 *  - if there is no outstanding heartbeat, emits a new request on the
 *    kernel context at I915_PRIORITY_MIN.
 *
 * Every path past the awake check ends at "out", where the worker is
 * re-armed through next_heartbeat() and the engine-pm reference is dropped.
 */
static void heartbeat(struct work_struct *wrk)
{
        struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
        struct intel_engine_cs *engine =
                container_of(wrk, typeof(*engine), heartbeat.work.work);
        struct intel_context *ce = engine->kernel_context;
        struct i915_request *rq;
        unsigned long serial;

        /* Just in case everything has gone horribly wrong, give it a kick */
        intel_engine_flush_submission(engine);

        /* Drop the tracked heartbeat once it has completed. */
        rq = engine->heartbeat.systole;
        if (rq && i915_request_completed(rq)) {
                i915_request_put(rq);
                engine->heartbeat.systole = NULL;
        }

        /*
         * Only proceed while the engine is awake. On success an engine-pm
         * reference is held until the intel_engine_pm_put() at "out".
         */
        if (!intel_engine_pm_get_if_awake(engine))
                return;

        if (intel_gt_is_wedged(engine->gt))
                goto out;

        if (i915_sched_engine_disabled(engine->sched_engine)) {
                reset_engine(engine, engine->heartbeat.systole);
                goto out;
        }

        /*
         * A heartbeat is still outstanding. rq still refers to it here: the
         * systole was not cleared above, so it is the value read into rq.
         */
        if (engine->heartbeat.systole) {
                long delay = READ_ONCE(engine->props.heartbeat_interval_ms);

                /* Safeguard against too-fast worker invocations */
                if (!time_after(jiffies,
                                rq->emitted_jiffies + msecs_to_jiffies(delay)))
                        goto out;

                if (!i915_sw_fence_signaled(&rq->submit)) {
                        /*
                         * Not yet submitted, system is stalled.
                         *
                         * This more often happens for ring submission,
                         * where all contexts are funnelled into a common
                         * ringbuffer. If one context is blocked on an
                         * external fence, not only is it not submitted,
                         * but all other contexts, including the kernel
                         * context are stuck waiting for the signal.
                         */
                } else if (engine->sched_engine->schedule &&
                           rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
                        /*
                         * Gradually raise the priority of the heartbeat to
                         * give high priority work [which presumably desires
                         * low latency and no jitter] the chance to naturally
                         * complete before being preempted.
                         */
                        /*
                         * Pick the next step above the current priority:
                         * 0, then I915_PRIORITY_HEARTBEAT, then
                         * I915_PRIORITY_BARRIER.
                         */
                        attr.priority = 0;
                        if (rq->sched.attr.priority >= attr.priority)
                                attr.priority = I915_PRIORITY_HEARTBEAT;
                        if (rq->sched.attr.priority >= attr.priority)
                                attr.priority = I915_PRIORITY_BARRIER;

                        local_bh_disable();
                        engine->sched_engine->schedule(rq, &attr);
                        local_bh_enable();
                } else {
                        /*
                         * Submitted, and either there is no schedule()
                         * callback or the request is already at barrier
                         * priority or above: no further step is available.
                         */
                        reset_engine(engine, rq);
                }

                /* Restart the interval used by the too-fast check above. */
                rq->emitted_jiffies = jiffies;
                goto out;
        }

        /*
         * idle_pulse() sets wakeref_serial to engine->serial + 1 when a
         * pulse is emitted; no new pulse is sent while the two are equal.
         * NOTE(review): presumably this means nothing has been submitted
         * since the last pulse -- confirm against where engine->serial is
         * advanced.
         */
        serial = READ_ONCE(engine->serial);
        if (engine->wakeref_serial == serial)
                goto out;

        if (!mutex_trylock(&ce->timeline->mutex)) {
                /* Unable to lock the kernel timeline, is the engine stuck? */
                /*
                 * Record this serial as blocked; only raise an error if the
                 * previously recorded blocked serial was the same one.
                 */
                if (xchg(&engine->heartbeat.blocked, serial) == serial)
                        intel_gt_handle_error(engine->gt, engine->mask,
                                              I915_ERROR_CAPTURE,
                                              "no heartbeat on %s",
                                              engine->name);
                goto out;
        }

        /* If the request cannot be created, skip this beat and just re-arm. */
        rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
        if (IS_ERR(rq))
                goto unlock;

        heartbeat_commit(rq, &attr);

unlock:
        mutex_unlock(&ce->timeline->mutex);
out:
        /*
         * If hangcheck is disabled, or next_heartbeat() did not schedule
         * another run, drop the reference to the tracked heartbeat request.
         */
        if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine))
                i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
        intel_engine_pm_put(engine);
}
235
236 void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine)
237 {
238         if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL)
239                 return;
240
241         next_heartbeat(engine);
242 }
243
244 void intel_engine_park_heartbeat(struct intel_engine_cs *engine)
245 {
246         if (cancel_delayed_work(&engine->heartbeat.work))
247                 i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
248 }
249
250 void intel_gt_unpark_heartbeats(struct intel_gt *gt)
251 {
252         struct intel_engine_cs *engine;
253         enum intel_engine_id id;
254
255         for_each_engine(engine, gt, id)
256                 if (intel_engine_pm_is_awake(engine))
257                         intel_engine_unpark_heartbeat(engine);
258 }
259
260 void intel_gt_park_heartbeats(struct intel_gt *gt)
261 {
262         struct intel_engine_cs *engine;
263         enum intel_engine_id id;
264
265         for_each_engine(engine, gt, id)
266                 intel_engine_park_heartbeat(engine);
267 }
268
269 void intel_engine_init_heartbeat(struct intel_engine_cs *engine)
270 {
271         INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat);
272 }
273
274 static int __intel_engine_pulse(struct intel_engine_cs *engine)
275 {
276         struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
277         struct intel_context *ce = engine->kernel_context;
278         struct i915_request *rq;
279
280         lockdep_assert_held(&ce->timeline->mutex);
281         GEM_BUG_ON(!intel_engine_has_preemption(engine));
282         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
283
284         rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
285         if (IS_ERR(rq))
286                 return PTR_ERR(rq);
287
288         __set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags);
289
290         heartbeat_commit(rq, &attr);
291         GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);
292
293         return 0;
294 }
295
296 static unsigned long set_heartbeat(struct intel_engine_cs *engine,
297                                    unsigned long delay)
298 {
299         unsigned long old;
300
301         old = xchg(&engine->props.heartbeat_interval_ms, delay);
302         if (delay)
303                 intel_engine_unpark_heartbeat(engine);
304         else
305                 intel_engine_park_heartbeat(engine);
306
307         return old;
308 }
309
310 int intel_engine_set_heartbeat(struct intel_engine_cs *engine,
311                                unsigned long delay)
312 {
313         struct intel_context *ce = engine->kernel_context;
314         int err = 0;
315
316         if (!delay && !intel_engine_has_preempt_reset(engine))
317                 return -ENODEV;
318
319         /* FIXME: Remove together with equally marked hack in next_heartbeat. */
320         if (delay != engine->defaults.heartbeat_interval_ms &&
321             delay < 2 * engine->props.preempt_timeout_ms) {
322                 if (intel_engine_uses_guc(engine))
323                         drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may downgrade individual engine resets to full GPU resets!\n",
324                                    engine->name);
325                 else
326                         drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may cause engine resets to target innocent contexts!\n",
327                                    engine->name);
328         }
329
330         intel_engine_pm_get(engine);
331
332         err = mutex_lock_interruptible(&ce->timeline->mutex);
333         if (err)
334                 goto out_rpm;
335
336         if (delay != engine->props.heartbeat_interval_ms) {
337                 unsigned long saved = set_heartbeat(engine, delay);
338
339                 /* recheck current execution */
340                 if (intel_engine_has_preemption(engine)) {
341                         err = __intel_engine_pulse(engine);
342                         if (err)
343                                 set_heartbeat(engine, saved);
344                 }
345         }
346
347         mutex_unlock(&ce->timeline->mutex);
348
349 out_rpm:
350         intel_engine_pm_put(engine);
351         return err;
352 }
353
354 int intel_engine_pulse(struct intel_engine_cs *engine)
355 {
356         struct intel_context *ce = engine->kernel_context;
357         int err;
358
359         if (!intel_engine_has_preemption(engine))
360                 return -ENODEV;
361
362         if (!intel_engine_pm_get_if_awake(engine))
363                 return 0;
364
365         err = -EINTR;
366         if (!mutex_lock_interruptible(&ce->timeline->mutex)) {
367                 err = __intel_engine_pulse(engine);
368                 mutex_unlock(&ce->timeline->mutex);
369         }
370
371         intel_engine_flush_submission(engine);
372         intel_engine_pm_put(engine);
373         return err;
374 }
375
376 int intel_engine_flush_barriers(struct intel_engine_cs *engine)
377 {
378         struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
379         struct intel_context *ce = engine->kernel_context;
380         struct i915_request *rq;
381         int err;
382
383         if (llist_empty(&engine->barrier_tasks))
384                 return 0;
385
386         if (!intel_engine_pm_get_if_awake(engine))
387                 return 0;
388
389         if (mutex_lock_interruptible(&ce->timeline->mutex)) {
390                 err = -EINTR;
391                 goto out_rpm;
392         }
393
394         rq = heartbeat_create(ce, GFP_KERNEL);
395         if (IS_ERR(rq)) {
396                 err = PTR_ERR(rq);
397                 goto out_unlock;
398         }
399
400         heartbeat_commit(rq, &attr);
401
402         err = 0;
403 out_unlock:
404         mutex_unlock(&ce->timeline->mutex);
405 out_rpm:
406         intel_engine_pm_put(engine);
407         return err;
408 }
409
410 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
411 #include "selftest_engine_heartbeat.c"
412 #endif
This page took 0.06479 seconds and 4 git commands to generate.