Git Repo - linux.git/blob - drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
Linux 6.14-rc3
[linux.git] / drivers / gpu / drm / i915 / gt / intel_engine_heartbeat.c
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2019 Intel Corporation
4  */
5
6 #include "i915_drv.h"
7 #include "i915_request.h"
8
9 #include "intel_context.h"
10 #include "intel_engine_heartbeat.h"
11 #include "intel_engine_pm.h"
12 #include "intel_engine.h"
13 #include "intel_gt.h"
14 #include "intel_reset.h"
15
16 /*
17  * While the engine is active, we send a periodic pulse along the engine
18  * to check on its health and to flush any idle-barriers. If that request
19  * is stuck, and we fail to preempt it, we declare the engine hung and
20  * issue a reset -- in the hope that restores progress.
21  */
22
23 static bool next_heartbeat(struct intel_engine_cs *engine)
24 {
25         struct i915_request *rq;
26         long delay;
27
28         delay = READ_ONCE(engine->props.heartbeat_interval_ms);
29
30         rq = engine->heartbeat.systole;
31
32         /*
33          * FIXME: The final period extension is disabled if the period has been
34          * modified from the default. This is to prevent issues with certain
35          * selftests which override the value and expect specific behaviour.
36          * Once the selftests have been updated to either cope with variable
37          * heartbeat periods (or to override the pre-emption timeout as well,
38          * or just to add a selftest specific override of the extension), the
39          * generic override can be removed.
40          */
41         if (rq && rq->sched.attr.priority >= I915_PRIORITY_BARRIER &&
42             delay == engine->defaults.heartbeat_interval_ms) {
43                 long longer;
44
45                 /*
46                  * The final try is at the highest priority possible. Up until now
47                  * a pre-emption might not even have been attempted. So make sure
48                  * this last attempt allows enough time for a pre-emption to occur.
49                  */
50                 longer = READ_ONCE(engine->props.preempt_timeout_ms) * 2;
51                 longer = intel_clamp_heartbeat_interval_ms(engine, longer);
52                 if (longer > delay)
53                         delay = longer;
54         }
55
56         if (!delay)
57                 return false;
58
59         delay = msecs_to_jiffies_timeout(delay);
60         if (delay >= HZ)
61                 delay = round_jiffies_up_relative(delay);
62         mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay + 1);
63
64         return true;
65 }
66
67 static struct i915_request *
68 heartbeat_create(struct intel_context *ce, gfp_t gfp)
69 {
70         struct i915_request *rq;
71
72         intel_context_enter(ce);
73         rq = __i915_request_create(ce, gfp);
74         intel_context_exit(ce);
75
76         return rq;
77 }
78
79 static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
80 {
81         engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
82         i915_request_add_active_barriers(rq);
83         if (!engine->heartbeat.systole && intel_engine_has_heartbeat(engine))
84                 engine->heartbeat.systole = i915_request_get(rq);
85 }
86
87 static void heartbeat_commit(struct i915_request *rq,
88                              const struct i915_sched_attr *attr)
89 {
90         idle_pulse(rq->engine, rq);
91
92         __i915_request_commit(rq);
93         __i915_request_queue(rq, attr);
94 }
95
96 static void show_heartbeat(const struct i915_request *rq,
97                            struct intel_engine_cs *engine)
98 {
99         struct drm_printer p =
100                 drm_dbg_printer(&engine->i915->drm, DRM_UT_DRIVER, "heartbeat");
101
102         if (!rq) {
103                 intel_engine_dump(engine, &p,
104                                   "%s heartbeat not ticking\n",
105                                   engine->name);
106         } else {
107                 intel_engine_dump(engine, &p,
108                                   "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n",
109                                   engine->name,
110                                   rq->fence.context,
111                                   rq->fence.seqno,
112                                   rq->sched.attr.priority);
113         }
114 }
115
116 static void
117 reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
118 {
119         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
120                 show_heartbeat(rq, engine);
121
122         if (intel_engine_uses_guc(engine))
123                 /*
124                  * GuC itself is toast or GuC's hang detection
125                  * is disabled. Either way, need to find the
126                  * hang culprit manually.
127                  */
128                 intel_guc_find_hung_context(engine);
129
130         intel_gt_handle_error(engine->gt, engine->mask,
131                               I915_ERROR_CAPTURE,
132                               "stopped heartbeat on %s",
133                               engine->name);
134 }
135
136 static void heartbeat(struct work_struct *wrk)
137 {
138         struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
139         struct intel_engine_cs *engine =
140                 container_of(wrk, typeof(*engine), heartbeat.work.work);
141         struct intel_context *ce = engine->kernel_context;
142         struct i915_request *rq;
143         unsigned long serial;
144
145         /* Just in case everything has gone horribly wrong, give it a kick */
146         intel_engine_flush_submission(engine);
147
148         rq = engine->heartbeat.systole;
149         if (rq && i915_request_completed(rq)) {
150                 i915_request_put(rq);
151                 engine->heartbeat.systole = NULL;
152         }
153
154         if (!intel_engine_pm_get_if_awake(engine))
155                 return;
156
157         if (intel_gt_is_wedged(engine->gt))
158                 goto out;
159
160         if (i915_sched_engine_disabled(engine->sched_engine)) {
161                 reset_engine(engine, engine->heartbeat.systole);
162                 goto out;
163         }
164
165         if (engine->heartbeat.systole) {
166                 long delay = READ_ONCE(engine->props.heartbeat_interval_ms);
167
168                 /* Safeguard against too-fast worker invocations */
169                 if (!time_after(jiffies,
170                                 rq->emitted_jiffies + msecs_to_jiffies(delay)))
171                         goto out;
172
173                 if (!i915_sw_fence_signaled(&rq->submit)) {
174                         /*
175                          * Not yet submitted, system is stalled.
176                          *
177                          * This more often happens for ring submission,
178                          * where all contexts are funnelled into a common
179                          * ringbuffer. If one context is blocked on an
180                          * external fence, not only is it not submitted,
181                          * but all other contexts, including the kernel
182                          * context are stuck waiting for the signal.
183                          */
184                 } else if (engine->sched_engine->schedule &&
185                            rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
186                         /*
187                          * Gradually raise the priority of the heartbeat to
188                          * give high priority work [which presumably desires
189                          * low latency and no jitter] the chance to naturally
190                          * complete before being preempted.
191                          */
192                         attr.priority = I915_PRIORITY_NORMAL;
193                         if (rq->sched.attr.priority >= attr.priority)
194                                 attr.priority = I915_PRIORITY_HEARTBEAT;
195                         if (rq->sched.attr.priority >= attr.priority)
196                                 attr.priority = I915_PRIORITY_BARRIER;
197
198                         local_bh_disable();
199                         engine->sched_engine->schedule(rq, &attr);
200                         local_bh_enable();
201                 } else {
202                         reset_engine(engine, rq);
203                 }
204
205                 rq->emitted_jiffies = jiffies;
206                 goto out;
207         }
208
209         serial = READ_ONCE(engine->serial);
210         if (engine->wakeref_serial == serial)
211                 goto out;
212
213         if (!mutex_trylock(&ce->timeline->mutex)) {
214                 /* Unable to lock the kernel timeline, is the engine stuck? */
215                 if (xchg(&engine->heartbeat.blocked, serial) == serial)
216                         intel_gt_handle_error(engine->gt, engine->mask,
217                                               I915_ERROR_CAPTURE,
218                                               "no heartbeat on %s",
219                                               engine->name);
220                 goto out;
221         }
222
223         rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
224         if (IS_ERR(rq))
225                 goto unlock;
226
227         heartbeat_commit(rq, &attr);
228
229 unlock:
230         mutex_unlock(&ce->timeline->mutex);
231 out:
232         if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine))
233                 i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
234         intel_engine_pm_put(engine);
235 }
236
237 void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine)
238 {
239         if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL)
240                 return;
241
242         next_heartbeat(engine);
243 }
244
245 void intel_engine_park_heartbeat(struct intel_engine_cs *engine)
246 {
247         if (cancel_delayed_work(&engine->heartbeat.work))
248                 i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
249 }
250
251 void intel_gt_unpark_heartbeats(struct intel_gt *gt)
252 {
253         struct intel_engine_cs *engine;
254         enum intel_engine_id id;
255
256         for_each_engine(engine, gt, id)
257                 if (intel_engine_pm_is_awake(engine))
258                         intel_engine_unpark_heartbeat(engine);
259 }
260
261 void intel_gt_park_heartbeats(struct intel_gt *gt)
262 {
263         struct intel_engine_cs *engine;
264         enum intel_engine_id id;
265
266         for_each_engine(engine, gt, id)
267                 intel_engine_park_heartbeat(engine);
268 }
269
270 void intel_engine_init_heartbeat(struct intel_engine_cs *engine)
271 {
272         INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat);
273 }
274
275 static int __intel_engine_pulse(struct intel_engine_cs *engine)
276 {
277         struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
278         struct intel_context *ce = engine->kernel_context;
279         struct i915_request *rq;
280
281         lockdep_assert_held(&ce->timeline->mutex);
282         GEM_BUG_ON(!intel_engine_has_preemption(engine));
283         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
284
285         rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
286         if (IS_ERR(rq))
287                 return PTR_ERR(rq);
288
289         __set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags);
290
291         heartbeat_commit(rq, &attr);
292         GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);
293
294         /* Ensure the forced pulse gets a full period to execute */
295         next_heartbeat(engine);
296
297         return 0;
298 }
299
300 static unsigned long set_heartbeat(struct intel_engine_cs *engine,
301                                    unsigned long delay)
302 {
303         unsigned long old;
304
305         old = xchg(&engine->props.heartbeat_interval_ms, delay);
306         if (delay)
307                 intel_engine_unpark_heartbeat(engine);
308         else
309                 intel_engine_park_heartbeat(engine);
310
311         return old;
312 }
313
314 int intel_engine_set_heartbeat(struct intel_engine_cs *engine,
315                                unsigned long delay)
316 {
317         struct intel_context *ce = engine->kernel_context;
318         int err = 0;
319
320         if (!delay && !intel_engine_has_preempt_reset(engine))
321                 return -ENODEV;
322
323         /* FIXME: Remove together with equally marked hack in next_heartbeat. */
324         if (delay != engine->defaults.heartbeat_interval_ms &&
325             delay < 2 * engine->props.preempt_timeout_ms) {
326                 if (intel_engine_uses_guc(engine))
327                         drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may downgrade individual engine resets to full GPU resets!\n",
328                                    engine->name);
329                 else
330                         drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may cause engine resets to target innocent contexts!\n",
331                                    engine->name);
332         }
333
334         intel_engine_pm_get(engine);
335
336         err = mutex_lock_interruptible(&ce->timeline->mutex);
337         if (err)
338                 goto out_rpm;
339
340         if (delay != engine->props.heartbeat_interval_ms) {
341                 unsigned long saved = set_heartbeat(engine, delay);
342
343                 /* recheck current execution */
344                 if (intel_engine_has_preemption(engine)) {
345                         err = __intel_engine_pulse(engine);
346                         if (err)
347                                 set_heartbeat(engine, saved);
348                 }
349         }
350
351         mutex_unlock(&ce->timeline->mutex);
352
353 out_rpm:
354         intel_engine_pm_put(engine);
355         return err;
356 }
357
358 int intel_engine_pulse(struct intel_engine_cs *engine)
359 {
360         struct intel_context *ce = engine->kernel_context;
361         int err;
362
363         if (!intel_engine_has_preemption(engine))
364                 return -ENODEV;
365
366         if (!intel_engine_pm_get_if_awake(engine))
367                 return 0;
368
369         err = -EINTR;
370         if (!mutex_lock_interruptible(&ce->timeline->mutex)) {
371                 err = __intel_engine_pulse(engine);
372                 mutex_unlock(&ce->timeline->mutex);
373         }
374
375         intel_engine_flush_submission(engine);
376         intel_engine_pm_put(engine);
377         return err;
378 }
379
380 int intel_engine_flush_barriers(struct intel_engine_cs *engine)
381 {
382         struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
383         struct intel_context *ce = engine->kernel_context;
384         struct i915_request *rq;
385         int err;
386
387         if (llist_empty(&engine->barrier_tasks))
388                 return 0;
389
390         if (!intel_engine_pm_get_if_awake(engine))
391                 return 0;
392
393         if (mutex_lock_interruptible(&ce->timeline->mutex)) {
394                 err = -EINTR;
395                 goto out_rpm;
396         }
397
398         rq = heartbeat_create(ce, GFP_KERNEL);
399         if (IS_ERR(rq)) {
400                 err = PTR_ERR(rq);
401                 goto out_unlock;
402         }
403
404         heartbeat_commit(rq, &attr);
405
406         err = 0;
407 out_unlock:
408         mutex_unlock(&ce->timeline->mutex);
409 out_rpm:
410         intel_engine_pm_put(engine);
411         return err;
412 }
413
414 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
415 #include "selftest_engine_heartbeat.c"
416 #endif
This page took 0.053928 seconds and 4 git commands to generate.