/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2008-2018 Intel Corporation
 */

#ifndef _I915_GPU_ERROR_H_
#define _I915_GPU_ERROR_H_

#include <linux/kref.h>
#include <linux/ktime.h>
#include <linux/sched.h>

#include <drm/drm_mm.h>

#include "intel_device_info.h"
#include "intel_ringbuffer.h"
#include "intel_uc_fw.h"

#include "i915_gem.h"
#include "i915_gem_gtt.h"
#include "i915_params.h"
#include "i915_scheduler.h"

struct drm_i915_private;
struct intel_overlay_error_state;
struct intel_display_error_state;

struct i915_gpu_state {
	struct kref ref;
	ktime_t time;
	ktime_t boottime;
	ktime_t uptime;
	unsigned long capture;
	unsigned long epoch;

	struct drm_i915_private *i915;

	char error_msg[128];
	bool simulated;
	bool awake;
	bool wakelock;
	bool suspended;
	int iommu;
	u32 reset_count;
	u32 suspend_count;
	struct intel_device_info device_info;
	struct intel_driver_caps driver_caps;
	struct i915_params params;

	struct i915_error_uc {
		struct intel_uc_fw guc_fw;
		struct intel_uc_fw huc_fw;
		struct drm_i915_error_object *guc_log;
	} uc;

	/* Generic register state */
	u32 eir;
	u32 pgtbl_er;
	u32 ier;
	u32 gtier[6], ngtier;
	u32 ccid;
	u32 derrmr;
	u32 forcewake;
	u32 error; /* gen6+ */
	u32 err_int; /* gen7 */
	u32 fault_data0; /* gen8, gen9 */
	u32 fault_data1; /* gen8, gen9 */
	u32 done_reg;
	u32 gac_eco;
	u32 gam_ecochk;
	u32 gab_ctl;
	u32 gfx_mode;

	u32 nfence;
	u64 fence[I915_MAX_NUM_FENCES];
	struct intel_overlay_error_state *overlay;
	struct intel_display_error_state *display;

	struct drm_i915_error_engine {
		int engine_id;
		/* Software tracked state */
		bool idle;
		bool waiting;
		int num_waiters;
		unsigned long hangcheck_timestamp;
		bool hangcheck_stalled;
		enum intel_engine_hangcheck_action hangcheck_action;
		struct i915_address_space *vm;
		int num_requests;
		u32 reset_count;

		/* position of active request inside the ring */
		u32 rq_head, rq_post, rq_tail;

		/* our own tracking of ring head and tail */
		u32 cpu_ring_head;
		u32 cpu_ring_tail;

		u32 last_seqno;

		/* Register state */
		u32 start;
		u32 tail;
		u32 head;
		u32 ctl;
		u32 mode;
		u32 hws;
		u32 ipeir;
		u32 ipehr;
		u32 bbstate;
		u32 instpm;
		u32 instps;
		u32 seqno;
		u64 bbaddr;
		u64 acthd;
		u32 fault_reg;
		u64 faddr;
		u32 rc_psmi; /* sleep state */
		u32 semaphore_mboxes[I915_NUM_ENGINES - 1];
		struct intel_instdone instdone;

		struct drm_i915_error_context {
			char comm[TASK_COMM_LEN];
			pid_t pid;
			u32 handle;
			u32 hw_id;
			int ban_score;
			int active;
			int guilty;
			bool bannable;
			struct i915_sched_attr sched_attr;
		} context;

		struct drm_i915_error_object {
			u64 gtt_offset;
			u64 gtt_size;
			int num_pages;
			int page_count;
			int unused;
			u32 *pages[0];
		} *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;

		struct drm_i915_error_object **user_bo;
		long user_bo_count;

		struct drm_i915_error_object *wa_ctx;
		struct drm_i915_error_object *default_state;

		struct drm_i915_error_request {
			long jiffies;
			pid_t pid;
			u32 context;
			int ban_score;
			u32 seqno;
			u32 start;
			u32 head;
			u32 tail;
			struct i915_sched_attr sched_attr;
		} *requests, execlist[EXECLIST_MAX_PORTS];
		unsigned int num_ports;

		struct drm_i915_error_waiter {
			char comm[TASK_COMM_LEN];
			pid_t pid;
			u32 seqno;
		} *waiters;

		struct {
			u32 gfx_mode;
			union {
				u64 pdp[4];
				u32 pp_dir_base;
			};
		} vm_info;
	} engine[I915_NUM_ENGINES];

	struct drm_i915_error_buffer {
		u32 size;
		u32 name;
		u32 wseqno;
		u64 gtt_offset;
		u32 read_domains;
		u32 write_domain;
		s32 fence_reg:I915_MAX_NUM_FENCE_BITS;
		u32 tiling:2;
		u32 dirty:1;
		u32 purgeable:1;
		u32 userptr:1;
		s32 engine:4;
		u32 cache_level:3;
	} *active_bo[I915_NUM_ENGINES], *pinned_bo;
	u32 active_bo_count[I915_NUM_ENGINES], pinned_bo_count;
	struct i915_address_space *active_vm[I915_NUM_ENGINES];
};

struct i915_gpu_error {
	/* For hangcheck timer */
#define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
#define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)

	struct delayed_work hangcheck_work;

	/* For reset and error_state handling. */
	spinlock_t lock;
	/* Protected by the above dev->gpu_error.lock. */
	struct i915_gpu_state *first_error;

	atomic_t pending_fb_pin;

	unsigned long missed_irq_rings;

	/**
	 * State variable controlling the reset flow and count
	 *
	 * This is a counter which gets incremented when a reset is
	 * triggered.
	 *
	 * Before the reset commences, the I915_RESET_BACKOFF bit is set
	 * meaning that any waiters holding onto the struct_mutex should
	 * relinquish the lock immediately in order for the reset to start.
	 *
	 * If the reset is not completed successfully, the I915_WEDGED bit
	 * is set meaning that the hardware is terminally sour and there is
	 * no recovery. All waiters on the reset_queue will be woken when
	 * that happens.
	 *
	 * This counter is used by the wait_seqno code to notice that a
	 * reset event happened and that it needs to restart the entire
	 * ioctl (since most likely the seqno it waited for won't ever
	 * signal anytime soon).
	 *
	 * This is important for lock-free wait paths, where no contended
	 * lock naturally enforces the correct ordering between the
	 * bail-out of the waiter and the gpu reset work code.
	 */
	unsigned long reset_count;
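
	/*
	 * Usage, as a hedged sketch rather than verbatim driver code:
	 * lock-free wait paths sample reset_count before sleeping and
	 * restart their ioctl if the count changed while they slept.
	 * This assumes the i915_reset_count() accessor (a READ_ONCE()
	 * of this field) provided by i915_drv.h in this era.
	 *
	 *	unsigned int reset = i915_reset_count(&i915->gpu_error);
	 *
	 *	(sleep waiting for the seqno to signal)
	 *
	 *	if (i915_reset_count(&i915->gpu_error) != reset)
	 *		return -EAGAIN;
	 */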

	/**
	 * flags: Control various stages of the GPU reset
	 *
	 * #I915_RESET_BACKOFF - When we start a reset, we want to stop any
	 * other users acquiring the struct_mutex. To do this we set the
	 * #I915_RESET_BACKOFF bit in the error flags when we detect a reset
	 * and then check for that bit before acquiring the struct_mutex (in
	 * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
	 * secondary role in preventing two concurrent global reset attempts.
	 *
	 * #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the
	 * struct_mutex. We try to acquire the struct_mutex in the reset worker,
	 * but it may be held by some long running waiter (that we cannot
	 * interrupt without causing trouble). Once we are ready to do the GPU
	 * reset, we set the I915_RESET_HANDOFF bit and wake up any waiters. If
	 * they already hold the struct_mutex and want to participate they can
	 * inspect the bit and do the reset directly, otherwise the worker
	 * waits for the struct_mutex.
	 *
	 * #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
	 * acquire the struct_mutex to reset an engine, we need an explicit
	 * flag to prevent two concurrent reset attempts on the same engine.
	 * As the number of engines continues to grow, allocate the flags from
	 * the most significant bits.
	 *
	 * #I915_WEDGED - If reset fails and we can no longer use the GPU,
	 * we set the #I915_WEDGED bit. Prior to command submission, e.g.
	 * i915_request_alloc(), this bit is checked and the sequence
	 * aborted (with -EIO reported to userspace) if set.
	 */
	unsigned long flags;
#define I915_RESET_BACKOFF	0
#define I915_RESET_HANDOFF	1
#define I915_RESET_MODESET	2
#define I915_WEDGED		(BITS_PER_LONG - 1)
#define I915_RESET_ENGINE	(I915_WEDGED - I915_NUM_ENGINES)
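
	/*
	 * Usage sketch (assuming the generic test_bit()/test_and_set_bit()
	 * helpers from <linux/bitops.h>): the per-engine reset bits sit
	 * just below I915_WEDGED, one bit per engine at
	 * I915_RESET_ENGINE + engine->id.
	 *
	 *	if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
	 *			     &error->flags))
	 *		return -EBUSY;
	 *
	 *	if (test_bit(I915_WEDGED, &error->flags))
	 *		return -EIO;
	 */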

	/** Number of times an engine has been reset */
	u32 reset_engine_count[I915_NUM_ENGINES];
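
	/*
	 * Sketch of the matching accessor (in this era it lives in
	 * i915_drv.h as i915_reset_engine_count()):
	 *
	 *	static inline u32
	 *	i915_reset_engine_count(struct i915_gpu_error *error,
	 *				struct intel_engine_cs *engine)
	 *	{
	 *		return READ_ONCE(error->reset_engine_count[engine->id]);
	 *	}
	 */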

	/** Set of stalled engines with guilty requests, in the current reset */
	u32 stalled_mask;

	/** Reason for the current *global* reset */
	const char *reason;

	/**
	 * Waitqueue to signal when a hang is detected. Used for waiters
	 * to release the struct_mutex for the reset to proceed.
	 */
	wait_queue_head_t wait_queue;

	/**
	 * Waitqueue to signal when the reset has completed. Used by clients
	 * that wait for dev_priv->mm.wedged to settle.
	 */
	wait_queue_head_t reset_queue;
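
	/*
	 * Sketch of a client waiting out a reset on this queue (hedged;
	 * the real wait in i915_gem.c is interruptible and bounded by a
	 * timeout):
	 *
	 *	wait_event(error->reset_queue,
	 *		   !test_bit(I915_RESET_BACKOFF, &error->flags));
	 */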

	/* For missed irq/seqno simulation. */
	unsigned long test_irq_rings;
};

struct drm_i915_error_state_buf {
	struct drm_i915_private *i915;
	unsigned int bytes;
	unsigned int size;
	int err;
	u8 *buf;
	loff_t start;
	loff_t pos;
};

#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)

__printf(2, 3)
void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
int i915_error_state_to_str(struct drm_i915_error_state_buf *estr,
			    const struct i915_gpu_state *gpu);
int i915_error_state_buf_init(struct drm_i915_error_state_buf *eb,
			      struct drm_i915_private *i915,
			      size_t count, loff_t pos);

static inline void
i915_error_state_buf_release(struct drm_i915_error_state_buf *eb)
{
	kfree(eb->buf);
}
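
/*
 * Usage sketch for the buffer helpers above, following the debugfs/sysfs
 * pattern of pretty-printing a capture into a heap buffer (error handling
 * abbreviated; "gpu" stands for a captured i915_gpu_state):
 *
 *	struct drm_i915_error_state_buf str;
 *	int ret;
 *
 *	ret = i915_error_state_buf_init(&str, i915, count, pos);
 *	if (ret)
 *		return ret;
 *
 *	ret = i915_error_state_to_str(&str, gpu);
 *	if (!ret)
 *		(copy str.bytes bytes from str.buf to the reader)
 *
 *	i915_error_state_buf_release(&str);
 */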

struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915);
void i915_capture_error_state(struct drm_i915_private *dev_priv,
			      u32 engine_mask,
			      const char *error_msg);

static inline struct i915_gpu_state *
i915_gpu_state_get(struct i915_gpu_state *gpu)
{
	kref_get(&gpu->ref);
	return gpu;
}

void __i915_gpu_state_free(struct kref *kref);
static inline void i915_gpu_state_put(struct i915_gpu_state *gpu)
{
	if (gpu)
		kref_put(&gpu->ref, __i915_gpu_state_free);
}

struct i915_gpu_state *i915_first_error_state(struct drm_i915_private *i915);
void i915_reset_error_state(struct drm_i915_private *i915);
void i915_disable_error_state(struct drm_i915_private *i915, int err);
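
/*
 * Refcounting sketch: on this branch i915_first_error_state() hands back
 * a new reference to the first captured error state, NULL when nothing
 * has been captured, or an ERR_PTR once i915_disable_error_state() has
 * been called, so balance it with i915_gpu_state_put():
 *
 *	struct i915_gpu_state *gpu;
 *
 *	gpu = i915_first_error_state(i915);
 *	if (!IS_ERR_OR_NULL(gpu)) {
 *		(inspect gpu)
 *		i915_gpu_state_put(gpu);
 *	}
 */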

#else

static inline void i915_capture_error_state(struct drm_i915_private *dev_priv,
					    u32 engine_mask,
					    const char *error_msg)
{
}

static inline struct i915_gpu_state *
i915_first_error_state(struct drm_i915_private *i915)
{
	return ERR_PTR(-ENODEV);
}

static inline void i915_reset_error_state(struct drm_i915_private *i915)
{
}

static inline void i915_disable_error_state(struct drm_i915_private *i915,
					    int err)
{
}

#endif /* IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) */

#endif /* _I915_GPU_ERROR_H_ */