2 * Copyright © 2016 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 #include <linux/prime_numbers.h>
26 #include <linux/pm_qos.h>
27 #include <linux/sort.h>
29 #include "gem/i915_gem_internal.h"
30 #include "gem/i915_gem_pm.h"
31 #include "gem/selftests/mock_context.h"
33 #include "gt/intel_engine_heartbeat.h"
34 #include "gt/intel_engine_pm.h"
35 #include "gt/intel_engine_user.h"
36 #include "gt/intel_gt.h"
37 #include "gt/intel_gt_clock_utils.h"
38 #include "gt/intel_gt_requests.h"
39 #include "gt/selftest_engine_heartbeat.h"
41 #include "i915_random.h"
42 #include "i915_selftest.h"
43 #include "igt_flush_test.h"
44 #include "igt_live_test.h"
45 #include "igt_spinner.h"
46 #include "lib_sw_fence.h"
49 #include "mock_gem_device.h"
51 static unsigned int num_uabi_engines(struct drm_i915_private *i915)
53 struct intel_engine_cs *engine;
57 for_each_uabi_engine(engine, i915)
63 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
65 return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
68 static int igt_add_request(void *arg)
70 struct drm_i915_private *i915 = arg;
71 struct i915_request *request;
73 /* Basic preliminary test to create a request and let it loose! */
75 request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
79 i915_request_add(request);
84 static int igt_wait_request(void *arg)
86 const long T = HZ / 4;
87 struct drm_i915_private *i915 = arg;
88 struct i915_request *request;
91 /* Submit a request, then wait upon it */
93 request = mock_request(rcs0(i915)->kernel_context, T);
97 i915_request_get(request);
99 if (i915_request_wait(request, 0, 0) != -ETIME) {
100 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
104 if (i915_request_wait(request, 0, T) != -ETIME) {
105 pr_err("request wait succeeded (expected timeout before submit!)\n");
109 if (i915_request_completed(request)) {
110 pr_err("request completed before submit!!\n");
114 i915_request_add(request);
116 if (i915_request_wait(request, 0, 0) != -ETIME) {
117 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
121 if (i915_request_completed(request)) {
122 pr_err("request completed immediately!\n");
126 if (i915_request_wait(request, 0, T / 2) != -ETIME) {
127 pr_err("request wait succeeded (expected timeout!)\n");
131 if (i915_request_wait(request, 0, T) == -ETIME) {
132 pr_err("request wait timed out!\n");
136 if (!i915_request_completed(request)) {
137 pr_err("request not complete after waiting!\n");
141 if (i915_request_wait(request, 0, T) == -ETIME) {
142 pr_err("request wait timed out when already complete!\n");
148 i915_request_put(request);
149 mock_device_flush(i915);
153 static int igt_fence_wait(void *arg)
155 const long T = HZ / 4;
156 struct drm_i915_private *i915 = arg;
157 struct i915_request *request;
160 /* Submit a request, treat it as a fence and wait upon it */
162 request = mock_request(rcs0(i915)->kernel_context, T);
166 if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
167 pr_err("fence wait success before submit (expected timeout)!\n");
171 i915_request_add(request);
173 if (dma_fence_is_signaled(&request->fence)) {
174 pr_err("fence signaled immediately!\n");
178 if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
179 pr_err("fence wait success after submit (expected timeout)!\n");
183 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
184 pr_err("fence wait timed out (expected success)!\n");
188 if (!dma_fence_is_signaled(&request->fence)) {
189 pr_err("fence unsignaled after waiting!\n");
193 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
194 pr_err("fence wait timed out when complete (expected success)!\n");
200 mock_device_flush(i915);
204 static int igt_request_rewind(void *arg)
206 struct drm_i915_private *i915 = arg;
207 struct i915_request *request, *vip;
208 struct i915_gem_context *ctx[2];
209 struct intel_context *ce;
212 ctx[0] = mock_context(i915, "A");
218 ce = i915_gem_context_get_engine(ctx[0], RCS0);
219 GEM_BUG_ON(IS_ERR(ce));
220 request = mock_request(ce, 2 * HZ);
221 intel_context_put(ce);
227 i915_request_get(request);
228 i915_request_add(request);
230 ctx[1] = mock_context(i915, "B");
236 ce = i915_gem_context_get_engine(ctx[1], RCS0);
237 GEM_BUG_ON(IS_ERR(ce));
238 vip = mock_request(ce, 0);
239 intel_context_put(ce);
245 /* Simulate preemption by manual reordering */
246 if (!mock_cancel_request(request)) {
247 pr_err("failed to cancel request (already executed)!\n");
248 i915_request_add(vip);
251 i915_request_get(vip);
252 i915_request_add(vip);
254 request->engine->submit_request(request);
258 if (i915_request_wait(vip, 0, HZ) == -ETIME) {
259 pr_err("timed out waiting for high priority request\n");
263 if (i915_request_completed(request)) {
264 pr_err("low priority request already completed\n");
270 i915_request_put(vip);
272 mock_context_close(ctx[1]);
274 i915_request_put(request);
276 mock_context_close(ctx[0]);
278 mock_device_flush(i915);
283 struct intel_engine_cs *engine;
284 struct i915_gem_context **contexts;
285 atomic_long_t num_waits, num_fences;
286 int ncontexts, max_batch;
287 struct i915_request *(*request_alloc)(struct intel_context *ce);
290 static struct i915_request *
291 __mock_request_alloc(struct intel_context *ce)
293 return mock_request(ce, 0);
296 static struct i915_request *
297 __live_request_alloc(struct intel_context *ce)
299 return intel_context_create_request(ce);
302 struct smoke_thread {
303 struct kthread_worker *worker;
304 struct kthread_work work;
310 static void __igt_breadcrumbs_smoketest(struct kthread_work *work)
312 struct smoke_thread *thread = container_of(work, typeof(*thread), work);
313 struct smoketest *t = thread->t;
314 const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
315 const unsigned int total = 4 * t->ncontexts + 1;
316 unsigned int num_waits = 0, num_fences = 0;
317 struct i915_request **requests;
318 I915_RND_STATE(prng);
323 * A very simple test to catch the most egregious of list handling bugs.
325 * At its heart, we simply create oodles of requests running across
326 * multiple kthreads and enable signaling on them, for the sole purpose
327 * of stressing our breadcrumb handling. The only inspection we do is
328 * that the fences were marked as signaled.
331 requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
333 thread->result = -ENOMEM;
337 order = i915_random_order(total, &prng);
343 while (!READ_ONCE(thread->stop)) {
344 struct i915_sw_fence *submit, *wait;
345 unsigned int n, count;
347 submit = heap_fence_create(GFP_KERNEL);
353 wait = heap_fence_create(GFP_KERNEL);
355 i915_sw_fence_commit(submit);
356 heap_fence_put(submit);
361 i915_random_reorder(order, total, &prng);
362 count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
364 for (n = 0; n < count; n++) {
365 struct i915_gem_context *ctx =
366 t->contexts[order[n] % t->ncontexts];
367 struct i915_request *rq;
368 struct intel_context *ce;
370 ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
371 GEM_BUG_ON(IS_ERR(ce));
372 rq = t->request_alloc(ce);
373 intel_context_put(ce);
380 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
384 requests[n] = i915_request_get(rq);
385 i915_request_add(rq);
388 err = i915_sw_fence_await_dma_fence(wait,
394 i915_request_put(rq);
400 i915_sw_fence_commit(submit);
401 i915_sw_fence_commit(wait);
403 if (!wait_event_timeout(wait->wait,
404 i915_sw_fence_done(wait),
406 struct i915_request *rq = requests[count - 1];
408 pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
409 atomic_read(&wait->pending), count,
410 rq->fence.context, rq->fence.seqno,
414 intel_gt_set_wedged(t->engine->gt);
415 GEM_BUG_ON(!i915_request_completed(rq));
416 i915_sw_fence_wait(wait);
420 for (n = 0; n < count; n++) {
421 struct i915_request *rq = requests[n];
423 if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
425 pr_err("%llu:%llu was not signaled!\n",
426 rq->fence.context, rq->fence.seqno);
430 i915_request_put(rq);
433 heap_fence_put(wait);
434 heap_fence_put(submit);
445 atomic_long_add(num_fences, &t->num_fences);
446 atomic_long_add(num_waits, &t->num_waits);
451 thread->result = err;
454 static int mock_breadcrumbs_smoketest(void *arg)
456 struct drm_i915_private *i915 = arg;
457 struct smoketest t = {
458 .engine = rcs0(i915),
461 .request_alloc = __mock_request_alloc
463 unsigned int ncpus = num_online_cpus();
464 struct smoke_thread *threads;
469 * Smoketest our breadcrumb/signal handling for requests across multiple
470 * threads. A very simple test to only catch the most egregious of bugs.
471 * See __igt_breadcrumbs_smoketest();
474 threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
478 t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
484 for (n = 0; n < t.ncontexts; n++) {
485 t.contexts[n] = mock_context(t.engine->i915, "mock");
486 if (!t.contexts[n]) {
492 for (n = 0; n < ncpus; n++) {
493 struct kthread_worker *worker;
495 worker = kthread_create_worker(0, "igt/%d", n);
496 if (IS_ERR(worker)) {
497 ret = PTR_ERR(worker);
502 threads[n].worker = worker;
504 threads[n].stop = false;
505 threads[n].result = 0;
507 kthread_init_work(&threads[n].work,
508 __igt_breadcrumbs_smoketest);
509 kthread_queue_work(worker, &threads[n].work);
512 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
514 for (n = 0; n < ncpus; n++) {
517 WRITE_ONCE(threads[n].stop, true);
518 kthread_flush_work(&threads[n].work);
519 err = READ_ONCE(threads[n].result);
523 kthread_destroy_worker(threads[n].worker);
pr_info("Completed %lu waits for %lu fences across %d cpus\n",
526 atomic_long_read(&t.num_waits),
527 atomic_long_read(&t.num_fences),
531 for (n = 0; n < t.ncontexts; n++) {
534 mock_context_close(t.contexts[n]);
542 int i915_request_mock_selftests(void)
544 static const struct i915_subtest tests[] = {
545 SUBTEST(igt_add_request),
546 SUBTEST(igt_wait_request),
547 SUBTEST(igt_fence_wait),
548 SUBTEST(igt_request_rewind),
549 SUBTEST(mock_breadcrumbs_smoketest),
551 struct drm_i915_private *i915;
552 intel_wakeref_t wakeref;
555 i915 = mock_gem_device();
559 with_intel_runtime_pm(&i915->runtime_pm, wakeref)
560 err = i915_subtests(tests, i915);
562 mock_destroy_device(i915);
567 static int live_nop_request(void *arg)
569 struct drm_i915_private *i915 = arg;
570 struct intel_engine_cs *engine;
571 struct igt_live_test t;
* Submit variously sized batches of empty requests to each engine
* (individually), and wait for each batch to complete. We can check
* the overhead of submitting requests to the hardware.
580 for_each_uabi_engine(engine, i915) {
581 unsigned long n, prime;
582 IGT_TIMEOUT(end_time);
583 ktime_t times[2] = {};
585 err = igt_live_test_begin(&t, i915, __func__, engine->name);
589 intel_engine_pm_get(engine);
590 for_each_prime_number_from(prime, 1, 8192) {
591 struct i915_request *request = NULL;
593 times[1] = ktime_get_raw();
595 for (n = 0; n < prime; n++) {
596 i915_request_put(request);
597 request = i915_request_create(engine->kernel_context);
599 return PTR_ERR(request);
602 * This space is left intentionally blank.
604 * We do not actually want to perform any
605 * action with this request, we just want
606 * to measure the latency in allocation
607 * and submission of our breadcrumbs -
608 * ensuring that the bare request is sufficient
609 * for the system to work (i.e. proper HEAD
610 * tracking of the rings, interrupt handling,
* etc). It also gives us the lowest bound against which to compare.
615 i915_request_get(request);
616 i915_request_add(request);
618 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
619 i915_request_put(request);
621 times[1] = ktime_sub(ktime_get_raw(), times[1]);
625 if (__igt_timeout(end_time, NULL))
628 intel_engine_pm_put(engine);
630 err = igt_live_test_end(&t);
634 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
636 ktime_to_ns(times[0]),
637 prime, div64_u64(ktime_to_ns(times[1]), prime));
643 static int __cancel_inactive(struct intel_engine_cs *engine)
645 struct intel_context *ce;
646 struct igt_spinner spin;
647 struct i915_request *rq;
650 if (igt_spinner_init(&spin, engine->gt))
653 ce = intel_context_create(engine);
659 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
665 pr_debug("%s: Cancelling inactive request\n", engine->name);
666 i915_request_cancel(rq, -EINTR);
667 i915_request_get(rq);
668 i915_request_add(rq);
670 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
671 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
673 pr_err("%s: Failed to cancel inactive request\n", engine->name);
674 intel_engine_dump(engine, &p, "%s\n", engine->name);
679 if (rq->fence.error != -EINTR) {
680 pr_err("%s: fence not cancelled (%u)\n",
681 engine->name, rq->fence.error);
686 i915_request_put(rq);
688 intel_context_put(ce);
690 igt_spinner_fini(&spin);
692 pr_err("%s: %s error %d\n", __func__, engine->name, err);
696 static int __cancel_active(struct intel_engine_cs *engine)
698 struct intel_context *ce;
699 struct igt_spinner spin;
700 struct i915_request *rq;
703 if (igt_spinner_init(&spin, engine->gt))
706 ce = intel_context_create(engine);
712 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
718 pr_debug("%s: Cancelling active request\n", engine->name);
719 i915_request_get(rq);
720 i915_request_add(rq);
721 if (!igt_wait_for_spinner(&spin, rq)) {
722 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
724 pr_err("Failed to start spinner on %s\n", engine->name);
725 intel_engine_dump(engine, &p, "%s\n", engine->name);
729 i915_request_cancel(rq, -EINTR);
731 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
732 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
734 pr_err("%s: Failed to cancel active request\n", engine->name);
735 intel_engine_dump(engine, &p, "%s\n", engine->name);
740 if (rq->fence.error != -EINTR) {
741 pr_err("%s: fence not cancelled (%u)\n",
742 engine->name, rq->fence.error);
747 i915_request_put(rq);
749 intel_context_put(ce);
751 igt_spinner_fini(&spin);
753 pr_err("%s: %s error %d\n", __func__, engine->name, err);
757 static int __cancel_completed(struct intel_engine_cs *engine)
759 struct intel_context *ce;
760 struct igt_spinner spin;
761 struct i915_request *rq;
764 if (igt_spinner_init(&spin, engine->gt))
767 ce = intel_context_create(engine);
773 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
778 igt_spinner_end(&spin);
779 i915_request_get(rq);
780 i915_request_add(rq);
782 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
787 pr_debug("%s: Cancelling completed request\n", engine->name);
788 i915_request_cancel(rq, -EINTR);
789 if (rq->fence.error) {
790 pr_err("%s: fence not cancelled (%u)\n",
791 engine->name, rq->fence.error);
796 i915_request_put(rq);
798 intel_context_put(ce);
800 igt_spinner_fini(&spin);
802 pr_err("%s: %s error %d\n", __func__, engine->name, err);
* Test to prove a non-preemptible request can be cancelled and a subsequent
* request on the same context can successfully complete after cancellation.
*
* The testing methodology is to create a non-preemptible request and submit
* it, wait for the spinner to start, create a NOP request and submit it,
* cancel the spinner, wait for the spinner to complete and verify it failed
* with an error, and finally wait for the NOP request to complete and verify
* it succeeded without an error. The preemption timeout is also reduced and
* restored so that the test runs in a timely manner.
817 static int __cancel_reset(struct drm_i915_private *i915,
818 struct intel_engine_cs *engine)
820 struct intel_context *ce;
821 struct igt_spinner spin;
822 struct i915_request *rq, *nop;
823 unsigned long preempt_timeout_ms;
826 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT ||
827 !intel_has_reset_engine(engine->gt))
830 preempt_timeout_ms = engine->props.preempt_timeout_ms;
831 engine->props.preempt_timeout_ms = 100;
833 if (igt_spinner_init(&spin, engine->gt))
836 ce = intel_context_create(engine);
842 rq = igt_spinner_create_request(&spin, ce, MI_NOOP);
848 pr_debug("%s: Cancelling active non-preemptable request\n",
850 i915_request_get(rq);
851 i915_request_add(rq);
852 if (!igt_wait_for_spinner(&spin, rq)) {
853 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
855 pr_err("Failed to start spinner on %s\n", engine->name);
856 intel_engine_dump(engine, &p, "%s\n", engine->name);
861 nop = intel_context_create_request(ce);
864 i915_request_get(nop);
865 i915_request_add(nop);
867 i915_request_cancel(rq, -EINTR);
869 if (i915_request_wait(rq, 0, HZ) < 0) {
870 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
872 pr_err("%s: Failed to cancel hung request\n", engine->name);
873 intel_engine_dump(engine, &p, "%s\n", engine->name);
878 if (rq->fence.error != -EINTR) {
879 pr_err("%s: fence not cancelled (%u)\n",
880 engine->name, rq->fence.error);
885 if (i915_request_wait(nop, 0, HZ) < 0) {
886 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
888 pr_err("%s: Failed to complete nop request\n", engine->name);
889 intel_engine_dump(engine, &p, "%s\n", engine->name);
894 if (nop->fence.error != 0) {
895 pr_err("%s: Nop request errored (%u)\n",
896 engine->name, nop->fence.error);
901 i915_request_put(nop);
903 i915_request_put(rq);
905 intel_context_put(ce);
907 igt_spinner_fini(&spin);
909 engine->props.preempt_timeout_ms = preempt_timeout_ms;
911 pr_err("%s: %s error %d\n", __func__, engine->name, err);
915 static int live_cancel_request(void *arg)
917 struct drm_i915_private *i915 = arg;
918 struct intel_engine_cs *engine;
921 * Check cancellation of requests. We expect to be able to immediately
922 * cancel active requests, even if they are currently on the GPU.
925 for_each_uabi_engine(engine, i915) {
926 struct igt_live_test t;
929 if (!intel_engine_has_preemption(engine))
932 err = igt_live_test_begin(&t, i915, __func__, engine->name);
936 err = __cancel_inactive(engine);
938 err = __cancel_active(engine);
940 err = __cancel_completed(engine);
942 err2 = igt_live_test_end(&t);
948 /* Expects reset so call outside of igt_live_test_* */
949 err = __cancel_reset(i915, engine);
953 if (igt_flush_test(i915))
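/*
 * empty_batch() creates a one-page batch buffer containing only
 * MI_BATCH_BUFFER_END and pins it into the global GTT, so requests built
 * around it execute (almost) no commands. The explicit i915_vma_sync() is
 * done up front so that binding work is not charged to the latencies
 * measured by live_empty_request().
 */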
960 static struct i915_vma *empty_batch(struct drm_i915_private *i915)
962 struct drm_i915_gem_object *obj;
963 struct i915_vma *vma;
967 obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
969 return ERR_CAST(obj);
971 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WB);
977 *cmd = MI_BATCH_BUFFER_END;
979 __i915_gem_object_flush_map(obj, 0, 64);
980 i915_gem_object_unpin_map(obj);
982 intel_gt_chipset_flush(to_gt(i915));
984 vma = i915_vma_instance(obj, &to_gt(i915)->ggtt->vm, NULL);
990 err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
994 /* Force the wait now to avoid including it in the benchmark */
995 err = i915_vma_sync(vma);
1002 i915_vma_unpin(vma);
1004 i915_gem_object_put(obj);
1005 return ERR_PTR(err);
1008 static struct i915_request *
1009 empty_request(struct intel_engine_cs *engine,
1010 struct i915_vma *batch)
1012 struct i915_request *request;
1015 request = i915_request_create(engine->kernel_context);
1016 if (IS_ERR(request))
1019 err = engine->emit_bb_start(request,
1022 I915_DISPATCH_SECURE);
1026 i915_request_get(request);
1028 i915_request_add(request);
1029 return err ? ERR_PTR(err) : request;
1032 static int live_empty_request(void *arg)
1034 struct drm_i915_private *i915 = arg;
1035 struct intel_engine_cs *engine;
1036 struct igt_live_test t;
1037 struct i915_vma *batch;
* Submit variously sized batches of empty requests to each engine
* (individually), and wait for each batch to complete. We can check
* the overhead of submitting requests to the hardware.
1046 batch = empty_batch(i915);
1048 return PTR_ERR(batch);
1050 for_each_uabi_engine(engine, i915) {
1051 IGT_TIMEOUT(end_time);
1052 struct i915_request *request;
1053 unsigned long n, prime;
1054 ktime_t times[2] = {};
1056 err = igt_live_test_begin(&t, i915, __func__, engine->name);
1060 intel_engine_pm_get(engine);
1062 /* Warmup / preload */
1063 request = empty_request(engine, batch);
1064 if (IS_ERR(request)) {
1065 err = PTR_ERR(request);
1066 intel_engine_pm_put(engine);
1069 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
1071 for_each_prime_number_from(prime, 1, 8192) {
1072 times[1] = ktime_get_raw();
1074 for (n = 0; n < prime; n++) {
1075 i915_request_put(request);
1076 request = empty_request(engine, batch);
1077 if (IS_ERR(request)) {
1078 err = PTR_ERR(request);
1079 intel_engine_pm_put(engine);
1083 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
1085 times[1] = ktime_sub(ktime_get_raw(), times[1]);
1087 times[0] = times[1];
1089 if (__igt_timeout(end_time, NULL))
1092 i915_request_put(request);
1093 intel_engine_pm_put(engine);
1095 err = igt_live_test_end(&t);
1099 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
1101 ktime_to_ns(times[0]),
1102 prime, div64_u64(ktime_to_ns(times[1]), prime));
1106 i915_vma_unpin(batch);
1107 i915_vma_put(batch);
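/*
 * recursive_batch() builds a batch whose first instruction is an
 * MI_BATCH_BUFFER_START (encoded according to the GRAPHICS_VER of the
 * device) pointing back at the batch itself, so any request executing it
 * spins on the GPU indefinitely. recursive_batch_resolve() later overwrites
 * that first dword with MI_BATCH_BUFFER_END to let the request(s) complete.
 */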
1111 static struct i915_vma *recursive_batch(struct drm_i915_private *i915)
1113 struct drm_i915_gem_object *obj;
1114 const int ver = GRAPHICS_VER(i915);
1115 struct i915_vma *vma;
1119 obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
1121 return ERR_CAST(obj);
1123 vma = i915_vma_instance(obj, to_gt(i915)->vm, NULL);
1129 err = i915_vma_pin(vma, 0, 0, PIN_USER);
1133 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
1140 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
1141 *cmd++ = lower_32_bits(vma->node.start);
1142 *cmd++ = upper_32_bits(vma->node.start);
1143 } else if (ver >= 6) {
1144 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
1145 *cmd++ = lower_32_bits(vma->node.start);
1147 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1148 *cmd++ = lower_32_bits(vma->node.start);
1150 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
1152 __i915_gem_object_flush_map(obj, 0, 64);
1153 i915_gem_object_unpin_map(obj);
1155 intel_gt_chipset_flush(to_gt(i915));
1160 i915_gem_object_put(obj);
1161 return ERR_PTR(err);
1164 static int recursive_batch_resolve(struct i915_vma *batch)
1168 cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC);
1170 return PTR_ERR(cmd);
1172 *cmd = MI_BATCH_BUFFER_END;
1174 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
1175 i915_gem_object_unpin_map(batch->obj);
1177 intel_gt_chipset_flush(batch->vm->gt);
1182 static int live_all_engines(void *arg)
1184 struct drm_i915_private *i915 = arg;
1185 const unsigned int nengines = num_uabi_engines(i915);
1186 struct intel_engine_cs *engine;
1187 struct i915_request **request;
1188 struct igt_live_test t;
1189 struct i915_vma *batch;
1194 * Check we can submit requests to all engines simultaneously. We
1195 * send a recursive batch to each engine - checking that we don't
1196 * block doing so, and that they don't complete too soon.
1199 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1203 err = igt_live_test_begin(&t, i915, __func__, "");
1207 batch = recursive_batch(i915);
1208 if (IS_ERR(batch)) {
1209 err = PTR_ERR(batch);
1210 pr_err("%s: Unable to create batch, err=%d\n", __func__, err);
1214 i915_vma_lock(batch);
1217 for_each_uabi_engine(engine, i915) {
1218 request[idx] = intel_engine_create_kernel_request(engine);
1219 if (IS_ERR(request[idx])) {
1220 err = PTR_ERR(request[idx]);
1221 pr_err("%s: Request allocation failed with err=%d\n",
1226 err = i915_request_await_object(request[idx], batch->obj, 0);
1228 err = i915_vma_move_to_active(batch, request[idx], 0);
1231 err = engine->emit_bb_start(request[idx],
1236 request[idx]->batch = batch;
1238 i915_request_get(request[idx]);
1239 i915_request_add(request[idx]);
1243 i915_vma_unlock(batch);
1246 for_each_uabi_engine(engine, i915) {
1247 if (i915_request_completed(request[idx])) {
1248 pr_err("%s(%s): request completed too early!\n",
1249 __func__, engine->name);
1256 err = recursive_batch_resolve(batch);
1258 pr_err("%s: failed to resolve batch, err=%d\n", __func__, err);
1263 for_each_uabi_engine(engine, i915) {
1266 timeout = i915_request_wait(request[idx], 0,
1267 MAX_SCHEDULE_TIMEOUT);
1270 pr_err("%s: error waiting for request on %s, err=%d\n",
1271 __func__, engine->name, err);
1275 GEM_BUG_ON(!i915_request_completed(request[idx]));
1276 i915_request_put(request[idx]);
1277 request[idx] = NULL;
1281 err = igt_live_test_end(&t);
1285 for_each_uabi_engine(engine, i915) {
1287 i915_request_put(request[idx]);
1290 i915_vma_unpin(batch);
1291 i915_vma_put(batch);
1297 static int live_sequential_engines(void *arg)
1299 struct drm_i915_private *i915 = arg;
1300 const unsigned int nengines = num_uabi_engines(i915);
1301 struct i915_request **request;
1302 struct i915_request *prev = NULL;
1303 struct intel_engine_cs *engine;
1304 struct igt_live_test t;
1309 * Check we can submit requests to all engines sequentially, such
1310 * that each successive request waits for the earlier ones. This
1311 * tests that we don't execute requests out of order, even though
1312 * they are running on independent engines.
1315 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1319 err = igt_live_test_begin(&t, i915, __func__, "");
1324 for_each_uabi_engine(engine, i915) {
1325 struct i915_vma *batch;
1327 batch = recursive_batch(i915);
1328 if (IS_ERR(batch)) {
1329 err = PTR_ERR(batch);
1330 pr_err("%s: Unable to create batch for %s, err=%d\n",
1331 __func__, engine->name, err);
1335 i915_vma_lock(batch);
1336 request[idx] = intel_engine_create_kernel_request(engine);
1337 if (IS_ERR(request[idx])) {
1338 err = PTR_ERR(request[idx]);
1339 pr_err("%s: Request allocation failed for %s with err=%d\n",
1340 __func__, engine->name, err);
1345 err = i915_request_await_dma_fence(request[idx],
1348 i915_request_add(request[idx]);
1349 pr_err("%s: Request await failed for %s with err=%d\n",
1350 __func__, engine->name, err);
1355 err = i915_request_await_object(request[idx],
1358 err = i915_vma_move_to_active(batch, request[idx], 0);
1361 err = engine->emit_bb_start(request[idx],
1366 request[idx]->batch = batch;
1368 i915_request_get(request[idx]);
1369 i915_request_add(request[idx]);
1371 prev = request[idx];
1375 i915_vma_unlock(batch);
1381 for_each_uabi_engine(engine, i915) {
1384 if (i915_request_completed(request[idx])) {
1385 pr_err("%s(%s): request completed too early!\n",
1386 __func__, engine->name);
1391 err = recursive_batch_resolve(request[idx]->batch);
1393 pr_err("%s: failed to resolve batch, err=%d\n",
1398 timeout = i915_request_wait(request[idx], 0,
1399 MAX_SCHEDULE_TIMEOUT);
1402 pr_err("%s: error waiting for request on %s, err=%d\n",
1403 __func__, engine->name, err);
1407 GEM_BUG_ON(!i915_request_completed(request[idx]));
1411 err = igt_live_test_end(&t);
1415 for_each_uabi_engine(engine, i915) {
1421 cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj,
1424 *cmd = MI_BATCH_BUFFER_END;
1426 __i915_gem_object_flush_map(request[idx]->batch->obj,
1428 i915_gem_object_unpin_map(request[idx]->batch->obj);
1430 intel_gt_chipset_flush(engine->gt);
1433 i915_vma_put(request[idx]->batch);
1434 i915_request_put(request[idx]);
1442 struct parallel_thread {
1443 struct kthread_worker *worker;
1444 struct kthread_work work;
1445 struct intel_engine_cs *engine;
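/*
 * The three workers exercised by live_parallel_engines():
 *   __live_parallel_engine1 - submit a request and synchronously wait for
 *                             it, one at a time, counting round trips;
 *   __live_parallel_engineN - submit requests back to back without waiting,
 *                             counting raw submission throughput;
 *   __live_parallel_spin    - hold each engine with a spinner to check that
 *                             the engines really do run independently.
 */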
1449 static void __live_parallel_engine1(struct kthread_work *work)
1451 struct parallel_thread *thread =
1452 container_of(work, typeof(*thread), work);
1453 struct intel_engine_cs *engine = thread->engine;
1454 IGT_TIMEOUT(end_time);
1455 unsigned long count;
1459 intel_engine_pm_get(engine);
1461 struct i915_request *rq;
1463 rq = i915_request_create(engine->kernel_context);
1469 i915_request_get(rq);
1470 i915_request_add(rq);
1473 if (i915_request_wait(rq, 0, HZ) < 0)
1475 i915_request_put(rq);
1480 } while (!__igt_timeout(end_time, NULL));
1481 intel_engine_pm_put(engine);
pr_info("%s: %lu requests + sync\n", engine->name, count);
1484 thread->result = err;
1487 static void __live_parallel_engineN(struct kthread_work *work)
1489 struct parallel_thread *thread =
1490 container_of(work, typeof(*thread), work);
1491 struct intel_engine_cs *engine = thread->engine;
1492 IGT_TIMEOUT(end_time);
1493 unsigned long count;
1497 intel_engine_pm_get(engine);
1499 struct i915_request *rq;
1501 rq = i915_request_create(engine->kernel_context);
1507 i915_request_add(rq);
1509 } while (!__igt_timeout(end_time, NULL));
1510 intel_engine_pm_put(engine);
1512 pr_info("%s: %lu requests\n", engine->name, count);
1513 thread->result = err;
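/*
 * wake_all()/wait_for_all() form a simple barrier on i915->selftest.counter:
 * each __live_parallel_spin worker decrements the counter once its spinner
 * is running (or it has failed to get one going), then waits for every
 * engine to check in before ending its spinner.
 */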
1516 static bool wake_all(struct drm_i915_private *i915)
1518 if (atomic_dec_and_test(&i915->selftest.counter)) {
1519 wake_up_var(&i915->selftest.counter);
1526 static int wait_for_all(struct drm_i915_private *i915)
1531 if (wait_var_event_timeout(&i915->selftest.counter,
1532 !atomic_read(&i915->selftest.counter),
1533 i915_selftest.timeout_jiffies))
1539 static void __live_parallel_spin(struct kthread_work *work)
1541 struct parallel_thread *thread =
1542 container_of(work, typeof(*thread), work);
1543 struct intel_engine_cs *engine = thread->engine;
1544 struct igt_spinner spin;
1545 struct i915_request *rq;
1549 * Create a spinner running for eternity on each engine. If a second
1550 * spinner is incorrectly placed on the same engine, it will not be
1551 * able to start in time.
1554 if (igt_spinner_init(&spin, engine->gt)) {
1555 wake_all(engine->i915);
1556 thread->result = -ENOMEM;
1560 intel_engine_pm_get(engine);
1561 rq = igt_spinner_create_request(&spin,
1562 engine->kernel_context,
1563 MI_NOOP); /* no preemption */
1564 intel_engine_pm_put(engine);
1569 wake_all(engine->i915);
1573 i915_request_get(rq);
1574 i915_request_add(rq);
1575 if (igt_wait_for_spinner(&spin, rq)) {
1576 /* Occupy this engine for the whole test */
1577 err = wait_for_all(engine->i915);
1579 pr_err("Failed to start spinner on %s\n", engine->name);
1582 igt_spinner_end(&spin);
1584 if (err == 0 && i915_request_wait(rq, 0, HZ) < 0)
1586 i915_request_put(rq);
1589 igt_spinner_fini(&spin);
1590 thread->result = err;
1593 static int live_parallel_engines(void *arg)
1595 struct drm_i915_private *i915 = arg;
1596 static void (* const func[])(struct kthread_work *) = {
1597 __live_parallel_engine1,
1598 __live_parallel_engineN,
1599 __live_parallel_spin,
1602 const unsigned int nengines = num_uabi_engines(i915);
1603 struct parallel_thread *threads;
1604 struct intel_engine_cs *engine;
1605 void (* const *fn)(struct kthread_work *);
1609 * Check we can submit requests to all engines concurrently. This
1610 * tests that we load up the system maximally.
1613 threads = kcalloc(nengines, sizeof(*threads), GFP_KERNEL);
1617 for (fn = func; !err && *fn; fn++) {
1618 char name[KSYM_NAME_LEN];
1619 struct igt_live_test t;
1622 snprintf(name, sizeof(name), "%ps", *fn);
1623 err = igt_live_test_begin(&t, i915, __func__, name);
1627 atomic_set(&i915->selftest.counter, nengines);
1630 for_each_uabi_engine(engine, i915) {
1631 struct kthread_worker *worker;
1633 worker = kthread_create_worker(0, "igt/parallel:%s",
1635 if (IS_ERR(worker)) {
1636 err = PTR_ERR(worker);
1640 threads[idx].worker = worker;
1641 threads[idx].result = 0;
1642 threads[idx].engine = engine;
1644 kthread_init_work(&threads[idx].work, *fn);
1645 kthread_queue_work(worker, &threads[idx].work);
1650 for_each_uabi_engine(engine, i915) {
1653 if (!threads[idx].worker)
1656 kthread_flush_work(&threads[idx].work);
1657 status = READ_ONCE(threads[idx].result);
1661 kthread_destroy_worker(threads[idx++].worker);
1664 if (igt_live_test_end(&t))
1673 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1675 struct i915_request *rq;
* Before execlists, all contexts share the same ringbuffer. With
* execlists, each context/engine has a separate ringbuffer and,
* for the purposes of this test, it is effectively inexhaustible.
*
* For the global ringbuffer though, we have to be very careful
* that we do not wrap while preventing the execution of requests
* with an unsignaled fence.
1687 if (HAS_EXECLISTS(ctx->i915))
1690 rq = igt_request_alloc(ctx, engine);
1696 ret = rq->ring->size - rq->reserved_space;
1697 i915_request_add(rq);
1699 sz = rq->ring->emit - rq->head;
1701 sz += rq->ring->size;
1703 ret /= 2; /* leave half spare, in case of emergency! */
1709 static int live_breadcrumbs_smoketest(void *arg)
1711 struct drm_i915_private *i915 = arg;
1712 const unsigned int nengines = num_uabi_engines(i915);
1713 const unsigned int ncpus = /* saturate with nengines * ncpus */
1714 max_t(int, 2, DIV_ROUND_UP(num_online_cpus(), nengines));
1715 unsigned long num_waits, num_fences;
1716 struct intel_engine_cs *engine;
1717 struct smoke_thread *threads;
1718 struct igt_live_test live;
1719 intel_wakeref_t wakeref;
1720 struct smoketest *smoke;
1721 unsigned int n, idx;
1726 * Smoketest our breadcrumb/signal handling for requests across multiple
1727 * threads. A very simple test to only catch the most egregious of bugs.
1728 * See __igt_breadcrumbs_smoketest();
1730 * On real hardware this time.
1733 wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1735 file = mock_file(i915);
1737 ret = PTR_ERR(file);
1741 smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1747 threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1753 smoke[0].request_alloc = __live_request_alloc;
1754 smoke[0].ncontexts = 64;
1755 smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1756 sizeof(*smoke[0].contexts),
1758 if (!smoke[0].contexts) {
1763 for (n = 0; n < smoke[0].ncontexts; n++) {
1764 smoke[0].contexts[n] = live_context(i915, file);
1765 if (IS_ERR(smoke[0].contexts[n])) {
1766 ret = PTR_ERR(smoke[0].contexts[n]);
1771 ret = igt_live_test_begin(&live, i915, __func__, "");
1776 for_each_uabi_engine(engine, i915) {
1777 smoke[idx] = smoke[0];
1778 smoke[idx].engine = engine;
1779 smoke[idx].max_batch =
1780 max_batches(smoke[0].contexts[0], engine);
1781 if (smoke[idx].max_batch < 0) {
1782 ret = smoke[idx].max_batch;
1785 /* One ring interleaved between requests from all cpus */
1786 smoke[idx].max_batch /= ncpus + 1;
1787 pr_debug("Limiting batches to %d requests on %s\n",
1788 smoke[idx].max_batch, engine->name);
1790 for (n = 0; n < ncpus; n++) {
1791 unsigned int i = idx * ncpus + n;
1792 struct kthread_worker *worker;
1794 worker = kthread_create_worker(0, "igt/%d.%d", idx, n);
1795 if (IS_ERR(worker)) {
1796 ret = PTR_ERR(worker);
1800 threads[i].worker = worker;
1801 threads[i].t = &smoke[idx];
1803 kthread_init_work(&threads[i].work,
1804 __igt_breadcrumbs_smoketest);
1805 kthread_queue_work(worker, &threads[i].work);
1811 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1817 for_each_uabi_engine(engine, i915) {
1818 for (n = 0; n < ncpus; n++) {
1819 unsigned int i = idx * ncpus + n;
1822 if (!threads[i].worker)
1825 WRITE_ONCE(threads[i].stop, true);
1826 kthread_flush_work(&threads[i].work);
1827 err = READ_ONCE(threads[i].result);
1828 if (err < 0 && !ret)
1831 kthread_destroy_worker(threads[i].worker);
1834 num_waits += atomic_long_read(&smoke[idx].num_waits);
1835 num_fences += atomic_long_read(&smoke[idx].num_fences);
1838 pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1839 num_waits, num_fences, idx, ncpus);
1841 ret = igt_live_test_end(&live) ?: ret;
1843 kfree(smoke[0].contexts);
1851 intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1856 int i915_request_live_selftests(struct drm_i915_private *i915)
1858 static const struct i915_subtest tests[] = {
1859 SUBTEST(live_nop_request),
1860 SUBTEST(live_all_engines),
1861 SUBTEST(live_sequential_engines),
1862 SUBTEST(live_parallel_engines),
1863 SUBTEST(live_empty_request),
1864 SUBTEST(live_cancel_request),
1865 SUBTEST(live_breadcrumbs_smoketest),
1868 if (intel_gt_is_wedged(to_gt(i915)))
1871 return i915_live_subtests(tests, i915);
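/*
 * switch_to_kernel_sync() queues a kernel-context request behind the last
 * request on @ce's timeline and waits for it, then flushes submission until
 * the engine idles. The perf tests use it to make sure all measured work has
 * retired before sampling the context runtime; an earlier error in @err is
 * preserved.
 */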
1874 static int switch_to_kernel_sync(struct intel_context *ce, int err)
1876 struct i915_request *rq;
1877 struct dma_fence *fence;
1879 rq = intel_engine_create_kernel_request(ce->engine);
1883 fence = i915_active_fence_get(&ce->timeline->last_request);
1885 i915_request_await_dma_fence(rq, fence);
1886 dma_fence_put(fence);
1889 rq = i915_request_get(rq);
1890 i915_request_add(rq);
1891 if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1893 i915_request_put(rq);
1895 while (!err && !intel_engine_is_idle(ce->engine))
1896 intel_engine_flush_submission(ce->engine);
1902 struct intel_engine_cs *engine;
1903 unsigned long count;
1909 struct perf_series {
1910 struct drm_i915_private *i915;
1911 unsigned int nengines;
1912 struct intel_context *ce[];
1915 static int cmp_u32(const void *A, const void *B)
1917 const u32 *a = A, *b = B;
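/*
 * trifilter() folds TF_COUNT latency samples into one value: the samples are
 * sorted, the extremes discarded and the median weighted twice (for five
 * samples, roughly a1 + 2 * a2 + a3 of the sorted array). The sum is
 * therefore scaled by 1 << TF_BIAS; callers shift by TF_BIAS, or use
 * cycles_to_ns(), to recover whole cycles.
 */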
1922 static u32 trifilter(u32 *a)
1927 sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1929 sum = mul_u32_u32(a[2], 2);
1933 GEM_BUG_ON(sum > U32_MAX);
1938 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1940 u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles);
1942 return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
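/*
 * emit_timestamp_store() emits an MI_STORE_REGISTER_MEM that copies the
 * engine's RING_TIMESTAMP register into a dword at @offset in the global GTT
 * (in practice a scratch slot in the engine's status page), providing the
 * GPU-side "read CS_TIMESTAMP" step of the measurements below.
 */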
1945 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1947 *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1948 *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1955 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1957 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1965 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1967 *cs++ = MI_SEMAPHORE_WAIT |
1968 MI_SEMAPHORE_GLOBAL_GTT |
1978 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1980 return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1983 static void semaphore_set(u32 *sema, u32 value)
1985 WRITE_ONCE(*sema, value);
1986 wmb(); /* flush the update to the cache, and beyond */
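/*
 * hwsp_scratch()/hwsp_offset() carve a small scratch area out of the
 * engine's status page (HWSP): hwsp_scratch() returns a zeroed CPU pointer
 * into the page and hwsp_offset() the matching GGTT offset, so the CPU and
 * the command streamer can exchange semaphore values and timestamps through
 * the same dwords.
 */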
1989 static u32 *hwsp_scratch(const struct intel_context *ce)
1991 return memset32(ce->engine->status_page.addr + 1000, 0, 21);
1994 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
1996 return (i915_ggtt_offset(ce->engine->status_page.vma) +
1997 offset_in_page(dw));
2000 static int measure_semaphore_response(struct intel_context *ce)
2002 u32 *sema = hwsp_scratch(ce);
2003 const u32 offset = hwsp_offset(ce, sema);
2004 u32 elapsed[TF_COUNT], cycles;
2005 struct i915_request *rq;
2011 * Measure how many cycles it takes for the HW to detect the change
2012 * in a semaphore value.
2014 * A: read CS_TIMESTAMP from CPU
2016 * B: read CS_TIMESTAMP on GPU
2018 * Semaphore latency: B - A
2021 semaphore_set(sema, -1);
2023 rq = i915_request_create(ce);
2027 cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
2029 i915_request_add(rq);
2034 cs = emit_store_dw(cs, offset, 0);
2035 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2036 cs = emit_semaphore_poll_until(cs, offset, i);
2037 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2038 cs = emit_store_dw(cs, offset, 0);
2041 intel_ring_advance(rq, cs);
2042 i915_request_add(rq);
2044 if (wait_for(READ_ONCE(*sema) == 0, 50)) {
2049 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2051 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2052 semaphore_set(sema, i);
2055 if (wait_for(READ_ONCE(*sema) == 0, 50)) {
2060 elapsed[i - 1] = sema[i] - cycles;
2063 cycles = trifilter(elapsed);
2064 pr_info("%s: semaphore response %d cycles, %lluns\n",
2065 ce->engine->name, cycles >> TF_BIAS,
2066 cycles_to_ns(ce->engine, cycles));
2068 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2071 intel_gt_set_wedged(ce->engine->gt);
2075 static int measure_idle_dispatch(struct intel_context *ce)
2077 u32 *sema = hwsp_scratch(ce);
2078 const u32 offset = hwsp_offset(ce, sema);
2079 u32 elapsed[TF_COUNT], cycles;
2085 * Measure how long it takes for us to submit a request while the
2086 * engine is idle, but is resting in our context.
2088 * A: read CS_TIMESTAMP from CPU
2090 * B: read CS_TIMESTAMP on GPU
2092 * Submission latency: B - A
2095 for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2096 struct i915_request *rq;
2098 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2102 rq = i915_request_create(ce);
2108 cs = intel_ring_begin(rq, 4);
2110 i915_request_add(rq);
2115 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2117 intel_ring_advance(rq, cs);
2121 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2122 i915_request_add(rq);
2127 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2131 for (i = 0; i < ARRAY_SIZE(elapsed); i++)
2132 elapsed[i] = sema[i] - elapsed[i];
2134 cycles = trifilter(elapsed);
2135 pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
2136 ce->engine->name, cycles >> TF_BIAS,
2137 cycles_to_ns(ce->engine, cycles));
2139 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2142 intel_gt_set_wedged(ce->engine->gt);
2146 static int measure_busy_dispatch(struct intel_context *ce)
2148 u32 *sema = hwsp_scratch(ce);
2149 const u32 offset = hwsp_offset(ce, sema);
2150 u32 elapsed[TF_COUNT + 1], cycles;
2156 * Measure how long it takes for us to submit a request while the
2157 * engine is busy, polling on a semaphore in our context. With
2158 * direct submission, this will include the cost of a lite restore.
2160 * A: read CS_TIMESTAMP from CPU
2162 * B: read CS_TIMESTAMP on GPU
2164 * Submission latency: B - A
2167 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2168 struct i915_request *rq;
2170 rq = i915_request_create(ce);
2176 cs = intel_ring_begin(rq, 12);
2178 i915_request_add(rq);
2183 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2184 cs = emit_semaphore_poll_until(cs, offset, i);
2185 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2187 intel_ring_advance(rq, cs);
2189 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
2196 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2197 i915_request_add(rq);
2199 semaphore_set(sema, i - 1);
2203 wait_for(READ_ONCE(sema[i - 1]), 500);
2204 semaphore_set(sema, i - 1);
2206 for (i = 1; i <= TF_COUNT; i++) {
2207 GEM_BUG_ON(sema[i] == -1);
2208 elapsed[i - 1] = sema[i] - elapsed[i];
2211 cycles = trifilter(elapsed);
2212 pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
2213 ce->engine->name, cycles >> TF_BIAS,
2214 cycles_to_ns(ce->engine, cycles));
2216 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2219 intel_gt_set_wedged(ce->engine->gt);
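/*
 * plug() submits a request on the kernel context that does nothing but poll
 * a semaphore in the status page, stalling the engine until the CPU releases
 * it with semaphore_set(). measure_inter_request() and
 * measure_context_switch() use it to hold back execution while they queue up
 * the requests they want to measure.
 */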
2223 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
2226 i915_ggtt_offset(engine->status_page.vma) +
2227 offset_in_page(sema);
2228 struct i915_request *rq;
2231 rq = i915_request_create(engine->kernel_context);
2235 cs = intel_ring_begin(rq, 4);
2237 i915_request_add(rq);
2241 cs = emit_semaphore_poll(cs, mode, value, offset);
2243 intel_ring_advance(rq, cs);
2244 i915_request_add(rq);
2249 static int measure_inter_request(struct intel_context *ce)
2251 u32 *sema = hwsp_scratch(ce);
2252 const u32 offset = hwsp_offset(ce, sema);
2253 u32 elapsed[TF_COUNT + 1], cycles;
2254 struct i915_sw_fence *submit;
2258 * Measure how long it takes to advance from one request into the
2259 * next. Between each request we flush the GPU caches to memory,
2260 * update the breadcrumbs, and then invalidate those caches.
2261 * We queue up all the requests to be submitted in one batch so
2262 * it should be one set of contiguous measurements.
2264 * A: read CS_TIMESTAMP on GPU
2266 * B: read CS_TIMESTAMP on GPU
2268 * Request latency: B - A
2271 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2275 submit = heap_fence_create(GFP_KERNEL);
2277 semaphore_set(sema, 1);
2281 intel_engine_flush_submission(ce->engine);
2282 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2283 struct i915_request *rq;
2286 rq = i915_request_create(ce);
2292 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
2296 i915_request_add(rq);
2300 cs = intel_ring_begin(rq, 4);
2302 i915_request_add(rq);
2307 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2309 intel_ring_advance(rq, cs);
2310 i915_request_add(rq);
2312 i915_sw_fence_commit(submit);
2313 intel_engine_flush_submission(ce->engine);
2314 heap_fence_put(submit);
2316 semaphore_set(sema, 1);
2317 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2321 for (i = 1; i <= TF_COUNT; i++)
2322 elapsed[i - 1] = sema[i + 1] - sema[i];
2324 cycles = trifilter(elapsed);
2325 pr_info("%s: inter-request latency %d cycles, %lluns\n",
2326 ce->engine->name, cycles >> TF_BIAS,
2327 cycles_to_ns(ce->engine, cycles));
2329 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2332 i915_sw_fence_commit(submit);
2333 heap_fence_put(submit);
2334 semaphore_set(sema, 1);
2336 intel_gt_set_wedged(ce->engine->gt);
2340 static int measure_context_switch(struct intel_context *ce)
2342 u32 *sema = hwsp_scratch(ce);
2343 const u32 offset = hwsp_offset(ce, sema);
2344 struct i915_request *fence = NULL;
2345 u32 elapsed[TF_COUNT + 1], cycles;
2350 * Measure how long it takes to advance from one request in one
2351 * context to a request in another context. This allows us to
2352 * measure how long the context save/restore take, along with all
2353 * the inter-context setup we require.
2355 * A: read CS_TIMESTAMP on GPU
2357 * B: read CS_TIMESTAMP on GPU
2359 * Context switch latency: B - A
2362 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2366 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2367 struct intel_context *arr[] = {
2368 ce, ce->engine->kernel_context
2370 u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
2372 for (j = 0; j < ARRAY_SIZE(arr); j++) {
2373 struct i915_request *rq;
2375 rq = i915_request_create(arr[j]);
2382 err = i915_request_await_dma_fence(rq,
2385 i915_request_add(rq);
2390 cs = intel_ring_begin(rq, 4);
2392 i915_request_add(rq);
2397 cs = emit_timestamp_store(cs, ce, addr);
2398 addr += sizeof(u32);
2400 intel_ring_advance(rq, cs);
2402 i915_request_put(fence);
2403 fence = i915_request_get(rq);
2405 i915_request_add(rq);
2408 i915_request_put(fence);
2409 intel_engine_flush_submission(ce->engine);
2411 semaphore_set(sema, 1);
2412 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2416 for (i = 1; i <= TF_COUNT; i++)
2417 elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2419 cycles = trifilter(elapsed);
2420 pr_info("%s: context switch latency %d cycles, %lluns\n",
2421 ce->engine->name, cycles >> TF_BIAS,
2422 cycles_to_ns(ce->engine, cycles));
2424 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2427 i915_request_put(fence);
2428 semaphore_set(sema, 1);
2430 intel_gt_set_wedged(ce->engine->gt);
2434 static int measure_preemption(struct intel_context *ce)
2436 u32 *sema = hwsp_scratch(ce);
2437 const u32 offset = hwsp_offset(ce, sema);
2438 u32 elapsed[TF_COUNT], cycles;
2444 * We measure two latencies while triggering preemption. The first
2445 * latency is how long it takes for us to submit a preempting request.
* The second latency is how long it takes for us to return from the
2447 * preemption back to the original context.
2449 * A: read CS_TIMESTAMP from CPU
2451 * B: read CS_TIMESTAMP on GPU (in preempting context)
2453 * C: read CS_TIMESTAMP on GPU (in original context)
2455 * Preemption dispatch latency: B - A
2456 * Preemption switch latency: C - B
2459 if (!intel_engine_has_preemption(ce->engine))
2462 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2463 u32 addr = offset + 2 * i * sizeof(u32);
2464 struct i915_request *rq;
2466 rq = i915_request_create(ce);
2472 cs = intel_ring_begin(rq, 12);
2474 i915_request_add(rq);
2479 cs = emit_store_dw(cs, addr, -1);
2480 cs = emit_semaphore_poll_until(cs, offset, i);
2481 cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2483 intel_ring_advance(rq, cs);
2484 i915_request_add(rq);
2486 if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2491 rq = i915_request_create(ce->engine->kernel_context);
2497 cs = intel_ring_begin(rq, 8);
2499 i915_request_add(rq);
2504 cs = emit_timestamp_store(cs, ce, addr);
2505 cs = emit_store_dw(cs, offset, i);
2507 intel_ring_advance(rq, cs);
2508 rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2510 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2511 i915_request_add(rq);
2514 if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2519 for (i = 1; i <= TF_COUNT; i++)
2520 elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2522 cycles = trifilter(elapsed);
2523 pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2524 ce->engine->name, cycles >> TF_BIAS,
2525 cycles_to_ns(ce->engine, cycles));
2527 for (i = 1; i <= TF_COUNT; i++)
2528 elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2530 cycles = trifilter(elapsed);
2531 pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2532 ce->engine->name, cycles >> TF_BIAS,
2533 cycles_to_ns(ce->engine, cycles));
2535 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2538 intel_gt_set_wedged(ce->engine->gt);
2543 struct dma_fence_cb base;
2547 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2549 struct signal_cb *s = container_of(cb, typeof(*s), base);
2551 smp_store_mb(s->seen, true); /* be safe, be strong */
2554 static int measure_completion(struct intel_context *ce)
2556 u32 *sema = hwsp_scratch(ce);
2557 const u32 offset = hwsp_offset(ce, sema);
2558 u32 elapsed[TF_COUNT], cycles;
* Measure how long it takes for the signal (interrupt) to be
* sent from the GPU and then processed by the CPU.
2567 * A: read CS_TIMESTAMP on GPU
2569 * B: read CS_TIMESTAMP from CPU
2571 * Completion latency: B - A
2574 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2575 struct signal_cb cb = { .seen = false };
2576 struct i915_request *rq;
2578 rq = i915_request_create(ce);
2584 cs = intel_ring_begin(rq, 12);
2586 i915_request_add(rq);
2591 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2592 cs = emit_semaphore_poll_until(cs, offset, i);
2593 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2595 intel_ring_advance(rq, cs);
2597 dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2598 i915_request_add(rq);
2600 intel_engine_flush_submission(ce->engine);
2601 if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2607 semaphore_set(sema, i);
2608 while (!READ_ONCE(cb.seen))
2611 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2615 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2619 for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2620 GEM_BUG_ON(sema[i + 1] == -1);
2621 elapsed[i] = elapsed[i] - sema[i + 1];
2624 cycles = trifilter(elapsed);
2625 pr_info("%s: completion latency %d cycles, %lluns\n",
2626 ce->engine->name, cycles >> TF_BIAS,
2627 cycles_to_ns(ce->engine, cycles));
2629 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2632 intel_gt_set_wedged(ce->engine->gt);
2636 static void rps_pin(struct intel_gt *gt)
2638 /* Pin the frequency to max */
atomic_inc(&gt->rps.num_waiters);
intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);

mutex_lock(&gt->rps.lock);
intel_rps_set(&gt->rps, gt->rps.max_freq);
mutex_unlock(&gt->rps.lock);
2647 static void rps_unpin(struct intel_gt *gt)
2649 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
atomic_dec(&gt->rps.num_waiters);
2653 static int perf_request_latency(void *arg)
2655 struct drm_i915_private *i915 = arg;
2656 struct intel_engine_cs *engine;
2657 struct pm_qos_request qos;
2660 if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */
2663 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2665 for_each_uabi_engine(engine, i915) {
2666 struct intel_context *ce;
2668 ce = intel_context_create(engine);
2674 err = intel_context_pin(ce);
2676 intel_context_put(ce);
2680 st_engine_heartbeat_disable(engine);
2681 rps_pin(engine->gt);
2684 err = measure_semaphore_response(ce);
2686 err = measure_idle_dispatch(ce);
2688 err = measure_busy_dispatch(ce);
2690 err = measure_inter_request(ce);
2692 err = measure_context_switch(ce);
2694 err = measure_preemption(ce);
2696 err = measure_completion(ce);
2698 rps_unpin(engine->gt);
2699 st_engine_heartbeat_enable(engine);
2701 intel_context_unpin(ce);
2702 intel_context_put(ce);
2708 if (igt_flush_test(i915))
2711 cpu_latency_qos_remove_request(&qos);
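/*
 * The three submission patterns exercised by perf_series_engines(), each
 * cycling through every pinned context in the series:
 *   s_sync0 - submit one request and wait for it before moving on;
 *   s_sync1 - keep one request in flight, waiting on the previous request
 *             after submitting the next;
 *   s_many  - submit continuously without waiting at all.
 */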
2715 static int s_sync0(void *arg)
2717 struct perf_series *ps = arg;
2718 IGT_TIMEOUT(end_time);
2719 unsigned int idx = 0;
2722 GEM_BUG_ON(!ps->nengines);
2724 struct i915_request *rq;
2726 rq = i915_request_create(ps->ce[idx]);
2732 i915_request_get(rq);
2733 i915_request_add(rq);
2735 if (i915_request_wait(rq, 0, HZ / 5) < 0)
2737 i915_request_put(rq);
2741 if (++idx == ps->nengines)
2743 } while (!__igt_timeout(end_time, NULL));
2748 static int s_sync1(void *arg)
2750 struct perf_series *ps = arg;
2751 struct i915_request *prev = NULL;
2752 IGT_TIMEOUT(end_time);
2753 unsigned int idx = 0;
2756 GEM_BUG_ON(!ps->nengines);
2758 struct i915_request *rq;
2760 rq = i915_request_create(ps->ce[idx]);
2766 i915_request_get(rq);
2767 i915_request_add(rq);
2769 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2771 i915_request_put(prev);
2776 if (++idx == ps->nengines)
2778 } while (!__igt_timeout(end_time, NULL));
2779 i915_request_put(prev);
2784 static int s_many(void *arg)
2786 struct perf_series *ps = arg;
2787 IGT_TIMEOUT(end_time);
2788 unsigned int idx = 0;
2790 GEM_BUG_ON(!ps->nengines);
2792 struct i915_request *rq;
2794 rq = i915_request_create(ps->ce[idx]);
2798 i915_request_add(rq);
2800 if (++idx == ps->nengines)
2802 } while (!__igt_timeout(end_time, NULL));
2807 static int perf_series_engines(void *arg)
2809 struct drm_i915_private *i915 = arg;
2810 static int (* const func[])(void *arg) = {
2816 const unsigned int nengines = num_uabi_engines(i915);
2817 struct intel_engine_cs *engine;
2818 int (* const *fn)(void *arg);
2819 struct pm_qos_request qos;
2820 struct perf_stats *stats;
2821 struct perf_series *ps;
2825 stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
2829 ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
2835 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2838 ps->nengines = nengines;
2841 for_each_uabi_engine(engine, i915) {
2842 struct intel_context *ce;
2844 ce = intel_context_create(engine);
2850 err = intel_context_pin(ce);
2852 intel_context_put(ce);
2858 GEM_BUG_ON(idx != ps->nengines);
2860 for (fn = func; *fn && !err; fn++) {
2861 char name[KSYM_NAME_LEN];
2862 struct igt_live_test t;
2864 snprintf(name, sizeof(name), "%ps", *fn);
2865 err = igt_live_test_begin(&t, i915, __func__, name);
2869 for (idx = 0; idx < nengines; idx++) {
2870 struct perf_stats *p =
2871 memset(&stats[idx], 0, sizeof(stats[idx]));
2872 struct intel_context *ce = ps->ce[idx];
2874 p->engine = ps->ce[idx]->engine;
2875 intel_engine_pm_get(p->engine);
2877 if (intel_engine_supports_stats(p->engine))
2878 p->busy = intel_engine_get_busy_time(p->engine,
2881 p->time = ktime_get();
2882 p->runtime = -intel_context_get_total_runtime_ns(ce);
2886 if (igt_live_test_end(&t))
2889 for (idx = 0; idx < nengines; idx++) {
2890 struct perf_stats *p = &stats[idx];
2891 struct intel_context *ce = ps->ce[idx];
2892 int integer, decimal;
2896 p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
2901 p->time = ktime_sub(now, p->time);
2903 err = switch_to_kernel_sync(ce, err);
2904 p->runtime += intel_context_get_total_runtime_ns(ce);
2905 intel_engine_pm_put(p->engine);
2907 busy = 100 * ktime_to_ns(p->busy);
2908 dt = ktime_to_ns(p->time);
2910 integer = div64_u64(busy, dt);
2911 busy -= integer * dt;
2912 decimal = div64_u64(100 * busy, dt);
2918 pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2919 name, p->engine->name, ce->timeline->seqno,
2921 div_u64(p->runtime, 1000 * 1000),
2922 div_u64(ktime_to_ns(p->time), 1000 * 1000));
2927 for (idx = 0; idx < nengines; idx++) {
2928 if (IS_ERR_OR_NULL(ps->ce[idx]))
2931 intel_context_unpin(ps->ce[idx]);
2932 intel_context_put(ps->ce[idx]);
2936 cpu_latency_qos_remove_request(&qos);
2942 struct perf_stats p;
2943 struct kthread_worker *worker;
2944 struct kthread_work work;
2945 struct intel_engine_cs *engine;
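/*
 * Per-engine workers for perf_parallel_engines(); each creates and pins its
 * own context and records wall time, context runtime and (where supported)
 * engine busy time in its perf_stats:
 *   p_sync0 - submit and synchronously wait for each request in turn;
 *   p_sync1 - always keep one request in flight, waiting on the previous one;
 *   p_many  - submit requests as fast as possible without waiting.
 */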
2949 static void p_sync0(struct kthread_work *work)
2951 struct p_thread *thread = container_of(work, typeof(*thread), work);
2952 struct perf_stats *p = &thread->p;
2953 struct intel_engine_cs *engine = p->engine;
2954 struct intel_context *ce;
2955 IGT_TIMEOUT(end_time);
2956 unsigned long count;
2960 ce = intel_context_create(engine);
2962 thread->result = PTR_ERR(ce);
2966 err = intel_context_pin(ce);
2968 intel_context_put(ce);
2969 thread->result = err;
2973 if (intel_engine_supports_stats(engine)) {
2974 p->busy = intel_engine_get_busy_time(engine, &p->time);
2977 p->time = ktime_get();
2983 struct i915_request *rq;
2985 rq = i915_request_create(ce);
2991 i915_request_get(rq);
2992 i915_request_add(rq);
2995 if (i915_request_wait(rq, 0, HZ) < 0)
2997 i915_request_put(rq);
3002 } while (!__igt_timeout(end_time, NULL));
3007 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3009 p->time = ktime_sub(now, p->time);
3011 p->time = ktime_sub(ktime_get(), p->time);
3014 err = switch_to_kernel_sync(ce, err);
3015 p->runtime = intel_context_get_total_runtime_ns(ce);
3018 intel_context_unpin(ce);
3019 intel_context_put(ce);
3020 thread->result = err;
3023 static void p_sync1(struct kthread_work *work)
3025 struct p_thread *thread = container_of(work, typeof(*thread), work);
3026 struct perf_stats *p = &thread->p;
3027 struct intel_engine_cs *engine = p->engine;
3028 struct i915_request *prev = NULL;
3029 struct intel_context *ce;
3030 IGT_TIMEOUT(end_time);
3031 unsigned long count;
3035 ce = intel_context_create(engine);
3037 thread->result = PTR_ERR(ce);
3041 err = intel_context_pin(ce);
3043 intel_context_put(ce);
3044 thread->result = err;
3048 if (intel_engine_supports_stats(engine)) {
3049 p->busy = intel_engine_get_busy_time(engine, &p->time);
3052 p->time = ktime_get();
3058 struct i915_request *rq;
3060 rq = i915_request_create(ce);
3066 i915_request_get(rq);
3067 i915_request_add(rq);
3070 if (prev && i915_request_wait(prev, 0, HZ) < 0)
3072 i915_request_put(prev);
3078 } while (!__igt_timeout(end_time, NULL));
3079 i915_request_put(prev);
3084 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3086 p->time = ktime_sub(now, p->time);
3088 p->time = ktime_sub(ktime_get(), p->time);
3091 err = switch_to_kernel_sync(ce, err);
3092 p->runtime = intel_context_get_total_runtime_ns(ce);
3095 intel_context_unpin(ce);
3096 intel_context_put(ce);
3097 thread->result = err;
3100 static void p_many(struct kthread_work *work)
3102 struct p_thread *thread = container_of(work, typeof(*thread), work);
3103 struct perf_stats *p = &thread->p;
3104 struct intel_engine_cs *engine = p->engine;
3105 struct intel_context *ce;
3106 IGT_TIMEOUT(end_time);
3107 unsigned long count;
3111 ce = intel_context_create(engine);
3113 thread->result = PTR_ERR(ce);
3117 err = intel_context_pin(ce);
3119 intel_context_put(ce);
3120 thread->result = err;
3124 if (intel_engine_supports_stats(engine)) {
3125 p->busy = intel_engine_get_busy_time(engine, &p->time);
3128 p->time = ktime_get();
3134 struct i915_request *rq;
3136 rq = i915_request_create(ce);
3142 i915_request_add(rq);
3144 } while (!__igt_timeout(end_time, NULL));
3149 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3151 p->time = ktime_sub(now, p->time);
3153 p->time = ktime_sub(ktime_get(), p->time);
3156 err = switch_to_kernel_sync(ce, err);
3157 p->runtime = intel_context_get_total_runtime_ns(ce);
3160 intel_context_unpin(ce);
3161 intel_context_put(ce);
3162 thread->result = err;
3165 static int perf_parallel_engines(void *arg)
3167 struct drm_i915_private *i915 = arg;
3168 static void (* const func[])(struct kthread_work *) = {
3174 const unsigned int nengines = num_uabi_engines(i915);
3175 void (* const *fn)(struct kthread_work *);
3176 struct intel_engine_cs *engine;
3177 struct pm_qos_request qos;
3178 struct p_thread *engines;
3181 engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
3185 cpu_latency_qos_add_request(&qos, 0);
3187 for (fn = func; *fn; fn++) {
3188 char name[KSYM_NAME_LEN];
3189 struct igt_live_test t;
3192 snprintf(name, sizeof(name), "%ps", *fn);
3193 err = igt_live_test_begin(&t, i915, __func__, name);
3197 atomic_set(&i915->selftest.counter, nengines);
3200 for_each_uabi_engine(engine, i915) {
3201 struct kthread_worker *worker;
3203 intel_engine_pm_get(engine);
3205 memset(&engines[idx].p, 0, sizeof(engines[idx].p));
3207 worker = kthread_create_worker(0, "igt:%s",
3209 if (IS_ERR(worker)) {
3210 err = PTR_ERR(worker);
3211 intel_engine_pm_put(engine);
3214 engines[idx].worker = worker;
3215 engines[idx].result = 0;
3216 engines[idx].p.engine = engine;
3217 engines[idx].engine = engine;
3219 kthread_init_work(&engines[idx].work, *fn);
3220 kthread_queue_work(worker, &engines[idx].work);
3225 for_each_uabi_engine(engine, i915) {
3228 if (!engines[idx].worker)
3231 kthread_flush_work(&engines[idx].work);
3232 status = READ_ONCE(engines[idx].result);
3236 intel_engine_pm_put(engine);
3238 kthread_destroy_worker(engines[idx].worker);
3242 if (igt_live_test_end(&t))
3248 for_each_uabi_engine(engine, i915) {
3249 struct perf_stats *p = &engines[idx].p;
3250 u64 busy = 100 * ktime_to_ns(p->busy);
3251 u64 dt = ktime_to_ns(p->time);
3252 int integer, decimal;
3255 integer = div64_u64(busy, dt);
3256 busy -= integer * dt;
3257 decimal = div64_u64(100 * busy, dt);
3263 GEM_BUG_ON(engine != p->engine);
3264 pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
3265 name, engine->name, p->count, integer, decimal,
3266 div_u64(p->runtime, 1000 * 1000),
3267 div_u64(ktime_to_ns(p->time), 1000 * 1000));
3272 cpu_latency_qos_remove_request(&qos);
3277 int i915_request_perf_selftests(struct drm_i915_private *i915)
3279 static const struct i915_subtest tests[] = {
3280 SUBTEST(perf_request_latency),
3281 SUBTEST(perf_series_engines),
3282 SUBTEST(perf_parallel_engines),
3285 if (intel_gt_is_wedged(to_gt(i915)))
3288 return i915_subtests(tests, i915);