1 // SPDX-License-Identifier: GPL-2.0
3 * Performance event support - Processor Activity Instrumentation Extension
6 * Copyright IBM Corp. 2022
9 #define KMSG_COMPONENT "pai_ext"
10 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
12 #include <linux/kernel.h>
13 #include <linux/kernel_stat.h>
14 #include <linux/percpu.h>
15 #include <linux/notifier.h>
16 #include <linux/init.h>
17 #include <linux/export.h>
19 #include <linux/perf_event.h>
20 #include <asm/ctlreg.h>
22 #include <asm/debug.h>
24 #define PAIE1_CB_SZ 0x200 /* Size of PAIE1 control block */
25 #define PAIE1_CTRBLOCK_SZ 0x400 /* Size of PAIE1 counter blocks */
27 static debug_info_t *paiext_dbg;
28 static unsigned int paiext_cnt; /* Extracted with QPACI instruction */
35 /* Create the PAI extension 1 control block area.
36 * The PAI extension control block 1 is pointed to by lowcore
37 * address 0x1508 for each CPU. This control block is 512 bytes in size
38 * and requires a 512 byte boundary alignment.
40 struct paiext_cb { /* PAI extension 1 control block */
41 u64 header; /* Not used */
43 u64 acc; /* Addr to analytics counter control block */
48 unsigned long *area; /* Area for CPU to store counters */
49 struct pai_userdata *save; /* Area to store non-zero counters */
50 unsigned int active_events; /* # of PAI Extension users */
52 struct perf_event *event; /* Perf event for sampling */
53 struct paiext_cb *paiext_cb; /* PAI extension control block area */
54 struct list_head syswide_list; /* List system-wide sampling events */
57 struct paiext_mapptr {
58 struct paiext_map *mapptr;
61 static struct paiext_root { /* Anchor to per CPU data */
62 refcount_t refcnt; /* Overall active events */
63 struct paiext_mapptr __percpu *mapptr;
66 /* Free per CPU data when the last event is removed. */
67 static void paiext_root_free(void)
69 if (refcount_dec_and_test(&paiext_root.refcnt)) {
70 free_percpu(paiext_root.mapptr);
71 paiext_root.mapptr = NULL;
73 debug_sprintf_event(paiext_dbg, 5, "%s root.refcount %d\n", __func__,
74 refcount_read(&paiext_root.refcnt));
77 /* On initialization of first event also allocate per CPU data dynamically.
78 * Start with an array of pointers, the array size is the maximum number of
79 * CPUs possible, which might be larger than the number of CPUs currently
82 static int paiext_root_alloc(void)
84 if (!refcount_inc_not_zero(&paiext_root.refcnt)) {
85 /* The memory is already zeroed. */
86 paiext_root.mapptr = alloc_percpu(struct paiext_mapptr);
87 if (!paiext_root.mapptr) {
88 /* Returning without refcnt adjustment is ok. The
89 * error code is handled by paiext_alloc() which
90 * decrements refcnt when an event can not be
95 refcount_set(&paiext_root.refcnt, 1);
100 /* Protects against concurrent increments of the sampler and counter
101 * members at the same time and prohibits concurrent execution of
102 * counting and sampling events.
103 * Ensures that analytics counter block is deallocated only when the
104 * sampling and counting on that cpu is zero.
105 * For details see paiext_alloc().
107 static DEFINE_MUTEX(paiext_reserve_mutex);
109 /* Free all memory allocated for event counting/sampling setup */
110 static void paiext_free(struct paiext_mapptr *mp)
112 kfree(mp->mapptr->area);
113 kfree(mp->mapptr->paiext_cb);
114 kvfree(mp->mapptr->save);
119 /* Release the PMU if event is the last perf event */
120 static void paiext_event_destroy_cpu(struct perf_event *event, int cpu)
122 struct paiext_mapptr *mp = per_cpu_ptr(paiext_root.mapptr, cpu);
123 struct paiext_map *cpump = mp->mapptr;
125 mutex_lock(&paiext_reserve_mutex);
126 if (refcount_dec_and_test(&cpump->refcnt)) /* Last reference gone */
129 mutex_unlock(&paiext_reserve_mutex);
132 static void paiext_event_destroy(struct perf_event *event)
136 free_page(PAI_SAVE_AREA(event));
137 if (event->cpu == -1) {
138 struct cpumask *mask = PAI_CPU_MASK(event);
140 for_each_cpu(cpu, mask)
141 paiext_event_destroy_cpu(event, cpu);
144 paiext_event_destroy_cpu(event, event->cpu);
146 debug_sprintf_event(paiext_dbg, 4, "%s cpu %d\n", __func__,
150 /* Used to avoid races in checking concurrent access of counting and
151 * sampling for pai_extension events.
153 * Only one instance of event pai_ext/NNPA_ALL/ for sampling is
154 * allowed and when this event is running, no counting event is allowed.
155 * Several counting events are allowed in parallel, but no sampling event
156 * is allowed while one (or more) counting events are running.
158 * This function is called in process context and it is safe to block.
159 * When the event initialization functions fails, no other call back will
162 * Allocate the memory for the event.
164 static int paiext_alloc_cpu(struct perf_event *event, int cpu)
166 struct paiext_mapptr *mp;
167 struct paiext_map *cpump;
170 mutex_lock(&paiext_reserve_mutex);
171 rc = paiext_root_alloc();
175 mp = per_cpu_ptr(paiext_root.mapptr, cpu);
177 if (!cpump) { /* Paiext_map allocated? */
179 cpump = kzalloc(sizeof(*cpump), GFP_KERNEL);
183 /* Allocate memory for counter area and counter extraction.
185 * - a 512 byte block and requires 512 byte boundary alignment.
186 * - a 1KB block and requires 1KB boundary alignment.
187 * Only the first counting event has to allocate the area.
189 * Note: This works with commit 59bb47985c1d by default.
190 * Backporting this to kernels without this commit might
194 cpump->area = kzalloc(PAIE1_CTRBLOCK_SZ, GFP_KERNEL);
195 cpump->paiext_cb = kzalloc(PAIE1_CB_SZ, GFP_KERNEL);
196 cpump->save = kvmalloc_array(paiext_cnt + 1,
197 sizeof(struct pai_userdata),
199 if (!cpump->save || !cpump->area || !cpump->paiext_cb) {
203 INIT_LIST_HEAD(&cpump->syswide_list);
204 refcount_set(&cpump->refcnt, 1);
207 refcount_inc(&cpump->refcnt);
212 /* Error in allocation of event, decrement anchor. Since
213 * the event is not created, its destroy() function is never
214 * invoked. Adjust the reference counter for the anchor.
219 mutex_unlock(&paiext_reserve_mutex);
220 /* If rc is non-zero, no increment of counter/sampler was done. */
224 static int paiext_alloc(struct perf_event *event)
226 struct cpumask *maskptr;
227 int cpu, rc = -ENOMEM;
229 maskptr = kzalloc(sizeof(*maskptr), GFP_KERNEL);
233 for_each_online_cpu(cpu) {
234 rc = paiext_alloc_cpu(event, cpu);
236 for_each_cpu(cpu, maskptr)
237 paiext_event_destroy_cpu(event, cpu);
241 cpumask_set_cpu(cpu, maskptr);
245 * On error all cpumask are freed and all events have been destroyed.
246 * Save for which CPUs data structures have been allocated.
247 * Release them in the paiext_event_destroy() call back function
250 PAI_CPU_MASK(event) = maskptr;
256 /* The PAI extension 1 control block supports up to 128 entries. Return
257 * the index within PAIE1_CB given the event number. Also validate event
260 static int paiext_event_valid(struct perf_event *event)
262 u64 cfg = event->attr.config;
264 if (cfg >= PAI_NNPA_BASE && cfg <= PAI_NNPA_BASE + paiext_cnt) {
265 /* Offset NNPA in paiext_cb */
266 event->hw.config_base = offsetof(struct paiext_cb, acc);
272 /* Might be called on different CPU than the one the event is intended for. */
273 static int paiext_event_init(struct perf_event *event)
275 struct perf_event_attr *a = &event->attr;
278 /* PMU pai_ext registered as PERF_TYPE_RAW, check event type */
279 if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type)
281 /* PAI extension event must be valid and in supported range */
282 rc = paiext_event_valid(event);
285 /* Allow only event NNPA_ALL for sampling. */
286 if (a->sample_period && a->config != PAI_NNPA_BASE)
288 /* Prohibit exclude_user event selection */
291 /* Get a page to store last counter values for sampling */
292 if (a->sample_period) {
293 PAI_SAVE_AREA(event) = get_zeroed_page(GFP_KERNEL);
294 if (!PAI_SAVE_AREA(event))
299 rc = paiext_alloc_cpu(event, event->cpu);
301 rc = paiext_alloc(event);
303 free_page(PAI_SAVE_AREA(event));
306 event->destroy = paiext_event_destroy;
308 if (a->sample_period) {
309 a->sample_period = 1;
311 /* Register for paiext_sched_task() to be called */
312 event->attach_state |= PERF_ATTACH_SCHED_CB;
313 /* Add raw data which are the memory mapped counters */
314 a->sample_type |= PERF_SAMPLE_RAW;
315 /* Turn off inheritance */
322 static u64 paiext_getctr(unsigned long *area, int nr)
327 /* Read the counter values. Return value from location in buffer. For event
328 * NNPA_ALL sum up all events.
330 static u64 paiext_getdata(struct perf_event *event)
332 struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
333 struct paiext_map *cpump = mp->mapptr;
337 if (event->attr.config != PAI_NNPA_BASE)
338 return paiext_getctr(cpump->area,
339 event->attr.config - PAI_NNPA_BASE);
341 for (i = 1; i <= paiext_cnt; i++)
342 sum += paiext_getctr(cpump->area, i);
347 static u64 paiext_getall(struct perf_event *event)
349 return paiext_getdata(event);
352 static void paiext_read(struct perf_event *event)
354 u64 prev, new, delta;
356 prev = local64_read(&event->hw.prev_count);
357 new = paiext_getall(event);
358 local64_set(&event->hw.prev_count, new);
360 local64_add(delta, &event->count);
363 static void paiext_start(struct perf_event *event, int flags)
365 struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
366 struct paiext_map *cpump = mp->mapptr;
369 if (!event->attr.sample_period) { /* Counting */
370 sum = paiext_getall(event); /* Get current value */
371 local64_set(&event->hw.prev_count, sum);
372 } else { /* Sampling */
373 memcpy((void *)PAI_SAVE_AREA(event), cpump->area,
375 /* Enable context switch callback for system-wide sampling */
376 if (!(event->attach_state & PERF_ATTACH_TASK)) {
377 list_add_tail(PAI_SWLIST(event), &cpump->syswide_list);
378 perf_sched_cb_inc(event->pmu);
380 cpump->event = event;
385 static int paiext_add(struct perf_event *event, int flags)
387 struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
388 struct paiext_map *cpump = mp->mapptr;
389 struct paiext_cb *pcb = cpump->paiext_cb;
391 if (++cpump->active_events == 1) {
392 get_lowcore()->aicd = virt_to_phys(cpump->paiext_cb);
393 pcb->acc = virt_to_phys(cpump->area) | 0x1;
394 /* Enable CPU instruction lookup for PAIE1 control block */
395 local_ctl_set_bit(0, CR0_PAI_EXTENSION_BIT);
397 if (flags & PERF_EF_START)
398 paiext_start(event, PERF_EF_RELOAD);
403 static void paiext_have_sample(struct perf_event *, struct paiext_map *);
404 static void paiext_stop(struct perf_event *event, int flags)
406 struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
407 struct paiext_map *cpump = mp->mapptr;
409 if (!event->attr.sample_period) { /* Counting */
411 } else { /* Sampling */
412 if (!(event->attach_state & PERF_ATTACH_TASK)) {
413 list_del(PAI_SWLIST(event));
414 perf_sched_cb_dec(event->pmu);
416 paiext_have_sample(event, cpump);
420 event->hw.state = PERF_HES_STOPPED;
423 static void paiext_del(struct perf_event *event, int flags)
425 struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
426 struct paiext_map *cpump = mp->mapptr;
427 struct paiext_cb *pcb = cpump->paiext_cb;
429 paiext_stop(event, PERF_EF_UPDATE);
430 if (--cpump->active_events == 0) {
431 /* Disable CPU instruction lookup for PAIE1 control block */
432 local_ctl_clear_bit(0, CR0_PAI_EXTENSION_BIT);
434 get_lowcore()->aicd = 0;
438 /* Create raw data and save it in buffer. Returns number of bytes copied.
439 * Saves only positive counter entries of the form
440 * 2 bytes: Number of counter
441 * 8 bytes: Value of counter
443 static size_t paiext_copy(struct pai_userdata *userdata, unsigned long *area,
444 unsigned long *area_old)
448 for (i = 1; i <= paiext_cnt; i++) {
449 u64 val = paiext_getctr(area, i);
450 u64 val_old = paiext_getctr(area_old, i);
455 val = (~0ULL - val_old) + val + 1;
457 userdata[outidx].num = i;
458 userdata[outidx].value = val;
462 return outidx * sizeof(*userdata);
465 /* Write sample when one or more counters values are nonzero.
467 * Note: The functions paiext_sched_task() and paiext_push_sample() are not
468 * invoked after function paiext_del() has been called because of function
469 * perf_sched_cb_dec().
470 * The functions paiext_sched_task() and paiext_push_sample() are only
471 * called when sampling is active. Function perf_sched_cb_inc()
472 * has been invoked to install function paiext_sched_task() as call back
473 * to run at context switch time (see paiext_start(), called from paiext_add()).
475 * This causes function perf_event_context_sched_out() and
476 * perf_event_context_sched_in() to check whether the PMU has installed an
477 * sched_task() callback. That callback is not active after paiext_del()
478 * returns and has deleted the event on that CPU.
480 static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump,
481 struct perf_event *event)
483 struct perf_sample_data data;
484 struct perf_raw_record raw;
488 /* Setup perf sample */
489 memset(®s, 0, sizeof(regs));
490 memset(&raw, 0, sizeof(raw));
491 memset(&data, 0, sizeof(data));
492 perf_sample_data_init(&data, 0, event->hw.last_period);
493 if (event->attr.sample_type & PERF_SAMPLE_TID) {
494 data.tid_entry.pid = task_tgid_nr(current);
495 data.tid_entry.tid = task_pid_nr(current);
497 if (event->attr.sample_type & PERF_SAMPLE_TIME)
498 data.time = event->clock();
499 if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
501 if (event->attr.sample_type & PERF_SAMPLE_CPU)
502 data.cpu_entry.cpu = smp_processor_id();
503 if (event->attr.sample_type & PERF_SAMPLE_RAW) {
504 raw.frag.size = rawsize;
505 raw.frag.data = cpump->save;
506 perf_sample_save_raw_data(&data, &raw);
509 overflow = perf_event_overflow(event, &data, ®s);
510 perf_event_update_userpage(event);
511 /* Save NNPA lowcore area after read in event */
512 memcpy((void *)PAI_SAVE_AREA(event), cpump->area,
517 /* Check if there is data to be saved on schedule out of a task. */
518 static void paiext_have_sample(struct perf_event *event,
519 struct paiext_map *cpump)
525 rawsize = paiext_copy(cpump->save, cpump->area,
526 (unsigned long *)PAI_SAVE_AREA(event));
527 if (rawsize) /* Incremented counters */
528 paiext_push_sample(rawsize, cpump, event);
531 /* Check if there is data to be saved on schedule out of a task. */
532 static void paiext_have_samples(void)
534 struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
535 struct paiext_map *cpump = mp->mapptr;
536 struct perf_event *event;
538 list_for_each_entry(event, &cpump->syswide_list, hw.tp_list)
539 paiext_have_sample(event, cpump);
542 /* Called on schedule-in and schedule-out. No access to event structure,
543 * but for sampling only event NNPA_ALL is allowed.
545 static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
547 /* We started with a clean page on event installation. So read out
548 * results on schedule_out and if page was dirty, save old values.
551 paiext_have_samples();
554 /* Attribute definitions for pai extension1 interface. As with other CPU
555 * Measurement Facilities, there is one attribute per mapped counter.
556 * The number of mapped counters may vary per machine generation. Use
557 * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction
558 * to determine the number of mapped counters. The instruction returns
559 * a positive number, which is the highest number of supported counters.
560 * All counters less than this number are also supported, there are no
561 * holes. A returned number of zero means no support for mapped counters.
563 * The identification of the counter is a unique number. The chosen range
564 * is 0x1800 + offset in mapped kernel page.
565 * All CPU Measurement Facility counter identifiers must be unique and
566 * the numbers from 0 to 496 are already used for the CPU Measurement
567 * Counter facility. Numbers 0x1000 to 0x103e are used for PAI cryptography
569 * Numbers 0xb0000, 0xbc000 and 0xbd000 are already
570 * used for the CPU Measurement Sampling facility.
572 PMU_FORMAT_ATTR(event, "config:0-63");
574 static struct attribute *paiext_format_attr[] = {
575 &format_attr_event.attr,
579 static struct attribute_group paiext_events_group = {
581 .attrs = NULL, /* Filled in attr_event_init() */
584 static struct attribute_group paiext_format_group = {
586 .attrs = paiext_format_attr,
589 static const struct attribute_group *paiext_attr_groups[] = {
590 &paiext_events_group,
591 &paiext_format_group,
595 /* Performance monitoring unit for mapped counters */
596 static struct pmu paiext = {
597 .task_ctx_nr = perf_hw_context,
598 .event_init = paiext_event_init,
601 .start = paiext_start,
604 .sched_task = paiext_sched_task,
605 .attr_groups = paiext_attr_groups,
608 /* List of symbolic PAI extension 1 NNPA counter names. */
609 static const char * const paiext_ctrnames[] = {
619 [9] = "NNPA_IBM_RESERVED_9",
622 [12] = "NNPA_SIGMOID",
623 [13] = "NNPA_SOFTMAX",
624 [14] = "NNPA_BATCHNORM",
625 [15] = "NNPA_MAXPOOL2D",
626 [16] = "NNPA_AVGPOOL2D",
627 [17] = "NNPA_LSTMACT",
628 [18] = "NNPA_GRUACT",
629 [19] = "NNPA_CONVOLUTION",
630 [20] = "NNPA_MATMUL_OP",
631 [21] = "NNPA_MATMUL_OP_BCAST23",
632 [22] = "NNPA_SMALLBATCH",
633 [23] = "NNPA_LARGEDIM",
634 [24] = "NNPA_SMALLTENSOR",
635 [25] = "NNPA_1MFRAME",
636 [26] = "NNPA_2GFRAME",
637 [27] = "NNPA_ACCESSEXCEPT",
638 [28] = "NNPA_TRANSFORM",
640 [30] = "NNPA_MOMENTS",
641 [31] = "NNPA_LAYERNORM",
642 [32] = "NNPA_MATMUL_OP_BCAST1",
644 [34] = "NNPA_INVSQRT",
646 [36] = "NNPA_REDUCE",
649 static void __init attr_event_free(struct attribute **attrs, int num)
651 struct perf_pmu_events_attr *pa;
652 struct device_attribute *dap;
655 for (i = 0; i < num; i++) {
656 dap = container_of(attrs[i], struct device_attribute, attr);
657 pa = container_of(dap, struct perf_pmu_events_attr, attr);
663 static int __init attr_event_init_one(struct attribute **attrs, int num)
665 struct perf_pmu_events_attr *pa;
667 /* Index larger than array_size, no counter name available */
668 if (num >= ARRAY_SIZE(paiext_ctrnames)) {
673 pa = kzalloc(sizeof(*pa), GFP_KERNEL);
677 sysfs_attr_init(&pa->attr.attr);
678 pa->id = PAI_NNPA_BASE + num;
679 pa->attr.attr.name = paiext_ctrnames[num];
680 pa->attr.attr.mode = 0444;
681 pa->attr.show = cpumf_events_sysfs_show;
682 pa->attr.store = NULL;
683 attrs[num] = &pa->attr.attr;
687 /* Create PMU sysfs event attributes on the fly. */
688 static int __init attr_event_init(void)
690 struct attribute **attrs;
693 attrs = kmalloc_array(paiext_cnt + 2, sizeof(*attrs), GFP_KERNEL);
696 for (i = 0; i <= paiext_cnt; i++) {
697 ret = attr_event_init_one(attrs, i);
699 attr_event_free(attrs, i);
704 paiext_events_group.attrs = attrs;
708 static int __init paiext_init(void)
710 struct qpaci_info_block ib;
713 if (!test_facility(197))
717 paiext_cnt = ib.num_nnpa;
718 if (paiext_cnt >= PAI_NNPA_MAXCTR)
719 paiext_cnt = PAI_NNPA_MAXCTR;
723 rc = attr_event_init();
725 pr_err("Creation of PMU " KMSG_COMPONENT " /sysfs failed\n");
729 /* Setup s390dbf facility */
730 paiext_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128);
732 pr_err("Registration of s390dbf " KMSG_COMPONENT " failed\n");
736 debug_register_view(paiext_dbg, &debug_sprintf_view);
738 rc = perf_pmu_register(&paiext, KMSG_COMPONENT, -1);
740 pr_err("Registration of " KMSG_COMPONENT " PMU failed with "
748 debug_unregister_view(paiext_dbg, &debug_sprintf_view);
749 debug_unregister(paiext_dbg);
751 attr_event_free(paiext_events_group.attrs,
752 ARRAY_SIZE(paiext_ctrnames) + 1);
756 device_initcall(paiext_init);