2 * trace event based perf event profiling/tracing
4 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra
8 #include <linux/module.h>
9 #include <linux/kprobes.h>
11 #include "trace_probe.h"
/*
 * Per-CPU scratch buffers, one slot per recursion context. Slots are indexed
 * by the value perf_swevent_get_recursion_context() returns, populated with
 * alloc_percpu() in perf_trace_event_reg() and released with free_percpu()
 * once total_ref_count drops to zero in perf_trace_event_unreg().
 */
13 static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
16 * Force it to be aligned to unsigned long to avoid misaligned accesses
/*
 * NOTE(review): the declarator of this typedef is not part of this listing;
 * presumably it is perf_trace_t, the type handed to alloc_percpu() when the
 * buffers are created -- confirm against the full file.
 */
19 typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
22 /* Count the events in use (per event id, not per instance) */
/*
 * Tested for zero before the buffers are allocated and pre-decremented on
 * unregister. NOTE(review): the increment is not visible in this listing.
 */
23 static int total_ref_count;
/*
 * perf_trace_event_perm - decide whether @p_event may attach to @tp_event.
 *
 * Checks visible in this listing, in order:
 *   1. the optional @tp_event->perf_perm hook is invoked when set;
 *   2. for ftrace function events: the perf_paranoid_tracepoint_raw() plus
 *      CAP_SYS_ADMIN gate, then is_sampling_event(), then
 *      attr.exclude_callchain_user, then PERF_SAMPLE_STACK_USER;
 *   3. whether PERF_SAMPLE_RAW is requested at all;
 *   4. task-attached events (attach_state == PERF_ATTACH_TASK) on tracepoints
 *      flagged TRACE_EVENT_FL_CAP_ANY;
 *   5. a final perf_paranoid_tracepoint_raw() plus CAP_SYS_ADMIN gate.
 *
 * Returns an int status.
 * NOTE(review): the statement taken when each condition holds (the actual
 * return values) is absent from this listing; the surviving comments suggest
 * which branches allow and which deny -- confirm against the full file.
 */
25 static int perf_trace_event_perm(struct trace_event_call *tp_event,
26 struct perf_event *p_event)
28 if (tp_event->perf_perm) {
29 int ret = tp_event->perf_perm(tp_event, p_event);
35 * We checked and allowed to create parent,
36 * allow children without checking.
42 * It's ok to check current process (owner) permissions in here,
43 * because code below is called only via perf_event_open syscall.
46 /* The ftrace function trace is allowed only for root. */
47 if (ftrace_event_is_function(tp_event)) {
48 if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
51 if (!is_sampling_event(p_event))
55 * We don't allow user space callchains for function trace
56 * event, due to issues with page faults while tracing page
57 * fault handler and its overall trickiness nature.
59 if (!p_event->attr.exclude_callchain_user)
63 * Same reason to disable user stack dump as for user space
66 if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER)
70 /* No tracing, just counting, so no obvious leak */
71 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
74 /* Some events are ok to be traced by non-root users... */
75 if (p_event->attach_state == PERF_ATTACH_TASK) {
76 if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
81 * ...otherwise raw tracepoint data can be a severe data leak,
82 * only allow root to have these.
84 if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
/*
 * perf_trace_event_reg - bind @p_event to @tp_event and set up shared state.
 *
 * Records @tp_event in @p_event->tp_event and bumps @tp_event->perf_refcount.
 * The visible set-up work is: allocate and initialise a per-CPU hlist head
 * for @tp_event->perf_events, allocate the PERF_NR_CONTEXTS per-CPU trace
 * buffers while total_ref_count is still zero, then ask the event class to
 * register through TRACE_REG_PERF_REGISTER.
 *
 * NOTE(review): several short statements (allocation checks, gotos, labels
 * and return statements) are absent from this listing. The post-increment
 * test on perf_refcount presumably short-circuits for events already in use,
 * and the free_percpu() calls at the bottom presumably form the error
 * unwind -- confirm against the full file.
 */
90 static int perf_trace_event_reg(struct trace_event_call *tp_event,
91 struct perf_event *p_event)
93 struct hlist_head __percpu *list;
97 p_event->tp_event = tp_event;
98 if (tp_event->perf_refcount++ > 0)
/* One hlist head per possible CPU; perf_trace_add() enqueues events on it. */
101 list = alloc_percpu(struct hlist_head);
105 for_each_possible_cpu(cpu)
106 INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
108 tp_event->perf_events = list;
/* The trace buffers are global, so only create them when nothing is in use. */
110 if (!total_ref_count) {
114 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
115 buf = (char __percpu *)alloc_percpu(perf_trace_t);
119 perf_trace_buf[i] = buf;
123 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
/* Release the global buffers again if no event holds a reference to them. */
131 if (!total_ref_count) {
134 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
135 free_percpu(perf_trace_buf[i]);
136 perf_trace_buf[i] = NULL;
/* Drop the reference taken above; the last one also frees the hlist heads. */
140 if (!--tp_event->perf_refcount) {
141 free_percpu(tp_event->perf_events);
142 tp_event->perf_events = NULL;
/*
 * perf_trace_event_unreg - drop one perf reference on @p_event->tp_event.
 *
 * Decrements perf_refcount. The teardown visible below sends
 * TRACE_REG_PERF_UNREGISTER to the event class, waits in
 * tracepoint_synchronize_unregister(), frees the per-CPU perf_events list,
 * frees all perf_trace_buf slots when total_ref_count reaches zero, and
 * finally calls module_put() on the owning module.
 *
 * NOTE(review): the statement following the refcount test is missing from
 * this listing; presumably it skips the teardown while other users remain.
 * Whether that path still reaches module_put() cannot be told from here.
 */
148 static void perf_trace_event_unreg(struct perf_event *p_event)
150 struct trace_event_call *tp_event = p_event->tp_event;
153 if (--tp_event->perf_refcount > 0)
156 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);
159 * Ensure our callback won't be called anymore. The buffers
160 * will be freed after that.
162 tracepoint_synchronize_unregister();
164 free_percpu(tp_event->perf_events);
165 tp_event->perf_events = NULL;
167 if (!--total_ref_count) {
168 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
169 free_percpu(perf_trace_buf[i]);
170 perf_trace_buf[i] = NULL;
174 module_put(tp_event->mod);
/*
 * perf_trace_event_open - forward TRACE_REG_PERF_OPEN to the event class.
 *
 * Passes @p_event as the data argument of the class reg() callback and
 * returns whatever that callback returns.
 */
177 static int perf_trace_event_open(struct perf_event *p_event)
179 struct trace_event_call *tp_event = p_event->tp_event;
180 return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
/*
 * perf_trace_event_close - forward TRACE_REG_PERF_CLOSE to the event class.
 *
 * Counterpart of perf_trace_event_open(); the result of the class reg()
 * callback is discarded.
 */
183 static void perf_trace_event_close(struct perf_event *p_event)
185 struct trace_event_call *tp_event = p_event->tp_event;
186 tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
/*
 * perf_trace_event_init - run the permission, register and open steps.
 *
 * Calls perf_trace_event_perm(), perf_trace_event_reg() and
 * perf_trace_event_open() in that order, storing each result in ret.
 *
 * NOTE(review): the checks on ret between the calls are not in this listing.
 * The perf_trace_event_unreg() call after the open step presumably undoes
 * the registration when open fails -- confirm against the full file.
 */
189 static int perf_trace_event_init(struct trace_event_call *tp_event,
190 struct perf_event *p_event)
194 ret = perf_trace_event_perm(tp_event, p_event);
198 ret = perf_trace_event_reg(tp_event, p_event);
202 ret = perf_trace_event_open(p_event);
204 perf_trace_event_unreg(p_event);
/*
 * perf_trace_init - attach @p_event to the trace event named by attr.config.
 *
 * With event_mutex held, walks ftrace_events looking for an entry whose
 * event.type equals @p_event->attr.config, that has a class with a reg()
 * callback, and whose module reference can be taken with try_module_get().
 * The match is handed to perf_trace_event_init().
 *
 * NOTE(review): the condition guarding module_put() and the final return
 * are missing from this listing; presumably the module reference is dropped
 * only when initialisation fails -- confirm against the full file.
 */
211 int perf_trace_init(struct perf_event *p_event)
213 struct trace_event_call *tp_event;
214 u64 event_id = p_event->attr.config;
217 mutex_lock(&event_mutex);
218 list_for_each_entry(tp_event, &ftrace_events, list) {
219 if (tp_event->event.type == event_id &&
220 tp_event->class && tp_event->class->reg &&
221 try_module_get(tp_event->mod)) {
222 ret = perf_trace_event_init(tp_event, p_event);
224 module_put(tp_event->mod);
228 mutex_unlock(&event_mutex);
/*
 * perf_trace_destroy - tear down what perf_trace_init() set up.
 *
 * Holds event_mutex while closing and then unregistering @p_event.
 */
233 void perf_trace_destroy(struct perf_event *p_event)
235 mutex_lock(&event_mutex);
236 perf_trace_event_close(p_event);
237 perf_trace_event_unreg(p_event);
238 mutex_unlock(&event_mutex);
241 #ifdef CONFIG_KPROBE_EVENTS
/*
 * perf_kprobe_init - create a local kprobe trace event and attach @p_event.
 *
 * When attr.kprobe_func is set, the symbol name is copied from user space
 * into a KSYM_NAME_LEN buffer obtained from kzalloc(). The probe is created
 * with create_local_trace_kprobe() from that name, attr.kprobe_addr,
 * attr.probe_offset and @is_retprobe, then passed to perf_trace_event_init().
 * No event_mutex is taken in the visible lines.
 *
 * NOTE(review): the error checks, the cleanup label and the kfree() of the
 * name buffer are not in this listing. In particular no test for
 * strncpy_from_user() filling the whole buffer (an unterminated, truncated
 * name) is visible -- confirm the full file rejects that case. The
 * destroy_local_trace_kprobe() call presumably runs only when
 * perf_trace_event_init() fails.
 */
242 int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
246 struct trace_event_call *tp_event;
248 if (p_event->attr.kprobe_func) {
249 func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL);
252 ret = strncpy_from_user(
253 func, u64_to_user_ptr(p_event->attr.kprobe_func),
258 if (func[0] == '\0') {
264 tp_event = create_local_trace_kprobe(
265 func, (void *)(unsigned long)(p_event->attr.kprobe_addr),
266 p_event->attr.probe_offset, is_retprobe);
267 if (IS_ERR(tp_event)) {
268 ret = PTR_ERR(tp_event);
272 ret = perf_trace_event_init(tp_event, p_event);
274 destroy_local_trace_kprobe(tp_event);
/*
 * perf_kprobe_destroy - undo perf_kprobe_init().
 *
 * Closes and unregisters @p_event, then destroys the local kprobe trace
 * event it was attached to. No event_mutex is taken in the visible lines.
 */
280 void perf_kprobe_destroy(struct perf_event *p_event)
282 perf_trace_event_close(p_event);
283 perf_trace_event_unreg(p_event);
285 destroy_local_trace_kprobe(p_event->tp_event);
287 #endif /* CONFIG_KPROBE_EVENTS */
289 #ifdef CONFIG_UPROBE_EVENTS
/*
 * perf_uprobe_init - create a local uprobe trace event and attach @p_event.
 *
 * Requires attr.uprobe_path to be set. The path is copied from user space
 * into a PATH_MAX buffer obtained from kzalloc(), the probe is created with
 * create_local_trace_uprobe() from the path, attr.probe_offset and
 * @is_retprobe, and perf_trace_event_init() is then run under event_mutex.
 *
 * NOTE(review): the error checks, the cleanup label and the kfree() of the
 * path buffer are not in this listing. No test for strncpy_from_user()
 * filling the whole buffer (a truncated path) is visible -- confirm the full
 * file rejects that case. The destroy_local_trace_uprobe() call presumably
 * runs only when perf_trace_event_init() fails.
 */
290 int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe)
294 struct trace_event_call *tp_event;
296 if (!p_event->attr.uprobe_path)
298 path = kzalloc(PATH_MAX, GFP_KERNEL);
301 ret = strncpy_from_user(
302 path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX);
305 if (path[0] == '\0') {
310 tp_event = create_local_trace_uprobe(
311 path, p_event->attr.probe_offset, is_retprobe);
312 if (IS_ERR(tp_event)) {
313 ret = PTR_ERR(tp_event);
318 * local trace_uprobe need to hold event_mutex to call
319 * uprobe_buffer_enable() and uprobe_buffer_disable().
320 * event_mutex is not required for local trace_kprobes.
322 mutex_lock(&event_mutex);
323 ret = perf_trace_event_init(tp_event, p_event);
325 destroy_local_trace_uprobe(tp_event);
326 mutex_unlock(&event_mutex);
/*
 * perf_uprobe_destroy - undo perf_uprobe_init().
 *
 * Closes and unregisters @p_event while holding event_mutex, then destroys
 * the local uprobe trace event after the mutex has been released.
 */
332 void perf_uprobe_destroy(struct perf_event *p_event)
334 mutex_lock(&event_mutex);
335 perf_trace_event_close(p_event);
336 perf_trace_event_unreg(p_event);
337 mutex_unlock(&event_mutex);
338 destroy_local_trace_uprobe(p_event->tp_event);
340 #endif /* CONFIG_UPROBE_EVENTS */
/*
 * perf_trace_add - schedule @p_event in on the current CPU.
 *
 * Marks the event PERF_HES_STOPPED unless @flags contains PERF_EF_START,
 * then offers TRACE_REG_PERF_ADD to the event class. When the class callback
 * returns false, the event is linked with hlist_add_head_rcu() onto this
 * CPU's entry of @tp_event->perf_events.
 *
 * NOTE(review): the statement executed when the WARN_ON_ONCE() on a missing
 * per-CPU list fires, and the final return value, are not in this listing.
 */
342 int perf_trace_add(struct perf_event *p_event, int flags)
344 struct trace_event_call *tp_event = p_event->tp_event;
346 if (!(flags & PERF_EF_START))
347 p_event->hw.state = PERF_HES_STOPPED;
350 * If TRACE_REG_PERF_ADD returns false; no custom action was performed
351 * and we need to take the default action of enqueueing our event on
352 * the right per-cpu hlist.
354 if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
355 struct hlist_head __percpu *pcpu_list;
356 struct hlist_head *list;
358 pcpu_list = tp_event->perf_events;
359 if (WARN_ON_ONCE(!pcpu_list))
362 list = this_cpu_ptr(pcpu_list);
363 hlist_add_head_rcu(&p_event->hlist_entry, list);
/*
 * perf_trace_del - schedule @p_event out.
 *
 * Offers TRACE_REG_PERF_DEL to the event class; when the callback returns
 * false the event is unlinked from its hlist with hlist_del_rcu().
 * @flags is not used in the visible lines.
 */
369 void perf_trace_del(struct perf_event *p_event, int flags)
371 struct trace_event_call *tp_event = p_event->tp_event;
374 * If TRACE_REG_PERF_DEL returns false; no custom action was performed
375 * and we need to take the default action of dequeueing our event from
376 * the right per-cpu hlist.
378 if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
379 hlist_del_rcu(&p_event->hlist_entry);
/*
 * perf_trace_buf_alloc - hand out this CPU's trace buffer for one record.
 *
 * @size:  record size in bytes; a one-time warning is raised when it exceeds
 *         PERF_MAX_TRACE_SIZE.
 * @regs:  receives this CPU's __perf_regs slot for the recursion context.
 * @rctxp: receives the value of perf_swevent_get_recursion_context().
 *
 * The buffer is perf_trace_buf[rctx] for the current CPU, and its last
 * sizeof(u64) bytes within @size are zeroed before use.
 *
 * NOTE(review): the return statements, the check of the recursion context
 * and any guard around the *regs store are not in this listing. A guard
 * presumably exists, because perf_ftrace_function_call() passes NULL for
 * @regs -- confirm against the full file.
 */
382 void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
387 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
389 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
390 "perf buffer not large enough"))
393 *rctxp = rctx = perf_swevent_get_recursion_context();
398 *regs = this_cpu_ptr(&__perf_regs[rctx]);
399 raw_data = this_cpu_ptr(perf_trace_buf[rctx]);
401 /* zero the dead bytes from align to not leak stack to user */
402 memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
405 EXPORT_SYMBOL_GPL(perf_trace_buf_alloc);
406 NOKPROBE_SYMBOL(perf_trace_buf_alloc);
/*
 * perf_trace_buf_update - fill in the common trace_entry header of @record.
 *
 * Samples preempt_count() and the saved IRQ flags and passes both to
 * tracing_generic_entry_update().
 *
 * NOTE(review): no use of @type appears in this listing; presumably it is
 * stored in the entry on a line that was dropped -- confirm.
 */
408 void perf_trace_buf_update(void *record, u16 type)
410 struct trace_entry *entry = record;
411 int pc = preempt_count();
414 local_save_flags(flags);
415 tracing_generic_entry_update(entry, flags, pc);
418 NOKPROBE_SYMBOL(perf_trace_buf_update);
420 #ifdef CONFIG_FUNCTION_TRACER
/*
 * perf_ftrace_function_call - ftrace callback that emits a perf sample.
 *
 * @ip, @parent_ip: traced function address and its caller.
 * @ops:            the ftrace_ops embedded in the owning perf_event.
 * @pt_regs:        not used in the visible lines; a local register snapshot
 *                  is captured with perf_fetch_caller_regs() instead.
 *
 * The callback compares @ops->private with smp_processor_id(); private holds
 * the CPU the event was added on (see perf_ftrace_event_register()). It then
 * builds a one-element hlist around the event, grabs a trace buffer of
 * ENTRY_SIZE bytes and submits it as a TRACE_FN record.
 *
 * NOTE(review): the return type line, the regs and rctx declarations, the
 * early-exit statements and the store of @ip into the entry are absent from
 * this listing -- confirm against the full file.
 */
422 perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
423 struct ftrace_ops *ops, struct pt_regs *pt_regs)
425 struct ftrace_entry *entry;
426 struct perf_event *event;
427 struct hlist_head head;
431 if ((unsigned long)ops->private != smp_processor_id())
434 event = container_of(ops, struct perf_event, ftrace_ops);
437 * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all
438 * the perf code does is hlist_for_each_entry_rcu(), so we can
439 * get away with simply setting the @head.first pointer in order
440 * to create a singular list.
442 head.first = &event->hlist_entry;
444 #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
445 sizeof(u64)) - sizeof(u32))
447 BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);
/* Start from a zeroed snapshot so unset register fields are not stale. */
449 memset(&regs, 0, sizeof(regs));
450 perf_fetch_caller_regs(&regs);
452 entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx);
457 entry->parent_ip = parent_ip;
458 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
459 1, &regs, &head, NULL);
/*
 * perf_ftrace_function_register - hook @event into the function tracer.
 *
 * Initialises the ftrace_ops embedded in @event with the FTRACE_OPS_FL_RCU
 * flag and perf_ftrace_function_call() as callback, and returns the result
 * of register_ftrace_function(). private starts out as nr_cpu_ids, the same
 * value TRACE_REG_PERF_DEL restores; presumably it never equals a valid
 * smp_processor_id(), which keeps the callback inactive until the event is
 * added on a CPU.
 */
464 static int perf_ftrace_function_register(struct perf_event *event)
466 struct ftrace_ops *ops = &event->ftrace_ops;
468 ops->flags = FTRACE_OPS_FL_RCU;
469 ops->func = perf_ftrace_function_call;
470 ops->private = (void *)(unsigned long)nr_cpu_ids;
472 return register_ftrace_function(ops);
/*
 * perf_ftrace_function_unregister - detach @event from the function tracer.
 *
 * Unregisters the embedded ftrace_ops, then releases its filter with
 * ftrace_free_filter().
 * NOTE(review): the return statement is not in this listing; presumably the
 * unregister result held in ret is returned.
 */
475 static int perf_ftrace_function_unregister(struct perf_event *event)
477 struct ftrace_ops *ops = &event->ftrace_ops;
478 int ret = unregister_ftrace_function(ops);
479 ftrace_free_filter(ops);
/*
 * perf_ftrace_event_register - class reg() callback for the function event.
 *
 * @call: the trace event being operated on (unused in the visible lines).
 * @type: which trace_reg operation is requested.
 * @data: the perf_event for the PERF_OPEN, CLOSE, ADD and DEL operations.
 *
 * Visible handling per operation:
 *   TRACE_REG_PERF_OPEN   registers the ftrace callback for the event;
 *   TRACE_REG_PERF_CLOSE  unregisters it;
 *   TRACE_REG_PERF_ADD    stores the current CPU id in ftrace_ops.private;
 *   TRACE_REG_PERF_DEL    resets ftrace_ops.private to nr_cpu_ids.
 *
 * NOTE(review): the switch statement itself and the statements ending the
 * REGISTER, UNREGISTER, ADD and DEL cases (return values) are absent from
 * this listing -- confirm against the full file.
 */
483 int perf_ftrace_event_register(struct trace_event_call *call,
484 enum trace_reg type, void *data)
486 struct perf_event *event = data;
489 case TRACE_REG_REGISTER:
490 case TRACE_REG_UNREGISTER:
492 case TRACE_REG_PERF_REGISTER:
493 case TRACE_REG_PERF_UNREGISTER:
495 case TRACE_REG_PERF_OPEN:
496 return perf_ftrace_function_register(data);
497 case TRACE_REG_PERF_CLOSE:
498 return perf_ftrace_function_unregister(data);
499 case TRACE_REG_PERF_ADD:
500 event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id();
502 case TRACE_REG_PERF_DEL:
503 event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids;
509 #endif /* CONFIG_FUNCTION_TRACER */