/*
 * builtin-trace.c
 *
 * Builtin 'trace' command:
 *
 * Display a continuously updated trace of any workload, CPU, specific PID,
 * system wide, etc.  Default format is loosely strace like, but any other
 * event may be specified using --event.
 *
 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <[email protected]>
 *
 * Initially based on the 'trace' prototype by Thomas Gleixner:
 *
 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */
#include <traceevent/event-parse.h>
#include <api/fs/tracing_path.h>
#include "util/cgroup.h"
#include "util/color.h"
#include "util/debug.h"
#include "util/event.h"
#include "util/evlist.h"
#include <subcmd/exec-cmd.h>
#include "util/machine.h"
#include "util/path.h"
#include "util/session.h"
#include "util/thread.h"
#include <subcmd/parse-options.h>
#include "util/strlist.h"
#include "util/intlist.h"
#include "util/thread_map.h"
#include "util/stat.h"
#include "trace/beauty/beauty.h"
#include "trace-event.h"
#include "util/parse-events.h"
#include "util/bpf-loader.h"
#include "callchain.h"
#include "print_binary.h"
#include "syscalltbl.h"
#include "rb_resort.h"

#include <linux/err.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/random.h>
#include <linux/stringify.h>
#include <linux/time64.h>

#include "sane_ctype.h"
#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif

#ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE	1024
#endif
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;
		struct syscall  *table;
		struct {
			struct perf_evsel *sys_enter,
					  *sys_exit,
					  *augmented;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	struct cgroup		*cgroup;
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;
	struct {
		size_t		nr;
		int		*entries;
	}			ev_qualifier_ids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;
	bool			full_time;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			failure_only;
	bool			show_comm;
	bool			print_sample;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			vfs_getname;
	int			trace_pgfaults;
};
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
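/*
 * For reference, TP_UINT_FIELD(32) expands to tp_field__u32(), which
 * memcpy()s a u32 out of the raw sample payload at the field's offset,
 * while TP_UINT_FIELD__SWAPPED(32) does the same plus a bswap_32() for
 * samples recorded with the opposite endianness.
 */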
static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
{
	field->offset = offset;

	switch (size) {
	case 1:
		field->integer = tp_field__u8;
		break;
	case 2:
		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
		break;
	case 4:
		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
		break;
	case 8:
		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
		break;
	default:
		return -1;
	}

	return 0;
}

static int tp_field__init_uint(struct tp_field *field, struct format_field *format_field, bool needs_swap)
{
	return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
}
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

static int __tp_field__init_ptr(struct tp_field *field, int offset)
{
	field->offset = offset;
	field->pointer = tp_field__ptr;
	return 0;
}

static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	return __tp_field__init_ptr(field, format_field->offset);
}
struct syscall_tp {
	struct tp_field id;
	struct tp_field args, ret;
};
static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
					  struct tp_field *field,
					  const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_uint(field, format_field, evsel->needs_swap);
}

#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
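/*
 * The statement expression above evaluates to the return value of
 * perf_evsel__init_tp_uint_field(), so e.g.:
 *
 *	perf_evsel__init_sc_tp_uint_field(evsel, id);
 *
 * initializes sc->id from the tracepoint format field named "id",
 * obtained by stringifying the 'name' macro parameter.
 */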
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_ptr(field, format_field);
}

#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel)
{
	struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));

	if (evsel->priv != NULL) {
		if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr"))
			return -1;

		return 0;
	}

	return -ENOMEM;
}
static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel)
{
	struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));

	if (evsel->priv != NULL) {	/* field, sizeof_field, offsetof_field */
		if (__tp_field__init_uint(&sc->id, sizeof(long), sizeof(long long), evsel->needs_swap))
			return -1;

		return 0;
	}

	return -ENOMEM;
}
static int perf_evsel__init_augmented_syscall_tp_args(struct perf_evsel *evsel)
{
	struct syscall_tp *sc = evsel->priv;

	return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
}
static int perf_evsel__init_raw_syscall_tp(struct perf_evsel *evsel, void *handler)
{
	evsel->priv = malloc(sizeof(struct syscall_tp));
	if (evsel->priv != NULL) {
		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
			goto out_delete;

		evsel->handler = handler;
		return 0;
	}

	return -ENOMEM;

out_delete:
	zfree(&evsel->priv);
	return -ENOENT;
}
static struct perf_evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_raw_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	perf_evsel__delete_priv(evsel);
	return NULL;
}
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
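/*
 * Usage sketch: with a "raw_syscalls:sys_enter" evsel whose ->priv was
 * set up by perf_evsel__init_raw_syscall_tp(), the syscall number and
 * argument block of a sample are read with:
 *
 *	int id     = perf_evsel__sc_tp_uint(evsel, id, sample);
 *	void *args = perf_evsel__sc_tp_ptr(evsel, args, sample);
 *
 * as done in trace__sys_enter() below.
 */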
size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
{
	int idx = val - sa->offset;

	if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL)
		return scnprintf(bf, size, intfmt, val);

	return scnprintf(bf, size, "%s", sa->entries[idx]);
}
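/*
 * Example: with the "itimers" strarray below ("REAL", "VIRTUAL",
 * "PROF"), val == 1 prints "VIRTUAL", while an out of range or
 * unnamed val falls back to the integer format in 'intfmt' ("%d"
 * for plain strarray arguments).
 */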
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
					        struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}

static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
					struct syscall_arg *arg)
{
	struct strarrays *sas = arg->parm;
	int i;

	for (i = 0; i < sas->nr_entries; ++i) {
		struct strarray *sa = sas->entries[i];
		int idx = arg->val - sa->offset;

		if (idx >= 0 && idx < sa->nr_entries) {
			if (sa->entries[idx] == NULL)
				break;
			return scnprintf(bf, size, "%s", sa->entries[idx]);
		}
	}

	return scnprintf(bf, size, "%d", arg->val);
}

#define SCA_STRARRAYS syscall_arg__scnprintf_strarrays
#ifndef AT_FDCWD
#define AT_FDCWD	-100
#endif

static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
					   struct syscall_arg *arg)
{
	int fd = arg->val;

	if (fd == AT_FDCWD)
		return scnprintf(bf, size, "CWD");

	return syscall_arg__scnprintf_fd(bf, size, arg);
}

#define SCA_FDAT syscall_arg__scnprintf_fd_at
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
};
static DEFINE_STRARRAY(fcntl_cmds);

static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};
static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);
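/*
 * fcntl commands come in two blocks: the classic ones starting at 0 and
 * the Linux specific ones starting at F_LINUX_SPECIFIC_BASE (1024), so
 * the lookup goes through a 'struct strarrays' that tries each range in
 * turn, see syscall_arg__scnprintf_strarrays() above.
 */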
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename

static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						     struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
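/*
 * E.g. STRARRAY(cmd, bpf_cmd) expands to:
 *
 *	{ .scnprintf = SCA_STRARRAY, .parm = &strarray__bpf_cmd, }
 *
 * i.e. a struct syscall_arg_fmt initializer that pretty prints the
 * argument using the bpf_cmd string table; the 'name' parameter is
 * purely documentary, it does not appear in the expansion.
 */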
#include "trace/beauty/arch_errno_names.c"
#include "trace/beauty/eventfd.c"
#include "trace/beauty/futex_op.c"
#include "trace/beauty/futex_val3.c"
#include "trace/beauty/mmap.c"
#include "trace/beauty/mode_t.c"
#include "trace/beauty/msg_flags.c"
#include "trace/beauty/open_flags.c"
#include "trace/beauty/perf_event_open.c"
#include "trace/beauty/pid.c"
#include "trace/beauty/sched_policy.c"
#include "trace/beauty/seccomp.c"
#include "trace/beauty/signum.c"
#include "trace/beauty/socket_type.c"
#include "trace/beauty/waitid_options.c"
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
static struct syscall_fmt {
	const char *name;
	const char *alias;
	struct syscall_arg_fmt arg[6];
	u8	   nr_args;
	bool	   errpid;
	bool	   timeout;
	bool	   hexret;
} syscall_fmts[] = {
	{ .name	    = "access",
	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,	/* mode */ }, }, },
	{ .name	    = "bpf",
	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
	{ .name	    = "brk",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
	{ .name     = "clock_gettime",
	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
640 { .name = "clone", .errpid = true, .nr_args = 5,
641 .arg = { [0] = { .name = "flags", .scnprintf = SCA_CLONE_FLAGS, },
642 [1] = { .name = "child_stack", .scnprintf = SCA_HEX, },
643 [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
644 [3] = { .name = "child_tidptr", .scnprintf = SCA_HEX, },
645 [4] = { .name = "tls", .scnprintf = SCA_HEX, }, }, },
647 .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
648 { .name = "epoll_ctl",
649 .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
650 { .name = "eventfd2",
651 .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
652 { .name = "fchmodat",
653 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
654 { .name = "fchownat",
655 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
657 .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
658 .parm = &strarrays__fcntl_cmds_arrays,
659 .show_zero = true, },
660 [2] = { .scnprintf = SCA_FCNTL_ARG, /* arg */ }, }, },
662 .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
663 { .name = "fstat", .alias = "newfstat", },
664 { .name = "fstatat", .alias = "newfstatat", },
666 .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
667 [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
668 { .name = "futimesat",
669 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
670 { .name = "getitimer",
671 .arg = { [0] = STRARRAY(which, itimers), }, },
672 { .name = "getpid", .errpid = true, },
673 { .name = "getpgid", .errpid = true, },
674 { .name = "getppid", .errpid = true, },
675 { .name = "getrandom",
676 .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
677 { .name = "getrlimit",
678 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
679 { .name = "gettid", .errpid = true, },
	{ .name	    = "ioctl",
	  .arg = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#else
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#endif
691 { .name = "kcmp", .nr_args = 5,
692 .arg = { [0] = { .name = "pid1", .scnprintf = SCA_PID, },
693 [1] = { .name = "pid2", .scnprintf = SCA_PID, },
694 [2] = { .name = "type", .scnprintf = SCA_KCMP_TYPE, },
695 [3] = { .name = "idx1", .scnprintf = SCA_KCMP_IDX, },
696 [4] = { .name = "idx2", .scnprintf = SCA_KCMP_IDX, }, }, },
698 .arg = { [0] = STRARRAY(option, keyctl_options), }, },
700 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
702 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
704 .arg = { [2] = STRARRAY(whence, whences), }, },
705 { .name = "lstat", .alias = "newlstat", },
707 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
708 [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
710 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
712 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
714 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
715 { .name = "mlockall",
716 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
717 { .name = "mmap", .hexret = true,
718 /* The standard mmap maps to old_mmap on s390x */
719 #if defined(__s390x__)
722 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ },
723 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ },
724 [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
725 { .name = "mprotect",
726 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
727 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, }, },
728 { .name = "mq_unlink",
729 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
730 { .name = "mremap", .hexret = true,
731 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ },
732 [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
733 [4] = { .scnprintf = SCA_HEX, /* new_addr */ }, }, },
735 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
737 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
738 { .name = "name_to_handle_at",
739 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
740 { .name = "newfstatat",
741 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "open",
	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "open_by_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "openat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "perf_event_open",
	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
	{ .name	    = "pipe2",
	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
	{ .name	    = "pkey_alloc",
	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS, /* access_rights */ }, }, },
	{ .name	    = "pkey_free",
	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
	{ .name	    = "pkey_mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
	{ .name	    = "poll", .timeout = true, },
	{ .name	    = "ppoll", .timeout = true, },
766 { .name = "prctl", .alias = "arch_prctl",
767 .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
768 [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
769 [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
770 { .name = "pread", .alias = "pread64", },
771 { .name = "preadv", .alias = "pread", },
772 { .name = "prlimit64",
773 .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
774 { .name = "pwrite", .alias = "pwrite64", },
775 { .name = "readlinkat",
776 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
777 { .name = "recvfrom",
778 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
779 { .name = "recvmmsg",
780 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
782 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
783 { .name = "renameat",
784 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
785 { .name = "rt_sigaction",
786 .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
787 { .name = "rt_sigprocmask",
788 .arg = { [0] = STRARRAY(how, sighow), }, },
789 { .name = "rt_sigqueueinfo",
790 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
791 { .name = "rt_tgsigqueueinfo",
792 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
793 { .name = "sched_setscheduler",
794 .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
	{ .name	    = "seccomp",
	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
	{ .name	    = "select", .timeout = true, },
	{ .name	    = "sendmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendto",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "setrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "socket",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE,  /* type */ },
		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
	{ .name	    = "socketpair",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE,  /* type */ },
		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
	{ .name	    = "stat", .alias = "newstat", },
	{ .name	    = "statx",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ },
		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
	{ .name	    = "swapoff",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "swapon",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "symlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "tgkill",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "tkill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "uname", .alias = "newuname", },
	{ .name	    = "unlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "utimensat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
};
static int syscall_fmt__cmp(const void *name, const void *fmtp)
{
	const struct syscall_fmt *fmt = fmtp;
	return strcmp(name, fmt->name);
}

static struct syscall_fmt *syscall_fmt__find(const char *name)
{
	const int nmemb = ARRAY_SIZE(syscall_fmts);
	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
}
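/*
 * Note: bsearch() requires that syscall_fmts[] above stays sorted by
 * ->name, so new entries must be inserted in alphabetical order.
 */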
/*
 * is_exit: is this "exit" or "exit_group"?
 * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	bool		    is_exit;
	bool		    is_open;
	struct format_field *args;
	const char	    *name;
	struct syscall_fmt  *fmt;
	struct syscall_arg_fmt *arg_fmt;
};
/*
 * We need to have this 'calculated' boolean because in some cases we really
 * don't know what is the duration of a syscall, for instance, when we start
 * a session and some threads are waiting for a syscall to finish, say 'poll',
 * in which case all we can do is to print "( ? )" for the duration and for
 * the start timestamp.
 */
static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
{
	double duration = (double)t / NSEC_PER_MSEC;
	size_t printed = fprintf(fp, "(");

	if (!calculated)
		printed += fprintf(fp, "         ");
	else if (duration >= 1.0)
		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
	else if (duration >= 0.01)
		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
	else
		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);

	return printed + fprintf(fp, "): ");
}
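/*
 * Example: t = 2500000ns with calculated == true prints "( 2.500 ms): "
 * in red, since durations of 1ms or more are highlighted as slow;
 * with calculated == false only padding is emitted, keeping columns
 * aligned.
 */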
/**
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * ret_scnprintf: syscall args may set this to a different syscall return
 *                formatter, for instance, fcntl may return fds, file flags, etc.
 */
struct thread_trace {
	u64		  entry_time;
	bool		  entry_pending;
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;
	char		  *entry_str;
	double		  runtime_ms;
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;
		char	  **table;
	} paths;
	struct intlist *syscall_stats;
};
static struct thread_trace *thread_trace__new(void)
{
	struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));

	if (ttrace) {
		ttrace->paths.max = -1;
		ttrace->syscall_stats = intlist__new(NULL);
	}

	return ttrace;
}
static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
	struct thread_trace *ttrace;

	if (thread == NULL)
		goto fail;

	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());

	if (thread__priv(thread) == NULL)
		goto fail;

	ttrace = thread__priv(thread);
	++ttrace->nr_events;

	return ttrace;
fail:
	color_fprintf(fp, PERF_COLOR_RED,
		      "WARNING: not enough memory, dropping samples!\n");
	return NULL;
}
void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;
}
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)
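/*
 * These two bits are or'ed into trace->trace_pgfaults to select which
 * page fault software events get traced, see trace__record() and
 * perf_evsel__new_pgfault() below.
 */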
static const size_t trace__entry_str_size = 2048;
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;

		if (ttrace->paths.max != -1) {
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max   = fd;
	}

	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	if (ret < 0 || ret > st.st_size)
		return -1;

	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}
static const char *thread__fd_path(struct thread *thread, int fd,
				   struct trace *trace)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (ttrace == NULL || fd < 0)
		return NULL;

	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
		++trace->stats.proc_getname;
		if (thread__read_fd_path(thread, fd))
			return NULL;
	}

	return ttrace->paths.table[fd];
}
size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = scnprintf(bf, size, "%d", fd);
	const char *path = thread__fd_path(arg->thread, fd, arg->trace);

	if (path)
		printed += scnprintf(bf + printed, size - printed, "<%s>", path);

	return printed;
}
size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
{
	size_t printed = scnprintf(bf, size, "%d", fd);
	struct thread *thread = machine__find_thread(trace->host, pid, pid);

	if (thread) {
		const char *path = thread__fd_path(thread, fd, trace);

		if (path)
			printed += scnprintf(bf + printed, size - printed, "<%s>", path);

		thread__put(thread);
	}

	return printed;
}
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
	struct thread_trace *ttrace = thread__priv(arg->thread);

	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
		zfree(&ttrace->paths.table[fd]);

	return printed;
}
static void thread__set_filename_pos(struct thread *thread, const char *bf,
				     unsigned long ptr)
{
	struct thread_trace *ttrace = thread__priv(thread);

	ttrace->filename.ptr = ptr;
	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	unsigned long ptr = arg->val;

	if (!arg->trace->vfs_getname)
		return scnprintf(bf, size, "%#x", ptr);

	thread__set_filename_pos(arg->thread, bf, ptr);
	return 0;
}
static bool trace__filter_duration(struct trace *trace, double t)
{
	return t < (trace->duration_filter * NSEC_PER_MSEC);
}

static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;

	return fprintf(fp, "%10.3f ", ts);
}

/*
 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
 * using ttrace->entry_time for a thread that receives a sys_exit without
 * first having received a sys_enter ("poll" issued before tracing session
 * starts, lost sys_enter exit due to ring buffer overflow).
 */
static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	if (tstamp > 0)
		return __trace__fprintf_tstamp(trace, tstamp, fp);

	return fprintf(fp, "         ? ");
}
static bool done = false;
static bool interrupted = false;

static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
{
	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
	printed += fprintf_duration(duration, duration_calculated, fp);

	if (trace->multiple_threads) {
		if (trace->show_comm)
			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
		printed += fprintf(fp, "%d ", thread->tid);
	}

	return printed;
}
static int trace__process_event(struct trace *trace, struct machine *machine,
				union perf_event *event, struct perf_sample *sample)
{
	int ret = 0;

	switch (event->header.type) {
	case PERF_RECORD_LOST:
		color_fprintf(trace->output, PERF_COLOR_RED,
			      "LOST %" PRIu64 " events!\n", event->lost.lost);
		ret = machine__process_lost_event(machine, event, sample);
		break;
	default:
		ret = machine__process_event(machine, event, sample);
		break;
	}

	return ret;
}
static int trace__tool_process(struct perf_tool *tool,
			       union perf_event *event,
			       struct perf_sample *sample,
			       struct machine *machine)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	return trace__process_event(trace, machine, event, sample);
}
static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	if (machine->kptr_restrict_warned)
		return NULL;

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;
		return NULL;
	}

	return machine__resolve_kernel_addr(vmachine, addrp, modp);
}
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
	if (err < 0)
		goto out;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout, 1);
out:
	if (err)
		symbol__exit();

	return err;
}
static void trace__symbols__exit(struct trace *trace)
{
	machine__exit(trace->host);
	trace->host = NULL;

	symbol__exit();
}
static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
{
	int idx;

	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
		nr_args = sc->fmt->nr_args;

	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
	if (sc->arg_fmt == NULL)
		return -1;

	for (idx = 0; idx < nr_args; ++idx) {
		if (sc->fmt)
			sc->arg_fmt[idx] = sc->fmt->arg[idx];
	}

	sc->nr_args = nr_args;
	return 0;
}
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * Candidates found via, on a kernel with the tracefs
			 * syscall events at /sys/kernel/tracing/events/syscalls/sys_enter*:
			 *
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first variable, '__syscall_nr'
	 * or 'nr', which holds the syscall number and is not needed here.
	 * 'nr' is the name used on older kernels, where '__syscall_nr'
	 * does not exist.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
	sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");

	return syscall__set_arg_fmts(sc);
}
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
/*
 * args is to be interpreted as a series of longs but we need to handle
 * 8-byte unaligned accesses. args points to raw_data within the event
 * and raw_data is guaranteed to be 8-byte unaligned because it is
 * preceded by raw_size which is a u32. So we need to copy args to a temp
 * variable to read it. Most notably this avoids extended load instructions
 * on unaligned addresses.
 */
unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
{
	unsigned long val;
	unsigned char *p = arg->args + sizeof(unsigned long) * idx;

	memcpy(&val, p, sizeof(val));
	return val;
}
static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
				      struct syscall_arg *arg)
{
	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);

	return scnprintf(bf, size, "arg%d: ", arg->idx);
}

static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
				     struct syscall_arg *arg, unsigned long val)
{
	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
		arg->val = val;
		if (sc->arg_fmt[arg->idx].parm)
			arg->parm = sc->arg_fmt[arg->idx].parm;
		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
	}
	return scnprintf(bf, size, "%ld", val);
}
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and we
			 * don't have a string associated in an strarray for
			 * it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
 		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
static void thread__update_stats(struct thread_trace *ttrace,
				 int id, struct perf_sample *sample)
{
	struct int_node *inode;
	struct stats *stats;
	u64 duration = 0;

	inode = intlist__findnew(ttrace->syscall_stats, id);
	if (inode == NULL)
		return;

	stats = inode->priv;
	if (stats == NULL) {
		stats = malloc(sizeof(struct stats));
		if (stats == NULL)
			return;
		init_stats(stats);
		inode->priv = stats;
	}

	if (ttrace->entry_time && sample->time > ttrace->entry_time)
		duration = sample->time - ttrace->entry_time;

	update_stats(stats, duration);
}
static int trace__printf_interrupted_entry(struct trace *trace)
{
	struct thread_trace *ttrace;
	size_t printed;

	if (trace->failure_only || trace->current == NULL)
		return 0;

	ttrace = thread__priv(trace->current);

	if (!ttrace->entry_pending)
		return 0;

	printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
	ttrace->entry_pending = false;

	return printed;
}
static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
				 struct perf_sample *sample, struct thread *thread)
{
	int printed = 0;

	if (trace->print_sample) {
		double ts = (double)sample->time / NSEC_PER_MSEC;

		printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
				   perf_evsel__name(evsel), ts,
				   thread__comm_str(thread),
				   sample->pid, sample->tid, sample->cpu);
	}

	return printed;
}
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evsel,
				    struct perf_sample *sample)
{
	struct thread_trace *ttrace;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	char msg[1024];
	void *args;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	/*
	 * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
	 * and the rest of the beautifiers accessing it via struct syscall_arg touches it.
	 */
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
	syscall__scnprintf_args(sc, msg, sizeof(msg), args, trace, thread);
	fprintf(trace->output, "%s", msg);
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
				    struct perf_sample *sample,
				    struct callchain_cursor *cursor)
{
	struct addr_location al;
	int max_stack = evsel->attr.sample_max_stack ?
			evsel->attr.sample_max_stack :
			trace->max_stack;

	if (machine__resolve(trace->host, &al, sample) < 0 ||
	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
		return -1;

	return 0;
}
static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
{
	/* TODO: user-configurable print_opts */
	const unsigned int print_opts = EVSEL__PRINT_SYM |
				        EVSEL__PRINT_DSO |
				        EVSEL__PRINT_UNKNOWN_AS_ADDR;

	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
}
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	struct perf_env *env = perf_evsel__env(evsel);
	const char *arch_name = perf_env__arch(env);

	return arch_syscalls__strerrno(arch_name, err);
}
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only || (ret >= 0 && trace->failure_only))
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = errno_to_name(evsel, -ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
out_put:
	thread__put(thread);
	return 0;

out_dump:
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
	       evsel->name,
	       perf_evsel__strval(evsel, sample, "comm"),
	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
	       runtime,
	       perf_evsel__intval(evsel, sample, "vruntime"));
	goto out_put;
}
static int bpf_output__printer(enum binary_printer_ops op,
			       unsigned int val, void *extra __maybe_unused, FILE *fp)
{
	unsigned char ch = (unsigned char)val;

	switch (op) {
	case BINARY_PRINT_CHAR_DATA:
		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
	case BINARY_PRINT_DATA_BEGIN:
	case BINARY_PRINT_LINE_BEGIN:
	case BINARY_PRINT_ADDR:
	case BINARY_PRINT_NUM_DATA:
	case BINARY_PRINT_NUM_PAD:
	case BINARY_PRINT_SEP:
	case BINARY_PRINT_CHAR_PAD:
	case BINARY_PRINT_LINE_END:
	case BINARY_PRINT_DATA_END:
	default:
		break;
	}

	return 0;
}
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	binary__fprintf(sample->raw_data, sample->raw_size, 8,
			bpf_output__printer, NULL, trace->output);
}
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		if (evsel == trace->syscalls.events.augmented)
			trace__fprintf_sys_enter(trace, evsel, sample);
		else
			bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
		    trace__fprintf_sys_enter(trace, evsel, sample)) {
			event_format__fprintf(evsel->tp_format, sample->cpu,
					      sample->raw_data, sample->raw_size,
					      trace->output);
		}
	}

	fprintf(trace->output, "\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
static void print_location(FILE *f, struct perf_sample *sample,
			   struct addr_location *al,
			   bool print_dso, bool print_sym)
{

	if ((verbose > 0 || print_dso) && al->map)
		fprintf(f, "%s@", al->map->dso->long_name);

	if ((verbose > 0 || print_sym) && al->sym)
		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
			al->addr - al->sym->start);
	else if (al->map)
		fprintf(f, "0x%" PRIx64, al->addr);
	else
		fprintf(f, "0x%" PRIx64, sample->addr);
}
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	thread__find_symbol(thread, sample->cpumode, sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	thread__find_symbol(thread, sample->cpumode, sample->addr, &al);

	if (!al.map) {
		thread__find_symbol(thread, sample->cpumode, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
static void trace__set_base_time(struct trace *trace,
				 struct perf_evsel *evsel,
				 struct perf_sample *sample)
{
	/*
	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
	 * and don't use sample->time unconditionally, we may end up having
	 * some other event in the future without PERF_SAMPLE_TIME for good
	 * reason, i.e. we may not be interested in its timestamps, just in
	 * it taking place, picking some piece of information when it
	 * appears in our event stream (vfs_getname comes to mind).
	 */
	if (trace->base_time == 0 && !trace->full_time &&
	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
		trace->base_time = sample->time;
}
static int trace__process_sample(struct perf_tool *tool,
				 union perf_event *event,
				 struct perf_sample *sample,
				 struct perf_evsel *evsel,
				 struct machine *machine __maybe_unused)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	struct thread *thread;
	int err = 0;

	tracepoint_handler handler = evsel->handler;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	if (thread && thread__is_filtered(thread))
		goto out;

	trace__set_base_time(trace, evsel, sample);

	if (handler) {
		++trace->nr_events;
		handler(trace, evsel, event, sample);
	}
out:
	thread__put(thread);
	return err;
}
static int trace__record(struct trace *trace, int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	const char * const record_args[] = {
		"record",
		"-R",
		"-m", "1024",
		"-c", "1",
	};

	const char * const sc_args[] = { "-e", };
	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
	const char * const majpf_args[] = { "-e", "major-faults" };
	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
	const char * const minpf_args[] = { "-e", "minor-faults" };
	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);

	/* +1 is for the event string below */
	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
		majpf_args_nr + minpf_args_nr + argc;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL)
		return -ENOMEM;

	j = 0;
	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[j++] = record_args[i];

	if (trace->trace_syscalls) {
		for (i = 0; i < sc_args_nr; i++)
			rec_argv[j++] = sc_args[i];

		/* event string may be different for older kernels - e.g., RHEL6 */
		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
		else if (is_valid_tracepoint("syscalls:sys_enter"))
			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
		else {
			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
			free(rec_argv);
			return -1;
		}
	}

	if (trace->trace_pgfaults & TRACE_PFMAJ)
		for (i = 0; i < majpf_args_nr; i++)
			rec_argv[j++] = majpf_args[i];

	if (trace->trace_pgfaults & TRACE_PFMIN)
		for (i = 0; i < minpf_args_nr; i++)
			rec_argv[j++] = minpf_args[i];

	for (i = 0; i < (unsigned int)argc; i++)
		rec_argv[j++] = argv[i];

	return cmd_record(j, rec_argv);
}
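
/*
 * Example (illustrative): with --syscalls and -F maj, "perf trace record"
 * ends up execing something along the lines of:
 *
 *	perf record -R -m 1024 -c 1 \
 *		-e raw_syscalls:sys_enter,raw_syscalls:sys_exit \
 *		-e major-faults -- <workload>
 */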
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);

static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
{
	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");

	if (IS_ERR(evsel))
		return false;

	if (perf_evsel__field(evsel, "pathname") == NULL) {
		perf_evsel__delete(evsel);
		return false;
	}

	evsel->handler = trace__vfs_getname;
	perf_evlist__add(evlist, evsel);
	return true;
}
static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
{
	struct perf_evsel *evsel;
	struct perf_event_attr attr = {
		.type = PERF_TYPE_SOFTWARE,
		.mmap_data = 1,
	};

	attr.config = config;
	attr.sample_period = 1;

	event_attr_init(&attr);

	evsel = perf_evsel__new(&attr);
	if (evsel)
		evsel->handler = trace__pgfault;

	return evsel;
}
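
/*
 * Sketch (illustrative, standalone usage, not how perf itself sets this up):
 * the equivalent raw perf_event_open() call for a software event that
 * samples every major page fault of the calling thread on any CPU:
 *
 *	struct perf_event_attr attr = {
 *		.type	       = PERF_TYPE_SOFTWARE,
 *		.config	       = PERF_COUNT_SW_PAGE_FAULTS_MAJ,
 *		.sample_period = 1,
 *	};
 *	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 */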
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
	const u32 type = event->header.type;
	struct perf_evsel *evsel;

	if (type != PERF_RECORD_SAMPLE) {
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

	trace__set_base_time(trace, evsel, sample);

	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
		       perf_evsel__name(evsel), sample->tid,
		       sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}
}
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel-syscall-graph
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
static int trace__set_ev_qualifier_filter(struct trace *trace)
{
	int err = -1;
	struct perf_evsel *sys_exit;
	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
						trace->ev_qualifier_ids.nr,
						trace->ev_qualifier_ids.entries);

	if (filter == NULL)
		goto out_enomem;

	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
					  filter)) {
		sys_exit = trace->syscalls.events.sys_exit;
		err = perf_evsel__append_tp_filter(sys_exit, filter);
	}

	free(filter);
out:
	return err;
out_enomem:
	errno = ENOMEM;
	goto out;
}
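
/*
 * Example (illustrative): for "-e open,close" the filter built above looks
 * like "id==2||id==3", and for "-e \!open,close" like "id!=2&&id!=3", with
 * the actual ids coming from the syscall table of the running architecture.
 */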
static int trace__set_filter_loop_pids(struct trace *trace)
{
	unsigned int nr = 1;
	pid_t pids[32] = {
		getpid(),
	};
	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);

	while (thread && nr < ARRAY_SIZE(pids)) {
		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);

		if (parent == NULL)
			break;

		if (!strcmp(thread__comm_str(parent), "sshd")) {
			pids[nr++] = parent->tid;
			break;
		}
		thread = parent;
	}

	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
}
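
/*
 * Note: the pids gathered above (perf itself plus any sshd ancestor) are
 * filtered out to avoid a feedback loop: tracing the process that writes our
 * output, or ships it over the network, would generate events that generate
 * more output, which generates more events, ad infinitum.
 */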
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;
	/*
	 * If a global cgroup was set, apply it to all the events without an
	 * explicit cgroup. I.e.:
	 *
	 * 	trace -G A -e sched:*switch
	 *
	 * will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
	 * _and_ sched:sched_switch to the 'A' cgroup, while:
	 *
	 * 	trace -e sched:*switch -G A
	 *
	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
	 * other events (raw_syscalls:sys_{enter,exit}, etc) are left "without"
	 * a cgroup (on the root cgroup, sys wide, etc).
	 *
	 * If multiple cgroups:
	 *
	 * 	trace -G A -e sched:*switch -G B
	 *
	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
	 * to the 'B' cgroup.
	 *
	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
	 */
	if (trace->cgroup)
		evlist__set_default_cgroup(trace->evlist, trace->cgroup);
	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, &callchain_param);

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
		       errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	if (err < 0)
		goto out_error_mem;
	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;

	/*
	 * Now that we already used evsel->attr to ask the kernel to setup the
	 * events, lets reuse evsel->attr.sample_max_stack as the limit in
	 * trace__resolve_callchain(), allowing per-event max-stack settings
	 * to override an explicitly set --max-stack global setting.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if (evsel__has_callchain(evsel) &&
		    evsel->attr.sample_max_stack == 0)
			evsel->attr.sample_max_stack = trace->max_stack;
	}
again:
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;
		struct perf_mmap *md;

		md = &evlist->mmap[i];
		if (perf_mmap__read_init(md) < 0)
			continue;

		while ((event = perf_mmap__read_event(md)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_mmap__consume(md);

			if (interrupted)
				goto out_disable;

			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
		perf_mmap__read_done(md);
	}

	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}
out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	perf_evlist__delete(evlist);
	cgroup__put(trace->cgroup);
	trace->evlist = NULL;
	trace->live = false;
	return err;
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
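
/*
 * Example invocations that end up in the live loop above (illustrative):
 *
 *	perf trace -e 'open*' -p 1234		# strace-like, existing process
 *	perf trace -F maj --call-graph dwarf -- sleep 1
 */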
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname", trace__vfs_getname, },
	};
	struct perf_data data = {
		.file  = {
			.path = input_name,
		},
		.mode  = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&data, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
	     perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error initializing the raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
	     perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error initializing the raw_syscalls:sys_exit event\n");
		goto out;
	}

	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d\n", err);
	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
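
/*
 * Example (illustrative): replay mode is entered via -i/--input on a
 * previously recorded file:
 *
 *	perf trace record -- make -j8
 *	perf trace -i perf.data -s
 */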
static size_t trace__fprintf_threads_header(FILE *fp)
{
	size_t printed;

	printed  = fprintf(fp, "\n Summary of events:\n\n");

	return printed;
}

DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
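
/*
 * Given the format strings above, a summary row comes out roughly as
 * (illustrative numbers):
 *
 *   syscall            calls    total       min       avg       max      stddev
 *                               (msec)    (msec)    (msec)    (msec)        (%)
 *   --------------- -------- --------- --------- --------- ---------     ------
 *   read                  42    12.345     0.010     0.294     3.141     25.00%
 */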
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
{
	double ratio;
	size_t printed = 0;
	struct thread_trace *ttrace = thread__priv(thread);

	if (ttrace == NULL)
		return 0;

	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
	if (ttrace->pfmaj)
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
	if (ttrace->pfmin)
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
	if (trace->sched)
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF)
		++printed;

	printed += thread__dump_stats(ttrace, trace, fp);

	return printed;
}
static unsigned long thread__nr_events(struct thread_trace *ttrace)
{
	return ttrace ? ttrace->nr_events : 0;
}

DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}

static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;
	int i;

	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);

		if (threads == NULL) {
			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
			return 0;
		}

		resort_rb__for_each_entry(nd, threads)
			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

		resort_rb__delete(threads);
	}
	return printed;
}
static int trace__set_duration(const struct option *opt, const char *str,
			       int unset __maybe_unused)
{
	struct trace *trace = opt->value;

	trace->duration_filter = atof(str);
	return 0;
}
static int trace__set_filter_pids(const struct option *opt, const char *str,
				  int unset __maybe_unused)
{
	int ret = -1;
	size_t i;
	struct trace *trace = opt->value;
	/*
	 * FIXME: introduce an intarray class, plain parse csv and create a
	 * { int nr, int entries[] } struct...
	 */
	struct intlist *list = intlist__new(str);

	if (list == NULL)
		return -1;

	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
	trace->filter_pids.entries = calloc(i, sizeof(pid_t));

	if (trace->filter_pids.entries == NULL)
		goto out;

	trace->filter_pids.entries[0] = getpid();

	for (i = 1; i < trace->filter_pids.nr; ++i)
		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;

	intlist__delete(list);
	ret = 0;
out:
	return ret;
}
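
/*
 * Sketch of the intarray class the FIXME above asks for (an assumption,
 * this does not exist in perf's util library): parse the CSV straight into
 * a single allocation instead of going through an intlist and copying:
 *
 *	struct intarray {
 *		int nr;
 *		int entries[];
 *	};
 */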
static int trace__open_output(struct trace *trace, const char *filename)
{
	struct stat st;

	if (!stat(filename, &st) && st.st_size) {
		char oldname[PATH_MAX];

		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
		unlink(oldname);
		rename(filename, oldname);
	}

	trace->output = fopen(filename, "w");

	return trace->output == NULL ? -errno : 0;
}
static int parse_pagefaults(const struct option *opt, const char *str,
			    int unset __maybe_unused)
{
	int *trace_pgfaults = opt->value;

	if (strcmp(str, "all") == 0)
		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
	else if (strcmp(str, "maj") == 0)
		*trace_pgfaults |= TRACE_PFMAJ;
	else if (strcmp(str, "min") == 0)
		*trace_pgfaults |= TRACE_PFMIN;
	else
		return -1;

	return 0;
}
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
static int evlist__set_syscall_tp_fields(struct perf_evlist *evlist)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel) {
		if (evsel->priv || !evsel->tp_format)
			continue;

		if (strcmp(evsel->tp_format->system, "syscalls"))
			continue;

		if (perf_evsel__init_syscall_tp(evsel))
			return -1;

		if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
			struct syscall_tp *sc = evsel->priv;

			if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
				return -1;
		} else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
			struct syscall_tp *sc = evsel->priv;

			if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
				return -1;
		}
	}

	return 0;
}
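
/*
 * Note: the offset arithmetic above assumes that in the syscalls:sys_enter_*
 * and syscalls:sys_exit_* payloads the arguments/return value start one u64
 * after the syscall id field located by perf_evsel__init_syscall_tp(), i.e.:
 *
 *	[ common fields ][ id ][ args... / ret ]
 *	                 ^ sc->id.offset
 *	                       ^ sc->id.offset + sizeof(u64)
 */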
/*
 * XXX: Hackish, just splitting the combined -e+--event (syscalls
 * (raw_syscalls:{sys_{enter,exit}}) + events (tracepoints, HW, SW, etc)) to
 * use existing facilities unchanged (trace->ev_qualifier + parse_options()).
 *
 * It'd be better to introduce a parse_options() variant that would return a
 * list with the terms it didn't match to an event...
 */
static int trace__parse_events_option(const struct option *opt, const char *str,
				      int unset __maybe_unused)
{
	struct trace *trace = (struct trace *)opt->value;
	const char *s = str;
	char *sep = NULL, *lists[2] = { NULL, NULL, };
	int len = strlen(str) + 1, err = -1, list, idx;
	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
	char group_name[PATH_MAX];

	if (strace_groups_dir == NULL)
		return -1;

	if (*s == '!') {
		++s;
		trace->not_ev_qualifier = true;
	}

	while (1) {
		if ((sep = strchr(s, ',')) != NULL)
			*sep = '\0';

		list = 0;
		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
			list = 1;
		} else {
			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
			if (access(group_name, R_OK) == 0)
				list = 1;
		}

		if (lists[list]) {
			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
		} else {
			lists[list] = malloc(len);
			if (lists[list] == NULL)
				goto out;
			strcpy(lists[list], s);
		}

		if (!sep)
			break;

		*sep = ',';
		s = sep + 1;
	}

	if (lists[1] != NULL) {
		struct strlist_config slist_config = {
			.dirname = strace_groups_dir,
		};

		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
		if (trace->ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier", trace->output);
			goto out;
		}

		if (trace__validate_ev_qualifier(trace))
			goto out;
		trace->trace_syscalls = true;
	}

	err = 0;

	if (lists[0]) {
		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
					       "event selector. use 'perf list' to list available events",
					       parse_events_option);
		err = parse_events_option(&o, lists[0], 0);
	}
out:
	if (sep)
		*sep = ',';

	return err;
}
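
/*
 * Example (illustrative): "-e open*,sched:sched_switch" is split by the loop
 * above into lists[1] = "open*" (matches the syscall table, becomes
 * trace->ev_qualifier) and lists[0] = "sched:sched_switch" (handed back to
 * parse_events_option()).
 */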
static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
{
	struct trace *trace = opt->value;

	if (!list_empty(&trace->evlist->entries))
		return parse_cgroups(opt, str, unset);

	trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);

	return 0;
}
int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	struct trace trace = {
		.syscalls = {
			.max = -1,
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = false,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
	};
	const char *output_name = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		   "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		   "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN(0, "failure", &trace.failure_only,
		    "Show only syscalls that failed"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
			     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
		    "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
		     "per thread proc mmap processing timeout in ms"),
	OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
		     trace__parse_cgroups),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	struct perf_evsel *evsel;
	const char * const trace_subcommands[] = { "record", NULL };
	int err = -1;
	char bf[BUFSIZ];
	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
					trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
		usage_with_options_msg(trace_usage, trace_options,
				       "cgroup monitoring only available in system-wide mode");
	}

	evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__");
	if (IS_ERR(evsel)) {
		bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf));
		pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf);
		goto out;
	}

	if (evsel) {
		if (perf_evsel__init_augmented_syscall_tp(evsel) ||
		    perf_evsel__init_augmented_syscall_tp_args(evsel))
			goto out;
		trace.syscalls.events.augmented = evsel;
	}

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;
	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
	}
#endif

	if (callchain_param.enabled) {
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0) {
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
		if (evlist__set_syscall_tp_fields(trace.evlist)) {
			perror("failed to set syscalls:* tracepoint fields");
			goto out;
		}
	}

	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		trace.trace_syscalls = true;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}