/*
 * builtin-trace.c
 *
 * Builtin 'trace' command:
 *
 * Display a continuously updated trace of any workload, CPU, specific PID,
 * system wide, etc.  Default format is loosely strace like, but any other
 * event may be specified using --event.
 *
 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <[email protected]>
 *
 * Initially based on the 'trace' prototype by Thomas Gleixner:
 *
 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36 #include "util/bpf-loader.h"
37 #include "callchain.h"
38 #include "syscalltbl.h"
39 #include "rb_resort.h"
41 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
43 #include <linux/err.h>
44 #include <linux/filter.h>
45 #include <linux/audit.h>
46 #include <sys/ptrace.h>
47 #include <linux/random.h>
48 #include <linux/stringify.h>
/* Fallback for toolchains whose <fcntl.h> predates O_CLOEXEC. */
#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif
/*
 * NOTE(review): fragment of the main 'struct trace' session state — the
 * opening "struct trace {", many fields and the closing brace were lost in
 * this numbered-listing paste (the stray leading integers are paste
 * artifacts, not code).  Comments below describe only what later code in
 * this chunk demonstrates.
 */
/* perf_tool callbacks, handed to __machine__synthesize_threads() */
55 struct perf_tool tool;
/* syscall id <-> name table, queried via syscalltbl__name() */
56 struct syscalltbl *sctbl;
/* per-syscall info, grown on demand in trace__read_syscall_info() */
59 struct syscall *table;
/* raw_syscalls:sys_enter / sys_exit tracepoint evsels */
61 struct perf_evsel *sys_enter,
65 struct record_opts opts;
66 struct perf_evlist *evlist;
/* thread currently inside a syscall, for interleaved-output bookkeeping */
68 struct thread *current;
71 unsigned long nr_events;
/* syscall names to trace — or to skip, when not_ev_qualifier is set */
72 struct strlist *ev_qualifier;
77 struct intlist *tid_list;
78 struct intlist *pid_list;
/* minimum syscall duration (ms) to print, see trace__filter_duration() */
83 double duration_filter;
89 unsigned int max_stack;
90 unsigned int min_stack;
91 bool not_ev_qualifier;
/* set when tracing >1 thread, see trace__fprintf_entry_head() */
95 bool multiple_threads;
101 bool kernel_syscallchains;
111 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
112 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
116 #define TP_UINT_FIELD(bits) \
117 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
120 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
129 #define TP_UINT_FIELD__SWAPPED(bits) \
130 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
133 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
134 return bswap_##bits(value);\
137 TP_UINT_FIELD__SWAPPED(16);
138 TP_UINT_FIELD__SWAPPED(32);
139 TP_UINT_FIELD__SWAPPED(64);
141 static int tp_field__init_uint(struct tp_field *field,
142 struct format_field *format_field,
145 field->offset = format_field->offset;
147 switch (format_field->size) {
149 field->integer = tp_field__u8;
152 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
155 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
158 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
167 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
169 return sample->raw_data + field->offset;
172 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
174 field->offset = format_field->offset;
175 field->pointer = tp_field__ptr;
182 struct tp_field args, ret;
186 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
187 struct tp_field *field,
190 struct format_field *format_field = perf_evsel__field(evsel, name);
192 if (format_field == NULL)
195 return tp_field__init_uint(field, format_field, evsel->needs_swap);
198 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
199 ({ struct syscall_tp *sc = evsel->priv;\
200 perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
202 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
203 struct tp_field *field,
206 struct format_field *format_field = perf_evsel__field(evsel, name);
208 if (format_field == NULL)
211 return tp_field__init_ptr(field, format_field);
214 #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
215 ({ struct syscall_tp *sc = evsel->priv;\
216 perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
218 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
221 perf_evsel__delete(evsel);
224 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
226 evsel->priv = malloc(sizeof(struct syscall_tp));
227 if (evsel->priv != NULL) {
228 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
231 evsel->handler = handler;
242 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
244 struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
246 /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
248 evsel = perf_evsel__newtp("syscalls", direction);
253 if (perf_evsel__init_syscall_tp(evsel, handler))
259 perf_evsel__delete_priv(evsel);
263 #define perf_evsel__sc_tp_uint(evsel, name, sample) \
264 ({ struct syscall_tp *fields = evsel->priv; \
265 fields->name.integer(&fields->name, sample); })
267 #define perf_evsel__sc_tp_ptr(evsel, name, sample) \
268 ({ struct syscall_tp *fields = evsel->priv; \
269 fields->name.pointer(&fields->name, sample); })
273 struct thread *thread;
283 const char **entries;
286 #define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
287 .nr_entries = ARRAY_SIZE(array), \
291 #define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
293 .nr_entries = ARRAY_SIZE(array), \
297 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
299 struct syscall_arg *arg)
301 struct strarray *sa = arg->parm;
302 int idx = arg->val - sa->offset;
304 if (idx < 0 || idx >= sa->nr_entries)
305 return scnprintf(bf, size, intfmt, arg->val);
307 return scnprintf(bf, size, "%s", sa->entries[idx]);
310 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
311 struct syscall_arg *arg)
313 return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
316 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
318 #if defined(__i386__) || defined(__x86_64__)
320 * FIXME: Make this available to all arches as soon as the ioctl beautifier
321 * gets rewritten to support all arches.
323 static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
324 struct syscall_arg *arg)
326 return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
329 #define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
330 #endif /* defined(__i386__) || defined(__x86_64__) */
332 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
333 struct syscall_arg *arg);
335 #define SCA_FD syscall_arg__scnprintf_fd
337 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
338 struct syscall_arg *arg)
343 return scnprintf(bf, size, "CWD");
345 return syscall_arg__scnprintf_fd(bf, size, arg);
348 #define SCA_FDAT syscall_arg__scnprintf_fd_at
350 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
351 struct syscall_arg *arg);
353 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
355 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
356 struct syscall_arg *arg)
358 return scnprintf(bf, size, "%#lx", arg->val);
361 #define SCA_HEX syscall_arg__scnprintf_hex
363 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
364 struct syscall_arg *arg)
366 return scnprintf(bf, size, "%d", arg->val);
369 #define SCA_INT syscall_arg__scnprintf_int
371 static const char *bpf_cmd[] = {
372 "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
373 "MAP_GET_NEXT_KEY", "PROG_LOAD",
375 static DEFINE_STRARRAY(bpf_cmd);
377 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
378 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
380 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
381 static DEFINE_STRARRAY(itimers);
383 static const char *keyctl_options[] = {
384 "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
385 "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
386 "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
387 "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
388 "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
390 static DEFINE_STRARRAY(keyctl_options);
392 static const char *whences[] = { "SET", "CUR", "END",
400 static DEFINE_STRARRAY(whences);
402 static const char *fcntl_cmds[] = {
403 "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
404 "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
405 "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
408 static DEFINE_STRARRAY(fcntl_cmds);
410 static const char *rlimit_resources[] = {
411 "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
412 "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
415 static DEFINE_STRARRAY(rlimit_resources);
417 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
418 static DEFINE_STRARRAY(sighow);
420 static const char *clockid[] = {
421 "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
422 "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
423 "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
425 static DEFINE_STRARRAY(clockid);
427 static const char *socket_families[] = {
428 "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
429 "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
430 "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
431 "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
432 "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
433 "ALG", "NFC", "VSOCK",
435 static DEFINE_STRARRAY(socket_families);
437 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
438 struct syscall_arg *arg)
443 if (mode == F_OK) /* 0 */
444 return scnprintf(bf, size, "F");
446 if (mode & n##_OK) { \
447 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
457 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
462 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
464 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
465 struct syscall_arg *arg);
467 #define SCA_FILENAME syscall_arg__scnprintf_filename
469 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
470 struct syscall_arg *arg)
472 int printed = 0, flags = arg->val;
475 if (flags & O_##n) { \
476 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
485 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
490 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
492 #if defined(__i386__) || defined(__x86_64__)
494 * FIXME: Make this available to all arches.
496 #define TCGETS 0x5401
498 static const char *tioctls[] = {
499 "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
500 "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
501 "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
502 "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
503 "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
504 "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
505 "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
506 "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
507 "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
508 "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
509 "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
510 [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
511 "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
512 "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
513 "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
516 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
517 #endif /* defined(__i386__) || defined(__x86_64__) */
519 #ifndef GRND_NONBLOCK
520 #define GRND_NONBLOCK 0x0001
523 #define GRND_RANDOM 0x0002
526 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
527 struct syscall_arg *arg)
529 int printed = 0, flags = arg->val;
532 if (flags & GRND_##n) { \
533 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
534 flags &= ~GRND_##n; \
542 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
547 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
549 #define STRARRAY(arg, name, array) \
550 .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
551 .arg_parm = { [arg] = &strarray__##array, }
553 #include "trace/beauty/eventfd.c"
554 #include "trace/beauty/flock.c"
555 #include "trace/beauty/futex_op.c"
556 #include "trace/beauty/mmap.c"
557 #include "trace/beauty/mode_t.c"
558 #include "trace/beauty/msg_flags.c"
559 #include "trace/beauty/open_flags.c"
560 #include "trace/beauty/perf_event_open.c"
561 #include "trace/beauty/pid.c"
562 #include "trace/beauty/sched_policy.c"
563 #include "trace/beauty/seccomp.c"
564 #include "trace/beauty/signum.c"
565 #include "trace/beauty/socket_type.c"
566 #include "trace/beauty/waitid_options.c"
568 static struct syscall_fmt {
571 size_t (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
578 { .name = "access", .errmsg = true,
579 .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
580 { .name = "arch_prctl", .errmsg = true, .alias = "prctl", },
581 { .name = "bpf", .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
582 { .name = "brk", .hexret = true,
583 .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
584 { .name = "chdir", .errmsg = true, },
585 { .name = "chmod", .errmsg = true, },
586 { .name = "chroot", .errmsg = true, },
587 { .name = "clock_gettime", .errmsg = true, STRARRAY(0, clk_id, clockid), },
588 { .name = "clone", .errpid = true, },
589 { .name = "close", .errmsg = true,
590 .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
591 { .name = "connect", .errmsg = true, },
592 { .name = "creat", .errmsg = true, },
593 { .name = "dup", .errmsg = true, },
594 { .name = "dup2", .errmsg = true, },
595 { .name = "dup3", .errmsg = true, },
596 { .name = "epoll_ctl", .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
597 { .name = "eventfd2", .errmsg = true,
598 .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
599 { .name = "faccessat", .errmsg = true, },
600 { .name = "fadvise64", .errmsg = true, },
601 { .name = "fallocate", .errmsg = true, },
602 { .name = "fchdir", .errmsg = true, },
603 { .name = "fchmod", .errmsg = true, },
604 { .name = "fchmodat", .errmsg = true,
605 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
606 { .name = "fchown", .errmsg = true, },
607 { .name = "fchownat", .errmsg = true,
608 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
609 { .name = "fcntl", .errmsg = true,
610 .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
611 .arg_parm = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
612 { .name = "fdatasync", .errmsg = true, },
613 { .name = "flock", .errmsg = true,
614 .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
615 { .name = "fsetxattr", .errmsg = true, },
616 { .name = "fstat", .errmsg = true, .alias = "newfstat", },
617 { .name = "fstatat", .errmsg = true, .alias = "newfstatat", },
618 { .name = "fstatfs", .errmsg = true, },
619 { .name = "fsync", .errmsg = true, },
620 { .name = "ftruncate", .errmsg = true, },
621 { .name = "futex", .errmsg = true,
622 .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
623 { .name = "futimesat", .errmsg = true,
624 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
625 { .name = "getdents", .errmsg = true, },
626 { .name = "getdents64", .errmsg = true, },
627 { .name = "getitimer", .errmsg = true, STRARRAY(0, which, itimers), },
628 { .name = "getpid", .errpid = true, },
629 { .name = "getpgid", .errpid = true, },
630 { .name = "getppid", .errpid = true, },
631 { .name = "getrandom", .errmsg = true,
632 .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
633 { .name = "getrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
634 { .name = "getxattr", .errmsg = true, },
635 { .name = "inotify_add_watch", .errmsg = true, },
636 { .name = "ioctl", .errmsg = true,
638 #if defined(__i386__) || defined(__x86_64__)
640 * FIXME: Make this available to all arches.
642 [1] = SCA_STRHEXARRAY, /* cmd */
643 [2] = SCA_HEX, /* arg */ },
644 .arg_parm = { [1] = &strarray__tioctls, /* cmd */ }, },
646 [2] = SCA_HEX, /* arg */ }, },
648 { .name = "keyctl", .errmsg = true, STRARRAY(0, option, keyctl_options), },
649 { .name = "kill", .errmsg = true,
650 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
651 { .name = "lchown", .errmsg = true, },
652 { .name = "lgetxattr", .errmsg = true, },
653 { .name = "linkat", .errmsg = true,
654 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
655 { .name = "listxattr", .errmsg = true, },
656 { .name = "llistxattr", .errmsg = true, },
657 { .name = "lremovexattr", .errmsg = true, },
658 { .name = "lseek", .errmsg = true,
659 .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
660 .arg_parm = { [2] = &strarray__whences, /* whence */ }, },
661 { .name = "lsetxattr", .errmsg = true, },
662 { .name = "lstat", .errmsg = true, .alias = "newlstat", },
663 { .name = "lsxattr", .errmsg = true, },
664 { .name = "madvise", .errmsg = true,
665 .arg_scnprintf = { [0] = SCA_HEX, /* start */
666 [2] = SCA_MADV_BHV, /* behavior */ }, },
667 { .name = "mkdir", .errmsg = true, },
668 { .name = "mkdirat", .errmsg = true,
669 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
670 { .name = "mknod", .errmsg = true, },
671 { .name = "mknodat", .errmsg = true,
672 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
673 { .name = "mlock", .errmsg = true,
674 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
675 { .name = "mlockall", .errmsg = true,
676 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
677 { .name = "mmap", .hexret = true,
678 .arg_scnprintf = { [0] = SCA_HEX, /* addr */
679 [2] = SCA_MMAP_PROT, /* prot */
680 [3] = SCA_MMAP_FLAGS, /* flags */ }, },
681 { .name = "mprotect", .errmsg = true,
682 .arg_scnprintf = { [0] = SCA_HEX, /* start */
683 [2] = SCA_MMAP_PROT, /* prot */ }, },
684 { .name = "mq_unlink", .errmsg = true,
685 .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
686 { .name = "mremap", .hexret = true,
687 .arg_scnprintf = { [0] = SCA_HEX, /* addr */
688 [3] = SCA_MREMAP_FLAGS, /* flags */
689 [4] = SCA_HEX, /* new_addr */ }, },
690 { .name = "munlock", .errmsg = true,
691 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
692 { .name = "munmap", .errmsg = true,
693 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
694 { .name = "name_to_handle_at", .errmsg = true,
695 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
696 { .name = "newfstatat", .errmsg = true,
697 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
698 { .name = "open", .errmsg = true,
699 .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
700 { .name = "open_by_handle_at", .errmsg = true,
701 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
702 [2] = SCA_OPEN_FLAGS, /* flags */ }, },
703 { .name = "openat", .errmsg = true,
704 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
705 [2] = SCA_OPEN_FLAGS, /* flags */ }, },
706 { .name = "perf_event_open", .errmsg = true,
707 .arg_scnprintf = { [2] = SCA_INT, /* cpu */
708 [3] = SCA_FD, /* group_fd */
709 [4] = SCA_PERF_FLAGS, /* flags */ }, },
710 { .name = "pipe2", .errmsg = true,
711 .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
712 { .name = "poll", .errmsg = true, .timeout = true, },
713 { .name = "ppoll", .errmsg = true, .timeout = true, },
714 { .name = "pread", .errmsg = true, .alias = "pread64", },
715 { .name = "preadv", .errmsg = true, .alias = "pread", },
716 { .name = "prlimit64", .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
717 { .name = "pwrite", .errmsg = true, .alias = "pwrite64", },
718 { .name = "pwritev", .errmsg = true, },
719 { .name = "read", .errmsg = true, },
720 { .name = "readlink", .errmsg = true, },
721 { .name = "readlinkat", .errmsg = true,
722 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
723 { .name = "readv", .errmsg = true, },
724 { .name = "recvfrom", .errmsg = true,
725 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
726 { .name = "recvmmsg", .errmsg = true,
727 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
728 { .name = "recvmsg", .errmsg = true,
729 .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
730 { .name = "removexattr", .errmsg = true, },
731 { .name = "renameat", .errmsg = true,
732 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
733 { .name = "rmdir", .errmsg = true, },
734 { .name = "rt_sigaction", .errmsg = true,
735 .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
736 { .name = "rt_sigprocmask", .errmsg = true, STRARRAY(0, how, sighow), },
737 { .name = "rt_sigqueueinfo", .errmsg = true,
738 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
739 { .name = "rt_tgsigqueueinfo", .errmsg = true,
740 .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
741 { .name = "sched_setscheduler", .errmsg = true,
742 .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
743 { .name = "seccomp", .errmsg = true,
744 .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
745 [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
746 { .name = "select", .errmsg = true, .timeout = true, },
747 { .name = "sendmmsg", .errmsg = true,
748 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
749 { .name = "sendmsg", .errmsg = true,
750 .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
751 { .name = "sendto", .errmsg = true,
752 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
753 { .name = "set_tid_address", .errpid = true, },
754 { .name = "setitimer", .errmsg = true, STRARRAY(0, which, itimers), },
755 { .name = "setpgid", .errmsg = true, },
756 { .name = "setrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
757 { .name = "setxattr", .errmsg = true, },
758 { .name = "shutdown", .errmsg = true, },
759 { .name = "socket", .errmsg = true,
760 .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
761 [1] = SCA_SK_TYPE, /* type */ },
762 .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
763 { .name = "socketpair", .errmsg = true,
764 .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
765 [1] = SCA_SK_TYPE, /* type */ },
766 .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
767 { .name = "stat", .errmsg = true, .alias = "newstat", },
768 { .name = "statfs", .errmsg = true, },
769 { .name = "swapoff", .errmsg = true,
770 .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
771 { .name = "swapon", .errmsg = true,
772 .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
773 { .name = "symlinkat", .errmsg = true,
774 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
775 { .name = "tgkill", .errmsg = true,
776 .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
777 { .name = "tkill", .errmsg = true,
778 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
779 { .name = "truncate", .errmsg = true, },
780 { .name = "uname", .errmsg = true, .alias = "newuname", },
781 { .name = "unlinkat", .errmsg = true,
782 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
783 { .name = "utime", .errmsg = true, },
784 { .name = "utimensat", .errmsg = true,
785 .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
786 { .name = "utimes", .errmsg = true, },
787 { .name = "vmsplice", .errmsg = true, },
788 { .name = "wait4", .errpid = true,
789 .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
790 { .name = "waitid", .errpid = true,
791 .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
792 { .name = "write", .errmsg = true, },
793 { .name = "writev", .errmsg = true, },
796 static int syscall_fmt__cmp(const void *name, const void *fmtp)
798 const struct syscall_fmt *fmt = fmtp;
799 return strcmp(name, fmt->name);
802 static struct syscall_fmt *syscall_fmt__find(const char *name)
804 const int nmemb = ARRAY_SIZE(syscall_fmts);
805 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
/*
 * Everything trace__read_syscall_info() learns about one syscall: its
 * tracepoint format, argument fields, formatting table entry and the
 * per-argument pretty-printers resolved by syscall__set_arg_fmts().
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;
	const char	    *name;
	bool		    is_exit;
	struct syscall_fmt  *fmt;
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void		    **arg_parm;
};
819 static size_t fprintf_duration(unsigned long t, FILE *fp)
821 double duration = (double)t / NSEC_PER_MSEC;
822 size_t printed = fprintf(fp, "(");
825 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
826 else if (duration >= 0.01)
827 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
829 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
830 return printed + fprintf(fp, "): ");
834 * filename.ptr: The filename char pointer that will be vfs_getname'd
835 * filename.entry_str_pos: Where to insert the string translated from
836 * filename.ptr by the vfs_getname tracepoint/kprobe.
838 struct thread_trace {
842 unsigned long nr_events;
843 unsigned long pfmaj, pfmin;
848 short int entry_str_pos;
850 unsigned int namelen;
858 struct intlist *syscall_stats;
861 static struct thread_trace *thread_trace__new(void)
863 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
866 ttrace->paths.max = -1;
868 ttrace->syscall_stats = intlist__new(NULL);
873 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
875 struct thread_trace *ttrace;
880 if (thread__priv(thread) == NULL)
881 thread__set_priv(thread, thread_trace__new());
883 if (thread__priv(thread) == NULL)
886 ttrace = thread__priv(thread);
891 color_fprintf(fp, PERF_COLOR_RED,
892 "WARNING: not enough memory, dropping samples!\n");
896 #define TRACE_PFMAJ (1 << 0)
897 #define TRACE_PFMIN (1 << 1)
899 static const size_t trace__entry_str_size = 2048;
901 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
903 struct thread_trace *ttrace = thread__priv(thread);
905 if (fd > ttrace->paths.max) {
906 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
911 if (ttrace->paths.max != -1) {
912 memset(npath + ttrace->paths.max + 1, 0,
913 (fd - ttrace->paths.max) * sizeof(char *));
915 memset(npath, 0, (fd + 1) * sizeof(char *));
918 ttrace->paths.table = npath;
919 ttrace->paths.max = fd;
922 ttrace->paths.table[fd] = strdup(pathname);
924 return ttrace->paths.table[fd] != NULL ? 0 : -1;
927 static int thread__read_fd_path(struct thread *thread, int fd)
929 char linkname[PATH_MAX], pathname[PATH_MAX];
933 if (thread->pid_ == thread->tid) {
934 scnprintf(linkname, sizeof(linkname),
935 "/proc/%d/fd/%d", thread->pid_, fd);
937 scnprintf(linkname, sizeof(linkname),
938 "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
941 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
944 ret = readlink(linkname, pathname, sizeof(pathname));
946 if (ret < 0 || ret > st.st_size)
949 pathname[ret] = '\0';
950 return trace__set_fd_pathname(thread, fd, pathname);
953 static const char *thread__fd_path(struct thread *thread, int fd,
956 struct thread_trace *ttrace = thread__priv(thread);
964 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
967 ++trace->stats.proc_getname;
968 if (thread__read_fd_path(thread, fd))
972 return ttrace->paths.table[fd];
975 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
976 struct syscall_arg *arg)
979 size_t printed = scnprintf(bf, size, "%d", fd);
980 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
983 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
988 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
989 struct syscall_arg *arg)
992 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
993 struct thread_trace *ttrace = thread__priv(arg->thread);
995 if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
996 zfree(&ttrace->paths.table[fd]);
1001 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1004 struct thread_trace *ttrace = thread__priv(thread);
1006 ttrace->filename.ptr = ptr;
1007 ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1010 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1011 struct syscall_arg *arg)
1013 unsigned long ptr = arg->val;
1015 if (!arg->trace->vfs_getname)
1016 return scnprintf(bf, size, "%#x", ptr);
1018 thread__set_filename_pos(arg->thread, bf, ptr);
1022 static bool trace__filter_duration(struct trace *trace, double t)
1024 return t < (trace->duration_filter * NSEC_PER_MSEC);
1027 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1029 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1031 return fprintf(fp, "%10.3f ", ts);
1034 static bool done = false;
1035 static bool interrupted = false;
1037 static void sig_handler(int sig)
1040 interrupted = sig == SIGINT;
1043 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1044 u64 duration, u64 tstamp, FILE *fp)
1046 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1047 printed += fprintf_duration(duration, fp);
1049 if (trace->multiple_threads) {
1050 if (trace->show_comm)
1051 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1052 printed += fprintf(fp, "%d ", thread->tid);
1058 static int trace__process_event(struct trace *trace, struct machine *machine,
1059 union perf_event *event, struct perf_sample *sample)
1063 switch (event->header.type) {
1064 case PERF_RECORD_LOST:
1065 color_fprintf(trace->output, PERF_COLOR_RED,
1066 "LOST %" PRIu64 " events!\n", event->lost.lost);
1067 ret = machine__process_lost_event(machine, event, sample);
1070 ret = machine__process_event(machine, event, sample);
1077 static int trace__tool_process(struct perf_tool *tool,
1078 union perf_event *event,
1079 struct perf_sample *sample,
1080 struct machine *machine)
1082 struct trace *trace = container_of(tool, struct trace, tool);
1083 return trace__process_event(trace, machine, event, sample);
1086 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1088 struct machine *machine = vmachine;
1090 if (machine->kptr_restrict_warned)
1093 if (symbol_conf.kptr_restrict) {
1094 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1095 "Check /proc/sys/kernel/kptr_restrict.\n\n"
1096 "Kernel samples will not be resolved.\n");
1097 machine->kptr_restrict_warned = true;
1101 return machine__resolve_kernel_addr(vmachine, addrp, modp);
1104 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1106 int err = symbol__init(NULL);
1111 trace->host = machine__new_host();
1112 if (trace->host == NULL)
1115 if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
1118 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1119 evlist->threads, trace__tool_process, false,
1120 trace->opts.proc_map_timeout);
/*
 * Choose a pretty-printer (scnprintf callback) for each argument of a
 * syscall, based on per-syscall overrides (sc->fmt) and then on field
 * type/name heuristics: filename-like "const char *" fields, raw
 * pointers, pid_t, umode_t, and integer fields whose name ends in "fd".
 * Returns 0 on success, presumably -1 when the calloc fails (the error
 * return is elided here).
 */
1127 static int syscall__set_arg_fmts(struct syscall *sc)
1129 struct format_field *field;
1132 sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1133 if (sc->arg_scnprintf == NULL)
1137 sc->arg_parm = sc->fmt->arg_parm;
1139 for (field = sc->args; field; field = field->next) {
/* explicit per-syscall formatter table wins over the heuristics below */
1140 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1141 sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1142 else if (strcmp(field->type, "const char *") == 0 &&
1143 (strcmp(field->name, "filename") == 0 ||
1144 strcmp(field->name, "path") == 0 ||
1145 strcmp(field->name, "pathname") == 0))
1146 sc->arg_scnprintf[idx] = SCA_FILENAME;
1147 else if (field->flags & FIELD_IS_POINTER)
1148 sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1149 else if (strcmp(field->type, "pid_t") == 0)
1150 sc->arg_scnprintf[idx] = SCA_PID;
1151 else if (strcmp(field->type, "umode_t") == 0)
1152 sc->arg_scnprintf[idx] = SCA_MODE_T;
/* integer fields named "*fd" are treated as file descriptors */
1153 else if ((strcmp(field->type, "int") == 0 ||
1154 strcmp(field->type, "unsigned int") == 0 ||
1155 strcmp(field->type, "long") == 0) &&
1156 (len = strlen(field->name)) >= 2 &&
1157 strcmp(field->name + len - 2, "fd") == 0) {
1159 * /sys/kernel/tracing/events/syscalls/sys_enter*
1160 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1165 sc->arg_scnprintf[idx] = SCA_FD;
/*
 * Lazily populate trace->syscalls.table[id]: grow the table if needed
 * (zeroing the newly exposed slots), look up the tracepoint format for
 * "syscalls:sys_enter_<name>" (falling back to the fmt alias), skip the
 * leading syscall-number field, and install the per-arg formatters via
 * syscall__set_arg_fmts().
 */
1173 static int trace__read_syscall_info(struct trace *trace, int id)
1177 const char *name = syscalltbl__name(trace->sctbl, id);
1182 if (id > trace->syscalls.max) {
1183 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1185 if (nsyscalls == NULL)
1188 if (trace->syscalls.max != -1) {
/* zero only the slots between the old max and the new id */
1189 memset(nsyscalls + trace->syscalls.max + 1, 0,
1190 (id - trace->syscalls.max) * sizeof(*sc));
/* first allocation: zero the whole table */
1192 memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1195 trace->syscalls.table = nsyscalls;
1196 trace->syscalls.max = id;
1199 sc = trace->syscalls.table + id;
1202 sc->fmt = syscall_fmt__find(sc->name);
1204 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1205 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
/* some syscalls are exposed under an alias tracepoint name */
1207 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1208 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1209 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1212 if (IS_ERR(sc->tp_format))
1215 sc->args = sc->tp_format->format.fields;
1216 sc->nr_args = sc->tp_format->format.nr_fields;
1218 * We need to check and discard the first variable '__syscall_nr'
1219 * or 'nr' that mean the syscall number. It is needless here.
1220 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1222 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1223 sc->args = sc->args->next;
/* exit/exit_group never return, so the exit handler treats them specially */
1227 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1229 return syscall__set_arg_fmts(sc);
/*
 * Translate the user-supplied -e/--expr syscall-name qualifier list into
 * an array of syscall ids (trace->ev_qualifier_ids).  Unknown names are
 * reported together on one "Invalid syscall" line; on any error the ids
 * array is freed and a hint is printed.
 */
1232 static int trace__validate_ev_qualifier(struct trace *trace)
1235 struct str_node *pos;
1237 trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1238 trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1239 sizeof(trace->ev_qualifier_ids.entries[0]));
1241 if (trace->ev_qualifier_ids.entries == NULL) {
1242 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1250 strlist__for_each(pos, trace->ev_qualifier) {
1251 const char *sc = pos->s;
/* negative id means the name is not a known syscall on this arch */
1252 int id = syscalltbl__id(trace->sctbl, sc);
1256 fputs("Error:\tInvalid syscall ", trace->output);
1259 fputs(", ", trace->output);
1262 fputs(sc, trace->output);
1265 trace->ev_qualifier_ids.entries[i++] = id;
1269 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1270 "\nHint:\tand: 'man syscalls'\n", trace->output);
/* error path: discard the partially-filled id list */
1271 zfree(&trace->ev_qualifier_ids.entries);
1272 trace->ev_qualifier_ids.nr = 0;
1279 * args is to be interpreted as a series of longs but we need to handle
1280 * 8-byte unaligned accesses. args points to raw_data within the event
1281 * and raw_data is guaranteed to be 8-byte unaligned because it is
1282 * preceded by raw_size which is a u32. So we need to copy args to a temp
1283 * variable to read it. Most notably this avoids extended load instructions
1284 * on unaligned addresses
/*
 * Format a syscall's arguments into bf, using the per-arg formatters set
 * up by syscall__set_arg_fmts() when the tracepoint format was readable,
 * or a raw "argN: value" dump otherwise.  args points into unaligned raw
 * sample data, hence the memcpy into a local (see comment above this
 * function in the original file).  Returns the number of bytes printed.
 */
1287 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1288 unsigned char *args, struct trace *trace,
1289 struct thread *thread)
1295 if (sc->args != NULL) {
1296 struct format_field *field;
1298 struct syscall_arg arg = {
1305 for (field = sc->args; field;
1306 field = field->next, ++arg.idx, bit <<= 1) {
1310 /* special care for unaligned accesses */
1311 p = args + sizeof(unsigned long) * arg.idx;
1312 memcpy(&val, p, sizeof(val));
1315 * Suppress this argument if its value is zero and
1316 * and we don't have a string associated in an
1320 !(sc->arg_scnprintf &&
1321 sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1322 sc->arg_parm[arg.idx]))
1325 printed += scnprintf(bf + printed, size - printed,
1326 "%s%s: ", printed ? ", " : "", field->name);
/* use the type-aware formatter when one was assigned for this arg */
1327 if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1330 arg.parm = sc->arg_parm[arg.idx];
1331 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1332 size - printed, &arg);
1334 printed += scnprintf(bf + printed, size - printed,
1338 } else if (IS_ERR(sc->tp_format)) {
1340 * If we managed to read the tracepoint /format file, then we
1341 * may end up not having any args, like with gettid(), so only
1342 * print the raw args when we didn't manage to read it.
1347 /* special care for unaligned accesses */
1348 p = args + sizeof(unsigned long) * i;
1349 memcpy(&val, p, sizeof(val));
1350 printed += scnprintf(bf + printed, size - printed,
1352 printed ? ", " : "", i, val);
/* Signature shared by all per-tracepoint sample handlers (stored in evsel->handler). */
1360 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1361 union perf_event *event,
1362 struct perf_sample *sample);
/*
 * Return the syscall table entry for id, reading its tracepoint info on
 * first use.  Returns NULL (after logging) for the id == -1 kernel quirk
 * described below and for ids that cannot be resolved.
 */
1364 static struct syscall *trace__syscall_info(struct trace *trace,
1365 struct perf_evsel *evsel, int id)
1371 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1372 * before that, leaving at a higher verbosity level till that is
1373 * explained. Reproduced with plain ftrace with:
1375 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1376 * grep "NR -1 " /t/trace_pipe
1378 * After generating some load on the machine.
1382 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1383 id, perf_evsel__name(evsel), ++n);
/* lazily read the tracepoint format the first time we see this id */
1388 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1389 trace__read_syscall_info(trace, id))
1392 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1395 return &trace->syscalls.table[id];
/* failure path: report which syscall could not be resolved */
1399 fprintf(trace->output, "Problems reading syscall %d", id);
1400 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1401 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1402 fputs(" information\n", trace->output);
/*
 * Accumulate per-thread, per-syscall timing statistics for the --summary
 * output: find (or create) the stats bucket for this syscall id and feed
 * it the duration since the matching sys_enter.
 */
1407 static void thread__update_stats(struct thread_trace *ttrace,
1408 int id, struct perf_sample *sample)
1410 struct int_node *inode;
1411 struct stats *stats;
1414 inode = intlist__findnew(ttrace->syscall_stats, id);
1418 stats = inode->priv;
/* first sample for this syscall id on this thread: allocate the bucket */
1419 if (stats == NULL) {
1420 stats = malloc(sizeof(struct stats));
1424 inode->priv = stats;
/* guard against missing entry timestamps and clock going backwards */
1427 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1428 duration = sample->time - ttrace->entry_time;
1430 update_stats(stats, duration);
/*
 * When a new event arrives while the current thread still has an
 * unfinished syscall entry pending, print that entry followed by
 * ") ..." to show it was interrupted, and clear the pending flag.
 */
1433 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1435 struct thread_trace *ttrace;
1439 if (trace->current == NULL)
1442 ttrace = thread__priv(trace->current);
1444 if (!ttrace->entry_pending)
1447 duration = sample->time - ttrace->entry_time;
1449 printed = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1450 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1451 ttrace->entry_pending = false;
/*
 * raw_syscalls:sys_enter handler: format the syscall name and arguments
 * into ttrace->entry_str.  With no duration filter / summary-only /
 * min-stack in effect the entry is printed immediately; otherwise it is
 * kept pending so sys_exit can print entry and result on one line.
 */
1456 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1457 union perf_event *event __maybe_unused,
1458 struct perf_sample *sample)
1463 struct thread *thread;
1464 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1465 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1466 struct thread_trace *ttrace;
1471 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1472 ttrace = thread__trace(thread, trace->output);
1476 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
/* lazily allocate the per-thread entry formatting buffer */
1478 if (ttrace->entry_str == NULL) {
1479 ttrace->entry_str = malloc(trace__entry_str_size);
1480 if (!ttrace->entry_str)
/* flush any interrupted entry from a previous thread first */
1484 if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1485 trace__printf_interrupted_entry(trace, sample);
1487 ttrace->entry_time = sample->time;
1488 msg = ttrace->entry_str;
1489 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1491 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1492 args, trace, thread);
1495 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1496 trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1497 fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1500 ttrace->entry_pending = true;
1501 /* See trace__vfs_getname & trace__sys_exit */
1502 ttrace->filename.pending_open = false;
/* track the current thread so interrupted entries can be detected */
1505 if (trace->current != thread) {
1506 thread__put(trace->current);
1507 trace->current = thread__get(thread);
1511 thread__put(thread);
/*
 * Resolve the sample's callchain (up to trace->max_stack entries) into
 * cursor; non-zero return indicates resolution failed.
 */
1515 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1516 struct perf_sample *sample,
1517 struct callchain_cursor *cursor)
1519 struct addr_location al;
1521 if (machine__resolve(trace->host, &al, sample) < 0 ||
1522 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
/*
 * Print the previously-resolved global callchain_cursor to trace->output,
 * indented 38 columns, with symbols and unknown entries shown as addresses.
 */
1528 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1530 /* TODO: user-configurable print_opts */
1531 const unsigned int print_opts = EVSEL__PRINT_SYM |
1533 EVSEL__PRINT_UNKNOWN_AS_ADDR;
1535 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
/*
 * raw_syscalls:sys_exit handler: updates --summary stats, resolves a
 * pending vfs_getname pathname into the fd table for open-like calls,
 * applies the duration filter, then prints the (possibly pending) entry
 * plus the return value, formatted per sc->fmt (errno name, timeout,
 * hex, or child pid/comm), followed by an optional callchain.
 */
1538 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1539 union perf_event *event __maybe_unused,
1540 struct perf_sample *sample)
1544 struct thread *thread;
1545 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1546 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1547 struct thread_trace *ttrace;
1552 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1553 ttrace = thread__trace(thread, trace->output);
1558 thread__update_stats(ttrace, id, sample);
1560 ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
/* successful open(): associate the vfs_getname pathname with the new fd */
1562 if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1563 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1564 ttrace->filename.pending_open = false;
1565 ++trace->stats.vfs_getname;
1568 ttrace->exit_time = sample->time;
1570 if (ttrace->entry_time) {
1571 duration = sample->time - ttrace->entry_time;
1572 if (trace__filter_duration(trace, duration))
1574 } else if (trace->duration_filter)
1577 if (sample->callchain) {
1578 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
/* skip callchains shallower than --min-stack */
1579 if (callchain_ret == 0) {
1580 if (callchain_cursor.nr < trace->min_stack)
1586 if (trace->summary_only)
1589 trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
1591 if (ttrace->entry_pending) {
1592 fprintf(trace->output, "%-70s", ttrace->entry_str);
/* entry was already printed (interrupted); mark this as its continuation */
1594 fprintf(trace->output, " ... [");
1595 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1596 fprintf(trace->output, "]: %s()", sc->name);
1599 if (sc->fmt == NULL) {
1601 fprintf(trace->output, ") = %ld", ret);
1602 } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
1603 char bf[STRERR_BUFSIZE];
1604 const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
1605 *e = audit_errno_to_name(-ret);
1607 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1608 } else if (ret == 0 && sc->fmt->timeout)
1609 fprintf(trace->output, ") = 0 Timeout");
1610 else if (sc->fmt->hexret)
1611 fprintf(trace->output, ") = %#lx", ret);
/* fork-like syscalls: show the child's comm alongside its pid */
1612 else if (sc->fmt->errpid) {
1613 struct thread *child = machine__find_thread(trace->host, ret, ret);
1615 if (child != NULL) {
1616 fprintf(trace->output, ") = %ld", ret);
1617 if (child->comm_set)
1618 fprintf(trace->output, " (%s)", thread__comm_str(child));
1624 fputc('\n', trace->output);
1626 if (callchain_ret > 0)
1627 trace__fprintf_callchain(trace, sample);
1628 else if (callchain_ret < 0)
1629 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1631 ttrace->entry_pending = false;
1634 thread__put(thread);
/*
 * probe:vfs_getname handler: copies the resolved pathname into the
 * per-thread buffer (marking it pending for sys_exit), and, when a
 * pending entry string reserved a slot (ttrace->filename.ptr), splices
 * the pathname into ttrace->entry_str in place, truncating from the
 * left if it does not fit.
 */
1638 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1639 union perf_event *event __maybe_unused,
1640 struct perf_sample *sample)
1642 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1643 struct thread_trace *ttrace;
1644 size_t filename_len, entry_str_len, to_move;
1645 ssize_t remaining_space;
1647 const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1652 ttrace = thread__priv(thread);
1656 filename_len = strlen(filename);
/* grow the per-thread pathname buffer on demand */
1658 if (ttrace->filename.namelen < filename_len) {
1659 char *f = realloc(ttrace->filename.name, filename_len + 1);
1664 ttrace->filename.namelen = filename_len;
1665 ttrace->filename.name = f;
1668 strcpy(ttrace->filename.name, filename);
/* consumed by trace__sys_exit for open-like syscalls */
1669 ttrace->filename.pending_open = true;
1671 if (!ttrace->filename.ptr)
1674 entry_str_len = strlen(ttrace->entry_str);
1675 remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1676 if (remaining_space <= 0)
/* keep the tail of the path, which is the most informative part */
1679 if (filename_len > (size_t)remaining_space) {
1680 filename += filename_len - remaining_space;
1681 filename_len = remaining_space;
1684 to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1685 pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1686 memmove(pos + filename_len, pos, to_move);
1687 memcpy(pos, filename, filename_len);
1689 ttrace->filename.ptr = 0;
1690 ttrace->filename.entry_str_pos = 0;
/*
 * sched:sched_stat_runtime handler (--summary): accumulate runtime in ms
 * into the per-thread and global totals.  The trailing fprintf appears to
 * be a debug/error path for when thread__trace() fails — the intervening
 * control-flow lines are elided in this chunk.
 */
1695 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1696 union perf_event *event __maybe_unused,
1697 struct perf_sample *sample)
1699 u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1700 double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1701 struct thread *thread = machine__findnew_thread(trace->host,
1704 struct thread_trace *ttrace = thread__trace(thread, trace->output);
1709 ttrace->runtime_ms += runtime_ms;
1710 trace->runtime_ms += runtime_ms;
1711 thread__put(thread);
1715 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1717 perf_evsel__strval(evsel, sample, "comm"),
1718 (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1720 perf_evsel__intval(evsel, sample, "vruntime"));
1721 thread__put(thread);
/*
 * print_binary() callback for BPF output events: prints printable bytes
 * as characters and everything else as '.'; all other binary-printer
 * ops are ignored.
 */
1725 static void bpf_output__printer(enum binary_printer_ops op,
1726 unsigned int val, void *extra)
1728 FILE *output = extra;
1729 unsigned char ch = (unsigned char)val;
1732 case BINARY_PRINT_CHAR_DATA:
1733 fprintf(output, "%c", isprint(ch) ? ch : '.');
/* intentionally no output for the structural/padding ops below */
1735 case BINARY_PRINT_DATA_BEGIN:
1736 case BINARY_PRINT_LINE_BEGIN:
1737 case BINARY_PRINT_ADDR:
1738 case BINARY_PRINT_NUM_DATA:
1739 case BINARY_PRINT_NUM_PAD:
1740 case BINARY_PRINT_SEP:
1741 case BINARY_PRINT_CHAR_PAD:
1742 case BINARY_PRINT_LINE_END:
1743 case BINARY_PRINT_DATA_END:
/* Dump a BPF output event's raw payload to trace->output, 8 bytes per line. */
1749 static void bpf_output__fprintf(struct trace *trace,
1750 struct perf_sample *sample)
1752 print_binary(sample->raw_data, sample->raw_size, 8,
1753 bpf_output__printer, trace->output);
/*
 * Generic handler for non-syscall events added via --event: prints a
 * timestamped line with the event name and either the BPF payload dump
 * or the tracepoint-formatted fields, plus an optional callchain.
 */
1756 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1757 union perf_event *event __maybe_unused,
1758 struct perf_sample *sample)
1760 int callchain_ret = 0;
1762 if (sample->callchain) {
1763 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
/* skip callchains shallower than --min-stack */
1764 if (callchain_ret == 0) {
1765 if (callchain_cursor.nr < trace->min_stack)
1771 trace__printf_interrupted_entry(trace, sample);
1772 trace__fprintf_tstamp(trace, sample->time, trace->output);
/* placeholder duration column to line up with syscall output */
1774 if (trace->trace_syscalls)
1775 fprintf(trace->output, "( ): ");
1777 fprintf(trace->output, "%s:", evsel->name);
1779 if (perf_evsel__is_bpf_output(evsel)) {
1780 bpf_output__fprintf(trace, sample);
1781 } else if (evsel->tp_format) {
1782 event_format__fprintf(evsel->tp_format, sample->cpu,
1783 sample->raw_data, sample->raw_size,
1787 fprintf(trace->output, ")\n");
1789 if (callchain_ret > 0)
1790 trace__fprintf_callchain(trace, sample);
1791 else if (callchain_ret < 0)
1792 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
/*
 * Print a fault location as "dso@symbol+offset" when resolvable (dso and
 * symbol parts gated by the print_dso/print_sym flags or verbose mode),
 * falling back to the raw resolved or sample address.
 */
1797 static void print_location(FILE *f, struct perf_sample *sample,
1798 struct addr_location *al,
1799 bool print_dso, bool print_sym)
1802 if ((verbose || print_dso) && al->map)
1803 fprintf(f, "%s@", al->map->dso->long_name);
1805 if ((verbose || print_sym) && al->sym)
1806 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1807 al->addr - al->sym->start);
1809 fprintf(f, "0x%" PRIx64, al->addr);
1811 fprintf(f, "0x%" PRIx64, sample->addr);
/*
 * Page-fault software-event handler: prints "majfault"/"minfault" with
 * the faulting instruction location and the faulted data address
 * (looked up first in the variable maps, then the function maps),
 * plus an optional callchain.
 */
1814 static int trace__pgfault(struct trace *trace,
1815 struct perf_evsel *evsel,
1816 union perf_event *event __maybe_unused,
1817 struct perf_sample *sample)
1819 struct thread *thread;
1820 struct addr_location al;
1821 char map_type = 'd';
1822 struct thread_trace *ttrace;
1824 int callchain_ret = 0;
1826 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1828 if (sample->callchain) {
1829 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
/* skip callchains shallower than --min-stack */
1830 if (callchain_ret == 0) {
1831 if (callchain_cursor.nr < trace->min_stack)
1837 ttrace = thread__trace(thread, trace->output);
/* distinguish major from minor faults for the stats and the label */
1841 if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1846 if (trace->summary_only)
/* resolve the faulting instruction pointer */
1849 thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
1852 trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
1854 fprintf(trace->output, "%sfault [",
1855 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1858 print_location(trace->output, sample, &al, false, true);
1860 fprintf(trace->output, "] => ");
/* resolve the faulted data address: try data maps first, then code */
1862 thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
1866 thread__find_addr_location(thread, sample->cpumode,
1867 MAP__FUNCTION, sample->addr, &al);
1875 print_location(trace->output, sample, &al, true, false);
1877 fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1879 if (callchain_ret > 0)
1880 trace__fprintf_callchain(trace, sample);
1881 else if (callchain_ret < 0)
1882 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1886 thread__put(thread);
/*
 * Replay-mode filter: a sample is kept when its pid/tid is on the
 * requested lists; when lists were given but neither matches, it is
 * skipped (return lines are elided in this chunk).
 */
1890 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1892 if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1893 (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1896 if (trace->pid_list || trace->tid_list)
/*
 * Record the first usable sample timestamp as the session's base time
 * (used for relative timestamps unless --time/full_time is requested).
 */
1902 static void trace__set_base_time(struct trace *trace,
1903 struct perf_evsel *evsel,
1904 struct perf_sample *sample)
1907 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1908 * and don't use sample->time unconditionally, we may end up having
1909 * some other event in the future without PERF_SAMPLE_TIME for good
1910 * reason, i.e. we may not be interested in its timestamps, just in
1911 * it taking place, picking some piece of information when it
1912 * appears in our event stream (vfs_getname comes to mind).
1914 if (trace->base_time == 0 && !trace->full_time &&
1915 (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1916 trace->base_time = sample->time;
/*
 * perf_tool sample callback used in replay mode: applies the pid/tid
 * skip filter, records the base time, and dispatches to the evsel's
 * registered tracepoint handler.
 */
1919 static int trace__process_sample(struct perf_tool *tool,
1920 union perf_event *event,
1921 struct perf_sample *sample,
1922 struct perf_evsel *evsel,
1923 struct machine *machine __maybe_unused)
1925 struct trace *trace = container_of(tool, struct trace, tool);
1928 tracepoint_handler handler = evsel->handler;
1930 if (skip_sample(trace, sample))
1933 trace__set_base_time(trace, evsel, sample);
1937 handler(trace, evsel, event, sample);
/*
 * Parse the --pid/--tid comma-separated id strings into the intlists
 * used by skip_sample() during replay; logs and fails on parse errors.
 */
1943 static int parse_target_str(struct trace *trace)
1945 if (trace->opts.target.pid) {
1946 trace->pid_list = intlist__new(trace->opts.target.pid);
1947 if (trace->pid_list == NULL) {
1948 pr_err("Error parsing process id string\n");
1953 if (trace->opts.target.tid) {
1954 trace->tid_list = intlist__new(trace->opts.target.tid);
1955 if (trace->tid_list == NULL) {
1956 pr_err("Error parsing thread id string\n");
/*
 * 'perf trace record': build an argv for cmd_record() from fixed record
 * args plus the syscall tracepoints (raw_syscalls on current kernels,
 * syscalls:* on older ones such as RHEL6) and any requested page-fault
 * events, then append the user's remaining arguments.
 */
1964 static int trace__record(struct trace *trace, int argc, const char **argv)
1966 unsigned int rec_argc, i, j;
1967 const char **rec_argv;
1968 const char * const record_args[] = {
1975 const char * const sc_args[] = { "-e", };
1976 unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1977 const char * const majpf_args[] = { "-e", "major-faults" };
1978 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1979 const char * const minpf_args[] = { "-e", "minor-faults" };
1980 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1982 /* +1 is for the event string below */
1983 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1984 majpf_args_nr + minpf_args_nr + argc;
1985 rec_argv = calloc(rec_argc + 1, sizeof(char *));
1987 if (rec_argv == NULL)
1991 for (i = 0; i < ARRAY_SIZE(record_args); i++)
1992 rec_argv[j++] = record_args[i];
1994 if (trace->trace_syscalls) {
1995 for (i = 0; i < sc_args_nr; i++)
1996 rec_argv[j++] = sc_args[i];
1998 /* event string may be different for older kernels - e.g., RHEL6 */
1999 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2000 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2001 else if (is_valid_tracepoint("syscalls:sys_enter"))
2002 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2004 pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2009 if (trace->trace_pgfaults & TRACE_PFMAJ)
2010 for (i = 0; i < majpf_args_nr; i++)
2011 rec_argv[j++] = majpf_args[i];
2013 if (trace->trace_pgfaults & TRACE_PFMIN)
2014 for (i = 0; i < minpf_args_nr; i++)
2015 rec_argv[j++] = minpf_args[i];
/* pass through the user's own options/workload */
2017 for (i = 0; i < (unsigned int)argc; i++)
2018 rec_argv[j++] = argv[i];
2020 return cmd_record(j, rec_argv, NULL);
2023 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
/*
 * Add the probe:vfs_getname tracepoint (if present and carrying a
 * "pathname" field) so open() pathnames can be shown; returns whether
 * it was added.
 */
2025 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2027 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
/* an older probe definition without "pathname" is unusable — drop it */
2032 if (perf_evsel__field(evsel, "pathname") == NULL) {
2033 perf_evsel__delete(evsel);
2037 evsel->handler = trace__vfs_getname;
2038 perf_evlist__add(evlist, evsel);
/*
 * Create a software page-fault evsel (major or minor, per config) with
 * sample_period = 1 so every fault is sampled, handled by trace__pgfault.
 */
2042 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2044 struct perf_evsel *evsel;
2045 struct perf_event_attr attr = {
2046 .type = PERF_TYPE_SOFTWARE,
2050 attr.config = config;
2051 attr.sample_period = 1;
2053 event_attr_init(&attr);
2055 evsel = perf_evsel__new(&attr);
2057 evsel->handler = trace__pgfault;
/*
 * Live-mode event dispatcher: non-sample records go to
 * trace__process_event(); samples are mapped back to their evsel by
 * sample id, sanity-checked for a payload, and routed to the evsel's
 * handler.
 */
2062 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2064 const u32 type = event->header.type;
2065 struct perf_evsel *evsel;
2067 if (type != PERF_RECORD_SAMPLE) {
2068 trace__process_event(trace, trace->host, event, sample);
2072 evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2073 if (evsel == NULL) {
2074 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2078 trace__set_base_time(trace, evsel, sample);
/* a tracepoint sample without raw data cannot be formatted — warn and skip */
2080 if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2081 sample->raw_data == NULL) {
2082 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2083 perf_evsel__name(evsel), sample->tid,
2084 sample->cpu, sample->raw_size);
2086 tracepoint_handler handler = evsel->handler;
2087 handler(trace, evsel, event, sample);
/*
 * Create and add the raw_syscalls:sys_enter/sys_exit tracepoint evsels,
 * wiring their handlers and cached field offsets; stashes them in
 * trace->syscalls.events.  Uses goto-cleanup on partial failure.
 */
2091 static int trace__add_syscall_newtp(struct trace *trace)
2094 struct perf_evlist *evlist = trace->evlist;
2095 struct perf_evsel *sys_enter, *sys_exit;
2097 sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2098 if (sys_enter == NULL)
2101 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2102 goto out_delete_sys_enter;
2104 sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2105 if (sys_exit == NULL)
2106 goto out_delete_sys_enter;
2108 if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2109 goto out_delete_sys_exit;
2111 perf_evlist__add(evlist, sys_enter);
2112 perf_evlist__add(evlist, sys_exit);
2114 if (callchain_param.enabled && !trace->kernel_syscallchains) {
2116 * We're interested only in the user space callchain
2117 * leading to the syscall, allow overriding that for
2118 * debugging reasons using --kernel_syscall_callchains
2120 sys_exit->attr.exclude_callchain_kernel = 1;
2123 trace->syscalls.events.sys_enter = sys_enter;
2124 trace->syscalls.events.sys_exit = sys_exit;
2130 out_delete_sys_exit:
2131 perf_evsel__delete_priv(sys_exit);
2132 out_delete_sys_enter:
2133 perf_evsel__delete_priv(sys_enter);
/*
 * Build an "id in (...)" / "id not in (...)" tracepoint filter from the
 * validated qualifier ids and append it to both the sys_enter and
 * sys_exit evsels' filters.
 */
2137 static int trace__set_ev_qualifier_filter(struct trace *trace)
2140 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2141 trace->ev_qualifier_ids.nr,
2142 trace->ev_qualifier_ids.entries);
2147 if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
2148 err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);
/*
 * Live tracing main loop: set up the evlist (syscall tracepoints,
 * vfs_getname, page faults, sched_stat_runtime for --summary), create
 * thread/cpu maps, configure callchains, open/mmap the events, start
 * the optional forked workload, then consume ring-buffer events until
 * done, finishing with the optional summary.  All error paths funnel
 * into the labelled reporting block at the bottom.
 */
2158 static int trace__run(struct trace *trace, int argc, const char **argv)
2160 struct perf_evlist *evlist = trace->evlist;
2161 struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2163 unsigned long before;
2164 const bool forks = argc > 0;
2165 bool draining = false;
2169 if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2170 goto out_error_raw_syscalls;
2172 if (trace->trace_syscalls)
2173 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2175 if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2176 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2177 if (pgfault_maj == NULL)
2179 perf_evlist__add(evlist, pgfault_maj);
2182 if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2183 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2184 if (pgfault_min == NULL)
2186 perf_evlist__add(evlist, pgfault_min);
2190 perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2191 trace__sched_stat_runtime))
2192 goto out_error_sched_stat_runtime;
2194 err = perf_evlist__create_maps(evlist, &trace->opts.target);
2196 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2197 goto out_delete_evlist;
2200 err = trace__symbols_init(trace, evlist);
2202 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2203 goto out_delete_evlist;
2206 perf_evlist__config(evlist, &trace->opts, NULL);
/* enable callchains only on the evsels that asked for them */
2208 if (callchain_param.enabled) {
2209 bool use_identifier = false;
2211 if (trace->syscalls.events.sys_exit) {
2212 perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2213 &trace->opts, &callchain_param);
2214 use_identifier = true;
2218 perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2219 use_identifier = true;
2223 perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2224 use_identifier = true;
2227 if (use_identifier) {
2229 * Now we have evsels with different sample_ids, use
2230 * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2231 * from a fixed position in each ring buffer record.
2233 * As of this the changeset introducing this comment, this
2234 * isn't strictly needed, as the fields that can come before
2235 * PERF_SAMPLE_ID are all used, but we'll probably disable
2236 * some of those for things like copying the payload of
2237 * pointer syscall arguments, and for vfs_getname we don't
2238 * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2239 * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
2241 perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2242 perf_evlist__reset_sample_bit(evlist, ID);
2246 signal(SIGCHLD, sig_handler);
2247 signal(SIGINT, sig_handler);
2250 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2253 fprintf(trace->output, "Couldn't run the workload!\n");
2254 goto out_delete_evlist;
2258 err = perf_evlist__open(evlist);
2260 goto out_error_open;
2262 err = bpf__apply_obj_config();
2264 char errbuf[BUFSIZ];
2266 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2267 pr_err("ERROR: Apply config to BPF failed: %s\n",
2269 goto out_error_open;
2273 * Better not use !target__has_task() here because we need to cover the
2274 * case where no threads were specified in the command line, but a
2275 * workload was, and in that case we will fill in the thread_map when
2276 * we fork the workload in perf_evlist__prepare_workload.
2278 if (trace->filter_pids.nr > 0)
2279 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
/* system-wide: at least filter out our own pid to avoid feedback loops */
2280 else if (thread_map__pid(evlist->threads, 0) == -1)
2281 err = perf_evlist__set_filter_pid(evlist, getpid());
2286 if (trace->ev_qualifier_ids.nr > 0) {
2287 err = trace__set_ev_qualifier_filter(trace);
2291 pr_debug("event qualifier tracepoint filter: %s\n",
2292 trace->syscalls.events.sys_exit->filter);
2295 err = perf_evlist__apply_filters(evlist, &evsel);
2297 goto out_error_apply_filters;
2299 err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2301 goto out_error_mmap;
2303 if (!target__none(&trace->opts.target))
2304 perf_evlist__enable(evlist);
2307 perf_evlist__start_workload(evlist);
/* the tid column is shown only when more than one thread may appear */
2309 trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2310 evlist->threads->nr > 1 ||
2311 perf_evlist__first(evlist)->attr.inherit;
2313 before = trace->nr_events;
/* drain every mmap'ed ring buffer once per iteration */
2315 for (i = 0; i < evlist->nr_mmaps; i++) {
2316 union perf_event *event;
2318 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2319 struct perf_sample sample;
2323 err = perf_evlist__parse_sample(evlist, event, &sample);
2325 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2329 trace__handle_event(trace, event, &sample);
2331 perf_evlist__mmap_consume(evlist, i);
/* on shutdown, disable events and keep draining buffered records */
2336 if (done && !draining) {
2337 perf_evlist__disable(evlist);
2343 if (trace->nr_events == before) {
2344 int timeout = done ? 100 : -1;
2346 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2347 if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2357 thread__zput(trace->current);
2359 perf_evlist__disable(evlist);
2363 trace__fprintf_thread_summary(trace, trace->output);
2365 if (trace->show_tool_stats) {
2366 fprintf(trace->output, "Stats:\n "
2367 " vfs_getname : %" PRIu64 "\n"
2368 " proc_getname: %" PRIu64 "\n",
2369 trace->stats.vfs_getname,
2370 trace->stats.proc_getname);
2375 perf_evlist__delete(evlist);
2376 trace->evlist = NULL;
2377 trace->live = false;
/* error reporting tail: each label formats errbuf then rejoins cleanup */
2380 char errbuf[BUFSIZ];
2382 out_error_sched_stat_runtime:
2383 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2386 out_error_raw_syscalls:
2387 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2391 perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2395 perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2398 fprintf(trace->output, "%s\n", errbuf);
2399 goto out_delete_evlist;
2401 out_error_apply_filters:
2402 fprintf(trace->output,
2403 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2404 evsel->filter, perf_evsel__name(evsel), errno,
2405 strerror_r(errno, errbuf, sizeof(errbuf)));
2406 goto out_delete_evlist;
2409 fprintf(trace->output, "Not enough memory to run!\n");
2410 goto out_delete_evlist;
2413 fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2414 goto out_delete_evlist;
/*
 * 'perf trace -i file' replay mode: open the perf.data session with
 * ordered events, attach the syscall enter/exit handlers (supporting
 * both raw_syscalls and the older syscalls tracepoints), wire page-fault
 * software events to trace__pgfault, then process the recorded stream
 * and optionally print the summary.
 */
2417 static int trace__replay(struct trace *trace)
2419 const struct perf_evsel_str_handler handlers[] = {
2420 { "probe:vfs_getname", trace__vfs_getname, },
2422 struct perf_data_file file = {
2424 .mode = PERF_DATA_MODE_READ,
2425 .force = trace->force,
2427 struct perf_session *session;
2428 struct perf_evsel *evsel;
2431 trace->tool.sample = trace__process_sample;
2432 trace->tool.mmap = perf_event__process_mmap;
2433 trace->tool.mmap2 = perf_event__process_mmap2;
2434 trace->tool.comm = perf_event__process_comm;
2435 trace->tool.exit = perf_event__process_exit;
2436 trace->tool.fork = perf_event__process_fork;
2437 trace->tool.attr = perf_event__process_attr;
2438 trace->tool.tracing_data = perf_event__process_tracing_data;
2439 trace->tool.build_id = perf_event__process_build_id;
2441 trace->tool.ordered_events = true;
2442 trace->tool.ordering_requires_timestamps = true;
2444 /* add tid to output */
2445 trace->multiple_threads = true;
2447 session = perf_session__new(&file, false, &trace->tool);
2448 if (session == NULL)
2451 if (symbol__init(&session->header.env) < 0)
2454 trace->host = &session->machines.host;
2456 err = perf_session__set_tracepoints_handlers(session, handlers);
2460 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2461 "raw_syscalls:sys_enter");
2462 /* older kernels have syscalls tp versus raw_syscalls */
2464 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2465 "syscalls:sys_enter");
2468 (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2469 perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2470 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2474 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2475 "raw_syscalls:sys_exit");
2477 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2478 "syscalls:sys_exit");
2480 (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2481 perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2482 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
/* route any recorded page-fault software events to the fault printer */
2486 evlist__for_each(session->evlist, evsel) {
2487 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2488 (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2489 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2490 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2491 evsel->handler = trace__pgfault;
2494 err = parse_target_str(trace);
2500 err = perf_session__process_events(session);
2502 pr_err("Failed to process events, error %d", err);
2504 else if (trace->summary)
2505 trace__fprintf_thread_summary(trace, trace->output);
2508 perf_session__delete(session);
/*
 * Emit the banner that precedes the per-thread summary sections.
 * Returns the number of characters written, as reported by fprintf().
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
/*
 * rb_resort helper: re-sorts a thread's intlist of per-syscall stats into
 * an rb tree keyed on total time spent ('msecs').  Each resorted entry
 * carries the syscall id, its stats pointer and the precomputed msecs key.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats *stats;
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats = stats;
	/* Total time = nr_samples * mean, converted from ns to msecs. */
	entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
/*
 * Print one thread's per-syscall statistics table (calls, total, min,
 * avg, max, stddev), with syscalls ordered by total time spent.
 * Returns the number of characters printed.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)

	/* Sort the per-syscall stats intlist by accumulated msecs. */
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, " syscall calls total min avg max stddev\n");
	printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
	printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n");

	resort_rb__for_each(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		/* Stats are accumulated in ns; convert to msecs for display. */
		double min = (double)(stats->min) / NSEC_PER_MSEC;
		double max = (double)(stats->max) / NSEC_PER_MSEC;
		double avg = avg_stats(stats);

		u64 n = (u64) stats->n;

		/* Relative standard deviation, as a percentage of the mean. */
		pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
		avg /= NSEC_PER_MSEC;

		sc = &trace->syscalls.table[syscall_stats_entry->syscall];
		printed += fprintf(fp, " %-15s", sc->name);
		printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
				   n, syscall_stats_entry->msecs, min, avg);
		printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");
/*
 * Print one thread's summary section: a header line with comm, tid,
 * event count, share of all events, fault counts and accumulated
 * runtime, followed by its per-syscall stats table.
 */
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)

	struct thread_trace *ttrace = thread__priv(thread);

	/* This thread's share of all events seen in the session, in percent. */
	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF)

	printed += thread__dump_stats(ttrace, trace, fp);
2607 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2609 return ttrace ? ttrace->nr_events : 0;
/*
 * rb_resort helper: re-sorts the machine's threads keyed on the number
 * of events each thread generated (comparison on thread__nr_events();
 * see rb_resort.h for the resulting traversal order).
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;

	entry->thread = rb_entry(nd, struct thread, rb_node);
/*
 * Print the end-of-run summary: a banner followed by one section per
 * thread, with threads ordered by how many events they generated.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)

	/* Resort the host machine's threads by per-thread event count. */
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");

	resort_rb__for_each(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);
2638 static int trace__set_duration(const struct option *opt, const char *str,
2639 int unset __maybe_unused)
2641 struct trace *trace = opt->value;
2643 trace->duration_filter = atof(str);
/*
 * --filter-pids callback: parse a CSV list of pids into
 * trace->filter_pids.entries.  Our own pid is prepended at entries[0]
 * so that perf trace does not trace itself.
 */
static int trace__set_filter_pids(const struct option *opt, const char *str,
				  int unset __maybe_unused)

	struct trace *trace = opt->value;
	/*
	 * FIXME: introduce a intarray class, plain parse csv and create a
	 * { int nr, int entries[] } struct...
	 */
	struct intlist *list = intlist__new(str);

	/* +1 slot for our own pid, stored at entries[0]. */
	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
	trace->filter_pids.entries = calloc(i, sizeof(pid_t));

	if (trace->filter_pids.entries == NULL)

	trace->filter_pids.entries[0] = getpid();

	/* Copy the user-supplied pids after our own. */
	for (i = 1; i < trace->filter_pids.nr; ++i)
		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;

	intlist__delete(list);
2679 static int trace__open_output(struct trace *trace, const char *filename)
2683 if (!stat(filename, &st) && st.st_size) {
2684 char oldname[PATH_MAX];
2686 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2688 rename(filename, oldname);
2691 trace->output = fopen(filename, "w");
2693 return trace->output == NULL ? -errno : 0;
2696 static int parse_pagefaults(const struct option *opt, const char *str,
2697 int unset __maybe_unused)
2699 int *trace_pgfaults = opt->value;
2701 if (strcmp(str, "all") == 0)
2702 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2703 else if (strcmp(str, "maj") == 0)
2704 *trace_pgfaults |= TRACE_PFMAJ;
2705 else if (strcmp(str, "min") == 0)
2706 *trace_pgfaults |= TRACE_PFMIN;
/* Install the same sample handler on every event in the evlist. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)

	struct perf_evsel *evsel;

	evlist__for_each(evlist, evsel)
		evsel->handler = handler;
2721 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2723 const char *trace_usage[] = {
2724 "perf trace [<options>] [<command>]",
2725 "perf trace [<options>] -- <command> [<options>]",
2726 "perf trace record [<options>] [<command>]",
2727 "perf trace record [<options>] -- <command> [<options>]",
2730 struct trace trace = {
2739 .user_freq = UINT_MAX,
2740 .user_interval = ULLONG_MAX,
2741 .no_buffering = true,
2742 .mmap_pages = UINT_MAX,
2743 .proc_map_timeout = 500,
2747 .trace_syscalls = true,
2748 .kernel_syscallchains = false,
2749 .max_stack = UINT_MAX,
2751 const char *output_name = NULL;
2752 const char *ev_qualifier_str = NULL;
2753 const struct option trace_options[] = {
2754 OPT_CALLBACK(0, "event", &trace.evlist, "event",
2755 "event selector. use 'perf list' to list available events",
2756 parse_events_option),
2757 OPT_BOOLEAN(0, "comm", &trace.show_comm,
2758 "show the thread COMM next to its id"),
2759 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2760 OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
2761 OPT_STRING('o', "output", &output_name, "file", "output file name"),
2762 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2763 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2764 "trace events on existing process id"),
2765 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2766 "trace events on existing thread id"),
2767 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2768 "pids to filter (by the kernel)", trace__set_filter_pids),
2769 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2770 "system-wide collection from all CPUs"),
2771 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2772 "list of cpus to monitor"),
2773 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2774 "child tasks do not inherit counters"),
2775 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2776 "number of mmap data pages",
2777 perf_evlist__parse_mmap_pages),
2778 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2780 OPT_CALLBACK(0, "duration", &trace, "float",
2781 "show only events with duration > N.M ms",
2782 trace__set_duration),
2783 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2784 OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2785 OPT_BOOLEAN('T', "time", &trace.full_time,
2786 "Show full timestamp, not time relative to first start"),
2787 OPT_BOOLEAN('s', "summary", &trace.summary_only,
2788 "Show only syscall summary with statistics"),
2789 OPT_BOOLEAN('S', "with-summary", &trace.summary,
2790 "Show all syscalls and summary with statistics"),
2791 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2792 "Trace pagefaults", parse_pagefaults, "maj"),
2793 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2794 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2795 OPT_CALLBACK(0, "call-graph", &trace.opts,
2796 "record_mode[,record_size]", record_callchain_help,
2797 &record_parse_callchain_opt),
2798 OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2799 "Show the kernel callchains on the syscall exit path"),
2800 OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2801 "Set the minimum stack depth when parsing the callchain, "
2802 "anything below the specified depth will be ignored."),
2803 OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2804 "Set the maximum stack depth when parsing the callchain, "
2805 "anything beyond the specified depth will be ignored. "
2806 "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2807 OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2808 "per thread proc mmap processing timeout in ms"),
2811 bool __maybe_unused max_stack_user_set = true;
2812 bool mmap_pages_user_set = true;
2813 const char * const trace_subcommands[] = { "record", NULL };
2817 signal(SIGSEGV, sighandler_dump_stack);
2818 signal(SIGFPE, sighandler_dump_stack);
2820 trace.evlist = perf_evlist__new();
2821 trace.sctbl = syscalltbl__new();
2823 if (trace.evlist == NULL || trace.sctbl == NULL) {
2824 pr_err("Not enough memory to run!\n");
2829 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2830 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2832 err = bpf__setup_stdout(trace.evlist);
2834 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
2835 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
2841 if (trace.trace_pgfaults) {
2842 trace.opts.sample_address = true;
2843 trace.opts.sample_time = true;
2846 if (trace.opts.mmap_pages == UINT_MAX)
2847 mmap_pages_user_set = false;
2849 if (trace.max_stack == UINT_MAX) {
2850 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
2851 max_stack_user_set = false;
2854 #ifdef HAVE_DWARF_UNWIND_SUPPORT
2855 if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
2856 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
2859 if (callchain_param.enabled) {
2860 if (!mmap_pages_user_set && geteuid() == 0)
2861 trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
2863 symbol_conf.use_callchain = true;
2866 if (trace.evlist->nr_entries > 0)
2867 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2869 if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2870 return trace__record(&trace, argc-1, &argv[1]);
2872 /* summary_only implies summary option, but don't overwrite summary if set */
2873 if (trace.summary_only)
2874 trace.summary = trace.summary_only;
2876 if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2877 trace.evlist->nr_entries == 0 /* Was --events used? */) {
2878 pr_err("Please specify something to trace.\n");
2882 if (!trace.trace_syscalls && ev_qualifier_str) {
2883 pr_err("The -e option can't be used with --no-syscalls.\n");
2887 if (output_name != NULL) {
2888 err = trace__open_output(&trace, output_name);
2890 perror("failed to create output file");
2895 trace.open_id = syscalltbl__id(trace.sctbl, "open");
2897 if (ev_qualifier_str != NULL) {
2898 const char *s = ev_qualifier_str;
2899 struct strlist_config slist_config = {
2900 .dirname = system_path(STRACE_GROUPS_DIR),
2903 trace.not_ev_qualifier = *s == '!';
2904 if (trace.not_ev_qualifier)
2906 trace.ev_qualifier = strlist__new(s, &slist_config);
2907 if (trace.ev_qualifier == NULL) {
2908 fputs("Not enough memory to parse event qualifier",
2914 err = trace__validate_ev_qualifier(&trace);
2919 err = target__validate(&trace.opts.target);
2921 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2922 fprintf(trace.output, "%s", bf);
2926 err = target__parse_uid(&trace.opts.target);
2928 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2929 fprintf(trace.output, "%s", bf);
2933 if (!argc && target__none(&trace.opts.target))
2934 trace.opts.target.system_wide = true;
2937 err = trace__replay(&trace);
2939 err = trace__run(&trace, argc, argv);
2942 if (output_name != NULL)
2943 fclose(trace.output);