1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/pmu.h"
50 #include "util/pmus.h"
51 #include "util/clockid.h"
52 #include "util/off_cpu.h"
53 #include "util/bpf-filter.h"
54 #include "asm/bug.h"
55 #include "perf.h"
56 #include "cputopo.h"
57
58 #include <errno.h>
59 #include <inttypes.h>
60 #include <locale.h>
61 #include <poll.h>
62 #include <pthread.h>
63 #include <unistd.h>
64 #ifndef HAVE_GETTID
65 #include <syscall.h>
66 #endif
67 #include <sched.h>
68 #include <signal.h>
69 #ifdef HAVE_EVENTFD_SUPPORT
70 #include <sys/eventfd.h>
71 #endif
72 #include <sys/mman.h>
73 #include <sys/wait.h>
74 #include <sys/types.h>
75 #include <sys/stat.h>
76 #include <fcntl.h>
77 #include <linux/err.h>
78 #include <linux/string.h>
79 #include <linux/time64.h>
80 #include <linux/zalloc.h>
81 #include <linux/bitmap.h>
82 #include <sys/time.h>
83
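/*
 * State parsed from the record --switch-output[=<signal|size|time>] option:
 * rotate (switch) the output file on SIGUSR2, once it grows past a size
 * threshold, or on a time interval.  'filenames' tracks the files generated
 * so far so that the oldest can be removed when a maximum number of files
 * is requested (--switch-max-files).  Illustrative usage:
 *
 *   perf record --switch-output=1G -e cycles -a
 */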
84 struct switch_output {
85         bool             enabled;
86         bool             signal;
87         unsigned long    size;
88         unsigned long    time;
89         const char      *str;
90         bool             set;
91         char             **filenames;
92         int              num_files;
93         int              cur_file;
94 };
95
96 struct thread_mask {
97         struct mmap_cpu_mask    maps;
98         struct mmap_cpu_mask    affinity;
99 };
100
101 struct record_thread {
102         pid_t                   tid;
103         struct thread_mask      *mask;
104         struct {
105                 int             msg[2];
106                 int             ack[2];
107         } pipes;
108         struct fdarray          pollfd;
109         int                     ctlfd_pos;
110         int                     nr_mmaps;
111         struct mmap             **maps;
112         struct mmap             **overwrite_maps;
113         struct record           *rec;
114         unsigned long long      samples;
115         unsigned long           waking;
116         u64                     bytes_written;
117         u64                     bytes_transferred;
118         u64                     bytes_compressed;
119 };
120
121 static __thread struct record_thread *thread;
122
123 enum thread_msg {
124         THREAD_MSG__UNDEFINED = 0,
125         THREAD_MSG__READY,
126         THREAD_MSG__MAX,
127 };
128
129 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
130         "UNDEFINED", "READY"
131 };
132
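/*
 * Thread layouts accepted by the --threads[=<spec>] option: one writer
 * thread per monitored CPU, per core, per package (socket), per NUMA node,
 * or an explicit user-provided pair of CPU masks (trace buffers mask plus
 * affinity mask).
 */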
133 enum thread_spec {
134         THREAD_SPEC__UNDEFINED = 0,
135         THREAD_SPEC__CPU,
136         THREAD_SPEC__CORE,
137         THREAD_SPEC__PACKAGE,
138         THREAD_SPEC__NUMA,
139         THREAD_SPEC__USER,
140         THREAD_SPEC__MAX,
141 };
142
143 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
144         "undefined", "cpu", "core", "package", "numa", "user"
145 };
146
147 struct pollfd_index_map {
148         int evlist_pollfd_index;
149         int thread_pollfd_index;
150 };
151
152 struct record {
153         struct perf_tool        tool;
154         struct record_opts      opts;
155         u64                     bytes_written;
156         u64                     thread_bytes_written;
157         struct perf_data        data;
158         struct auxtrace_record  *itr;
159         struct evlist   *evlist;
160         struct perf_session     *session;
161         struct evlist           *sb_evlist;
162         pthread_t               thread_id;
163         int                     realtime_prio;
164         bool                    switch_output_event_set;
165         bool                    no_buildid;
166         bool                    no_buildid_set;
167         bool                    no_buildid_cache;
168         bool                    no_buildid_cache_set;
169         bool                    buildid_all;
170         bool                    buildid_mmap;
171         bool                    timestamp_filename;
172         bool                    timestamp_boundary;
173         bool                    off_cpu;
174         struct switch_output    switch_output;
175         unsigned long long      samples;
176         unsigned long           output_max_size;        /* = 0: unlimited */
177         struct perf_debuginfod  debuginfod;
178         int                     nr_threads;
179         struct thread_mask      *thread_masks;
180         struct record_thread    *thread_data;
181         struct pollfd_index_map *index_map;
182         size_t                  index_map_sz;
183         size_t                  index_map_cnt;
184 };
185
186 static volatile int done;
187
188 static volatile int auxtrace_record__snapshot_started;
189 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
190 static DEFINE_TRIGGER(switch_output_trigger);
191
192 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
193         "SYS", "NODE", "CPU"
194 };
195
196 #ifndef HAVE_GETTID
197 static inline pid_t gettid(void)
198 {
199         return (pid_t)syscall(__NR_gettid);
200 }
201 #endif
202
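/*
 * Non-zero when parallel trace streaming was requested (--threads): data is
 * then written by several writer threads into a perf.data directory rather
 * than by the single main thread into one perf.data file.
 */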
203 static int record__threads_enabled(struct record *rec)
204 {
205         return rec->opts.threads_spec;
206 }
207
208 static bool switch_output_signal(struct record *rec)
209 {
210         return rec->switch_output.signal &&
211                trigger_is_ready(&switch_output_trigger);
212 }
213
214 static bool switch_output_size(struct record *rec)
215 {
216         return rec->switch_output.size &&
217                trigger_is_ready(&switch_output_trigger) &&
218                (rec->bytes_written >= rec->switch_output.size);
219 }
220
221 static bool switch_output_time(struct record *rec)
222 {
223         return rec->switch_output.time &&
224                trigger_is_ready(&switch_output_trigger);
225 }
226
227 static u64 record__bytes_written(struct record *rec)
228 {
229         return rec->bytes_written + rec->thread_bytes_written;
230 }
231
232 static bool record__output_max_size_exceeded(struct record *rec)
233 {
234         return rec->output_max_size &&
235                (record__bytes_written(rec) >= rec->output_max_size);
236 }
237
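/*
 * Write a block of bytes to the output.  In threaded (parallel streaming)
 * mode each mmap carries its own output file (map->file) and the bytes are
 * accounted per thread; otherwise everything goes into the single perf.data
 * file.  The running byte count drives the --max-size limit and the
 * size-based --switch-output check below.
 */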
238 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
239                          void *bf, size_t size)
240 {
241         struct perf_data_file *file = &rec->session->data->file;
242
243         if (map && map->file)
244                 file = map->file;
245
246         if (perf_data_file__write(file, bf, size) < 0) {
247                 pr_err("failed to write perf data, error: %m\n");
248                 return -1;
249         }
250
251         if (map && map->file) {
252                 thread->bytes_written += size;
253                 rec->thread_bytes_written += size;
254         } else {
255                 rec->bytes_written += size;
256         }
257
258         if (record__output_max_size_exceeded(rec) && !done) {
259                 fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
260                                 " stopping session ]\n",
261                                 record__bytes_written(rec) >> 10);
262                 done = 1;
263         }
264
265         if (switch_output_size(rec))
266                 trigger_hit(&switch_output_trigger);
267
268         return 0;
269 }
270
271 static int record__aio_enabled(struct record *rec);
272 static int record__comp_enabled(struct record *rec);
273 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
274                             void *dst, size_t dst_size, void *src, size_t src_size);
275
276 #ifdef HAVE_AIO_SUPPORT
277 static int record__aio_write(struct aiocb *cblock, int trace_fd,
278                 void *buf, size_t size, off_t off)
279 {
280         int rc;
281
282         cblock->aio_fildes = trace_fd;
283         cblock->aio_buf    = buf;
284         cblock->aio_nbytes = size;
285         cblock->aio_offset = off;
286         cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
287
288         do {
289                 rc = aio_write(cblock);
290                 if (rc == 0) {
291                         break;
292                 } else if (errno != EAGAIN) {
293                         cblock->aio_fildes = -1;
294                         pr_err("failed to queue perf data, error: %m\n");
295                         break;
296                 }
297         } while (1);
298
299         return rc;
300 }
301
302 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
303 {
304         void *rem_buf;
305         off_t rem_off;
306         size_t rem_size;
307         int rc, aio_errno;
308         ssize_t aio_ret, written;
309
310         aio_errno = aio_error(cblock);
311         if (aio_errno == EINPROGRESS)
312                 return 0;
313
314         written = aio_ret = aio_return(cblock);
315         if (aio_ret < 0) {
316                 if (aio_errno != EINTR)
317                         pr_err("failed to write perf data, error: %m\n");
318                 written = 0;
319         }
320
321         rem_size = cblock->aio_nbytes - written;
322
323         if (rem_size == 0) {
324                 cblock->aio_fildes = -1;
325                 /*
326                  * md->refcount is incremented in record__aio_pushfn() for
327                  * every aio write request started in record__aio_push() so
328                  * decrement it because the request is now complete.
329                  */
330                 perf_mmap__put(&md->core);
331                 rc = 1;
332         } else {
333                 /*
334                  * An aio write request may need to be restarted with the
335                  * remainder if the kernel didn't write the whole
336                  * chunk at once.
337                  */
338                 rem_off = cblock->aio_offset + written;
339                 rem_buf = (void *)(cblock->aio_buf + written);
340                 record__aio_write(cblock, cblock->aio_fildes,
341                                 rem_buf, rem_size, rem_off);
342                 rc = 0;
343         }
344
345         return rc;
346 }
347
348 static int record__aio_sync(struct mmap *md, bool sync_all)
349 {
350         struct aiocb **aiocb = md->aio.aiocb;
351         struct aiocb *cblocks = md->aio.cblocks;
352         struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
353         int i, do_suspend;
354
355         do {
356                 do_suspend = 0;
357                 for (i = 0; i < md->aio.nr_cblocks; ++i) {
358                         if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
359                                 if (sync_all)
360                                         aiocb[i] = NULL;
361                                 else
362                                         return i;
363                         } else {
364                                 /*
365                                  * The started aio write is not complete yet,
366                                  * so it has to be waited on before the
367                                  * next allocation.
368                                  */
369                                 aiocb[i] = &cblocks[i];
370                                 do_suspend = 1;
371                         }
372                 }
373                 if (!do_suspend)
374                         return -1;
375
376                 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
377                         if (!(errno == EAGAIN || errno == EINTR))
378                                 pr_err("failed to sync perf data, error: %m\n");
379                 }
380         } while (1);
381 }
382
383 struct record_aio {
384         struct record   *rec;
385         void            *data;
386         size_t          size;
387 };
388
389 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
390 {
391         struct record_aio *aio = to;
392
393         /*
394          * The map->core.base data pointed to by buf is copied into a free
395          * map->aio.data[] buffer to release space in the kernel buffer as fast as
396          * possible, via perf_mmap__consume() called from perf_mmap__push().
397          *
398          * That lets the kernel proceed with storing more profiling data into
399          * the kernel buffer earlier than other per-cpu kernel buffers are handled.
400          *
401          * Copying can be done in two steps in case the chunk of profiling data
402          * crosses the upper bound of the kernel buffer. In this case we first move
403          * part of the data from map->start till the upper bound and then the
404          * remainder from the beginning of the kernel buffer till the end of the data chunk.
405          */
406
407         if (record__comp_enabled(aio->rec)) {
408                 ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
409                                                    mmap__mmap_len(map) - aio->size,
410                                                    buf, size);
411                 if (compressed < 0)
412                         return (int)compressed;
413
414                 size = compressed;
415         } else {
416                 memcpy(aio->data + aio->size, buf, size);
417         }
418
419         if (!aio->size) {
420                 /*
421                  * Increment map->refcount to guard the map->aio.data[] buffer
422                  * from premature deallocation, because the map object can be
423                  * released before the aio write request started on the
424                  * map->aio.data[] buffer completes.
425                  *
426                  * perf_mmap__put() is done in record__aio_complete() once the
427                  * started aio request completes, or in record__aio_push() if
428                  * the request failed to start.
429                  */
430                 perf_mmap__get(&map->core);
431         }
432
433         aio->size += size;
434
435         return size;
436 }
437
438 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
439 {
440         int ret, idx;
441         int trace_fd = rec->session->data->file.fd;
442         struct record_aio aio = { .rec = rec, .size = 0 };
443
444         /*
445          * Call record__aio_sync() to wait till a map->aio.data[] buffer
446          * becomes available after the previous aio write operation.
447          */
448
449         idx = record__aio_sync(map, false);
450         aio.data = map->aio.data[idx];
451         ret = perf_mmap__push(map, &aio, record__aio_pushfn);
452         if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
453                 return ret;
454
455         rec->samples++;
456         ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
457         if (!ret) {
458                 *off += aio.size;
459                 rec->bytes_written += aio.size;
460                 if (switch_output_size(rec))
461                         trigger_hit(&switch_output_trigger);
462         } else {
463                 /*
464                  * Decrement map->refcount incremented in record__aio_pushfn()
465                  * back if record__aio_write() operation failed to start, otherwise
466                  * map->refcount is decremented in record__aio_complete() after
467                  * aio write operation finishes successfully.
468                  */
469                 perf_mmap__put(&map->core);
470         }
471
472         return ret;
473 }
474
475 static off_t record__aio_get_pos(int trace_fd)
476 {
477         return lseek(trace_fd, 0, SEEK_CUR);
478 }
479
480 static void record__aio_set_pos(int trace_fd, off_t pos)
481 {
482         lseek(trace_fd, pos, SEEK_SET);
483 }
484
485 static void record__aio_mmap_read_sync(struct record *rec)
486 {
487         int i;
488         struct evlist *evlist = rec->evlist;
489         struct mmap *maps = evlist->mmap;
490
491         if (!record__aio_enabled(rec))
492                 return;
493
494         for (i = 0; i < evlist->core.nr_mmaps; i++) {
495                 struct mmap *map = &maps[i];
496
497                 if (map->core.base)
498                         record__aio_sync(map, true);
499         }
500 }
501
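/*
 * Default and upper bound for the number of POSIX AIO control blocks
 * (in-flight asynchronous writes) per mmap, as selected by the record
 * --aio[=n] option parsed in record__aio_parse() below.
 */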
502 static int nr_cblocks_default = 1;
503 static int nr_cblocks_max = 4;
504
505 static int record__aio_parse(const struct option *opt,
506                              const char *str,
507                              int unset)
508 {
509         struct record_opts *opts = (struct record_opts *)opt->value;
510
511         if (unset) {
512                 opts->nr_cblocks = 0;
513         } else {
514                 if (str)
515                         opts->nr_cblocks = strtol(str, NULL, 0);
516                 if (!opts->nr_cblocks)
517                         opts->nr_cblocks = nr_cblocks_default;
518         }
519
520         return 0;
521 }
522 #else /* HAVE_AIO_SUPPORT */
523 static int nr_cblocks_max = 0;
524
525 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
526                             off_t *off __maybe_unused)
527 {
528         return -1;
529 }
530
531 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
532 {
533         return -1;
534 }
535
536 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
537 {
538 }
539
540 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
541 {
542 }
543 #endif
544
545 static int record__aio_enabled(struct record *rec)
546 {
547         return rec->opts.nr_cblocks > 0;
548 }
549
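/*
 * --mmap-flush accepts either a plain byte count or a value with a B/K/M/G
 * suffix; it sets the minimum number of bytes that must accumulate in a ring
 * buffer before it is written out, clamped to a quarter of the mmap buffer
 * size.  Illustrative usage:
 *
 *   perf record --mmap-flush=16M -e cycles -a
 */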
550 #define MMAP_FLUSH_DEFAULT 1
551 static int record__mmap_flush_parse(const struct option *opt,
552                                     const char *str,
553                                     int unset)
554 {
555         int flush_max;
556         struct record_opts *opts = (struct record_opts *)opt->value;
557         static struct parse_tag tags[] = {
558                         { .tag  = 'B', .mult = 1       },
559                         { .tag  = 'K', .mult = 1 << 10 },
560                         { .tag  = 'M', .mult = 1 << 20 },
561                         { .tag  = 'G', .mult = 1 << 30 },
562                         { .tag  = 0 },
563         };
564
565         if (unset)
566                 return 0;
567
568         if (str) {
569                 opts->mmap_flush = parse_tag_value(str, tags);
570                 if (opts->mmap_flush == (int)-1)
571                         opts->mmap_flush = strtol(str, NULL, 0);
572         }
573
574         if (!opts->mmap_flush)
575                 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
576
577         flush_max = evlist__mmap_size(opts->mmap_pages);
578         flush_max /= 4;
579         if (opts->mmap_flush > flush_max)
580                 opts->mmap_flush = flush_max;
581
582         return 0;
583 }
584
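/*
 * Trace compression (-z/--compression-level[=n]) uses Zstandard: level 1 is
 * the default when the option is given without a value, and levels up to
 * comp_level_max (22) trade more CPU time for a smaller perf.data.
 */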
585 #ifdef HAVE_ZSTD_SUPPORT
586 static unsigned int comp_level_default = 1;
587
588 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
589 {
590         struct record_opts *opts = opt->value;
591
592         if (unset) {
593                 opts->comp_level = 0;
594         } else {
595                 if (str)
596                         opts->comp_level = strtol(str, NULL, 0);
597                 if (!opts->comp_level)
598                         opts->comp_level = comp_level_default;
599         }
600
601         return 0;
602 }
603 #endif
604 static unsigned int comp_level_max = 22;
605
606 static int record__comp_enabled(struct record *rec)
607 {
608         return rec->opts.comp_level > 0;
609 }
610
611 static int process_synthesized_event(struct perf_tool *tool,
612                                      union perf_event *event,
613                                      struct perf_sample *sample __maybe_unused,
614                                      struct machine *machine __maybe_unused)
615 {
616         struct record *rec = container_of(tool, struct record, tool);
617         return record__write(rec, NULL, event, event->header.size);
618 }
619
620 static struct mutex synth_lock;
621
622 static int process_locked_synthesized_event(struct perf_tool *tool,
623                                      union perf_event *event,
624                                      struct perf_sample *sample __maybe_unused,
625                                      struct machine *machine __maybe_unused)
626 {
627         int ret;
628
629         mutex_lock(&synth_lock);
630         ret = process_synthesized_event(tool, event, sample, machine);
631         mutex_unlock(&synth_lock);
632         return ret;
633 }
634
635 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
636 {
637         struct record *rec = to;
638
639         if (record__comp_enabled(rec)) {
640                 ssize_t compressed = zstd_compress(rec->session, map, map->data,
641                                                    mmap__mmap_len(map), bf, size);
642
643                 if (compressed < 0)
644                         return (int)compressed;
645
646                 size = compressed;
647                 bf   = map->data;
648         }
649
650         thread->samples++;
651         return record__write(rec, map, bf, size);
652 }
653
654 static volatile sig_atomic_t signr = -1;
655 static volatile sig_atomic_t child_finished;
656 #ifdef HAVE_EVENTFD_SUPPORT
657 static volatile sig_atomic_t done_fd = -1;
658 #endif
659
660 static void sig_handler(int sig)
661 {
662         if (sig == SIGCHLD)
663                 child_finished = 1;
664         else
665                 signr = sig;
666
667         done = 1;
668 #ifdef HAVE_EVENTFD_SUPPORT
669         if (done_fd >= 0) {
670                 u64 tmp = 1;
671                 int orig_errno = errno;
672
673                 /*
674                  * It is possible for this signal handler to run after done is
675                  * checked in the main loop, but before the perf counter fds are
676                  * polled. If this happens, the poll() will continue to wait
677                  * even though done is set, and will only break out if either
678                  * another signal is received, or the counters are ready for
679                  * read. To ensure the poll() doesn't sleep when done is set,
680                  * use an eventfd (done_fd) to wake up the poll().
681                  */
682                 if (write(done_fd, &tmp, sizeof(tmp)) < 0)
683                         pr_err("failed to signal wakeup fd, error: %m\n");
684
685                 errno = orig_errno;
686         }
687 #endif // HAVE_EVENTFD_SUPPORT
688 }
689
690 static void sigsegv_handler(int sig)
691 {
692         perf_hooks__recover();
693         sighandler_dump_stack(sig);
694 }
695
696 static void record__sig_exit(void)
697 {
698         if (signr == -1)
699                 return;
700
701         signal(signr, SIG_DFL);
702         raise(signr);
703 }
704
705 #ifdef HAVE_AUXTRACE_SUPPORT
706
707 static int record__process_auxtrace(struct perf_tool *tool,
708                                     struct mmap *map,
709                                     union perf_event *event, void *data1,
710                                     size_t len1, void *data2, size_t len2)
711 {
712         struct record *rec = container_of(tool, struct record, tool);
713         struct perf_data *data = &rec->data;
714         size_t padding;
715         u8 pad[8] = {0};
716
717         if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
718                 off_t file_offset;
719                 int fd = perf_data__fd(data);
720                 int err;
721
722                 file_offset = lseek(fd, 0, SEEK_CUR);
723                 if (file_offset == -1)
724                         return -1;
725                 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
726                                                      event, file_offset);
727                 if (err)
728                         return err;
729         }
730
731         /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
732         padding = (len1 + len2) & 7;
733         if (padding)
734                 padding = 8 - padding;
735
736         record__write(rec, map, event, event->header.size);
737         record__write(rec, map, data1, len1);
738         if (len2)
739                 record__write(rec, map, data2, len2);
740         record__write(rec, map, &pad, padding);
741
742         return 0;
743 }
744
745 static int record__auxtrace_mmap_read(struct record *rec,
746                                       struct mmap *map)
747 {
748         int ret;
749
750         ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
751                                   record__process_auxtrace);
752         if (ret < 0)
753                 return ret;
754
755         if (ret)
756                 rec->samples++;
757
758         return 0;
759 }
760
761 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
762                                                struct mmap *map)
763 {
764         int ret;
765
766         ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
767                                            record__process_auxtrace,
768                                            rec->opts.auxtrace_snapshot_size);
769         if (ret < 0)
770                 return ret;
771
772         if (ret)
773                 rec->samples++;
774
775         return 0;
776 }
777
778 static int record__auxtrace_read_snapshot_all(struct record *rec)
779 {
780         int i;
781         int rc = 0;
782
783         for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
784                 struct mmap *map = &rec->evlist->mmap[i];
785
786                 if (!map->auxtrace_mmap.base)
787                         continue;
788
789                 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
790                         rc = -1;
791                         goto out;
792                 }
793         }
794 out:
795         return rc;
796 }
797
798 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
799 {
800         pr_debug("Recording AUX area tracing snapshot\n");
801         if (record__auxtrace_read_snapshot_all(rec) < 0) {
802                 trigger_error(&auxtrace_snapshot_trigger);
803         } else {
804                 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
805                         trigger_error(&auxtrace_snapshot_trigger);
806                 else
807                         trigger_ready(&auxtrace_snapshot_trigger);
808         }
809 }
810
811 static int record__auxtrace_snapshot_exit(struct record *rec)
812 {
813         if (trigger_is_error(&auxtrace_snapshot_trigger))
814                 return 0;
815
816         if (!auxtrace_record__snapshot_started &&
817             auxtrace_record__snapshot_start(rec->itr))
818                 return -1;
819
820         record__read_auxtrace_snapshot(rec, true);
821         if (trigger_is_error(&auxtrace_snapshot_trigger))
822                 return -1;
823
824         return 0;
825 }
826
827 static int record__auxtrace_init(struct record *rec)
828 {
829         int err;
830
831         if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
832             && record__threads_enabled(rec)) {
833                 pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
834                 return -EINVAL;
835         }
836
837         if (!rec->itr) {
838                 rec->itr = auxtrace_record__init(rec->evlist, &err);
839                 if (err)
840                         return err;
841         }
842
843         err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
844                                               rec->opts.auxtrace_snapshot_opts);
845         if (err)
846                 return err;
847
848         err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
849                                             rec->opts.auxtrace_sample_opts);
850         if (err)
851                 return err;
852
853         auxtrace_regroup_aux_output(rec->evlist);
854
855         return auxtrace_parse_filters(rec->evlist);
856 }
857
858 #else
859
860 static inline
861 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
862                                struct mmap *map __maybe_unused)
863 {
864         return 0;
865 }
866
867 static inline
868 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
869                                     bool on_exit __maybe_unused)
870 {
871 }
872
873 static inline
874 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
875 {
876         return 0;
877 }
878
879 static inline
880 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
881 {
882         return 0;
883 }
884
885 static int record__auxtrace_init(struct record *rec __maybe_unused)
886 {
887         return 0;
888 }
889
890 #endif
891
892 static int record__config_text_poke(struct evlist *evlist)
893 {
894         struct evsel *evsel;
895
896         /* Nothing to do if text poke is already configured */
897         evlist__for_each_entry(evlist, evsel) {
898                 if (evsel->core.attr.text_poke)
899                         return 0;
900         }
901
902         evsel = evlist__add_dummy_on_all_cpus(evlist);
903         if (!evsel)
904                 return -ENOMEM;
905
906         evsel->core.attr.text_poke = 1;
907         evsel->core.attr.ksymbol = 1;
908         evsel->immediate = true;
909         evsel__set_sample_bit(evsel, TIME);
910
911         return 0;
912 }
913
914 static int record__config_off_cpu(struct record *rec)
915 {
916         return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
917 }
918
919 static bool record__tracking_system_wide(struct record *rec)
920 {
921         struct evlist *evlist = rec->evlist;
922         struct evsel *evsel;
923
924         /*
925          * If a non-dummy evsel exists, system-wide sideband is needed to
926          * help parse sample information, e.g. PERF_RECORD_MMAP events to
927          * help resolve symbols and PERF_RECORD_COMM events to get the
928          * task's executable name.
929          */
930         evlist__for_each_entry(evlist, evsel) {
931                 if (!evsel__is_dummy_event(evsel))
932                         return true;
933         }
934
935         return false;
936 }
937
938 static int record__config_tracking_events(struct record *rec)
939 {
940         struct record_opts *opts = &rec->opts;
941         struct evlist *evlist = rec->evlist;
942         bool system_wide = false;
943         struct evsel *evsel;
944
945         /*
946          * For initial_delay, system wide or a hybrid system, we need to add
947          * a tracking event so that we can track PERF_RECORD_MMAP events to
948          * cover the delay of waiting or of event synthesis.
949          */
950         if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
951             perf_pmus__num_core_pmus() > 1) {
952
953                 /*
954                  * User space tasks can migrate between CPUs, so when tracing
955                  * selected CPUs, sideband for all CPUs is still needed.
956                  */
957                 if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
958                         system_wide = true;
959
960                 evsel = evlist__findnew_tracking_event(evlist, system_wide);
961                 if (!evsel)
962                         return -ENOMEM;
963
964                 /*
965                  * Enable the tracking event on exec of the forked workload for
966                  * initial_delay, and immediately for system wide.
967                  */
968                 if (opts->target.initial_delay && !evsel->immediate &&
969                     !target__has_cpu(&opts->target))
970                         evsel->core.attr.enable_on_exec = 1;
971                 else
972                         evsel->immediate = 1;
973         }
974
975         return 0;
976 }
977
978 static bool record__kcore_readable(struct machine *machine)
979 {
980         char kcore[PATH_MAX];
981         int fd;
982
983         scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
984
985         fd = open(kcore, O_RDONLY);
986         if (fd < 0)
987                 return false;
988
989         close(fd);
990
991         return true;
992 }
993
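/*
 * For the --kcore option: copy /proc/kcore (together with kallsyms and
 * modules) into a kcore_dir inside the perf.data directory, so the kernel
 * object code present at record time can be inspected later.
 */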
994 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
995 {
996         char from_dir[PATH_MAX];
997         char kcore_dir[PATH_MAX];
998         int ret;
999
1000         snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
1001
1002         ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1003         if (ret)
1004                 return ret;
1005
1006         return kcore_copy(from_dir, kcore_dir);
1007 }
1008
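/*
 * Each worker thread owns a msg/ack pipe pair for a small handshake with the
 * main thread: control messages travel over 'msg' towards the worker, and the
 * worker reports back (e.g. THREAD_MSG__READY) over 'ack'.
 */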
1009 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1010 {
1011         thread_data->pipes.msg[0] = -1;
1012         thread_data->pipes.msg[1] = -1;
1013         thread_data->pipes.ack[0] = -1;
1014         thread_data->pipes.ack[1] = -1;
1015 }
1016
1017 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1018 {
1019         if (pipe(thread_data->pipes.msg))
1020                 return -EINVAL;
1021
1022         if (pipe(thread_data->pipes.ack)) {
1023                 close(thread_data->pipes.msg[0]);
1024                 thread_data->pipes.msg[0] = -1;
1025                 close(thread_data->pipes.msg[1]);
1026                 thread_data->pipes.msg[1] = -1;
1027                 return -EINVAL;
1028         }
1029
1030         pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1031                  thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1032                  thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1033
1034         return 0;
1035 }
1036
1037 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1038 {
1039         if (thread_data->pipes.msg[0] != -1) {
1040                 close(thread_data->pipes.msg[0]);
1041                 thread_data->pipes.msg[0] = -1;
1042         }
1043         if (thread_data->pipes.msg[1] != -1) {
1044                 close(thread_data->pipes.msg[1]);
1045                 thread_data->pipes.msg[1] = -1;
1046         }
1047         if (thread_data->pipes.ack[0] != -1) {
1048                 close(thread_data->pipes.ack[0]);
1049                 thread_data->pipes.ack[0] = -1;
1050         }
1051         if (thread_data->pipes.ack[1] != -1) {
1052                 close(thread_data->pipes.ack[1]);
1053                 thread_data->pipes.ack[1] = -1;
1054         }
1055 }
1056
1057 static bool evlist__per_thread(struct evlist *evlist)
1058 {
1059         return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1060 }
1061
1062 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1063 {
1064         int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1065         struct mmap *mmap = evlist->mmap;
1066         struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1067         struct perf_cpu_map *cpus = evlist->core.all_cpus;
1068         bool per_thread = evlist__per_thread(evlist);
1069
1070         if (per_thread)
1071                 thread_data->nr_mmaps = nr_mmaps;
1072         else
1073                 thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1074                                                       thread_data->mask->maps.nbits);
1075         if (mmap) {
1076                 thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1077                 if (!thread_data->maps)
1078                         return -ENOMEM;
1079         }
1080         if (overwrite_mmap) {
1081                 thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1082                 if (!thread_data->overwrite_maps) {
1083                         zfree(&thread_data->maps);
1084                         return -ENOMEM;
1085                 }
1086         }
1087         pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1088                  thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1089
1090         for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1091                 if (per_thread ||
1092                     test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1093                         if (thread_data->maps) {
1094                                 thread_data->maps[tm] = &mmap[m];
1095                                 pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1096                                           thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1097                         }
1098                         if (thread_data->overwrite_maps) {
1099                                 thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1100                                 pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1101                                           thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1102                         }
1103                         tm++;
1104                 }
1105         }
1106
1107         return 0;
1108 }
1109
1110 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1111 {
1112         int f, tm, pos;
1113         struct mmap *map, *overwrite_map;
1114
1115         fdarray__init(&thread_data->pollfd, 64);
1116
1117         for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1118                 map = thread_data->maps ? thread_data->maps[tm] : NULL;
1119                 overwrite_map = thread_data->overwrite_maps ?
1120                                 thread_data->overwrite_maps[tm] : NULL;
1121
1122                 for (f = 0; f < evlist->core.pollfd.nr; f++) {
1123                         void *ptr = evlist->core.pollfd.priv[f].ptr;
1124
1125                         if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1126                                 pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1127                                                               &evlist->core.pollfd);
1128                                 if (pos < 0)
1129                                         return pos;
1130                                 pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1131                                          thread_data, pos, evlist->core.pollfd.entries[f].fd);
1132                         }
1133                 }
1134         }
1135
1136         return 0;
1137 }
1138
1139 static void record__free_thread_data(struct record *rec)
1140 {
1141         int t;
1142         struct record_thread *thread_data = rec->thread_data;
1143
1144         if (thread_data == NULL)
1145                 return;
1146
1147         for (t = 0; t < rec->nr_threads; t++) {
1148                 record__thread_data_close_pipes(&thread_data[t]);
1149                 zfree(&thread_data[t].maps);
1150                 zfree(&thread_data[t].overwrite_maps);
1151                 fdarray__exit(&thread_data[t].pollfd);
1152         }
1153
1154         zfree(&rec->thread_data);
1155 }
1156
1157 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1158                                                     int evlist_pollfd_index,
1159                                                     int thread_pollfd_index)
1160 {
1161         size_t x = rec->index_map_cnt;
1162
1163         if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1164                 return -ENOMEM;
1165         rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1166         rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1167         rec->index_map_cnt += 1;
1168         return 0;
1169 }
1170
1171 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1172                                                     struct evlist *evlist,
1173                                                     struct record_thread *thread_data)
1174 {
1175         struct pollfd *e_entries = evlist->core.pollfd.entries;
1176         struct pollfd *t_entries = thread_data->pollfd.entries;
1177         int err = 0;
1178         size_t i;
1179
1180         for (i = 0; i < rec->index_map_cnt; i++) {
1181                 int e_pos = rec->index_map[i].evlist_pollfd_index;
1182                 int t_pos = rec->index_map[i].thread_pollfd_index;
1183
1184                 if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1185                     e_entries[e_pos].events != t_entries[t_pos].events) {
1186                         pr_err("Thread and evlist pollfd index mismatch\n");
1187                         err = -EINVAL;
1188                         continue;
1189                 }
1190                 e_entries[e_pos].revents = t_entries[t_pos].revents;
1191         }
1192         return err;
1193 }
1194
1195 static int record__dup_non_perf_events(struct record *rec,
1196                                        struct evlist *evlist,
1197                                        struct record_thread *thread_data)
1198 {
1199         struct fdarray *fda = &evlist->core.pollfd;
1200         int i, ret;
1201
1202         for (i = 0; i < fda->nr; i++) {
1203                 if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1204                         continue;
1205                 ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1206                 if (ret < 0) {
1207                         pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1208                         return ret;
1209                 }
1210                 pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1211                           thread_data, ret, fda->entries[i].fd);
1212                 ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1213                 if (ret < 0) {
1214                         pr_err("Failed to map thread and evlist pollfd indexes\n");
1215                         return ret;
1216                 }
1217         }
1218         return 0;
1219 }
1220
1221 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1222 {
1223         int t, ret;
1224         struct record_thread *thread_data;
1225
1226         rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1227         if (!rec->thread_data) {
1228                 pr_err("Failed to allocate thread data\n");
1229                 return -ENOMEM;
1230         }
1231         thread_data = rec->thread_data;
1232
1233         for (t = 0; t < rec->nr_threads; t++)
1234                 record__thread_data_init_pipes(&thread_data[t]);
1235
1236         for (t = 0; t < rec->nr_threads; t++) {
1237                 thread_data[t].rec = rec;
1238                 thread_data[t].mask = &rec->thread_masks[t];
1239                 ret = record__thread_data_init_maps(&thread_data[t], evlist);
1240                 if (ret) {
1241                         pr_err("Failed to initialize thread[%d] maps\n", t);
1242                         goto out_free;
1243                 }
1244                 ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1245                 if (ret) {
1246                         pr_err("Failed to initialize thread[%d] pollfd\n", t);
1247                         goto out_free;
1248                 }
1249                 if (t) {
1250                         thread_data[t].tid = -1;
1251                         ret = record__thread_data_open_pipes(&thread_data[t]);
1252                         if (ret) {
1253                                 pr_err("Failed to open thread[%d] communication pipes\n", t);
1254                                 goto out_free;
1255                         }
1256                         ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1257                                            POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1258                         if (ret < 0) {
1259                                 pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1260                                 goto out_free;
1261                         }
1262                         thread_data[t].ctlfd_pos = ret;
1263                         pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1264                                  thread_data, thread_data[t].ctlfd_pos,
1265                                  thread_data[t].pipes.msg[0]);
1266                 } else {
1267                         thread_data[t].tid = gettid();
1268
1269                         ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1270                         if (ret < 0)
1271                                 goto out_free;
1272
1273                         thread_data[t].ctlfd_pos = -1; /* Not used */
1274                 }
1275         }
1276
1277         return 0;
1278
1279 out_free:
1280         record__free_thread_data(rec);
1281
1282         return ret;
1283 }
1284
1285 static int record__mmap_evlist(struct record *rec,
1286                                struct evlist *evlist)
1287 {
1288         int i, ret;
1289         struct record_opts *opts = &rec->opts;
1290         bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1291                                   opts->auxtrace_sample_mode;
1292         char msg[512];
1293
1294         if (opts->affinity != PERF_AFFINITY_SYS)
1295                 cpu__setup_cpunode_map();
1296
1297         if (evlist__mmap_ex(evlist, opts->mmap_pages,
1298                                  opts->auxtrace_mmap_pages,
1299                                  auxtrace_overwrite,
1300                                  opts->nr_cblocks, opts->affinity,
1301                                  opts->mmap_flush, opts->comp_level) < 0) {
1302                 if (errno == EPERM) {
1303                         pr_err("Permission error mapping pages.\n"
1304                                "Consider increasing "
1305                                "/proc/sys/kernel/perf_event_mlock_kb,\n"
1306                                "or try again with a smaller value of -m/--mmap_pages.\n"
1307                                "(current value: %u,%u)\n",
1308                                opts->mmap_pages, opts->auxtrace_mmap_pages);
1309                         return -errno;
1310                 } else {
1311                         pr_err("failed to mmap with %d (%s)\n", errno,
1312                                 str_error_r(errno, msg, sizeof(msg)));
1313                         if (errno)
1314                                 return -errno;
1315                         else
1316                                 return -EINVAL;
1317                 }
1318         }
1319
1320         if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1321                 return -1;
1322
1323         ret = record__alloc_thread_data(rec, evlist);
1324         if (ret)
1325                 return ret;
1326
1327         if (record__threads_enabled(rec)) {
1328                 ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1329                 if (ret) {
1330                         pr_err("Failed to create data directory: %s\n", strerror(-ret));
1331                         return ret;
1332                 }
1333                 for (i = 0; i < evlist->core.nr_mmaps; i++) {
1334                         if (evlist->mmap)
1335                                 evlist->mmap[i].file = &rec->data.dir.files[i];
1336                         if (evlist->overwrite_mmap)
1337                                 evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1338                 }
1339         }
1340
1341         return 0;
1342 }
1343
1344 static int record__mmap(struct record *rec)
1345 {
1346         return record__mmap_evlist(rec, rec->evlist);
1347 }
1348
1349 static int record__open(struct record *rec)
1350 {
1351         char msg[BUFSIZ];
1352         struct evsel *pos;
1353         struct evlist *evlist = rec->evlist;
1354         struct perf_session *session = rec->session;
1355         struct record_opts *opts = &rec->opts;
1356         int rc = 0;
1357
1358         evlist__for_each_entry(evlist, pos) {
1359 try_again:
1360                 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1361                         if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
1362                                 if (verbose > 0)
1363                                         ui__warning("%s\n", msg);
1364                                 goto try_again;
1365                         }
1366                         if ((errno == EINVAL || errno == EBADF) &&
1367                             pos->core.leader != &pos->core &&
1368                             pos->weak_group) {
1369                                 pos = evlist__reset_weak_group(evlist, pos, true);
1370                                 goto try_again;
1371                         }
1372                         rc = -errno;
1373                         evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1374                         ui__error("%s\n", msg);
1375                         goto out;
1376                 }
1377
1378                 pos->supported = true;
1379         }
1380
1381         if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1382                 pr_warning(
1383 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1384 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1385 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1386 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1387 "Samples in kernel modules won't be resolved at all.\n\n"
1388 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1389 "even with a suitable vmlinux or kallsyms file.\n\n");
1390         }
1391
1392         if (evlist__apply_filters(evlist, &pos)) {
1393                 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1394                         pos->filter ?: "BPF", evsel__name(pos), errno,
1395                         str_error_r(errno, msg, sizeof(msg)));
1396                 rc = -1;
1397                 goto out;
1398         }
1399
1400         rc = record__mmap(rec);
1401         if (rc)
1402                 goto out;
1403
1404         session->evlist = evlist;
1405         perf_session__set_id_hdr_size(session);
1406 out:
1407         return rc;
1408 }
1409
1410 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1411 {
1412         if (rec->evlist->first_sample_time == 0)
1413                 rec->evlist->first_sample_time = sample_time;
1414
1415         if (sample_time)
1416                 rec->evlist->last_sample_time = sample_time;
1417 }
1418
1419 static int process_sample_event(struct perf_tool *tool,
1420                                 union perf_event *event,
1421                                 struct perf_sample *sample,
1422                                 struct evsel *evsel,
1423                                 struct machine *machine)
1424 {
1425         struct record *rec = container_of(tool, struct record, tool);
1426
1427         set_timestamp_boundary(rec, sample->time);
1428
1429         if (rec->buildid_all)
1430                 return 0;
1431
1432         rec->samples++;
1433         return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1434 }
1435
1436 static int process_buildids(struct record *rec)
1437 {
1438         struct perf_session *session = rec->session;
1439
1440         if (perf_data__size(&rec->data) == 0)
1441                 return 0;
1442
1443         /*
1444          * During this process, the kernel map is loaded and dso->long_name is
1445          * replaced with the real pathname that was found.  In this case
1446          * we prefer a vmlinux path like
1447          *   /lib/modules/3.16.4/build/vmlinux
1448          *
1449          * rather than the build-id path (in the debug directory), e.g.
1450          *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1451          */
1452         symbol_conf.ignore_vmlinux_buildid = true;
1453
1454         /*
1455          * If --buildid-all is given, it marks all DSOs regardless of hits,
1456          * so there is no need to process samples. But if timestamp_boundary
1457          * is enabled, it still needs to walk all samples to get the
1458          * timestamps of the first/last samples.
1459          */
1460         if (rec->buildid_all && !rec->timestamp_boundary)
1461                 rec->tool.sample = NULL;
1462
1463         return perf_session__process_events(session);
1464 }
1465
1466 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1467 {
1468         int err;
1469         struct perf_tool *tool = data;
1470         /*
1471          * For the guest kernel, when processing the record & report
1472          * subcommands, we arrange the module mmaps prior to the guest kernel
1473          * mmap and trigger a dso preload, because by default guest module
1474          * symbols are loaded from guest kallsyms instead of
1475          * /lib/modules/XXX/XXX.  This avoids missing symbols when the first
1476          * address falls in a module instead of in the guest kernel.
1477          */
1478         err = perf_event__synthesize_modules(tool, process_synthesized_event,
1479                                              machine);
1480         if (err < 0)
1481                 pr_err("Couldn't record guest kernel [%d]'s reference"
1482                        " relocation symbol.\n", machine->pid);
1483
1484         /*
1485          * We use _stext for the guest kernel because the guest kernel's
1486          * /proc/kallsyms sometimes has no _text.
1487          */
1488         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1489                                                  machine);
1490         if (err < 0)
1491                 pr_err("Couldn't record guest kernel [%d]'s reference"
1492                        " relocation symbol.\n", machine->pid);
1493 }
1494
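/*
 * Synthetic bookkeeping records written into the stream:
 * PERF_RECORD_FINISHED_ROUND tells the consumer that everything written so
 * far belongs to a completed round, so buffered events up to the previous
 * round can be sorted by timestamp and flushed; PERF_RECORD_FINISHED_INIT
 * marks the end of the initial synthesized (setup-time) events.
 */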
1495 static struct perf_event_header finished_round_event = {
1496         .size = sizeof(struct perf_event_header),
1497         .type = PERF_RECORD_FINISHED_ROUND,
1498 };
1499
1500 static struct perf_event_header finished_init_event = {
1501         .size = sizeof(struct perf_event_header),
1502         .type = PERF_RECORD_FINISHED_INIT,
1503 };
1504
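/*
 * With --affinity=node or --affinity=cpu, migrate the reading thread onto
 * the CPUs backing the ring buffer (its affinity mask) before draining it,
 * so the copies stay NUMA/cache local.
 */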
1505 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1506 {
1507         if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1508             !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1509                           thread->mask->affinity.nbits)) {
1510                 bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1511                 bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1512                           map->affinity_mask.bits, thread->mask->affinity.nbits);
1513                 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1514                                         (cpu_set_t *)thread->mask->affinity.bits);
1515                 if (verbose == 2) {
1516                         pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1517                         mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1518                 }
1519         }
1520 }
1521
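/*
 * Callback passed to zstd_compress_stream_to_records() for laying out
 * PERF_RECORD_COMPRESSED records: a call with increment == 0 initializes the
 * record header and returns its size, a non-zero increment grows header.size
 * by the number of compressed bytes just appended.
 */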
1522 static size_t process_comp_header(void *record, size_t increment)
1523 {
1524         struct perf_record_compressed *event = record;
1525         size_t size = sizeof(*event);
1526
1527         if (increment) {
1528                 event->header.size += increment;
1529                 return increment;
1530         }
1531
1532         event->header.type = PERF_RECORD_COMPRESSED;
1533         event->header.size = size;
1534
1535         return size;
1536 }
1537
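/*
 * Compress a chunk of ring-buffer data into one or more
 * PERF_RECORD_COMPRESSED records, each capped below PERF_SAMPLE_MAX_SIZE.
 * Per-CPU (threaded) streams use the map's own zstd state and account
 * transferred/compressed bytes per thread; otherwise the session-wide state
 * and counters are used.
 */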
1538 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1539                             void *dst, size_t dst_size, void *src, size_t src_size)
1540 {
1541         ssize_t compressed;
1542         size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1543         struct zstd_data *zstd_data = &session->zstd_data;
1544
1545         if (map && map->file)
1546                 zstd_data = &map->zstd_data;
1547
1548         compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1549                                                      max_record_size, process_comp_header);
1550         if (compressed < 0)
1551                 return compressed;
1552
1553         if (map && map->file) {
1554                 thread->bytes_transferred += src_size;
1555                 thread->bytes_compressed  += compressed;
1556         } else {
1557                 session->bytes_transferred += src_size;
1558                 session->bytes_compressed  += compressed;
1559         }
1560
1561         return compressed;
1562 }
1563
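/*
 * Drain the current thread's mmaps (regular or overwrite) for this evlist,
 * pushing their data either directly or via AIO, and read any AUX area
 * trace data. In non-threaded mode, append a PERF_RECORD_FINISHED_ROUND
 * event if anything was written.
 */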
1564 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1565                                     bool overwrite, bool synch)
1566 {
1567         u64 bytes_written = rec->bytes_written;
1568         int i;
1569         int rc = 0;
1570         int nr_mmaps;
1571         struct mmap **maps;
1572         int trace_fd = rec->data.file.fd;
1573         off_t off = 0;
1574
1575         if (!evlist)
1576                 return 0;
1577
1578         nr_mmaps = thread->nr_mmaps;
1579         maps = overwrite ? thread->overwrite_maps : thread->maps;
1580
1581         if (!maps)
1582                 return 0;
1583
1584         if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1585                 return 0;
1586
1587         if (record__aio_enabled(rec))
1588                 off = record__aio_get_pos(trace_fd);
1589
1590         for (i = 0; i < nr_mmaps; i++) {
1591                 u64 flush = 0;
1592                 struct mmap *map = maps[i];
1593
1594                 if (map->core.base) {
1595                         record__adjust_affinity(rec, map);
1596                         if (synch) {
1597                                 flush = map->core.flush;
1598                                 map->core.flush = 1;
1599                         }
1600                         if (!record__aio_enabled(rec)) {
1601                                 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1602                                         if (synch)
1603                                                 map->core.flush = flush;
1604                                         rc = -1;
1605                                         goto out;
1606                                 }
1607                         } else {
1608                                 if (record__aio_push(rec, map, &off) < 0) {
1609                                         record__aio_set_pos(trace_fd, off);
1610                                         if (synch)
1611                                                 map->core.flush = flush;
1612                                         rc = -1;
1613                                         goto out;
1614                                 }
1615                         }
1616                         if (synch)
1617                                 map->core.flush = flush;
1618                 }
1619
1620                 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1621                     !rec->opts.auxtrace_sample_mode &&
1622                     record__auxtrace_mmap_read(rec, map) != 0) {
1623                         rc = -1;
1624                         goto out;
1625                 }
1626         }
1627
1628         if (record__aio_enabled(rec))
1629                 record__aio_set_pos(trace_fd, off);
1630
1631         /*
1632          * Mark the round finished in case we wrote
1633          * at least one event.
1634          *
1635          * No need for round events in directory mode,
1636          * because the per-cpu maps and files already
1637          * contain data sorted by the kernel.
1638          */
1639         if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1640                 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1641
1642         if (overwrite)
1643                 evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1644 out:
1645         return rc;
1646 }
1647
1648 static int record__mmap_read_all(struct record *rec, bool synch)
1649 {
1650         int err;
1651
1652         err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1653         if (err)
1654                 return err;
1655
1656         return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1657 }
1658
1659 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1660                                            void *arg __maybe_unused)
1661 {
1662         struct perf_mmap *map = fda->priv[fd].ptr;
1663
1664         if (map)
1665                 perf_mmap__put(map);
1666 }
1667
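/*
 * Body of a parallel recording thread: signal readiness over the ack pipe,
 * then loop draining this thread's mmaps, polling when no new samples have
 * arrived and terminating once the main thread closes the message pipe
 * (seen as POLLHUP on the control fd). A final synchronous flush and a
 * termination ack are done before exiting.
 */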
1668 static void *record__thread(void *arg)
1669 {
1670         enum thread_msg msg = THREAD_MSG__READY;
1671         bool terminate = false;
1672         struct fdarray *pollfd;
1673         int err, ctlfd_pos;
1674
1675         thread = arg;
1676         thread->tid = gettid();
1677
1678         err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1679         if (err == -1)
1680                 pr_warning("threads[%d]: failed to notify on start: %s\n",
1681                            thread->tid, strerror(errno));
1682
1683         pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1684
1685         pollfd = &thread->pollfd;
1686         ctlfd_pos = thread->ctlfd_pos;
1687
1688         for (;;) {
1689                 unsigned long long hits = thread->samples;
1690
1691                 if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1692                         break;
1693
1694                 if (hits == thread->samples) {
1695
1696                         err = fdarray__poll(pollfd, -1);
1697                         /*
1698                          * Propagate the error only if there is one. Ignore a positive
1699                          * number of returned events and an interrupted poll (EINTR).
1700                          */
1701                         if (err > 0 || (err < 0 && errno == EINTR))
1702                                 err = 0;
1703                         thread->waking++;
1704
1705                         if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1706                                             record__thread_munmap_filtered, NULL) == 0)
1707                                 break;
1708                 }
1709
1710                 if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1711                         terminate = true;
1712                         close(thread->pipes.msg[0]);
1713                         thread->pipes.msg[0] = -1;
1714                         pollfd->entries[ctlfd_pos].fd = -1;
1715                         pollfd->entries[ctlfd_pos].events = 0;
1716                 }
1717
1718                 pollfd->entries[ctlfd_pos].revents = 0;
1719         }
1720         record__mmap_read_all(thread->rec, true);
1721
1722         err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1723         if (err == -1)
1724                 pr_warning("threads[%d]: failed to notify on termination: %s\n",
1725                            thread->tid, strerror(errno));
1726
1727         return NULL;
1728 }
1729
1730 static void record__init_features(struct record *rec)
1731 {
1732         struct perf_session *session = rec->session;
1733         int feat;
1734
1735         for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1736                 perf_header__set_feat(&session->header, feat);
1737
1738         if (rec->no_buildid)
1739                 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1740
1741 #ifdef HAVE_LIBTRACEEVENT
1742         if (!have_tracepoints(&rec->evlist->core.entries))
1743                 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1744 #endif
1745
1746         if (!rec->opts.branch_stack)
1747                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1748
1749         if (!rec->opts.full_auxtrace)
1750                 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1751
1752         if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1753                 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1754
1755         if (!rec->opts.use_clockid)
1756                 perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1757
1758         if (!record__threads_enabled(rec))
1759                 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1760
1761         if (!record__comp_enabled(rec))
1762                 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1763
1764         perf_header__clear_feat(&session->header, HEADER_STAT);
1765 }
1766
1767 static void
1768 record__finish_output(struct record *rec)
1769 {
1770         int i;
1771         struct perf_data *data = &rec->data;
1772         int fd = perf_data__fd(data);
1773
1774         if (data->is_pipe) {
1775                 /* Just to display approx. size */
1776                 data->file.size = rec->bytes_written;
1777                 return;
1778         }
1779
1780         rec->session->header.data_size += rec->bytes_written;
1781         data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1782         if (record__threads_enabled(rec)) {
1783                 for (i = 0; i < data->dir.nr; i++)
1784                         data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1785         }
1786
1787         if (!rec->no_buildid) {
1788                 process_buildids(rec);
1789
1790                 if (rec->buildid_all)
1791                         perf_session__dsos_hit_all(rec->session);
1792         }
1793         perf_session__write_header(rec->session, rec->evlist, fd, true);
1794
1795         return;
1796 }
1797
1798 static int record__synthesize_workload(struct record *rec, bool tail)
1799 {
1800         int err;
1801         struct perf_thread_map *thread_map;
1802         bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1803
1804         if (rec->opts.tail_synthesize != tail)
1805                 return 0;
1806
1807         thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1808         if (thread_map == NULL)
1809                 return -1;
1810
1811         err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1812                                                  process_synthesized_event,
1813                                                  &rec->session->machines.host,
1814                                                  needs_mmap,
1815                                                  rec->opts.sample_address);
1816         perf_thread_map__put(thread_map);
1817         return err;
1818 }
1819
1820 static int write_finished_init(struct record *rec, bool tail)
1821 {
1822         if (rec->opts.tail_synthesize != tail)
1823                 return 0;
1824
1825         return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1826 }
1827
1828 static int record__synthesize(struct record *rec, bool tail);
1829
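/*
 * Rotate the output: flush pending AIO data, synthesize tail events,
 * finalize the current output, then switch the perf.data output to a
 * timestamped file via perf_data__switch(). When a maximum number of output
 * files is configured, the oldest file in the rotation is removed. Unless
 * called at exit, tracking events are re-synthesized for the new file.
 */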
1830 static int
1831 record__switch_output(struct record *rec, bool at_exit)
1832 {
1833         struct perf_data *data = &rec->data;
1834         char *new_filename = NULL;
1835         int fd, err;
1836
1837         /* Same size as a real timestamp, e.g. "2015122520103046" */
1838         char timestamp[] = "InvalidTimestamp";
1839
1840         record__aio_mmap_read_sync(rec);
1841
1842         write_finished_init(rec, true);
1843
1844         record__synthesize(rec, true);
1845         if (target__none(&rec->opts.target))
1846                 record__synthesize_workload(rec, true);
1847
1848         rec->samples = 0;
1849         record__finish_output(rec);
1850         err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1851         if (err) {
1852                 pr_err("Failed to get current timestamp\n");
1853                 return -EINVAL;
1854         }
1855
1856         fd = perf_data__switch(data, timestamp,
1857                                rec->session->header.data_offset,
1858                                at_exit, &new_filename);
1859         if (fd >= 0 && !at_exit) {
1860                 rec->bytes_written = 0;
1861                 rec->session->header.data_size = 0;
1862         }
1863
1864         if (!quiet) {
1865                 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1866                         data->path, timestamp);
1867         }
1868
1869         if (rec->switch_output.num_files) {
1870                 int n = rec->switch_output.cur_file + 1;
1871
1872                 if (n >= rec->switch_output.num_files)
1873                         n = 0;
1874                 rec->switch_output.cur_file = n;
1875                 if (rec->switch_output.filenames[n]) {
1876                         remove(rec->switch_output.filenames[n]);
1877                         zfree(&rec->switch_output.filenames[n]);
1878                 }
1879                 rec->switch_output.filenames[n] = new_filename;
1880         } else {
1881                 free(new_filename);
1882         }
1883
1884         /* Output tracking events */
1885         if (!at_exit) {
1886                 record__synthesize(rec, false);
1887
1888                 /*
1889                  * In 'perf record --switch-output' without -a,
1890                  * record__synthesize() in record__switch_output() won't
1891                  * generate tracking events because there's no thread_map
1892                  * in the evlist, so the newly created perf.data would lack
1893                  * map and comm information.
1894                  * Create a fake thread_map and call
1895                  * perf_event__synthesize_thread_map() directly for those events.
1896                  */
1897                 if (target__none(&rec->opts.target))
1898                         record__synthesize_workload(rec, false);
1899                 write_finished_init(rec, false);
1900         }
1901         return fd;
1902 }
1903
1904 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1905                                         struct perf_record_lost_samples *lost,
1906                                         int cpu_idx, int thread_idx, u64 lost_count,
1907                                         u16 misc_flag)
1908 {
1909         struct perf_sample_id *sid;
1910         struct perf_sample sample = {};
1911         int id_hdr_size;
1912
1913         lost->lost = lost_count;
1914         if (evsel->core.ids) {
1915                 sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1916                 sample.id = sid->id;
1917         }
1918
1919         id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1920                                                        evsel->core.attr.sample_type, &sample);
1921         lost->header.size = sizeof(*lost) + id_hdr_size;
1922         lost->header.misc = misc_flag;
1923         record__write(rec, NULL, lost, lost->header.size);
1924 }
1925
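/*
 * After recording finishes, read the lost-sample counts from each event's
 * file descriptors (and from the BPF filter, if one is used) and emit
 * PERF_RECORD_LOST_SAMPLES records so the losses are visible when the data
 * is reported.
 */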
1926 static void record__read_lost_samples(struct record *rec)
1927 {
1928         struct perf_session *session = rec->session;
1929         struct perf_record_lost_samples *lost = NULL;
1930         struct evsel *evsel;
1931
1932         /* there was an error during record__open */
1933         if (session->evlist == NULL)
1934                 return;
1935
1936         evlist__for_each_entry(session->evlist, evsel) {
1937                 struct xyarray *xy = evsel->core.sample_id;
1938                 u64 lost_count;
1939
1940                 if (xy == NULL || evsel->core.fd == NULL)
1941                         continue;
1942                 if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1943                     xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1944                         pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1945                         continue;
1946                 }
1947
1948                 for (int x = 0; x < xyarray__max_x(xy); x++) {
1949                         for (int y = 0; y < xyarray__max_y(xy); y++) {
1950                                 struct perf_counts_values count;
1951
1952                                 if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1953                                         pr_debug("read LOST count failed\n");
1954                                         goto out;
1955                                 }
1956
1957                                 if (count.lost) {
1958                                         if (!lost) {
1959                                                 lost = zalloc(sizeof(*lost) +
1960                                                               session->machines.host.id_hdr_size);
1961                                                 if (!lost) {
1962                                                         pr_debug("Memory allocation failed\n");
1963                                                         return;
1964                                                 }
1965                                                 lost->header.type = PERF_RECORD_LOST_SAMPLES;
1966                                         }
1967                                         __record__save_lost_samples(rec, evsel, lost,
1968                                                                     x, y, count.lost, 0);
1969                                 }
1970                         }
1971                 }
1972
1973                 lost_count = perf_bpf_filter__lost_count(evsel);
1974                 if (lost_count) {
1975                         if (!lost) {
1976                                 lost = zalloc(sizeof(*lost) +
1977                                               session->machines.host.id_hdr_size);
1978                                 if (!lost) {
1979                                         pr_debug("Memory allocation failed\n");
1980                                         return;
1981                                 }
1982                                 lost->header.type = PERF_RECORD_LOST_SAMPLES;
1983                         }
1984                         __record__save_lost_samples(rec, evsel, lost, 0, 0, lost_count,
1985                                                     PERF_RECORD_MISC_LOST_SAMPLES_BPF);
1986                 }
1987         }
1988 out:
1989         free(lost);
1990 }
1991
1992 static volatile sig_atomic_t workload_exec_errno;
1993
1994 /*
1995  * evlist__prepare_workload() will send a SIGUSR1
1996  * if the fork fails, since we asked for it by setting
1997  * its want_signal parameter to true.
1998  */
1999 static void workload_exec_failed_signal(int signo __maybe_unused,
2000                                         siginfo_t *info,
2001                                         void *ucontext __maybe_unused)
2002 {
2003         workload_exec_errno = info->si_value.sival_int;
2004         done = 1;
2005         child_finished = 1;
2006 }
2007
2008 static void snapshot_sig_handler(int sig);
2009 static void alarm_sig_handler(int sig);
2010
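/*
 * Pick the perf_event_mmap_page of the first mapped ring buffer (regular or
 * overwrite); record__synthesize() passes it to perf_event__synth_time_conv()
 * as the source of the time conversion parameters.
 */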
2011 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
2012 {
2013         if (evlist) {
2014                 if (evlist->mmap && evlist->mmap[0].core.base)
2015                         return evlist->mmap[0].core.base;
2016                 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2017                         return evlist->overwrite_mmap[0].core.base;
2018         }
2019         return NULL;
2020 }
2021
2022 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2023 {
2024         const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2025         if (pc)
2026                 return pc;
2027         return NULL;
2028 }
2029
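/*
 * Synthesize the non-sample metadata needed to make sense of the recorded
 * samples: pipe header data, time conversion, id index, AUX area trace info,
 * kernel and module mmaps, guest machines, extra attributes, thread and CPU
 * maps, BPF and cgroup events, and per-task mmap/comm events (optionally
 * using multiple threads).
 */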
2030 static int record__synthesize(struct record *rec, bool tail)
2031 {
2032         struct perf_session *session = rec->session;
2033         struct machine *machine = &session->machines.host;
2034         struct perf_data *data = &rec->data;
2035         struct record_opts *opts = &rec->opts;
2036         struct perf_tool *tool = &rec->tool;
2037         int err = 0;
2038         event_op f = process_synthesized_event;
2039
2040         if (rec->opts.tail_synthesize != tail)
2041                 return 0;
2042
2043         if (data->is_pipe) {
2044                 err = perf_event__synthesize_for_pipe(tool, session, data,
2045                                                       process_synthesized_event);
2046                 if (err < 0)
2047                         goto out;
2048
2049                 rec->bytes_written += err;
2050         }
2051
2052         err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2053                                           process_synthesized_event, machine);
2054         if (err)
2055                 goto out;
2056
2057         /* Synthesize id_index before auxtrace_info */
2058         err = perf_event__synthesize_id_index(tool,
2059                                               process_synthesized_event,
2060                                               session->evlist, machine);
2061         if (err)
2062                 goto out;
2063
2064         if (rec->opts.full_auxtrace) {
2065                 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2066                                         session, process_synthesized_event);
2067                 if (err)
2068                         goto out;
2069         }
2070
2071         if (!evlist__exclude_kernel(rec->evlist)) {
2072                 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2073                                                          machine);
2074                 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2075                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2076                                    "Check /proc/kallsyms permission or run as root.\n");
2077
2078                 err = perf_event__synthesize_modules(tool, process_synthesized_event,
2079                                                      machine);
2080                 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2081                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2082                                    "Check /proc/modules permission or run as root.\n");
2083         }
2084
2085         if (perf_guest) {
2086                 machines__process_guests(&session->machines,
2087                                          perf_event__synthesize_guest_os, tool);
2088         }
2089
2090         err = perf_event__synthesize_extra_attr(&rec->tool,
2091                                                 rec->evlist,
2092                                                 process_synthesized_event,
2093                                                 data->is_pipe);
2094         if (err)
2095                 goto out;
2096
2097         err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2098                                                  process_synthesized_event,
2099                                                 NULL);
2100         if (err < 0) {
2101                 pr_err("Couldn't synthesize thread map.\n");
2102                 return err;
2103         }
2104
2105         err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2106                                              process_synthesized_event, NULL);
2107         if (err < 0) {
2108                 pr_err("Couldn't synthesize cpu map.\n");
2109                 return err;
2110         }
2111
2112         err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2113                                                 machine, opts);
2114         if (err < 0) {
2115                 pr_warning("Couldn't synthesize bpf events.\n");
2116                 err = 0;
2117         }
2118
2119         if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2120                 err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2121                                                      machine);
2122                 if (err < 0) {
2123                         pr_warning("Couldn't synthesize cgroup events.\n");
2124                         err = 0;
2125                 }
2126         }
2127
2128         if (rec->opts.nr_threads_synthesize > 1) {
2129                 mutex_init(&synth_lock);
2130                 perf_set_multithreaded();
2131                 f = process_locked_synthesized_event;
2132         }
2133
2134         if (rec->opts.synth & PERF_SYNTH_TASK) {
2135                 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2136
2137                 err = __machine__synthesize_threads(machine, tool, &opts->target,
2138                                                     rec->evlist->core.threads,
2139                                                     f, needs_mmap, opts->sample_address,
2140                                                     rec->opts.nr_threads_synthesize);
2141         }
2142
2143         if (rec->opts.nr_threads_synthesize > 1) {
2144                 perf_set_singlethreaded();
2145                 mutex_destroy(&synth_lock);
2146         }
2147
2148 out:
2149         return err;
2150 }
2151
2152 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2153 {
2154         struct record *rec = data;
2155         pthread_kill(rec->thread_id, SIGUSR2);
2156         return 0;
2157 }
2158
2159 static int record__setup_sb_evlist(struct record *rec)
2160 {
2161         struct record_opts *opts = &rec->opts;
2162
2163         if (rec->sb_evlist != NULL) {
2164                 /*
2165                  * We get here if --switch-output-event populated the
2166                  * sb_evlist, so associate a callback that will send a SIGUSR2
2167                  * to the main thread.
2168                  */
2169                 evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2170                 rec->thread_id = pthread_self();
2171         }
2172 #ifdef HAVE_LIBBPF_SUPPORT
2173         if (!opts->no_bpf_event) {
2174                 if (rec->sb_evlist == NULL) {
2175                         rec->sb_evlist = evlist__new();
2176
2177                         if (rec->sb_evlist == NULL) {
2178                                 pr_err("Couldn't create side band evlist.\n");
2179                                 return -1;
2180                         }
2181                 }
2182
2183                 if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2184                         pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
2185                         return -1;
2186                 }
2187         }
2188 #endif
2189         if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2190                 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2191                 opts->no_bpf_event = true;
2192         }
2193
2194         return 0;
2195 }
2196
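/*
 * When a clockid was requested (rec->opts.use_clockid), store a reference
 * pair of timestamps - one from gettimeofday() and one from the selected
 * clockid - in the header environment, so tools can later relate the
 * sampling clock to wall-clock time.
 */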
2197 static int record__init_clock(struct record *rec)
2198 {
2199         struct perf_session *session = rec->session;
2200         struct timespec ref_clockid;
2201         struct timeval ref_tod;
2202         u64 ref;
2203
2204         if (!rec->opts.use_clockid)
2205                 return 0;
2206
2207         if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2208                 session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2209
2210         session->header.env.clock.clockid = rec->opts.clockid;
2211
2212         if (gettimeofday(&ref_tod, NULL) != 0) {
2213                 pr_err("gettimeofday failed, cannot set reference time.\n");
2214                 return -1;
2215         }
2216
2217         if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2218                 pr_err("clock_gettime failed, cannot set reference time.\n");
2219                 return -1;
2220         }
2221
2222         ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2223               (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2224
2225         session->header.env.clock.tod_ns = ref;
2226
2227         ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2228               (u64) ref_clockid.tv_nsec;
2229
2230         session->header.env.clock.clockid_ns = ref;
2231         return 0;
2232 }
2233
2234 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2235 {
2236         if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2237                 trigger_hit(&auxtrace_snapshot_trigger);
2238                 auxtrace_record__snapshot_started = 1;
2239                 if (auxtrace_record__snapshot_start(rec->itr))
2240                         trigger_error(&auxtrace_snapshot_trigger);
2241         }
2242 }
2243
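/*
 * Ask a worker thread to stop by closing the write end of its message pipe
 * (the worker sees POLLHUP on its control fd), then wait for the
 * termination acknowledgement on the ack pipe.
 */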
2244 static int record__terminate_thread(struct record_thread *thread_data)
2245 {
2246         int err;
2247         enum thread_msg ack = THREAD_MSG__UNDEFINED;
2248         pid_t tid = thread_data->tid;
2249
2250         close(thread_data->pipes.msg[1]);
2251         thread_data->pipes.msg[1] = -1;
2252         err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2253         if (err > 0)
2254                 pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2255         else
2256                 pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2257                            thread->tid, tid);
2258
2259         return 0;
2260 }
2261
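/*
 * Start the worker threads for parallel trace streaming: block all signals
 * around thread creation, pin each worker to its affinity mask where
 * supported, and wait for each one's start acknowledgement. The global
 * 'thread' pointer is set to thread_data[0], which belongs to the main
 * thread itself.
 */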
2262 static int record__start_threads(struct record *rec)
2263 {
2264         int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2265         struct record_thread *thread_data = rec->thread_data;
2266         sigset_t full, mask;
2267         pthread_t handle;
2268         pthread_attr_t attrs;
2269
2270         thread = &thread_data[0];
2271
2272         if (!record__threads_enabled(rec))
2273                 return 0;
2274
2275         sigfillset(&full);
2276         if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2277                 pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2278                 return -1;
2279         }
2280
2281         pthread_attr_init(&attrs);
2282         pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2283
2284         for (t = 1; t < nr_threads; t++) {
2285                 enum thread_msg msg = THREAD_MSG__UNDEFINED;
2286
2287 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2288                 pthread_attr_setaffinity_np(&attrs,
2289                                             MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2290                                             (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2291 #endif
2292                 if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2293                         for (tt = 1; tt < t; tt++)
2294                                 record__terminate_thread(&thread_data[tt]);
2295                         pr_err("Failed to start threads: %s\n", strerror(errno));
2296                         ret = -1;
2297                         goto out_err;
2298                 }
2299
2300                 err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2301                 if (err > 0)
2302                         pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2303                                   thread_msg_tags[msg]);
2304                 else
2305                         pr_warning("threads[%d]: failed to receive start notification from %d\n",
2306                                    thread->tid, rec->thread_data[t].tid);
2307         }
2308
2309         sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2310                         (cpu_set_t *)thread->mask->affinity.bits);
2311
2312         pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2313
2314 out_err:
2315         pthread_attr_destroy(&attrs);
2316
2317         if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2318                 pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2319                 ret = -1;
2320         }
2321
2322         return ret;
2323 }
2324
2325 static int record__stop_threads(struct record *rec)
2326 {
2327         int t;
2328         struct record_thread *thread_data = rec->thread_data;
2329
2330         for (t = 1; t < rec->nr_threads; t++)
2331                 record__terminate_thread(&thread_data[t]);
2332
2333         for (t = 0; t < rec->nr_threads; t++) {
2334                 rec->samples += thread_data[t].samples;
2335                 if (!record__threads_enabled(rec))
2336                         continue;
2337                 rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2338                 rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2339                 pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2340                          thread_data[t].samples, thread_data[t].waking);
2341                 if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2342                         pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2343                                  thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2344                 else
2345                         pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2346         }
2347
2348         return 0;
2349 }
2350
2351 static unsigned long record__waking(struct record *rec)
2352 {
2353         int t;
2354         unsigned long waking = 0;
2355         struct record_thread *thread_data = rec->thread_data;
2356
2357         for (t = 0; t < rec->nr_threads; t++)
2358                 waking += thread_data[t].waking;
2359
2360         return waking;
2361 }
2362
2363 static int __cmd_record(struct record *rec, int argc, const char **argv)
2364 {
2365         int err;
2366         int status = 0;
2367         const bool forks = argc > 0;
2368         struct perf_tool *tool = &rec->tool;
2369         struct record_opts *opts = &rec->opts;
2370         struct perf_data *data = &rec->data;
2371         struct perf_session *session;
2372         bool disabled = false, draining = false;
2373         int fd;
2374         float ratio = 0;
2375         enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2376
2377         atexit(record__sig_exit);
2378         signal(SIGCHLD, sig_handler);
2379         signal(SIGINT, sig_handler);
2380         signal(SIGTERM, sig_handler);
2381         signal(SIGSEGV, sigsegv_handler);
2382
2383         if (rec->opts.record_namespaces)
2384                 tool->namespace_events = true;
2385
2386         if (rec->opts.record_cgroup) {
2387 #ifdef HAVE_FILE_HANDLE
2388                 tool->cgroup_events = true;
2389 #else
2390                 pr_err("cgroup tracking is not supported\n");
2391                 return -1;
2392 #endif
2393         }
2394
2395         if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2396                 signal(SIGUSR2, snapshot_sig_handler);
2397                 if (rec->opts.auxtrace_snapshot_mode)
2398                         trigger_on(&auxtrace_snapshot_trigger);
2399                 if (rec->switch_output.enabled)
2400                         trigger_on(&switch_output_trigger);
2401         } else {
2402                 signal(SIGUSR2, SIG_IGN);
2403         }
2404
2405         session = perf_session__new(data, tool);
2406         if (IS_ERR(session)) {
2407                 pr_err("Perf session creation failed.\n");
2408                 return PTR_ERR(session);
2409         }
2410
2411         if (record__threads_enabled(rec)) {
2412                 if (perf_data__is_pipe(&rec->data)) {
2413                         pr_err("Parallel trace streaming is not available in pipe mode.\n");
2414                         return -1;
2415                 }
2416                 if (rec->opts.full_auxtrace) {
2417                         pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2418                         return -1;
2419                 }
2420         }
2421
2422         fd = perf_data__fd(data);
2423         rec->session = session;
2424
2425         if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2426                 pr_err("Compression initialization failed.\n");
2427                 return -1;
2428         }
2429 #ifdef HAVE_EVENTFD_SUPPORT
2430         done_fd = eventfd(0, EFD_NONBLOCK);
2431         if (done_fd < 0) {
2432                 pr_err("Failed to create wakeup eventfd, error: %m\n");
2433                 status = -1;
2434                 goto out_delete_session;
2435         }
2436         err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2437         if (err < 0) {
2438                 pr_err("Failed to add wakeup eventfd to poll list\n");
2439                 status = err;
2440                 goto out_delete_session;
2441         }
2442 #endif // HAVE_EVENTFD_SUPPORT
2443
2444         session->header.env.comp_type  = PERF_COMP_ZSTD;
2445         session->header.env.comp_level = rec->opts.comp_level;
2446
2447         if (rec->opts.kcore &&
2448             !record__kcore_readable(&session->machines.host)) {
2449                 pr_err("ERROR: kcore is not readable.\n");
2450                 return -1;
2451         }
2452
2453         if (record__init_clock(rec))
2454                 return -1;
2455
2456         record__init_features(rec);
2457
2458         if (forks) {
2459                 err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2460                                                workload_exec_failed_signal);
2461                 if (err < 0) {
2462                         pr_err("Couldn't run the workload!\n");
2463                         status = err;
2464                         goto out_delete_session;
2465                 }
2466         }
2467
2468         /*
2469          * If we have just a single event and are sending data
2470          * through a pipe, we need to force ID allocation, because
2471          * we synthesize the event name through the pipe and need
2472          * the ID for that.
2473          */
2474         if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2475                 rec->opts.sample_id = true;
2476
2477         if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2478                 rec->timestamp_filename = false;
2479                 pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2480         }
2481
2482         evlist__uniquify_name(rec->evlist);
2483
2484         evlist__config(rec->evlist, opts, &callchain_param);
2485
2486         /* Debug message used by test scripts */
2487         pr_debug3("perf record opening and mmapping events\n");
2488         if (record__open(rec) != 0) {
2489                 err = -1;
2490                 goto out_free_threads;
2491         }
2492         /* Debug message used by test scripts */
2493         pr_debug3("perf record done opening and mmapping events\n");
2494         session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2495
2496         if (rec->opts.kcore) {
2497                 err = record__kcore_copy(&session->machines.host, data);
2498                 if (err) {
2499                         pr_err("ERROR: Failed to copy kcore\n");
2500                         goto out_free_threads;
2501                 }
2502         }
2503
2504         /*
2505          * Normally perf_session__new would do this, but it doesn't have the
2506          * evlist.
2507          */
2508         if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2509                 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2510                 rec->tool.ordered_events = false;
2511         }
2512
2513         if (evlist__nr_groups(rec->evlist) == 0)
2514                 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2515
2516         if (data->is_pipe) {
2517                 err = perf_header__write_pipe(fd);
2518                 if (err < 0)
2519                         goto out_free_threads;
2520         } else {
2521                 err = perf_session__write_header(session, rec->evlist, fd, false);
2522                 if (err < 0)
2523                         goto out_free_threads;
2524         }
2525
2526         err = -1;
2527         if (!rec->no_buildid
2528             && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2529                 pr_err("Couldn't generate buildids. "
2530                        "Use --no-buildid to profile anyway.\n");
2531                 goto out_free_threads;
2532         }
2533
2534         err = record__setup_sb_evlist(rec);
2535         if (err)
2536                 goto out_free_threads;
2537
2538         err = record__synthesize(rec, false);
2539         if (err < 0)
2540                 goto out_free_threads;
2541
2542         if (rec->realtime_prio) {
2543                 struct sched_param param;
2544
2545                 param.sched_priority = rec->realtime_prio;
2546                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2547                         pr_err("Could not set realtime priority.\n");
2548                         err = -1;
2549                         goto out_free_threads;
2550                 }
2551         }
2552
2553         if (record__start_threads(rec))
2554                 goto out_free_threads;
2555
2556         /*
2557          * When perf is starting the traced process, all the events
2558          * (apart from group members) have enable_on_exec=1 set,
2559          * so don't spoil it by prematurely enabling them.
2560          */
2561         if (!target__none(&opts->target) && !opts->target.initial_delay)
2562                 evlist__enable(rec->evlist);
2563
2564         /*
2565          * Let the child rip
2566          */
2567         if (forks) {
2568                 struct machine *machine = &session->machines.host;
2569                 union perf_event *event;
2570                 pid_t tgid;
2571
2572                 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2573                 if (event == NULL) {
2574                         err = -ENOMEM;
2575                         goto out_child;
2576                 }
2577
2578                 /*
2579                  * Some H/W events are generated before the COMM event,
2580                  * which is emitted during exec(), so perf script
2581                  * cannot see a correct process name for those events.
2582                  * Synthesize a COMM event to prevent this.
2583                  */
2584                 tgid = perf_event__synthesize_comm(tool, event,
2585                                                    rec->evlist->workload.pid,
2586                                                    process_synthesized_event,
2587                                                    machine);
2588                 free(event);
2589
2590                 if (tgid == -1)
2591                         goto out_child;
2592
2593                 event = malloc(sizeof(event->namespaces) +
2594                                (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2595                                machine->id_hdr_size);
2596                 if (event == NULL) {
2597                         err = -ENOMEM;
2598                         goto out_child;
2599                 }
2600
2601                 /*
2602                  * Synthesize NAMESPACES event for the command specified.
2603                  */
2604                 perf_event__synthesize_namespaces(tool, event,
2605                                                   rec->evlist->workload.pid,
2606                                                   tgid, process_synthesized_event,
2607                                                   machine);
2608                 free(event);
2609
2610                 evlist__start_workload(rec->evlist);
2611         }
2612
2613         if (opts->target.initial_delay) {
2614                 pr_info(EVLIST_DISABLED_MSG);
2615                 if (opts->target.initial_delay > 0) {
2616                         usleep(opts->target.initial_delay * USEC_PER_MSEC);
2617                         evlist__enable(rec->evlist);
2618                         pr_info(EVLIST_ENABLED_MSG);
2619                 }
2620         }
2621
2622         err = event_enable_timer__start(rec->evlist->eet);
2623         if (err)
2624                 goto out_child;
2625
2626         /* Debug message used by test scripts */
2627         pr_debug3("perf record has started\n");
2628         fflush(stderr);
2629
2630         trigger_ready(&auxtrace_snapshot_trigger);
2631         trigger_ready(&switch_output_trigger);
2632         perf_hooks__invoke_record_start();
2633
2634         /*
2635          * Must write FINISHED_INIT so it will be seen after all other
2636          * synthesized user events, but before any regular events.
2637          */
2638         err = write_finished_init(rec, false);
2639         if (err < 0)
2640                 goto out_child;
2641
2642         for (;;) {
2643                 unsigned long long hits = thread->samples;
2644
2645                 /*
2646                  * rec->evlist->bkw_mmap_state can be BKW_MMAP_EMPTY
2647                  * here: when done == true and hits != rec->samples
2648                  * in the previous round.
2649                  *
2650                  * evlist__toggle_bkw_mmap() ensures we never convert
2651                  * BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2652                  */
2653                 if (trigger_is_hit(&switch_output_trigger) || done || draining)
2654                         evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2655
2656                 if (record__mmap_read_all(rec, false) < 0) {
2657                         trigger_error(&auxtrace_snapshot_trigger);
2658                         trigger_error(&switch_output_trigger);
2659                         err = -1;
2660                         goto out_child;
2661                 }
2662
2663                 if (auxtrace_record__snapshot_started) {
2664                         auxtrace_record__snapshot_started = 0;
2665                         if (!trigger_is_error(&auxtrace_snapshot_trigger))
2666                                 record__read_auxtrace_snapshot(rec, false);
2667                         if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2668                                 pr_err("AUX area tracing snapshot failed\n");
2669                                 err = -1;
2670                                 goto out_child;
2671                         }
2672                 }
2673
2674                 if (trigger_is_hit(&switch_output_trigger)) {
2675                         /*
2676                          * If switch_output_trigger is hit, the data in the
2677                          * overwritable ring buffer should have been collected,
2678                          * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2679                          *
2680                          * If SIGUSR2 was raised after or during record__mmap_read_all(),
2681                          * it didn't collect data from the overwritable ring buffer.
2682                          * Read again.
2683                          */
2684                         if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2685                                 continue;
2686                         trigger_ready(&switch_output_trigger);
2687
2688                         /*
2689                          * Reenable events in overwrite ring buffer after
2690                          * record__mmap_read_all(): we should have collected
2691                          * data from it.
2692                          */
2693                         evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2694
2695                         if (!quiet)
2696                                 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2697                                         record__waking(rec));
2698                         thread->waking = 0;
2699                         fd = record__switch_output(rec, false);
2700                         if (fd < 0) {
2701                                 pr_err("Failed to switch to new file\n");
2702                                 trigger_error(&switch_output_trigger);
2703                                 err = fd;
2704                                 goto out_child;
2705                         }
2706
2707                         /* re-arm the alarm */
2708                         if (rec->switch_output.time)
2709                                 alarm(rec->switch_output.time);
2710                 }
2711
2712                 if (hits == thread->samples) {
2713                         if (done || draining)
2714                                 break;
2715                         err = fdarray__poll(&thread->pollfd, -1);
2716                         /*
2717                          * Propagate the error only if there is one. Ignore a positive
2718                          * number of returned events and an interrupted poll (EINTR).
2719                          */
2720                         if (err > 0 || (err < 0 && errno == EINTR))
2721                                 err = 0;
2722                         thread->waking++;
2723
2724                         if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2725                                             record__thread_munmap_filtered, NULL) == 0)
2726                                 draining = true;
2727
2728                         err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2729                         if (err)
2730                                 goto out_child;
2731                 }
2732
2733                 if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2734                         switch (cmd) {
2735                         case EVLIST_CTL_CMD_SNAPSHOT:
2736                                 hit_auxtrace_snapshot_trigger(rec);
2737                                 evlist__ctlfd_ack(rec->evlist);
2738                                 break;
2739                         case EVLIST_CTL_CMD_STOP:
2740                                 done = 1;
2741                                 break;
2742                         case EVLIST_CTL_CMD_ACK:
2743                         case EVLIST_CTL_CMD_UNSUPPORTED:
2744                         case EVLIST_CTL_CMD_ENABLE:
2745                         case EVLIST_CTL_CMD_DISABLE:
2746                         case EVLIST_CTL_CMD_EVLIST:
2747                         case EVLIST_CTL_CMD_PING:
2748                         default:
2749                                 break;
2750                         }
2751                 }
2752
2753                 err = event_enable_timer__process(rec->evlist->eet);
2754                 if (err < 0)
2755                         goto out_child;
2756                 if (err) {
2757                         err = 0;
2758                         done = 1;
2759                 }
2760
2761                 /*
2762                  * When perf is starting the traced process, the events die
2763                  * with the process at the end and we wait for that. Thus there
2764                  * is no need to disable the events in this case.
2765                  */
2766                 if (done && !disabled && !target__none(&opts->target)) {
2767                         trigger_off(&auxtrace_snapshot_trigger);
2768                         evlist__disable(rec->evlist);
2769                         disabled = true;
2770                 }
2771         }
2772
2773         trigger_off(&auxtrace_snapshot_trigger);
2774         trigger_off(&switch_output_trigger);
2775
2776         if (opts->auxtrace_snapshot_on_exit)
2777                 record__auxtrace_snapshot_exit(rec);
2778
2779         if (forks && workload_exec_errno) {
2780                 char msg[STRERR_BUFSIZE], strevsels[2048];
2781                 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2782
2783                 evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2784
2785                 pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2786                         strevsels, argv[0], emsg);
2787                 err = -1;
2788                 goto out_child;
2789         }
2790
2791         if (!quiet)
2792                 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2793                         record__waking(rec));
2794
2795         write_finished_init(rec, true);
2796
2797         if (target__none(&rec->opts.target))
2798                 record__synthesize_workload(rec, true);
2799
2800 out_child:
2801         record__stop_threads(rec);
2802         record__mmap_read_all(rec, true);
2803 out_free_threads:
2804         record__free_thread_data(rec);
2805         evlist__finalize_ctlfd(rec->evlist);
2806         record__aio_mmap_read_sync(rec);
2807
2808         if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2809                 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2810                 session->header.env.comp_ratio = ratio + 0.5;
2811         }
2812
2813         if (forks) {
2814                 int exit_status;
2815
2816                 if (!child_finished)
2817                         kill(rec->evlist->workload.pid, SIGTERM);
2818
2819                 wait(&exit_status);
2820
2821                 if (err < 0)
2822                         status = err;
2823                 else if (WIFEXITED(exit_status))
2824                         status = WEXITSTATUS(exit_status);
2825                 else if (WIFSIGNALED(exit_status))
2826                         signr = WTERMSIG(exit_status);
2827         } else
2828                 status = err;
2829
2830         if (rec->off_cpu)
2831                 rec->bytes_written += off_cpu_write(rec->session);
2832
2833         record__read_lost_samples(rec);
2834         record__synthesize(rec, true);
2835         /* this will be recalculated during process_buildids() */
2836         rec->samples = 0;
2837
2838         if (!err) {
2839                 if (!rec->timestamp_filename) {
2840                         record__finish_output(rec);
2841                 } else {
2842                         fd = record__switch_output(rec, true);
2843                         if (fd < 0) {
2844                                 status = fd;
2845                                 goto out_delete_session;
2846                         }
2847                 }
2848         }
2849
2850         perf_hooks__invoke_record_end();
2851
2852         if (!err && !quiet) {
2853                 char samples[128];
2854                 const char *postfix = rec->timestamp_filename ?
2855                                         ".<timestamp>" : "";
2856
2857                 if (rec->samples && !rec->opts.full_auxtrace)
2858                         scnprintf(samples, sizeof(samples),
2859                                   " (%" PRIu64 " samples)", rec->samples);
2860                 else
2861                         samples[0] = '\0';
2862
2863                 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
2864                         perf_data__size(data) / 1024.0 / 1024.0,
2865                         data->path, postfix, samples);
2866                 if (ratio) {
2867                         fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
2868                                         rec->session->bytes_transferred / 1024.0 / 1024.0,
2869                                         ratio);
2870                 }
2871                 fprintf(stderr, " ]\n");
2872         }
2873
2874 out_delete_session:
2875 #ifdef HAVE_EVENTFD_SUPPORT
2876         if (done_fd >= 0) {
2877                 fd = done_fd;
2878                 done_fd = -1;
2879
2880                 close(fd);
2881         }
2882 #endif
2883         zstd_fini(&session->zstd_data);
2884         if (!opts->no_bpf_event)
2885                 evlist__stop_sb_thread(rec->sb_evlist);
2886
2887         perf_session__delete(session);
2888         return status;
2889 }
2890
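     /* Log the configured callchain mode, and the stack dump size for DWARF. */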
2891 static void callchain_debug(struct callchain_param *callchain)
2892 {
2893         static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2894
2895         pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2896
2897         if (callchain->record_mode == CALLCHAIN_DWARF)
2898                 pr_debug("callchain: stack dump size %d\n",
2899                          callchain->dump_size);
2900 }
2901
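     /*
      * Parse --call-graph / --no-call-graph. Besides setting the record mode
      * (fp, dwarf, lbr), DWARF unwinding also turns on data address sampling.
      */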
2902 int record_opts__parse_callchain(struct record_opts *record,
2903                                  struct callchain_param *callchain,
2904                                  const char *arg, bool unset)
2905 {
2906         int ret;
2907         callchain->enabled = !unset;
2908
2909         /* --no-call-graph */
2910         if (unset) {
2911                 callchain->record_mode = CALLCHAIN_NONE;
2912                 pr_debug("callchain: disabled\n");
2913                 return 0;
2914         }
2915
2916         ret = parse_callchain_record_opt(arg, callchain);
2917         if (!ret) {
2918                 /* Enable data address sampling for DWARF unwind. */
2919                 if (callchain->record_mode == CALLCHAIN_DWARF)
2920                         record->sample_address = true;
2921                 callchain_debug(callchain);
2922         }
2923
2924         return ret;
2925 }
2926
2927 int record_parse_callchain_opt(const struct option *opt,
2928                                const char *arg,
2929                                int unset)
2930 {
2931         return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2932 }
2933
2934 int record_callchain_opt(const struct option *opt,
2935                          const char *arg __maybe_unused,
2936                          int unset __maybe_unused)
2937 {
2938         struct callchain_param *callchain = opt->value;
2939
2940         callchain->enabled = true;
2941
2942         if (callchain->record_mode == CALLCHAIN_NONE)
2943                 callchain->record_mode = CALLCHAIN_FP;
2944
2945         callchain_debug(callchain);
2946         return 0;
2947 }
2948
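     /*
      * Handle 'perf config' keys for record: record.build-id,
      * record.call-graph, record.aio (with AIO support) and record.debuginfod.
      * Other keys fall through and are ignored here.
      */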
2949 static int perf_record_config(const char *var, const char *value, void *cb)
2950 {
2951         struct record *rec = cb;
2952
2953         if (!strcmp(var, "record.build-id")) {
2954                 if (!strcmp(value, "cache"))
2955                         rec->no_buildid_cache = false;
2956                 else if (!strcmp(value, "no-cache"))
2957                         rec->no_buildid_cache = true;
2958                 else if (!strcmp(value, "skip"))
2959                         rec->no_buildid = true;
2960                 else if (!strcmp(value, "mmap"))
2961                         rec->buildid_mmap = true;
2962                 else
2963                         return -1;
2964                 return 0;
2965         }
2966         if (!strcmp(var, "record.call-graph")) {
2967                 var = "call-graph.record-mode";
2968                 return perf_default_config(var, value, cb);
2969         }
2970 #ifdef HAVE_AIO_SUPPORT
2971         if (!strcmp(var, "record.aio")) {
2972                 rec->opts.nr_cblocks = strtol(value, NULL, 0);
2973                 if (!rec->opts.nr_cblocks)
2974                         rec->opts.nr_cblocks = nr_cblocks_default;
2975         }
2976 #endif
2977         if (!strcmp(var, "record.debuginfod")) {
2978                 rec->debuginfod.urls = strdup(value);
2979                 if (!rec->debuginfod.urls)
2980                         return -ENOMEM;
2981                 rec->debuginfod.set = true;
2982         }
2983
2984         return 0;
2985 }
2986
2987 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2988 {
2989         struct record *rec = (struct record *)opt->value;
2990
2991         return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2992 }
2993
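     /* Parse --affinity: "node" or "cpu"; any other value keeps the default. */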
2994 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2995 {
2996         struct record_opts *opts = (struct record_opts *)opt->value;
2997
2998         if (unset || !str)
2999                 return 0;
3000
3001         if (!strcasecmp(str, "node"))
3002                 opts->affinity = PERF_AFFINITY_NODE;
3003         else if (!strcasecmp(str, "cpu"))
3004                 opts->affinity = PERF_AFFINITY_CPU;
3005
3006         return 0;
3007 }
3008
3009 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3010 {
3011         mask->nbits = nr_bits;
3012         mask->bits = bitmap_zalloc(mask->nbits);
3013         if (!mask->bits)
3014                 return -ENOMEM;
3015
3016         return 0;
3017 }
3018
3019 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3020 {
3021         bitmap_free(mask->bits);
3022         mask->nbits = 0;
3023 }
3024
3025 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3026 {
3027         int ret;
3028
3029         ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3030         if (ret) {
3031                 mask->affinity.bits = NULL;
3032                 return ret;
3033         }
3034
3035         ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3036         if (ret) {
3037                 record__mmap_cpu_mask_free(&mask->maps);
3038                 mask->maps.bits = NULL;
3039         }
3040
3041         return ret;
3042 }
3043
3044 static void record__thread_mask_free(struct thread_mask *mask)
3045 {
3046         record__mmap_cpu_mask_free(&mask->maps);
3047         record__mmap_cpu_mask_free(&mask->affinity);
3048 }
3049
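     /*
      * Parse --threads: an empty spec selects one streaming thread per CPU;
      * a known tag (cpu, core, package, numa) selects the matching layout and
      * anything else is kept as a user defined maps/affinity specification.
      */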
3050 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3051 {
3052         int s;
3053         struct record_opts *opts = opt->value;
3054
3055         if (unset || !str || !strlen(str)) {
3056                 opts->threads_spec = THREAD_SPEC__CPU;
3057         } else {
3058                 for (s = 1; s < THREAD_SPEC__MAX; s++) {
3059                         if (s == THREAD_SPEC__USER) {
3060                                 opts->threads_user_spec = strdup(str);
3061                                 if (!opts->threads_user_spec)
3062                                         return -ENOMEM;
3063                                 opts->threads_spec = THREAD_SPEC__USER;
3064                                 break;
3065                         }
3066                         if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3067                                 opts->threads_spec = s;
3068                                 break;
3069                         }
3070                 }
3071         }
3072
3073         if (opts->threads_spec == THREAD_SPEC__USER)
3074                 pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3075         else
3076                 pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3077
3078         return 0;
3079 }
3080
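     /* Parse --max-size, accepting B/K/M/G suffixes; --no-max-size resets the limit to 0. */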
3081 static int parse_output_max_size(const struct option *opt,
3082                                  const char *str, int unset)
3083 {
3084         unsigned long *s = (unsigned long *)opt->value;
3085         static struct parse_tag tags_size[] = {
3086                 { .tag  = 'B', .mult = 1       },
3087                 { .tag  = 'K', .mult = 1 << 10 },
3088                 { .tag  = 'M', .mult = 1 << 20 },
3089                 { .tag  = 'G', .mult = 1 << 30 },
3090                 { .tag  = 0 },
3091         };
3092         unsigned long val;
3093
3094         if (unset) {
3095                 *s = 0;
3096                 return 0;
3097         }
3098
3099         val = parse_tag_value(str, tags_size);
3100         if (val != (unsigned long) -1) {
3101                 *s = val;
3102                 return 0;
3103         }
3104
3105         return -1;
3106 }
3107
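     /*
      * Parse -m/--mmap-pages "pages[,pages]": the first value sizes the data
      * mmaps, the optional second one sizes the AUX area tracing mmaps.
      */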
3108 static int record__parse_mmap_pages(const struct option *opt,
3109                                     const char *str,
3110                                     int unset __maybe_unused)
3111 {
3112         struct record_opts *opts = opt->value;
3113         char *s, *p;
3114         unsigned int mmap_pages;
3115         int ret;
3116
3117         if (!str)
3118                 return -EINVAL;
3119
3120         s = strdup(str);
3121         if (!s)
3122                 return -ENOMEM;
3123
3124         p = strchr(s, ',');
3125         if (p)
3126                 *p = '\0';
3127
3128         if (*s) {
3129                 ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3130                 if (ret)
3131                         goto out_free;
3132                 opts->mmap_pages = mmap_pages;
3133         }
3134
3135         if (!p) {
3136                 ret = 0;
3137                 goto out_free;
3138         }
3139
3140         ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3141         if (ret)
3142                 goto out_free;
3143
3144         opts->auxtrace_mmap_pages = mmap_pages;
3145
3146 out_free:
3147         free(s);
3148         return ret;
3149 }
3150
3151 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3152 {
3153 }
3154
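     /* Parse --control fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]. */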
3155 static int parse_control_option(const struct option *opt,
3156                                 const char *str,
3157                                 int unset __maybe_unused)
3158 {
3159         struct record_opts *opts = opt->value;
3160
3161         return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3162 }
3163
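     /*
      * Warn when the --switch-output size threshold is below half of the mmap
      * buffer size, as output files are then likely to exceed the requested size.
      */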
3164 static void switch_output_size_warn(struct record *rec)
3165 {
3166         u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3167         struct switch_output *s = &rec->switch_output;
3168
3169         wakeup_size /= 2;
3170
3171         if (s->size < wakeup_size) {
3172                 char buf[100];
3173
3174                 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3175                 pr_warning("WARNING: switch-output data size is lower than the "
3176                            "wakeup kernel buffer size (%s), "
3177                            "expect bigger perf.data sizes\n", buf);
3178         }
3179 }
3180
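     /*
      * Configure --switch-output: "signal" rotates the output on SIGUSR2,
      * otherwise the argument is a size (B/K/M/G) or time (s/m/h/d) threshold.
      * Not supported together with parallel streaming (--threads).
      */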
3181 static int switch_output_setup(struct record *rec)
3182 {
3183         struct switch_output *s = &rec->switch_output;
3184         static struct parse_tag tags_size[] = {
3185                 { .tag  = 'B', .mult = 1       },
3186                 { .tag  = 'K', .mult = 1 << 10 },
3187                 { .tag  = 'M', .mult = 1 << 20 },
3188                 { .tag  = 'G', .mult = 1 << 30 },
3189                 { .tag  = 0 },
3190         };
3191         static struct parse_tag tags_time[] = {
3192                 { .tag  = 's', .mult = 1        },
3193                 { .tag  = 'm', .mult = 60       },
3194                 { .tag  = 'h', .mult = 60*60    },
3195                 { .tag  = 'd', .mult = 60*60*24 },
3196                 { .tag  = 0 },
3197         };
3198         unsigned long val;
3199
3200         /*
3201          * If we're using --switch-output-events, then we imply
3202          * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3203          * thread to its parent.
3204          */
3205         if (rec->switch_output_event_set) {
3206                 if (record__threads_enabled(rec)) {
3207                         pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3208                         return 0;
3209                 }
3210                 goto do_signal;
3211         }
3212
3213         if (!s->set)
3214                 return 0;
3215
3216         if (record__threads_enabled(rec)) {
3217                 pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3218                 return 0;
3219         }
3220
3221         if (!strcmp(s->str, "signal")) {
3222 do_signal:
3223                 s->signal = true;
3224                 pr_debug("switch-output with SIGUSR2 signal\n");
3225                 goto enabled;
3226         }
3227
3228         val = parse_tag_value(s->str, tags_size);
3229         if (val != (unsigned long) -1) {
3230                 s->size = val;
3231                 pr_debug("switch-output with %s size threshold\n", s->str);
3232                 goto enabled;
3233         }
3234
3235         val = parse_tag_value(s->str, tags_time);
3236         if (val != (unsigned long) -1) {
3237                 s->time = val;
3238                 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3239                          s->str, s->time);
3240                 goto enabled;
3241         }
3242
3243         return -1;
3244
3245 enabled:
3246         rec->timestamp_filename = true;
3247         s->enabled              = true;
3248
3249         if (s->size && !rec->opts.no_buffering)
3250                 switch_output_size_warn(rec);
3251
3252         return 0;
3253 }
3254
3255 static const char * const __record_usage[] = {
3256         "perf record [<options>] [<command>]",
3257         "perf record [<options>] -- <command> [<options>]",
3258         NULL
3259 };
3260 const char * const *record_usage = __record_usage;
3261
3262 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3263                                   struct perf_sample *sample, struct machine *machine)
3264 {
3265         /*
3266          * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3267          * so there is no need to add them twice.
3268          */
3269         if (!(event->header.misc & PERF_RECORD_MISC_USER))
3270                 return 0;
3271         return perf_event__process_mmap(tool, event, sample, machine);
3272 }
3273
3274 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3275                                    struct perf_sample *sample, struct machine *machine)
3276 {
3277         /*
3278          * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3279          * so there is no need to add them twice.
3280          */
3281         if (!(event->header.misc & PERF_RECORD_MISC_USER))
3282                 return 0;
3283
3284         return perf_event__process_mmap2(tool, event, sample, machine);
3285 }
3286
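     /* Update the first/last event time boundaries (see --timestamp-boundary). */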
3287 static int process_timestamp_boundary(struct perf_tool *tool,
3288                                       union perf_event *event __maybe_unused,
3289                                       struct perf_sample *sample,
3290                                       struct machine *machine __maybe_unused)
3291 {
3292         struct record *rec = container_of(tool, struct record, tool);
3293
3294         set_timestamp_boundary(rec, sample->time);
3295         return 0;
3296 }
3297
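     /* Parse --synth=no|all|task|mmap|cgroup to fine-tune event synthesis. */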
3298 static int parse_record_synth_option(const struct option *opt,
3299                                      const char *str,
3300                                      int unset __maybe_unused)
3301 {
3302         struct record_opts *opts = opt->value;
3303         char *p = strdup(str);
3304
3305         if (p == NULL)
3306                 return -1;
3307
3308         opts->synth = parse_synth_opt(p);
3309         free(p);
3310
3311         if (opts->synth < 0) {
3312                 pr_err("Invalid synth option: %s\n", str);
3313                 return -1;
3314         }
3315         return 0;
3316 }
3317
3318 /*
3319  * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
3320  * because we need access to it in record__exit(), which is called
3321  * after cmd_record() exits, but since record_options needs to be accessible to
3322  * builtin-script, leave it here.
3323  *
3324  * At least we don't touch it in all the other functions here directly.
3325  *
3326  * Just say no to tons of global variables, sigh.
3327  */
3328 static struct record record = {
3329         .opts = {
3330                 .sample_time         = true,
3331                 .mmap_pages          = UINT_MAX,
3332                 .user_freq           = UINT_MAX,
3333                 .user_interval       = ULLONG_MAX,
3334                 .freq                = 4000,
3335                 .target              = {
3336                         .uses_mmap   = true,
3337                         .default_per_cpu = true,
3338                 },
3339                 .mmap_flush          = MMAP_FLUSH_DEFAULT,
3340                 .nr_threads_synthesize = 1,
3341                 .ctl_fd              = -1,
3342                 .ctl_fd_ack          = -1,
3343                 .synth               = PERF_SYNTH_ALL,
3344         },
3345         .tool = {
3346                 .sample         = process_sample_event,
3347                 .fork           = perf_event__process_fork,
3348                 .exit           = perf_event__process_exit,
3349                 .comm           = perf_event__process_comm,
3350                 .namespaces     = perf_event__process_namespaces,
3351                 .mmap           = build_id__process_mmap,
3352                 .mmap2          = build_id__process_mmap2,
3353                 .itrace_start   = process_timestamp_boundary,
3354                 .aux            = process_timestamp_boundary,
3355                 .ordered_events = true,
3356         },
3357 };
3358
3359 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3360         "\n\t\t\t\tDefault: fp";
3361
3362 static bool dry_run;
3363
3364 static struct parse_events_option_args parse_events_option_args = {
3365         .evlistp = &record.evlist,
3366 };
3367
3368 static struct parse_events_option_args switch_output_parse_events_option_args = {
3369         .evlistp = &record.sb_evlist,
3370 };
3371
3372 /*
3373  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3374  * with it and switch to using the library functions in perf_evlist that came
3375  * from builtin-record.c, i.e. use record_opts,
3376  * evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
3377  * using pipes, etc.
3378  */
3379 static struct option __record_options[] = {
3380         OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3381                      "event selector. use 'perf list' to list available events",
3382                      parse_events_option),
3383         OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3384                      "event filter", parse_filter),
3385         OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3386                            NULL, "don't record events from perf itself",
3387                            exclude_perf),
3388         OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3389                     "record events on existing process id"),
3390         OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3391                     "record events on existing thread id"),
3392         OPT_INTEGER('r', "realtime", &record.realtime_prio,
3393                     "collect data with this RT SCHED_FIFO priority"),
3394         OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3395                     "collect data without buffering"),
3396         OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3397                     "collect raw sample records from all opened counters"),
3398         OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3399                             "system-wide collection from all CPUs"),
3400         OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3401                     "list of cpus to monitor"),
3402         OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3403         OPT_STRING('o', "output", &record.data.path, "file",
3404                     "output file name"),
3405         OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3406                         &record.opts.no_inherit_set,
3407                         "child tasks do not inherit counters"),
3408         OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3409                     "synthesize non-sample events at the end of output"),
3410         OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3411         OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3412         OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3413                     "Fail if the specified frequency can't be used"),
3414         OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3415                      "profile at this frequency",
3416                       record__parse_freq),
3417         OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3418                      "number of mmap data pages and AUX area tracing mmap pages",
3419                      record__parse_mmap_pages),
3420         OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3421                      "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3422                      record__mmap_flush_parse),
3423         OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3424                            NULL, "enables call-graph recording" ,
3425                            &record_callchain_opt),
3426         OPT_CALLBACK(0, "call-graph", &record.opts,
3427                      "record_mode[,record_size]", record_callchain_help,
3428                      &record_parse_callchain_opt),
3429         OPT_INCR('v', "verbose", &verbose,
3430                     "be more verbose (show counter open errors, etc)"),
3431         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3432         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3433                     "per thread counts"),
3434         OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3435         OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3436                     "Record the sample physical addresses"),
3437         OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3438                     "Record the sampled data address data page size"),
3439         OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3440                     "Record the sampled code address (ip) page size"),
3441         OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3442         OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3443                     "Record the sample identifier"),
3444         OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3445                         &record.opts.sample_time_set,
3446                         "Record the sample timestamps"),
3447         OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3448                         "Record the sample period"),
3449         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3450                     "don't sample"),
3451         OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3452                         &record.no_buildid_cache_set,
3453                         "do not update the buildid cache"),
3454         OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3455                         &record.no_buildid_set,
3456                         "do not collect buildids in perf.data"),
3457         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3458                      "monitor event in cgroup name only",
3459                      parse_cgroups),
3460         OPT_CALLBACK('D', "delay", &record, "ms",
3461                      "ms to wait before starting measurement after program start (-1: start with events disabled), "
3462                      "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3463                      record__parse_event_enable_time),
3464         OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3465         OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3466                    "user to profile"),
3467
3468         OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3469                      "branch any", "sample any taken branches",
3470                      parse_branch_stack),
3471
3472         OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3473                      "branch filter mask", "branch stack filter modes",
3474                      parse_branch_stack),
3475         OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3476                     "sample by weight (on special events only)"),
3477         OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3478                     "sample transaction flags (special events only)"),
3479         OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3480                     "use per-thread mmaps"),
3481         OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3482                     "sample selected machine registers on interrupt,"
3483                     " use '-I?' to list register names", parse_intr_regs),
3484         OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3485                     "sample selected machine registers on interrupt,"
3486                     " use '--user-regs=?' to list register names", parse_user_regs),
3487         OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3488                     "Record running/enabled time of read (:S) events"),
3489         OPT_CALLBACK('k', "clockid", &record.opts,
3490         "clockid", "clockid to use for events, see clock_gettime()",
3491         parse_clockid),
3492         OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3493                           "opts", "AUX area tracing Snapshot Mode", ""),
3494         OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3495                           "opts", "sample AUX area", ""),
3496         OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3497                         "per thread proc mmap processing timeout in ms"),
3498         OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3499                     "Record namespaces events"),
3500         OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3501                     "Record cgroup events"),
3502         OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3503                         &record.opts.record_switch_events_set,
3504                         "Record context switch events"),
3505         OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3506                          "Configure all used events to run in kernel space.",
3507                          PARSE_OPT_EXCLUSIVE),
3508         OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3509                          "Configure all used events to run in user space.",
3510                          PARSE_OPT_EXCLUSIVE),
3511         OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3512                     "collect kernel callchains"),
3513         OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3514                     "collect user callchains"),
3515         OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3516                    "file", "vmlinux pathname"),
3517         OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3518                     "Record build-id of all DSOs regardless of hits"),
3519         OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3520                     "Record build-id in map events"),
3521         OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3522                     "append timestamp to output filename"),
3523         OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3524                     "Record timestamp boundary (time of first/last samples)"),
3525         OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3526                           &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3527                           "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3528                           "signal"),
3529         OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3530                          &record.switch_output_event_set, "switch output event",
3531                          "switch output event selector. use 'perf list' to list available events",
3532                          parse_events_option_new_evlist),
3533         OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3534                    "Limit number of switch output generated files"),
3535         OPT_BOOLEAN(0, "dry-run", &dry_run,
3536                     "Parse options then exit"),
3537 #ifdef HAVE_AIO_SUPPORT
3538         OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3539                      &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3540                      record__aio_parse),
3541 #endif
3542         OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3543                      "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3544                      record__parse_affinity),
3545 #ifdef HAVE_ZSTD_SUPPORT
3546         OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3547                             "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3548                             record__parse_comp_level),
3549 #endif
3550         OPT_CALLBACK(0, "max-size", &record.output_max_size,
3551                      "size", "Limit the maximum size of the output file", parse_output_max_size),
3552         OPT_UINTEGER(0, "num-thread-synthesize",
3553                      &record.opts.nr_threads_synthesize,
3554                      "number of threads to run for event synthesis"),
3555 #ifdef HAVE_LIBPFM
3556         OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3557                 "libpfm4 event selector. use 'perf list' to list available events",
3558                 parse_libpfm_events_option),
3559 #endif
3560         OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3561                      "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3562                      "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3563                      "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3564                      "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3565                       parse_control_option),
3566         OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3567                      "Fine-tune event synthesis: default=all", parse_record_synth_option),
3568         OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3569                           &record.debuginfod.set, "debuginfod urls",
3570                           "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3571                           "system"),
3572         OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3573                             "write collected trace data into several data files using parallel threads",
3574                             record__parse_threads),
3575         OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3576         OPT_END()
3577 };
3578
3579 struct option *record_options = __record_options;
3580
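     /* Set a bit in the mask for every CPU present in the given cpu map. */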
3581 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3582 {
3583         struct perf_cpu cpu;
3584         int idx;
3585
3586         if (cpu_map__is_dummy(cpus))
3587                 return 0;
3588
3589         perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3590                 /* Return ENODEV if the input cpu is greater than the max cpu */
3591                 if ((unsigned long)cpu.cpu > mask->nbits)
3592                         return -ENODEV;
3593                 __set_bit(cpu.cpu, mask->bits);
3594         }
3595
3596         return 0;
3597 }
3598
3599 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3600 {
3601         struct perf_cpu_map *cpus;
3602
3603         cpus = perf_cpu_map__new(mask_spec);
3604         if (!cpus)
3605                 return -ENOMEM;
3606
3607         bitmap_zero(mask->bits, mask->nbits);
             if (record__mmap_cpu_mask_init(mask, cpus)) {
                     /* Drop the cpu map reference on the error path too. */
                     perf_cpu_map__put(cpus);
                     return -ENODEV;
             }

             perf_cpu_map__put(cpus);
3612
3613         return 0;
3614 }
3615
3616 static void record__free_thread_masks(struct record *rec, int nr_threads)
3617 {
3618         int t;
3619
3620         if (rec->thread_masks)
3621                 for (t = 0; t < nr_threads; t++)
3622                         record__thread_mask_free(&rec->thread_masks[t]);
3623
3624         zfree(&rec->thread_masks);
3625 }
3626
3627 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3628 {
3629         int t, ret;
3630
3631         rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3632         if (!rec->thread_masks) {
3633                 pr_err("Failed to allocate thread masks\n");
3634                 return -ENOMEM;
3635         }
3636
3637         for (t = 0; t < nr_threads; t++) {
3638                 ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3639                 if (ret) {
3640                         pr_err("Failed to allocate thread masks[%d]\n", t);
3641                         goto out_free;
3642                 }
3643         }
3644
3645         return 0;
3646
3647 out_free:
3648         record__free_thread_masks(rec, nr_threads);
3649
3650         return ret;
3651 }
3652
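     /* --threads=cpu: one streaming thread per monitored CPU, affine to that CPU. */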
3653 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3654 {
3655         int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3656
3657         ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3658         if (ret)
3659                 return ret;
3660
3661         rec->nr_threads = nr_cpus;
3662         pr_debug("nr_threads: %d\n", rec->nr_threads);
3663
3664         for (t = 0; t < rec->nr_threads; t++) {
3665                 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3666                 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3667                 if (verbose > 0) {
3668                         pr_debug("thread_masks[%d]: ", t);
3669                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3670                         pr_debug("thread_masks[%d]: ", t);
3671                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3672                 }
3673         }
3674
3675         return 0;
3676 }
3677
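     /*
      * Build one thread mask per maps/affinity spec pair. Each mask is
      * intersected with the monitored CPUs and must be neither empty nor
      * overlapping with any previously accepted mask.
      */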
3678 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3679                                           const char **maps_spec, const char **affinity_spec,
3680                                           u32 nr_spec)
3681 {
3682         u32 s;
3683         int ret = 0, t = 0;
3684         struct mmap_cpu_mask cpus_mask;
3685         struct thread_mask thread_mask, full_mask, *thread_masks;
3686
3687         ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3688         if (ret) {
3689                 pr_err("Failed to allocate CPUs mask\n");
3690                 return ret;
3691         }
3692
3693         ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3694         if (ret) {
3695                 pr_err("Failed to init cpu mask\n");
3696                 goto out_free_cpu_mask;
3697         }
3698
3699         ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3700         if (ret) {
3701                 pr_err("Failed to allocate full mask\n");
3702                 goto out_free_cpu_mask;
3703         }
3704
3705         ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3706         if (ret) {
3707                 pr_err("Failed to allocate thread mask\n");
3708                 goto out_free_full_and_cpu_masks;
3709         }
3710
3711         for (s = 0; s < nr_spec; s++) {
3712                 ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3713                 if (ret) {
3714                         pr_err("Failed to initialize maps thread mask\n");
3715                         goto out_free;
3716                 }
3717                 ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3718                 if (ret) {
3719                         pr_err("Failed to initialize affinity thread mask\n");
3720                         goto out_free;
3721                 }
3722
3723                 /* ignore invalid CPUs but do not allow empty masks */
3724                 if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3725                                 cpus_mask.bits, thread_mask.maps.nbits)) {
3726                         pr_err("Empty maps mask: %s\n", maps_spec[s]);
3727                         ret = -EINVAL;
3728                         goto out_free;
3729                 }
3730                 if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3731                                 cpus_mask.bits, thread_mask.affinity.nbits)) {
3732                         pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3733                         ret = -EINVAL;
3734                         goto out_free;
3735                 }
3736
3737                 /* do not allow intersection with other masks (full_mask) */
3738                 if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3739                                       thread_mask.maps.nbits)) {
3740                         pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3741                         ret = -EINVAL;
3742                         goto out_free;
3743                 }
3744                 if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3745                                       thread_mask.affinity.nbits)) {
3746                         pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3747                         ret = -EINVAL;
3748                         goto out_free;
3749                 }
3750
3751                 bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3752                           thread_mask.maps.bits, full_mask.maps.nbits);
3753                 bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3754                           thread_mask.affinity.bits, full_mask.affinity.nbits);
3755
3756                 thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3757                 if (!thread_masks) {
3758                         pr_err("Failed to reallocate thread masks\n");
3759                         ret = -ENOMEM;
3760                         goto out_free;
3761                 }
3762                 rec->thread_masks = thread_masks;
3763                 rec->thread_masks[t] = thread_mask;
3764                 if (verbose > 0) {
3765                         pr_debug("thread_masks[%d]: ", t);
3766                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3767                         pr_debug("thread_masks[%d]: ", t);
3768                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3769                 }
3770                 t++;
3771                 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3772                 if (ret) {
3773                         pr_err("Failed to allocate thread mask\n");
3774                         goto out_free_full_and_cpu_masks;
3775                 }
3776         }
3777         rec->nr_threads = t;
3778         pr_debug("nr_threads: %d\n", rec->nr_threads);
3779         if (!rec->nr_threads)
3780                 ret = -EINVAL;
3781
3782 out_free:
3783         record__thread_mask_free(&thread_mask);
3784 out_free_full_and_cpu_masks:
3785         record__thread_mask_free(&full_mask);
3786 out_free_cpu_mask:
3787         record__mmap_cpu_mask_free(&cpus_mask);
3788
3789         return ret;
3790 }
3791
3792 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3793 {
3794         int ret;
3795         struct cpu_topology *topo;
3796
3797         topo = cpu_topology__new();
3798         if (!topo) {
3799                 pr_err("Failed to allocate CPU topology\n");
3800                 return -ENOMEM;
3801         }
3802
3803         ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3804                                              topo->core_cpus_list, topo->core_cpus_lists);
3805         cpu_topology__delete(topo);
3806
3807         return ret;
3808 }
3809
3810 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3811 {
3812         int ret;
3813         struct cpu_topology *topo;
3814
3815         topo = cpu_topology__new();
3816         if (!topo) {
3817                 pr_err("Failed to allocate CPU topology\n");
3818                 return -ENOMEM;
3819         }
3820
3821         ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3822                                              topo->package_cpus_list, topo->package_cpus_lists);
3823         cpu_topology__delete(topo);
3824
3825         return ret;
3826 }
3827
3828 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3829 {
3830         u32 s;
3831         int ret;
3832         const char **spec;
3833         struct numa_topology *topo;
3834
3835         topo = numa_topology__new();
3836         if (!topo) {
3837                 pr_err("Failed to allocate NUMA topology\n");
3838                 return -ENOMEM;
3839         }
3840
3841         spec = zalloc(topo->nr * sizeof(char *));
3842         if (!spec) {
3843                 pr_err("Failed to allocate NUMA spec\n");
3844                 ret = -ENOMEM;
3845                 goto out_delete_topo;
3846         }
3847         for (s = 0; s < topo->nr; s++)
3848                 spec[s] = topo->nodes[s].cpus;
3849
3850         ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3851
3852         zfree(&spec);
3853
3854 out_delete_topo:
3855         numa_topology__delete(topo);
3856
3857         return ret;
3858 }
3859
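     /*
      * Split a user --threads spec of <maps cpus>/<affinity cpus> entries
      * separated by ':' (for example "0-3/0:4-7/4") into maps and affinity
      * spec arrays and hand them to record__init_thread_masks_spec().
      */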
3860 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3861 {
3862         int t, ret;
3863         u32 s, nr_spec = 0;
3864         char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3865         char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3866
3867         for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3868                 spec = strtok_r(user_spec, ":", &spec_ptr);
3869                 if (spec == NULL)
3870                         break;
3871                 pr_debug2("threads_spec[%d]: %s\n", t, spec);
3872                 mask = strtok_r(spec, "/", &mask_ptr);
3873                 if (mask == NULL)
3874                         break;
3875                 pr_debug2("  maps mask: %s\n", mask);
3876                 tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3877                 if (!tmp_spec) {
3878                         pr_err("Failed to reallocate maps spec\n");
3879                         ret = -ENOMEM;
3880                         goto out_free;
3881                 }
3882                 maps_spec = tmp_spec;
3883                 maps_spec[nr_spec] = dup_mask = strdup(mask);
3884                 if (!maps_spec[nr_spec]) {
3885                         pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3886                         ret = -ENOMEM;
3887                         goto out_free;
3888                 }
3889                 mask = strtok_r(NULL, "/", &mask_ptr);
3890                 if (mask == NULL) {
3891                         pr_err("Invalid thread maps or affinity specs\n");
3892                         ret = -EINVAL;
3893                         goto out_free;
3894                 }
3895                 pr_debug2("  affinity mask: %s\n", mask);
3896                 tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3897                 if (!tmp_spec) {
3898                         pr_err("Failed to reallocate affinity spec\n");
3899                         ret = -ENOMEM;
3900                         goto out_free;
3901                 }
3902                 affinity_spec = tmp_spec;
3903                 affinity_spec[nr_spec] = strdup(mask);
3904                 if (!affinity_spec[nr_spec]) {
3905                         pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3906                         ret = -ENOMEM;
3907                         goto out_free;
3908                 }
3909                 dup_mask = NULL;
3910                 nr_spec++;
3911         }
3912
3913         ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3914                                              (const char **)affinity_spec, nr_spec);
3915
3916 out_free:
3917         free(dup_mask);
3918         for (s = 0; s < nr_spec; s++) {
3919                 if (maps_spec)
3920                         free(maps_spec[s]);
3921                 if (affinity_spec)
3922                         free(affinity_spec[s]);
3923         }
3924         free(affinity_spec);
3925         free(maps_spec);
3926
3927         return ret;
3928 }
3929
3930 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3931 {
3932         int ret;
3933
3934         ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3935         if (ret)
3936                 return ret;
3937
3938         if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3939                 return -ENODEV;
3940
3941         rec->nr_threads = 1;
3942
3943         return 0;
3944 }
3945
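     /*
      * Choose the thread masks layout: a single mask covering all monitored
      * CPUs by default, or per cpu/core/package/NUMA-node/user masks when
      * parallel streaming (--threads) is enabled.
      */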
3946 static int record__init_thread_masks(struct record *rec)
3947 {
3948         int ret = 0;
3949         struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3950
3951         if (!record__threads_enabled(rec))
3952                 return record__init_thread_default_masks(rec, cpus);
3953
3954         if (evlist__per_thread(rec->evlist)) {
3955                 pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3956                 return -EINVAL;
3957         }
3958
3959         switch (rec->opts.threads_spec) {
3960         case THREAD_SPEC__CPU:
3961                 ret = record__init_thread_cpu_masks(rec, cpus);
3962                 break;
3963         case THREAD_SPEC__CORE:
3964                 ret = record__init_thread_core_masks(rec, cpus);
3965                 break;
3966         case THREAD_SPEC__PACKAGE:
3967                 ret = record__init_thread_package_masks(rec, cpus);
3968                 break;
3969         case THREAD_SPEC__NUMA:
3970                 ret = record__init_thread_numa_masks(rec, cpus);
3971                 break;
3972         case THREAD_SPEC__USER:
3973                 ret = record__init_thread_user_masks(rec, cpus);
3974                 break;
3975         default:
3976                 break;
3977         }
3978
3979         return ret;
3980 }
3981
3982 int cmd_record(int argc, const char **argv)
3983 {
3984         int err;
3985         struct record *rec = &record;
3986         char errbuf[BUFSIZ];
3987
3988         setlocale(LC_ALL, "");
3989
3990 #ifndef HAVE_BPF_SKEL
3991 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3992         set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3993 # undef set_nobuild
3994 #endif
3995
3996         /* Disable eager loading of kernel symbols, which adds overhead to perf record. */
3997         symbol_conf.lazy_load_kernel_maps = true;
3998         rec->opts.affinity = PERF_AFFINITY_SYS;
3999
4000         rec->evlist = evlist__new();
4001         if (rec->evlist == NULL)
4002                 return -ENOMEM;
4003
4004         err = perf_config(perf_record_config, rec);
4005         if (err)
4006                 return err;
4007
4008         argc = parse_options(argc, argv, record_options, record_usage,
4009                             PARSE_OPT_STOP_AT_NON_OPTION);
4010         if (quiet)
4011                 perf_quiet_option();
4012
4013         err = symbol__validate_sym_arguments();
4014         if (err)
4015                 return err;
4016
4017         perf_debuginfod_setup(&record.debuginfod);
4018
4019         /* Make system wide (-a) the default target. */
4020         if (!argc && target__none(&rec->opts.target))
4021                 rec->opts.target.system_wide = true;
4022
4023         if (nr_cgroups && !rec->opts.target.system_wide) {
4024                 usage_with_options_msg(record_usage, record_options,
4025                         "cgroup monitoring only available in system-wide mode");
4027         }
4028
4029         if (rec->buildid_mmap) {
4030                 if (!perf_can_record_build_id()) {
4031                         pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4032                         err = -EINVAL;
4033                         goto out_opts;
4034                 }
4035                 pr_debug("Enabling build id in mmap2 events.\n");
4036                 /* Enable mmap build id synthesizing. */
4037                 symbol_conf.buildid_mmap2 = true;
4038                 /* Enable perf_event_attr::build_id bit. */
4039                 rec->opts.build_id = true;
4040                 /* Disable build id cache. */
4041                 rec->no_buildid = true;
4042         }
4043
4044         if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4045                 pr_err("Kernel has no cgroup sampling support.\n");
4046                 err = -EINVAL;
4047                 goto out_opts;
4048         }
4049
4050         if (rec->opts.kcore)
4051                 rec->opts.text_poke = true;
4052
4053         if (rec->opts.kcore || record__threads_enabled(rec))
4054                 rec->data.is_dir = true;
4055
4056         if (record__threads_enabled(rec)) {
4057                 if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4058                         pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4059                         goto out_opts;
4060                 }
4061                 if (record__aio_enabled(rec)) {
4062                         pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4063                         goto out_opts;
4064                 }
4065         }
4066
4067         if (rec->opts.comp_level != 0) {
4068                 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4069                 rec->no_buildid = true;
4070         }
4071
4072         if (rec->opts.record_switch_events &&
4073             !perf_can_record_switch_events()) {
4074                 ui__error("kernel does not support recording context switch events\n");
4075                 parse_options_usage(record_usage, record_options, "switch-events", 0);
4076                 err = -EINVAL;
4077                 goto out_opts;
4078         }
4079
4080         if (switch_output_setup(rec)) {
4081                 parse_options_usage(record_usage, record_options, "switch-output", 0);
4082                 err = -EINVAL;
4083                 goto out_opts;
4084         }
4085
4086         if (rec->switch_output.time) {
4087                 signal(SIGALRM, alarm_sig_handler);
4088                 alarm(rec->switch_output.time);
4089         }
4090
4091         if (rec->switch_output.num_files) {
4092                 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4093                                                       sizeof(char *));
4094                 if (!rec->switch_output.filenames) {
4095                         err = -EINVAL;
4096                         goto out_opts;
4097                 }
4098         }
4099
4100         if (rec->timestamp_filename && record__threads_enabled(rec)) {
4101                 rec->timestamp_filename = false;
4102                 pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4103         }
4104
4105         /*
4106          * Allow aliases to facilitate the lookup of symbols for address
4107          * filters. Refer to auxtrace_parse_filters().
4108          */
4109         symbol_conf.allow_aliases = true;
4110
4111         symbol__init(NULL);
4112
4113         err = record__auxtrace_init(rec);
4114         if (err)
4115                 goto out;
4116
4117         if (dry_run)
4118                 goto out;
4119
4120         err = -ENOMEM;
4121
4122         if (rec->no_buildid_cache || rec->no_buildid) {
4123                 disable_buildid_cache();
4124         } else if (rec->switch_output.enabled) {
4125                 /*
4126                  * In 'perf record --switch-output', disable buildid
4127                  * generation by default to reduce data file switching
4128                  * overhead. Still generate buildids if they are required
4129                  * explicitly using
4130                  *
4131                  *  perf record --switch-output --no-no-buildid \
4132                  *              --no-no-buildid-cache
4133                  *
4134                  * The following code is equivalent to:
4135                  *
4136                  * if ((rec->no_buildid || !rec->no_buildid_set) &&
4137                  *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4138                  *         disable_buildid_cache();
4139                  */
4140                 bool disable = true;
4141
4142                 if (rec->no_buildid_set && !rec->no_buildid)
4143                         disable = false;
4144                 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4145                         disable = false;
4146                 if (disable) {
4147                         rec->no_buildid = true;
4148                         rec->no_buildid_cache = true;
4149                         disable_buildid_cache();
4150                 }
4151         }
4152
4153         if (record.opts.overwrite)
4154                 record.opts.tail_synthesize = true;
4155
4156         if (rec->evlist->core.nr_entries == 0) {
4157                 bool can_profile_kernel = perf_event_paranoid_check(1);
4158
4159                 err = parse_event(rec->evlist, can_profile_kernel ? "cycles:P" : "cycles:Pu");
4160                 if (err)
4161                         goto out;
4162         }
4163
4164         if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4165                 rec->opts.no_inherit = true;
4166
4167         err = target__validate(&rec->opts.target);
4168         if (err) {
4169                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4170                 ui__warning("%s\n", errbuf);
4171         }
4172
4173         err = target__parse_uid(&rec->opts.target);
4174         if (err) {
4175                 int saved_errno = errno;
4176
4177                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4178                 ui__error("%s", errbuf);
4179
4180                 err = -saved_errno;
4181                 goto out;
4182         }
4183
4184         /* Enable ignoring missing threads when the -u or -p option is given. */
4185         rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4186
4187         evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4188
4189         if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4190                 arch__add_leaf_frame_record_opts(&rec->opts);
4191
4192         err = -ENOMEM;
4193         if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4194                 if (rec->opts.target.pid != NULL) {
4195                         pr_err("Couldn't create thread/CPU maps: %s\n",
4196                                 errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4197                         goto out;
4198                 } else {
4199                         usage_with_options(record_usage, record_options);
4200                 }
4201         }
4202
4203         err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4204         if (err)
4205                 goto out;
4206
4207         /*
4208          * Take all buildids when the file contains AUX area tracing
4209          * data, because decoding the trace to work out which buildids
4210          * are actually needed would take too long.
4211          */
4212         if (rec->opts.full_auxtrace)
4213                 rec->buildid_all = true;
4214
4215         if (rec->opts.text_poke) {
4216                 err = record__config_text_poke(rec->evlist);
4217                 if (err) {
4218                         pr_err("record__config_text_poke failed, error %d\n", err);
4219                         goto out;
4220                 }
4221         }
4222
4223         if (rec->off_cpu) {
4224                 err = record__config_off_cpu(rec);
4225                 if (err) {
4226                         pr_err("record__config_off_cpu failed, error %d\n", err);
4227                         goto out;
4228                 }
4229         }
4230
4231         if (record_opts__config(&rec->opts)) {
4232                 err = -EINVAL;
4233                 goto out;
4234         }
4235
4236         err = record__config_tracking_events(rec);
4237         if (err) {
4238                 pr_err("record__config_tracking_events failed, error %d\n", err);
4239                 goto out;
4240         }
4241
4242         err = record__init_thread_masks(rec);
4243         if (err) {
4244                 pr_err("Failed to initialize parallel data streaming masks\n");
4245                 goto out;
4246         }
4247
4248         if (rec->opts.nr_cblocks > nr_cblocks_max)
4249                 rec->opts.nr_cblocks = nr_cblocks_max;
4250         pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4251
4252         pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4253         pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4254
4255         if (rec->opts.comp_level > comp_level_max)
4256                 rec->opts.comp_level = comp_level_max;
4257         pr_debug("comp level: %d\n", rec->opts.comp_level);
4258
4259         err = __cmd_record(&record, argc, argv);
4260 out:
4261         evlist__delete(rec->evlist);
4262         symbol__exit();
4263         auxtrace_record__free(rec->itr);
4264 out_opts:
4265         record__free_thread_masks(rec, rec->nr_threads);
4266         rec->nr_threads = 0;
4267         evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4268         return err;
4269 }
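
/*
 * Editor's note: a few example invocations exercising the option handling in
 * the function above (spellings taken from the messages and comments in this
 * function; see perf-record(1) for the authoritative syntax):
 *
 *   # rotate the output every 30 seconds (time-based --switch-output,
 *   # handled via the SIGALRM setup above)
 *   perf record -a --switch-output=30s
 *
 *   # timestamped output file names (disabled with a warning above when
 *   # parallel streaming threads are enabled)
 *   perf record -a --switch-output --timestamp-filename
 *
 *   # keep buildid processing despite --switch-output, as described in the
 *   # buildid comment earlier in this function
 *   perf record -a --switch-output --no-no-buildid --no-no-buildid-cache
 */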
4270
4271 static void snapshot_sig_handler(int sig __maybe_unused)
4272 {
4273         struct record *rec = &record;
4274
4275         hit_auxtrace_snapshot_trigger(rec);
4276
4277         if (switch_output_signal(rec))
4278                 trigger_hit(&switch_output_trigger);
4279 }
4280
4281 static void alarm_sig_handler(int sig __maybe_unused)
4282 {
4283         struct record *rec = &record;
4284
4285         if (switch_output_time(rec))
4286                 trigger_hit(&switch_output_trigger);
4287 }
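
/*
 * Editor's sketch, not part of builtin-record.c: both handlers above mostly
 * just mark a trigger and return; the heavy work (taking an AUX snapshot,
 * switching the output file) happens later in the record loop, which polls
 * the trigger from normal context.  A minimal trigger of that flavour,
 * assuming simple READY/HIT semantics loosely modelled on util/trigger.h
 * (the real API has more states and helpers), could look like this:
 */
#if 0   /* illustration only, never compiled */
enum demo_trigger_state {
        DEMO_TRIGGER_OFF,       /* feature not requested */
        DEMO_TRIGGER_READY,     /* armed, waiting for a signal to hit it */
        DEMO_TRIGGER_HIT,       /* signal arrived, record loop must act */
};

struct demo_trigger {
        volatile sig_atomic_t state;
};

/* called from a signal handler: only a single async-signal-safe store */
static void demo_trigger_hit(struct demo_trigger *t)
{
        if (t->state == DEMO_TRIGGER_READY)
                t->state = DEMO_TRIGGER_HIT;
}

/* polled from the record loop */
static bool demo_trigger_is_hit(const struct demo_trigger *t)
{
        return t->state == DEMO_TRIGGER_HIT;
}

/* re-arm once the loop has acted on the hit */
static void demo_trigger_ready(struct demo_trigger *t)
{
        if (t->state != DEMO_TRIGGER_OFF)
                t->state = DEMO_TRIGGER_READY;
}
#endif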