 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
#include "util/build-id.h"
#include "util/util.h"
#include "util/parse-options.h"
#include "util/parse-events.h"
#include "util/header.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/symbol.h"
#include "util/cpumap.h"
#include "util/thread_map.h"
#include "util/data.h"

struct record {
	struct perf_tool	tool;
	struct record_opts	opts;
	u64			bytes_written;
	struct perf_data_file	file;
	struct perf_evlist	*evlist;
	struct perf_session	*session;
	const char		*progname;
	int			realtime_prio;
	bool			no_buildid;
	bool			no_buildid_cache;
	long			samples;
};

static int record__write(struct record *rec, void *bf, size_t size)
{
	if (perf_data_file__write(rec->session->file, bf, size) < 0) {
		pr_err("failed to write perf data, error: %m\n");
		return -1;
	}

	rec->bytes_written += size;
	return 0;
}

static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
{
	struct record *rec = container_of(tool, struct record, tool);
	return record__write(rec, event, event->header.size);
}

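/*
 * Drain one mmap'ed ring buffer into the output file. The kernel moves
 * "head" forward as it writes events, while md->prev remembers where the
 * previous drain stopped. The buffer is a power-of-two ring indexed through
 * md->mask, so a chunk that crosses the end of the buffer has to be written
 * in two pieces: first from the old position up to the end of the ring, then
 * from the start of the ring up to head. Illustrative numbers: with a 64KiB
 * ring, old == 60KiB and head == 68KiB, the first write covers the 4KiB at
 * the end of the buffer and the second the 4KiB that wrapped to the front.
 */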
static int record__mmap_read(struct record *rec, struct perf_mmap *md)
{
	unsigned int head = perf_mmap__read_head(md);
	unsigned int old = md->prev;
	unsigned char *data = md->base + page_size;
	unsigned long size;
	void *buf;

	if (old == head)
		return 0;

	rec->samples++;

	size = head - old;

	if ((old & md->mask) + size != (head & md->mask)) {
		/* The chunk wraps: write the piece up to the end of the ring first. */
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;

		if (record__write(rec, buf, size) < 0) {
			return -1;
		}
	}

	/* Then the piece from the (wrapped) old offset up to head. */
	buf = &data[old & md->mask];
	size = head - old;
	old += size;

	if (record__write(rec, buf, size) < 0) {
		return -1;
	}

	md->prev = old;
	perf_mmap__write_tail(md, old);
	return 0;
}

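/*
 * These flags are set from signal context, hence volatile: "done" terminates
 * the event-reading loop in __cmd_record(), "child_finished" notes that the
 * forked workload already exited (SIGCHLD), and "signr" remembers which
 * terminating signal arrived so record__sig_exit() can hand it back to the
 * default disposition on exit.
 */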
static volatile int done = 0;
static volatile int signr = -1;
static volatile int child_finished = 0;

static void sig_handler(int sig)
{
	if (sig == SIGCHLD)
		child_finished = 1;
	else
		signr = sig;

	done = 1;
}

static void record__sig_exit(void)
{
	if (signr == -1)
		return;

	signal(signr, SIG_DFL);
	raise(signr);
}

static int record__open(struct record *rec)
{
	char msg[512];
	struct perf_evsel *pos;
	struct perf_evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct record_opts *opts = &rec->opts;

	perf_evlist__config(evlist, opts);

	evlist__for_each(evlist, pos) {
		if (perf_evsel__open(pos, evlist->cpus, evlist->threads) < 0) {
			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
				ui__warning("%s\n", msg);

			perf_evsel__open_strerror(pos, &opts->target,
						  errno, msg, sizeof(msg));
			ui__error("%s\n", msg);

	if (perf_evlist__apply_filters(evlist)) {
		error("failed to set filter with %d (%s)\n", errno,

	if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %u)\n", opts->mmap_pages);

		pr_err("failed to mmap with %d (%s)\n", errno, strerror(errno));

	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);

static int process_buildids(struct record *rec)
{
	struct perf_data_file *file = &rec->file;
	struct perf_session *session = rec->session;
	u64 start = session->header.data_offset;

	u64 size = lseek(file->fd, 0, SEEK_CUR);
	if (size == 0)
		return 0;

	return __perf_session__process_events(session, start,
					      size - start,
					      size, &build_id__mark_dso_hit_ops);
}

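/*
 * process_buildids() above is a second pass over the data just recorded: the
 * events between the start of the data section and the current write offset
 * are re-read with build_id__mark_dso_hit_ops, which marks only the DSOs
 * that samples actually hit, so the header feature section ends up carrying
 * build-ids for those DSOs alone.
 */
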
static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;

	/*
	 * As for the guest kernel: when processing the record and report
	 * subcommands, we arrange the module mmaps prior to the guest kernel
	 * mmap and trigger a DSO preload, because by default guest module
	 * symbols are loaded from guest kallsyms instead of
	 * /lib/modules/XXX/XXX. This avoids missing symbols when the first
	 * address falls in a module rather than in the guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for the guest kernel because the guest kernel's
	 * /proc/kallsyms sometimes has no _text.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}

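/*
 * PERF_RECORD_FINISHED_ROUND is a synthetic event emitted after each full
 * pass over the mmap buffers (see record__mmap_read_all() below). It tells
 * the session layer on the report side that everything older than the
 * previous round has been flushed, so queued events can safely be sorted by
 * timestamp and processed.
 */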
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

static int record__mmap_read_all(struct record *rec)
{
	int i;
	int rc = 0;

	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
		if (rec->evlist->mmap[i].base) {
			if (record__mmap_read(rec, &rec->evlist->mmap[i]) != 0) {
				rc = -1;
				goto out;
			}
		}
	}

	if (perf_header__has_feat(&rec->session->header, HEADER_TRACING_DATA))
		rc = record__write(rec, &finished_round_event, sizeof(finished_round_event));

out:
	return rc;
}

static void record__init_features(struct record *rec)
{
	struct perf_session *session = rec->session;
	int feat;

	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&rec->evlist->entries))
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
}

static volatile int workload_exec_errno;

/*
 * perf_evlist__prepare_workload will send a SIGUSR1 if the fork fails,
 * since we asked for that by setting want_signal to true.
 */
static void workload_exec_failed_signal(int signo __maybe_unused,
					siginfo_t *info,
					void *ucontext __maybe_unused)
{
	workload_exec_errno = info->si_value.sival_int;
	done = 1;
	child_finished = 1;
}

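/*
 * How the error travels (sketch): on exec() failure the forked child passes
 * errno through sigqueue()'s si_value, it arrives here as sival_int, and
 * __cmd_record() below turns a non-zero workload_exec_errno into a
 * "Workload failed: ..." message once the main loop ends.
 */
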
static int __cmd_record(struct record *rec, int argc, const char **argv)
{
	int err;
	int status = 0;
	unsigned long waking = 0;
	const bool forks = argc > 0;
	struct machine *machine;
	struct perf_tool *tool = &rec->tool;
	struct record_opts *opts = &rec->opts;
	struct perf_data_file *file = &rec->file;
	struct perf_session *session;
	bool disabled = false;

	rec->progname = argv[0];

	atexit(record__sig_exit);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGTERM, sig_handler);

	session = perf_session__new(file, false, NULL);
	if (session == NULL) {
		pr_err("Perf session creation failed.\n");
		return -1;
	}

	rec->session = session;

	record__init_features(rec);

	if (forks) {
		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
						    argv, file->is_pipe,
						    workload_exec_failed_signal);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			goto out_delete_session;
		}
	}

	if (record__open(rec) != 0) {
		err = -1;
		goto out_delete_session;
	}

	if (!rec->evlist->nr_groups)
		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);

	if (file->is_pipe) {
		err = perf_header__write_pipe(file->fd);
		if (err < 0)
			goto out_delete_session;
	} else {
		err = perf_session__write_header(session, rec->evlist,
						 file->fd, false);
		if (err < 0)
			goto out_delete_session;
	}

	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		err = -1;
		goto out_delete_session;
	}

	machine = &session->machines.host;

	if (file->is_pipe) {
		err = perf_event__synthesize_attrs(tool, session,
						   process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			goto out_delete_session;
		}

		if (have_tracepoints(&rec->evlist->entries)) {
			/*
			 * FIXME: err <= 0 here actually means that there were
			 * no tracepoints, so it's not really an error, just
			 * that we don't need to synthesize anything. We
			 * really have to return this more properly and also
			 * propagate errors that now are calling die().
			 */
			err = perf_event__synthesize_tracing_data(tool, file->fd, rec->evlist,
								  process_synthesized_event);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				goto out_delete_session;
			}
			rec->bytes_written += err;
		}
	}

	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine);
	if (err < 0)
		pr_err("Couldn't record kernel reference relocation symbol\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/kallsyms permission or run as root.\n");

	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record kernel module information.\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/modules permission or run as root.\n");

	machines__process_guests(&session->machines,
				 perf_event__synthesize_guest_os, tool);

	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
					    process_synthesized_event, opts->sample_address);

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_delete_session;
		}
	}

	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!target__none(&opts->target) && !opts->initial_delay)
		perf_evlist__enable(rec->evlist);

	if (forks)
		perf_evlist__start_workload(rec->evlist);

	if (opts->initial_delay) {
		usleep(opts->initial_delay * 1000);
		perf_evlist__enable(rec->evlist);
	}

	for (;;) {
		int hits = rec->samples;

		if (record__mmap_read_all(rec) < 0) {
			err = -1;
			goto out_delete_session;
		}

		if (hits == rec->samples) {
			if (done)
				break;
			err = poll(rec->evlist->pollfd, rec->evlist->nr_fds, -1);
			if (err < 0 && errno == EINTR)
				err = 0;
			waking++;
		}

		/*
		 * When perf started the traced process, the events die with
		 * it at the end and we wait for that, so there is no need to
		 * disable the events in this case.
		 */
		if (done && !disabled && !target__none(&opts->target)) {
			perf_evlist__disable(rec->evlist);
			disabled = true;
		}
	}

	if (forks && workload_exec_errno) {
		char msg[512];
		const char *emsg = strerror_r(workload_exec_errno, msg, sizeof(msg));

		pr_err("Workload failed: %s\n", emsg);
		goto out_delete_session;
	}

	fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	/*
	 * Approximate RIP event size: 24 bytes.
	 */
	fprintf(stderr,
		"[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
		(double)rec->bytes_written / 1024.0 / 1024.0,
		file->path,
		rec->bytes_written / 24);
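	/*
	 * Worked example of the estimate above (illustrative): at ~24 bytes
	 * per sample, a 2.4 MB perf.data corresponds to roughly
	 * 2.4 * 1024 * 1024 / 24, i.e. about 105000 samples.
	 */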
			kill(rec->evlist->workload.pid, SIGTERM);

		else if (WIFEXITED(exit_status))
			status = WEXITSTATUS(exit_status);
		else if (WIFSIGNALED(exit_status))
			signr = WTERMSIG(exit_status);

	if (!err && !file->is_pipe) {
		rec->session->header.data_size += rec->bytes_written;

		if (!rec->no_buildid)
			process_buildids(rec);
		perf_session__write_header(rec->session, rec->evlist,
					   file->fd, true);
	}

	perf_session__delete(session);

#define BRANCH_OPT(n, m) \
	{ .name = n, .mode = (m) }

#define BRANCH_END { .name = NULL }

static const struct branch_mode branch_modes[] = {
	BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
	BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
	BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
	BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
	BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
	BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
	BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
	BRANCH_OPT("abort_tx", PERF_SAMPLE_BRANCH_ABORT_TX),
	BRANCH_OPT("in_tx", PERF_SAMPLE_BRANCH_IN_TX),
	BRANCH_OPT("no_tx", PERF_SAMPLE_BRANCH_NO_TX),
	BRANCH_END
};

static int
parse_branch_stack(const struct option *opt, const char *str, int unset)
{
#define ONLY_PLM \
	(PERF_SAMPLE_BRANCH_USER	|\
	 PERF_SAMPLE_BRANCH_KERNEL	|\
	 PERF_SAMPLE_BRANCH_HV)

	uint64_t *mode = (uint64_t *)opt->value;
	const struct branch_mode *br;
	char *s, *os = NULL, *p;

	/*
	 * Cannot set it twice: -b + --branch-filter, for instance.
	 */

	/* str may be NULL in case no arg is passed to -b */

	/* because str is read-only */
	s = os = strdup(str);

	for (br = branch_modes; br->name; br++) {
		if (!strcasecmp(s, br->name))
			break;
	}
	if (!br->name) {
		ui__warning("unknown branch filter %s,"
			    " check man page\n", s);

	/* default to any branch */
	if ((*mode & ~ONLY_PLM) == 0) {
		*mode = PERF_SAMPLE_BRANCH_ANY;
	}
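	/*
	 * Illustration of the fallback above (not in the original source):
	 * "-j any_call,u" sets PERF_SAMPLE_BRANCH_ANY_CALL |
	 * PERF_SAMPLE_BRANCH_USER, so the default is skipped; a plain "-j u"
	 * sets only a privilege-level bit, so the branch type defaults to
	 * PERF_SAMPLE_BRANCH_ANY.
	 */
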
#ifdef HAVE_DWARF_UNWIND_SUPPORT
static int get_stack_size(char *str, unsigned long *_size)
{
	char *endptr;
	unsigned long size;
	unsigned long max_size = round_down(USHRT_MAX, sizeof(u64));

	size = strtoul(str, &endptr, 0);

	do {
		if (*endptr)
			break;

		size = round_up(size, sizeof(u64));
		if (!size || size > max_size)
			break;

		*_size = size;
		return 0;
	} while (0);

	pr_err("callchain: Incorrect stack dump size (max %ld): %s\n",
	       max_size, str);
	return -1;
}
#endif /* HAVE_DWARF_UNWIND_SUPPORT */
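/*
 * Quick sketch of the accepted range (sizeof(u64) == 8): requested sizes are
 * rounded up to a multiple of 8 and capped at round_down(USHRT_MAX, 8),
 * i.e. 65528, so "-g dwarf,4100" records 4104-byte stack dumps while
 * "-g dwarf,70000" is rejected.
 */
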
int record_parse_callchain(const char *arg, struct record_opts *opts)
{
	char *tok, *name, *saveptr = NULL;
	char *buf;
	int ret = -1;

	/* We need a buffer we know we can write to. */
	buf = malloc(strlen(arg) + 1);
	if (!buf)
		return -ENOMEM;

	strcpy(buf, arg);

	tok = strtok_r((char *)buf, ",", &saveptr);
	name = tok ? : (char *)buf;

	/* Framepointer style */
	if (!strncmp(name, "fp", sizeof("fp"))) {
		if (!strtok_r(NULL, ",", &saveptr)) {
			opts->call_graph = CALLCHAIN_FP;
			ret = 0;
		} else
			pr_err("callchain: No more arguments "
			       "needed for -g fp\n");

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* Dwarf style */
	} else if (!strncmp(name, "dwarf", sizeof("dwarf"))) {
		const unsigned long default_stack_dump_size = 8192;

		ret = 0;
		opts->call_graph = CALLCHAIN_DWARF;
		opts->stack_dump_size = default_stack_dump_size;

		tok = strtok_r(NULL, ",", &saveptr);
		if (tok) {
			unsigned long size = 0;

			ret = get_stack_size(tok, &size);
			opts->stack_dump_size = size;
		}
#endif /* HAVE_DWARF_UNWIND_SUPPORT */
	} else {
		pr_err("callchain: Unknown --call-graph option "

static void callchain_debug(struct record_opts *opts)
{
	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF" };

	pr_debug("callchain: type %s\n", str[opts->call_graph]);

	if (opts->call_graph == CALLCHAIN_DWARF)
		pr_debug("callchain: stack dump size %d\n",
			 opts->stack_dump_size);
}

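/*
 * Examples of values the parsing above accepts (illustrative):
 *
 *	perf record -g ...				(CALLCHAIN_FP)
 *	perf record --call-graph fp ...			(CALLCHAIN_FP)
 *	perf record --call-graph dwarf,4096 ...		(CALLCHAIN_DWARF,
 *							 4096-byte stack dumps)
 */
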
int record_parse_callchain_opt(const struct option *opt,
			       const char *arg,
			       int unset)
{
	struct record_opts *opts = opt->value;
	int ret;

	opts->call_graph_enabled = !unset;

	/* --no-call-graph */
	if (unset) {
		opts->call_graph = CALLCHAIN_NONE;
		pr_debug("callchain: disabled\n");
		return 0;
	}

	ret = record_parse_callchain(arg, opts);
	if (!ret)
		callchain_debug(opts);

	return ret;
}

int record_callchain_opt(const struct option *opt,
			 const char *arg __maybe_unused,
			 int unset __maybe_unused)
{
	struct record_opts *opts = opt->value;

	opts->call_graph_enabled = !unset;

	if (opts->call_graph == CALLCHAIN_NONE)
		opts->call_graph = CALLCHAIN_FP;

	callchain_debug(opts);
	return 0;
}

static int perf_record_config(const char *var, const char *value, void *cb)
{
	struct record *rec = cb;

	if (!strcmp(var, "record.call-graph"))
		return record_parse_callchain(value, &rec->opts);

	return perf_default_config(var, value, cb);
}

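/*
 * Example ~/.perfconfig stanza handled by the callback above (illustrative):
 *
 *	[record]
 *		call-graph = dwarf,8192
 *
 * perf_config() flattens this to var == "record.call-graph"; any other key
 * falls through to perf_default_config().
 */
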
static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};

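/*
 * Illustrative invocations matching the usage strings above:
 *
 *	perf record ./myworkload	(profile a workload; hypothetical binary)
 *	perf record -p 1234		(attach to an existing pid)
 *	perf record -a -- sleep 10	(system-wide, for ten seconds)
 */
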
/*
 * XXX Ideally this would be local to cmd_record() and passed to a record__new,
 * because we need access to it in record__exit(), which is called after
 * cmd_record() exits. But since record_options needs to be accessible to
 * builtin-script, leave it here.
 *
 * At least we don't touch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
static struct record record = {
	.opts = {
		.mmap_pages	= UINT_MAX,
		.user_freq	= UINT_MAX,
		.user_interval	= ULLONG_MAX,
		.target		= {
			.default_per_cpu = true,
		},
	},
};

#define CALLCHAIN_HELP "set up and enable call-graph (stack chain/backtrace) recording: "

#ifdef HAVE_DWARF_UNWIND_SUPPORT
const char record_callchain_help[] = CALLCHAIN_HELP "fp dwarf";
#else
const char record_callchain_help[] = CALLCHAIN_HELP "fp";
#endif

/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to use the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use record_opts, perf_evlist__prepare_workload,
 * etc. instead of fork+exec'ing 'perf record'.
 */
const struct option record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		   "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		   "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		   "list of cpus to monitor"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.file.path, "file",

	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
			&record.opts.no_inherit_set,
			"child tasks do not inherit counters"),
	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
	OPT_CALLBACK('m', "mmap-pages", &record.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	OPT_CALLBACK_NOOPT('g', NULL, &record.opts,
			   NULL, "enables call-graph recording",
			   &record_callchain_opt),
	OPT_CALLBACK(0, "call-graph", &record.opts,
		     "mode[,dump_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_INCR('v', "verbose", &verbose,
		 "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address,

	OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
	OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,

	OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
		    "do not update the buildid cache"),
	OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
		    "do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",

	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
		     "ms to wait before starting measurement after program start"),
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",

	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
			   "branch any", "sample any taken branches",

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",

	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
		    "sample by weight (on special events only)"),
	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
		    "sample transaction flags (special events only)"),
	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
		    "use per-thread mmaps"),
	OPT_END()
};

int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
{
	int err;
	char errbuf[BUFSIZ];
	struct record *rec = &record;

	rec->evlist = perf_evlist__new();
	if (rec->evlist == NULL)
		return -ENOMEM;

	perf_config(perf_record_config, rec);

	argc = parse_options(argc, argv, record_options, record_usage,
			     PARSE_OPT_STOP_AT_NON_OPTION);
	if (!argc && target__none(&rec->opts.target))
		usage_with_options(record_usage, record_options);

	if (nr_cgroups && !rec->opts.target.system_wide) {
		ui__error("cgroup monitoring only available in"
			  " system-wide mode\n");
		usage_with_options(record_usage, record_options);
	}

	if (symbol_conf.kptr_restrict)
		ui__warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");

	if (rec->no_buildid_cache || rec->no_buildid)
		disable_buildid_cache();

	if (rec->evlist->nr_entries == 0 &&
	    perf_evlist__add_default(rec->evlist) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out_symbol_exit;
	}

	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
		rec->opts.no_inherit = true;

	err = target__validate(&rec->opts.target);
	if (err) {
		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s", errbuf);
	}

	err = target__parse_uid(&rec->opts.target);
	if (err) {
		int saved_errno = errno;

		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

		err = -saved_errno;
		goto out_symbol_exit;
	}

	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	if (record_opts__config(&rec->opts)) {
		goto out_symbol_exit;
	}

	err = __cmd_record(&record, argc, argv);

out_symbol_exit:
	perf_evlist__delete(rec->evlist);