]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - tools/perf/builtin-trace.c
Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
[karo-tx-linux.git] / tools / perf / builtin-trace.c
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include "util/exec_cmd.h"
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include "util/parse-options.h"
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36
37 #include <libaudit.h>
38 #include <stdlib.h>
39 #include <sys/mman.h>
40 #include <linux/futex.h>
41 #include <linux/err.h>
42
/*
 * Fallbacks for older distros: each constant below is only defined here when
 * the headers included above did not already provide it.
 */
#if !defined(MAP_STACK)
#define MAP_STACK		0x20000
#endif

#if !defined(MADV_HWPOISON)
#define MADV_HWPOISON		100
#endif

#if !defined(MADV_MERGEABLE)
#define MADV_MERGEABLE		12
#endif

#if !defined(MADV_UNMERGEABLE)
#define MADV_UNMERGEABLE	13
#endif

#if !defined(EFD_SEMAPHORE)
#define EFD_SEMAPHORE		1
#endif

#if !defined(EFD_NONBLOCK)
#define EFD_NONBLOCK		00004000
#endif

#if !defined(EFD_CLOEXEC)
#define EFD_CLOEXEC		02000000
#endif

#if !defined(O_CLOEXEC)
#define O_CLOEXEC		02000000
#endif

#if !defined(SOCK_DCCP)
#define SOCK_DCCP		6
#endif

#if !defined(SOCK_CLOEXEC)
#define SOCK_CLOEXEC		02000000
#endif

#if !defined(SOCK_NONBLOCK)
#define SOCK_NONBLOCK		00004000
#endif

#if !defined(MSG_CMSG_CLOEXEC)
#define MSG_CMSG_CLOEXEC	0x40000000
#endif

#if !defined(PERF_FLAG_FD_NO_GROUP)
#define PERF_FLAG_FD_NO_GROUP	(1UL << 0)
#endif

#if !defined(PERF_FLAG_FD_OUTPUT)
#define PERF_FLAG_FD_OUTPUT	(1UL << 1)
#endif

#if !defined(PERF_FLAG_PID_CGROUP)
#define PERF_FLAG_PID_CGROUP	(1UL << 2) /* pid=cgroup id, per-cpu mode only */
#endif

#if !defined(PERF_FLAG_FD_CLOEXEC)
#define PERF_FLAG_FD_CLOEXEC	(1UL << 3) /* O_CLOEXEC */
#endif
108
109
110 struct tp_field {
111         int offset;
112         union {
113                 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
114                 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
115         };
116 };
117
118 #define TP_UINT_FIELD(bits) \
119 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
120 { \
121         u##bits value; \
122         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
123         return value;  \
124 }
125
126 TP_UINT_FIELD(8);
127 TP_UINT_FIELD(16);
128 TP_UINT_FIELD(32);
129 TP_UINT_FIELD(64);
130
131 #define TP_UINT_FIELD__SWAPPED(bits) \
132 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
133 { \
134         u##bits value; \
135         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
136         return bswap_##bits(value);\
137 }
138
139 TP_UINT_FIELD__SWAPPED(16);
140 TP_UINT_FIELD__SWAPPED(32);
141 TP_UINT_FIELD__SWAPPED(64);
142
143 static int tp_field__init_uint(struct tp_field *field,
144                                struct format_field *format_field,
145                                bool needs_swap)
146 {
147         field->offset = format_field->offset;
148
149         switch (format_field->size) {
150         case 1:
151                 field->integer = tp_field__u8;
152                 break;
153         case 2:
154                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
155                 break;
156         case 4:
157                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
158                 break;
159         case 8:
160                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
161                 break;
162         default:
163                 return -1;
164         }
165
166         return 0;
167 }
168
169 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
170 {
171         return sample->raw_data + field->offset;
172 }
173
174 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
175 {
176         field->offset = format_field->offset;
177         field->pointer = tp_field__ptr;
178         return 0;
179 }
180
181 struct syscall_tp {
182         struct tp_field id;
183         union {
184                 struct tp_field args, ret;
185         };
186 };
187
188 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
189                                           struct tp_field *field,
190                                           const char *name)
191 {
192         struct format_field *format_field = perf_evsel__field(evsel, name);
193
194         if (format_field == NULL)
195                 return -1;
196
197         return tp_field__init_uint(field, format_field, evsel->needs_swap);
198 }
199
/*
 * Initialize the struct syscall_tp member 'name' (reached via evsel->priv) as
 * an unsigned integer accessor for the tracepoint field of the same name.
 * Evaluates to the return value of perf_evsel__init_tp_uint_field().
 *
 * 'evsel' is parenthesized so any pointer expression can be passed; it is
 * expanded twice, so it must not have side effects.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = (evsel)->priv;\
	   perf_evsel__init_tp_uint_field((evsel), &sc->name, #name); })
203
/*
 * Look up the tracepoint field 'name' on 'evsel' and initialize 'field' as a
 * pointer accessor for it.
 *
 * Returns -1 when the field does not exist, otherwise whatever
 * tp_field__init_ptr() returns.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *ff = perf_evsel__field(evsel, name);

	if (ff != NULL)
		return tp_field__init_ptr(field, ff);

	return -1;
}
215
/*
 * Initialize the struct syscall_tp member 'name' (reached via evsel->priv) as
 * a pointer accessor for the tracepoint field of the same name.
 * Evaluates to the return value of perf_evsel__init_tp_ptr_field().
 *
 * 'evsel' is parenthesized so any pointer expression can be passed; it is
 * expanded twice, so it must not have side effects.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = (evsel)->priv;\
	   perf_evsel__init_tp_ptr_field((evsel), &sc->name, #name); })
219
220 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
221 {
222         zfree(&evsel->priv);
223         perf_evsel__delete(evsel);
224 }
225
226 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
227 {
228         evsel->priv = malloc(sizeof(struct syscall_tp));
229         if (evsel->priv != NULL) {
230                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
231                         goto out_delete;
232
233                 evsel->handler = handler;
234                 return 0;
235         }
236
237         return -ENOMEM;
238
239 out_delete:
240         zfree(&evsel->priv);
241         return -ENOENT;
242 }
243
/*
 * Create the evsel for the raw_syscalls:<direction> tracepoint, falling back
 * to syscalls:<direction>, and prepare it with perf_evsel__init_syscall_tp().
 *
 * Returns the new evsel, or NULL when neither tracepoint can be created or
 * the initialization fails (the evsel is deleted in that case).
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	if (IS_ERR(evsel)) {
		/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
		evsel = perf_evsel__newtp("syscalls", direction);
		if (IS_ERR(evsel))
			return NULL;
	}

	if (perf_evsel__init_syscall_tp(evsel, handler) != 0) {
		perf_evsel__delete_priv(evsel);
		return NULL;
	}

	return evsel;
}
264
/*
 * Read the integer tracepoint field 'name' from 'sample' using the accessor
 * stored in the struct syscall_tp hanging off evsel->priv.
 * Evaluates to the u64 returned by the field's integer reader.
 *
 * 'evsel' and 'sample' are parenthesized so arbitrary expressions can be
 * passed safely.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = (evsel)->priv; \
	   fields->name.integer(&fields->name, (sample)); })
268
/*
 * Get a pointer to the tracepoint field 'name' inside 'sample' using the
 * accessor stored in the struct syscall_tp hanging off evsel->priv.
 * Evaluates to the void pointer returned by the field's pointer reader.
 *
 * 'evsel' and 'sample' are parenthesized so arbitrary expressions can be
 * passed safely.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = (evsel)->priv; \
	   fields->name.pointer(&fields->name, (sample)); })
272
273 struct syscall_arg {
274         unsigned long val;
275         struct thread *thread;
276         struct trace  *trace;
277         void          *parm;
278         u8            idx;
279         u8            mask;
280 };
281
/*
 * Table mapping small integers to names: value 'v' is looked up as
 * entries[v - offset], valid for indexes 0 .. nr_entries - 1.
 */
struct strarray {
	int	    offset;	/* value that corresponds to entries[0] */
	int	    nr_entries;	/* number of slots in entries[] */
	const char **entries;
};
287
/*
 * Define 'strarray__<array>' describing the string table 'array', with
 * entries[0] standing for value 0. 'array' must be a real array (not a
 * pointer) since its length is taken with ARRAY_SIZE().
 */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.entries    = array, \
	.nr_entries = ARRAY_SIZE(array), \
}
292
/*
 * Like DEFINE_STRARRAY(), but entries[0] stands for value 'off' instead of 0.
 */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.entries    = array, \
	.nr_entries = ARRAY_SIZE(array), \
	.offset     = off, \
}
298
299 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
300                                                 const char *intfmt,
301                                                 struct syscall_arg *arg)
302 {
303         struct strarray *sa = arg->parm;
304         int idx = arg->val - sa->offset;
305
306         if (idx < 0 || idx >= sa->nr_entries)
307                 return scnprintf(bf, size, intfmt, arg->val);
308
309         return scnprintf(bf, size, "%s", sa->entries[idx]);
310 }
311
/*
 * strarray formatter that prints values without a name in decimal ("%d").
 */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	static const char fallback_fmt[] = "%d";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
319
#if defined(__i386__) || defined(__x86_64__)
/*
 * strarray formatter that prints values without a name in hex ("%#x").
 *
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 *        gets rewritten to support all arches.
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	static const char fallback_fmt[] = "%#x";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
333
/*
 * Forward declaration of the file descriptor formatter; its definition is not
 * in this part of the file.
 */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd
338
339 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
340                                            struct syscall_arg *arg)
341 {
342         int fd = arg->val;
343
344         if (fd == AT_FDCWD)
345                 return scnprintf(bf, size, "CWD");
346
347         return syscall_arg__scnprintf_fd(bf, size, arg);
348 }
349
350 #define SCA_FDAT syscall_arg__scnprintf_fd_at
351
/*
 * Forward declaration of the formatter used for the fd argument of close();
 * its definition is not in this part of the file.
 */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
356
357 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
358                                          struct syscall_arg *arg)
359 {
360         return scnprintf(bf, size, "%#lx", arg->val);
361 }
362
363 #define SCA_HEX syscall_arg__scnprintf_hex
364
365 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
366                                          struct syscall_arg *arg)
367 {
368         return scnprintf(bf, size, "%d", arg->val);
369 }
370
371 #define SCA_INT syscall_arg__scnprintf_int
372
373 static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
374                                                struct syscall_arg *arg)
375 {
376         int printed = 0, prot = arg->val;
377
378         if (prot == PROT_NONE)
379                 return scnprintf(bf, size, "NONE");
380 #define P_MMAP_PROT(n) \
381         if (prot & PROT_##n) { \
382                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
383                 prot &= ~PROT_##n; \
384         }
385
386         P_MMAP_PROT(EXEC);
387         P_MMAP_PROT(READ);
388         P_MMAP_PROT(WRITE);
389 #ifdef PROT_SEM
390         P_MMAP_PROT(SEM);
391 #endif
392         P_MMAP_PROT(GROWSDOWN);
393         P_MMAP_PROT(GROWSUP);
394 #undef P_MMAP_PROT
395
396         if (prot)
397                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
398
399         return printed;
400 }
401
402 #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
403
404 static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
405                                                 struct syscall_arg *arg)
406 {
407         int printed = 0, flags = arg->val;
408
409 #define P_MMAP_FLAG(n) \
410         if (flags & MAP_##n) { \
411                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
412                 flags &= ~MAP_##n; \
413         }
414
415         P_MMAP_FLAG(SHARED);
416         P_MMAP_FLAG(PRIVATE);
417 #ifdef MAP_32BIT
418         P_MMAP_FLAG(32BIT);
419 #endif
420         P_MMAP_FLAG(ANONYMOUS);
421         P_MMAP_FLAG(DENYWRITE);
422         P_MMAP_FLAG(EXECUTABLE);
423         P_MMAP_FLAG(FILE);
424         P_MMAP_FLAG(FIXED);
425         P_MMAP_FLAG(GROWSDOWN);
426 #ifdef MAP_HUGETLB
427         P_MMAP_FLAG(HUGETLB);
428 #endif
429         P_MMAP_FLAG(LOCKED);
430         P_MMAP_FLAG(NONBLOCK);
431         P_MMAP_FLAG(NORESERVE);
432         P_MMAP_FLAG(POPULATE);
433         P_MMAP_FLAG(STACK);
434 #ifdef MAP_UNINITIALIZED
435         P_MMAP_FLAG(UNINITIALIZED);
436 #endif
437 #undef P_MMAP_FLAG
438
439         if (flags)
440                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
441
442         return printed;
443 }
444
445 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
446
447 static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
448                                                   struct syscall_arg *arg)
449 {
450         int printed = 0, flags = arg->val;
451
452 #define P_MREMAP_FLAG(n) \
453         if (flags & MREMAP_##n) { \
454                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
455                 flags &= ~MREMAP_##n; \
456         }
457
458         P_MREMAP_FLAG(MAYMOVE);
459 #ifdef MREMAP_FIXED
460         P_MREMAP_FLAG(FIXED);
461 #endif
462 #undef P_MREMAP_FLAG
463
464         if (flags)
465                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
466
467         return printed;
468 }
469
470 #define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
471
472 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
473                                                       struct syscall_arg *arg)
474 {
475         int behavior = arg->val;
476
477         switch (behavior) {
478 #define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
479         P_MADV_BHV(NORMAL);
480         P_MADV_BHV(RANDOM);
481         P_MADV_BHV(SEQUENTIAL);
482         P_MADV_BHV(WILLNEED);
483         P_MADV_BHV(DONTNEED);
484         P_MADV_BHV(REMOVE);
485         P_MADV_BHV(DONTFORK);
486         P_MADV_BHV(DOFORK);
487         P_MADV_BHV(HWPOISON);
488 #ifdef MADV_SOFT_OFFLINE
489         P_MADV_BHV(SOFT_OFFLINE);
490 #endif
491         P_MADV_BHV(MERGEABLE);
492         P_MADV_BHV(UNMERGEABLE);
493 #ifdef MADV_HUGEPAGE
494         P_MADV_BHV(HUGEPAGE);
495 #endif
496 #ifdef MADV_NOHUGEPAGE
497         P_MADV_BHV(NOHUGEPAGE);
498 #endif
499 #ifdef MADV_DONTDUMP
500         P_MADV_BHV(DONTDUMP);
501 #endif
502 #ifdef MADV_DODUMP
503         P_MADV_BHV(DODUMP);
504 #endif
505 #undef P_MADV_PHV
506         default: break;
507         }
508
509         return scnprintf(bf, size, "%#x", behavior);
510 }
511
512 #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
513
514 static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
515                                            struct syscall_arg *arg)
516 {
517         int printed = 0, op = arg->val;
518
519         if (op == 0)
520                 return scnprintf(bf, size, "NONE");
521 #define P_CMD(cmd) \
522         if ((op & LOCK_##cmd) == LOCK_##cmd) { \
523                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
524                 op &= ~LOCK_##cmd; \
525         }
526
527         P_CMD(SH);
528         P_CMD(EX);
529         P_CMD(NB);
530         P_CMD(UN);
531         P_CMD(MAND);
532         P_CMD(RW);
533         P_CMD(READ);
534         P_CMD(WRITE);
535 #undef P_OP
536
537         if (op)
538                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
539
540         return printed;
541 }
542
543 #define SCA_FLOCK syscall_arg__scnprintf_flock
544
545 static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
546 {
547         enum syscall_futex_args {
548                 SCF_UADDR   = (1 << 0),
549                 SCF_OP      = (1 << 1),
550                 SCF_VAL     = (1 << 2),
551                 SCF_TIMEOUT = (1 << 3),
552                 SCF_UADDR2  = (1 << 4),
553                 SCF_VAL3    = (1 << 5),
554         };
555         int op = arg->val;
556         int cmd = op & FUTEX_CMD_MASK;
557         size_t printed = 0;
558
559         switch (cmd) {
560 #define P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
561         P_FUTEX_OP(WAIT);           arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
562         P_FUTEX_OP(WAKE);           arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
563         P_FUTEX_OP(FD);             arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
564         P_FUTEX_OP(REQUEUE);        arg->mask |= SCF_VAL3|SCF_TIMEOUT;            break;
565         P_FUTEX_OP(CMP_REQUEUE);    arg->mask |= SCF_TIMEOUT;                     break;
566         P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT;                     break;
567         P_FUTEX_OP(WAKE_OP);                                                      break;
568         P_FUTEX_OP(LOCK_PI);        arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
569         P_FUTEX_OP(UNLOCK_PI);      arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
570         P_FUTEX_OP(TRYLOCK_PI);     arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
571         P_FUTEX_OP(WAIT_BITSET);    arg->mask |= SCF_UADDR2;                      break;
572         P_FUTEX_OP(WAKE_BITSET);    arg->mask |= SCF_UADDR2;                      break;
573         P_FUTEX_OP(WAIT_REQUEUE_PI);                                              break;
574         default: printed = scnprintf(bf, size, "%#x", cmd);                       break;
575         }
576
577         if (op & FUTEX_PRIVATE_FLAG)
578                 printed += scnprintf(bf + printed, size - printed, "|PRIV");
579
580         if (op & FUTEX_CLOCK_REALTIME)
581                 printed += scnprintf(bf + printed, size - printed, "|CLKRT");
582
583         return printed;
584 }
585
586 #define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
587
/* Names for the 'cmd' argument of bpf(), indexed by command value from 0. */
static const char *bpf_cmd[] = {
	"MAP_CREATE",
	"MAP_LOOKUP_ELEM",
	"MAP_UPDATE_ELEM",
	"MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY",
	"PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);
593
594 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
595 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
596
/* Interval timer names, indexed by timer id from 0. */
static const char *itimers[] = {
	"REAL",
	"VIRTUAL",
	"PROF",
};
static DEFINE_STRARRAY(itimers);
599
/* Names for the 'option' argument of keyctl(), indexed by value from 0. */
static const char *keyctl_options[] = {
	/*  0 */ "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE",
	/*  4 */ "CHOWN", "SETPERM", "DESCRIBE", "CLEAR",
	/*  8 */ "LINK", "UNLINK", "SEARCH", "READ",
	/* 12 */ "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	/* 16 */ "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	/* 20 */ "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);
608
/*
 * Names for the 'whence' argument of lseek(), indexed by value from 0.
 * DATA and HOLE are only listed when the headers know SEEK_DATA/SEEK_HOLE.
 */
static const char *whences[] = {
	"SET",
	"CUR",
	"END",
#ifdef SEEK_DATA
	"DATA",
#endif
#ifdef SEEK_HOLE
	"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);
618
/* Names for the 'cmd' argument of fcntl(), indexed by command value from 0. */
static const char *fcntl_cmds[] = {
	/*  0 */ "DUPFD", "GETFD", "SETFD", "GETFL",
	/*  4 */ "SETFL", "GETLK", "SETLK", "SETLKW",
	/*  8 */ "SETOWN", "GETOWN", "SETSIG", "GETSIG",
	/* 12 */ "F_GETLK64", "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX",
	/* 16 */ "F_GETOWN_EX", "F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);
626
/* Resource names for the rlimit syscalls, indexed by resource value from 0. */
static const char *rlimit_resources[] = {
	/*  0 */ "CPU", "FSIZE", "DATA", "STACK",
	/*  4 */ "CORE", "RSS", "NPROC", "NOFILE",
	/*  8 */ "MEMLOCK", "AS", "LOCKS", "SIGPENDING",
	/* 12 */ "MSGQUEUE", "NICE", "RTPRIO", "RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);
633
/* Names for the 'how' argument of sigprocmask(), indexed by value from 0. */
static const char *sighow[] = {
	"BLOCK",
	"UNBLOCK",
	"SETMASK",
};
static DEFINE_STRARRAY(sighow);
636
/* Clock names for the clock_*() syscalls, indexed by clock id from 0. */
static const char *clockid[] = {
	/*  0 */ "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID",
	/*  3 */ "THREAD_CPUTIME_ID", "MONOTONIC_RAW", "REALTIME_COARSE",
	/*  6 */ "MONOTONIC_COARSE", "BOOTTIME", "REALTIME_ALARM",
	/*  9 */ "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);
643
/*
 * Address family names, indexed by the AF_ value from 0 (AF_UNSPEC).
 *
 * The table is positional, so every family number needs a slot: "MPLS" (28)
 * sits between "IB" (27) and "CAN" (29). Without it every family from AF_CAN
 * upwards would be printed with the name of the next one.
 */
static const char *socket_families[] = {
	/*  0 */ "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	/*  7 */ "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	/* 14 */ "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	/* 21 */ "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB",
	/* 28 */ "MPLS", "CAN", "TIPC", "BLUETOOTH", "IUCV", "RXRPC", "ISDN",
	/* 35 */ "PHONET", "IEEE802154", "CAIF", "ALG", "NFC", "VSOCK",
};
652 static DEFINE_STRARRAY(socket_families);
653
654 #ifndef SOCK_TYPE_MASK
655 #define SOCK_TYPE_MASK 0xf
656 #endif
657
658 static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
659                                                       struct syscall_arg *arg)
660 {
661         size_t printed;
662         int type = arg->val,
663             flags = type & ~SOCK_TYPE_MASK;
664
665         type &= SOCK_TYPE_MASK;
666         /*
667          * Can't use a strarray, MIPS may override for ABI reasons.
668          */
669         switch (type) {
670 #define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
671         P_SK_TYPE(STREAM);
672         P_SK_TYPE(DGRAM);
673         P_SK_TYPE(RAW);
674         P_SK_TYPE(RDM);
675         P_SK_TYPE(SEQPACKET);
676         P_SK_TYPE(DCCP);
677         P_SK_TYPE(PACKET);
678 #undef P_SK_TYPE
679         default:
680                 printed = scnprintf(bf, size, "%#x", type);
681         }
682
683 #define P_SK_FLAG(n) \
684         if (flags & SOCK_##n) { \
685                 printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
686                 flags &= ~SOCK_##n; \
687         }
688
689         P_SK_FLAG(CLOEXEC);
690         P_SK_FLAG(NONBLOCK);
691 #undef P_SK_FLAG
692
693         if (flags)
694                 printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
695
696         return printed;
697 }
698
699 #define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
700
701 #ifndef MSG_PROBE
702 #define MSG_PROBE            0x10
703 #endif
704 #ifndef MSG_WAITFORONE
705 #define MSG_WAITFORONE  0x10000
706 #endif
707 #ifndef MSG_SENDPAGE_NOTLAST
708 #define MSG_SENDPAGE_NOTLAST 0x20000
709 #endif
710 #ifndef MSG_FASTOPEN
711 #define MSG_FASTOPEN         0x20000000
712 #endif
713
714 static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
715                                                struct syscall_arg *arg)
716 {
717         int printed = 0, flags = arg->val;
718
719         if (flags == 0)
720                 return scnprintf(bf, size, "NONE");
721 #define P_MSG_FLAG(n) \
722         if (flags & MSG_##n) { \
723                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
724                 flags &= ~MSG_##n; \
725         }
726
727         P_MSG_FLAG(OOB);
728         P_MSG_FLAG(PEEK);
729         P_MSG_FLAG(DONTROUTE);
730         P_MSG_FLAG(TRYHARD);
731         P_MSG_FLAG(CTRUNC);
732         P_MSG_FLAG(PROBE);
733         P_MSG_FLAG(TRUNC);
734         P_MSG_FLAG(DONTWAIT);
735         P_MSG_FLAG(EOR);
736         P_MSG_FLAG(WAITALL);
737         P_MSG_FLAG(FIN);
738         P_MSG_FLAG(SYN);
739         P_MSG_FLAG(CONFIRM);
740         P_MSG_FLAG(RST);
741         P_MSG_FLAG(ERRQUEUE);
742         P_MSG_FLAG(NOSIGNAL);
743         P_MSG_FLAG(MORE);
744         P_MSG_FLAG(WAITFORONE);
745         P_MSG_FLAG(SENDPAGE_NOTLAST);
746         P_MSG_FLAG(FASTOPEN);
747         P_MSG_FLAG(CMSG_CLOEXEC);
748 #undef P_MSG_FLAG
749
750         if (flags)
751                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
752
753         return printed;
754 }
755
756 #define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
757
758 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
759                                                  struct syscall_arg *arg)
760 {
761         size_t printed = 0;
762         int mode = arg->val;
763
764         if (mode == F_OK) /* 0 */
765                 return scnprintf(bf, size, "F");
766 #define P_MODE(n) \
767         if (mode & n##_OK) { \
768                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
769                 mode &= ~n##_OK; \
770         }
771
772         P_MODE(R);
773         P_MODE(W);
774         P_MODE(X);
775 #undef P_MODE
776
777         if (mode)
778                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
779
780         return printed;
781 }
782
783 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
784
/*
 * Forward declaration of the filename formatter; its definition is not in
 * this part of the file.
 */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
789
790 static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
791                                                struct syscall_arg *arg)
792 {
793         int printed = 0, flags = arg->val;
794
795         if (!(flags & O_CREAT))
796                 arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
797
798         if (flags == 0)
799                 return scnprintf(bf, size, "RDONLY");
800 #define P_FLAG(n) \
801         if (flags & O_##n) { \
802                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
803                 flags &= ~O_##n; \
804         }
805
806         P_FLAG(APPEND);
807         P_FLAG(ASYNC);
808         P_FLAG(CLOEXEC);
809         P_FLAG(CREAT);
810         P_FLAG(DIRECT);
811         P_FLAG(DIRECTORY);
812         P_FLAG(EXCL);
813         P_FLAG(LARGEFILE);
814         P_FLAG(NOATIME);
815         P_FLAG(NOCTTY);
816 #ifdef O_NONBLOCK
817         P_FLAG(NONBLOCK);
818 #elif O_NDELAY
819         P_FLAG(NDELAY);
820 #endif
821 #ifdef O_PATH
822         P_FLAG(PATH);
823 #endif
824         P_FLAG(RDWR);
825 #ifdef O_DSYNC
826         if ((flags & O_SYNC) == O_SYNC)
827                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
828         else {
829                 P_FLAG(DSYNC);
830         }
831 #else
832         P_FLAG(SYNC);
833 #endif
834         P_FLAG(TRUNC);
835         P_FLAG(WRONLY);
836 #undef P_FLAG
837
838         if (flags)
839                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
840
841         return printed;
842 }
843
844 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
845
846 static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
847                                                 struct syscall_arg *arg)
848 {
849         int printed = 0, flags = arg->val;
850
851         if (flags == 0)
852                 return 0;
853
854 #define P_FLAG(n) \
855         if (flags & PERF_FLAG_##n) { \
856                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
857                 flags &= ~PERF_FLAG_##n; \
858         }
859
860         P_FLAG(FD_NO_GROUP);
861         P_FLAG(FD_OUTPUT);
862         P_FLAG(PID_CGROUP);
863         P_FLAG(FD_CLOEXEC);
864 #undef P_FLAG
865
866         if (flags)
867                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
868
869         return printed;
870 }
871
872 #define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags
873
874 static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
875                                                    struct syscall_arg *arg)
876 {
877         int printed = 0, flags = arg->val;
878
879         if (flags == 0)
880                 return scnprintf(bf, size, "NONE");
881 #define P_FLAG(n) \
882         if (flags & EFD_##n) { \
883                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
884                 flags &= ~EFD_##n; \
885         }
886
887         P_FLAG(SEMAPHORE);
888         P_FLAG(CLOEXEC);
889         P_FLAG(NONBLOCK);
890 #undef P_FLAG
891
892         if (flags)
893                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
894
895         return printed;
896 }
897
898 #define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
899
900 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
901                                                 struct syscall_arg *arg)
902 {
903         int printed = 0, flags = arg->val;
904
905 #define P_FLAG(n) \
906         if (flags & O_##n) { \
907                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
908                 flags &= ~O_##n; \
909         }
910
911         P_FLAG(CLOEXEC);
912         P_FLAG(NONBLOCK);
913 #undef P_FLAG
914
915         if (flags)
916                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
917
918         return printed;
919 }
920
921 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
922
923 static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
924 {
925         int sig = arg->val;
926
927         switch (sig) {
928 #define P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
929         P_SIGNUM(HUP);
930         P_SIGNUM(INT);
931         P_SIGNUM(QUIT);
932         P_SIGNUM(ILL);
933         P_SIGNUM(TRAP);
934         P_SIGNUM(ABRT);
935         P_SIGNUM(BUS);
936         P_SIGNUM(FPE);
937         P_SIGNUM(KILL);
938         P_SIGNUM(USR1);
939         P_SIGNUM(SEGV);
940         P_SIGNUM(USR2);
941         P_SIGNUM(PIPE);
942         P_SIGNUM(ALRM);
943         P_SIGNUM(TERM);
944         P_SIGNUM(CHLD);
945         P_SIGNUM(CONT);
946         P_SIGNUM(STOP);
947         P_SIGNUM(TSTP);
948         P_SIGNUM(TTIN);
949         P_SIGNUM(TTOU);
950         P_SIGNUM(URG);
951         P_SIGNUM(XCPU);
952         P_SIGNUM(XFSZ);
953         P_SIGNUM(VTALRM);
954         P_SIGNUM(PROF);
955         P_SIGNUM(WINCH);
956         P_SIGNUM(IO);
957         P_SIGNUM(PWR);
958         P_SIGNUM(SYS);
959 #ifdef SIGEMT
960         P_SIGNUM(EMT);
961 #endif
962 #ifdef SIGSTKFLT
963         P_SIGNUM(STKFLT);
964 #endif
965 #ifdef SIGSWI
966         P_SIGNUM(SWI);
967 #endif
968         default: break;
969         }
970
971         return scnprintf(bf, size, "%#x", sig);
972 }
973
974 #define SCA_SIGNUM syscall_arg__scnprintf_signum
975
/*
 * Names of ioctl commands, only built on x86 (32 and 64 bit).
 *
 * The table is registered with an offset of 0x5401 (the value given to
 * TCGETS below), so entry 0 is "TCGETS".
 * NOTE(review): presumably the strarray printer subtracts that offset from
 * the ioctl command before indexing -- DEFINE_STRARRAY_OFFSET is defined
 * outside this chunk, confirm there.
 *
 * The designated initializers ([0x27], [0x50], [0x60]) skip indexes, so the
 * slots in between are NULL.
 */
976 #if defined(__i386__) || defined(__x86_64__)
977 /*
978  * FIXME: Make this available to all arches.
979  */
980 #define TCGETS          0x5401
981
982 static const char *tioctls[] = {
983         "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
984         "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
985         "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
986         "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
987         "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
988         "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
989         "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
990         "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
991         "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
992         "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
993         "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
994         [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
995         "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
996         "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
997         "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
998 };
999
/* The offset literal repeats the TCGETS value defined above. */
1000 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
1001 #endif /* defined(__i386__) || defined(__x86_64__) */
1002
/*
 * Shorthand for a syscall_fmts[] entry whose argument number @arg is
 * printed through SCA_STRARRAY using the table strarray__<array>.
 *
 * @name is not referenced by the expansion; at the call sites it carries
 * the argument's name and so only serves as documentation.
 */
1003 #define STRARRAY(arg, name, array) \
1004           .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
1005           .arg_parm      = { [arg] = &strarray__##array, }
1006
/*
 * Per-syscall formatting hints.
 *
 * IMPORTANT: syscall_fmts[] must stay sorted by .name in strcmp() order,
 * because syscall_fmt__find() looks entries up with bsearch() using
 * syscall_fmt__cmp().  Insert new entries at their sorted position.
 *
 * The index used in .arg_scnprintf / .arg_parm is the zero-based position
 * of the syscall argument; the trailing comment on each initializer names
 * that argument.
 */
1007 static struct syscall_fmt {
1008         const char *name;  /* lookup key compared by syscall_fmt__cmp() */
1009         const char *alias; /* NOTE(review): presumably an alternative name for the same syscall; its use is outside this chunk */
1010         size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg); /* per-argument formatter, NULL where none is set */
1011         void       *arg_parm[6]; /* per-argument parameter, e.g. a strarray for SCA_STRARRAY */
1012         bool       errmsg;  /* NOTE(review): consumer not visible here; by name, decode the return value as an error -- confirm */
1013         bool       timeout; /* NOTE(review): consumer not visible here; by name, a zero return means timeout -- confirm */
1014         bool       hexret;  /* NOTE(review): consumer not visible here; by name, print the return value in hex -- confirm */
1015 } syscall_fmts[] = {
1016         { .name     = "access",     .errmsg = true,
1017           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */
1018                              [1] = SCA_ACCMODE,  /* mode */ }, },
1019         { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
1020         { .name     = "bpf",        .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
1021         { .name     = "brk",        .hexret = true,
1022           .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
1023         { .name     = "chdir",      .errmsg = true,
1024           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1025         { .name     = "chmod",      .errmsg = true,
1026           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1027         { .name     = "chroot",     .errmsg = true,
1028           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1029         { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
1030         { .name     = "close",      .errmsg = true,
1031           .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
1032         { .name     = "connect",    .errmsg = true, },
1033         { .name     = "creat",      .errmsg = true,
1034           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1035         { .name     = "dup",        .errmsg = true,
1036           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1037         { .name     = "dup2",       .errmsg = true,
1038           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1039         { .name     = "dup3",       .errmsg = true,
1040           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1041         { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
1042         { .name     = "eventfd2",   .errmsg = true,
1043           .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
1044         { .name     = "faccessat",  .errmsg = true,
1045           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1046                              [1] = SCA_FILENAME, /* filename */ }, },
1047         { .name     = "fadvise64",  .errmsg = true,
1048           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1049         { .name     = "fallocate",  .errmsg = true,
1050           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1051         { .name     = "fchdir",     .errmsg = true,
1052           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1053         { .name     = "fchmod",     .errmsg = true,
1054           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1055         { .name     = "fchmodat",   .errmsg = true,
1056           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1057                              [1] = SCA_FILENAME, /* filename */ }, },
1058         { .name     = "fchown",     .errmsg = true,
1059           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1060         { .name     = "fchownat",   .errmsg = true,
1061           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1062                              [1] = SCA_FILENAME, /* filename */ }, },
1063         { .name     = "fcntl",      .errmsg = true,
1064           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1065                              [1] = SCA_STRARRAY, /* cmd */ },
1066           .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
1067         { .name     = "fdatasync",  .errmsg = true,
1068           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1069         { .name     = "flock",      .errmsg = true,
1070           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1071                              [1] = SCA_FLOCK, /* cmd */ }, },
1072         { .name     = "fsetxattr",  .errmsg = true,
1073           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1074         { .name     = "fstat",      .errmsg = true, .alias = "newfstat",
1075           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1076         { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat",
1077           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1078                              [1] = SCA_FILENAME, /* filename */ }, },
1079         { .name     = "fstatfs",    .errmsg = true,
1080           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1081         { .name     = "fsync",    .errmsg = true,
1082           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1083         { .name     = "ftruncate", .errmsg = true,
1084           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1085         { .name     = "futex",      .errmsg = true,
1086           .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
1087         { .name     = "futimesat", .errmsg = true,
1088           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1089                              [1] = SCA_FILENAME, /* filename */ }, },
1090         { .name     = "getdents",   .errmsg = true,
1091           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1092         { .name     = "getdents64", .errmsg = true,
1093           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1094         { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1095         { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1096         { .name     = "getxattr",    .errmsg = true,
1097           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1098         { .name     = "inotify_add_watch",          .errmsg = true,
1099           .arg_scnprintf = { [1] = SCA_FILENAME, /* pathname */ }, },
1100         { .name     = "ioctl",      .errmsg = true,
1101           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1102 #if defined(__i386__) || defined(__x86_64__)
1103 /*
1104  * FIXME: Make this available to all arches.
1105  */
1106                              [1] = SCA_STRHEXARRAY, /* cmd */
1107                              [2] = SCA_HEX, /* arg */ },
1108           .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
1109 #else
1110                              [2] = SCA_HEX, /* arg */ }, },
1111 #endif
1112         { .name     = "keyctl",     .errmsg = true, STRARRAY(0, option, keyctl_options), },
1113         { .name     = "kill",       .errmsg = true,
1114           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1115         { .name     = "lchown",    .errmsg = true,
1116           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1117         { .name     = "lgetxattr",  .errmsg = true,
1118           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1119         { .name     = "linkat",     .errmsg = true,
1120           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1121         { .name     = "listxattr",  .errmsg = true,
1122           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1123         { .name     = "llistxattr", .errmsg = true,
1124           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1125         { .name     = "lremovexattr",  .errmsg = true,
1126           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1127         { .name     = "lseek",      .errmsg = true,
1128           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1129                              [2] = SCA_STRARRAY, /* whence */ },
1130           .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
1131         { .name     = "lsetxattr",  .errmsg = true,
1132           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1133         { .name     = "lstat",      .errmsg = true, .alias = "newlstat",
1134           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1135         { .name     = "lsxattr",    .errmsg = true,
1136           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1137         { .name     = "madvise",    .errmsg = true,
1138           .arg_scnprintf = { [0] = SCA_HEX,      /* start */
1139                              [2] = SCA_MADV_BHV, /* behavior */ }, },
1140         { .name     = "mkdir",    .errmsg = true,
1141           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1142         { .name     = "mkdirat",    .errmsg = true,
1143           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1144                              [1] = SCA_FILENAME, /* pathname */ }, },
1145         { .name     = "mknod",      .errmsg = true,
1146           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1147         { .name     = "mknodat",    .errmsg = true,
1148           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1149                              [1] = SCA_FILENAME, /* filename */ }, },
1150         { .name     = "mlock",      .errmsg = true,
1151           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1152         { .name     = "mlockall",   .errmsg = true,
1153           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1154         { .name     = "mmap",       .hexret = true,
1155           .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
1156                              [2] = SCA_MMAP_PROT, /* prot */
1157                              [3] = SCA_MMAP_FLAGS, /* flags */
1158                              [4] = SCA_FD,        /* fd */ }, },
1159         { .name     = "mprotect",   .errmsg = true,
1160           .arg_scnprintf = { [0] = SCA_HEX, /* start */
1161                              [2] = SCA_MMAP_PROT, /* prot */ }, },
1162         { .name     = "mq_unlink", .errmsg = true,
1163           .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
1164         { .name     = "mremap",     .hexret = true,
1165           .arg_scnprintf = { [0] = SCA_HEX, /* addr */
1166                              [3] = SCA_MREMAP_FLAGS, /* flags */
1167                              [4] = SCA_HEX, /* new_addr */ }, },
1168         { .name     = "munlock",    .errmsg = true,
1169           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1170         { .name     = "munmap",     .errmsg = true,
1171           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1172         { .name     = "name_to_handle_at", .errmsg = true,
1173           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1174         { .name     = "newfstatat", .errmsg = true,
1175           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1176                              [1] = SCA_FILENAME, /* filename */ }, },
1177         { .name     = "open",       .errmsg = true,
1178           .arg_scnprintf = { [0] = SCA_FILENAME,   /* filename */
1179                              [1] = SCA_OPEN_FLAGS, /* flags */ }, },
1180         { .name     = "open_by_handle_at", .errmsg = true,
1181           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1182                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1183         { .name     = "openat",     .errmsg = true,
1184           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1185                              [1] = SCA_FILENAME, /* filename */
1186                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1187         { .name     = "perf_event_open", .errmsg = true,
1188           .arg_scnprintf = { [1] = SCA_INT, /* pid */
1189                              [2] = SCA_INT, /* cpu */
1190                              [3] = SCA_FD,  /* group_fd */
1191                              [4] = SCA_PERF_FLAGS,  /* flags */ }, },
1192         { .name     = "pipe2",      .errmsg = true,
1193           .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
1194         { .name     = "poll",       .errmsg = true, .timeout = true, },
1195         { .name     = "ppoll",      .errmsg = true, .timeout = true, },
1196         { .name     = "pread",      .errmsg = true, .alias = "pread64",
1197           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1198         { .name     = "preadv",     .errmsg = true, .alias = "pread",
1199           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1200         { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
1201         { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64",
1202           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1203         { .name     = "pwritev",    .errmsg = true,
1204           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1205         { .name     = "read",       .errmsg = true,
1206           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1207         { .name     = "readlink",   .errmsg = true,
1208           .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
1209         { .name     = "readlinkat", .errmsg = true,
1210           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1211                              [1] = SCA_FILENAME, /* pathname */ }, },
1212         { .name     = "readv",      .errmsg = true,
1213           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1214         { .name     = "recvfrom",   .errmsg = true,
1215           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1216                              [3] = SCA_MSG_FLAGS, /* flags */ }, },
1217         { .name     = "recvmmsg",   .errmsg = true,
1218           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1219                              [3] = SCA_MSG_FLAGS, /* flags */ }, },
1220         { .name     = "recvmsg",    .errmsg = true,
1221           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1222                              [2] = SCA_MSG_FLAGS, /* flags */ }, },
1223         { .name     = "removexattr", .errmsg = true,
1224           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1225         { .name     = "renameat",   .errmsg = true,
1226           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1227         { .name     = "rmdir",    .errmsg = true,
1228           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1229         { .name     = "rt_sigaction", .errmsg = true,
1230           .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
1231         { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
1232         { .name     = "rt_sigqueueinfo", .errmsg = true,
1233           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1234         { .name     = "rt_tgsigqueueinfo", .errmsg = true,
1235           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1236         { .name     = "select",     .errmsg = true, .timeout = true, },
1237         { .name     = "sendmmsg",    .errmsg = true,
1238           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1239                              [3] = SCA_MSG_FLAGS, /* flags */ }, },
1240         { .name     = "sendmsg",    .errmsg = true,
1241           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1242                              [2] = SCA_MSG_FLAGS, /* flags */ }, },
1243         { .name     = "sendto",     .errmsg = true,
1244           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1245                              [3] = SCA_MSG_FLAGS, /* flags */ }, },
1246         { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1247         { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1248         { .name     = "setxattr",   .errmsg = true,
1249           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1250         { .name     = "shutdown",   .errmsg = true,
1251           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1252         { .name     = "socket",     .errmsg = true,
1253           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1254                              [1] = SCA_SK_TYPE, /* type */ },
1255           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
1256         { .name     = "socketpair", .errmsg = true,
1257           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1258                              [1] = SCA_SK_TYPE, /* type */ },
1259           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
1260         { .name     = "stat",       .errmsg = true, .alias = "newstat",
1261           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1262         { .name     = "statfs",     .errmsg = true,
1263           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1264         { .name     = "swapoff",    .errmsg = true,
1265           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
1266         { .name     = "swapon",     .errmsg = true,
1267           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
1268         { .name     = "symlinkat",  .errmsg = true,
1269           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1270         { .name     = "tgkill",     .errmsg = true,
1271           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1272         { .name     = "tkill",      .errmsg = true,
1273           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1274         { .name     = "truncate",   .errmsg = true,
1275           .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
1276         { .name     = "uname",      .errmsg = true, .alias = "newuname", },
1277         { .name     = "unlinkat",   .errmsg = true,
1278           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1279                              [1] = SCA_FILENAME, /* pathname */ }, },
1280         { .name     = "utime",  .errmsg = true,
1281           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1282         { .name     = "utimensat",  .errmsg = true,
1283           .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */
1284                              [1] = SCA_FILENAME, /* filename */ }, },
1285         { .name     = "utimes",  .errmsg = true,
1286           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1287         { .name     = "vmsplice",  .errmsg = true,
1288           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1289         { .name     = "write",      .errmsg = true,
1290           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1291         { .name     = "writev",     .errmsg = true,
1292           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1293 };
1294
1295 static int syscall_fmt__cmp(const void *name, const void *fmtp)
1296 {
1297         const struct syscall_fmt *fmt = fmtp;
1298         return strcmp(name, fmt->name);
1299 }
1300
1301 static struct syscall_fmt *syscall_fmt__find(const char *name)
1302 {
1303         const int nmemb = ARRAY_SIZE(syscall_fmts);
1304         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1305 }
1306
/*
 * Description of one syscall: where its arguments come from and how each
 * one is formatted.
 */
1307 struct syscall {
1308         struct event_format *tp_format; /* NOTE(review): presumably the tracepoint format the args were taken from -- set outside this chunk */
1309         int                 nr_args;    /* number of slots allocated for arg_scnprintf by syscall__set_arg_fmts() */
1310         struct format_field *args;      /* linked list (via ->next) of the argument fields */
1311         const char          *name;
1312         bool                is_exit;    /* NOTE(review): by name, marks exit-like syscalls; its use is outside this chunk */
1313         struct syscall_fmt  *fmt;       /* optional formatting hints; tested for NULL before use */
1314         size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg); /* calloc()'ed array of per-argument formatters */
1315         void                **arg_parm; /* set to fmt->arg_parm when fmt is non-NULL */
1316 };
1317
1318 static size_t fprintf_duration(unsigned long t, FILE *fp)
1319 {
1320         double duration = (double)t / NSEC_PER_MSEC;
1321         size_t printed = fprintf(fp, "(");
1322
1323         if (duration >= 1.0)
1324                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1325         else if (duration >= 0.01)
1326                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1327         else
1328                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1329         return printed + fprintf(fp, "): ");
1330 }
1331
/*
 * Per-thread tracing state, stored as the thread's private data
 * (see thread__trace(), which creates it on first use).
 */
1332 /**
1333  * filename.ptr: The filename char pointer that will be vfs_getname'd
1334  * filename.entry_str_pos: Where to insert the string translated from
1335  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
1336  */
1337 struct thread_trace {
1338         u64               entry_time;
1339         u64               exit_time;
1340         bool              entry_pending;
1341         unsigned long     nr_events;  /* bumped on every thread__trace() call for this thread */
1342         unsigned long     pfmaj, pfmin;
1343         char              *entry_str; /* buffer that filename.entry_str_pos is an offset into */
1344         double            runtime_ms;
1345         struct {
1346                 unsigned long ptr;
1347                 short int     entry_str_pos;
1348                 bool          pending_open;
1349                 unsigned int  namelen;
1350                 char          *name;
1351         } filename;
1352         struct {
1353                 int       max;     /* highest valid index in table, -1 while the table is empty */
1354                 char      **table; /* fd -> strdup()'ed pathname, NULL when not known */
1355         } paths;
1356
1357         struct intlist *syscall_stats; /* created by thread_trace__new() */
1358 };
1359
1360 static struct thread_trace *thread_trace__new(void)
1361 {
1362         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1363
1364         if (ttrace)
1365                 ttrace->paths.max = -1;
1366
1367         ttrace->syscall_stats = intlist__new(NULL);
1368
1369         return ttrace;
1370 }
1371
1372 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1373 {
1374         struct thread_trace *ttrace;
1375
1376         if (thread == NULL)
1377                 goto fail;
1378
1379         if (thread__priv(thread) == NULL)
1380                 thread__set_priv(thread, thread_trace__new());
1381
1382         if (thread__priv(thread) == NULL)
1383                 goto fail;
1384
1385         ttrace = thread__priv(thread);
1386         ++ttrace->nr_events;
1387
1388         return ttrace;
1389 fail:
1390         color_fprintf(fp, PERF_COLOR_RED,
1391                       "WARNING: not enough memory, dropping samples!\n");
1392         return NULL;
1393 }
1394
/*
 * Distinct bit flags for major and minor page faults.
 * NOTE(review): presumably OR'ed together in trace->trace_pgfaults --
 * the users are outside this chunk, confirm there.
 */
1395 #define TRACE_PFMAJ             (1 << 0)
1396 #define TRACE_PFMIN             (1 << 1)
1397
/*
 * NOTE(review): by its name, the size in bytes of thread_trace::entry_str;
 * the allocation itself is not visible in this chunk.
 */
1398 static const size_t trace__entry_str_size = 2048;
1399
/*
 * Global state of one 'perf trace' session.
 */
1400 struct trace {
1401         struct perf_tool        tool; /* embedded; trace__tool_process() gets back to the trace via container_of() */
1402         struct {
1403                 int             machine;
1404                 int             open_id;
1405         }                       audit;
1406         struct {
1407                 int             max;
1408                 struct syscall  *table;
1409                 struct {
1410                         struct perf_evsel *sys_enter,
1411                                           *sys_exit;
1412                 }               events;
1413         } syscalls;
1414         struct record_opts      opts; /* .target and .proc_map_timeout are used when synthesizing threads */
1415         struct perf_evlist      *evlist;
1416         struct machine          *host; /* created by machine__new_host() in trace__symbols_init() */
1417         struct thread           *current;
1418         u64                     base_time; /* subtracted from timestamps before they are printed */
1419         FILE                    *output; /* where "LOST events" warnings are written */
1420         unsigned long           nr_events;
1421         struct strlist          *ev_qualifier;
1422         struct {
1423                 size_t          nr;
1424                 int             *entries;
1425         }                       ev_qualifier_ids;
1426         struct intlist          *tid_list;
1427         struct intlist          *pid_list;
1428         struct {
1429                 size_t          nr;
1430                 pid_t           *entries;
1431         }                       filter_pids;
1432         double                  duration_filter; /* in msec: scaled by NSEC_PER_MSEC in trace__filter_duration() */
1433         double                  runtime_ms;
1434         struct {
1435                 u64             vfs_getname,
1436                                 proc_getname; /* bumped each time thread__fd_path() falls back to reading /proc */
1437         } stats;
1438         bool                    not_ev_qualifier;
1439         bool                    live; /* when false, thread__fd_path() never reads /proc for an unknown fd */
1440         bool                    full_time;
1441         bool                    sched;
1442         bool                    multiple_threads; /* prefix each entry with the tid (and comm, see show_comm) */
1443         bool                    summary;
1444         bool                    summary_only;
1445         bool                    show_comm; /* with multiple_threads: also print the thread's comm */
1446         bool                    show_tool_stats;
1447         bool                    trace_syscalls;
1448         bool                    force;
1449         bool                    vfs_getname; /* when false, filename arguments are printed as raw pointers */
1450         int                     trace_pgfaults;
1451 };
1452
1453 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1454 {
1455         struct thread_trace *ttrace = thread__priv(thread);
1456
1457         if (fd > ttrace->paths.max) {
1458                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
1459
1460                 if (npath == NULL)
1461                         return -1;
1462
1463                 if (ttrace->paths.max != -1) {
1464                         memset(npath + ttrace->paths.max + 1, 0,
1465                                (fd - ttrace->paths.max) * sizeof(char *));
1466                 } else {
1467                         memset(npath, 0, (fd + 1) * sizeof(char *));
1468                 }
1469
1470                 ttrace->paths.table = npath;
1471                 ttrace->paths.max   = fd;
1472         }
1473
1474         ttrace->paths.table[fd] = strdup(pathname);
1475
1476         return ttrace->paths.table[fd] != NULL ? 0 : -1;
1477 }
1478
1479 static int thread__read_fd_path(struct thread *thread, int fd)
1480 {
1481         char linkname[PATH_MAX], pathname[PATH_MAX];
1482         struct stat st;
1483         int ret;
1484
1485         if (thread->pid_ == thread->tid) {
1486                 scnprintf(linkname, sizeof(linkname),
1487                           "/proc/%d/fd/%d", thread->pid_, fd);
1488         } else {
1489                 scnprintf(linkname, sizeof(linkname),
1490                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1491         }
1492
1493         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1494                 return -1;
1495
1496         ret = readlink(linkname, pathname, sizeof(pathname));
1497
1498         if (ret < 0 || ret > st.st_size)
1499                 return -1;
1500
1501         pathname[ret] = '\0';
1502         return trace__set_fd_pathname(thread, fd, pathname);
1503 }
1504
1505 static const char *thread__fd_path(struct thread *thread, int fd,
1506                                    struct trace *trace)
1507 {
1508         struct thread_trace *ttrace = thread__priv(thread);
1509
1510         if (ttrace == NULL)
1511                 return NULL;
1512
1513         if (fd < 0)
1514                 return NULL;
1515
1516         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1517                 if (!trace->live)
1518                         return NULL;
1519                 ++trace->stats.proc_getname;
1520                 if (thread__read_fd_path(thread, fd))
1521                         return NULL;
1522         }
1523
1524         return ttrace->paths.table[fd];
1525 }
1526
1527 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1528                                         struct syscall_arg *arg)
1529 {
1530         int fd = arg->val;
1531         size_t printed = scnprintf(bf, size, "%d", fd);
1532         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1533
1534         if (path)
1535                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1536
1537         return printed;
1538 }
1539
1540 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1541                                               struct syscall_arg *arg)
1542 {
1543         int fd = arg->val;
1544         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1545         struct thread_trace *ttrace = thread__priv(arg->thread);
1546
1547         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1548                 zfree(&ttrace->paths.table[fd]);
1549
1550         return printed;
1551 }
1552
1553 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1554                                      unsigned long ptr)
1555 {
1556         struct thread_trace *ttrace = thread__priv(thread);
1557
1558         ttrace->filename.ptr = ptr;
1559         ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1560 }
1561
1562 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1563                                               struct syscall_arg *arg)
1564 {
1565         unsigned long ptr = arg->val;
1566
1567         if (!arg->trace->vfs_getname)
1568                 return scnprintf(bf, size, "%#x", ptr);
1569
1570         thread__set_filename_pos(arg->thread, bf, ptr);
1571         return 0;
1572 }
1573
1574 static bool trace__filter_duration(struct trace *trace, double t)
1575 {
1576         return t < (trace->duration_filter * NSEC_PER_MSEC);
1577 }
1578
1579 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1580 {
1581         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1582
1583         return fprintf(fp, "%10.3f ", ts);
1584 }
1585
/* Flags written by sig_handler() below. */
1586 static bool done = false;
1587 static bool interrupted = false;
1588
/*
 * Signal handler: flags that a signal arrived and records whether that
 * signal was SIGINT.  'interrupted' is rewritten on every delivery, so
 * it reflects the most recent signal only.
 */
1589 static void sig_handler(int sig)
1590 {
1591         done = true;
1592         interrupted = sig == SIGINT;
1593 }
1594
1595 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1596                                         u64 duration, u64 tstamp, FILE *fp)
1597 {
1598         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1599         printed += fprintf_duration(duration, fp);
1600
1601         if (trace->multiple_threads) {
1602                 if (trace->show_comm)
1603                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1604                 printed += fprintf(fp, "%d ", thread->tid);
1605         }
1606
1607         return printed;
1608 }
1609
1610 static int trace__process_event(struct trace *trace, struct machine *machine,
1611                                 union perf_event *event, struct perf_sample *sample)
1612 {
1613         int ret = 0;
1614
1615         switch (event->header.type) {
1616         case PERF_RECORD_LOST:
1617                 color_fprintf(trace->output, PERF_COLOR_RED,
1618                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1619                 ret = machine__process_lost_event(machine, event, sample);
1620         default:
1621                 ret = machine__process_event(machine, event, sample);
1622                 break;
1623         }
1624
1625         return ret;
1626 }
1627
1628 static int trace__tool_process(struct perf_tool *tool,
1629                                union perf_event *event,
1630                                struct perf_sample *sample,
1631                                struct machine *machine)
1632 {
1633         struct trace *trace = container_of(tool, struct trace, tool);
1634         return trace__process_event(trace, machine, event, sample);
1635 }
1636
1637 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1638 {
1639         int err = symbol__init(NULL);
1640
1641         if (err)
1642                 return err;
1643
1644         trace->host = machine__new_host();
1645         if (trace->host == NULL)
1646                 return -ENOMEM;
1647
1648         if (trace_event__register_resolver(trace->host, machine__resolve_kernel_addr) < 0)
1649                 return -errno;
1650
1651         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1652                                             evlist->threads, trace__tool_process, false,
1653                                             trace->opts.proc_map_timeout);
1654         if (err)
1655                 symbol__exit();
1656
1657         return err;
1658 }
1659
1660 static int syscall__set_arg_fmts(struct syscall *sc)
1661 {
1662         struct format_field *field;
1663         int idx = 0;
1664
1665         sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1666         if (sc->arg_scnprintf == NULL)
1667                 return -1;
1668
1669         if (sc->fmt)
1670                 sc->arg_parm = sc->fmt->arg_parm;
1671
1672         for (field = sc->args; field; field = field->next) {
1673                 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1674                         sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1675                 else if (field->flags & FIELD_IS_POINTER)
1676                         sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1677                 ++idx;
1678         }
1679
1680         return 0;
1681 }
1682
1683 static int trace__read_syscall_info(struct trace *trace, int id)
1684 {
1685         char tp_name[128];
1686         struct syscall *sc;
1687         const char *name = audit_syscall_to_name(id, trace->audit.machine);
1688
1689         if (name == NULL)
1690                 return -1;
1691
1692         if (id > trace->syscalls.max) {
1693                 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1694
1695                 if (nsyscalls == NULL)
1696                         return -1;
1697
1698                 if (trace->syscalls.max != -1) {
1699                         memset(nsyscalls + trace->syscalls.max + 1, 0,
1700                                (id - trace->syscalls.max) * sizeof(*sc));
1701                 } else {
1702                         memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1703                 }
1704
1705                 trace->syscalls.table = nsyscalls;
1706                 trace->syscalls.max   = id;
1707         }
1708
1709         sc = trace->syscalls.table + id;
1710         sc->name = name;
1711
1712         sc->fmt  = syscall_fmt__find(sc->name);
1713
1714         snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1715         sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1716
1717         if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1718                 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1719                 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1720         }
1721
1722         if (IS_ERR(sc->tp_format))
1723                 return -1;
1724
1725         sc->args = sc->tp_format->format.fields;
1726         sc->nr_args = sc->tp_format->format.nr_fields;
1727         /* drop nr field - not relevant here; does not exist on older kernels */
1728         if (sc->args && strcmp(sc->args->name, "nr") == 0) {
1729                 sc->args = sc->args->next;
1730                 --sc->nr_args;
1731         }
1732
1733         sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1734
1735         return syscall__set_arg_fmts(sc);
1736 }
1737
1738 static int trace__validate_ev_qualifier(struct trace *trace)
1739 {
1740         int err = 0, i;
1741         struct str_node *pos;
1742
1743         trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1744         trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1745                                                  sizeof(trace->ev_qualifier_ids.entries[0]));
1746
1747         if (trace->ev_qualifier_ids.entries == NULL) {
1748                 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1749                        trace->output);
1750                 err = -EINVAL;
1751                 goto out;
1752         }
1753
1754         i = 0;
1755
1756         strlist__for_each(pos, trace->ev_qualifier) {
1757                 const char *sc = pos->s;
1758                 int id = audit_name_to_syscall(sc, trace->audit.machine);
1759
1760                 if (id < 0) {
1761                         if (err == 0) {
1762                                 fputs("Error:\tInvalid syscall ", trace->output);
1763                                 err = -EINVAL;
1764                         } else {
1765                                 fputs(", ", trace->output);
1766                         }
1767
1768                         fputs(sc, trace->output);
1769                 }
1770
1771                 trace->ev_qualifier_ids.entries[i++] = id;
1772         }
1773
1774         if (err < 0) {
1775                 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1776                       "\nHint:\tand: 'man syscalls'\n", trace->output);
1777                 zfree(&trace->ev_qualifier_ids.entries);
1778                 trace->ev_qualifier_ids.nr = 0;
1779         }
1780 out:
1781         return err;
1782 }
1783
1784 /*
1785  * args is to be interpreted as a series of longs but we need to handle
1786  * 8-byte unaligned accesses. args points to raw_data within the event
1787  * and raw_data is guaranteed to be 8-byte unaligned because it is
1788  * preceded by raw_size which is a u32. So we need to copy args to a temp
1789  * variable to read it. Most notably this avoids extended load instructions
1790  * on unaligned addresses
1791  */
1792
1793 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1794                                       unsigned char *args, struct trace *trace,
1795                                       struct thread *thread)
1796 {
1797         size_t printed = 0;
1798         unsigned char *p;
1799         unsigned long val;
1800
1801         if (sc->args != NULL) {
1802                 struct format_field *field;
1803                 u8 bit = 1;
1804                 struct syscall_arg arg = {
1805                         .idx    = 0,
1806                         .mask   = 0,
1807                         .trace  = trace,
1808                         .thread = thread,
1809                 };
1810
1811                 for (field = sc->args; field;
1812                      field = field->next, ++arg.idx, bit <<= 1) {
1813                         if (arg.mask & bit)
1814                                 continue;
1815
1816                         /* special care for unaligned accesses */
1817                         p = args + sizeof(unsigned long) * arg.idx;
1818                         memcpy(&val, p, sizeof(val));
1819
1820                         /*
1821                          * Suppress this argument if its value is zero and
1822                          * and we don't have a string associated in an
1823                          * strarray for it.
1824                          */
1825                         if (val == 0 &&
1826                             !(sc->arg_scnprintf &&
1827                               sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1828                               sc->arg_parm[arg.idx]))
1829                                 continue;
1830
1831                         printed += scnprintf(bf + printed, size - printed,
1832                                              "%s%s: ", printed ? ", " : "", field->name);
1833                         if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1834                                 arg.val = val;
1835                                 if (sc->arg_parm)
1836                                         arg.parm = sc->arg_parm[arg.idx];
1837                                 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1838                                                                       size - printed, &arg);
1839                         } else {
1840                                 printed += scnprintf(bf + printed, size - printed,
1841                                                      "%ld", val);
1842                         }
1843                 }
1844         } else {
1845                 int i = 0;
1846
1847                 while (i < 6) {
1848                         /* special care for unaligned accesses */
1849                         p = args + sizeof(unsigned long) * i;
1850                         memcpy(&val, p, sizeof(val));
1851                         printed += scnprintf(bf + printed, size - printed,
1852                                              "%sarg%d: %ld",
1853                                              printed ? ", " : "", i, val);
1854                         ++i;
1855                 }
1856         }
1857
1858         return printed;
1859 }
1860
/*
 * Signature shared by the per-event handlers in this file; a pointer of
 * this type is read back from evsel->handler and invoked for each sample.
 */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1864
1865 static struct syscall *trace__syscall_info(struct trace *trace,
1866                                            struct perf_evsel *evsel, int id)
1867 {
1868
1869         if (id < 0) {
1870
1871                 /*
1872                  * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1873                  * before that, leaving at a higher verbosity level till that is
1874                  * explained. Reproduced with plain ftrace with:
1875                  *
1876                  * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1877                  * grep "NR -1 " /t/trace_pipe
1878                  *
1879                  * After generating some load on the machine.
1880                  */
1881                 if (verbose > 1) {
1882                         static u64 n;
1883                         fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1884                                 id, perf_evsel__name(evsel), ++n);
1885                 }
1886                 return NULL;
1887         }
1888
1889         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1890             trace__read_syscall_info(trace, id))
1891                 goto out_cant_read;
1892
1893         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1894                 goto out_cant_read;
1895
1896         return &trace->syscalls.table[id];
1897
1898 out_cant_read:
1899         if (verbose) {
1900                 fprintf(trace->output, "Problems reading syscall %d", id);
1901                 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1902                         fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1903                 fputs(" information\n", trace->output);
1904         }
1905         return NULL;
1906 }
1907
1908 static void thread__update_stats(struct thread_trace *ttrace,
1909                                  int id, struct perf_sample *sample)
1910 {
1911         struct int_node *inode;
1912         struct stats *stats;
1913         u64 duration = 0;
1914
1915         inode = intlist__findnew(ttrace->syscall_stats, id);
1916         if (inode == NULL)
1917                 return;
1918
1919         stats = inode->priv;
1920         if (stats == NULL) {
1921                 stats = malloc(sizeof(struct stats));
1922                 if (stats == NULL)
1923                         return;
1924                 init_stats(stats);
1925                 inode->priv = stats;
1926         }
1927
1928         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1929                 duration = sample->time - ttrace->entry_time;
1930
1931         update_stats(stats, duration);
1932 }
1933
1934 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1935 {
1936         struct thread_trace *ttrace;
1937         u64 duration;
1938         size_t printed;
1939
1940         if (trace->current == NULL)
1941                 return 0;
1942
1943         ttrace = thread__priv(trace->current);
1944
1945         if (!ttrace->entry_pending)
1946                 return 0;
1947
1948         duration = sample->time - ttrace->entry_time;
1949
1950         printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1951         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1952         ttrace->entry_pending = false;
1953
1954         return printed;
1955 }
1956
1957 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1958                             union perf_event *event __maybe_unused,
1959                             struct perf_sample *sample)
1960 {
1961         char *msg;
1962         void *args;
1963         size_t printed = 0;
1964         struct thread *thread;
1965         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1966         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1967         struct thread_trace *ttrace;
1968
1969         if (sc == NULL)
1970                 return -1;
1971
1972         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1973         ttrace = thread__trace(thread, trace->output);
1974         if (ttrace == NULL)
1975                 goto out_put;
1976
1977         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1978
1979         if (ttrace->entry_str == NULL) {
1980                 ttrace->entry_str = malloc(trace__entry_str_size);
1981                 if (!ttrace->entry_str)
1982                         goto out_put;
1983         }
1984
1985         if (!trace->summary_only)
1986                 trace__printf_interrupted_entry(trace, sample);
1987
1988         ttrace->entry_time = sample->time;
1989         msg = ttrace->entry_str;
1990         printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1991
1992         printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1993                                            args, trace, thread);
1994
1995         if (sc->is_exit) {
1996                 if (!trace->duration_filter && !trace->summary_only) {
1997                         trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1998                         fprintf(trace->output, "%-70s\n", ttrace->entry_str);
1999                 }
2000         } else {
2001                 ttrace->entry_pending = true;
2002                 /* See trace__vfs_getname & trace__sys_exit */
2003                 ttrace->filename.pending_open = false;
2004         }
2005
2006         if (trace->current != thread) {
2007                 thread__put(trace->current);
2008                 trace->current = thread__get(thread);
2009         }
2010         err = 0;
2011 out_put:
2012         thread__put(thread);
2013         return err;
2014 }
2015
2016 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
2017                            union perf_event *event __maybe_unused,
2018                            struct perf_sample *sample)
2019 {
2020         long ret;
2021         u64 duration = 0;
2022         struct thread *thread;
2023         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
2024         struct syscall *sc = trace__syscall_info(trace, evsel, id);
2025         struct thread_trace *ttrace;
2026
2027         if (sc == NULL)
2028                 return -1;
2029
2030         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2031         ttrace = thread__trace(thread, trace->output);
2032         if (ttrace == NULL)
2033                 goto out_put;
2034
2035         if (trace->summary)
2036                 thread__update_stats(ttrace, id, sample);
2037
2038         ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
2039
2040         if (id == trace->audit.open_id && ret >= 0 && ttrace->filename.pending_open) {
2041                 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
2042                 ttrace->filename.pending_open = false;
2043                 ++trace->stats.vfs_getname;
2044         }
2045
2046         ttrace->exit_time = sample->time;
2047
2048         if (ttrace->entry_time) {
2049                 duration = sample->time - ttrace->entry_time;
2050                 if (trace__filter_duration(trace, duration))
2051                         goto out;
2052         } else if (trace->duration_filter)
2053                 goto out;
2054
2055         if (trace->summary_only)
2056                 goto out;
2057
2058         trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
2059
2060         if (ttrace->entry_pending) {
2061                 fprintf(trace->output, "%-70s", ttrace->entry_str);
2062         } else {
2063                 fprintf(trace->output, " ... [");
2064                 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
2065                 fprintf(trace->output, "]: %s()", sc->name);
2066         }
2067
2068         if (sc->fmt == NULL) {
2069 signed_print:
2070                 fprintf(trace->output, ") = %ld", ret);
2071         } else if (ret < 0 && sc->fmt->errmsg) {
2072                 char bf[STRERR_BUFSIZE];
2073                 const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
2074                            *e = audit_errno_to_name(-ret);
2075
2076                 fprintf(trace->output, ") = -1 %s %s", e, emsg);
2077         } else if (ret == 0 && sc->fmt->timeout)
2078                 fprintf(trace->output, ") = 0 Timeout");
2079         else if (sc->fmt->hexret)
2080                 fprintf(trace->output, ") = %#lx", ret);
2081         else
2082                 goto signed_print;
2083
2084         fputc('\n', trace->output);
2085 out:
2086         ttrace->entry_pending = false;
2087         err = 0;
2088 out_put:
2089         thread__put(thread);
2090         return err;
2091 }
2092
2093 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
2094                               union perf_event *event __maybe_unused,
2095                               struct perf_sample *sample)
2096 {
2097         struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2098         struct thread_trace *ttrace;
2099         size_t filename_len, entry_str_len, to_move;
2100         ssize_t remaining_space;
2101         char *pos;
2102         const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
2103
2104         if (!thread)
2105                 goto out;
2106
2107         ttrace = thread__priv(thread);
2108         if (!ttrace)
2109                 goto out;
2110
2111         filename_len = strlen(filename);
2112
2113         if (ttrace->filename.namelen < filename_len) {
2114                 char *f = realloc(ttrace->filename.name, filename_len + 1);
2115
2116                 if (f == NULL)
2117                                 goto out;
2118
2119                 ttrace->filename.namelen = filename_len;
2120                 ttrace->filename.name = f;
2121         }
2122
2123         strcpy(ttrace->filename.name, filename);
2124         ttrace->filename.pending_open = true;
2125
2126         if (!ttrace->filename.ptr)
2127                 goto out;
2128
2129         entry_str_len = strlen(ttrace->entry_str);
2130         remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
2131         if (remaining_space <= 0)
2132                 goto out;
2133
2134         if (filename_len > (size_t)remaining_space) {
2135                 filename += filename_len - remaining_space;
2136                 filename_len = remaining_space;
2137         }
2138
2139         to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
2140         pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
2141         memmove(pos + filename_len, pos, to_move);
2142         memcpy(pos, filename, filename_len);
2143
2144         ttrace->filename.ptr = 0;
2145         ttrace->filename.entry_str_pos = 0;
2146 out:
2147         return 0;
2148 }
2149
2150 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
2151                                      union perf_event *event __maybe_unused,
2152                                      struct perf_sample *sample)
2153 {
2154         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
2155         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
2156         struct thread *thread = machine__findnew_thread(trace->host,
2157                                                         sample->pid,
2158                                                         sample->tid);
2159         struct thread_trace *ttrace = thread__trace(thread, trace->output);
2160
2161         if (ttrace == NULL)
2162                 goto out_dump;
2163
2164         ttrace->runtime_ms += runtime_ms;
2165         trace->runtime_ms += runtime_ms;
2166         thread__put(thread);
2167         return 0;
2168
2169 out_dump:
2170         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
2171                evsel->name,
2172                perf_evsel__strval(evsel, sample, "comm"),
2173                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
2174                runtime,
2175                perf_evsel__intval(evsel, sample, "vruntime"));
2176         thread__put(thread);
2177         return 0;
2178 }
2179
2180 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
2181                                 union perf_event *event __maybe_unused,
2182                                 struct perf_sample *sample)
2183 {
2184         trace__printf_interrupted_entry(trace, sample);
2185         trace__fprintf_tstamp(trace, sample->time, trace->output);
2186
2187         if (trace->trace_syscalls)
2188                 fprintf(trace->output, "(         ): ");
2189
2190         fprintf(trace->output, "%s:", evsel->name);
2191
2192         if (evsel->tp_format) {
2193                 event_format__fprintf(evsel->tp_format, sample->cpu,
2194                                       sample->raw_data, sample->raw_size,
2195                                       trace->output);
2196         }
2197
2198         fprintf(trace->output, ")\n");
2199         return 0;
2200 }
2201
2202 static void print_location(FILE *f, struct perf_sample *sample,
2203                            struct addr_location *al,
2204                            bool print_dso, bool print_sym)
2205 {
2206
2207         if ((verbose || print_dso) && al->map)
2208                 fprintf(f, "%s@", al->map->dso->long_name);
2209
2210         if ((verbose || print_sym) && al->sym)
2211                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2212                         al->addr - al->sym->start);
2213         else if (al->map)
2214                 fprintf(f, "0x%" PRIx64, al->addr);
2215         else
2216                 fprintf(f, "0x%" PRIx64, sample->addr);
2217 }
2218
2219 static int trace__pgfault(struct trace *trace,
2220                           struct perf_evsel *evsel,
2221                           union perf_event *event,
2222                           struct perf_sample *sample)
2223 {
2224         struct thread *thread;
2225         u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
2226         struct addr_location al;
2227         char map_type = 'd';
2228         struct thread_trace *ttrace;
2229         int err = -1;
2230
2231         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2232         ttrace = thread__trace(thread, trace->output);
2233         if (ttrace == NULL)
2234                 goto out_put;
2235
2236         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2237                 ttrace->pfmaj++;
2238         else
2239                 ttrace->pfmin++;
2240
2241         if (trace->summary_only)
2242                 goto out;
2243
2244         thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
2245                               sample->ip, &al);
2246
2247         trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
2248
2249         fprintf(trace->output, "%sfault [",
2250                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2251                 "maj" : "min");
2252
2253         print_location(trace->output, sample, &al, false, true);
2254
2255         fprintf(trace->output, "] => ");
2256
2257         thread__find_addr_location(thread, cpumode, MAP__VARIABLE,
2258                                    sample->addr, &al);
2259
2260         if (!al.map) {
2261                 thread__find_addr_location(thread, cpumode,
2262                                            MAP__FUNCTION, sample->addr, &al);
2263
2264                 if (al.map)
2265                         map_type = 'x';
2266                 else
2267                         map_type = '?';
2268         }
2269
2270         print_location(trace->output, sample, &al, true, false);
2271
2272         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2273 out:
2274         err = 0;
2275 out_put:
2276         thread__put(thread);
2277         return err;
2278 }
2279
2280 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
2281 {
2282         if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
2283             (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
2284                 return false;
2285
2286         if (trace->pid_list || trace->tid_list)
2287                 return true;
2288
2289         return false;
2290 }
2291
2292 static int trace__process_sample(struct perf_tool *tool,
2293                                  union perf_event *event,
2294                                  struct perf_sample *sample,
2295                                  struct perf_evsel *evsel,
2296                                  struct machine *machine __maybe_unused)
2297 {
2298         struct trace *trace = container_of(tool, struct trace, tool);
2299         int err = 0;
2300
2301         tracepoint_handler handler = evsel->handler;
2302
2303         if (skip_sample(trace, sample))
2304                 return 0;
2305
2306         if (!trace->full_time && trace->base_time == 0)
2307                 trace->base_time = sample->time;
2308
2309         if (handler) {
2310                 ++trace->nr_events;
2311                 handler(trace, evsel, event, sample);
2312         }
2313
2314         return err;
2315 }
2316
2317 static int parse_target_str(struct trace *trace)
2318 {
2319         if (trace->opts.target.pid) {
2320                 trace->pid_list = intlist__new(trace->opts.target.pid);
2321                 if (trace->pid_list == NULL) {
2322                         pr_err("Error parsing process id string\n");
2323                         return -EINVAL;
2324                 }
2325         }
2326
2327         if (trace->opts.target.tid) {
2328                 trace->tid_list = intlist__new(trace->opts.target.tid);
2329                 if (trace->tid_list == NULL) {
2330                         pr_err("Error parsing thread id string\n");
2331                         return -EINVAL;
2332                 }
2333         }
2334
2335         return 0;
2336 }
2337
2338 static int trace__record(struct trace *trace, int argc, const char **argv)
2339 {
2340         unsigned int rec_argc, i, j;
2341         const char **rec_argv;
2342         const char * const record_args[] = {
2343                 "record",
2344                 "-R",
2345                 "-m", "1024",
2346                 "-c", "1",
2347         };
2348
2349         const char * const sc_args[] = { "-e", };
2350         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2351         const char * const majpf_args[] = { "-e", "major-faults" };
2352         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2353         const char * const minpf_args[] = { "-e", "minor-faults" };
2354         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2355
2356         /* +1 is for the event string below */
2357         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2358                 majpf_args_nr + minpf_args_nr + argc;
2359         rec_argv = calloc(rec_argc + 1, sizeof(char *));
2360
2361         if (rec_argv == NULL)
2362                 return -ENOMEM;
2363
2364         j = 0;
2365         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2366                 rec_argv[j++] = record_args[i];
2367
2368         if (trace->trace_syscalls) {
2369                 for (i = 0; i < sc_args_nr; i++)
2370                         rec_argv[j++] = sc_args[i];
2371
2372                 /* event string may be different for older kernels - e.g., RHEL6 */
2373                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2374                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2375                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2376                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2377                 else {
2378                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2379                         return -1;
2380                 }
2381         }
2382
2383         if (trace->trace_pgfaults & TRACE_PFMAJ)
2384                 for (i = 0; i < majpf_args_nr; i++)
2385                         rec_argv[j++] = majpf_args[i];
2386
2387         if (trace->trace_pgfaults & TRACE_PFMIN)
2388                 for (i = 0; i < minpf_args_nr; i++)
2389                         rec_argv[j++] = minpf_args[i];
2390
2391         for (i = 0; i < (unsigned int)argc; i++)
2392                 rec_argv[j++] = argv[i];
2393
2394         return cmd_record(j, rec_argv, NULL);
2395 }
2396
/* Forward declaration; the definition is not in this part of the file. */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2398
2399 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2400 {
2401         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2402
2403         if (IS_ERR(evsel))
2404                 return false;
2405
2406         if (perf_evsel__field(evsel, "pathname") == NULL) {
2407                 perf_evsel__delete(evsel);
2408                 return false;
2409         }
2410
2411         evsel->handler = trace__vfs_getname;
2412         perf_evlist__add(evlist, evsel);
2413         return true;
2414 }
2415
2416 static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
2417                                     u64 config)
2418 {
2419         struct perf_evsel *evsel;
2420         struct perf_event_attr attr = {
2421                 .type = PERF_TYPE_SOFTWARE,
2422                 .mmap_data = 1,
2423         };
2424
2425         attr.config = config;
2426         attr.sample_period = 1;
2427
2428         event_attr_init(&attr);
2429
2430         evsel = perf_evsel__new(&attr);
2431         if (!evsel)
2432                 return -ENOMEM;
2433
2434         evsel->handler = trace__pgfault;
2435         perf_evlist__add(evlist, evsel);
2436
2437         return 0;
2438 }
2439
2440 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2441 {
2442         const u32 type = event->header.type;
2443         struct perf_evsel *evsel;
2444
2445         if (!trace->full_time && trace->base_time == 0)
2446                 trace->base_time = sample->time;
2447
2448         if (type != PERF_RECORD_SAMPLE) {
2449                 trace__process_event(trace, trace->host, event, sample);
2450                 return;
2451         }
2452
2453         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2454         if (evsel == NULL) {
2455                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2456                 return;
2457         }
2458
2459         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2460             sample->raw_data == NULL) {
2461                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2462                        perf_evsel__name(evsel), sample->tid,
2463                        sample->cpu, sample->raw_size);
2464         } else {
2465                 tracepoint_handler handler = evsel->handler;
2466                 handler(trace, evsel, event, sample);
2467         }
2468 }
2469
2470 static int trace__add_syscall_newtp(struct trace *trace)
2471 {
2472         int ret = -1;
2473         struct perf_evlist *evlist = trace->evlist;
2474         struct perf_evsel *sys_enter, *sys_exit;
2475
2476         sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2477         if (sys_enter == NULL)
2478                 goto out;
2479
2480         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2481                 goto out_delete_sys_enter;
2482
2483         sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2484         if (sys_exit == NULL)
2485                 goto out_delete_sys_enter;
2486
2487         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2488                 goto out_delete_sys_exit;
2489
2490         perf_evlist__add(evlist, sys_enter);
2491         perf_evlist__add(evlist, sys_exit);
2492
2493         trace->syscalls.events.sys_enter = sys_enter;
2494         trace->syscalls.events.sys_exit  = sys_exit;
2495
2496         ret = 0;
2497 out:
2498         return ret;
2499
2500 out_delete_sys_exit:
2501         perf_evsel__delete_priv(sys_exit);
2502 out_delete_sys_enter:
2503         perf_evsel__delete_priv(sys_enter);
2504         goto out;
2505 }
2506
2507 static int trace__set_ev_qualifier_filter(struct trace *trace)
2508 {
2509         int err = -1;
2510         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2511                                                 trace->ev_qualifier_ids.nr,
2512                                                 trace->ev_qualifier_ids.entries);
2513
2514         if (filter == NULL)
2515                 goto out_enomem;
2516
2517         if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
2518                 err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);
2519
2520         free(filter);
2521 out:
2522         return err;
2523 out_enomem:
2524         errno = ENOMEM;
2525         goto out;
2526 }
2527
2528 static int trace__run(struct trace *trace, int argc, const char **argv)
2529 {
2530         struct perf_evlist *evlist = trace->evlist;
2531         struct perf_evsel *evsel;
2532         int err = -1, i;
2533         unsigned long before;
2534         const bool forks = argc > 0;
2535         bool draining = false;
2536
2537         trace->live = true;
2538
2539         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2540                 goto out_error_raw_syscalls;
2541
2542         if (trace->trace_syscalls)
2543                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2544
2545         if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
2546             perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
2547                 goto out_error_mem;
2548         }
2549
2550         if ((trace->trace_pgfaults & TRACE_PFMIN) &&
2551             perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
2552                 goto out_error_mem;
2553
2554         if (trace->sched &&
2555             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2556                                    trace__sched_stat_runtime))
2557                 goto out_error_sched_stat_runtime;
2558
2559         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2560         if (err < 0) {
2561                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2562                 goto out_delete_evlist;
2563         }
2564
2565         err = trace__symbols_init(trace, evlist);
2566         if (err < 0) {
2567                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2568                 goto out_delete_evlist;
2569         }
2570
2571         perf_evlist__config(evlist, &trace->opts);
2572
2573         signal(SIGCHLD, sig_handler);
2574         signal(SIGINT, sig_handler);
2575
2576         if (forks) {
2577                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2578                                                     argv, false, NULL);
2579                 if (err < 0) {
2580                         fprintf(trace->output, "Couldn't run the workload!\n");
2581                         goto out_delete_evlist;
2582                 }
2583         }
2584
2585         err = perf_evlist__open(evlist);
2586         if (err < 0)
2587                 goto out_error_open;
2588
2589         /*
2590          * Better not use !target__has_task() here because we need to cover the
2591          * case where no threads were specified in the command line, but a
2592          * workload was, and in that case we will fill in the thread_map when
2593          * we fork the workload in perf_evlist__prepare_workload.
2594          */
2595         if (trace->filter_pids.nr > 0)
2596                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2597         else if (thread_map__pid(evlist->threads, 0) == -1)
2598                 err = perf_evlist__set_filter_pid(evlist, getpid());
2599
2600         if (err < 0)
2601                 goto out_error_mem;
2602
2603         if (trace->ev_qualifier_ids.nr > 0) {
2604                 err = trace__set_ev_qualifier_filter(trace);
2605                 if (err < 0)
2606                         goto out_errno;
2607
2608                 pr_debug("event qualifier tracepoint filter: %s\n",
2609                          trace->syscalls.events.sys_exit->filter);
2610         }
2611
2612         err = perf_evlist__apply_filters(evlist, &evsel);
2613         if (err < 0)
2614                 goto out_error_apply_filters;
2615
2616         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2617         if (err < 0)
2618                 goto out_error_mmap;
2619
2620         if (!target__none(&trace->opts.target))
2621                 perf_evlist__enable(evlist);
2622
2623         if (forks)
2624                 perf_evlist__start_workload(evlist);
2625
2626         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2627                                   evlist->threads->nr > 1 ||
2628                                   perf_evlist__first(evlist)->attr.inherit;
2629 again:
2630         before = trace->nr_events;
2631
2632         for (i = 0; i < evlist->nr_mmaps; i++) {
2633                 union perf_event *event;
2634
2635                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2636                         struct perf_sample sample;
2637
2638                         ++trace->nr_events;
2639
2640                         err = perf_evlist__parse_sample(evlist, event, &sample);
2641                         if (err) {
2642                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2643                                 goto next_event;
2644                         }
2645
2646                         trace__handle_event(trace, event, &sample);
2647 next_event:
2648                         perf_evlist__mmap_consume(evlist, i);
2649
2650                         if (interrupted)
2651                                 goto out_disable;
2652
2653                         if (done && !draining) {
2654                                 perf_evlist__disable(evlist);
2655                                 draining = true;
2656                         }
2657                 }
2658         }
2659
2660         if (trace->nr_events == before) {
2661                 int timeout = done ? 100 : -1;
2662
2663                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2664                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2665                                 draining = true;
2666
2667                         goto again;
2668                 }
2669         } else {
2670                 goto again;
2671         }
2672
2673 out_disable:
2674         thread__zput(trace->current);
2675
2676         perf_evlist__disable(evlist);
2677
2678         if (!err) {
2679                 if (trace->summary)
2680                         trace__fprintf_thread_summary(trace, trace->output);
2681
2682                 if (trace->show_tool_stats) {
2683                         fprintf(trace->output, "Stats:\n "
2684                                                " vfs_getname : %" PRIu64 "\n"
2685                                                " proc_getname: %" PRIu64 "\n",
2686                                 trace->stats.vfs_getname,
2687                                 trace->stats.proc_getname);
2688                 }
2689         }
2690
2691 out_delete_evlist:
2692         perf_evlist__delete(evlist);
2693         trace->evlist = NULL;
2694         trace->live = false;
2695         return err;
2696 {
2697         char errbuf[BUFSIZ];
2698
2699 out_error_sched_stat_runtime:
2700         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2701         goto out_error;
2702
2703 out_error_raw_syscalls:
2704         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2705         goto out_error;
2706
2707 out_error_mmap:
2708         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2709         goto out_error;
2710
2711 out_error_open:
2712         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2713
2714 out_error:
2715         fprintf(trace->output, "%s\n", errbuf);
2716         goto out_delete_evlist;
2717
2718 out_error_apply_filters:
2719         fprintf(trace->output,
2720                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2721                 evsel->filter, perf_evsel__name(evsel), errno,
2722                 strerror_r(errno, errbuf, sizeof(errbuf)));
2723         goto out_delete_evlist;
2724 }
2725 out_error_mem:
2726         fprintf(trace->output, "Not enough memory to run!\n");
2727         goto out_delete_evlist;
2728
2729 out_errno:
2730         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2731         goto out_delete_evlist;
2732 }
2733
2734 static int trace__replay(struct trace *trace)
2735 {
2736         const struct perf_evsel_str_handler handlers[] = {
2737                 { "probe:vfs_getname",       trace__vfs_getname, },
2738         };
2739         struct perf_data_file file = {
2740                 .path  = input_name,
2741                 .mode  = PERF_DATA_MODE_READ,
2742                 .force = trace->force,
2743         };
2744         struct perf_session *session;
2745         struct perf_evsel *evsel;
2746         int err = -1;
2747
2748         trace->tool.sample        = trace__process_sample;
2749         trace->tool.mmap          = perf_event__process_mmap;
2750         trace->tool.mmap2         = perf_event__process_mmap2;
2751         trace->tool.comm          = perf_event__process_comm;
2752         trace->tool.exit          = perf_event__process_exit;
2753         trace->tool.fork          = perf_event__process_fork;
2754         trace->tool.attr          = perf_event__process_attr;
2755         trace->tool.tracing_data = perf_event__process_tracing_data;
2756         trace->tool.build_id      = perf_event__process_build_id;
2757
2758         trace->tool.ordered_events = true;
2759         trace->tool.ordering_requires_timestamps = true;
2760
2761         /* add tid to output */
2762         trace->multiple_threads = true;
2763
2764         session = perf_session__new(&file, false, &trace->tool);
2765         if (session == NULL)
2766                 return -1;
2767
2768         if (symbol__init(&session->header.env) < 0)
2769                 goto out;
2770
2771         trace->host = &session->machines.host;
2772
2773         err = perf_session__set_tracepoints_handlers(session, handlers);
2774         if (err)
2775                 goto out;
2776
2777         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2778                                                      "raw_syscalls:sys_enter");
2779         /* older kernels have syscalls tp versus raw_syscalls */
2780         if (evsel == NULL)
2781                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2782                                                              "syscalls:sys_enter");
2783
2784         if (evsel &&
2785             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2786             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2787                 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2788                 goto out;
2789         }
2790
2791         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2792                                                      "raw_syscalls:sys_exit");
2793         if (evsel == NULL)
2794                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2795                                                              "syscalls:sys_exit");
2796         if (evsel &&
2797             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2798             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2799                 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2800                 goto out;
2801         }
2802
2803         evlist__for_each(session->evlist, evsel) {
2804                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2805                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2806                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2807                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2808                         evsel->handler = trace__pgfault;
2809         }
2810
2811         err = parse_target_str(trace);
2812         if (err != 0)
2813                 goto out;
2814
2815         setup_pager();
2816
2817         err = perf_session__process_events(session);
2818         if (err)
2819                 pr_err("Failed to process events, error %d", err);
2820
2821         else if (trace->summary)
2822                 trace__fprintf_thread_summary(trace, trace->output);
2823
2824 out:
2825         perf_session__delete(session);
2826
2827         return err;
2828 }
2829
/*
 * Print the heading of the per-thread summary to @fp.
 *
 * Returns the value reported by fprintf(), converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2838
2839 static size_t thread__dump_stats(struct thread_trace *ttrace,
2840                                  struct trace *trace, FILE *fp)
2841 {
2842         struct stats *stats;
2843         size_t printed = 0;
2844         struct syscall *sc;
2845         struct int_node *inode = intlist__first(ttrace->syscall_stats);
2846
2847         if (inode == NULL)
2848                 return 0;
2849
2850         printed += fprintf(fp, "\n");
2851
2852         printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2853         printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2854         printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2855
2856         /* each int_node is a syscall */
2857         while (inode) {
2858                 stats = inode->priv;
2859                 if (stats) {
2860                         double min = (double)(stats->min) / NSEC_PER_MSEC;
2861                         double max = (double)(stats->max) / NSEC_PER_MSEC;
2862                         double avg = avg_stats(stats);
2863                         double pct;
2864                         u64 n = (u64) stats->n;
2865
2866                         pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2867                         avg /= NSEC_PER_MSEC;
2868
2869                         sc = &trace->syscalls.table[inode->i];
2870                         printed += fprintf(fp, "   %-15s", sc->name);
2871                         printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2872                                            n, avg * n, min, avg);
2873                         printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2874                 }
2875
2876                 inode = intlist__next(inode);
2877         }
2878
2879         printed += fprintf(fp, "\n\n");
2880
2881         return printed;
2882 }
2883
/*
 * State handed to the per-thread summary callback
 * (trace__fprintf_one_thread) through its void *priv argument.
 */
struct summary_data {
	FILE *fp;		/* stream the summary is written to */
	struct trace *trace;	/* trace whose threads are being summarized */
	size_t printed;		/* running total of fprintf() return values */
};
2890
2891 static int trace__fprintf_one_thread(struct thread *thread, void *priv)
2892 {
2893         struct summary_data *data = priv;
2894         FILE *fp = data->fp;
2895         size_t printed = data->printed;
2896         struct trace *trace = data->trace;
2897         struct thread_trace *ttrace = thread__priv(thread);
2898         double ratio;
2899
2900         if (ttrace == NULL)
2901                 return 0;
2902
2903         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2904
2905         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2906         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2907         printed += fprintf(fp, "%.1f%%", ratio);
2908         if (ttrace->pfmaj)
2909                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2910         if (ttrace->pfmin)
2911                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2912         printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2913         printed += thread__dump_stats(ttrace, trace, fp);
2914
2915         data->printed += printed;
2916
2917         return 0;
2918 }
2919
2920 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2921 {
2922         struct summary_data data = {
2923                 .fp = fp,
2924                 .trace = trace
2925         };
2926         data.printed = trace__fprintf_threads_header(fp);
2927
2928         machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
2929
2930         return data.printed;
2931 }
2932
2933 static int trace__set_duration(const struct option *opt, const char *str,
2934                                int unset __maybe_unused)
2935 {
2936         struct trace *trace = opt->value;
2937
2938         trace->duration_filter = atof(str);
2939         return 0;
2940 }
2941
2942 static int trace__set_filter_pids(const struct option *opt, const char *str,
2943                                   int unset __maybe_unused)
2944 {
2945         int ret = -1;
2946         size_t i;
2947         struct trace *trace = opt->value;
2948         /*
2949          * FIXME: introduce a intarray class, plain parse csv and create a
2950          * { int nr, int entries[] } struct...
2951          */
2952         struct intlist *list = intlist__new(str);
2953
2954         if (list == NULL)
2955                 return -1;
2956
2957         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2958         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2959
2960         if (trace->filter_pids.entries == NULL)
2961                 goto out;
2962
2963         trace->filter_pids.entries[0] = getpid();
2964
2965         for (i = 1; i < trace->filter_pids.nr; ++i)
2966                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2967
2968         intlist__delete(list);
2969         ret = 0;
2970 out:
2971         return ret;
2972 }
2973
2974 static int trace__open_output(struct trace *trace, const char *filename)
2975 {
2976         struct stat st;
2977
2978         if (!stat(filename, &st) && st.st_size) {
2979                 char oldname[PATH_MAX];
2980
2981                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2982                 unlink(oldname);
2983                 rename(filename, oldname);
2984         }
2985
2986         trace->output = fopen(filename, "w");
2987
2988         return trace->output == NULL ? -errno : 0;
2989 }
2990
2991 static int parse_pagefaults(const struct option *opt, const char *str,
2992                             int unset __maybe_unused)
2993 {
2994         int *trace_pgfaults = opt->value;
2995
2996         if (strcmp(str, "all") == 0)
2997                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2998         else if (strcmp(str, "maj") == 0)
2999                 *trace_pgfaults |= TRACE_PFMAJ;
3000         else if (strcmp(str, "min") == 0)
3001                 *trace_pgfaults |= TRACE_PFMIN;
3002         else
3003                 return -1;
3004
3005         return 0;
3006 }
3007
3008 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
3009 {
3010         struct perf_evsel *evsel;
3011
3012         evlist__for_each(evlist, evsel)
3013                 evsel->handler = handler;
3014 }
3015
3016 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
3017 {
3018         const char *trace_usage[] = {
3019                 "perf trace [<options>] [<command>]",
3020                 "perf trace [<options>] -- <command> [<options>]",
3021                 "perf trace record [<options>] [<command>]",
3022                 "perf trace record [<options>] -- <command> [<options>]",
3023                 NULL
3024         };
3025         struct trace trace = {
3026                 .audit = {
3027                         .machine = audit_detect_machine(),
3028                         .open_id = audit_name_to_syscall("open", trace.audit.machine),
3029                 },
3030                 .syscalls = {
3031                         . max = -1,
3032                 },
3033                 .opts = {
3034                         .target = {
3035                                 .uid       = UINT_MAX,
3036                                 .uses_mmap = true,
3037                         },
3038                         .user_freq     = UINT_MAX,
3039                         .user_interval = ULLONG_MAX,
3040                         .no_buffering  = true,
3041                         .mmap_pages    = UINT_MAX,
3042                         .proc_map_timeout  = 500,
3043                 },
3044                 .output = stderr,
3045                 .show_comm = true,
3046                 .trace_syscalls = true,
3047         };
3048         const char *output_name = NULL;
3049         const char *ev_qualifier_str = NULL;
3050         const struct option trace_options[] = {
3051         OPT_CALLBACK(0, "event", &trace.evlist, "event",
3052                      "event selector. use 'perf list' to list available events",
3053                      parse_events_option),
3054         OPT_BOOLEAN(0, "comm", &trace.show_comm,
3055                     "show the thread COMM next to its id"),
3056         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3057         OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
3058         OPT_STRING('o', "output", &output_name, "file", "output file name"),
3059         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3060         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3061                     "trace events on existing process id"),
3062         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3063                     "trace events on existing thread id"),
3064         OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3065                      "pids to filter (by the kernel)", trace__set_filter_pids),
3066         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3067                     "system-wide collection from all CPUs"),
3068         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3069                     "list of cpus to monitor"),
3070         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3071                     "child tasks do not inherit counters"),
3072         OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3073                      "number of mmap data pages",
3074                      perf_evlist__parse_mmap_pages),
3075         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3076                    "user to profile"),
3077         OPT_CALLBACK(0, "duration", &trace, "float",
3078                      "show only events with duration > N.M ms",
3079                      trace__set_duration),
3080         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3081         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3082         OPT_BOOLEAN('T', "time", &trace.full_time,
3083                     "Show full timestamp, not time relative to first start"),
3084         OPT_BOOLEAN('s', "summary", &trace.summary_only,
3085                     "Show only syscall summary with statistics"),
3086         OPT_BOOLEAN('S', "with-summary", &trace.summary,
3087                     "Show all syscalls and summary with statistics"),
3088         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3089                      "Trace pagefaults", parse_pagefaults, "maj"),
3090         OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3091         OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3092         OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3093                         "per thread proc mmap processing timeout in ms"),
3094         OPT_END()
3095         };
3096         const char * const trace_subcommands[] = { "record", NULL };
3097         int err;
3098         char bf[BUFSIZ];
3099
3100         signal(SIGSEGV, sighandler_dump_stack);
3101         signal(SIGFPE, sighandler_dump_stack);
3102
3103         trace.evlist = perf_evlist__new();
3104
3105         if (trace.evlist == NULL) {
3106                 pr_err("Not enough memory to run!\n");
3107                 err = -ENOMEM;
3108                 goto out;
3109         }
3110
3111         argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3112                                  trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3113
3114         if (trace.trace_pgfaults) {
3115                 trace.opts.sample_address = true;
3116                 trace.opts.sample_time = true;
3117         }
3118
3119         if (trace.evlist->nr_entries > 0)
3120                 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3121
3122         if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3123                 return trace__record(&trace, argc-1, &argv[1]);
3124
3125         /* summary_only implies summary option, but don't overwrite summary if set */
3126         if (trace.summary_only)
3127                 trace.summary = trace.summary_only;
3128
3129         if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3130             trace.evlist->nr_entries == 0 /* Was --events used? */) {
3131                 pr_err("Please specify something to trace.\n");
3132                 return -1;
3133         }
3134
3135         if (output_name != NULL) {
3136                 err = trace__open_output(&trace, output_name);
3137                 if (err < 0) {
3138                         perror("failed to create output file");
3139                         goto out;
3140                 }
3141         }
3142
3143         if (ev_qualifier_str != NULL) {
3144                 const char *s = ev_qualifier_str;
3145                 struct strlist_config slist_config = {
3146                         .dirname = system_path(STRACE_GROUPS_DIR),
3147                 };
3148
3149                 trace.not_ev_qualifier = *s == '!';
3150                 if (trace.not_ev_qualifier)
3151                         ++s;
3152                 trace.ev_qualifier = strlist__new(s, &slist_config);
3153                 if (trace.ev_qualifier == NULL) {
3154                         fputs("Not enough memory to parse event qualifier",
3155                               trace.output);
3156                         err = -ENOMEM;
3157                         goto out_close;
3158                 }
3159
3160                 err = trace__validate_ev_qualifier(&trace);
3161                 if (err)
3162                         goto out_close;
3163         }
3164
3165         err = target__validate(&trace.opts.target);
3166         if (err) {
3167                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3168                 fprintf(trace.output, "%s", bf);
3169                 goto out_close;
3170         }
3171
3172         err = target__parse_uid(&trace.opts.target);
3173         if (err) {
3174                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3175                 fprintf(trace.output, "%s", bf);
3176                 goto out_close;
3177         }
3178
3179         if (!argc && target__none(&trace.opts.target))
3180                 trace.opts.target.system_wide = true;
3181
3182         if (input_name)
3183                 err = trace__replay(&trace);
3184         else
3185                 err = trace__run(&trace, argc, argv);
3186
3187 out_close:
3188         if (output_name != NULL)
3189                 fclose(trace.output);
3190 out:
3191         return err;
3192 }