]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - tools/perf/builtin-trace.c
sched: Don't scan all-offline ->cpus_allowed twice if !CONFIG_CPUSETS
[karo-tx-linux.git] / tools / perf / builtin-trace.c
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include "builtin.h"
21 #include "util/color.h"
22 #include "util/debug.h"
23 #include "util/evlist.h"
24 #include "util/exec_cmd.h"
25 #include "util/machine.h"
26 #include "util/session.h"
27 #include "util/thread.h"
28 #include "util/parse-options.h"
29 #include "util/strlist.h"
30 #include "util/intlist.h"
31 #include "util/thread_map.h"
32 #include "util/stat.h"
33 #include "trace-event.h"
34 #include "util/parse-events.h"
35
36 #include <libaudit.h>
37 #include <stdlib.h>
38 #include <sys/mman.h>
39 #include <linux/futex.h>
40
/*
 * Fallback definitions for build hosts whose system headers do not provide
 * these constants (the note that used to sit here read "For older distros").
 * Every value is guarded, so a definition coming from the headers included
 * above always wins.
 */

/* mmap() flags */
#ifndef MAP_STACK
#define MAP_STACK               0x20000
#endif

/* madvise() behaviours */
#ifndef MADV_HWPOISON
#define MADV_HWPOISON           100
#endif

#ifndef MADV_MERGEABLE
#define MADV_MERGEABLE          12
#endif

#ifndef MADV_UNMERGEABLE
#define MADV_UNMERGEABLE        13
#endif

/* eventfd() flags */
#ifndef EFD_SEMAPHORE
#define EFD_SEMAPHORE           1
#endif

#ifndef EFD_NONBLOCK
#define EFD_NONBLOCK            00004000
#endif

#ifndef EFD_CLOEXEC
#define EFD_CLOEXEC             02000000
#endif

/* open() flags */
#ifndef O_CLOEXEC
#define O_CLOEXEC               02000000
#endif

/* socket() types and type flags */
#ifndef SOCK_DCCP
#define SOCK_DCCP               6
#endif

#ifndef SOCK_CLOEXEC
#define SOCK_CLOEXEC            02000000
#endif

#ifndef SOCK_NONBLOCK
#define SOCK_NONBLOCK           00004000
#endif

/* recvmsg()/sendmsg() flags */
#ifndef MSG_CMSG_CLOEXEC
#define MSG_CMSG_CLOEXEC        0x40000000
#endif

/* perf_event_open() flags */
#ifndef PERF_FLAG_FD_NO_GROUP
#define PERF_FLAG_FD_NO_GROUP           (1UL << 0)
#endif

#ifndef PERF_FLAG_FD_OUTPUT
#define PERF_FLAG_FD_OUTPUT             (1UL << 1)
#endif

#ifndef PERF_FLAG_PID_CGROUP
#define PERF_FLAG_PID_CGROUP            (1UL << 2) /* pid=cgroup id, per-cpu mode only */
#endif

#ifndef PERF_FLAG_FD_CLOEXEC
#define PERF_FLAG_FD_CLOEXEC            (1UL << 3) /* O_CLOEXEC */
#endif
106
107
108 struct tp_field {
109         int offset;
110         union {
111                 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
112                 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
113         };
114 };
115
116 #define TP_UINT_FIELD(bits) \
117 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
118 { \
119         u##bits value; \
120         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
121         return value;  \
122 }
123
124 TP_UINT_FIELD(8);
125 TP_UINT_FIELD(16);
126 TP_UINT_FIELD(32);
127 TP_UINT_FIELD(64);
128
129 #define TP_UINT_FIELD__SWAPPED(bits) \
130 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
131 { \
132         u##bits value; \
133         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
134         return bswap_##bits(value);\
135 }
136
137 TP_UINT_FIELD__SWAPPED(16);
138 TP_UINT_FIELD__SWAPPED(32);
139 TP_UINT_FIELD__SWAPPED(64);
140
141 static int tp_field__init_uint(struct tp_field *field,
142                                struct format_field *format_field,
143                                bool needs_swap)
144 {
145         field->offset = format_field->offset;
146
147         switch (format_field->size) {
148         case 1:
149                 field->integer = tp_field__u8;
150                 break;
151         case 2:
152                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
153                 break;
154         case 4:
155                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
156                 break;
157         case 8:
158                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
159                 break;
160         default:
161                 return -1;
162         }
163
164         return 0;
165 }
166
167 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
168 {
169         return sample->raw_data + field->offset;
170 }
171
172 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
173 {
174         field->offset = format_field->offset;
175         field->pointer = tp_field__ptr;
176         return 0;
177 }
178
179 struct syscall_tp {
180         struct tp_field id;
181         union {
182                 struct tp_field args, ret;
183         };
184 };
185
186 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
187                                           struct tp_field *field,
188                                           const char *name)
189 {
190         struct format_field *format_field = perf_evsel__field(evsel, name);
191
192         if (format_field == NULL)
193                 return -1;
194
195         return tp_field__init_uint(field, format_field, evsel->needs_swap);
196 }
197
198 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
199         ({ struct syscall_tp *sc = evsel->priv;\
200            perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
201
/*
 * Look up the tracepoint field called @name on @evsel and initialise @field
 * as a pointer accessor for it.
 *
 * Returns -1 when the evsel has no such field, otherwise whatever
 * tp_field__init_ptr() returns.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
                                         struct tp_field *field,
                                         const char *name)
{
        struct format_field *fmt = perf_evsel__field(evsel, name);

        return fmt != NULL ? tp_field__init_ptr(field, fmt) : -1;
}

/*
 * Convenience wrapper: initialise the struct syscall_tp member called 'name'
 * (found via evsel->priv) as a pointer to the tracepoint field of that name.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
        ({ struct syscall_tp *sc = evsel->priv;\
           perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
217
218 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
219 {
220         zfree(&evsel->priv);
221         perf_evsel__delete(evsel);
222 }
223
224 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
225 {
226         evsel->priv = malloc(sizeof(struct syscall_tp));
227         if (evsel->priv != NULL) {
228                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
229                         goto out_delete;
230
231                 evsel->handler = handler;
232                 return 0;
233         }
234
235         return -ENOMEM;
236
237 out_delete:
238         zfree(&evsel->priv);
239         return -ENOENT;
240 }
241
/*
 * Create the evsel for the syscall tracepoint named @direction and prepare
 * it with perf_evsel__init_syscall_tp().
 *
 * The "raw_syscalls" subsystem is tried first, then "syscalls" as a
 * fallback.  Returns the new evsel, or NULL when neither tracepoint could be
 * created or the syscall_tp setup failed (the evsel is destroyed then).
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
        struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

        /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
        if (evsel == NULL)
                evsel = perf_evsel__newtp("syscalls", direction);

        if (evsel == NULL)
                return NULL;

        if (perf_evsel__init_syscall_tp(evsel, handler)) {
                perf_evsel__delete_priv(evsel);
                return NULL;
        }

        return evsel;
}
261
/*
 * Read the struct syscall_tp member 'name' of @evsel from @sample as an
 * unsigned integer, through the reader installed at init time.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
        ({ struct syscall_tp *fields = evsel->priv; \
           fields->name.integer(&fields->name, sample); })

/*
 * Same as perf_evsel__sc_tp_uint() but yields a pointer into the sample's
 * raw data instead of a value.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
        ({ struct syscall_tp *fields = evsel->priv; \
           fields->name.pointer(&fields->name, sample); })
269
270 struct syscall_arg {
271         unsigned long val;
272         struct thread *thread;
273         struct trace  *trace;
274         void          *parm;
275         u8            idx;
276         u8            mask;
277 };
278
/*
 * Table mapping small integers to names.
 *
 * @offset:     value that corresponds to entries[0]
 * @nr_entries: number of slots in @entries
 * @entries:    the names, indexed by (value - offset)
 */
struct strarray {
        int         offset;
        int         nr_entries;
        const char **entries;
};

/* Define strarray__<array> for a table whose first entry stands for 0. */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
        .entries    = array, \
        .nr_entries = ARRAY_SIZE(array), \
}

/* Define strarray__<array> for a table whose first entry stands for 'off'. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
        .entries    = array, \
        .nr_entries = ARRAY_SIZE(array), \
        .offset     = off, \
}
295
296 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
297                                                 const char *intfmt,
298                                                 struct syscall_arg *arg)
299 {
300         struct strarray *sa = arg->parm;
301         int idx = arg->val - sa->offset;
302
303         if (idx < 0 || idx >= sa->nr_entries)
304                 return scnprintf(bf, size, intfmt, arg->val);
305
306         return scnprintf(bf, size, "%s", sa->entries[idx]);
307 }
308
/*
 * strarray beautifier that prints values without a name in decimal ("%d").
 */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
                                              struct syscall_arg *arg)
{
        static const char decimal_fmt[] = "%d";

        return __syscall_arg__scnprintf_strarray(bf, size, decimal_fmt, arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
316
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 *        gets rewritten to support all arches.
 *
 * strarray beautifier that prints values without a name in hex ("%#x").
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
                                                 struct syscall_arg *arg)
{
        static const char hex_fmt[] = "%#x";

        return __syscall_arg__scnprintf_strarray(bf, size, hex_fmt, arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
330
331 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
332                                         struct syscall_arg *arg);
333
334 #define SCA_FD syscall_arg__scnprintf_fd
335
336 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
337                                            struct syscall_arg *arg)
338 {
339         int fd = arg->val;
340
341         if (fd == AT_FDCWD)
342                 return scnprintf(bf, size, "CWD");
343
344         return syscall_arg__scnprintf_fd(bf, size, arg);
345 }
346
347 #define SCA_FDAT syscall_arg__scnprintf_fd_at
348
/*
 * Beautifier for the fd argument of close(); declared here, defined
 * elsewhere in the file.
 */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
                                              struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
353
354 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
355                                          struct syscall_arg *arg)
356 {
357         return scnprintf(bf, size, "%#lx", arg->val);
358 }
359
360 #define SCA_HEX syscall_arg__scnprintf_hex
361
362 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
363                                          struct syscall_arg *arg)
364 {
365         return scnprintf(bf, size, "%d", arg->val);
366 }
367
368 #define SCA_INT syscall_arg__scnprintf_int
369
370 static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
371                                                struct syscall_arg *arg)
372 {
373         int printed = 0, prot = arg->val;
374
375         if (prot == PROT_NONE)
376                 return scnprintf(bf, size, "NONE");
377 #define P_MMAP_PROT(n) \
378         if (prot & PROT_##n) { \
379                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
380                 prot &= ~PROT_##n; \
381         }
382
383         P_MMAP_PROT(EXEC);
384         P_MMAP_PROT(READ);
385         P_MMAP_PROT(WRITE);
386 #ifdef PROT_SEM
387         P_MMAP_PROT(SEM);
388 #endif
389         P_MMAP_PROT(GROWSDOWN);
390         P_MMAP_PROT(GROWSUP);
391 #undef P_MMAP_PROT
392
393         if (prot)
394                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
395
396         return printed;
397 }
398
399 #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
400
401 static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
402                                                 struct syscall_arg *arg)
403 {
404         int printed = 0, flags = arg->val;
405
406 #define P_MMAP_FLAG(n) \
407         if (flags & MAP_##n) { \
408                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
409                 flags &= ~MAP_##n; \
410         }
411
412         P_MMAP_FLAG(SHARED);
413         P_MMAP_FLAG(PRIVATE);
414 #ifdef MAP_32BIT
415         P_MMAP_FLAG(32BIT);
416 #endif
417         P_MMAP_FLAG(ANONYMOUS);
418         P_MMAP_FLAG(DENYWRITE);
419         P_MMAP_FLAG(EXECUTABLE);
420         P_MMAP_FLAG(FILE);
421         P_MMAP_FLAG(FIXED);
422         P_MMAP_FLAG(GROWSDOWN);
423 #ifdef MAP_HUGETLB
424         P_MMAP_FLAG(HUGETLB);
425 #endif
426         P_MMAP_FLAG(LOCKED);
427         P_MMAP_FLAG(NONBLOCK);
428         P_MMAP_FLAG(NORESERVE);
429         P_MMAP_FLAG(POPULATE);
430         P_MMAP_FLAG(STACK);
431 #ifdef MAP_UNINITIALIZED
432         P_MMAP_FLAG(UNINITIALIZED);
433 #endif
434 #undef P_MMAP_FLAG
435
436         if (flags)
437                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
438
439         return printed;
440 }
441
442 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
443
444 static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
445                                                   struct syscall_arg *arg)
446 {
447         int printed = 0, flags = arg->val;
448
449 #define P_MREMAP_FLAG(n) \
450         if (flags & MREMAP_##n) { \
451                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
452                 flags &= ~MREMAP_##n; \
453         }
454
455         P_MREMAP_FLAG(MAYMOVE);
456 #ifdef MREMAP_FIXED
457         P_MREMAP_FLAG(FIXED);
458 #endif
459 #undef P_MREMAP_FLAG
460
461         if (flags)
462                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
463
464         return printed;
465 }
466
467 #define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
468
469 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
470                                                       struct syscall_arg *arg)
471 {
472         int behavior = arg->val;
473
474         switch (behavior) {
475 #define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
476         P_MADV_BHV(NORMAL);
477         P_MADV_BHV(RANDOM);
478         P_MADV_BHV(SEQUENTIAL);
479         P_MADV_BHV(WILLNEED);
480         P_MADV_BHV(DONTNEED);
481         P_MADV_BHV(REMOVE);
482         P_MADV_BHV(DONTFORK);
483         P_MADV_BHV(DOFORK);
484         P_MADV_BHV(HWPOISON);
485 #ifdef MADV_SOFT_OFFLINE
486         P_MADV_BHV(SOFT_OFFLINE);
487 #endif
488         P_MADV_BHV(MERGEABLE);
489         P_MADV_BHV(UNMERGEABLE);
490 #ifdef MADV_HUGEPAGE
491         P_MADV_BHV(HUGEPAGE);
492 #endif
493 #ifdef MADV_NOHUGEPAGE
494         P_MADV_BHV(NOHUGEPAGE);
495 #endif
496 #ifdef MADV_DONTDUMP
497         P_MADV_BHV(DONTDUMP);
498 #endif
499 #ifdef MADV_DODUMP
500         P_MADV_BHV(DODUMP);
501 #endif
502 #undef P_MADV_PHV
503         default: break;
504         }
505
506         return scnprintf(bf, size, "%#x", behavior);
507 }
508
509 #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
510
511 static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
512                                            struct syscall_arg *arg)
513 {
514         int printed = 0, op = arg->val;
515
516         if (op == 0)
517                 return scnprintf(bf, size, "NONE");
518 #define P_CMD(cmd) \
519         if ((op & LOCK_##cmd) == LOCK_##cmd) { \
520                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
521                 op &= ~LOCK_##cmd; \
522         }
523
524         P_CMD(SH);
525         P_CMD(EX);
526         P_CMD(NB);
527         P_CMD(UN);
528         P_CMD(MAND);
529         P_CMD(RW);
530         P_CMD(READ);
531         P_CMD(WRITE);
532 #undef P_OP
533
534         if (op)
535                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
536
537         return printed;
538 }
539
540 #define SCA_FLOCK syscall_arg__scnprintf_flock
541
542 static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
543 {
544         enum syscall_futex_args {
545                 SCF_UADDR   = (1 << 0),
546                 SCF_OP      = (1 << 1),
547                 SCF_VAL     = (1 << 2),
548                 SCF_TIMEOUT = (1 << 3),
549                 SCF_UADDR2  = (1 << 4),
550                 SCF_VAL3    = (1 << 5),
551         };
552         int op = arg->val;
553         int cmd = op & FUTEX_CMD_MASK;
554         size_t printed = 0;
555
556         switch (cmd) {
557 #define P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
558         P_FUTEX_OP(WAIT);           arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
559         P_FUTEX_OP(WAKE);           arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
560         P_FUTEX_OP(FD);             arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
561         P_FUTEX_OP(REQUEUE);        arg->mask |= SCF_VAL3|SCF_TIMEOUT;            break;
562         P_FUTEX_OP(CMP_REQUEUE);    arg->mask |= SCF_TIMEOUT;                     break;
563         P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT;                     break;
564         P_FUTEX_OP(WAKE_OP);                                                      break;
565         P_FUTEX_OP(LOCK_PI);        arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
566         P_FUTEX_OP(UNLOCK_PI);      arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
567         P_FUTEX_OP(TRYLOCK_PI);     arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
568         P_FUTEX_OP(WAIT_BITSET);    arg->mask |= SCF_UADDR2;                      break;
569         P_FUTEX_OP(WAKE_BITSET);    arg->mask |= SCF_UADDR2;                      break;
570         P_FUTEX_OP(WAIT_REQUEUE_PI);                                              break;
571         default: printed = scnprintf(bf, size, "%#x", cmd);                       break;
572         }
573
574         if (op & FUTEX_PRIVATE_FLAG)
575                 printed += scnprintf(bf + printed, size - printed, "|PRIV");
576
577         if (op & FUTEX_CLOCK_REALTIME)
578                 printed += scnprintf(bf + printed, size - printed, "|CLKRT");
579
580         return printed;
581 }
582
583 #define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
584
585 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
586 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
587
588 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
589 static DEFINE_STRARRAY(itimers);
590
591 static const char *keyctl_options[] = {
592         "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
593         "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
594         "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
595         "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
596         "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
597 };
598 static DEFINE_STRARRAY(keyctl_options);
599
600 static const char *whences[] = { "SET", "CUR", "END",
601 #ifdef SEEK_DATA
602 "DATA",
603 #endif
604 #ifdef SEEK_HOLE
605 "HOLE",
606 #endif
607 };
608 static DEFINE_STRARRAY(whences);
609
610 static const char *fcntl_cmds[] = {
611         "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
612         "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
613         "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
614         "F_GETOWNER_UIDS",
615 };
616 static DEFINE_STRARRAY(fcntl_cmds);
617
618 static const char *rlimit_resources[] = {
619         "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
620         "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
621         "RTTIME",
622 };
623 static DEFINE_STRARRAY(rlimit_resources);
624
625 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
626 static DEFINE_STRARRAY(sighow);
627
628 static const char *clockid[] = {
629         "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
630         "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
631         "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
632 };
633 static DEFINE_STRARRAY(clockid);
634
/*
 * Socket address/protocol family names, indexed by the AF_* / PF_* number.
 *
 * The position of every string must equal the numeric value of its family.
 * Family 28 (AF_MPLS in the Linux headers) sits between "IB" (27) and "CAN"
 * (29); without an entry in that slot "CAN" and every later family would be
 * reported under the wrong number.
 */
static const char *socket_families[] = {
        "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
        "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
        "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
        "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "MPLS", "CAN",
        "TIPC", "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154",
        "CAIF", "ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
644
645 #ifndef SOCK_TYPE_MASK
646 #define SOCK_TYPE_MASK 0xf
647 #endif
648
649 static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
650                                                       struct syscall_arg *arg)
651 {
652         size_t printed;
653         int type = arg->val,
654             flags = type & ~SOCK_TYPE_MASK;
655
656         type &= SOCK_TYPE_MASK;
657         /*
658          * Can't use a strarray, MIPS may override for ABI reasons.
659          */
660         switch (type) {
661 #define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
662         P_SK_TYPE(STREAM);
663         P_SK_TYPE(DGRAM);
664         P_SK_TYPE(RAW);
665         P_SK_TYPE(RDM);
666         P_SK_TYPE(SEQPACKET);
667         P_SK_TYPE(DCCP);
668         P_SK_TYPE(PACKET);
669 #undef P_SK_TYPE
670         default:
671                 printed = scnprintf(bf, size, "%#x", type);
672         }
673
674 #define P_SK_FLAG(n) \
675         if (flags & SOCK_##n) { \
676                 printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
677                 flags &= ~SOCK_##n; \
678         }
679
680         P_SK_FLAG(CLOEXEC);
681         P_SK_FLAG(NONBLOCK);
682 #undef P_SK_FLAG
683
684         if (flags)
685                 printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
686
687         return printed;
688 }
689
690 #define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
691
692 #ifndef MSG_PROBE
693 #define MSG_PROBE            0x10
694 #endif
695 #ifndef MSG_WAITFORONE
696 #define MSG_WAITFORONE  0x10000
697 #endif
698 #ifndef MSG_SENDPAGE_NOTLAST
699 #define MSG_SENDPAGE_NOTLAST 0x20000
700 #endif
701 #ifndef MSG_FASTOPEN
702 #define MSG_FASTOPEN         0x20000000
703 #endif
704
705 static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
706                                                struct syscall_arg *arg)
707 {
708         int printed = 0, flags = arg->val;
709
710         if (flags == 0)
711                 return scnprintf(bf, size, "NONE");
712 #define P_MSG_FLAG(n) \
713         if (flags & MSG_##n) { \
714                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
715                 flags &= ~MSG_##n; \
716         }
717
718         P_MSG_FLAG(OOB);
719         P_MSG_FLAG(PEEK);
720         P_MSG_FLAG(DONTROUTE);
721         P_MSG_FLAG(TRYHARD);
722         P_MSG_FLAG(CTRUNC);
723         P_MSG_FLAG(PROBE);
724         P_MSG_FLAG(TRUNC);
725         P_MSG_FLAG(DONTWAIT);
726         P_MSG_FLAG(EOR);
727         P_MSG_FLAG(WAITALL);
728         P_MSG_FLAG(FIN);
729         P_MSG_FLAG(SYN);
730         P_MSG_FLAG(CONFIRM);
731         P_MSG_FLAG(RST);
732         P_MSG_FLAG(ERRQUEUE);
733         P_MSG_FLAG(NOSIGNAL);
734         P_MSG_FLAG(MORE);
735         P_MSG_FLAG(WAITFORONE);
736         P_MSG_FLAG(SENDPAGE_NOTLAST);
737         P_MSG_FLAG(FASTOPEN);
738         P_MSG_FLAG(CMSG_CLOEXEC);
739 #undef P_MSG_FLAG
740
741         if (flags)
742                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
743
744         return printed;
745 }
746
747 #define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
748
749 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
750                                                  struct syscall_arg *arg)
751 {
752         size_t printed = 0;
753         int mode = arg->val;
754
755         if (mode == F_OK) /* 0 */
756                 return scnprintf(bf, size, "F");
757 #define P_MODE(n) \
758         if (mode & n##_OK) { \
759                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
760                 mode &= ~n##_OK; \
761         }
762
763         P_MODE(R);
764         P_MODE(W);
765         P_MODE(X);
766 #undef P_MODE
767
768         if (mode)
769                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
770
771         return printed;
772 }
773
774 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
775
/*
 * Beautifier for pathname arguments; declared here so the syscall format
 * table can refer to it, defined elsewhere in the file.
 */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
                                              struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
780
781 static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
782                                                struct syscall_arg *arg)
783 {
784         int printed = 0, flags = arg->val;
785
786         if (!(flags & O_CREAT))
787                 arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
788
789         if (flags == 0)
790                 return scnprintf(bf, size, "RDONLY");
791 #define P_FLAG(n) \
792         if (flags & O_##n) { \
793                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
794                 flags &= ~O_##n; \
795         }
796
797         P_FLAG(APPEND);
798         P_FLAG(ASYNC);
799         P_FLAG(CLOEXEC);
800         P_FLAG(CREAT);
801         P_FLAG(DIRECT);
802         P_FLAG(DIRECTORY);
803         P_FLAG(EXCL);
804         P_FLAG(LARGEFILE);
805         P_FLAG(NOATIME);
806         P_FLAG(NOCTTY);
807 #ifdef O_NONBLOCK
808         P_FLAG(NONBLOCK);
809 #elif O_NDELAY
810         P_FLAG(NDELAY);
811 #endif
812 #ifdef O_PATH
813         P_FLAG(PATH);
814 #endif
815         P_FLAG(RDWR);
816 #ifdef O_DSYNC
817         if ((flags & O_SYNC) == O_SYNC)
818                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
819         else {
820                 P_FLAG(DSYNC);
821         }
822 #else
823         P_FLAG(SYNC);
824 #endif
825         P_FLAG(TRUNC);
826         P_FLAG(WRONLY);
827 #undef P_FLAG
828
829         if (flags)
830                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
831
832         return printed;
833 }
834
835 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
836
837 static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
838                                                 struct syscall_arg *arg)
839 {
840         int printed = 0, flags = arg->val;
841
842         if (flags == 0)
843                 return 0;
844
845 #define P_FLAG(n) \
846         if (flags & PERF_FLAG_##n) { \
847                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
848                 flags &= ~PERF_FLAG_##n; \
849         }
850
851         P_FLAG(FD_NO_GROUP);
852         P_FLAG(FD_OUTPUT);
853         P_FLAG(PID_CGROUP);
854         P_FLAG(FD_CLOEXEC);
855 #undef P_FLAG
856
857         if (flags)
858                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
859
860         return printed;
861 }
862
863 #define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags
864
865 static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
866                                                    struct syscall_arg *arg)
867 {
868         int printed = 0, flags = arg->val;
869
870         if (flags == 0)
871                 return scnprintf(bf, size, "NONE");
872 #define P_FLAG(n) \
873         if (flags & EFD_##n) { \
874                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
875                 flags &= ~EFD_##n; \
876         }
877
878         P_FLAG(SEMAPHORE);
879         P_FLAG(CLOEXEC);
880         P_FLAG(NONBLOCK);
881 #undef P_FLAG
882
883         if (flags)
884                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
885
886         return printed;
887 }
888
889 #define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
890
891 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
892                                                 struct syscall_arg *arg)
893 {
894         int printed = 0, flags = arg->val;
895
896 #define P_FLAG(n) \
897         if (flags & O_##n) { \
898                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
899                 flags &= ~O_##n; \
900         }
901
902         P_FLAG(CLOEXEC);
903         P_FLAG(NONBLOCK);
904 #undef P_FLAG
905
906         if (flags)
907                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
908
909         return printed;
910 }
911
912 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
913
914 static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
915 {
916         int sig = arg->val;
917
918         switch (sig) {
919 #define P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
920         P_SIGNUM(HUP);
921         P_SIGNUM(INT);
922         P_SIGNUM(QUIT);
923         P_SIGNUM(ILL);
924         P_SIGNUM(TRAP);
925         P_SIGNUM(ABRT);
926         P_SIGNUM(BUS);
927         P_SIGNUM(FPE);
928         P_SIGNUM(KILL);
929         P_SIGNUM(USR1);
930         P_SIGNUM(SEGV);
931         P_SIGNUM(USR2);
932         P_SIGNUM(PIPE);
933         P_SIGNUM(ALRM);
934         P_SIGNUM(TERM);
935         P_SIGNUM(CHLD);
936         P_SIGNUM(CONT);
937         P_SIGNUM(STOP);
938         P_SIGNUM(TSTP);
939         P_SIGNUM(TTIN);
940         P_SIGNUM(TTOU);
941         P_SIGNUM(URG);
942         P_SIGNUM(XCPU);
943         P_SIGNUM(XFSZ);
944         P_SIGNUM(VTALRM);
945         P_SIGNUM(PROF);
946         P_SIGNUM(WINCH);
947         P_SIGNUM(IO);
948         P_SIGNUM(PWR);
949         P_SIGNUM(SYS);
950 #ifdef SIGEMT
951         P_SIGNUM(EMT);
952 #endif
953 #ifdef SIGSTKFLT
954         P_SIGNUM(STKFLT);
955 #endif
956 #ifdef SIGSWI
957         P_SIGNUM(SWI);
958 #endif
959         default: break;
960         }
961
962         return scnprintf(bf, size, "%#x", sig);
963 }
964
965 #define SCA_SIGNUM syscall_arg__scnprintf_signum
966
967 #if defined(__i386__) || defined(__x86_64__)
968 /*
969  * FIXME: Make this available to all arches.
970  */
971 #define TCGETS          0x5401
972
/*
 * Names of the terminal ioctl commands, in command-number order.
 *
 * The first entry is "TCGETS" and the table is registered below with an
 * offset of 0x5401 (the value TCGETS is defined to above), so entry i is
 * presumably the name for command 0x5401 + i -- confirm against
 * DEFINE_STRARRAY_OFFSET.  The designated initializers ([0x27], [0x50],
 * [0x60]) skip over gaps; the skipped slots are left NULL.
 */
973 static const char *tioctls[] = {
974         "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
975         "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
976         "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
977         "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
978         "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
979         "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
980         "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
981         "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
982         "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
983         "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
984         "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
985         [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
986         "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
987         "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
988         "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
989 };
990
/* Defines strarray__tioctls, which the "ioctl" entry of syscall_fmts[] references. */
991 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
992 #endif /* defined(__i386__) || defined(__x86_64__) */
993
/*
 * STRARRAY(arg, name, array) - initializer fragment for a syscall_fmts[] entry.
 *
 * Selects SCA_STRARRAY as the formatter for argument index @arg and passes
 * &strarray__@array as that argument's parameter.  @name does not appear in
 * the expansion; it only documents the argument's name at the use site.
 */
994 #define STRARRAY(arg, name, array) \
995           .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
996           .arg_parm      = { [arg] = &strarray__##array, }
997
/*
 * Per-syscall formatting overrides, looked up by name via syscall_fmt__find().
 *
 * IMPORTANT: syscall_fmts[] must stay sorted by .name in strcmp() order,
 * because syscall_fmt__find() searches it with bsearch().
 *
 *   name          - syscall name; the search key.
 *   alias         - alternative name for the syscall (presumably the
 *                   tracepoint name to try when @name is not found -- confirm
 *                   against the code that reads syscall info).
 *   arg_scnprintf - formatter for each argument, indexed by argument
 *                   position; a NULL slot means no override for that argument.
 *   arg_parm      - extra parameter for the formatter at the same index
 *                   (e.g. the strarray used by SCA_STRARRAY).
 *   errmsg, timeout, hexret - flags consumed outside this table; presumably
 *                   they control how the return value is printed -- confirm
 *                   against the sys_exit handling.
 */
998 static struct syscall_fmt {
999         const char *name;
1000         const char *alias;
1001         size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
1002         void       *arg_parm[6];
1003         bool       errmsg;
1004         bool       timeout;
1005         bool       hexret;
1006 } syscall_fmts[] = {
1007         { .name     = "access",     .errmsg = true,
1008           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */
1009                              [1] = SCA_ACCMODE,  /* mode */ }, },
1010         { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
1011         { .name     = "brk",        .hexret = true,
1012           .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
1013         { .name     = "chdir",      .errmsg = true,
1014           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1015         { .name     = "chmod",      .errmsg = true,
1016           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1017         { .name     = "chroot",     .errmsg = true,
1018           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1019         { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
1020         { .name     = "close",      .errmsg = true,
1021           .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
1022         { .name     = "connect",    .errmsg = true, },
1023         { .name     = "creat",      .errmsg = true,
1024           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1025         { .name     = "dup",        .errmsg = true,
1026           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1027         { .name     = "dup2",       .errmsg = true,
1028           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1029         { .name     = "dup3",       .errmsg = true,
1030           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1031         { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
1032         { .name     = "eventfd2",   .errmsg = true,
1033           .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
1034         { .name     = "faccessat",  .errmsg = true,
1035           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1036                              [1] = SCA_FILENAME, /* filename */ }, },
1037         { .name     = "fadvise64",  .errmsg = true,
1038           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1039         { .name     = "fallocate",  .errmsg = true,
1040           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1041         { .name     = "fchdir",     .errmsg = true,
1042           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1043         { .name     = "fchmod",     .errmsg = true,
1044           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1045         { .name     = "fchmodat",   .errmsg = true,
1046           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1047                              [1] = SCA_FILENAME, /* filename */ }, },
1048         { .name     = "fchown",     .errmsg = true,
1049           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1050         { .name     = "fchownat",   .errmsg = true,
1051           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1052                              [1] = SCA_FILENAME, /* filename */ }, },
1053         { .name     = "fcntl",      .errmsg = true,
1054           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1055                              [1] = SCA_STRARRAY, /* cmd */ },
1056           .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
1057         { .name     = "fdatasync",  .errmsg = true,
1058           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1059         { .name     = "flock",      .errmsg = true,
1060           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1061                              [1] = SCA_FLOCK, /* cmd */ }, },
1062         { .name     = "fsetxattr",  .errmsg = true,
1063           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1064         { .name     = "fstat",      .errmsg = true, .alias = "newfstat",
1065           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1066         { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat",
1067           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1068                              [1] = SCA_FILENAME, /* filename */ }, },
1069         { .name     = "fstatfs",    .errmsg = true,
1070           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1071         { .name     = "fsync",    .errmsg = true,
1072           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1073         { .name     = "ftruncate", .errmsg = true,
1074           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1075         { .name     = "futex",      .errmsg = true,
1076           .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
1077         { .name     = "futimesat", .errmsg = true,
1078           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1079                              [1] = SCA_FILENAME, /* filename */ }, },
1080         { .name     = "getdents",   .errmsg = true,
1081           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1082         { .name     = "getdents64", .errmsg = true,
1083           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1084         { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1085         { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1086         { .name     = "getxattr",    .errmsg = true,
1087           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1088         { .name     = "inotify_add_watch",          .errmsg = true,
1089           .arg_scnprintf = { [1] = SCA_FILENAME, /* pathname */ }, },
1090         { .name     = "ioctl",      .errmsg = true,
1091           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1092 #if defined(__i386__) || defined(__x86_64__)
1093 /*
1094  * FIXME: Make this available to all arches.
1095  */
1096                              [1] = SCA_STRHEXARRAY, /* cmd */
1097                              [2] = SCA_HEX, /* arg */ },
1098           .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
1099 #else
1100                              [2] = SCA_HEX, /* arg */ }, },
1101 #endif
1102         { .name     = "keyctl",     .errmsg = true, STRARRAY(0, option, keyctl_options), },
1103         { .name     = "kill",       .errmsg = true,
1104           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1105         { .name     = "lchown",    .errmsg = true,
1106           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1107         { .name     = "lgetxattr",  .errmsg = true,
1108           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1109         { .name     = "linkat",     .errmsg = true,
1110           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1111         { .name     = "listxattr",  .errmsg = true,
1112           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1113         { .name     = "llistxattr", .errmsg = true,
1114           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1115         { .name     = "lremovexattr",  .errmsg = true,
1116           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1117         { .name     = "lseek",      .errmsg = true,
1118           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1119                              [2] = SCA_STRARRAY, /* whence */ },
1120           .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
1121         { .name     = "lsetxattr",  .errmsg = true,
1122           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1123         { .name     = "lstat",      .errmsg = true, .alias = "newlstat",
1124           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1125         { .name     = "lsxattr",    .errmsg = true,
1126           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1127         { .name     = "madvise",    .errmsg = true,
1128           .arg_scnprintf = { [0] = SCA_HEX,      /* start */
1129                              [2] = SCA_MADV_BHV, /* behavior */ }, },
1130         { .name     = "mkdir",    .errmsg = true,
1131           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1132         { .name     = "mkdirat",    .errmsg = true,
1133           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1134                              [1] = SCA_FILENAME, /* pathname */ }, },
1135         { .name     = "mknod",      .errmsg = true,
1136           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1137         { .name     = "mknodat",    .errmsg = true,
1138           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1139                              [1] = SCA_FILENAME, /* filename */ }, },
1140         { .name     = "mlock",      .errmsg = true,
1141           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1142         { .name     = "mlockall",   .errmsg = true,
1143           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1144         { .name     = "mmap",       .hexret = true,
1145           .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
1146                              [2] = SCA_MMAP_PROT, /* prot */
1147                              [3] = SCA_MMAP_FLAGS, /* flags */
1148                              [4] = SCA_FD,        /* fd */ }, },
1149         { .name     = "mprotect",   .errmsg = true,
1150           .arg_scnprintf = { [0] = SCA_HEX, /* start */
1151                              [2] = SCA_MMAP_PROT, /* prot */ }, },
1152         { .name     = "mq_unlink", .errmsg = true,
1153           .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
1154         { .name     = "mremap",     .hexret = true,
1155           .arg_scnprintf = { [0] = SCA_HEX, /* addr */
1156                              [3] = SCA_MREMAP_FLAGS, /* flags */
1157                              [4] = SCA_HEX, /* new_addr */ }, },
1158         { .name     = "munlock",    .errmsg = true,
1159           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1160         { .name     = "munmap",     .errmsg = true,
1161           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1162         { .name     = "name_to_handle_at", .errmsg = true,
1163           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1164         { .name     = "newfstatat", .errmsg = true,
1165           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1166                              [1] = SCA_FILENAME, /* filename */ }, },
1167         { .name     = "open",       .errmsg = true,
1168           .arg_scnprintf = { [0] = SCA_FILENAME,   /* filename */
1169                              [1] = SCA_OPEN_FLAGS, /* flags */ }, },
1170         { .name     = "open_by_handle_at", .errmsg = true,
1171           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1172                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1173         { .name     = "openat",     .errmsg = true,
1174           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1175                              [1] = SCA_FILENAME, /* filename */
1176                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1177         { .name     = "perf_event_open", .errmsg = true,
1178           .arg_scnprintf = { [1] = SCA_INT, /* pid */
1179                              [2] = SCA_INT, /* cpu */
1180                              [3] = SCA_FD,  /* group_fd */
1181                              [4] = SCA_PERF_FLAGS,  /* flags */ }, },
1182         { .name     = "pipe2",      .errmsg = true,
1183           .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
1184         { .name     = "poll",       .errmsg = true, .timeout = true, },
1185         { .name     = "ppoll",      .errmsg = true, .timeout = true, },
1186         { .name     = "pread",      .errmsg = true, .alias = "pread64",
1187           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1188         { .name     = "preadv",     .errmsg = true, .alias = "pread",
1189           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1190         { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
1191         { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64",
1192           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1193         { .name     = "pwritev",    .errmsg = true,
1194           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1195         { .name     = "read",       .errmsg = true,
1196           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1197         { .name     = "readlink",   .errmsg = true,
1198           .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
1199         { .name     = "readlinkat", .errmsg = true,
1200           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1201                              [1] = SCA_FILENAME, /* pathname */ }, },
1202         { .name     = "readv",      .errmsg = true,
1203           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1204         { .name     = "recvfrom",   .errmsg = true,
1205           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1206                              [3] = SCA_MSG_FLAGS, /* flags */ }, },
1207         { .name     = "recvmmsg",   .errmsg = true,
1208           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1209                              [3] = SCA_MSG_FLAGS, /* flags */ }, },
1210         { .name     = "recvmsg",    .errmsg = true,
1211           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1212                              [2] = SCA_MSG_FLAGS, /* flags */ }, },
1213         { .name     = "removexattr", .errmsg = true,
1214           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1215         { .name     = "renameat",   .errmsg = true,
1216           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1217         { .name     = "rmdir",    .errmsg = true,
1218           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1219         { .name     = "rt_sigaction", .errmsg = true,
1220           .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
1221         { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
1222         { .name     = "rt_sigqueueinfo", .errmsg = true,
1223           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1224         { .name     = "rt_tgsigqueueinfo", .errmsg = true,
1225           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1226         { .name     = "select",     .errmsg = true, .timeout = true, },
1227         { .name     = "sendmmsg",    .errmsg = true,
1228           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1229                              [3] = SCA_MSG_FLAGS, /* flags */ }, },
1230         { .name     = "sendmsg",    .errmsg = true,
1231           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1232                              [2] = SCA_MSG_FLAGS, /* flags */ }, },
1233         { .name     = "sendto",     .errmsg = true,
1234           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1235                              [3] = SCA_MSG_FLAGS, /* flags */ }, },
1236         { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1237         { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1238         { .name     = "setxattr",   .errmsg = true,
1239           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1240         { .name     = "shutdown",   .errmsg = true,
1241           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1242         { .name     = "socket",     .errmsg = true,
1243           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1244                              [1] = SCA_SK_TYPE, /* type */ },
1245           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
1246         { .name     = "socketpair", .errmsg = true,
1247           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1248                              [1] = SCA_SK_TYPE, /* type */ },
1249           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
1250         { .name     = "stat",       .errmsg = true, .alias = "newstat",
1251           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1252         { .name     = "statfs",     .errmsg = true,
1253           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1254         { .name     = "swapoff",    .errmsg = true,
1255           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
1256         { .name     = "swapon",     .errmsg = true,
1257           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
1258         { .name     = "symlinkat",  .errmsg = true,
1259           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1260         { .name     = "tgkill",     .errmsg = true,
1261           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1262         { .name     = "tkill",      .errmsg = true,
1263           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1264         { .name     = "truncate",   .errmsg = true,
1265           .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
1266         { .name     = "uname",      .errmsg = true, .alias = "newuname", },
1267         { .name     = "unlinkat",   .errmsg = true,
1268           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1269                              [1] = SCA_FILENAME, /* pathname */ }, },
1270         { .name     = "utime",  .errmsg = true,
1271           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1272         { .name     = "utimensat",  .errmsg = true,
1273           .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */
1274                              [1] = SCA_FILENAME, /* filename */ }, },
1275         { .name     = "utimes",  .errmsg = true,
1276           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1277         { .name     = "vmsplice",  .errmsg = true,
1278           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1279         { .name     = "write",      .errmsg = true,
1280           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1281         { .name     = "writev",     .errmsg = true,
1282           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1283 };
1284
1285 static int syscall_fmt__cmp(const void *name, const void *fmtp)
1286 {
1287         const struct syscall_fmt *fmt = fmtp;
1288         return strcmp(name, fmt->name);
1289 }
1290
1291 static struct syscall_fmt *syscall_fmt__find(const char *name)
1292 {
1293         const int nmemb = ARRAY_SIZE(syscall_fmts);
1294         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1295 }
1296
/*
 * Per-syscall description used when formatting events.
 *
 * @args is walked as a singly linked list (via ->next) by
 * syscall__set_arg_fmts(), which also fills in @arg_scnprintf and @arg_parm.
 */
1297 struct syscall {
1298         struct event_format *tp_format;
1299         int                 nr_args;
1300         struct format_field *args;
1301         const char          *name;
1302         bool                is_exit;
        /* Entry from syscall_fmts[], or NULL when there are no overrides. */
1303         struct syscall_fmt  *fmt;
        /* calloc()ed array of nr_args formatters; a NULL slot means no special formatter. */
1304         size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
        /* Points at fmt->arg_parm when fmt is set; not owned by this struct. */
1305         void                **arg_parm;
1306 };
1307
1308 static size_t fprintf_duration(unsigned long t, FILE *fp)
1309 {
1310         double duration = (double)t / NSEC_PER_MSEC;
1311         size_t printed = fprintf(fp, "(");
1312
1313         if (duration >= 1.0)
1314                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1315         else if (duration >= 0.01)
1316                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1317         else
1318                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1319         return printed + fprintf(fp, "): ");
1320 }
1321
/*
 * Per-thread tracing state, stored as the thread's private data (see
 * thread__trace(), which allocates it on first use via thread_trace__new()).
 */
1322 /**
1323  * filename.ptr: The filename char pointer that will be vfs_getname'd
1324  * filename.entry_str_pos: Where to insert the string translated from
1325  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
1326  */
1327 struct thread_trace {
1328         u64               entry_time;
1329         u64               exit_time;
1330         bool              entry_pending;
        /* Incremented by thread__trace() each time it is called for this thread. */
1331         unsigned long     nr_events;
1332         unsigned long     pfmaj, pfmin;
        /* Buffer holding the formatted syscall entry; filename.entry_str_pos is an offset into it. */
1333         char              *entry_str;
1334         double            runtime_ms;
1335         struct {
1336                 unsigned long ptr;
1337                 short int     entry_str_pos;
1338                 bool          pending_open;
1339                 unsigned int  namelen;
1340                 char          *name;
1341         } filename;
        /* fd -> pathname cache: table[fd] is a strdup()ed path or NULL. */
1342         struct {
                /* Highest fd that has a slot in table; -1 while the table is empty. */
1343                 int       max;
1344                 char      **table;
1345         } paths;
1346
1347         struct intlist *syscall_stats;
1348 };
1349
1350 static struct thread_trace *thread_trace__new(void)
1351 {
1352         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1353
1354         if (ttrace)
1355                 ttrace->paths.max = -1;
1356
1357         ttrace->syscall_stats = intlist__new(NULL);
1358
1359         return ttrace;
1360 }
1361
1362 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1363 {
1364         struct thread_trace *ttrace;
1365
1366         if (thread == NULL)
1367                 goto fail;
1368
1369         if (thread__priv(thread) == NULL)
1370                 thread__set_priv(thread, thread_trace__new());
1371
1372         if (thread__priv(thread) == NULL)
1373                 goto fail;
1374
1375         ttrace = thread__priv(thread);
1376         ++ttrace->nr_events;
1377
1378         return ttrace;
1379 fail:
1380         color_fprintf(fp, PERF_COLOR_RED,
1381                       "WARNING: not enough memory, dropping samples!\n");
1382         return NULL;
1383 }
1384
/* Bit flags; presumably the values stored in trace->trace_pgfaults -- confirm. */
1385 #define TRACE_PFMAJ             (1 << 0)
1386 #define TRACE_PFMIN             (1 << 1)
1387
/* NOTE(review): looks like the size of thread_trace->entry_str -- confirm at the allocation site. */
1388 static const size_t trace__entry_str_size = 2048;
1389
/*
 * State of one 'perf trace' session.  Embeds the perf_tool so that
 * trace__tool_process() can recover the struct trace with container_of().
 */
1390 struct trace {
1391         struct perf_tool        tool;
1392         struct {
1393                 int             machine;
1394                 int             open_id;
1395         }                       audit;
1396         struct {
1397                 int             max;
1398                 struct syscall  *table;
1399                 struct {
1400                         struct perf_evsel *sys_enter,
1401                                           *sys_exit;
1402                 }               events;
1403         } syscalls;
1404         struct record_opts      opts;
1405         struct perf_evlist      *evlist;
        /* Created by trace__symbols_init() with machine__new_host(). */
1406         struct machine          *host;
1407         struct thread           *current;
        /* Subtracted from event timestamps before they are printed. */
1408         u64                     base_time;
        /* Stream used for messages such as the "LOST ... events!" warning. */
1409         FILE                    *output;
1410         unsigned long           nr_events;
1411         struct strlist          *ev_qualifier;
1412         struct {
1413                 size_t          nr;
1414                 int             *entries;
1415         }                       ev_qualifier_ids;
1416         struct intlist          *tid_list;
1417         struct intlist          *pid_list;
1418         struct {
1419                 size_t          nr;
1420                 pid_t           *entries;
1421         }                       filter_pids;
        /* In milliseconds: trace__filter_duration() scales it by NSEC_PER_MSEC. */
1422         double                  duration_filter;
1423         double                  runtime_ms;
1424         struct {
                /* proc_getname counts fd -> path lookups attempted through /proc. */
1425                 u64             vfs_getname,
1426                                 proc_getname;
1427         } stats;
1428         bool                    not_ev_qualifier;
        /* When false, thread__fd_path() never falls back to reading /proc. */
1429         bool                    live;
1430         bool                    full_time;
1431         bool                    sched;
        /* When set, each entry line is prefixed with the tid (and comm if show_comm). */
1432         bool                    multiple_threads;
1433         bool                    summary;
1434         bool                    summary_only;
1435         bool                    show_comm;
1436         bool                    show_tool_stats;
1437         bool                    trace_syscalls;
1438         bool                    force;
        /* When false, filename arguments are printed as raw pointer values. */
1439         bool                    vfs_getname;
1440         int                     trace_pgfaults;
1441 };
1442
1443 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1444 {
1445         struct thread_trace *ttrace = thread__priv(thread);
1446
1447         if (fd > ttrace->paths.max) {
1448                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
1449
1450                 if (npath == NULL)
1451                         return -1;
1452
1453                 if (ttrace->paths.max != -1) {
1454                         memset(npath + ttrace->paths.max + 1, 0,
1455                                (fd - ttrace->paths.max) * sizeof(char *));
1456                 } else {
1457                         memset(npath, 0, (fd + 1) * sizeof(char *));
1458                 }
1459
1460                 ttrace->paths.table = npath;
1461                 ttrace->paths.max   = fd;
1462         }
1463
1464         ttrace->paths.table[fd] = strdup(pathname);
1465
1466         return ttrace->paths.table[fd] != NULL ? 0 : -1;
1467 }
1468
1469 static int thread__read_fd_path(struct thread *thread, int fd)
1470 {
1471         char linkname[PATH_MAX], pathname[PATH_MAX];
1472         struct stat st;
1473         int ret;
1474
1475         if (thread->pid_ == thread->tid) {
1476                 scnprintf(linkname, sizeof(linkname),
1477                           "/proc/%d/fd/%d", thread->pid_, fd);
1478         } else {
1479                 scnprintf(linkname, sizeof(linkname),
1480                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1481         }
1482
1483         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1484                 return -1;
1485
1486         ret = readlink(linkname, pathname, sizeof(pathname));
1487
1488         if (ret < 0 || ret > st.st_size)
1489                 return -1;
1490
1491         pathname[ret] = '\0';
1492         return trace__set_fd_pathname(thread, fd, pathname);
1493 }
1494
1495 static const char *thread__fd_path(struct thread *thread, int fd,
1496                                    struct trace *trace)
1497 {
1498         struct thread_trace *ttrace = thread__priv(thread);
1499
1500         if (ttrace == NULL)
1501                 return NULL;
1502
1503         if (fd < 0)
1504                 return NULL;
1505
1506         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1507                 if (!trace->live)
1508                         return NULL;
1509                 ++trace->stats.proc_getname;
1510                 if (thread__read_fd_path(thread, fd))
1511                         return NULL;
1512         }
1513
1514         return ttrace->paths.table[fd];
1515 }
1516
1517 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1518                                         struct syscall_arg *arg)
1519 {
1520         int fd = arg->val;
1521         size_t printed = scnprintf(bf, size, "%d", fd);
1522         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1523
1524         if (path)
1525                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1526
1527         return printed;
1528 }
1529
1530 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1531                                               struct syscall_arg *arg)
1532 {
1533         int fd = arg->val;
1534         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1535         struct thread_trace *ttrace = thread__priv(arg->thread);
1536
1537         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1538                 zfree(&ttrace->paths.table[fd]);
1539
1540         return printed;
1541 }
1542
1543 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1544                                      unsigned long ptr)
1545 {
1546         struct thread_trace *ttrace = thread__priv(thread);
1547
1548         ttrace->filename.ptr = ptr;
1549         ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1550 }
1551
1552 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1553                                               struct syscall_arg *arg)
1554 {
1555         unsigned long ptr = arg->val;
1556
1557         if (!arg->trace->vfs_getname)
1558                 return scnprintf(bf, size, "%#x", ptr);
1559
1560         thread__set_filename_pos(arg->thread, bf, ptr);
1561         return 0;
1562 }
1563
1564 static bool trace__filter_duration(struct trace *trace, double t)
1565 {
1566         return t < (trace->duration_filter * NSEC_PER_MSEC);
1567 }
1568
1569 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1570 {
1571         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1572
1573         return fprintf(fp, "%10.3f ", ts);
1574 }
1575
/*
 * Flags set asynchronously by sig_handler() and polled by the main loop.
 * They are volatile sig_atomic_t because that is the only object type the C
 * standard guarantees can be safely written from a signal handler.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/*
 * Signal handler: request termination, and note whether the request came
 * from SIGINT.
 */
static void sig_handler(int sig)
{
        done = true;
        interrupted = sig == SIGINT;
}
1584
1585 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1586                                         u64 duration, u64 tstamp, FILE *fp)
1587 {
1588         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1589         printed += fprintf_duration(duration, fp);
1590
1591         if (trace->multiple_threads) {
1592                 if (trace->show_comm)
1593                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1594                 printed += fprintf(fp, "%d ", thread->tid);
1595         }
1596
1597         return printed;
1598 }
1599
1600 static int trace__process_event(struct trace *trace, struct machine *machine,
1601                                 union perf_event *event, struct perf_sample *sample)
1602 {
1603         int ret = 0;
1604
1605         switch (event->header.type) {
1606         case PERF_RECORD_LOST:
1607                 color_fprintf(trace->output, PERF_COLOR_RED,
1608                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1609                 ret = machine__process_lost_event(machine, event, sample);
1610         default:
1611                 ret = machine__process_event(machine, event, sample);
1612                 break;
1613         }
1614
1615         return ret;
1616 }
1617
1618 static int trace__tool_process(struct perf_tool *tool,
1619                                union perf_event *event,
1620                                struct perf_sample *sample,
1621                                struct machine *machine)
1622 {
1623         struct trace *trace = container_of(tool, struct trace, tool);
1624         return trace__process_event(trace, machine, event, sample);
1625 }
1626
1627 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1628 {
1629         int err = symbol__init(NULL);
1630
1631         if (err)
1632                 return err;
1633
1634         trace->host = machine__new_host();
1635         if (trace->host == NULL)
1636                 return -ENOMEM;
1637
1638         if (trace_event__register_resolver(trace->host, machine__resolve_kernel_addr) < 0)
1639                 return -errno;
1640
1641         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1642                                             evlist->threads, trace__tool_process, false,
1643                                             trace->opts.proc_map_timeout);
1644         if (err)
1645                 symbol__exit();
1646
1647         return err;
1648 }
1649
1650 static int syscall__set_arg_fmts(struct syscall *sc)
1651 {
1652         struct format_field *field;
1653         int idx = 0;
1654
1655         sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1656         if (sc->arg_scnprintf == NULL)
1657                 return -1;
1658
1659         if (sc->fmt)
1660                 sc->arg_parm = sc->fmt->arg_parm;
1661
1662         for (field = sc->args; field; field = field->next) {
1663                 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1664                         sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1665                 else if (field->flags & FIELD_IS_POINTER)
1666                         sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1667                 ++idx;
1668         }
1669
1670         return 0;
1671 }
1672
1673 static int trace__read_syscall_info(struct trace *trace, int id)
1674 {
1675         char tp_name[128];
1676         struct syscall *sc;
1677         const char *name = audit_syscall_to_name(id, trace->audit.machine);
1678
1679         if (name == NULL)
1680                 return -1;
1681
1682         if (id > trace->syscalls.max) {
1683                 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1684
1685                 if (nsyscalls == NULL)
1686                         return -1;
1687
1688                 if (trace->syscalls.max != -1) {
1689                         memset(nsyscalls + trace->syscalls.max + 1, 0,
1690                                (id - trace->syscalls.max) * sizeof(*sc));
1691                 } else {
1692                         memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1693                 }
1694
1695                 trace->syscalls.table = nsyscalls;
1696                 trace->syscalls.max   = id;
1697         }
1698
1699         sc = trace->syscalls.table + id;
1700         sc->name = name;
1701
1702         sc->fmt  = syscall_fmt__find(sc->name);
1703
1704         snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1705         sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1706
1707         if (sc->tp_format == NULL && sc->fmt && sc->fmt->alias) {
1708                 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1709                 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1710         }
1711
1712         if (sc->tp_format == NULL)
1713                 return -1;
1714
1715         sc->args = sc->tp_format->format.fields;
1716         sc->nr_args = sc->tp_format->format.nr_fields;
1717         /* drop nr field - not relevant here; does not exist on older kernels */
1718         if (sc->args && strcmp(sc->args->name, "nr") == 0) {
1719                 sc->args = sc->args->next;
1720                 --sc->nr_args;
1721         }
1722
1723         sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1724
1725         return syscall__set_arg_fmts(sc);
1726 }
1727
1728 static int trace__validate_ev_qualifier(struct trace *trace)
1729 {
1730         int err = 0, i;
1731         struct str_node *pos;
1732
1733         trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1734         trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1735                                                  sizeof(trace->ev_qualifier_ids.entries[0]));
1736
1737         if (trace->ev_qualifier_ids.entries == NULL) {
1738                 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1739                        trace->output);
1740                 err = -EINVAL;
1741                 goto out;
1742         }
1743
1744         i = 0;
1745
1746         strlist__for_each(pos, trace->ev_qualifier) {
1747                 const char *sc = pos->s;
1748                 int id = audit_name_to_syscall(sc, trace->audit.machine);
1749
1750                 if (id < 0) {
1751                         if (err == 0) {
1752                                 fputs("Error:\tInvalid syscall ", trace->output);
1753                                 err = -EINVAL;
1754                         } else {
1755                                 fputs(", ", trace->output);
1756                         }
1757
1758                         fputs(sc, trace->output);
1759                 }
1760
1761                 trace->ev_qualifier_ids.entries[i++] = id;
1762         }
1763
1764         if (err < 0) {
1765                 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1766                       "\nHint:\tand: 'man syscalls'\n", trace->output);
1767                 zfree(&trace->ev_qualifier_ids.entries);
1768                 trace->ev_qualifier_ids.nr = 0;
1769         }
1770 out:
1771         return err;
1772 }
1773
1774 /*
1775  * args is to be interpreted as a series of longs but we need to handle
1776  * 8-byte unaligned accesses. args points to raw_data within the event
1777  * and raw_data is guaranteed to be 8-byte unaligned because it is
1778  * preceded by raw_size which is a u32. So we need to copy args to a temp
1779  * variable to read it. Most notably this avoids extended load instructions
1780  * on unaligned addresses
1781  */
1782
1783 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1784                                       unsigned char *args, struct trace *trace,
1785                                       struct thread *thread)
1786 {
1787         size_t printed = 0;
1788         unsigned char *p;
1789         unsigned long val;
1790
1791         if (sc->args != NULL) {
1792                 struct format_field *field;
1793                 u8 bit = 1;
1794                 struct syscall_arg arg = {
1795                         .idx    = 0,
1796                         .mask   = 0,
1797                         .trace  = trace,
1798                         .thread = thread,
1799                 };
1800
1801                 for (field = sc->args; field;
1802                      field = field->next, ++arg.idx, bit <<= 1) {
1803                         if (arg.mask & bit)
1804                                 continue;
1805
1806                         /* special care for unaligned accesses */
1807                         p = args + sizeof(unsigned long) * arg.idx;
1808                         memcpy(&val, p, sizeof(val));
1809
1810                         /*
1811                          * Suppress this argument if its value is zero and
1812                          * and we don't have a string associated in an
1813                          * strarray for it.
1814                          */
1815                         if (val == 0 &&
1816                             !(sc->arg_scnprintf &&
1817                               sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1818                               sc->arg_parm[arg.idx]))
1819                                 continue;
1820
1821                         printed += scnprintf(bf + printed, size - printed,
1822                                              "%s%s: ", printed ? ", " : "", field->name);
1823                         if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1824                                 arg.val = val;
1825                                 if (sc->arg_parm)
1826                                         arg.parm = sc->arg_parm[arg.idx];
1827                                 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1828                                                                       size - printed, &arg);
1829                         } else {
1830                                 printed += scnprintf(bf + printed, size - printed,
1831                                                      "%ld", val);
1832                         }
1833                 }
1834         } else {
1835                 int i = 0;
1836
1837                 while (i < 6) {
1838                         /* special care for unaligned accesses */
1839                         p = args + sizeof(unsigned long) * i;
1840                         memcpy(&val, p, sizeof(val));
1841                         printed += scnprintf(bf + printed, size - printed,
1842                                              "%sarg%d: %ld",
1843                                              printed ? ", " : "", i, val);
1844                         ++i;
1845                 }
1846         }
1847
1848         return printed;
1849 }
1850
/* Signature of the per-evsel handlers stored in evsel->handler. */
typedef int (*tracepoint_handler)(struct trace *trace,
				  struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1854
1855 static struct syscall *trace__syscall_info(struct trace *trace,
1856                                            struct perf_evsel *evsel, int id)
1857 {
1858
1859         if (id < 0) {
1860
1861                 /*
1862                  * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1863                  * before that, leaving at a higher verbosity level till that is
1864                  * explained. Reproduced with plain ftrace with:
1865                  *
1866                  * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1867                  * grep "NR -1 " /t/trace_pipe
1868                  *
1869                  * After generating some load on the machine.
1870                  */
1871                 if (verbose > 1) {
1872                         static u64 n;
1873                         fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1874                                 id, perf_evsel__name(evsel), ++n);
1875                 }
1876                 return NULL;
1877         }
1878
1879         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1880             trace__read_syscall_info(trace, id))
1881                 goto out_cant_read;
1882
1883         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1884                 goto out_cant_read;
1885
1886         return &trace->syscalls.table[id];
1887
1888 out_cant_read:
1889         if (verbose) {
1890                 fprintf(trace->output, "Problems reading syscall %d", id);
1891                 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1892                         fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1893                 fputs(" information\n", trace->output);
1894         }
1895         return NULL;
1896 }
1897
1898 static void thread__update_stats(struct thread_trace *ttrace,
1899                                  int id, struct perf_sample *sample)
1900 {
1901         struct int_node *inode;
1902         struct stats *stats;
1903         u64 duration = 0;
1904
1905         inode = intlist__findnew(ttrace->syscall_stats, id);
1906         if (inode == NULL)
1907                 return;
1908
1909         stats = inode->priv;
1910         if (stats == NULL) {
1911                 stats = malloc(sizeof(struct stats));
1912                 if (stats == NULL)
1913                         return;
1914                 init_stats(stats);
1915                 inode->priv = stats;
1916         }
1917
1918         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1919                 duration = sample->time - ttrace->entry_time;
1920
1921         update_stats(stats, duration);
1922 }
1923
1924 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1925 {
1926         struct thread_trace *ttrace;
1927         u64 duration;
1928         size_t printed;
1929
1930         if (trace->current == NULL)
1931                 return 0;
1932
1933         ttrace = thread__priv(trace->current);
1934
1935         if (!ttrace->entry_pending)
1936                 return 0;
1937
1938         duration = sample->time - ttrace->entry_time;
1939
1940         printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1941         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1942         ttrace->entry_pending = false;
1943
1944         return printed;
1945 }
1946
1947 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1948                             union perf_event *event __maybe_unused,
1949                             struct perf_sample *sample)
1950 {
1951         char *msg;
1952         void *args;
1953         size_t printed = 0;
1954         struct thread *thread;
1955         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1956         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1957         struct thread_trace *ttrace;
1958
1959         if (sc == NULL)
1960                 return -1;
1961
1962         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1963         ttrace = thread__trace(thread, trace->output);
1964         if (ttrace == NULL)
1965                 goto out_put;
1966
1967         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1968
1969         if (ttrace->entry_str == NULL) {
1970                 ttrace->entry_str = malloc(trace__entry_str_size);
1971                 if (!ttrace->entry_str)
1972                         goto out_put;
1973         }
1974
1975         if (!trace->summary_only)
1976                 trace__printf_interrupted_entry(trace, sample);
1977
1978         ttrace->entry_time = sample->time;
1979         msg = ttrace->entry_str;
1980         printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1981
1982         printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1983                                            args, trace, thread);
1984
1985         if (sc->is_exit) {
1986                 if (!trace->duration_filter && !trace->summary_only) {
1987                         trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1988                         fprintf(trace->output, "%-70s\n", ttrace->entry_str);
1989                 }
1990         } else {
1991                 ttrace->entry_pending = true;
1992                 /* See trace__vfs_getname & trace__sys_exit */
1993                 ttrace->filename.pending_open = false;
1994         }
1995
1996         if (trace->current != thread) {
1997                 thread__put(trace->current);
1998                 trace->current = thread__get(thread);
1999         }
2000         err = 0;
2001 out_put:
2002         thread__put(thread);
2003         return err;
2004 }
2005
2006 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
2007                            union perf_event *event __maybe_unused,
2008                            struct perf_sample *sample)
2009 {
2010         long ret;
2011         u64 duration = 0;
2012         struct thread *thread;
2013         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
2014         struct syscall *sc = trace__syscall_info(trace, evsel, id);
2015         struct thread_trace *ttrace;
2016
2017         if (sc == NULL)
2018                 return -1;
2019
2020         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2021         ttrace = thread__trace(thread, trace->output);
2022         if (ttrace == NULL)
2023                 goto out_put;
2024
2025         if (trace->summary)
2026                 thread__update_stats(ttrace, id, sample);
2027
2028         ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
2029
2030         if (id == trace->audit.open_id && ret >= 0 && ttrace->filename.pending_open) {
2031                 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
2032                 ttrace->filename.pending_open = false;
2033                 ++trace->stats.vfs_getname;
2034         }
2035
2036         ttrace->exit_time = sample->time;
2037
2038         if (ttrace->entry_time) {
2039                 duration = sample->time - ttrace->entry_time;
2040                 if (trace__filter_duration(trace, duration))
2041                         goto out;
2042         } else if (trace->duration_filter)
2043                 goto out;
2044
2045         if (trace->summary_only)
2046                 goto out;
2047
2048         trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
2049
2050         if (ttrace->entry_pending) {
2051                 fprintf(trace->output, "%-70s", ttrace->entry_str);
2052         } else {
2053                 fprintf(trace->output, " ... [");
2054                 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
2055                 fprintf(trace->output, "]: %s()", sc->name);
2056         }
2057
2058         if (sc->fmt == NULL) {
2059 signed_print:
2060                 fprintf(trace->output, ") = %ld", ret);
2061         } else if (ret < 0 && sc->fmt->errmsg) {
2062                 char bf[STRERR_BUFSIZE];
2063                 const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
2064                            *e = audit_errno_to_name(-ret);
2065
2066                 fprintf(trace->output, ") = -1 %s %s", e, emsg);
2067         } else if (ret == 0 && sc->fmt->timeout)
2068                 fprintf(trace->output, ") = 0 Timeout");
2069         else if (sc->fmt->hexret)
2070                 fprintf(trace->output, ") = %#lx", ret);
2071         else
2072                 goto signed_print;
2073
2074         fputc('\n', trace->output);
2075 out:
2076         ttrace->entry_pending = false;
2077         err = 0;
2078 out_put:
2079         thread__put(thread);
2080         return err;
2081 }
2082
2083 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
2084                               union perf_event *event __maybe_unused,
2085                               struct perf_sample *sample)
2086 {
2087         struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2088         struct thread_trace *ttrace;
2089         size_t filename_len, entry_str_len, to_move;
2090         ssize_t remaining_space;
2091         char *pos;
2092         const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
2093
2094         if (!thread)
2095                 goto out;
2096
2097         ttrace = thread__priv(thread);
2098         if (!ttrace)
2099                 goto out;
2100
2101         filename_len = strlen(filename);
2102
2103         if (ttrace->filename.namelen < filename_len) {
2104                 char *f = realloc(ttrace->filename.name, filename_len + 1);
2105
2106                 if (f == NULL)
2107                                 goto out;
2108
2109                 ttrace->filename.namelen = filename_len;
2110                 ttrace->filename.name = f;
2111         }
2112
2113         strcpy(ttrace->filename.name, filename);
2114         ttrace->filename.pending_open = true;
2115
2116         if (!ttrace->filename.ptr)
2117                 goto out;
2118
2119         entry_str_len = strlen(ttrace->entry_str);
2120         remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
2121         if (remaining_space <= 0)
2122                 goto out;
2123
2124         if (filename_len > (size_t)remaining_space) {
2125                 filename += filename_len - remaining_space;
2126                 filename_len = remaining_space;
2127         }
2128
2129         to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
2130         pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
2131         memmove(pos + filename_len, pos, to_move);
2132         memcpy(pos, filename, filename_len);
2133
2134         ttrace->filename.ptr = 0;
2135         ttrace->filename.entry_str_pos = 0;
2136 out:
2137         return 0;
2138 }
2139
2140 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
2141                                      union perf_event *event __maybe_unused,
2142                                      struct perf_sample *sample)
2143 {
2144         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
2145         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
2146         struct thread *thread = machine__findnew_thread(trace->host,
2147                                                         sample->pid,
2148                                                         sample->tid);
2149         struct thread_trace *ttrace = thread__trace(thread, trace->output);
2150
2151         if (ttrace == NULL)
2152                 goto out_dump;
2153
2154         ttrace->runtime_ms += runtime_ms;
2155         trace->runtime_ms += runtime_ms;
2156         thread__put(thread);
2157         return 0;
2158
2159 out_dump:
2160         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
2161                evsel->name,
2162                perf_evsel__strval(evsel, sample, "comm"),
2163                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
2164                runtime,
2165                perf_evsel__intval(evsel, sample, "vruntime"));
2166         thread__put(thread);
2167         return 0;
2168 }
2169
2170 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
2171                                 union perf_event *event __maybe_unused,
2172                                 struct perf_sample *sample)
2173 {
2174         trace__printf_interrupted_entry(trace, sample);
2175         trace__fprintf_tstamp(trace, sample->time, trace->output);
2176
2177         if (trace->trace_syscalls)
2178                 fprintf(trace->output, "(         ): ");
2179
2180         fprintf(trace->output, "%s:", evsel->name);
2181
2182         if (evsel->tp_format) {
2183                 event_format__fprintf(evsel->tp_format, sample->cpu,
2184                                       sample->raw_data, sample->raw_size,
2185                                       trace->output);
2186         }
2187
2188         fprintf(trace->output, ")\n");
2189         return 0;
2190 }
2191
2192 static void print_location(FILE *f, struct perf_sample *sample,
2193                            struct addr_location *al,
2194                            bool print_dso, bool print_sym)
2195 {
2196
2197         if ((verbose || print_dso) && al->map)
2198                 fprintf(f, "%s@", al->map->dso->long_name);
2199
2200         if ((verbose || print_sym) && al->sym)
2201                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2202                         al->addr - al->sym->start);
2203         else if (al->map)
2204                 fprintf(f, "0x%" PRIx64, al->addr);
2205         else
2206                 fprintf(f, "0x%" PRIx64, sample->addr);
2207 }
2208
2209 static int trace__pgfault(struct trace *trace,
2210                           struct perf_evsel *evsel,
2211                           union perf_event *event,
2212                           struct perf_sample *sample)
2213 {
2214         struct thread *thread;
2215         u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
2216         struct addr_location al;
2217         char map_type = 'd';
2218         struct thread_trace *ttrace;
2219         int err = -1;
2220
2221         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2222         ttrace = thread__trace(thread, trace->output);
2223         if (ttrace == NULL)
2224                 goto out_put;
2225
2226         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2227                 ttrace->pfmaj++;
2228         else
2229                 ttrace->pfmin++;
2230
2231         if (trace->summary_only)
2232                 goto out;
2233
2234         thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
2235                               sample->ip, &al);
2236
2237         trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
2238
2239         fprintf(trace->output, "%sfault [",
2240                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2241                 "maj" : "min");
2242
2243         print_location(trace->output, sample, &al, false, true);
2244
2245         fprintf(trace->output, "] => ");
2246
2247         thread__find_addr_location(thread, cpumode, MAP__VARIABLE,
2248                                    sample->addr, &al);
2249
2250         if (!al.map) {
2251                 thread__find_addr_location(thread, cpumode,
2252                                            MAP__FUNCTION, sample->addr, &al);
2253
2254                 if (al.map)
2255                         map_type = 'x';
2256                 else
2257                         map_type = '?';
2258         }
2259
2260         print_location(trace->output, sample, &al, true, false);
2261
2262         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2263 out:
2264         err = 0;
2265 out_put:
2266         thread__put(thread);
2267         return err;
2268 }
2269
2270 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
2271 {
2272         if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
2273             (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
2274                 return false;
2275
2276         if (trace->pid_list || trace->tid_list)
2277                 return true;
2278
2279         return false;
2280 }
2281
2282 static int trace__process_sample(struct perf_tool *tool,
2283                                  union perf_event *event,
2284                                  struct perf_sample *sample,
2285                                  struct perf_evsel *evsel,
2286                                  struct machine *machine __maybe_unused)
2287 {
2288         struct trace *trace = container_of(tool, struct trace, tool);
2289         int err = 0;
2290
2291         tracepoint_handler handler = evsel->handler;
2292
2293         if (skip_sample(trace, sample))
2294                 return 0;
2295
2296         if (!trace->full_time && trace->base_time == 0)
2297                 trace->base_time = sample->time;
2298
2299         if (handler) {
2300                 ++trace->nr_events;
2301                 handler(trace, evsel, event, sample);
2302         }
2303
2304         return err;
2305 }
2306
2307 static int parse_target_str(struct trace *trace)
2308 {
2309         if (trace->opts.target.pid) {
2310                 trace->pid_list = intlist__new(trace->opts.target.pid);
2311                 if (trace->pid_list == NULL) {
2312                         pr_err("Error parsing process id string\n");
2313                         return -EINVAL;
2314                 }
2315         }
2316
2317         if (trace->opts.target.tid) {
2318                 trace->tid_list = intlist__new(trace->opts.target.tid);
2319                 if (trace->tid_list == NULL) {
2320                         pr_err("Error parsing thread id string\n");
2321                         return -EINVAL;
2322                 }
2323         }
2324
2325         return 0;
2326 }
2327
2328 static int trace__record(struct trace *trace, int argc, const char **argv)
2329 {
2330         unsigned int rec_argc, i, j;
2331         const char **rec_argv;
2332         const char * const record_args[] = {
2333                 "record",
2334                 "-R",
2335                 "-m", "1024",
2336                 "-c", "1",
2337         };
2338
2339         const char * const sc_args[] = { "-e", };
2340         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2341         const char * const majpf_args[] = { "-e", "major-faults" };
2342         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2343         const char * const minpf_args[] = { "-e", "minor-faults" };
2344         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2345
2346         /* +1 is for the event string below */
2347         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2348                 majpf_args_nr + minpf_args_nr + argc;
2349         rec_argv = calloc(rec_argc + 1, sizeof(char *));
2350
2351         if (rec_argv == NULL)
2352                 return -ENOMEM;
2353
2354         j = 0;
2355         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2356                 rec_argv[j++] = record_args[i];
2357
2358         if (trace->trace_syscalls) {
2359                 for (i = 0; i < sc_args_nr; i++)
2360                         rec_argv[j++] = sc_args[i];
2361
2362                 /* event string may be different for older kernels - e.g., RHEL6 */
2363                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2364                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2365                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2366                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2367                 else {
2368                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2369                         return -1;
2370                 }
2371         }
2372
2373         if (trace->trace_pgfaults & TRACE_PFMAJ)
2374                 for (i = 0; i < majpf_args_nr; i++)
2375                         rec_argv[j++] = majpf_args[i];
2376
2377         if (trace->trace_pgfaults & TRACE_PFMIN)
2378                 for (i = 0; i < minpf_args_nr; i++)
2379                         rec_argv[j++] = minpf_args[i];
2380
2381         for (i = 0; i < (unsigned int)argc; i++)
2382                 rec_argv[j++] = argv[i];
2383
2384         return cmd_record(j, rec_argv, NULL);
2385 }
2386
2387 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2388
2389 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2390 {
2391         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2392         if (evsel == NULL)
2393                 return false;
2394
2395         if (perf_evsel__field(evsel, "pathname") == NULL) {
2396                 perf_evsel__delete(evsel);
2397                 return false;
2398         }
2399
2400         evsel->handler = trace__vfs_getname;
2401         perf_evlist__add(evlist, evsel);
2402         return true;
2403 }
2404
2405 static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
2406                                     u64 config)
2407 {
2408         struct perf_evsel *evsel;
2409         struct perf_event_attr attr = {
2410                 .type = PERF_TYPE_SOFTWARE,
2411                 .mmap_data = 1,
2412         };
2413
2414         attr.config = config;
2415         attr.sample_period = 1;
2416
2417         event_attr_init(&attr);
2418
2419         evsel = perf_evsel__new(&attr);
2420         if (!evsel)
2421                 return -ENOMEM;
2422
2423         evsel->handler = trace__pgfault;
2424         perf_evlist__add(evlist, evsel);
2425
2426         return 0;
2427 }
2428
2429 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2430 {
2431         const u32 type = event->header.type;
2432         struct perf_evsel *evsel;
2433
2434         if (!trace->full_time && trace->base_time == 0)
2435                 trace->base_time = sample->time;
2436
2437         if (type != PERF_RECORD_SAMPLE) {
2438                 trace__process_event(trace, trace->host, event, sample);
2439                 return;
2440         }
2441
2442         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2443         if (evsel == NULL) {
2444                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2445                 return;
2446         }
2447
2448         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2449             sample->raw_data == NULL) {
2450                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2451                        perf_evsel__name(evsel), sample->tid,
2452                        sample->cpu, sample->raw_size);
2453         } else {
2454                 tracepoint_handler handler = evsel->handler;
2455                 handler(trace, evsel, event, sample);
2456         }
2457 }
2458
2459 static int trace__add_syscall_newtp(struct trace *trace)
2460 {
2461         int ret = -1;
2462         struct perf_evlist *evlist = trace->evlist;
2463         struct perf_evsel *sys_enter, *sys_exit;
2464
2465         sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2466         if (sys_enter == NULL)
2467                 goto out;
2468
2469         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2470                 goto out_delete_sys_enter;
2471
2472         sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2473         if (sys_exit == NULL)
2474                 goto out_delete_sys_enter;
2475
2476         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2477                 goto out_delete_sys_exit;
2478
2479         perf_evlist__add(evlist, sys_enter);
2480         perf_evlist__add(evlist, sys_exit);
2481
2482         trace->syscalls.events.sys_enter = sys_enter;
2483         trace->syscalls.events.sys_exit  = sys_exit;
2484
2485         ret = 0;
2486 out:
2487         return ret;
2488
2489 out_delete_sys_exit:
2490         perf_evsel__delete_priv(sys_exit);
2491 out_delete_sys_enter:
2492         perf_evsel__delete_priv(sys_enter);
2493         goto out;
2494 }
2495
2496 static int trace__set_ev_qualifier_filter(struct trace *trace)
2497 {
2498         int err = -1;
2499         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2500                                                 trace->ev_qualifier_ids.nr,
2501                                                 trace->ev_qualifier_ids.entries);
2502
2503         if (filter == NULL)
2504                 goto out_enomem;
2505
2506         if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
2507                 err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);
2508
2509         free(filter);
2510 out:
2511         return err;
2512 out_enomem:
2513         errno = ENOMEM;
2514         goto out;
2515 }
2516
/*
 * Live tracing mode.
 *
 * Adds the requested events (syscall tracepoints, vfs_getname probe, page
 * faults, sched_stat_runtime) to trace->evlist, creates the target maps,
 * optionally prepares and starts the workload given by @argc/@argv, opens and
 * mmaps the events, and then loops reading events from every mmap and passing
 * them to trace__handle_event() until interrupted or nothing is left to read.
 *
 * Every exit path except the summary/statistics printing goes through
 * out_delete_evlist, which deletes the evlist and clears trace->evlist and
 * trace->live.  Returns the last value stored in err.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
        struct perf_evlist *evlist = trace->evlist;
        struct perf_evsel *evsel;
        int err = -1, i;
        unsigned long before;
        /* Any remaining argument means there is a workload to fork. */
        const bool forks = argc > 0;
        bool draining = false;

        trace->live = true;

        if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
                goto out_error_raw_syscalls;

        /* The vfs_getname probe is optional: only remember whether it was added. */
        if (trace->trace_syscalls)
                trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

        if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
            perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
                goto out_error_mem;
        }

        if ((trace->trace_pgfaults & TRACE_PFMIN) &&
            perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
                goto out_error_mem;

        if (trace->sched &&
            perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
                                   trace__sched_stat_runtime))
                goto out_error_sched_stat_runtime;

        err = perf_evlist__create_maps(evlist, &trace->opts.target);
        if (err < 0) {
                fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
                goto out_delete_evlist;
        }

        err = trace__symbols_init(trace, evlist);
        if (err < 0) {
                fprintf(trace->output, "Problems initializing symbol libraries!\n");
                goto out_delete_evlist;
        }

        perf_evlist__config(evlist, &trace->opts);

        signal(SIGCHLD, sig_handler);
        signal(SIGINT, sig_handler);

        if (forks) {
                err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
                                                    argv, false, NULL);
                if (err < 0) {
                        fprintf(trace->output, "Couldn't run the workload!\n");
                        goto out_delete_evlist;
                }
        }

        err = perf_evlist__open(evlist);
        if (err < 0)
                goto out_error_open;

        /*
         * Better not use !target__has_task() here because we need to cover the
         * case where no threads were specified in the command line, but a
         * workload was, and in that case we will fill in the thread_map when
         * we fork the workload in perf_evlist__prepare_workload.
         */
        if (trace->filter_pids.nr > 0)
                err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
        else if (thread_map__pid(evlist->threads, 0) == -1)
                /* No specific thread in the map: filter out our own pid. */
                err = perf_evlist__set_filter_pid(evlist, getpid());

        if (err < 0)
                goto out_error_mem;

        if (trace->ev_qualifier_ids.nr > 0) {
                err = trace__set_ev_qualifier_filter(trace);
                if (err < 0)
                        goto out_errno;

                pr_debug("event qualifier tracepoint filter: %s\n",
                         trace->syscalls.events.sys_exit->filter);
        }

        /* evsel is passed by address and is what the error path below reports on. */
        err = perf_evlist__apply_filters(evlist, &evsel);
        if (err < 0)
                goto out_error_apply_filters;

        err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
        if (err < 0)
                goto out_error_mmap;

        if (!target__none(&trace->opts.target))
                perf_evlist__enable(evlist);

        if (forks)
                perf_evlist__start_workload(evlist);

        trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
                                  evlist->threads->nr > 1 ||
                                  perf_evlist__first(evlist)->attr.inherit;
again:
        /* Snapshot the event count so we can tell whether this pass read anything. */
        before = trace->nr_events;

        for (i = 0; i < evlist->nr_mmaps; i++) {
                union perf_event *event;

                while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
                        struct perf_sample sample;

                        ++trace->nr_events;

                        err = perf_evlist__parse_sample(evlist, event, &sample);
                        if (err) {
                                fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
                                goto next_event;
                        }

                        trace__handle_event(trace, event, &sample);
next_event:
                        /* Always consume the slot, even when the sample could not be parsed. */
                        perf_evlist__mmap_consume(evlist, i);

                        if (interrupted)
                                goto out_disable;

                        /*
                         * Once 'done' is set, disable the events a single
                         * time and keep reading what is already buffered.
                         */
                        if (done && !draining) {
                                perf_evlist__disable(evlist);
                                draining = true;
                        }
                }
        }

        if (trace->nr_events == before) {
                /*
                 * Nothing was read in this pass.
                 * NOTE(review): the timeout is presumably in milliseconds
                 * with -1 meaning "wait indefinitely" - confirm against
                 * perf_evlist__poll().
                 */
                int timeout = done ? 100 : -1;

                if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
                        /* No descriptors left after filtering out POLLERR/POLLHUP ones. */
                        if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
                                draining = true;

                        goto again;
                }
        } else {
                goto again;
        }

out_disable:
        thread__zput(trace->current);

        perf_evlist__disable(evlist);

        /* Summary and tool statistics are only printed when err is zero. */
        if (!err) {
                if (trace->summary)
                        trace__fprintf_thread_summary(trace, trace->output);

                if (trace->show_tool_stats) {
                        fprintf(trace->output, "Stats:\n "
                                               " vfs_getname : %" PRIu64 "\n"
                                               " proc_getname: %" PRIu64 "\n",
                                trace->stats.vfs_getname,
                                trace->stats.proc_getname);
                }
        }

out_delete_evlist:
        perf_evlist__delete(evlist);
        trace->evlist = NULL;
        trace->live = false;
        return err;
/*
 * The block below is only ever entered through its labels; it exists to
 * scope errbuf to the error-reporting paths that need it.
 */
{
        char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
        debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
        goto out_error;

out_error_raw_syscalls:
        debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
        goto out_error;

out_error_mmap:
        perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
        goto out_error;

out_error_open:
        perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

        /* out_error_open falls through to the common message printer. */
out_error:
        fprintf(trace->output, "%s\n", errbuf);
        goto out_delete_evlist;

out_error_apply_filters:
        fprintf(trace->output,
                "Failed to set filter \"%s\" on event %s with %d (%s)\n",
                evsel->filter, perf_evsel__name(evsel), errno,
                strerror_r(errno, errbuf, sizeof(errbuf)));
        goto out_delete_evlist;
}
out_error_mem:
        fprintf(trace->output, "Not enough memory to run!\n");
        goto out_delete_evlist;

out_errno:
        fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
        goto out_delete_evlist;
}
2722
2723 static int trace__replay(struct trace *trace)
2724 {
2725         const struct perf_evsel_str_handler handlers[] = {
2726                 { "probe:vfs_getname",       trace__vfs_getname, },
2727         };
2728         struct perf_data_file file = {
2729                 .path  = input_name,
2730                 .mode  = PERF_DATA_MODE_READ,
2731                 .force = trace->force,
2732         };
2733         struct perf_session *session;
2734         struct perf_evsel *evsel;
2735         int err = -1;
2736
2737         trace->tool.sample        = trace__process_sample;
2738         trace->tool.mmap          = perf_event__process_mmap;
2739         trace->tool.mmap2         = perf_event__process_mmap2;
2740         trace->tool.comm          = perf_event__process_comm;
2741         trace->tool.exit          = perf_event__process_exit;
2742         trace->tool.fork          = perf_event__process_fork;
2743         trace->tool.attr          = perf_event__process_attr;
2744         trace->tool.tracing_data = perf_event__process_tracing_data;
2745         trace->tool.build_id      = perf_event__process_build_id;
2746
2747         trace->tool.ordered_events = true;
2748         trace->tool.ordering_requires_timestamps = true;
2749
2750         /* add tid to output */
2751         trace->multiple_threads = true;
2752
2753         session = perf_session__new(&file, false, &trace->tool);
2754         if (session == NULL)
2755                 return -1;
2756
2757         if (symbol__init(&session->header.env) < 0)
2758                 goto out;
2759
2760         trace->host = &session->machines.host;
2761
2762         err = perf_session__set_tracepoints_handlers(session, handlers);
2763         if (err)
2764                 goto out;
2765
2766         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2767                                                      "raw_syscalls:sys_enter");
2768         /* older kernels have syscalls tp versus raw_syscalls */
2769         if (evsel == NULL)
2770                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2771                                                              "syscalls:sys_enter");
2772
2773         if (evsel &&
2774             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2775             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2776                 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2777                 goto out;
2778         }
2779
2780         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2781                                                      "raw_syscalls:sys_exit");
2782         if (evsel == NULL)
2783                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2784                                                              "syscalls:sys_exit");
2785         if (evsel &&
2786             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2787             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2788                 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2789                 goto out;
2790         }
2791
2792         evlist__for_each(session->evlist, evsel) {
2793                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2794                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2795                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2796                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2797                         evsel->handler = trace__pgfault;
2798         }
2799
2800         err = parse_target_str(trace);
2801         if (err != 0)
2802                 goto out;
2803
2804         setup_pager();
2805
2806         err = perf_session__process_events(session);
2807         if (err)
2808                 pr_err("Failed to process events, error %d", err);
2809
2810         else if (trace->summary)
2811                 trace__fprintf_thread_summary(trace, trace->output);
2812
2813 out:
2814         perf_session__delete(session);
2815
2816         return err;
2817 }
2818
/*
 * Write the heading that precedes the per-thread summary to @fp and return
 * fprintf()'s result converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
        return fprintf(fp, "\n Summary of events:\n\n");
}
2827
2828 static size_t thread__dump_stats(struct thread_trace *ttrace,
2829                                  struct trace *trace, FILE *fp)
2830 {
2831         struct stats *stats;
2832         size_t printed = 0;
2833         struct syscall *sc;
2834         struct int_node *inode = intlist__first(ttrace->syscall_stats);
2835
2836         if (inode == NULL)
2837                 return 0;
2838
2839         printed += fprintf(fp, "\n");
2840
2841         printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2842         printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2843         printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2844
2845         /* each int_node is a syscall */
2846         while (inode) {
2847                 stats = inode->priv;
2848                 if (stats) {
2849                         double min = (double)(stats->min) / NSEC_PER_MSEC;
2850                         double max = (double)(stats->max) / NSEC_PER_MSEC;
2851                         double avg = avg_stats(stats);
2852                         double pct;
2853                         u64 n = (u64) stats->n;
2854
2855                         pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2856                         avg /= NSEC_PER_MSEC;
2857
2858                         sc = &trace->syscalls.table[inode->i];
2859                         printed += fprintf(fp, "   %-15s", sc->name);
2860                         printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2861                                            n, avg * n, min, avg);
2862                         printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2863                 }
2864
2865                 inode = intlist__next(inode);
2866         }
2867
2868         printed += fprintf(fp, "\n\n");
2869
2870         return printed;
2871 }
2872
/*
 * Context handed to trace__fprintf_one_thread() through the priv pointer of
 * machine__for_each_thread().
 */
struct summary_data {
        FILE *fp;               /* stream the summary is written to */
        struct trace *trace;    /* session whose nr_events and syscall table are consulted */
        size_t printed;         /* running total of the fprintf() results */
};
2879
2880 static int trace__fprintf_one_thread(struct thread *thread, void *priv)
2881 {
2882         struct summary_data *data = priv;
2883         FILE *fp = data->fp;
2884         size_t printed = data->printed;
2885         struct trace *trace = data->trace;
2886         struct thread_trace *ttrace = thread__priv(thread);
2887         double ratio;
2888
2889         if (ttrace == NULL)
2890                 return 0;
2891
2892         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2893
2894         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2895         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2896         printed += fprintf(fp, "%.1f%%", ratio);
2897         if (ttrace->pfmaj)
2898                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2899         if (ttrace->pfmin)
2900                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2901         printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2902         printed += thread__dump_stats(ttrace, trace, fp);
2903
2904         data->printed += printed;
2905
2906         return 0;
2907 }
2908
2909 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2910 {
2911         struct summary_data data = {
2912                 .fp = fp,
2913                 .trace = trace
2914         };
2915         data.printed = trace__fprintf_threads_header(fp);
2916
2917         machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
2918
2919         return data.printed;
2920 }
2921
2922 static int trace__set_duration(const struct option *opt, const char *str,
2923                                int unset __maybe_unused)
2924 {
2925         struct trace *trace = opt->value;
2926
2927         trace->duration_filter = atof(str);
2928         return 0;
2929 }
2930
2931 static int trace__set_filter_pids(const struct option *opt, const char *str,
2932                                   int unset __maybe_unused)
2933 {
2934         int ret = -1;
2935         size_t i;
2936         struct trace *trace = opt->value;
2937         /*
2938          * FIXME: introduce a intarray class, plain parse csv and create a
2939          * { int nr, int entries[] } struct...
2940          */
2941         struct intlist *list = intlist__new(str);
2942
2943         if (list == NULL)
2944                 return -1;
2945
2946         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2947         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2948
2949         if (trace->filter_pids.entries == NULL)
2950                 goto out;
2951
2952         trace->filter_pids.entries[0] = getpid();
2953
2954         for (i = 1; i < trace->filter_pids.nr; ++i)
2955                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2956
2957         intlist__delete(list);
2958         ret = 0;
2959 out:
2960         return ret;
2961 }
2962
2963 static int trace__open_output(struct trace *trace, const char *filename)
2964 {
2965         struct stat st;
2966
2967         if (!stat(filename, &st) && st.st_size) {
2968                 char oldname[PATH_MAX];
2969
2970                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2971                 unlink(oldname);
2972                 rename(filename, oldname);
2973         }
2974
2975         trace->output = fopen(filename, "w");
2976
2977         return trace->output == NULL ? -errno : 0;
2978 }
2979
2980 static int parse_pagefaults(const struct option *opt, const char *str,
2981                             int unset __maybe_unused)
2982 {
2983         int *trace_pgfaults = opt->value;
2984
2985         if (strcmp(str, "all") == 0)
2986                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2987         else if (strcmp(str, "maj") == 0)
2988                 *trace_pgfaults |= TRACE_PFMAJ;
2989         else if (strcmp(str, "min") == 0)
2990                 *trace_pgfaults |= TRACE_PFMIN;
2991         else
2992                 return -1;
2993
2994         return 0;
2995 }
2996
2997 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2998 {
2999         struct perf_evsel *evsel;
3000
3001         evlist__for_each(evlist, evsel)
3002                 evsel->handler = handler;
3003 }
3004
3005 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
3006 {
3007         const char *trace_usage[] = {
3008                 "perf trace [<options>] [<command>]",
3009                 "perf trace [<options>] -- <command> [<options>]",
3010                 "perf trace record [<options>] [<command>]",
3011                 "perf trace record [<options>] -- <command> [<options>]",
3012                 NULL
3013         };
3014         struct trace trace = {
3015                 .audit = {
3016                         .machine = audit_detect_machine(),
3017                         .open_id = audit_name_to_syscall("open", trace.audit.machine),
3018                 },
3019                 .syscalls = {
3020                         . max = -1,
3021                 },
3022                 .opts = {
3023                         .target = {
3024                                 .uid       = UINT_MAX,
3025                                 .uses_mmap = true,
3026                         },
3027                         .user_freq     = UINT_MAX,
3028                         .user_interval = ULLONG_MAX,
3029                         .no_buffering  = true,
3030                         .mmap_pages    = UINT_MAX,
3031                         .proc_map_timeout  = 500,
3032                 },
3033                 .output = stderr,
3034                 .show_comm = true,
3035                 .trace_syscalls = true,
3036         };
3037         const char *output_name = NULL;
3038         const char *ev_qualifier_str = NULL;
3039         const struct option trace_options[] = {
3040         OPT_CALLBACK(0, "event", &trace.evlist, "event",
3041                      "event selector. use 'perf list' to list available events",
3042                      parse_events_option),
3043         OPT_BOOLEAN(0, "comm", &trace.show_comm,
3044                     "show the thread COMM next to its id"),
3045         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3046         OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
3047         OPT_STRING('o', "output", &output_name, "file", "output file name"),
3048         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3049         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3050                     "trace events on existing process id"),
3051         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3052                     "trace events on existing thread id"),
3053         OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3054                      "pids to filter (by the kernel)", trace__set_filter_pids),
3055         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3056                     "system-wide collection from all CPUs"),
3057         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3058                     "list of cpus to monitor"),
3059         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3060                     "child tasks do not inherit counters"),
3061         OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3062                      "number of mmap data pages",
3063                      perf_evlist__parse_mmap_pages),
3064         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3065                    "user to profile"),
3066         OPT_CALLBACK(0, "duration", &trace, "float",
3067                      "show only events with duration > N.M ms",
3068                      trace__set_duration),
3069         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3070         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3071         OPT_BOOLEAN('T', "time", &trace.full_time,
3072                     "Show full timestamp, not time relative to first start"),
3073         OPT_BOOLEAN('s', "summary", &trace.summary_only,
3074                     "Show only syscall summary with statistics"),
3075         OPT_BOOLEAN('S', "with-summary", &trace.summary,
3076                     "Show all syscalls and summary with statistics"),
3077         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3078                      "Trace pagefaults", parse_pagefaults, "maj"),
3079         OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3080         OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3081         OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3082                         "per thread proc mmap processing timeout in ms"),
3083         OPT_END()
3084         };
3085         const char * const trace_subcommands[] = { "record", NULL };
3086         int err;
3087         char bf[BUFSIZ];
3088
3089         signal(SIGSEGV, sighandler_dump_stack);
3090         signal(SIGFPE, sighandler_dump_stack);
3091
3092         trace.evlist = perf_evlist__new();
3093
3094         if (trace.evlist == NULL) {
3095                 pr_err("Not enough memory to run!\n");
3096                 err = -ENOMEM;
3097                 goto out;
3098         }
3099
3100         argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3101                                  trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3102
3103         if (trace.trace_pgfaults) {
3104                 trace.opts.sample_address = true;
3105                 trace.opts.sample_time = true;
3106         }
3107
3108         if (trace.evlist->nr_entries > 0)
3109                 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3110
3111         if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3112                 return trace__record(&trace, argc-1, &argv[1]);
3113
3114         /* summary_only implies summary option, but don't overwrite summary if set */
3115         if (trace.summary_only)
3116                 trace.summary = trace.summary_only;
3117
3118         if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3119             trace.evlist->nr_entries == 0 /* Was --events used? */) {
3120                 pr_err("Please specify something to trace.\n");
3121                 return -1;
3122         }
3123
3124         if (output_name != NULL) {
3125                 err = trace__open_output(&trace, output_name);
3126                 if (err < 0) {
3127                         perror("failed to create output file");
3128                         goto out;
3129                 }
3130         }
3131
3132         if (ev_qualifier_str != NULL) {
3133                 const char *s = ev_qualifier_str;
3134                 struct strlist_config slist_config = {
3135                         .dirname = system_path(STRACE_GROUPS_DIR),
3136                 };
3137
3138                 trace.not_ev_qualifier = *s == '!';
3139                 if (trace.not_ev_qualifier)
3140                         ++s;
3141                 trace.ev_qualifier = strlist__new(s, &slist_config);
3142                 if (trace.ev_qualifier == NULL) {
3143                         fputs("Not enough memory to parse event qualifier",
3144                               trace.output);
3145                         err = -ENOMEM;
3146                         goto out_close;
3147                 }
3148
3149                 err = trace__validate_ev_qualifier(&trace);
3150                 if (err)
3151                         goto out_close;
3152         }
3153
3154         err = target__validate(&trace.opts.target);
3155         if (err) {
3156                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3157                 fprintf(trace.output, "%s", bf);
3158                 goto out_close;
3159         }
3160
3161         err = target__parse_uid(&trace.opts.target);
3162         if (err) {
3163                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3164                 fprintf(trace.output, "%s", bf);
3165                 goto out_close;
3166         }
3167
3168         if (!argc && target__none(&trace.opts.target))
3169                 trace.opts.target.system_wide = true;
3170
3171         if (input_name)
3172                 err = trace__replay(&trace);
3173         else
3174                 err = trace__run(&trace, argc, argv);
3175
3176 out_close:
3177         if (output_name != NULL)
3178                 fclose(trace.output);
3179 out:
3180         return err;
3181 }