]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - tools/perf/builtin-trace.c
iio: pressure: bmp280: add support for BMP180
[karo-tx-linux.git] / tools / perf / builtin-trace.c
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36 #include "util/bpf-loader.h"
37
38 #include <libaudit.h>
39 #include <stdlib.h>
40 #include <sys/mman.h>
41 #include <linux/futex.h>
42 #include <linux/err.h>
43
/*
 * Compatibility fallbacks: older distro headers may lack some of the
 * constants decoded below, so supply a value for each one that is missing.
 */
#if !defined(MAP_STACK)
# define MAP_STACK		0x20000
#endif

#if !defined(MADV_HWPOISON)
# define MADV_HWPOISON		100
#endif

#if !defined(MADV_MERGEABLE)
# define MADV_MERGEABLE		12
#endif

#if !defined(MADV_UNMERGEABLE)
# define MADV_UNMERGEABLE	13
#endif

#if !defined(EFD_SEMAPHORE)
# define EFD_SEMAPHORE		1
#endif

#if !defined(EFD_NONBLOCK)
# define EFD_NONBLOCK		00004000
#endif

#if !defined(EFD_CLOEXEC)
# define EFD_CLOEXEC		02000000
#endif

#if !defined(O_CLOEXEC)
# define O_CLOEXEC		02000000
#endif

#if !defined(SOCK_DCCP)
# define SOCK_DCCP		6
#endif

#if !defined(SOCK_CLOEXEC)
# define SOCK_CLOEXEC		02000000
#endif

#if !defined(SOCK_NONBLOCK)
# define SOCK_NONBLOCK		00004000
#endif

#if !defined(MSG_CMSG_CLOEXEC)
# define MSG_CMSG_CLOEXEC	0x40000000
#endif

/* perf_event_open() flag bits */
#if !defined(PERF_FLAG_FD_NO_GROUP)
# define PERF_FLAG_FD_NO_GROUP		(1UL << 0)
#endif

#if !defined(PERF_FLAG_FD_OUTPUT)
# define PERF_FLAG_FD_OUTPUT		(1UL << 1)
#endif

#if !defined(PERF_FLAG_PID_CGROUP)
# define PERF_FLAG_PID_CGROUP		(1UL << 2) /* pid=cgroup id, per-cpu mode only */
#endif

#if !defined(PERF_FLAG_FD_CLOEXEC)
# define PERF_FLAG_FD_CLOEXEC		(1UL << 3) /* O_CLOEXEC */
#endif
109
110
111 struct tp_field {
112         int offset;
113         union {
114                 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
115                 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
116         };
117 };
118
119 #define TP_UINT_FIELD(bits) \
120 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
121 { \
122         u##bits value; \
123         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
124         return value;  \
125 }
126
127 TP_UINT_FIELD(8);
128 TP_UINT_FIELD(16);
129 TP_UINT_FIELD(32);
130 TP_UINT_FIELD(64);
131
132 #define TP_UINT_FIELD__SWAPPED(bits) \
133 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
134 { \
135         u##bits value; \
136         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
137         return bswap_##bits(value);\
138 }
139
140 TP_UINT_FIELD__SWAPPED(16);
141 TP_UINT_FIELD__SWAPPED(32);
142 TP_UINT_FIELD__SWAPPED(64);
143
144 static int tp_field__init_uint(struct tp_field *field,
145                                struct format_field *format_field,
146                                bool needs_swap)
147 {
148         field->offset = format_field->offset;
149
150         switch (format_field->size) {
151         case 1:
152                 field->integer = tp_field__u8;
153                 break;
154         case 2:
155                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
156                 break;
157         case 4:
158                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
159                 break;
160         case 8:
161                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
162                 break;
163         default:
164                 return -1;
165         }
166
167         return 0;
168 }
169
170 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
171 {
172         return sample->raw_data + field->offset;
173 }
174
175 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
176 {
177         field->offset = format_field->offset;
178         field->pointer = tp_field__ptr;
179         return 0;
180 }
181
182 struct syscall_tp {
183         struct tp_field id;
184         union {
185                 struct tp_field args, ret;
186         };
187 };
188
189 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
190                                           struct tp_field *field,
191                                           const char *name)
192 {
193         struct format_field *format_field = perf_evsel__field(evsel, name);
194
195         if (format_field == NULL)
196                 return -1;
197
198         return tp_field__init_uint(field, format_field, evsel->needs_swap);
199 }
200
201 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
202         ({ struct syscall_tp *sc = evsel->priv;\
203            perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
204
/*
 * Look up the tracepoint format field called @name in @evsel and initialize
 * @field as a pointer accessor for it.
 *
 * Returns 0 on success, -1 if the field does not exist.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	if (!fmt)
		return -1;

	return tp_field__init_ptr(field, fmt);
}

/*
 * Convenience wrapper: initialize the member @name of the struct syscall_tp
 * hanging off evsel->priv as a pointer accessor for the same-named field.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc_tp = evsel->priv; \
	   perf_evsel__init_tp_ptr_field(evsel, &sc_tp->name, #name); })
220
221 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
222 {
223         zfree(&evsel->priv);
224         perf_evsel__delete(evsel);
225 }
226
227 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
228 {
229         evsel->priv = malloc(sizeof(struct syscall_tp));
230         if (evsel->priv != NULL) {
231                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
232                         goto out_delete;
233
234                 evsel->handler = handler;
235                 return 0;
236         }
237
238         return -ENOMEM;
239
240 out_delete:
241         zfree(&evsel->priv);
242         return -ENOENT;
243 }
244
/*
 * Create an evsel for the "raw_syscalls:<direction>" tracepoint, falling
 * back to "syscalls:<direction>" when that one is unavailable, and set it
 * up with perf_evsel__init_syscall_tp().
 *
 * Returns the new evsel, or NULL if neither tracepoint could be created or
 * the initialization failed (the evsel is deleted in the latter case).
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler) != 0) {
		perf_evsel__delete_priv(evsel);
		return NULL;
	}

	return evsel;
}
265
/*
 * Read member @name of the struct syscall_tp in evsel->priv from @sample
 * through its integer reader.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *sc_fields = evsel->priv; \
	   sc_fields->name.integer(&sc_fields->name, sample); })

/*
 * Same as perf_evsel__sc_tp_uint(), but goes through the pointer reader and
 * so yields the address of the field instead of its value.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *sc_fields = evsel->priv; \
	   sc_fields->name.pointer(&sc_fields->name, sample); })
273
274 struct syscall_arg {
275         unsigned long val;
276         struct thread *thread;
277         struct trace  *trace;
278         void          *parm;
279         u8            idx;
280         u8            mask;
281 };
282
/*
 * Table mapping small integers to names.
 *
 * @offset:     value that corresponds to entries[0]
 * @nr_entries: number of slots in @entries
 * @entries:    the names; tables built with designated initializers may
 *              contain NULL holes
 */
struct strarray {
	int	     offset;
	int	     nr_entries;
	const char **entries;
};
288
/*
 * Define strarray__<array> describing the string table <array>, whose first
 * entry corresponds to value 0 (offset is left zero-initialized).
 */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries    = array, \
}

/* Same as DEFINE_STRARRAY(), for tables whose first entry is value <off>. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries    = array, \
}
299
300 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
301                                                 const char *intfmt,
302                                                 struct syscall_arg *arg)
303 {
304         struct strarray *sa = arg->parm;
305         int idx = arg->val - sa->offset;
306
307         if (idx < 0 || idx >= sa->nr_entries)
308                 return scnprintf(bf, size, intfmt, arg->val);
309
310         return scnprintf(bf, size, "%s", sa->entries[idx]);
311 }
312
/*
 * strarray formatter whose fallback for values without a name is decimal.
 * Returns the number of characters written to @bf.
 */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	const char *fallback_fmt = "%d";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
320
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 *        gets rewritten to support all arches.
 *
 * strarray formatter whose fallback for values without a name is
 * hexadecimal.  Returns the number of characters written to @bf.
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	const char *fallback_fmt = "%#x";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
334
/* File descriptor formatter; declared here, the definition is not in this part of the file. */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd
339
340 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
341                                            struct syscall_arg *arg)
342 {
343         int fd = arg->val;
344
345         if (fd == AT_FDCWD)
346                 return scnprintf(bf, size, "CWD");
347
348         return syscall_arg__scnprintf_fd(bf, size, arg);
349 }
350
351 #define SCA_FDAT syscall_arg__scnprintf_fd_at
352
/* Formatter for the fd passed to close(); declared here, the definition is not in this part of the file. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
357
358 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
359                                          struct syscall_arg *arg)
360 {
361         return scnprintf(bf, size, "%#lx", arg->val);
362 }
363
364 #define SCA_HEX syscall_arg__scnprintf_hex
365
366 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
367                                          struct syscall_arg *arg)
368 {
369         return scnprintf(bf, size, "%d", arg->val);
370 }
371
372 #define SCA_INT syscall_arg__scnprintf_int
373
374 static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
375                                                struct syscall_arg *arg)
376 {
377         int printed = 0, prot = arg->val;
378
379         if (prot == PROT_NONE)
380                 return scnprintf(bf, size, "NONE");
381 #define P_MMAP_PROT(n) \
382         if (prot & PROT_##n) { \
383                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
384                 prot &= ~PROT_##n; \
385         }
386
387         P_MMAP_PROT(EXEC);
388         P_MMAP_PROT(READ);
389         P_MMAP_PROT(WRITE);
390 #ifdef PROT_SEM
391         P_MMAP_PROT(SEM);
392 #endif
393         P_MMAP_PROT(GROWSDOWN);
394         P_MMAP_PROT(GROWSUP);
395 #undef P_MMAP_PROT
396
397         if (prot)
398                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
399
400         return printed;
401 }
402
403 #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
404
405 static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
406                                                 struct syscall_arg *arg)
407 {
408         int printed = 0, flags = arg->val;
409
410 #define P_MMAP_FLAG(n) \
411         if (flags & MAP_##n) { \
412                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
413                 flags &= ~MAP_##n; \
414         }
415
416         P_MMAP_FLAG(SHARED);
417         P_MMAP_FLAG(PRIVATE);
418 #ifdef MAP_32BIT
419         P_MMAP_FLAG(32BIT);
420 #endif
421         P_MMAP_FLAG(ANONYMOUS);
422         P_MMAP_FLAG(DENYWRITE);
423         P_MMAP_FLAG(EXECUTABLE);
424         P_MMAP_FLAG(FILE);
425         P_MMAP_FLAG(FIXED);
426         P_MMAP_FLAG(GROWSDOWN);
427 #ifdef MAP_HUGETLB
428         P_MMAP_FLAG(HUGETLB);
429 #endif
430         P_MMAP_FLAG(LOCKED);
431         P_MMAP_FLAG(NONBLOCK);
432         P_MMAP_FLAG(NORESERVE);
433         P_MMAP_FLAG(POPULATE);
434         P_MMAP_FLAG(STACK);
435 #ifdef MAP_UNINITIALIZED
436         P_MMAP_FLAG(UNINITIALIZED);
437 #endif
438 #undef P_MMAP_FLAG
439
440         if (flags)
441                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
442
443         return printed;
444 }
445
446 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
447
448 static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
449                                                   struct syscall_arg *arg)
450 {
451         int printed = 0, flags = arg->val;
452
453 #define P_MREMAP_FLAG(n) \
454         if (flags & MREMAP_##n) { \
455                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
456                 flags &= ~MREMAP_##n; \
457         }
458
459         P_MREMAP_FLAG(MAYMOVE);
460 #ifdef MREMAP_FIXED
461         P_MREMAP_FLAG(FIXED);
462 #endif
463 #undef P_MREMAP_FLAG
464
465         if (flags)
466                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
467
468         return printed;
469 }
470
471 #define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
472
473 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
474                                                       struct syscall_arg *arg)
475 {
476         int behavior = arg->val;
477
478         switch (behavior) {
479 #define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
480         P_MADV_BHV(NORMAL);
481         P_MADV_BHV(RANDOM);
482         P_MADV_BHV(SEQUENTIAL);
483         P_MADV_BHV(WILLNEED);
484         P_MADV_BHV(DONTNEED);
485         P_MADV_BHV(REMOVE);
486         P_MADV_BHV(DONTFORK);
487         P_MADV_BHV(DOFORK);
488         P_MADV_BHV(HWPOISON);
489 #ifdef MADV_SOFT_OFFLINE
490         P_MADV_BHV(SOFT_OFFLINE);
491 #endif
492         P_MADV_BHV(MERGEABLE);
493         P_MADV_BHV(UNMERGEABLE);
494 #ifdef MADV_HUGEPAGE
495         P_MADV_BHV(HUGEPAGE);
496 #endif
497 #ifdef MADV_NOHUGEPAGE
498         P_MADV_BHV(NOHUGEPAGE);
499 #endif
500 #ifdef MADV_DONTDUMP
501         P_MADV_BHV(DONTDUMP);
502 #endif
503 #ifdef MADV_DODUMP
504         P_MADV_BHV(DODUMP);
505 #endif
506 #undef P_MADV_PHV
507         default: break;
508         }
509
510         return scnprintf(bf, size, "%#x", behavior);
511 }
512
513 #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
514
515 static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
516                                            struct syscall_arg *arg)
517 {
518         int printed = 0, op = arg->val;
519
520         if (op == 0)
521                 return scnprintf(bf, size, "NONE");
522 #define P_CMD(cmd) \
523         if ((op & LOCK_##cmd) == LOCK_##cmd) { \
524                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
525                 op &= ~LOCK_##cmd; \
526         }
527
528         P_CMD(SH);
529         P_CMD(EX);
530         P_CMD(NB);
531         P_CMD(UN);
532         P_CMD(MAND);
533         P_CMD(RW);
534         P_CMD(READ);
535         P_CMD(WRITE);
536 #undef P_OP
537
538         if (op)
539                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
540
541         return printed;
542 }
543
544 #define SCA_FLOCK syscall_arg__scnprintf_flock
545
546 static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
547 {
548         enum syscall_futex_args {
549                 SCF_UADDR   = (1 << 0),
550                 SCF_OP      = (1 << 1),
551                 SCF_VAL     = (1 << 2),
552                 SCF_TIMEOUT = (1 << 3),
553                 SCF_UADDR2  = (1 << 4),
554                 SCF_VAL3    = (1 << 5),
555         };
556         int op = arg->val;
557         int cmd = op & FUTEX_CMD_MASK;
558         size_t printed = 0;
559
560         switch (cmd) {
561 #define P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
562         P_FUTEX_OP(WAIT);           arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
563         P_FUTEX_OP(WAKE);           arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
564         P_FUTEX_OP(FD);             arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
565         P_FUTEX_OP(REQUEUE);        arg->mask |= SCF_VAL3|SCF_TIMEOUT;            break;
566         P_FUTEX_OP(CMP_REQUEUE);    arg->mask |= SCF_TIMEOUT;                     break;
567         P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT;                     break;
568         P_FUTEX_OP(WAKE_OP);                                                      break;
569         P_FUTEX_OP(LOCK_PI);        arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
570         P_FUTEX_OP(UNLOCK_PI);      arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
571         P_FUTEX_OP(TRYLOCK_PI);     arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
572         P_FUTEX_OP(WAIT_BITSET);    arg->mask |= SCF_UADDR2;                      break;
573         P_FUTEX_OP(WAKE_BITSET);    arg->mask |= SCF_UADDR2;                      break;
574         P_FUTEX_OP(WAIT_REQUEUE_PI);                                              break;
575         default: printed = scnprintf(bf, size, "%#x", cmd);                       break;
576         }
577
578         if (op & FUTEX_PRIVATE_FLAG)
579                 printed += scnprintf(bf + printed, size - printed, "|PRIV");
580
581         if (op & FUTEX_CLOCK_REALTIME)
582                 printed += scnprintf(bf + printed, size - printed, "|CLKRT");
583
584         return printed;
585 }
586
587 #define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
588
589 static const char *bpf_cmd[] = {
590         "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
591         "MAP_GET_NEXT_KEY", "PROG_LOAD",
592 };
593 static DEFINE_STRARRAY(bpf_cmd);
594
595 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
596 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
597
598 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
599 static DEFINE_STRARRAY(itimers);
600
601 static const char *keyctl_options[] = {
602         "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
603         "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
604         "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
605         "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
606         "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
607 };
608 static DEFINE_STRARRAY(keyctl_options);
609
610 static const char *whences[] = { "SET", "CUR", "END",
611 #ifdef SEEK_DATA
612 "DATA",
613 #endif
614 #ifdef SEEK_HOLE
615 "HOLE",
616 #endif
617 };
618 static DEFINE_STRARRAY(whences);
619
620 static const char *fcntl_cmds[] = {
621         "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
622         "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
623         "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
624         "F_GETOWNER_UIDS",
625 };
626 static DEFINE_STRARRAY(fcntl_cmds);
627
628 static const char *rlimit_resources[] = {
629         "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
630         "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
631         "RTTIME",
632 };
633 static DEFINE_STRARRAY(rlimit_resources);
634
635 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
636 static DEFINE_STRARRAY(sighow);
637
638 static const char *clockid[] = {
639         "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
640         "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
641         "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
642 };
643 static DEFINE_STRARRAY(clockid);
644
645 static const char *socket_families[] = {
646         "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
647         "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
648         "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
649         "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
650         "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
651         "ALG", "NFC", "VSOCK",
652 };
653 static DEFINE_STRARRAY(socket_families);
654
655 #ifndef SOCK_TYPE_MASK
656 #define SOCK_TYPE_MASK 0xf
657 #endif
658
659 static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
660                                                       struct syscall_arg *arg)
661 {
662         size_t printed;
663         int type = arg->val,
664             flags = type & ~SOCK_TYPE_MASK;
665
666         type &= SOCK_TYPE_MASK;
667         /*
668          * Can't use a strarray, MIPS may override for ABI reasons.
669          */
670         switch (type) {
671 #define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
672         P_SK_TYPE(STREAM);
673         P_SK_TYPE(DGRAM);
674         P_SK_TYPE(RAW);
675         P_SK_TYPE(RDM);
676         P_SK_TYPE(SEQPACKET);
677         P_SK_TYPE(DCCP);
678         P_SK_TYPE(PACKET);
679 #undef P_SK_TYPE
680         default:
681                 printed = scnprintf(bf, size, "%#x", type);
682         }
683
684 #define P_SK_FLAG(n) \
685         if (flags & SOCK_##n) { \
686                 printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
687                 flags &= ~SOCK_##n; \
688         }
689
690         P_SK_FLAG(CLOEXEC);
691         P_SK_FLAG(NONBLOCK);
692 #undef P_SK_FLAG
693
694         if (flags)
695                 printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
696
697         return printed;
698 }
699
700 #define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
701
702 #ifndef MSG_PROBE
703 #define MSG_PROBE            0x10
704 #endif
705 #ifndef MSG_WAITFORONE
706 #define MSG_WAITFORONE  0x10000
707 #endif
708 #ifndef MSG_SENDPAGE_NOTLAST
709 #define MSG_SENDPAGE_NOTLAST 0x20000
710 #endif
711 #ifndef MSG_FASTOPEN
712 #define MSG_FASTOPEN         0x20000000
713 #endif
714
715 static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
716                                                struct syscall_arg *arg)
717 {
718         int printed = 0, flags = arg->val;
719
720         if (flags == 0)
721                 return scnprintf(bf, size, "NONE");
722 #define P_MSG_FLAG(n) \
723         if (flags & MSG_##n) { \
724                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
725                 flags &= ~MSG_##n; \
726         }
727
728         P_MSG_FLAG(OOB);
729         P_MSG_FLAG(PEEK);
730         P_MSG_FLAG(DONTROUTE);
731         P_MSG_FLAG(TRYHARD);
732         P_MSG_FLAG(CTRUNC);
733         P_MSG_FLAG(PROBE);
734         P_MSG_FLAG(TRUNC);
735         P_MSG_FLAG(DONTWAIT);
736         P_MSG_FLAG(EOR);
737         P_MSG_FLAG(WAITALL);
738         P_MSG_FLAG(FIN);
739         P_MSG_FLAG(SYN);
740         P_MSG_FLAG(CONFIRM);
741         P_MSG_FLAG(RST);
742         P_MSG_FLAG(ERRQUEUE);
743         P_MSG_FLAG(NOSIGNAL);
744         P_MSG_FLAG(MORE);
745         P_MSG_FLAG(WAITFORONE);
746         P_MSG_FLAG(SENDPAGE_NOTLAST);
747         P_MSG_FLAG(FASTOPEN);
748         P_MSG_FLAG(CMSG_CLOEXEC);
749 #undef P_MSG_FLAG
750
751         if (flags)
752                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
753
754         return printed;
755 }
756
757 #define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
758
759 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
760                                                  struct syscall_arg *arg)
761 {
762         size_t printed = 0;
763         int mode = arg->val;
764
765         if (mode == F_OK) /* 0 */
766                 return scnprintf(bf, size, "F");
767 #define P_MODE(n) \
768         if (mode & n##_OK) { \
769                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
770                 mode &= ~n##_OK; \
771         }
772
773         P_MODE(R);
774         P_MODE(W);
775         P_MODE(X);
776 #undef P_MODE
777
778         if (mode)
779                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
780
781         return printed;
782 }
783
784 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
785
/* Filename formatter; declared here, the definition is not in this part of the file. */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
790
791 static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
792                                                struct syscall_arg *arg)
793 {
794         int printed = 0, flags = arg->val;
795
796         if (!(flags & O_CREAT))
797                 arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
798
799         if (flags == 0)
800                 return scnprintf(bf, size, "RDONLY");
801 #define P_FLAG(n) \
802         if (flags & O_##n) { \
803                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
804                 flags &= ~O_##n; \
805         }
806
807         P_FLAG(APPEND);
808         P_FLAG(ASYNC);
809         P_FLAG(CLOEXEC);
810         P_FLAG(CREAT);
811         P_FLAG(DIRECT);
812         P_FLAG(DIRECTORY);
813         P_FLAG(EXCL);
814         P_FLAG(LARGEFILE);
815         P_FLAG(NOATIME);
816         P_FLAG(NOCTTY);
817 #ifdef O_NONBLOCK
818         P_FLAG(NONBLOCK);
819 #elif O_NDELAY
820         P_FLAG(NDELAY);
821 #endif
822 #ifdef O_PATH
823         P_FLAG(PATH);
824 #endif
825         P_FLAG(RDWR);
826 #ifdef O_DSYNC
827         if ((flags & O_SYNC) == O_SYNC)
828                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
829         else {
830                 P_FLAG(DSYNC);
831         }
832 #else
833         P_FLAG(SYNC);
834 #endif
835         P_FLAG(TRUNC);
836         P_FLAG(WRONLY);
837 #undef P_FLAG
838
839         if (flags)
840                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
841
842         return printed;
843 }
844
845 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
846
847 static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
848                                                 struct syscall_arg *arg)
849 {
850         int printed = 0, flags = arg->val;
851
852         if (flags == 0)
853                 return 0;
854
855 #define P_FLAG(n) \
856         if (flags & PERF_FLAG_##n) { \
857                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
858                 flags &= ~PERF_FLAG_##n; \
859         }
860
861         P_FLAG(FD_NO_GROUP);
862         P_FLAG(FD_OUTPUT);
863         P_FLAG(PID_CGROUP);
864         P_FLAG(FD_CLOEXEC);
865 #undef P_FLAG
866
867         if (flags)
868                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
869
870         return printed;
871 }
872
873 #define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags
874
875 static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
876                                                    struct syscall_arg *arg)
877 {
878         int printed = 0, flags = arg->val;
879
880         if (flags == 0)
881                 return scnprintf(bf, size, "NONE");
882 #define P_FLAG(n) \
883         if (flags & EFD_##n) { \
884                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
885                 flags &= ~EFD_##n; \
886         }
887
888         P_FLAG(SEMAPHORE);
889         P_FLAG(CLOEXEC);
890         P_FLAG(NONBLOCK);
891 #undef P_FLAG
892
893         if (flags)
894                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
895
896         return printed;
897 }
898
899 #define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
900
901 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
902                                                 struct syscall_arg *arg)
903 {
904         int printed = 0, flags = arg->val;
905
906 #define P_FLAG(n) \
907         if (flags & O_##n) { \
908                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
909                 flags &= ~O_##n; \
910         }
911
912         P_FLAG(CLOEXEC);
913         P_FLAG(NONBLOCK);
914 #undef P_FLAG
915
916         if (flags)
917                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
918
919         return printed;
920 }
921
922 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
923
924 static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
925 {
926         int sig = arg->val;
927
928         switch (sig) {
929 #define P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
930         P_SIGNUM(HUP);
931         P_SIGNUM(INT);
932         P_SIGNUM(QUIT);
933         P_SIGNUM(ILL);
934         P_SIGNUM(TRAP);
935         P_SIGNUM(ABRT);
936         P_SIGNUM(BUS);
937         P_SIGNUM(FPE);
938         P_SIGNUM(KILL);
939         P_SIGNUM(USR1);
940         P_SIGNUM(SEGV);
941         P_SIGNUM(USR2);
942         P_SIGNUM(PIPE);
943         P_SIGNUM(ALRM);
944         P_SIGNUM(TERM);
945         P_SIGNUM(CHLD);
946         P_SIGNUM(CONT);
947         P_SIGNUM(STOP);
948         P_SIGNUM(TSTP);
949         P_SIGNUM(TTIN);
950         P_SIGNUM(TTOU);
951         P_SIGNUM(URG);
952         P_SIGNUM(XCPU);
953         P_SIGNUM(XFSZ);
954         P_SIGNUM(VTALRM);
955         P_SIGNUM(PROF);
956         P_SIGNUM(WINCH);
957         P_SIGNUM(IO);
958         P_SIGNUM(PWR);
959         P_SIGNUM(SYS);
960 #ifdef SIGEMT
961         P_SIGNUM(EMT);
962 #endif
963 #ifdef SIGSTKFLT
964         P_SIGNUM(STKFLT);
965 #endif
966 #ifdef SIGSWI
967         P_SIGNUM(SWI);
968 #endif
969         default: break;
970         }
971
972         return scnprintf(bf, size, "%#x", sig);
973 }
974
975 #define SCA_SIGNUM syscall_arg__scnprintf_signum
976
977 #if defined(__i386__) || defined(__x86_64__)
978 /*
979  * FIXME: Make this available to all arches.
980  */
981 #define TCGETS          0x5401
982
983 static const char *tioctls[] = {
984         "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
985         "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
986         "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
987         "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
988         "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
989         "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
990         "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
991         "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
992         "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
993         "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
994         "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
995         [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
996         "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
997         "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
998         "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
999 };
1000
1001 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
1002 #endif /* defined(__i386__) || defined(__x86_64__) */
1003
/*
 * STRARRAY(arg, name, array): initializer fragment for a syscall_fmt entry
 * that formats argument number 'arg' with SCA_STRARRAY, using
 * strarray__<array> as its parameter.  'name' is not used by the expansion;
 * it only labels the argument at the use site.
 */
1004 #define STRARRAY(arg, name, array) \
1005           .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
1006           .arg_parm      = { [arg] = &strarray__##array, }
1007
/*
 * Per-syscall formatting overrides.
 *
 *  name          - syscall name, the key compared by syscall_fmt__cmp()
 *  alias         - alternative name (NOTE(review): presumably the tracepoint
 *                  name to try when 'name' has none - confirm against users)
 *  arg_scnprintf - optional formatter per argument index (up to 6 arguments)
 *  arg_parm      - optional parameter per argument index, e.g. the strarray
 *                  used by SCA_STRARRAY
 *  errmsg, timeout, hexret - flags for formatting the return value; their
 *                  consumers are outside this part of the file
 *
 * syscall_fmts[] is searched with bsearch() by syscall_fmt__find(), so its
 * entries MUST stay sorted by .name in strcmp() order.
 */
1008 static struct syscall_fmt {
1009         const char *name;
1010         const char *alias;
1011         size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
1012         void       *arg_parm[6];
1013         bool       errmsg;
1014         bool       timeout;
1015         bool       hexret;
1016 } syscall_fmts[] = {
1017         { .name     = "access",     .errmsg = true,
1018           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */
1019                              [1] = SCA_ACCMODE,  /* mode */ }, },
1020         { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
1021         { .name     = "bpf",        .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
1022         { .name     = "brk",        .hexret = true,
1023           .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
1024         { .name     = "chdir",      .errmsg = true,
1025           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1026         { .name     = "chmod",      .errmsg = true,
1027           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1028         { .name     = "chroot",     .errmsg = true,
1029           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1030         { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
1031         { .name     = "close",      .errmsg = true,
1032           .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
1033         { .name     = "connect",    .errmsg = true, },
1034         { .name     = "creat",      .errmsg = true,
1035           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1036         { .name     = "dup",        .errmsg = true,
1037           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1038         { .name     = "dup2",       .errmsg = true,
1039           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1040         { .name     = "dup3",       .errmsg = true,
1041           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1042         { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
1043         { .name     = "eventfd2",   .errmsg = true,
1044           .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
1045         { .name     = "faccessat",  .errmsg = true,
1046           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1047                              [1] = SCA_FILENAME, /* filename */ }, },
1048         { .name     = "fadvise64",  .errmsg = true,
1049           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1050         { .name     = "fallocate",  .errmsg = true,
1051           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1052         { .name     = "fchdir",     .errmsg = true,
1053           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1054         { .name     = "fchmod",     .errmsg = true,
1055           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1056         { .name     = "fchmodat",   .errmsg = true,
1057           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1058                              [1] = SCA_FILENAME, /* filename */ }, },
1059         { .name     = "fchown",     .errmsg = true,
1060           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1061         { .name     = "fchownat",   .errmsg = true,
1062           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1063                              [1] = SCA_FILENAME, /* filename */ }, },
1064         { .name     = "fcntl",      .errmsg = true,
1065           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1066                              [1] = SCA_STRARRAY, /* cmd */ },
1067           .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
1068         { .name     = "fdatasync",  .errmsg = true,
1069           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1070         { .name     = "flock",      .errmsg = true,
1071           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1072                              [1] = SCA_FLOCK, /* cmd */ }, },
1073         { .name     = "fsetxattr",  .errmsg = true,
1074           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1075         { .name     = "fstat",      .errmsg = true, .alias = "newfstat",
1076           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1077         { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat",
1078           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1079                              [1] = SCA_FILENAME, /* filename */ }, },
1080         { .name     = "fstatfs",    .errmsg = true,
1081           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1082         { .name     = "fsync",    .errmsg = true,
1083           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1084         { .name     = "ftruncate", .errmsg = true,
1085           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1086         { .name     = "futex",      .errmsg = true,
1087           .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
1088         { .name     = "futimesat", .errmsg = true,
1089           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1090                              [1] = SCA_FILENAME, /* filename */ }, },
1091         { .name     = "getdents",   .errmsg = true,
1092           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1093         { .name     = "getdents64", .errmsg = true,
1094           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1095         { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1096         { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1097         { .name     = "getxattr",    .errmsg = true,
1098           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1099         { .name     = "inotify_add_watch",          .errmsg = true,
1100           .arg_scnprintf = { [1] = SCA_FILENAME, /* pathname */ }, },
1101         { .name     = "ioctl",      .errmsg = true,
1102           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1103 #if defined(__i386__) || defined(__x86_64__)
1104 /*
1105  * FIXME: Make this available to all arches.
1106  */
1107                              [1] = SCA_STRHEXARRAY, /* cmd */
1108                              [2] = SCA_HEX, /* arg */ },
1109           .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
1110 #else
1111                              [2] = SCA_HEX, /* arg */ }, },
1112 #endif
1113         { .name     = "keyctl",     .errmsg = true, STRARRAY(0, option, keyctl_options), },
1114         { .name     = "kill",       .errmsg = true,
1115           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1116         { .name     = "lchown",    .errmsg = true,
1117           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1118         { .name     = "lgetxattr",  .errmsg = true,
1119           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1120         { .name     = "linkat",     .errmsg = true,
1121           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1122         { .name     = "listxattr",  .errmsg = true,
1123           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1124         { .name     = "llistxattr", .errmsg = true,
1125           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1126         { .name     = "lremovexattr",  .errmsg = true,
1127           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1128         { .name     = "lseek",      .errmsg = true,
1129           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1130                              [2] = SCA_STRARRAY, /* whence */ },
1131           .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
1132         { .name     = "lsetxattr",  .errmsg = true,
1133           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1134         { .name     = "lstat",      .errmsg = true, .alias = "newlstat",
1135           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1136         { .name     = "lsxattr",    .errmsg = true,
1137           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1138         { .name     = "madvise",    .errmsg = true,
1139           .arg_scnprintf = { [0] = SCA_HEX,      /* start */
1140                              [2] = SCA_MADV_BHV, /* behavior */ }, },
1141         { .name     = "mkdir",    .errmsg = true,
1142           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1143         { .name     = "mkdirat",    .errmsg = true,
1144           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1145                              [1] = SCA_FILENAME, /* pathname */ }, },
1146         { .name     = "mknod",      .errmsg = true,
1147           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1148         { .name     = "mknodat",    .errmsg = true,
1149           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1150                              [1] = SCA_FILENAME, /* filename */ }, },
1151         { .name     = "mlock",      .errmsg = true,
1152           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1153         { .name     = "mlockall",   .errmsg = true,
1154           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1155         { .name     = "mmap",       .hexret = true,
1156           .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
1157                              [2] = SCA_MMAP_PROT, /* prot */
1158                              [3] = SCA_MMAP_FLAGS, /* flags */
1159                              [4] = SCA_FD,        /* fd */ }, },
1160         { .name     = "mprotect",   .errmsg = true,
1161           .arg_scnprintf = { [0] = SCA_HEX, /* start */
1162                              [2] = SCA_MMAP_PROT, /* prot */ }, },
1163         { .name     = "mq_unlink", .errmsg = true,
1164           .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
1165         { .name     = "mremap",     .hexret = true,
1166           .arg_scnprintf = { [0] = SCA_HEX, /* addr */
1167                              [3] = SCA_MREMAP_FLAGS, /* flags */
1168                              [4] = SCA_HEX, /* new_addr */ }, },
1169         { .name     = "munlock",    .errmsg = true,
1170           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1171         { .name     = "munmap",     .errmsg = true,
1172           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1173         { .name     = "name_to_handle_at", .errmsg = true,
1174           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1175         { .name     = "newfstatat", .errmsg = true,
1176           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1177                              [1] = SCA_FILENAME, /* filename */ }, },
1178         { .name     = "open",       .errmsg = true,
1179           .arg_scnprintf = { [0] = SCA_FILENAME,   /* filename */
1180                              [1] = SCA_OPEN_FLAGS, /* flags */ }, },
1181         { .name     = "open_by_handle_at", .errmsg = true,
1182           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1183                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1184         { .name     = "openat",     .errmsg = true,
1185           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1186                              [1] = SCA_FILENAME, /* filename */
1187                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1188         { .name     = "perf_event_open", .errmsg = true,
1189           .arg_scnprintf = { [1] = SCA_INT, /* pid */
1190                              [2] = SCA_INT, /* cpu */
1191                              [3] = SCA_FD,  /* group_fd */
1192                              [4] = SCA_PERF_FLAGS,  /* flags */ }, },
1193         { .name     = "pipe2",      .errmsg = true,
1194           .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
1195         { .name     = "poll",       .errmsg = true, .timeout = true, },
1196         { .name     = "ppoll",      .errmsg = true, .timeout = true, },
1197         { .name     = "pread",      .errmsg = true, .alias = "pread64",
1198           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1199         { .name     = "preadv",     .errmsg = true, .alias = "pread",
1200           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1201         { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
1202         { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64",
1203           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1204         { .name     = "pwritev",    .errmsg = true,
1205           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1206         { .name     = "read",       .errmsg = true,
1207           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1208         { .name     = "readlink",   .errmsg = true,
1209           .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
1210         { .name     = "readlinkat", .errmsg = true,
1211           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1212                              [1] = SCA_FILENAME, /* pathname */ }, },
1213         { .name     = "readv",      .errmsg = true,
1214           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1215         { .name     = "recvfrom",   .errmsg = true,
1216           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1217                              [3] = SCA_MSG_FLAGS, /* flags */ }, },
1218         { .name     = "recvmmsg",   .errmsg = true,
1219           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1220                              [3] = SCA_MSG_FLAGS, /* flags */ }, },
1221         { .name     = "recvmsg",    .errmsg = true,
1222           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1223                              [2] = SCA_MSG_FLAGS, /* flags */ }, },
1224         { .name     = "removexattr", .errmsg = true,
1225           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1226         { .name     = "renameat",   .errmsg = true,
1227           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1228         { .name     = "rmdir",    .errmsg = true,
1229           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1230         { .name     = "rt_sigaction", .errmsg = true,
1231           .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
1232         { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
1233         { .name     = "rt_sigqueueinfo", .errmsg = true,
1234           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1235         { .name     = "rt_tgsigqueueinfo", .errmsg = true,
1236           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1237         { .name     = "select",     .errmsg = true, .timeout = true, },
1238         { .name     = "sendmmsg",    .errmsg = true,
1239           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1240                              [3] = SCA_MSG_FLAGS, /* flags */ }, },
1241         { .name     = "sendmsg",    .errmsg = true,
1242           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1243                              [2] = SCA_MSG_FLAGS, /* flags */ }, },
1244         { .name     = "sendto",     .errmsg = true,
1245           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1246                              [3] = SCA_MSG_FLAGS, /* flags */ }, },
1247         { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1248         { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1249         { .name     = "setxattr",   .errmsg = true,
1250           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1251         { .name     = "shutdown",   .errmsg = true,
1252           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1253         { .name     = "socket",     .errmsg = true,
1254           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1255                              [1] = SCA_SK_TYPE, /* type */ },
1256           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
1257         { .name     = "socketpair", .errmsg = true,
1258           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1259                              [1] = SCA_SK_TYPE, /* type */ },
1260           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
1261         { .name     = "stat",       .errmsg = true, .alias = "newstat",
1262           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1263         { .name     = "statfs",     .errmsg = true,
1264           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1265         { .name     = "swapoff",    .errmsg = true,
1266           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
1267         { .name     = "swapon",     .errmsg = true,
1268           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
1269         { .name     = "symlinkat",  .errmsg = true,
1270           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1271         { .name     = "tgkill",     .errmsg = true,
1272           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1273         { .name     = "tkill",      .errmsg = true,
1274           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1275         { .name     = "truncate",   .errmsg = true,
1276           .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
1277         { .name     = "uname",      .errmsg = true, .alias = "newuname", },
1278         { .name     = "unlinkat",   .errmsg = true,
1279           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1280                              [1] = SCA_FILENAME, /* pathname */ }, },
1281         { .name     = "utime",  .errmsg = true,
1282           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1283         { .name     = "utimensat",  .errmsg = true,
1284           .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */
1285                              [1] = SCA_FILENAME, /* filename */ }, },
1286         { .name     = "utimes",  .errmsg = true,
1287           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1288         { .name     = "vmsplice",  .errmsg = true,
1289           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1290         { .name     = "write",      .errmsg = true,
1291           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1292         { .name     = "writev",     .errmsg = true,
1293           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1294 };
1295
1296 static int syscall_fmt__cmp(const void *name, const void *fmtp)
1297 {
1298         const struct syscall_fmt *fmt = fmtp;
1299         return strcmp(name, fmt->name);
1300 }
1301
1302 static struct syscall_fmt *syscall_fmt__find(const char *name)
1303 {
1304         const int nmemb = ARRAY_SIZE(syscall_fmts);
1305         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1306 }
1307
/*
 * Per-syscall description used when formatting arguments.
 * NOTE(review): field meanings below that are not derived from
 * syscall__set_arg_fmts() are inferred from names - confirm against users.
 */
1308 struct syscall {
1309         struct event_format *tp_format;
1310         int                 nr_args;       /* number of slots allocated for arg_scnprintf */
1311         struct format_field *args;         /* argument fields, walked through ->next */
1312         const char          *name;
1313         bool                is_exit;
1314         struct syscall_fmt  *fmt;          /* optional formatting overrides, may be NULL */
1315         size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
1316         void                **arg_parm;    /* set to fmt->arg_parm when fmt is non-NULL */
1317 };
1318
1319 static size_t fprintf_duration(unsigned long t, FILE *fp)
1320 {
1321         double duration = (double)t / NSEC_PER_MSEC;
1322         size_t printed = fprintf(fp, "(");
1323
1324         if (duration >= 1.0)
1325                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1326         else if (duration >= 0.01)
1327                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1328         else
1329                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1330         return printed + fprintf(fp, "): ");
1331 }
1332
1333 /**
1334  * filename.ptr: The filename char pointer that will be vfs_getname'd
1335  * filename.entry_str_pos: Where to insert the string translated from
1336  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
1337  */
/*
 * Per-thread tracing state; stored as the thread's private data and created
 * on demand by thread__trace() through thread_trace__new().
 */
1338 struct thread_trace {
1339         u64               entry_time;
1340         u64               exit_time;
1341         bool              entry_pending;
1342         unsigned long     nr_events; /* incremented by every thread__trace() call */
1343         unsigned long     pfmaj, pfmin;
1344         char              *entry_str;
1345         double            runtime_ms;
1346         struct {
1347                 unsigned long ptr; /* set by thread__set_filename_pos() */
1348                 short int     entry_str_pos; /* offset into entry_str, set alongside ptr */
1349                 bool          pending_open;
1350                 unsigned int  namelen;
1351                 char          *name;
1352         } filename;
1353         struct {
1354                 int       max; /* highest fd with a slot in table; -1 while table is empty */
1355                 char      **table; /* strdup'ed path per fd, NULL when not known */
1356         } paths;
1357
1358         struct intlist *syscall_stats; /* allocated with intlist__new(NULL) */
1359 };
1360
1361 static struct thread_trace *thread_trace__new(void)
1362 {
1363         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1364
1365         if (ttrace)
1366                 ttrace->paths.max = -1;
1367
1368         ttrace->syscall_stats = intlist__new(NULL);
1369
1370         return ttrace;
1371 }
1372
1373 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1374 {
1375         struct thread_trace *ttrace;
1376
1377         if (thread == NULL)
1378                 goto fail;
1379
1380         if (thread__priv(thread) == NULL)
1381                 thread__set_priv(thread, thread_trace__new());
1382
1383         if (thread__priv(thread) == NULL)
1384                 goto fail;
1385
1386         ttrace = thread__priv(thread);
1387         ++ttrace->nr_events;
1388
1389         return ttrace;
1390 fail:
1391         color_fprintf(fp, PERF_COLOR_RED,
1392                       "WARNING: not enough memory, dropping samples!\n");
1393         return NULL;
1394 }
1395
/* NOTE(review): presumably the bits stored in trace->trace_pgfaults - confirm. */
1396 #define TRACE_PFMAJ             (1 << 0)
1397 #define TRACE_PFMIN             (1 << 1)
1398
1399 static const size_t trace__entry_str_size = 2048;
1400
/*
 * State of one 'perf trace' session.  Only the members used by the nearby
 * helpers are annotated; the rest are consumed further down the file.
 */
1401 struct trace {
1402         struct perf_tool        tool; /* embedded; trace__tool_process() uses container_of() on it */
1403         struct {
1404                 int             machine;
1405                 int             open_id;
1406         }                       audit;
1407         struct {
1408                 int             max;
1409                 struct syscall  *table;
1410                 struct {
1411                         struct perf_evsel *sys_enter,
1412                                           *sys_exit;
1413                 }               events;
1414         } syscalls;
1415         struct record_opts      opts;
1416         struct perf_evlist      *evlist;
1417         struct machine          *host; /* created by trace__symbols_init() */
1418         struct thread           *current;
1419         u64                     base_time; /* subtracted from timestamps before printing */
1420         FILE                    *output;
1421         unsigned long           nr_events;
1422         struct strlist          *ev_qualifier;
1423         struct {
1424                 size_t          nr;
1425                 int             *entries;
1426         }                       ev_qualifier_ids;
1427         struct intlist          *tid_list;
1428         struct intlist          *pid_list;
1429         struct {
1430                 size_t          nr;
1431                 pid_t           *entries;
1432         }                       filter_pids;
1433         double                  duration_filter; /* scaled by NSEC_PER_MSEC in trace__filter_duration() */
1434         double                  runtime_ms;
1435         struct {
1436                 u64             vfs_getname,
1437                                 proc_getname; /* counts fd path lookups attempted via /proc */
1438         } stats;
1439         bool                    not_ev_qualifier;
1440         bool                    live; /* fd paths are only read from /proc when set */
1441         bool                    full_time;
1442         bool                    sched;
1443         bool                    multiple_threads; /* print the tid (and comm) on each entry */
1444         bool                    summary;
1445         bool                    summary_only;
1446         bool                    show_comm;
1447         bool                    show_tool_stats;
1448         bool                    trace_syscalls;
1449         bool                    force;
1450         bool                    vfs_getname; /* filename args are filled in later instead of printed as pointers */
1451         int                     trace_pgfaults;
1452 };
1453
1454 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1455 {
1456         struct thread_trace *ttrace = thread__priv(thread);
1457
1458         if (fd > ttrace->paths.max) {
1459                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
1460
1461                 if (npath == NULL)
1462                         return -1;
1463
1464                 if (ttrace->paths.max != -1) {
1465                         memset(npath + ttrace->paths.max + 1, 0,
1466                                (fd - ttrace->paths.max) * sizeof(char *));
1467                 } else {
1468                         memset(npath, 0, (fd + 1) * sizeof(char *));
1469                 }
1470
1471                 ttrace->paths.table = npath;
1472                 ttrace->paths.max   = fd;
1473         }
1474
1475         ttrace->paths.table[fd] = strdup(pathname);
1476
1477         return ttrace->paths.table[fd] != NULL ? 0 : -1;
1478 }
1479
1480 static int thread__read_fd_path(struct thread *thread, int fd)
1481 {
1482         char linkname[PATH_MAX], pathname[PATH_MAX];
1483         struct stat st;
1484         int ret;
1485
1486         if (thread->pid_ == thread->tid) {
1487                 scnprintf(linkname, sizeof(linkname),
1488                           "/proc/%d/fd/%d", thread->pid_, fd);
1489         } else {
1490                 scnprintf(linkname, sizeof(linkname),
1491                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1492         }
1493
1494         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1495                 return -1;
1496
1497         ret = readlink(linkname, pathname, sizeof(pathname));
1498
1499         if (ret < 0 || ret > st.st_size)
1500                 return -1;
1501
1502         pathname[ret] = '\0';
1503         return trace__set_fd_pathname(thread, fd, pathname);
1504 }
1505
1506 static const char *thread__fd_path(struct thread *thread, int fd,
1507                                    struct trace *trace)
1508 {
1509         struct thread_trace *ttrace = thread__priv(thread);
1510
1511         if (ttrace == NULL)
1512                 return NULL;
1513
1514         if (fd < 0)
1515                 return NULL;
1516
1517         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1518                 if (!trace->live)
1519                         return NULL;
1520                 ++trace->stats.proc_getname;
1521                 if (thread__read_fd_path(thread, fd))
1522                         return NULL;
1523         }
1524
1525         return ttrace->paths.table[fd];
1526 }
1527
1528 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1529                                         struct syscall_arg *arg)
1530 {
1531         int fd = arg->val;
1532         size_t printed = scnprintf(bf, size, "%d", fd);
1533         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1534
1535         if (path)
1536                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1537
1538         return printed;
1539 }
1540
1541 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1542                                               struct syscall_arg *arg)
1543 {
1544         int fd = arg->val;
1545         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1546         struct thread_trace *ttrace = thread__priv(arg->thread);
1547
1548         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1549                 zfree(&ttrace->paths.table[fd]);
1550
1551         return printed;
1552 }
1553
1554 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1555                                      unsigned long ptr)
1556 {
1557         struct thread_trace *ttrace = thread__priv(thread);
1558
1559         ttrace->filename.ptr = ptr;
1560         ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1561 }
1562
1563 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1564                                               struct syscall_arg *arg)
1565 {
1566         unsigned long ptr = arg->val;
1567
1568         if (!arg->trace->vfs_getname)
1569                 return scnprintf(bf, size, "%#x", ptr);
1570
1571         thread__set_filename_pos(arg->thread, bf, ptr);
1572         return 0;
1573 }
1574
1575 static bool trace__filter_duration(struct trace *trace, double t)
1576 {
1577         return t < (trace->duration_filter * NSEC_PER_MSEC);
1578 }
1579
1580 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1581 {
1582         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1583
1584         return fprintf(fp, "%10.3f ", ts);
1585 }
1586
/*
 * Flags written from the signal handler and polled by the rest of the tool.
 * volatile sig_atomic_t is the object type the C standard allows a signal
 * handler to assign to; a plain bool gives no such guarantee and lets the
 * compiler cache the value across a polling loop.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/*
 * Signal handler: request termination and remember whether the signal was
 * SIGINT.
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1595
1596 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1597                                         u64 duration, u64 tstamp, FILE *fp)
1598 {
1599         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1600         printed += fprintf_duration(duration, fp);
1601
1602         if (trace->multiple_threads) {
1603                 if (trace->show_comm)
1604                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1605                 printed += fprintf(fp, "%d ", thread->tid);
1606         }
1607
1608         return printed;
1609 }
1610
1611 static int trace__process_event(struct trace *trace, struct machine *machine,
1612                                 union perf_event *event, struct perf_sample *sample)
1613 {
1614         int ret = 0;
1615
1616         switch (event->header.type) {
1617         case PERF_RECORD_LOST:
1618                 color_fprintf(trace->output, PERF_COLOR_RED,
1619                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1620                 ret = machine__process_lost_event(machine, event, sample);
1621         default:
1622                 ret = machine__process_event(machine, event, sample);
1623                 break;
1624         }
1625
1626         return ret;
1627 }
1628
1629 static int trace__tool_process(struct perf_tool *tool,
1630                                union perf_event *event,
1631                                struct perf_sample *sample,
1632                                struct machine *machine)
1633 {
1634         struct trace *trace = container_of(tool, struct trace, tool);
1635         return trace__process_event(trace, machine, event, sample);
1636 }
1637
1638 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1639 {
1640         int err = symbol__init(NULL);
1641
1642         if (err)
1643                 return err;
1644
1645         trace->host = machine__new_host();
1646         if (trace->host == NULL)
1647                 return -ENOMEM;
1648
1649         if (trace_event__register_resolver(trace->host, machine__resolve_kernel_addr) < 0)
1650                 return -errno;
1651
1652         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1653                                             evlist->threads, trace__tool_process, false,
1654                                             trace->opts.proc_map_timeout);
1655         if (err)
1656                 symbol__exit();
1657
1658         return err;
1659 }
1660
1661 static int syscall__set_arg_fmts(struct syscall *sc)
1662 {
1663         struct format_field *field;
1664         int idx = 0;
1665
1666         sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1667         if (sc->arg_scnprintf == NULL)
1668                 return -1;
1669
1670         if (sc->fmt)
1671                 sc->arg_parm = sc->fmt->arg_parm;
1672
1673         for (field = sc->args; field; field = field->next) {
1674                 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1675                         sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1676                 else if (field->flags & FIELD_IS_POINTER)
1677                         sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1678                 ++idx;
1679         }
1680
1681         return 0;
1682 }
1683
1684 static int trace__read_syscall_info(struct trace *trace, int id)
1685 {
1686         char tp_name[128];
1687         struct syscall *sc;
1688         const char *name = audit_syscall_to_name(id, trace->audit.machine);
1689
1690         if (name == NULL)
1691                 return -1;
1692
1693         if (id > trace->syscalls.max) {
1694                 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1695
1696                 if (nsyscalls == NULL)
1697                         return -1;
1698
1699                 if (trace->syscalls.max != -1) {
1700                         memset(nsyscalls + trace->syscalls.max + 1, 0,
1701                                (id - trace->syscalls.max) * sizeof(*sc));
1702                 } else {
1703                         memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1704                 }
1705
1706                 trace->syscalls.table = nsyscalls;
1707                 trace->syscalls.max   = id;
1708         }
1709
1710         sc = trace->syscalls.table + id;
1711         sc->name = name;
1712
1713         sc->fmt  = syscall_fmt__find(sc->name);
1714
1715         snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1716         sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1717
1718         if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1719                 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1720                 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1721         }
1722
1723         if (IS_ERR(sc->tp_format))
1724                 return -1;
1725
1726         sc->args = sc->tp_format->format.fields;
1727         sc->nr_args = sc->tp_format->format.nr_fields;
1728         /*
1729          * We need to check and discard the first variable '__syscall_nr'
1730          * or 'nr' that mean the syscall number. It is needless here.
1731          * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1732          */
1733         if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1734                 sc->args = sc->args->next;
1735                 --sc->nr_args;
1736         }
1737
1738         sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1739
1740         return syscall__set_arg_fmts(sc);
1741 }
1742
1743 static int trace__validate_ev_qualifier(struct trace *trace)
1744 {
1745         int err = 0, i;
1746         struct str_node *pos;
1747
1748         trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1749         trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1750                                                  sizeof(trace->ev_qualifier_ids.entries[0]));
1751
1752         if (trace->ev_qualifier_ids.entries == NULL) {
1753                 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1754                        trace->output);
1755                 err = -EINVAL;
1756                 goto out;
1757         }
1758
1759         i = 0;
1760
1761         strlist__for_each(pos, trace->ev_qualifier) {
1762                 const char *sc = pos->s;
1763                 int id = audit_name_to_syscall(sc, trace->audit.machine);
1764
1765                 if (id < 0) {
1766                         if (err == 0) {
1767                                 fputs("Error:\tInvalid syscall ", trace->output);
1768                                 err = -EINVAL;
1769                         } else {
1770                                 fputs(", ", trace->output);
1771                         }
1772
1773                         fputs(sc, trace->output);
1774                 }
1775
1776                 trace->ev_qualifier_ids.entries[i++] = id;
1777         }
1778
1779         if (err < 0) {
1780                 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1781                       "\nHint:\tand: 'man syscalls'\n", trace->output);
1782                 zfree(&trace->ev_qualifier_ids.entries);
1783                 trace->ev_qualifier_ids.nr = 0;
1784         }
1785 out:
1786         return err;
1787 }
1788
1789 /*
1790  * args is to be interpreted as a series of longs but we need to handle
1791  * 8-byte unaligned accesses. args points to raw_data within the event
1792  * and raw_data is guaranteed to be 8-byte unaligned because it is
1793  * preceded by raw_size which is a u32. So we need to copy args to a temp
1794  * variable to read it. Most notably this avoids extended load instructions
1795  * on unaligned addresses
1796  */
1797
1798 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1799                                       unsigned char *args, struct trace *trace,
1800                                       struct thread *thread)
1801 {
1802         size_t printed = 0;
1803         unsigned char *p;
1804         unsigned long val;
1805
1806         if (sc->args != NULL) {
1807                 struct format_field *field;
1808                 u8 bit = 1;
1809                 struct syscall_arg arg = {
1810                         .idx    = 0,
1811                         .mask   = 0,
1812                         .trace  = trace,
1813                         .thread = thread,
1814                 };
1815
1816                 for (field = sc->args; field;
1817                      field = field->next, ++arg.idx, bit <<= 1) {
1818                         if (arg.mask & bit)
1819                                 continue;
1820
1821                         /* special care for unaligned accesses */
1822                         p = args + sizeof(unsigned long) * arg.idx;
1823                         memcpy(&val, p, sizeof(val));
1824
1825                         /*
1826                          * Suppress this argument if its value is zero and
1827                          * and we don't have a string associated in an
1828                          * strarray for it.
1829                          */
1830                         if (val == 0 &&
1831                             !(sc->arg_scnprintf &&
1832                               sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1833                               sc->arg_parm[arg.idx]))
1834                                 continue;
1835
1836                         printed += scnprintf(bf + printed, size - printed,
1837                                              "%s%s: ", printed ? ", " : "", field->name);
1838                         if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1839                                 arg.val = val;
1840                                 if (sc->arg_parm)
1841                                         arg.parm = sc->arg_parm[arg.idx];
1842                                 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1843                                                                       size - printed, &arg);
1844                         } else {
1845                                 printed += scnprintf(bf + printed, size - printed,
1846                                                      "%ld", val);
1847                         }
1848                 }
1849         } else {
1850                 int i = 0;
1851
1852                 while (i < 6) {
1853                         /* special care for unaligned accesses */
1854                         p = args + sizeof(unsigned long) * i;
1855                         memcpy(&val, p, sizeof(val));
1856                         printed += scnprintf(bf + printed, size - printed,
1857                                              "%sarg%d: %ld",
1858                                              printed ? ", " : "", i, val);
1859                         ++i;
1860                 }
1861         }
1862
1863         return printed;
1864 }
1865
/*
 * Signature of the per-event sample handlers in this file; a pointer of
 * this type is read from evsel->handler and invoked by
 * trace__process_sample().
 */
typedef int (*tracepoint_handler)(struct trace *trace,
				  struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1869
1870 static struct syscall *trace__syscall_info(struct trace *trace,
1871                                            struct perf_evsel *evsel, int id)
1872 {
1873
1874         if (id < 0) {
1875
1876                 /*
1877                  * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1878                  * before that, leaving at a higher verbosity level till that is
1879                  * explained. Reproduced with plain ftrace with:
1880                  *
1881                  * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1882                  * grep "NR -1 " /t/trace_pipe
1883                  *
1884                  * After generating some load on the machine.
1885                  */
1886                 if (verbose > 1) {
1887                         static u64 n;
1888                         fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1889                                 id, perf_evsel__name(evsel), ++n);
1890                 }
1891                 return NULL;
1892         }
1893
1894         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1895             trace__read_syscall_info(trace, id))
1896                 goto out_cant_read;
1897
1898         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1899                 goto out_cant_read;
1900
1901         return &trace->syscalls.table[id];
1902
1903 out_cant_read:
1904         if (verbose) {
1905                 fprintf(trace->output, "Problems reading syscall %d", id);
1906                 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1907                         fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1908                 fputs(" information\n", trace->output);
1909         }
1910         return NULL;
1911 }
1912
1913 static void thread__update_stats(struct thread_trace *ttrace,
1914                                  int id, struct perf_sample *sample)
1915 {
1916         struct int_node *inode;
1917         struct stats *stats;
1918         u64 duration = 0;
1919
1920         inode = intlist__findnew(ttrace->syscall_stats, id);
1921         if (inode == NULL)
1922                 return;
1923
1924         stats = inode->priv;
1925         if (stats == NULL) {
1926                 stats = malloc(sizeof(struct stats));
1927                 if (stats == NULL)
1928                         return;
1929                 init_stats(stats);
1930                 inode->priv = stats;
1931         }
1932
1933         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1934                 duration = sample->time - ttrace->entry_time;
1935
1936         update_stats(stats, duration);
1937 }
1938
1939 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1940 {
1941         struct thread_trace *ttrace;
1942         u64 duration;
1943         size_t printed;
1944
1945         if (trace->current == NULL)
1946                 return 0;
1947
1948         ttrace = thread__priv(trace->current);
1949
1950         if (!ttrace->entry_pending)
1951                 return 0;
1952
1953         duration = sample->time - ttrace->entry_time;
1954
1955         printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1956         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1957         ttrace->entry_pending = false;
1958
1959         return printed;
1960 }
1961
1962 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1963                             union perf_event *event __maybe_unused,
1964                             struct perf_sample *sample)
1965 {
1966         char *msg;
1967         void *args;
1968         size_t printed = 0;
1969         struct thread *thread;
1970         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1971         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1972         struct thread_trace *ttrace;
1973
1974         if (sc == NULL)
1975                 return -1;
1976
1977         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1978         ttrace = thread__trace(thread, trace->output);
1979         if (ttrace == NULL)
1980                 goto out_put;
1981
1982         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1983
1984         if (ttrace->entry_str == NULL) {
1985                 ttrace->entry_str = malloc(trace__entry_str_size);
1986                 if (!ttrace->entry_str)
1987                         goto out_put;
1988         }
1989
1990         if (!trace->summary_only)
1991                 trace__printf_interrupted_entry(trace, sample);
1992
1993         ttrace->entry_time = sample->time;
1994         msg = ttrace->entry_str;
1995         printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1996
1997         printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1998                                            args, trace, thread);
1999
2000         if (sc->is_exit) {
2001                 if (!trace->duration_filter && !trace->summary_only) {
2002                         trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
2003                         fprintf(trace->output, "%-70s\n", ttrace->entry_str);
2004                 }
2005         } else {
2006                 ttrace->entry_pending = true;
2007                 /* See trace__vfs_getname & trace__sys_exit */
2008                 ttrace->filename.pending_open = false;
2009         }
2010
2011         if (trace->current != thread) {
2012                 thread__put(trace->current);
2013                 trace->current = thread__get(thread);
2014         }
2015         err = 0;
2016 out_put:
2017         thread__put(thread);
2018         return err;
2019 }
2020
2021 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
2022                            union perf_event *event __maybe_unused,
2023                            struct perf_sample *sample)
2024 {
2025         long ret;
2026         u64 duration = 0;
2027         struct thread *thread;
2028         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
2029         struct syscall *sc = trace__syscall_info(trace, evsel, id);
2030         struct thread_trace *ttrace;
2031
2032         if (sc == NULL)
2033                 return -1;
2034
2035         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2036         ttrace = thread__trace(thread, trace->output);
2037         if (ttrace == NULL)
2038                 goto out_put;
2039
2040         if (trace->summary)
2041                 thread__update_stats(ttrace, id, sample);
2042
2043         ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
2044
2045         if (id == trace->audit.open_id && ret >= 0 && ttrace->filename.pending_open) {
2046                 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
2047                 ttrace->filename.pending_open = false;
2048                 ++trace->stats.vfs_getname;
2049         }
2050
2051         ttrace->exit_time = sample->time;
2052
2053         if (ttrace->entry_time) {
2054                 duration = sample->time - ttrace->entry_time;
2055                 if (trace__filter_duration(trace, duration))
2056                         goto out;
2057         } else if (trace->duration_filter)
2058                 goto out;
2059
2060         if (trace->summary_only)
2061                 goto out;
2062
2063         trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
2064
2065         if (ttrace->entry_pending) {
2066                 fprintf(trace->output, "%-70s", ttrace->entry_str);
2067         } else {
2068                 fprintf(trace->output, " ... [");
2069                 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
2070                 fprintf(trace->output, "]: %s()", sc->name);
2071         }
2072
2073         if (sc->fmt == NULL) {
2074 signed_print:
2075                 fprintf(trace->output, ") = %ld", ret);
2076         } else if (ret < 0 && sc->fmt->errmsg) {
2077                 char bf[STRERR_BUFSIZE];
2078                 const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
2079                            *e = audit_errno_to_name(-ret);
2080
2081                 fprintf(trace->output, ") = -1 %s %s", e, emsg);
2082         } else if (ret == 0 && sc->fmt->timeout)
2083                 fprintf(trace->output, ") = 0 Timeout");
2084         else if (sc->fmt->hexret)
2085                 fprintf(trace->output, ") = %#lx", ret);
2086         else
2087                 goto signed_print;
2088
2089         fputc('\n', trace->output);
2090 out:
2091         ttrace->entry_pending = false;
2092         err = 0;
2093 out_put:
2094         thread__put(thread);
2095         return err;
2096 }
2097
2098 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
2099                               union perf_event *event __maybe_unused,
2100                               struct perf_sample *sample)
2101 {
2102         struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2103         struct thread_trace *ttrace;
2104         size_t filename_len, entry_str_len, to_move;
2105         ssize_t remaining_space;
2106         char *pos;
2107         const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
2108
2109         if (!thread)
2110                 goto out;
2111
2112         ttrace = thread__priv(thread);
2113         if (!ttrace)
2114                 goto out;
2115
2116         filename_len = strlen(filename);
2117
2118         if (ttrace->filename.namelen < filename_len) {
2119                 char *f = realloc(ttrace->filename.name, filename_len + 1);
2120
2121                 if (f == NULL)
2122                                 goto out;
2123
2124                 ttrace->filename.namelen = filename_len;
2125                 ttrace->filename.name = f;
2126         }
2127
2128         strcpy(ttrace->filename.name, filename);
2129         ttrace->filename.pending_open = true;
2130
2131         if (!ttrace->filename.ptr)
2132                 goto out;
2133
2134         entry_str_len = strlen(ttrace->entry_str);
2135         remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
2136         if (remaining_space <= 0)
2137                 goto out;
2138
2139         if (filename_len > (size_t)remaining_space) {
2140                 filename += filename_len - remaining_space;
2141                 filename_len = remaining_space;
2142         }
2143
2144         to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
2145         pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
2146         memmove(pos + filename_len, pos, to_move);
2147         memcpy(pos, filename, filename_len);
2148
2149         ttrace->filename.ptr = 0;
2150         ttrace->filename.entry_str_pos = 0;
2151 out:
2152         return 0;
2153 }
2154
2155 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
2156                                      union perf_event *event __maybe_unused,
2157                                      struct perf_sample *sample)
2158 {
2159         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
2160         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
2161         struct thread *thread = machine__findnew_thread(trace->host,
2162                                                         sample->pid,
2163                                                         sample->tid);
2164         struct thread_trace *ttrace = thread__trace(thread, trace->output);
2165
2166         if (ttrace == NULL)
2167                 goto out_dump;
2168
2169         ttrace->runtime_ms += runtime_ms;
2170         trace->runtime_ms += runtime_ms;
2171         thread__put(thread);
2172         return 0;
2173
2174 out_dump:
2175         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
2176                evsel->name,
2177                perf_evsel__strval(evsel, sample, "comm"),
2178                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
2179                runtime,
2180                perf_evsel__intval(evsel, sample, "vruntime"));
2181         thread__put(thread);
2182         return 0;
2183 }
2184
2185 static void bpf_output__printer(enum binary_printer_ops op,
2186                                 unsigned int val, void *extra)
2187 {
2188         FILE *output = extra;
2189         unsigned char ch = (unsigned char)val;
2190
2191         switch (op) {
2192         case BINARY_PRINT_CHAR_DATA:
2193                 fprintf(output, "%c", isprint(ch) ? ch : '.');
2194                 break;
2195         case BINARY_PRINT_DATA_BEGIN:
2196         case BINARY_PRINT_LINE_BEGIN:
2197         case BINARY_PRINT_ADDR:
2198         case BINARY_PRINT_NUM_DATA:
2199         case BINARY_PRINT_NUM_PAD:
2200         case BINARY_PRINT_SEP:
2201         case BINARY_PRINT_CHAR_PAD:
2202         case BINARY_PRINT_LINE_END:
2203         case BINARY_PRINT_DATA_END:
2204         default:
2205                 break;
2206         }
2207 }
2208
2209 static void bpf_output__fprintf(struct trace *trace,
2210                                 struct perf_sample *sample)
2211 {
2212         print_binary(sample->raw_data, sample->raw_size, 8,
2213                      bpf_output__printer, trace->output);
2214 }
2215
2216 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
2217                                 union perf_event *event __maybe_unused,
2218                                 struct perf_sample *sample)
2219 {
2220         trace__printf_interrupted_entry(trace, sample);
2221         trace__fprintf_tstamp(trace, sample->time, trace->output);
2222
2223         if (trace->trace_syscalls)
2224                 fprintf(trace->output, "(         ): ");
2225
2226         fprintf(trace->output, "%s:", evsel->name);
2227
2228         if (perf_evsel__is_bpf_output(evsel)) {
2229                 bpf_output__fprintf(trace, sample);
2230         } else if (evsel->tp_format) {
2231                 event_format__fprintf(evsel->tp_format, sample->cpu,
2232                                       sample->raw_data, sample->raw_size,
2233                                       trace->output);
2234         }
2235
2236         fprintf(trace->output, ")\n");
2237         return 0;
2238 }
2239
2240 static void print_location(FILE *f, struct perf_sample *sample,
2241                            struct addr_location *al,
2242                            bool print_dso, bool print_sym)
2243 {
2244
2245         if ((verbose || print_dso) && al->map)
2246                 fprintf(f, "%s@", al->map->dso->long_name);
2247
2248         if ((verbose || print_sym) && al->sym)
2249                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2250                         al->addr - al->sym->start);
2251         else if (al->map)
2252                 fprintf(f, "0x%" PRIx64, al->addr);
2253         else
2254                 fprintf(f, "0x%" PRIx64, sample->addr);
2255 }
2256
2257 static int trace__pgfault(struct trace *trace,
2258                           struct perf_evsel *evsel,
2259                           union perf_event *event __maybe_unused,
2260                           struct perf_sample *sample)
2261 {
2262         struct thread *thread;
2263         struct addr_location al;
2264         char map_type = 'd';
2265         struct thread_trace *ttrace;
2266         int err = -1;
2267
2268         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2269         ttrace = thread__trace(thread, trace->output);
2270         if (ttrace == NULL)
2271                 goto out_put;
2272
2273         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2274                 ttrace->pfmaj++;
2275         else
2276                 ttrace->pfmin++;
2277
2278         if (trace->summary_only)
2279                 goto out;
2280
2281         thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
2282                               sample->ip, &al);
2283
2284         trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
2285
2286         fprintf(trace->output, "%sfault [",
2287                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2288                 "maj" : "min");
2289
2290         print_location(trace->output, sample, &al, false, true);
2291
2292         fprintf(trace->output, "] => ");
2293
2294         thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
2295                                    sample->addr, &al);
2296
2297         if (!al.map) {
2298                 thread__find_addr_location(thread, sample->cpumode,
2299                                            MAP__FUNCTION, sample->addr, &al);
2300
2301                 if (al.map)
2302                         map_type = 'x';
2303                 else
2304                         map_type = '?';
2305         }
2306
2307         print_location(trace->output, sample, &al, true, false);
2308
2309         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2310 out:
2311         err = 0;
2312 out_put:
2313         thread__put(thread);
2314         return err;
2315 }
2316
2317 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
2318 {
2319         if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
2320             (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
2321                 return false;
2322
2323         if (trace->pid_list || trace->tid_list)
2324                 return true;
2325
2326         return false;
2327 }
2328
2329 static int trace__process_sample(struct perf_tool *tool,
2330                                  union perf_event *event,
2331                                  struct perf_sample *sample,
2332                                  struct perf_evsel *evsel,
2333                                  struct machine *machine __maybe_unused)
2334 {
2335         struct trace *trace = container_of(tool, struct trace, tool);
2336         int err = 0;
2337
2338         tracepoint_handler handler = evsel->handler;
2339
2340         if (skip_sample(trace, sample))
2341                 return 0;
2342
2343         if (!trace->full_time && trace->base_time == 0)
2344                 trace->base_time = sample->time;
2345
2346         if (handler) {
2347                 ++trace->nr_events;
2348                 handler(trace, evsel, event, sample);
2349         }
2350
2351         return err;
2352 }
2353
2354 static int parse_target_str(struct trace *trace)
2355 {
2356         if (trace->opts.target.pid) {
2357                 trace->pid_list = intlist__new(trace->opts.target.pid);
2358                 if (trace->pid_list == NULL) {
2359                         pr_err("Error parsing process id string\n");
2360                         return -EINVAL;
2361                 }
2362         }
2363
2364         if (trace->opts.target.tid) {
2365                 trace->tid_list = intlist__new(trace->opts.target.tid);
2366                 if (trace->tid_list == NULL) {
2367                         pr_err("Error parsing thread id string\n");
2368                         return -EINVAL;
2369                 }
2370         }
2371
2372         return 0;
2373 }
2374
2375 static int trace__record(struct trace *trace, int argc, const char **argv)
2376 {
2377         unsigned int rec_argc, i, j;
2378         const char **rec_argv;
2379         const char * const record_args[] = {
2380                 "record",
2381                 "-R",
2382                 "-m", "1024",
2383                 "-c", "1",
2384         };
2385
2386         const char * const sc_args[] = { "-e", };
2387         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2388         const char * const majpf_args[] = { "-e", "major-faults" };
2389         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2390         const char * const minpf_args[] = { "-e", "minor-faults" };
2391         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2392
2393         /* +1 is for the event string below */
2394         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2395                 majpf_args_nr + minpf_args_nr + argc;
2396         rec_argv = calloc(rec_argc + 1, sizeof(char *));
2397
2398         if (rec_argv == NULL)
2399                 return -ENOMEM;
2400
2401         j = 0;
2402         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2403                 rec_argv[j++] = record_args[i];
2404
2405         if (trace->trace_syscalls) {
2406                 for (i = 0; i < sc_args_nr; i++)
2407                         rec_argv[j++] = sc_args[i];
2408
2409                 /* event string may be different for older kernels - e.g., RHEL6 */
2410                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2411                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2412                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2413                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2414                 else {
2415                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2416                         return -1;
2417                 }
2418         }
2419
2420         if (trace->trace_pgfaults & TRACE_PFMAJ)
2421                 for (i = 0; i < majpf_args_nr; i++)
2422                         rec_argv[j++] = majpf_args[i];
2423
2424         if (trace->trace_pgfaults & TRACE_PFMIN)
2425                 for (i = 0; i < minpf_args_nr; i++)
2426                         rec_argv[j++] = minpf_args[i];
2427
2428         for (i = 0; i < (unsigned int)argc; i++)
2429                 rec_argv[j++] = argv[i];
2430
2431         return cmd_record(j, rec_argv, NULL);
2432 }
2433
2434 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2435
2436 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2437 {
2438         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2439
2440         if (IS_ERR(evsel))
2441                 return false;
2442
2443         if (perf_evsel__field(evsel, "pathname") == NULL) {
2444                 perf_evsel__delete(evsel);
2445                 return false;
2446         }
2447
2448         evsel->handler = trace__vfs_getname;
2449         perf_evlist__add(evlist, evsel);
2450         return true;
2451 }
2452
2453 static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
2454                                     u64 config)
2455 {
2456         struct perf_evsel *evsel;
2457         struct perf_event_attr attr = {
2458                 .type = PERF_TYPE_SOFTWARE,
2459                 .mmap_data = 1,
2460         };
2461
2462         attr.config = config;
2463         attr.sample_period = 1;
2464
2465         event_attr_init(&attr);
2466
2467         evsel = perf_evsel__new(&attr);
2468         if (!evsel)
2469                 return -ENOMEM;
2470
2471         evsel->handler = trace__pgfault;
2472         perf_evlist__add(evlist, evsel);
2473
2474         return 0;
2475 }
2476
2477 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2478 {
2479         const u32 type = event->header.type;
2480         struct perf_evsel *evsel;
2481
2482         if (!trace->full_time && trace->base_time == 0)
2483                 trace->base_time = sample->time;
2484
2485         if (type != PERF_RECORD_SAMPLE) {
2486                 trace__process_event(trace, trace->host, event, sample);
2487                 return;
2488         }
2489
2490         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2491         if (evsel == NULL) {
2492                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2493                 return;
2494         }
2495
2496         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2497             sample->raw_data == NULL) {
2498                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2499                        perf_evsel__name(evsel), sample->tid,
2500                        sample->cpu, sample->raw_size);
2501         } else {
2502                 tracepoint_handler handler = evsel->handler;
2503                 handler(trace, evsel, event, sample);
2504         }
2505 }
2506
2507 static int trace__add_syscall_newtp(struct trace *trace)
2508 {
2509         int ret = -1;
2510         struct perf_evlist *evlist = trace->evlist;
2511         struct perf_evsel *sys_enter, *sys_exit;
2512
2513         sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2514         if (sys_enter == NULL)
2515                 goto out;
2516
2517         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2518                 goto out_delete_sys_enter;
2519
2520         sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2521         if (sys_exit == NULL)
2522                 goto out_delete_sys_enter;
2523
2524         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2525                 goto out_delete_sys_exit;
2526
2527         perf_evlist__add(evlist, sys_enter);
2528         perf_evlist__add(evlist, sys_exit);
2529
2530         trace->syscalls.events.sys_enter = sys_enter;
2531         trace->syscalls.events.sys_exit  = sys_exit;
2532
2533         ret = 0;
2534 out:
2535         return ret;
2536
2537 out_delete_sys_exit:
2538         perf_evsel__delete_priv(sys_exit);
2539 out_delete_sys_enter:
2540         perf_evsel__delete_priv(sys_enter);
2541         goto out;
2542 }
2543
2544 static int trace__set_ev_qualifier_filter(struct trace *trace)
2545 {
2546         int err = -1;
2547         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2548                                                 trace->ev_qualifier_ids.nr,
2549                                                 trace->ev_qualifier_ids.entries);
2550
2551         if (filter == NULL)
2552                 goto out_enomem;
2553
2554         if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
2555                 err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);
2556
2557         free(filter);
2558 out:
2559         return err;
2560 out_enomem:
2561         errno = ENOMEM;
2562         goto out;
2563 }
2564
/*
 * Live tracing: populate trace->evlist with the events selected in @trace
 * (syscall tracepoints, probe:vfs_getname, major/minor page faults,
 * sched:sched_stat_runtime), open and mmap them, optionally fork the workload
 * given in @argv (when @argc > 0), then read events from the mmap'ed buffers
 * and hand each one to trace__handle_event() until interrupted or drained.
 *
 * Every exit path funnels through out_delete_evlist, which deletes the
 * evlist, sets trace->evlist to NULL and clears trace->live.
 *
 * Returns err as left by the last step that assigned it; the setup failure
 * paths all leave it non-zero.
 */
2565 static int trace__run(struct trace *trace, int argc, const char **argv)
2566 {
2567         struct perf_evlist *evlist = trace->evlist;
2568         struct perf_evsel *evsel;
2569         int err = -1, i;
2570         unsigned long before;
             /* Remaining command line arguments mean there is a workload to start */
2571         const bool forks = argc > 0;
2572         bool draining = false;
2573
2574         trace->live = true;
2575
2576         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2577                 goto out_error_raw_syscalls;
2578
             /* Not fatal when unavailable: the outcome is only recorded in trace->vfs_getname */
2579         if (trace->trace_syscalls)
2580                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2581
2582         if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
2583             perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
2584                 goto out_error_mem;
2585         }
2586
2587         if ((trace->trace_pgfaults & TRACE_PFMIN) &&
2588             perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
2589                 goto out_error_mem;
2590
2591         if (trace->sched &&
2592             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2593                                    trace__sched_stat_runtime))
2594                 goto out_error_sched_stat_runtime;
2595
2596         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2597         if (err < 0) {
2598                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2599                 goto out_delete_evlist;
2600         }
2601
2602         err = trace__symbols_init(trace, evlist);
2603         if (err < 0) {
2604                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2605                 goto out_delete_evlist;
2606         }
2607
2608         perf_evlist__config(evlist, &trace->opts);
2609
2610         signal(SIGCHLD, sig_handler);
2611         signal(SIGINT, sig_handler);
2612
2613         if (forks) {
2614                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2615                                                     argv, false, NULL);
2616                 if (err < 0) {
2617                         fprintf(trace->output, "Couldn't run the workload!\n");
2618                         goto out_delete_evlist;
2619                 }
2620         }
2621
2622         err = perf_evlist__open(evlist);
2623         if (err < 0)
2624                 goto out_error_open;
2625
2626         err = bpf__apply_obj_config();
2627         if (err) {
2628                 char errbuf[BUFSIZ];
2629
2630                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2631                 pr_err("ERROR: Apply config to BPF failed: %s\n",
2632                          errbuf);
                     /* NOTE(review): this also prints the perf_evlist__strerror_open() text for errno - confirm that is intended */
2633                 goto out_error_open;
2634         }
2635
2636         /*
2637          * Better not use !target__has_task() here because we need to cover the
2638          * case where no threads were specified in the command line, but a
2639          * workload was, and in that case we will fill in the thread_map when
2640          * we fork the workload in perf_evlist__prepare_workload.
2641          */
2642         if (trace->filter_pids.nr > 0)
2643                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2644         else if (thread_map__pid(evlist->threads, 0) == -1)
2645                 err = perf_evlist__set_filter_pid(evlist, getpid());
2646
2647         if (err < 0)
2648                 goto out_error_mem;
2649
2650         if (trace->ev_qualifier_ids.nr > 0) {
2651                 err = trace__set_ev_qualifier_filter(trace);
2652                 if (err < 0)
2653                         goto out_errno;
2654
2655                 pr_debug("event qualifier tracepoint filter: %s\n",
2656                          trace->syscalls.events.sys_exit->filter);
2657         }
2658
             /* On failure evsel points at the event whose filter was rejected (used in the error message below) */
2659         err = perf_evlist__apply_filters(evlist, &evsel);
2660         if (err < 0)
2661                 goto out_error_apply_filters;
2662
2663         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2664         if (err < 0)
2665                 goto out_error_mmap;
2666
2667         if (!target__none(&trace->opts.target))
2668                 perf_evlist__enable(evlist);
2669
2670         if (forks)
2671                 perf_evlist__start_workload(evlist);
2672
2673         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2674                                   evlist->threads->nr > 1 ||
2675                                   perf_evlist__first(evlist)->attr.inherit;
     /* Main loop: consume every mmap, then poll when a full pass read nothing */
2676 again:
             /* Snapshot, compared after the pass to tell whether any event was read */
2677         before = trace->nr_events;
2678
2679         for (i = 0; i < evlist->nr_mmaps; i++) {
2680                 union perf_event *event;
2681
2682                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2683                         struct perf_sample sample;
2684
2685                         ++trace->nr_events;
2686
2687                         err = perf_evlist__parse_sample(evlist, event, &sample);
2688                         if (err) {
2689                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2690                                 goto next_event;
2691                         }
2692
2693                         trace__handle_event(trace, event, &sample);
2694 next_event:
2695                         perf_evlist__mmap_consume(evlist, i);
2696
2697                         if (interrupted)
2698                                 goto out_disable;
2699
                             /* First time 'done' is seen: disable the events but keep consuming what is already buffered */
2700                         if (done && !draining) {
2701                                 perf_evlist__disable(evlist);
2702                                 draining = true;
2703                         }
2704                 }
2705         }
2706
             /* Nothing new in this pass: wait for more unless we are already draining */
2707         if (trace->nr_events == before) {
2708                 int timeout = done ? 100 : -1;
2709
2710                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2711                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2712                                 draining = true;
2713
2714                         goto again;
2715                 }
2716         } else {
2717                 goto again;
2718         }
2719
2720 out_disable:
2721         thread__zput(trace->current);
2722
2723         perf_evlist__disable(evlist);
2724
             /* The summary and tool stats are only printed when err is 0 at this point */
2725         if (!err) {
2726                 if (trace->summary)
2727                         trace__fprintf_thread_summary(trace, trace->output);
2728
2729                 if (trace->show_tool_stats) {
2730                         fprintf(trace->output, "Stats:\n "
2731                                                " vfs_getname : %" PRIu64 "\n"
2732                                                " proc_getname: %" PRIu64 "\n",
2733                                 trace->stats.vfs_getname,
2734                                 trace->stats.proc_getname);
2735                 }
2736         }
2737
     /* Common exit: every path, successful or not, ends here */
2738 out_delete_evlist:
2739         perf_evlist__delete(evlist);
2740         trace->evlist = NULL;
2741         trace->live = false;
2742         return err;
     /* Only reachable through the labels inside; the braces just scope errbuf */
2743 {
2744         char errbuf[BUFSIZ];
2745
2746 out_error_sched_stat_runtime:
2747         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2748         goto out_error;
2749
2750 out_error_raw_syscalls:
2751         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2752         goto out_error;
2753
2754 out_error_mmap:
2755         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2756         goto out_error;
2757
2758 out_error_open:
2759         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2760
             /* out_error_open falls through to here */
2761 out_error:
2762         fprintf(trace->output, "%s\n", errbuf);
2763         goto out_delete_evlist;
2764
2765 out_error_apply_filters:
2766         fprintf(trace->output,
2767                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2768                 evsel->filter, perf_evsel__name(evsel), errno,
2769                 strerror_r(errno, errbuf, sizeof(errbuf)));
2770         goto out_delete_evlist;
2771 }
2772 out_error_mem:
2773         fprintf(trace->output, "Not enough memory to run!\n");
2774         goto out_delete_evlist;
2775
2776 out_errno:
2777         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2778         goto out_delete_evlist;
2779 }
2780
2781 static int trace__replay(struct trace *trace)
2782 {
2783         const struct perf_evsel_str_handler handlers[] = {
2784                 { "probe:vfs_getname",       trace__vfs_getname, },
2785         };
2786         struct perf_data_file file = {
2787                 .path  = input_name,
2788                 .mode  = PERF_DATA_MODE_READ,
2789                 .force = trace->force,
2790         };
2791         struct perf_session *session;
2792         struct perf_evsel *evsel;
2793         int err = -1;
2794
2795         trace->tool.sample        = trace__process_sample;
2796         trace->tool.mmap          = perf_event__process_mmap;
2797         trace->tool.mmap2         = perf_event__process_mmap2;
2798         trace->tool.comm          = perf_event__process_comm;
2799         trace->tool.exit          = perf_event__process_exit;
2800         trace->tool.fork          = perf_event__process_fork;
2801         trace->tool.attr          = perf_event__process_attr;
2802         trace->tool.tracing_data = perf_event__process_tracing_data;
2803         trace->tool.build_id      = perf_event__process_build_id;
2804
2805         trace->tool.ordered_events = true;
2806         trace->tool.ordering_requires_timestamps = true;
2807
2808         /* add tid to output */
2809         trace->multiple_threads = true;
2810
2811         session = perf_session__new(&file, false, &trace->tool);
2812         if (session == NULL)
2813                 return -1;
2814
2815         if (symbol__init(&session->header.env) < 0)
2816                 goto out;
2817
2818         trace->host = &session->machines.host;
2819
2820         err = perf_session__set_tracepoints_handlers(session, handlers);
2821         if (err)
2822                 goto out;
2823
2824         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2825                                                      "raw_syscalls:sys_enter");
2826         /* older kernels have syscalls tp versus raw_syscalls */
2827         if (evsel == NULL)
2828                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2829                                                              "syscalls:sys_enter");
2830
2831         if (evsel &&
2832             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2833             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2834                 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2835                 goto out;
2836         }
2837
2838         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2839                                                      "raw_syscalls:sys_exit");
2840         if (evsel == NULL)
2841                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2842                                                              "syscalls:sys_exit");
2843         if (evsel &&
2844             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2845             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2846                 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2847                 goto out;
2848         }
2849
2850         evlist__for_each(session->evlist, evsel) {
2851                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2852                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2853                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2854                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2855                         evsel->handler = trace__pgfault;
2856         }
2857
2858         err = parse_target_str(trace);
2859         if (err != 0)
2860                 goto out;
2861
2862         setup_pager();
2863
2864         err = perf_session__process_events(session);
2865         if (err)
2866                 pr_err("Failed to process events, error %d", err);
2867
2868         else if (trace->summary)
2869                 trace__fprintf_thread_summary(trace, trace->output);
2870
2871 out:
2872         perf_session__delete(session);
2873
2874         return err;
2875 }
2876
/*
 * Print the heading of the per-thread summary to @fp.
 *
 * Returns the value reported by fprintf(), converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
        return fprintf(fp, "\n Summary of events:\n\n");
}
2885
2886 static size_t thread__dump_stats(struct thread_trace *ttrace,
2887                                  struct trace *trace, FILE *fp)
2888 {
2889         struct stats *stats;
2890         size_t printed = 0;
2891         struct syscall *sc;
2892         struct int_node *inode = intlist__first(ttrace->syscall_stats);
2893
2894         if (inode == NULL)
2895                 return 0;
2896
2897         printed += fprintf(fp, "\n");
2898
2899         printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2900         printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2901         printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2902
2903         /* each int_node is a syscall */
2904         while (inode) {
2905                 stats = inode->priv;
2906                 if (stats) {
2907                         double min = (double)(stats->min) / NSEC_PER_MSEC;
2908                         double max = (double)(stats->max) / NSEC_PER_MSEC;
2909                         double avg = avg_stats(stats);
2910                         double pct;
2911                         u64 n = (u64) stats->n;
2912
2913                         pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2914                         avg /= NSEC_PER_MSEC;
2915
2916                         sc = &trace->syscalls.table[inode->i];
2917                         printed += fprintf(fp, "   %-15s", sc->name);
2918                         printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2919                                            n, avg * n, min, avg);
2920                         printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2921                 }
2922
2923                 inode = intlist__next(inode);
2924         }
2925
2926         printed += fprintf(fp, "\n\n");
2927
2928         return printed;
2929 }
2930
2931 /* Context passed to the per-thread summary callback through its void *priv argument */
2932 struct summary_data {
2933         FILE *fp;               /* stream the summary is written to */
2934         struct trace *trace;    /* trace session whose threads are being summarised */
2935         size_t printed;         /* running total of the fprintf() return values */
2936 };
2937
2938 static int trace__fprintf_one_thread(struct thread *thread, void *priv)
2939 {
2940         struct summary_data *data = priv;
2941         FILE *fp = data->fp;
2942         size_t printed = data->printed;
2943         struct trace *trace = data->trace;
2944         struct thread_trace *ttrace = thread__priv(thread);
2945         double ratio;
2946
2947         if (ttrace == NULL)
2948                 return 0;
2949
2950         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2951
2952         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2953         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2954         printed += fprintf(fp, "%.1f%%", ratio);
2955         if (ttrace->pfmaj)
2956                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2957         if (ttrace->pfmin)
2958                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2959         printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2960         printed += thread__dump_stats(ttrace, trace, fp);
2961
2962         data->printed += printed;
2963
2964         return 0;
2965 }
2966
2967 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2968 {
2969         struct summary_data data = {
2970                 .fp = fp,
2971                 .trace = trace
2972         };
2973         data.printed = trace__fprintf_threads_header(fp);
2974
2975         machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
2976
2977         return data.printed;
2978 }
2979
2980 static int trace__set_duration(const struct option *opt, const char *str,
2981                                int unset __maybe_unused)
2982 {
2983         struct trace *trace = opt->value;
2984
2985         trace->duration_filter = atof(str);
2986         return 0;
2987 }
2988
2989 static int trace__set_filter_pids(const struct option *opt, const char *str,
2990                                   int unset __maybe_unused)
2991 {
2992         int ret = -1;
2993         size_t i;
2994         struct trace *trace = opt->value;
2995         /*
2996          * FIXME: introduce a intarray class, plain parse csv and create a
2997          * { int nr, int entries[] } struct...
2998          */
2999         struct intlist *list = intlist__new(str);
3000
3001         if (list == NULL)
3002                 return -1;
3003
3004         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
3005         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
3006
3007         if (trace->filter_pids.entries == NULL)
3008                 goto out;
3009
3010         trace->filter_pids.entries[0] = getpid();
3011
3012         for (i = 1; i < trace->filter_pids.nr; ++i)
3013                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
3014
3015         intlist__delete(list);
3016         ret = 0;
3017 out:
3018         return ret;
3019 }
3020
3021 static int trace__open_output(struct trace *trace, const char *filename)
3022 {
3023         struct stat st;
3024
3025         if (!stat(filename, &st) && st.st_size) {
3026                 char oldname[PATH_MAX];
3027
3028                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
3029                 unlink(oldname);
3030                 rename(filename, oldname);
3031         }
3032
3033         trace->output = fopen(filename, "w");
3034
3035         return trace->output == NULL ? -errno : 0;
3036 }
3037
3038 static int parse_pagefaults(const struct option *opt, const char *str,
3039                             int unset __maybe_unused)
3040 {
3041         int *trace_pgfaults = opt->value;
3042
3043         if (strcmp(str, "all") == 0)
3044                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
3045         else if (strcmp(str, "maj") == 0)
3046                 *trace_pgfaults |= TRACE_PFMAJ;
3047         else if (strcmp(str, "min") == 0)
3048                 *trace_pgfaults |= TRACE_PFMIN;
3049         else
3050                 return -1;
3051
3052         return 0;
3053 }
3054
3055 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
3056 {
3057         struct perf_evsel *evsel;
3058
3059         evlist__for_each(evlist, evsel)
3060                 evsel->handler = handler;
3061 }
3062
3063 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
3064 {
3065         const char *trace_usage[] = {
3066                 "perf trace [<options>] [<command>]",
3067                 "perf trace [<options>] -- <command> [<options>]",
3068                 "perf trace record [<options>] [<command>]",
3069                 "perf trace record [<options>] -- <command> [<options>]",
3070                 NULL
3071         };
3072         struct trace trace = {
3073                 .audit = {
3074                         .machine = audit_detect_machine(),
3075                         .open_id = audit_name_to_syscall("open", trace.audit.machine),
3076                 },
3077                 .syscalls = {
3078                         . max = -1,
3079                 },
3080                 .opts = {
3081                         .target = {
3082                                 .uid       = UINT_MAX,
3083                                 .uses_mmap = true,
3084                         },
3085                         .user_freq     = UINT_MAX,
3086                         .user_interval = ULLONG_MAX,
3087                         .no_buffering  = true,
3088                         .mmap_pages    = UINT_MAX,
3089                         .proc_map_timeout  = 500,
3090                 },
3091                 .output = stderr,
3092                 .show_comm = true,
3093                 .trace_syscalls = true,
3094         };
3095         const char *output_name = NULL;
3096         const char *ev_qualifier_str = NULL;
3097         const struct option trace_options[] = {
3098         OPT_CALLBACK(0, "event", &trace.evlist, "event",
3099                      "event selector. use 'perf list' to list available events",
3100                      parse_events_option),
3101         OPT_BOOLEAN(0, "comm", &trace.show_comm,
3102                     "show the thread COMM next to its id"),
3103         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3104         OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
3105         OPT_STRING('o', "output", &output_name, "file", "output file name"),
3106         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3107         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3108                     "trace events on existing process id"),
3109         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3110                     "trace events on existing thread id"),
3111         OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3112                      "pids to filter (by the kernel)", trace__set_filter_pids),
3113         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3114                     "system-wide collection from all CPUs"),
3115         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3116                     "list of cpus to monitor"),
3117         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3118                     "child tasks do not inherit counters"),
3119         OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3120                      "number of mmap data pages",
3121                      perf_evlist__parse_mmap_pages),
3122         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3123                    "user to profile"),
3124         OPT_CALLBACK(0, "duration", &trace, "float",
3125                      "show only events with duration > N.M ms",
3126                      trace__set_duration),
3127         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3128         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3129         OPT_BOOLEAN('T', "time", &trace.full_time,
3130                     "Show full timestamp, not time relative to first start"),
3131         OPT_BOOLEAN('s', "summary", &trace.summary_only,
3132                     "Show only syscall summary with statistics"),
3133         OPT_BOOLEAN('S', "with-summary", &trace.summary,
3134                     "Show all syscalls and summary with statistics"),
3135         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3136                      "Trace pagefaults", parse_pagefaults, "maj"),
3137         OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3138         OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3139         OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3140                         "per thread proc mmap processing timeout in ms"),
3141         OPT_END()
3142         };
3143         const char * const trace_subcommands[] = { "record", NULL };
3144         int err;
3145         char bf[BUFSIZ];
3146
3147         signal(SIGSEGV, sighandler_dump_stack);
3148         signal(SIGFPE, sighandler_dump_stack);
3149
3150         trace.evlist = perf_evlist__new();
3151
3152         if (trace.evlist == NULL) {
3153                 pr_err("Not enough memory to run!\n");
3154                 err = -ENOMEM;
3155                 goto out;
3156         }
3157
3158         argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3159                                  trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3160
3161         if (trace.trace_pgfaults) {
3162                 trace.opts.sample_address = true;
3163                 trace.opts.sample_time = true;
3164         }
3165
3166         if (trace.evlist->nr_entries > 0)
3167                 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3168
3169         if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3170                 return trace__record(&trace, argc-1, &argv[1]);
3171
3172         /* summary_only implies summary option, but don't overwrite summary if set */
3173         if (trace.summary_only)
3174                 trace.summary = trace.summary_only;
3175
3176         if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3177             trace.evlist->nr_entries == 0 /* Was --events used? */) {
3178                 pr_err("Please specify something to trace.\n");
3179                 return -1;
3180         }
3181
3182         if (output_name != NULL) {
3183                 err = trace__open_output(&trace, output_name);
3184                 if (err < 0) {
3185                         perror("failed to create output file");
3186                         goto out;
3187                 }
3188         }
3189
3190         if (ev_qualifier_str != NULL) {
3191                 const char *s = ev_qualifier_str;
3192                 struct strlist_config slist_config = {
3193                         .dirname = system_path(STRACE_GROUPS_DIR),
3194                 };
3195
3196                 trace.not_ev_qualifier = *s == '!';
3197                 if (trace.not_ev_qualifier)
3198                         ++s;
3199                 trace.ev_qualifier = strlist__new(s, &slist_config);
3200                 if (trace.ev_qualifier == NULL) {
3201                         fputs("Not enough memory to parse event qualifier",
3202                               trace.output);
3203                         err = -ENOMEM;
3204                         goto out_close;
3205                 }
3206
3207                 err = trace__validate_ev_qualifier(&trace);
3208                 if (err)
3209                         goto out_close;
3210         }
3211
3212         err = target__validate(&trace.opts.target);
3213         if (err) {
3214                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3215                 fprintf(trace.output, "%s", bf);
3216                 goto out_close;
3217         }
3218
3219         err = target__parse_uid(&trace.opts.target);
3220         if (err) {
3221                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3222                 fprintf(trace.output, "%s", bf);
3223                 goto out_close;
3224         }
3225
3226         if (!argc && target__none(&trace.opts.target))
3227                 trace.opts.target.system_wide = true;
3228
3229         if (input_name)
3230                 err = trace__replay(&trace);
3231         else
3232                 err = trace__run(&trace, argc, argv);
3233
3234 out_close:
3235         if (output_name != NULL)
3236                 fclose(trace.output);
3237 out:
3238         return err;
3239 }