arch/x86/kernel/cpu/perf_event.c  (karo-tx-linux.git blob, at "perf, x86: Avoid kfree() in CPU_STARTING")
1 /*
2  * Performance events x86 architecture code
3  *
4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5  *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6  *  Copyright (C) 2009 Jaswinder Singh Rajput
7  *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8  *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9  *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
10  *  Copyright (C) 2009 Google, Inc., Stephane Eranian
11  *
12  *  For licencing details see kernel-base/COPYING
13  */
14
15 #include <linux/perf_event.h>
16 #include <linux/capability.h>
17 #include <linux/notifier.h>
18 #include <linux/hardirq.h>
19 #include <linux/kprobes.h>
20 #include <linux/module.h>
21 #include <linux/kdebug.h>
22 #include <linux/sched.h>
23 #include <linux/uaccess.h>
24 #include <linux/slab.h>
25 #include <linux/cpu.h>
26 #include <linux/bitops.h>
27
28 #include <asm/apic.h>
29 #include <asm/stacktrace.h>
30 #include <asm/nmi.h>
31 #include <asm/compat.h>
32 #include <asm/smp.h>
33 #include <asm/alternative.h>
34
35 #if 0
36 #undef wrmsrl
37 #define wrmsrl(msr, val)                                        \
38 do {                                                            \
39         trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
40                         (unsigned long)(val));                  \
41         native_write_msr((msr), (u32)((u64)(val)),              \
42                         (u32)((u64)(val) >> 32));               \
43 } while (0)
44 #endif
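/*
 * The block above is a debugging aid: flipping "#if 0" to "#if 1"
 * redefines wrmsrl() so that every PMU MSR write is logged via
 * trace_printk() before being performed with native_write_msr().
 */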
45
46 /*
47  *          |   NHM/WSM    |      SNB     |
48  * register -------------------------------
49  *          |  HT  | no HT |  HT  | no HT |
50  *-----------------------------------------
51  * offcore  | core | core  | cpu  | core  |
52  * lbr_sel  | core | core  | cpu  | core  |
53  * ld_lat   | cpu  | core  | cpu  | core  |
54  *-----------------------------------------
55  *
56  * Given that there is a small number of shared regs,
57  * we can pre-allocate their slot in the per-cpu
58  * per-core reg tables.
59  */
60 enum extra_reg_type {
61         EXTRA_REG_NONE  = -1,   /* not used */
62
63         EXTRA_REG_RSP_0 = 0,    /* offcore_response_0 */
64         EXTRA_REG_RSP_1 = 1,    /* offcore_response_1 */
65
66         EXTRA_REG_MAX           /* number of entries needed */
67 };
68
69 struct event_constraint {
70         union {
71                 unsigned long   idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
72                 u64             idxmsk64;
73         };
74         u64     code;
75         u64     cmask;
76         int     weight;
77 };
78
79 struct amd_nb {
80         int nb_id;  /* NorthBridge id */
81         int refcnt; /* reference count */
82         struct perf_event *owners[X86_PMC_IDX_MAX];
83         struct event_constraint event_constraints[X86_PMC_IDX_MAX];
84 };
85
86 struct intel_percore;
87
88 #define MAX_LBR_ENTRIES         16
89
90 struct cpu_hw_events {
91         /*
92          * Generic x86 PMC bits
93          */
94         struct perf_event       *events[X86_PMC_IDX_MAX]; /* in counter order */
95         unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
96         unsigned long           running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
97         int                     enabled;
98
99         int                     n_events;
100         int                     n_added;
101         int                     n_txn;
102         int                     assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
103         u64                     tags[X86_PMC_IDX_MAX];
104         struct perf_event       *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
105
106         unsigned int            group_flag;
107
108         /*
109          * Intel DebugStore bits
110          */
111         struct debug_store      *ds;
112         u64                     pebs_enabled;
113
114         /*
115          * Intel LBR bits
116          */
117         int                             lbr_users;
118         void                            *lbr_context;
119         struct perf_branch_stack        lbr_stack;
120         struct perf_branch_entry        lbr_entries[MAX_LBR_ENTRIES];
121
122         /*
123          * manage shared (per-core, per-cpu) registers
124          * used on Intel NHM/WSM/SNB
125          */
126         struct intel_shared_regs        *shared_regs;
127
128         /*
129          * AMD specific bits
130          */
131         struct amd_nb           *amd_nb;
132
133         void                    *kfree_on_online;
134 };
135
136 #define __EVENT_CONSTRAINT(c, n, m, w) {\
137         { .idxmsk64 = (n) },            \
138         .code = (c),                    \
139         .cmask = (m),                   \
140         .weight = (w),                  \
141 }
142
143 #define EVENT_CONSTRAINT(c, n, m)       \
144         __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
145
146 /*
147  * Constraint on the Event code.
148  */
149 #define INTEL_EVENT_CONSTRAINT(c, n)    \
150         EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
151
152 /*
153  * Constraint on the Event code + UMask + fixed-mask
154  *
155  * filter mask to validate fixed counter events.
156  * the following filters disqualify for fixed counters:
157  *  - inv
158  *  - edge
159  *  - cnt-mask
160  *  The other filters are supported by fixed counters.
161  *  The any-thread option is supported starting with v3.
162  */
163 #define FIXED_EVENT_CONSTRAINT(c, n)    \
164         EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
165
166 /*
167  * Constraint on the Event code + UMask
168  */
169 #define INTEL_UEVENT_CONSTRAINT(c, n)   \
170         EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
171
172 #define EVENT_CONSTRAINT_END            \
173         EVENT_CONSTRAINT(0, 0, 0)
174
175 #define for_each_event_constraint(e, c) \
176         for ((e) = (c); (e)->weight; (e)++)
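/*
 * EVENT_CONSTRAINT_END is the table sentinel: code, cmask and weight are
 * all zero (HWEIGHT(0) == 0), so for_each_event_constraint() stops there.
 */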
177
178 /*
179  * Per register state.
180  */
181 struct er_account {
182         raw_spinlock_t          lock;   /* per-core: protect structure */
183         u64                     config; /* extra MSR config */
184         u64                     reg;    /* extra MSR number */
185         atomic_t                ref;    /* reference count */
186 };
187
188 /*
189  * Extra registers for specific events.
190  *
191  * Some events need large masks and require external MSRs.
192  * Those extra MSRs end up being shared for all events on
193  * a PMU and sometimes between PMU of sibling HT threads.
194  * In either case, the kernel needs to handle conflicting
195  * accesses to those extra, shared, regs. The data structure
196  * to manage those registers is stored in cpu_hw_event.
197  */
198 struct extra_reg {
199         unsigned int            event;
200         unsigned int            msr;
201         u64                     config_mask;
202         u64                     valid_mask;
203         int                     idx;  /* per_xxx->regs[] reg index */
204 };
205
206 #define EVENT_EXTRA_REG(e, ms, m, vm, i) {      \
207         .event = (e),           \
208         .msr = (ms),            \
209         .config_mask = (m),     \
210         .valid_mask = (vm),     \
211         .idx = EXTRA_REG_##i    \
212         }
213
214 #define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx)      \
215         EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
216
217 #define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
218
219 union perf_capabilities {
220         struct {
221                 u64     lbr_format    : 6;
222                 u64     pebs_trap     : 1;
223                 u64     pebs_arch_reg : 1;
224                 u64     pebs_format   : 4;
225                 u64     smm_freeze    : 1;
226         };
227         u64     capabilities;
228 };
229
230 /*
231  * struct x86_pmu - generic x86 pmu
232  */
233 struct x86_pmu {
234         /*
235          * Generic x86 PMC bits
236          */
237         const char      *name;
238         int             version;
239         int             (*handle_irq)(struct pt_regs *);
240         void            (*disable_all)(void);
241         void            (*enable_all)(int added);
242         void            (*enable)(struct perf_event *);
243         void            (*disable)(struct perf_event *);
244         int             (*hw_config)(struct perf_event *event);
245         int             (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
246         unsigned        eventsel;
247         unsigned        perfctr;
248         u64             (*event_map)(int);
249         int             max_events;
250         int             num_counters;
251         int             num_counters_fixed;
252         int             cntval_bits;
253         u64             cntval_mask;
254         int             apic;
255         u64             max_period;
256         struct event_constraint *
257                         (*get_event_constraints)(struct cpu_hw_events *cpuc,
258                                                  struct perf_event *event);
259
260         void            (*put_event_constraints)(struct cpu_hw_events *cpuc,
261                                                  struct perf_event *event);
262         struct event_constraint *event_constraints;
263         void            (*quirks)(void);
264         int             perfctr_second_write;
265
266         int             (*cpu_prepare)(int cpu);
267         void            (*cpu_starting)(int cpu);
268         void            (*cpu_dying)(int cpu);
269         void            (*cpu_dead)(int cpu);
270
271         /*
272          * Intel Arch Perfmon v2+
273          */
274         u64                     intel_ctrl;
275         union perf_capabilities intel_cap;
276
277         /*
278          * Intel DebugStore bits
279          */
280         int             bts, pebs;
281         int             bts_active, pebs_active;
282         int             pebs_record_size;
283         void            (*drain_pebs)(struct pt_regs *regs);
284         struct event_constraint *pebs_constraints;
285
286         /*
287          * Intel LBR
288          */
289         unsigned long   lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
290         int             lbr_nr;                    /* hardware stack size */
291
292         /*
293          * Extra registers for events
294          */
295         struct extra_reg *extra_regs;
296         unsigned int er_flags;
297 };
298
299 #define ERF_NO_HT_SHARING       1
300 #define ERF_HAS_RSP_1           2
301
302 static struct x86_pmu x86_pmu __read_mostly;
303
304 static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
305         .enabled = 1,
306 };
307
308 static int x86_perf_event_set_period(struct perf_event *event);
309
310 /*
311  * Generalized hw caching related hw_event table, filled
312  * in on a per model basis. A value of 0 means
313  * 'not supported', -1 means 'hw_event makes no sense on
314  * this CPU', any other value means the raw hw_event
315  * ID.
316  */
317
318 #define C(x) PERF_COUNT_HW_CACHE_##x
319
320 static u64 __read_mostly hw_cache_event_ids
321                                 [PERF_COUNT_HW_CACHE_MAX]
322                                 [PERF_COUNT_HW_CACHE_OP_MAX]
323                                 [PERF_COUNT_HW_CACHE_RESULT_MAX];
324 static u64 __read_mostly hw_cache_extra_regs
325                                 [PERF_COUNT_HW_CACHE_MAX]
326                                 [PERF_COUNT_HW_CACHE_OP_MAX]
327                                 [PERF_COUNT_HW_CACHE_RESULT_MAX];
328
329 /*
330  * Propagate event elapsed time into the generic event.
331  * Can only be executed on the CPU where the event is active.
332  * Returns the new raw count.
333  */
334 static u64
335 x86_perf_event_update(struct perf_event *event)
336 {
337         struct hw_perf_event *hwc = &event->hw;
338         int shift = 64 - x86_pmu.cntval_bits;
339         u64 prev_raw_count, new_raw_count;
340         int idx = hwc->idx;
341         s64 delta;
342
343         if (idx == X86_PMC_IDX_FIXED_BTS)
344                 return 0;
345
346         /*
347          * Careful: an NMI might modify the previous event value.
348          *
349          * Our tactic to handle this is to first atomically read and
350          * exchange a new raw count - then add that new-prev delta
351          * count to the generic event atomically:
352          */
353 again:
354         prev_raw_count = local64_read(&hwc->prev_count);
355         rdmsrl(hwc->event_base, new_raw_count);
356
357         if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
358                                         new_raw_count) != prev_raw_count)
359                 goto again;
360
361         /*
362          * Now we have the new raw value and have updated the prev
363          * timestamp already. We can now calculate the elapsed delta
364          * (event-)time and add that to the generic event.
365          *
366          * Careful, not all hw sign-extends above the physical width
367          * of the count.
368          */
369         delta = (new_raw_count << shift) - (prev_raw_count << shift);
370         delta >>= shift;
371
372         local64_add(delta, &event->count);
373         local64_sub(delta, &hwc->period_left);
374
375         return new_raw_count;
376 }
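/*
 * Example of the shift trick above: with 48-bit counters, shift is 16.
 * Shifting both raw values left by 16 discards any bits the hardware may
 * have set above the counter width, and the arithmetic right shift of the
 * s64 difference yields a correctly signed delta even if the counter
 * wrapped between the two reads (e.g. prev = 0xffffffffffff, new = 5
 * gives delta = 6).
 */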
377
378 static inline int x86_pmu_addr_offset(int index)
379 {
380         int offset;
381
382         /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
383         alternative_io(ASM_NOP2,
384                        "shll $1, %%eax",
385                        X86_FEATURE_PERFCTR_CORE,
386                        "=a" (offset),
387                        "a"  (index));
388
389         return offset;
390 }
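/*
 * On CPUs with X86_FEATURE_PERFCTR_CORE (e.g. AMD family 15h), the
 * event-select and counter MSRs are interleaved, so consecutive counter
 * indices are two MSRs apart; the alternative patches in "shll $1, %eax"
 * only on such CPUs, everywhere else the NOP leaves offset == index.
 */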
391
392 static inline unsigned int x86_pmu_config_addr(int index)
393 {
394         return x86_pmu.eventsel + x86_pmu_addr_offset(index);
395 }
396
397 static inline unsigned int x86_pmu_event_addr(int index)
398 {
399         return x86_pmu.perfctr + x86_pmu_addr_offset(index);
400 }
401
402 /*
403  * Find and validate any extra registers to set up.
404  */
405 static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
406 {
407         struct hw_perf_event_extra *reg;
408         struct extra_reg *er;
409
410         reg = &event->hw.extra_reg;
411
412         if (!x86_pmu.extra_regs)
413                 return 0;
414
415         for (er = x86_pmu.extra_regs; er->msr; er++) {
416                 if (er->event != (config & er->config_mask))
417                         continue;
418                 if (event->attr.config1 & ~er->valid_mask)
419                         return -EINVAL;
420
421                 reg->idx = er->idx;
422                 reg->config = event->attr.config1;
423                 reg->reg = er->msr;
424                 break;
425         }
426         return 0;
427 }
428
429 static atomic_t active_events;
430 static DEFINE_MUTEX(pmc_reserve_mutex);
431
432 #ifdef CONFIG_X86_LOCAL_APIC
433
434 static bool reserve_pmc_hardware(void)
435 {
436         int i;
437
438         for (i = 0; i < x86_pmu.num_counters; i++) {
439                 if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
440                         goto perfctr_fail;
441         }
442
443         for (i = 0; i < x86_pmu.num_counters; i++) {
444                 if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
445                         goto eventsel_fail;
446         }
447
448         return true;
449
450 eventsel_fail:
451         for (i--; i >= 0; i--)
452                 release_evntsel_nmi(x86_pmu_config_addr(i));
453
454         i = x86_pmu.num_counters;
455
456 perfctr_fail:
457         for (i--; i >= 0; i--)
458                 release_perfctr_nmi(x86_pmu_event_addr(i));
459
460         return false;
461 }
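/*
 * Unwind order above: if reserving an event-select MSR fails at index i,
 * eventsel_fail releases event-selects 0..i-1, then i is reset to
 * num_counters so the fall-through into perfctr_fail releases every
 * counter MSR taken by the first loop.
 */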
462
463 static void release_pmc_hardware(void)
464 {
465         int i;
466
467         for (i = 0; i < x86_pmu.num_counters; i++) {
468                 release_perfctr_nmi(x86_pmu_event_addr(i));
469                 release_evntsel_nmi(x86_pmu_config_addr(i));
470         }
471 }
472
473 #else
474
475 static bool reserve_pmc_hardware(void) { return true; }
476 static void release_pmc_hardware(void) {}
477
478 #endif
479
480 static bool check_hw_exists(void)
481 {
482         u64 val, val_new = 0;
483         int i, reg, ret = 0;
484
485         /*
486          * Check to see if the BIOS enabled any of the counters; if so,
487          * complain and bail.
488          */
489         for (i = 0; i < x86_pmu.num_counters; i++) {
490                 reg = x86_pmu_config_addr(i);
491                 ret = rdmsrl_safe(reg, &val);
492                 if (ret)
493                         goto msr_fail;
494                 if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
495                         goto bios_fail;
496         }
497
498         if (x86_pmu.num_counters_fixed) {
499                 reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
500                 ret = rdmsrl_safe(reg, &val);
501                 if (ret)
502                         goto msr_fail;
503                 for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
504                         if (val & (0x03 << i*4))
505                                 goto bios_fail;
506                 }
507         }
508
509         /*
510          * Now write a value and read it back to see if it matches,
511          * this is needed to detect certain hardware emulators (qemu/kvm)
512          * that don't trap on the MSR access and always return 0s.
513          */
514         val = 0xabcdUL;
515         ret = checking_wrmsrl(x86_pmu_event_addr(0), val);
516         ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new);
517         if (ret || val != val_new)
518                 goto msr_fail;
519
520         return true;
521
522 bios_fail:
523         /*
524          * We still allow the PMU driver to operate:
525          */
526         printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
527         printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);
528
529         return true;
530
531 msr_fail:
532         printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
533
534         return false;
535 }
536
537 static void reserve_ds_buffers(void);
538 static void release_ds_buffers(void);
539
540 static void hw_perf_event_destroy(struct perf_event *event)
541 {
542         if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
543                 release_pmc_hardware();
544                 release_ds_buffers();
545                 mutex_unlock(&pmc_reserve_mutex);
546         }
547 }
548
549 static inline int x86_pmu_initialized(void)
550 {
551         return x86_pmu.handle_irq != NULL;
552 }
553
554 static inline int
555 set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
556 {
557         struct perf_event_attr *attr = &event->attr;
558         unsigned int cache_type, cache_op, cache_result;
559         u64 config, val;
560
561         config = attr->config;
562
563         cache_type = (config >>  0) & 0xff;
564         if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
565                 return -EINVAL;
566
567         cache_op = (config >>  8) & 0xff;
568         if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
569                 return -EINVAL;
570
571         cache_result = (config >> 16) & 0xff;
572         if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
573                 return -EINVAL;
574
575         val = hw_cache_event_ids[cache_type][cache_op][cache_result];
576
577         if (val == 0)
578                 return -ENOENT;
579
580         if (val == -1)
581                 return -EINVAL;
582
583         hwc->config |= val;
584         attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
585         return x86_pmu_extra_regs(val, event);
586 }
587
588 static int x86_setup_perfctr(struct perf_event *event)
589 {
590         struct perf_event_attr *attr = &event->attr;
591         struct hw_perf_event *hwc = &event->hw;
592         u64 config;
593
594         if (!is_sampling_event(event)) {
595                 hwc->sample_period = x86_pmu.max_period;
596                 hwc->last_period = hwc->sample_period;
597                 local64_set(&hwc->period_left, hwc->sample_period);
598         } else {
599                 /*
600                  * If we have a PMU initialized but no APIC
601                  * interrupts, we cannot sample hardware
602                  * events (user-space has to fall back and
603                  * sample via a hrtimer based software event):
604                  */
605                 if (!x86_pmu.apic)
606                         return -EOPNOTSUPP;
607         }
608
609         /*
610          * Do not allow config1 (extended registers) to propagate,
611          * there's no sane user-space generalization yet:
612          */
613         if (attr->type == PERF_TYPE_RAW)
614                 return 0;
615
616         if (attr->type == PERF_TYPE_HW_CACHE)
617                 return set_ext_hw_attr(hwc, event);
618
619         if (attr->config >= x86_pmu.max_events)
620                 return -EINVAL;
621
622         /*
623          * The generic map:
624          */
625         config = x86_pmu.event_map(attr->config);
626
627         if (config == 0)
628                 return -ENOENT;
629
630         if (config == -1LL)
631                 return -EINVAL;
632
633         /*
634          * Branch tracing:
635          */
636         if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
637             !attr->freq && hwc->sample_period == 1) {
638                 /* BTS is not supported by this architecture. */
639                 if (!x86_pmu.bts_active)
640                         return -EOPNOTSUPP;
641
642                 /* BTS is currently only allowed for user-mode. */
643                 if (!attr->exclude_kernel)
644                         return -EOPNOTSUPP;
645         }
646
647         hwc->config |= config;
648
649         return 0;
650 }
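/*
 * The branch-tracing case above: a fixed sample_period of 1 on
 * PERF_COUNT_HW_BRANCH_INSTRUCTIONS means "record every branch", which is
 * implemented via Intel BTS and therefore requires bts_active and (here)
 * a user-mode-only event, i.e. attr->exclude_kernel must be set.
 */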
651
652 static int x86_pmu_hw_config(struct perf_event *event)
653 {
654         if (event->attr.precise_ip) {
655                 int precise = 0;
656
657                 /* Support for constant skid */
658                 if (x86_pmu.pebs_active) {
659                         precise++;
660
661                         /* Support for IP fixup */
662                         if (x86_pmu.lbr_nr)
663                                 precise++;
664                 }
665
666                 if (event->attr.precise_ip > precise)
667                         return -EOPNOTSUPP;
668         }
669
670         /*
671          * Generate PMC IRQs:
672          * (keep 'enabled' bit clear for now)
673          */
674         event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
675
676         /*
677          * Count user and OS events unless requested not to
678          */
679         if (!event->attr.exclude_user)
680                 event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
681         if (!event->attr.exclude_kernel)
682                 event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
683
684         if (event->attr.type == PERF_TYPE_RAW)
685                 event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
686
687         return x86_setup_perfctr(event);
688 }
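/*
 * precise_ip levels accepted above: 1 requires PEBS (constant skid),
 * 2 additionally requires LBR so the reported IP can be fixed up.
 */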
689
690 /*
691  * Setup the hardware configuration for a given attr_type
692  */
693 static int __x86_pmu_event_init(struct perf_event *event)
694 {
695         int err;
696
697         if (!x86_pmu_initialized())
698                 return -ENODEV;
699
700         err = 0;
701         if (!atomic_inc_not_zero(&active_events)) {
702                 mutex_lock(&pmc_reserve_mutex);
703                 if (atomic_read(&active_events) == 0) {
704                         if (!reserve_pmc_hardware())
705                                 err = -EBUSY;
706                         else
707                                 reserve_ds_buffers();
708                 }
709                 if (!err)
710                         atomic_inc(&active_events);
711                 mutex_unlock(&pmc_reserve_mutex);
712         }
713         if (err)
714                 return err;
715
716         event->destroy = hw_perf_event_destroy;
717
718         event->hw.idx = -1;
719         event->hw.last_cpu = -1;
720         event->hw.last_tag = ~0ULL;
721
722         /* mark unused */
723         event->hw.extra_reg.idx = EXTRA_REG_NONE;
724
725         return x86_pmu.hw_config(event);
726 }
727
728 static void x86_pmu_disable_all(void)
729 {
730         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
731         int idx;
732
733         for (idx = 0; idx < x86_pmu.num_counters; idx++) {
734                 u64 val;
735
736                 if (!test_bit(idx, cpuc->active_mask))
737                         continue;
738                 rdmsrl(x86_pmu_config_addr(idx), val);
739                 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
740                         continue;
741                 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
742                 wrmsrl(x86_pmu_config_addr(idx), val);
743         }
744 }
745
746 static void x86_pmu_disable(struct pmu *pmu)
747 {
748         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
749
750         if (!x86_pmu_initialized())
751                 return;
752
753         if (!cpuc->enabled)
754                 return;
755
756         cpuc->n_added = 0;
757         cpuc->enabled = 0;
758         barrier();
759
760         x86_pmu.disable_all();
761 }
762
763 static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
764                                           u64 enable_mask)
765 {
766         if (hwc->extra_reg.reg)
767                 wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
768         wrmsrl(hwc->config_base, hwc->config | enable_mask);
769 }
770
771 static void x86_pmu_enable_all(int added)
772 {
773         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
774         int idx;
775
776         for (idx = 0; idx < x86_pmu.num_counters; idx++) {
777                 struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
778
779                 if (!test_bit(idx, cpuc->active_mask))
780                         continue;
781
782                 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
783         }
784 }
785
786 static struct pmu pmu;
787
788 static inline int is_x86_event(struct perf_event *event)
789 {
790         return event->pmu == &pmu;
791 }
792
793 static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
794 {
795         struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
796         unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
797         int i, j, w, wmax, num = 0;
798         struct hw_perf_event *hwc;
799
800         bitmap_zero(used_mask, X86_PMC_IDX_MAX);
801
802         for (i = 0; i < n; i++) {
803                 c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
804                 constraints[i] = c;
805         }
806
807         /*
808          * fastpath, try to reuse previous register
809          */
810         for (i = 0; i < n; i++) {
811                 hwc = &cpuc->event_list[i]->hw;
812                 c = constraints[i];
813
814                 /* never assigned */
815                 if (hwc->idx == -1)
816                         break;
817
818                 /* constraint still honored */
819                 if (!test_bit(hwc->idx, c->idxmsk))
820                         break;
821
822                 /* not already used */
823                 if (test_bit(hwc->idx, used_mask))
824                         break;
825
826                 __set_bit(hwc->idx, used_mask);
827                 if (assign)
828                         assign[i] = hwc->idx;
829         }
830         if (i == n)
831                 goto done;
832
833         /*
834          * begin slow path
835          */
836
837         bitmap_zero(used_mask, X86_PMC_IDX_MAX);
838
839         /*
840          * weight = number of possible counters
841          *
842          * 1    = most constrained, only works on one counter
843          * wmax = least constrained, works on any counter
844          *
845          * assign events to counters starting with most
846          * constrained events.
847          */
848         wmax = x86_pmu.num_counters;
849
850         /*
851          * when fixed event counters are present,
852          * wmax is incremented by 1 to account
853          * for one more choice
854          */
855         if (x86_pmu.num_counters_fixed)
856                 wmax++;
857
858         for (w = 1, num = n; num && w <= wmax; w++) {
859                 /* for each event */
860                 for (i = 0; num && i < n; i++) {
861                         c = constraints[i];
862                         hwc = &cpuc->event_list[i]->hw;
863
864                         if (c->weight != w)
865                                 continue;
866
867                         for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
868                                 if (!test_bit(j, used_mask))
869                                         break;
870                         }
871
872                         if (j == X86_PMC_IDX_MAX)
873                                 break;
874
875                         __set_bit(j, used_mask);
876
877                         if (assign)
878                                 assign[i] = j;
879                         num--;
880                 }
881         }
882 done:
883         /*
884          * scheduling failed or is just a simulation,
885          * free resources if necessary
886          */
887         if (!assign || num) {
888                 for (i = 0; i < n; i++) {
889                         if (x86_pmu.put_event_constraints)
890                                 x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
891                 }
892         }
893         return num ? -ENOSPC : 0;
894 }
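/*
 * Example of the weight-ordered pass: with four generic counters, an
 * event constrained to counter 0 only has weight 1 and is placed in the
 * w == 1 round, before a fully unconstrained event (weight 4) gets a
 * chance to occupy counter 0.
 */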
895
896 /*
897  * dogrp: true if we must also collect the leader's sibling events (i.e. the group)
898  * returns the total number of events, or a negative error code
899  */
900 static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
901 {
902         struct perf_event *event;
903         int n, max_count;
904
905         max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
906
907         /* current number of events already accepted */
908         n = cpuc->n_events;
909
910         if (is_x86_event(leader)) {
911                 if (n >= max_count)
912                         return -ENOSPC;
913                 cpuc->event_list[n] = leader;
914                 n++;
915         }
916         if (!dogrp)
917                 return n;
918
919         list_for_each_entry(event, &leader->sibling_list, group_entry) {
920                 if (!is_x86_event(event) ||
921                     event->state <= PERF_EVENT_STATE_OFF)
922                         continue;
923
924                 if (n >= max_count)
925                         return -ENOSPC;
926
927                 cpuc->event_list[n] = event;
928                 n++;
929         }
930         return n;
931 }
932
933 static inline void x86_assign_hw_event(struct perf_event *event,
934                                 struct cpu_hw_events *cpuc, int i)
935 {
936         struct hw_perf_event *hwc = &event->hw;
937
938         hwc->idx = cpuc->assign[i];
939         hwc->last_cpu = smp_processor_id();
940         hwc->last_tag = ++cpuc->tags[i];
941
942         if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
943                 hwc->config_base = 0;
944                 hwc->event_base = 0;
945         } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
946                 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
947                 hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED);
948         } else {
949                 hwc->config_base = x86_pmu_config_addr(hwc->idx);
950                 hwc->event_base  = x86_pmu_event_addr(hwc->idx);
951         }
952 }
953
954 static inline int match_prev_assignment(struct hw_perf_event *hwc,
955                                         struct cpu_hw_events *cpuc,
956                                         int i)
957 {
958         return hwc->idx == cpuc->assign[i] &&
959                 hwc->last_cpu == smp_processor_id() &&
960                 hwc->last_tag == cpuc->tags[i];
961 }
962
963 static void x86_pmu_start(struct perf_event *event, int flags);
964 static void x86_pmu_stop(struct perf_event *event, int flags);
965
966 static void x86_pmu_enable(struct pmu *pmu)
967 {
968         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
969         struct perf_event *event;
970         struct hw_perf_event *hwc;
971         int i, added = cpuc->n_added;
972
973         if (!x86_pmu_initialized())
974                 return;
975
976         if (cpuc->enabled)
977                 return;
978
979         if (cpuc->n_added) {
980                 int n_running = cpuc->n_events - cpuc->n_added;
981                 /*
982                  * apply assignment obtained either from
983                  * hw_perf_group_sched_in() or x86_pmu_enable()
984                  *
985                  * step1: save events moving to new counters
986                  * step2: reprogram moved events into new counters
987                  */
988                 for (i = 0; i < n_running; i++) {
989                         event = cpuc->event_list[i];
990                         hwc = &event->hw;
991
992                         /*
993                          * we can avoid reprogramming counter if:
994                          * - assigned same counter as last time
995                          * - running on same CPU as last time
996                          * - no other event has used the counter since
997                          */
998                         if (hwc->idx == -1 ||
999                             match_prev_assignment(hwc, cpuc, i))
1000                                 continue;
1001
1002                         /*
1003                          * Ensure we don't accidentally enable a stopped
1004                          * counter simply because we rescheduled.
1005                          */
1006                         if (hwc->state & PERF_HES_STOPPED)
1007                                 hwc->state |= PERF_HES_ARCH;
1008
1009                         x86_pmu_stop(event, PERF_EF_UPDATE);
1010                 }
1011
1012                 for (i = 0; i < cpuc->n_events; i++) {
1013                         event = cpuc->event_list[i];
1014                         hwc = &event->hw;
1015
1016                         if (!match_prev_assignment(hwc, cpuc, i))
1017                                 x86_assign_hw_event(event, cpuc, i);
1018                         else if (i < n_running)
1019                                 continue;
1020
1021                         if (hwc->state & PERF_HES_ARCH)
1022                                 continue;
1023
1024                         x86_pmu_start(event, PERF_EF_RELOAD);
1025                 }
1026                 cpuc->n_added = 0;
1027                 perf_events_lapic_init();
1028         }
1029
1030         cpuc->enabled = 1;
1031         barrier();
1032
1033         x86_pmu.enable_all(added);
1034 }
1035
1036 static inline void x86_pmu_disable_event(struct perf_event *event)
1037 {
1038         struct hw_perf_event *hwc = &event->hw;
1039
1040         wrmsrl(hwc->config_base, hwc->config);
1041 }
1042
1043 static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
1044
1045 /*
1046  * Set the next IRQ period, based on the hwc->period_left value.
1047  * To be called with the event disabled in hw:
1048  */
1049 static int
1050 x86_perf_event_set_period(struct perf_event *event)
1051 {
1052         struct hw_perf_event *hwc = &event->hw;
1053         s64 left = local64_read(&hwc->period_left);
1054         s64 period = hwc->sample_period;
1055         int ret = 0, idx = hwc->idx;
1056
1057         if (idx == X86_PMC_IDX_FIXED_BTS)
1058                 return 0;
1059
1060         /*
1061          * If we are way outside a reasonable range then just skip forward:
1062          */
1063         if (unlikely(left <= -period)) {
1064                 left = period;
1065                 local64_set(&hwc->period_left, left);
1066                 hwc->last_period = period;
1067                 ret = 1;
1068         }
1069
1070         if (unlikely(left <= 0)) {
1071                 left += period;
1072                 local64_set(&hwc->period_left, left);
1073                 hwc->last_period = period;
1074                 ret = 1;
1075         }
1076         /*
1077          * Quirk: certain CPUs don't like it if just 1 hw_event is left:
1078          */
1079         if (unlikely(left < 2))
1080                 left = 2;
1081
1082         if (left > x86_pmu.max_period)
1083                 left = x86_pmu.max_period;
1084
1085         per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
1086
1087         /*
1088          * The hw event starts counting from this event offset,
1089          * mark it to be able to extract future deltas:
1090          */
1091         local64_set(&hwc->prev_count, (u64)-left);
1092
1093         wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
1094
1095         /*
1096          * Due to an erratum on certain CPUs we need
1097          * a second write to be sure the register
1098          * is updated properly
1099          */
1100         if (x86_pmu.perfctr_second_write) {
1101                 wrmsrl(hwc->event_base,
1102                         (u64)(-left) & x86_pmu.cntval_mask);
1103         }
1104
1105         perf_event_update_userpage(event);
1106
1107         return ret;
1108 }
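/*
 * The counter is programmed with -left (masked to the counter width), so
 * it overflows and raises the PMI after exactly 'left' increments; e.g.
 * left = 2000000 on a 48-bit counter writes 2^48 - 2000000.
 */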
1109
1110 static void x86_pmu_enable_event(struct perf_event *event)
1111 {
1112         if (__this_cpu_read(cpu_hw_events.enabled))
1113                 __x86_pmu_enable_event(&event->hw,
1114                                        ARCH_PERFMON_EVENTSEL_ENABLE);
1115 }
1116
1117 /*
1118  * Add a single event to the PMU.
1119  *
1120  * The event is added to the group of enabled events
1121  * but only if it can be scheduled with the existing events.
1122  */
1123 static int x86_pmu_add(struct perf_event *event, int flags)
1124 {
1125         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1126         struct hw_perf_event *hwc;
1127         int assign[X86_PMC_IDX_MAX];
1128         int n, n0, ret;
1129
1130         hwc = &event->hw;
1131
1132         perf_pmu_disable(event->pmu);
1133         n0 = cpuc->n_events;
1134         ret = n = collect_events(cpuc, event, false);
1135         if (ret < 0)
1136                 goto out;
1137
1138         hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
1139         if (!(flags & PERF_EF_START))
1140                 hwc->state |= PERF_HES_ARCH;
1141
1142         /*
1143          * If a group-event scheduling transaction was started,
1144          * skip the schedulability test here; it will be performed
1145          * at commit time (->commit_txn) as a whole
1146          */
1147         if (cpuc->group_flag & PERF_EVENT_TXN)
1148                 goto done_collect;
1149
1150         ret = x86_pmu.schedule_events(cpuc, n, assign);
1151         if (ret)
1152                 goto out;
1153         /*
1154          * copy the new assignment now that we know it is possible;
1155          * it will be used by hw_perf_enable()
1156          */
1157         memcpy(cpuc->assign, assign, n*sizeof(int));
1158
1159 done_collect:
1160         cpuc->n_events = n;
1161         cpuc->n_added += n - n0;
1162         cpuc->n_txn += n - n0;
1163
1164         ret = 0;
1165 out:
1166         perf_pmu_enable(event->pmu);
1167         return ret;
1168 }
1169
1170 static void x86_pmu_start(struct perf_event *event, int flags)
1171 {
1172         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1173         int idx = event->hw.idx;
1174
1175         if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
1176                 return;
1177
1178         if (WARN_ON_ONCE(idx == -1))
1179                 return;
1180
1181         if (flags & PERF_EF_RELOAD) {
1182                 WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
1183                 x86_perf_event_set_period(event);
1184         }
1185
1186         event->hw.state = 0;
1187
1188         cpuc->events[idx] = event;
1189         __set_bit(idx, cpuc->active_mask);
1190         __set_bit(idx, cpuc->running);
1191         x86_pmu.enable(event);
1192         perf_event_update_userpage(event);
1193 }
1194
1195 void perf_event_print_debug(void)
1196 {
1197         u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1198         u64 pebs;
1199         struct cpu_hw_events *cpuc;
1200         unsigned long flags;
1201         int cpu, idx;
1202
1203         if (!x86_pmu.num_counters)
1204                 return;
1205
1206         local_irq_save(flags);
1207
1208         cpu = smp_processor_id();
1209         cpuc = &per_cpu(cpu_hw_events, cpu);
1210
1211         if (x86_pmu.version >= 2) {
1212                 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
1213                 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1214                 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1215                 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1216                 rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
1217
1218                 pr_info("\n");
1219                 pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
1220                 pr_info("CPU#%d: status:     %016llx\n", cpu, status);
1221                 pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
1222                 pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
1223                 pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
1224         }
1225         pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1226
1227         for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1228                 rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
1229                 rdmsrl(x86_pmu_event_addr(idx), pmc_count);
1230
1231                 prev_left = per_cpu(pmc_prev_left[idx], cpu);
1232
1233                 pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
1234                         cpu, idx, pmc_ctrl);
1235                 pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
1236                         cpu, idx, pmc_count);
1237                 pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
1238                         cpu, idx, prev_left);
1239         }
1240         for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1241                 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1242
1243                 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
1244                         cpu, idx, pmc_count);
1245         }
1246         local_irq_restore(flags);
1247 }
1248
1249 static void x86_pmu_stop(struct perf_event *event, int flags)
1250 {
1251         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1252         struct hw_perf_event *hwc = &event->hw;
1253
1254         if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
1255                 x86_pmu.disable(event);
1256                 cpuc->events[hwc->idx] = NULL;
1257                 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
1258                 hwc->state |= PERF_HES_STOPPED;
1259         }
1260
1261         if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
1262                 /*
1263                  * Drain the remaining delta count out of an event
1264                  * that we are disabling:
1265                  */
1266                 x86_perf_event_update(event);
1267                 hwc->state |= PERF_HES_UPTODATE;
1268         }
1269 }
1270
1271 static void x86_pmu_del(struct perf_event *event, int flags)
1272 {
1273         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1274         int i;
1275
1276         /*
1277          * If we're called during a txn, we don't need to do anything.
1278          * The events never got scheduled and ->cancel_txn will truncate
1279          * the event_list.
1280          */
1281         if (cpuc->group_flag & PERF_EVENT_TXN)
1282                 return;
1283
1284         x86_pmu_stop(event, PERF_EF_UPDATE);
1285
1286         for (i = 0; i < cpuc->n_events; i++) {
1287                 if (event == cpuc->event_list[i]) {
1288
1289                         if (x86_pmu.put_event_constraints)
1290                                 x86_pmu.put_event_constraints(cpuc, event);
1291
1292                         while (++i < cpuc->n_events)
1293                                 cpuc->event_list[i-1] = cpuc->event_list[i];
1294
1295                         --cpuc->n_events;
1296                         break;
1297                 }
1298         }
1299         perf_event_update_userpage(event);
1300 }
1301
1302 static int x86_pmu_handle_irq(struct pt_regs *regs)
1303 {
1304         struct perf_sample_data data;
1305         struct cpu_hw_events *cpuc;
1306         struct perf_event *event;
1307         int idx, handled = 0;
1308         u64 val;
1309
1310         perf_sample_data_init(&data, 0);
1311
1312         cpuc = &__get_cpu_var(cpu_hw_events);
1313
1314         /*
1315          * Some chipsets need to unmask the LVTPC in a particular spot
1316          * inside the nmi handler.  As a result, the unmasking was pushed
1317          * into all the nmi handlers.
1318          *
1319          * This generic handler doesn't seem to have any issues where the
1320          * unmasking occurs so it was left at the top.
1321          */
1322         apic_write(APIC_LVTPC, APIC_DM_NMI);
1323
1324         for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1325                 if (!test_bit(idx, cpuc->active_mask)) {
1326                         /*
1327                          * Though we deactivated the counter, some CPUs
1328                          * might still deliver spurious interrupts that
1329                          * are still in flight. Catch them:
1330                          */
1331                         if (__test_and_clear_bit(idx, cpuc->running))
1332                                 handled++;
1333                         continue;
1334                 }
1335
1336                 event = cpuc->events[idx];
1337
1338                 val = x86_perf_event_update(event);
1339                 if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
1340                         continue;
1341
1342                 /*
1343                  * event overflow
1344                  */
1345                 handled++;
1346                 data.period     = event->hw.last_period;
1347
1348                 if (!x86_perf_event_set_period(event))
1349                         continue;
1350
1351                 if (perf_event_overflow(event, &data, regs))
1352                         x86_pmu_stop(event, 0);
1353         }
1354
1355         if (handled)
1356                 inc_irq_stat(apic_perf_irqs);
1357
1358         return handled;
1359 }
1360
1361 void perf_events_lapic_init(void)
1362 {
1363         if (!x86_pmu.apic || !x86_pmu_initialized())
1364                 return;
1365
1366         /*
1367          * Always use NMI for PMU
1368          */
1369         apic_write(APIC_LVTPC, APIC_DM_NMI);
1370 }
1371
1372 struct pmu_nmi_state {
1373         unsigned int    marked;
1374         int             handled;
1375 };
1376
1377 static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);
1378
1379 static int __kprobes
1380 perf_event_nmi_handler(struct notifier_block *self,
1381                          unsigned long cmd, void *__args)
1382 {
1383         struct die_args *args = __args;
1384         unsigned int this_nmi;
1385         int handled;
1386
1387         if (!atomic_read(&active_events))
1388                 return NOTIFY_DONE;
1389
1390         switch (cmd) {
1391         case DIE_NMI:
1392                 break;
1393         case DIE_NMIUNKNOWN:
1394                 this_nmi = percpu_read(irq_stat.__nmi_count);
1395                 if (this_nmi != __this_cpu_read(pmu_nmi.marked))
1396                         /* let the kernel handle the unknown nmi */
1397                         return NOTIFY_DONE;
1398                 /*
1399                  * This one is a PMU back-to-back nmi. Two events
1400                  * trigger 'simultaneously' raising two back-to-back
1401                  * NMIs. If the first NMI handles both, the latter
1402                  * will be empty and daze the CPU. So, we drop it to
1403                  * avoid false-positive 'unknown nmi' messages.
1404                  */
1405                 return NOTIFY_STOP;
1406         default:
1407                 return NOTIFY_DONE;
1408         }
1409
1410         handled = x86_pmu.handle_irq(args->regs);
1411         if (!handled)
1412                 return NOTIFY_DONE;
1413
1414         this_nmi = percpu_read(irq_stat.__nmi_count);
1415         if ((handled > 1) ||
1416                 /* the next nmi could be a back-to-back nmi */
1417             ((__this_cpu_read(pmu_nmi.marked) == this_nmi) &&
1418              (__this_cpu_read(pmu_nmi.handled) > 1))) {
1419                 /*
1420                  * We could have two subsequent back-to-back nmis: The
1421                  * first handles more than one counter, the 2nd
1422                  * handles only one counter and the 3rd handles no
1423                  * counter.
1424                  *
1425                  * This is the 2nd nmi because the previous was
1426                  * handling more than one counter. We will mark the
1427                  * next (3rd) and then drop it if unhandled.
1428                  */
1429                 __this_cpu_write(pmu_nmi.marked, this_nmi + 1);
1430                 __this_cpu_write(pmu_nmi.handled, handled);
1431         }
1432
1433         return NOTIFY_STOP;
1434 }
1435
1436 static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1437         .notifier_call          = perf_event_nmi_handler,
1438         .next                   = NULL,
1439         .priority               = NMI_LOCAL_LOW_PRIOR,
1440 };
1441
1442 static struct event_constraint unconstrained;
1443 static struct event_constraint emptyconstraint;
1444
1445 static struct event_constraint *
1446 x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1447 {
1448         struct event_constraint *c;
1449
1450         if (x86_pmu.event_constraints) {
1451                 for_each_event_constraint(c, x86_pmu.event_constraints) {
1452                         if ((event->hw.config & c->cmask) == c->code)
1453                                 return c;
1454                 }
1455         }
1456
1457         return &unconstrained;
1458 }
1459
1460 #include "perf_event_amd.c"
1461 #include "perf_event_p6.c"
1462 #include "perf_event_p4.c"
1463 #include "perf_event_intel_lbr.c"
1464 #include "perf_event_intel_ds.c"
1465 #include "perf_event_intel.c"
1466
1467 static int __cpuinit
1468 x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
1469 {
1470         unsigned int cpu = (long)hcpu;
1471         struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1472         int ret = NOTIFY_OK;
1473
1474         switch (action & ~CPU_TASKS_FROZEN) {
1475         case CPU_UP_PREPARE:
1476                 cpuc->kfree_on_online = NULL;
1477                 if (x86_pmu.cpu_prepare)
1478                         ret = x86_pmu.cpu_prepare(cpu);
1479                 break;
1480
1481         case CPU_STARTING:
1482                 if (x86_pmu.cpu_starting)
1483                         x86_pmu.cpu_starting(cpu);
1484                 break;
1485
1486         case CPU_ONLINE:
1487                 kfree(cpuc->kfree_on_online);
1488                 break;
1489
1490         case CPU_DYING:
1491                 if (x86_pmu.cpu_dying)
1492                         x86_pmu.cpu_dying(cpu);
1493                 break;
1494
1495         case CPU_UP_CANCELED:
1496         case CPU_DEAD:
1497                 if (x86_pmu.cpu_dead)
1498                         x86_pmu.cpu_dead(cpu);
1499                 break;
1500
1501         default:
1502                 break;
1503         }
1504
1505         return ret;
1506 }
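/*
 * kfree_on_online exists because ->cpu_starting() runs on the freshly
 * booting CPU in a context where kfree() is to be avoided (hence the
 * "Avoid kfree() in CPU_STARTING" change). Callbacks stash memory to be
 * freed in cpuc->kfree_on_online, and CPU_ONLINE, which runs later in a
 * normal context, performs the actual kfree().
 */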
1507
1508 static void __init pmu_check_apic(void)
1509 {
1510         if (cpu_has_apic)
1511                 return;
1512
1513         x86_pmu.apic = 0;
1514         pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1515         pr_info("no hardware sampling interrupt available.\n");
1516 }
1517
1518 static int __init init_hw_perf_events(void)
1519 {
1520         struct event_constraint *c;
1521         int err;
1522
1523         pr_info("Performance Events: ");
1524
1525         switch (boot_cpu_data.x86_vendor) {
1526         case X86_VENDOR_INTEL:
1527                 err = intel_pmu_init();
1528                 break;
1529         case X86_VENDOR_AMD:
1530                 err = amd_pmu_init();
1531                 break;
1532         default:
1533                 return 0;
1534         }
1535         if (err != 0) {
1536                 pr_cont("no PMU driver, software events only.\n");
1537                 return 0;
1538         }
1539
1540         pmu_check_apic();
1541
1542         /* sanity check that the hardware exists or is emulated */
1543         if (!check_hw_exists())
1544                 return 0;
1545
1546         pr_cont("%s PMU driver.\n", x86_pmu.name);
1547
1548         if (x86_pmu.quirks)
1549                 x86_pmu.quirks();
1550
1551         if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1552                 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
1553                      x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1554                 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1555         }
1556         x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1557
1558         if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1559                 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
1560                      x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1561                 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1562         }
1563
1564         x86_pmu.intel_ctrl |=
1565                 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1566
1567         perf_events_lapic_init();
1568         register_die_notifier(&perf_event_nmi_notifier);
1569
1570         unconstrained = (struct event_constraint)
1571                 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1572                                    0, x86_pmu.num_counters);
1573
1574         if (x86_pmu.event_constraints) {
1575                 for_each_event_constraint(c, x86_pmu.event_constraints) {
1576                         if (c->cmask != X86_RAW_EVENT_MASK)
1577                                 continue;
1578
1579                         c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
1580                         c->weight += x86_pmu.num_counters;
1581                 }
1582         }
1583
1584         pr_info("... version:                %d\n",     x86_pmu.version);
1585         pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
1586         pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
1587         pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
1588         pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
1589         pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
1590         pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
1591
1592         perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
1593         perf_cpu_notifier(x86_pmu_notifier);
1594
1595         return 0;
1596 }
1597 early_initcall(init_hw_perf_events);
1598
1599 static inline void x86_pmu_read(struct perf_event *event)
1600 {
1601         x86_perf_event_update(event);
1602 }
1603
1604 /*
1605  * Start group events scheduling transaction
1606  * Set the flag to make pmu::enable() not perform the
1607  * schedulability test, it will be performed at commit time
1608  */
1609 static void x86_pmu_start_txn(struct pmu *pmu)
1610 {
1611         perf_pmu_disable(pmu);
1612         __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
1613         __this_cpu_write(cpu_hw_events.n_txn, 0);
1614 }
1615
1616 /*
1617  * Stop group events scheduling transaction
1618  * Clear the flag so pmu::enable() will perform the
1619  * schedulability test.
1620  */
1621 static void x86_pmu_cancel_txn(struct pmu *pmu)
1622 {
1623         __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
1624         /*
1625          * Truncate the collected events.
1626          */
1627         __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
1628         __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
1629         perf_pmu_enable(pmu);
1630 }
1631
1632 /*
1633  * Commit group events scheduling transaction
1634  * Perform the group schedulability test as a whole
1635  * Return 0 on success
1636  */
1637 static int x86_pmu_commit_txn(struct pmu *pmu)
1638 {
1639         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1640         int assign[X86_PMC_IDX_MAX];
1641         int n, ret;
1642
1643         n = cpuc->n_events;
1644
1645         if (!x86_pmu_initialized())
1646                 return -EAGAIN;
1647
1648         ret = x86_pmu.schedule_events(cpuc, n, assign);
1649         if (ret)
1650                 return ret;
1651
1652         /*
1653          * copy the new assignment now that we know it is possible;
1654          * it will be used by hw_perf_enable()
1655          */
1656         memcpy(cpuc->assign, assign, n*sizeof(int));
1657
1658         cpuc->group_flag &= ~PERF_EVENT_TXN;
1659         perf_pmu_enable(pmu);
1660         return 0;
1661 }
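
/*
 * Illustrative sketch of how the core perf layer is expected to drive the
 * three transaction callbacks above when scheduling a whole group: start
 * the transaction, ->add() every member without touching the hardware,
 * then let ->commit_txn() run a single schedulability test for the group,
 * falling back to ->cancel_txn() on failure.  Kept under #if 0 so it is
 * never built; sketch_group_sched_in() is a hypothetical caller, not a
 * function in this file.
 */
#if 0
static int sketch_group_sched_in(struct pmu *pmu, struct perf_event **events,
                                 int nr)
{
        int i, added = 0;

        pmu->start_txn(pmu);            /* defer the schedulability test */

        for (i = 0; i < nr; i++) {
                if (pmu->add(events[i], PERF_EF_START))
                        goto fail;
                added++;
        }

        if (!pmu->commit_txn(pmu))      /* one test for the whole group */
                return 0;
fail:
        while (added--)
                pmu->del(events[added], 0);
        pmu->cancel_txn(pmu);           /* undo the n_added/n_events bookkeeping */
        return -EAGAIN;
}
#endif
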
1662 /*
1663  * A fake_cpuc is used to validate event groups. Due to
1664  * the extra reg logic, we also need to allocate a fake
1665  * per_core and per_cpu structure. Otherwise, group events
1666  * using an extra reg may conflict without the kernel being
1667  * able to catch this when the last event gets added to
1668  * the group.
1669  */
1670 static void free_fake_cpuc(struct cpu_hw_events *cpuc)
1671 {
1672         kfree(cpuc->shared_regs);
1673         kfree(cpuc);
1674 }
1675
1676 static struct cpu_hw_events *allocate_fake_cpuc(void)
1677 {
1678         struct cpu_hw_events *cpuc;
1679         int cpu = raw_smp_processor_id();
1680
1681         cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
1682         if (!cpuc)
1683                 return ERR_PTR(-ENOMEM);
1684
1685         /* only needed if we have extra_regs */
1686         if (x86_pmu.extra_regs) {
1687                 cpuc->shared_regs = allocate_shared_regs(cpu);
1688                 if (!cpuc->shared_regs)
1689                         goto error;
1690         }
1691         return cpuc;
1692 error:
1693         free_fake_cpuc(cpuc);
1694         return ERR_PTR(-ENOMEM);
1695 }
1696
1697 /*
1698  * validate that we can schedule this event
1699  */
1700 static int validate_event(struct perf_event *event)
1701 {
1702         struct cpu_hw_events *fake_cpuc;
1703         struct event_constraint *c;
1704         int ret = 0;
1705
1706         fake_cpuc = allocate_fake_cpuc();
1707         if (IS_ERR(fake_cpuc))
1708                 return PTR_ERR(fake_cpuc);
1709
1710         c = x86_pmu.get_event_constraints(fake_cpuc, event);
1711
1712         if (!c || !c->weight)
1713                 ret = -ENOSPC;
1714
1715         if (x86_pmu.put_event_constraints)
1716                 x86_pmu.put_event_constraints(fake_cpuc, event);
1717
1718         free_fake_cpuc(fake_cpuc);
1719
1720         return ret;
1721 }
1722
1723 /*
1724  * validate a single event group
1725  *
1726  * validation includes:
1727  *      - check events are compatible with each other
1728  *      - events do not compete for the same counter
1729  *      - number of events <= number of counters
1730  *
1731  * validation ensures the group can be loaded onto the
1732  * PMU if it was the only group available.
1733  */
1734 static int validate_group(struct perf_event *event)
1735 {
1736         struct perf_event *leader = event->group_leader;
1737         struct cpu_hw_events *fake_cpuc;
1738         int ret = -ENOSPC, n;
1739
1740         fake_cpuc = allocate_fake_cpuc();
1741         if (IS_ERR(fake_cpuc))
1742                 return PTR_ERR(fake_cpuc);
1743         /*
1744          * the event is not yet connected with its
1745          * siblings; therefore we must first collect
1746          * existing siblings, then add the new event
1747          * before we can simulate the scheduling
1748          */
1749         n = collect_events(fake_cpuc, leader, true);
1750         if (n < 0)
1751                 goto out;
1752
1753         fake_cpuc->n_events = n;
1754         n = collect_events(fake_cpuc, event, false);
1755         if (n < 0)
1756                 goto out;
1757
1758         fake_cpuc->n_events = n;
1759
1760         ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
1761
1762 out:
1763         free_fake_cpuc(fake_cpuc);
1764         return ret;
1765 }
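
/*
 * Userspace-side sketch of what this validation buys: when a group asks
 * for more hardware events than the PMU can co-schedule, the simulated
 * scheduling above fails and the perf_event_open() call that overcommits
 * the group is rejected up front, rather than creating a group that can
 * never go on.  open_cycles() is a hypothetical helper, not part of the
 * perf API.
 */
#if 0
/* userspace */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

/* open a CPU-cycles event, grouped under group_fd (-1 for a group leader) */
static int open_cycles(int group_fd)
{
        struct perf_event_attr attr = {
                .size   = sizeof(attr),
                .type   = PERF_TYPE_HARDWARE,
                .config = PERF_COUNT_HW_CPU_CYCLES,
        };

        return syscall(__NR_perf_event_open, &attr, 0, -1, group_fd, 0);
}

/*
 * int leader = open_cycles(-1);
 * Repeatedly calling open_cycles(leader) eventually fails once
 * validate_group() can no longer schedule the whole group.
 */
#endif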
1766
1767 static int x86_pmu_event_init(struct perf_event *event)
1768 {
1769         struct pmu *tmp;
1770         int err;
1771
1772         switch (event->attr.type) {
1773         case PERF_TYPE_RAW:
1774         case PERF_TYPE_HARDWARE:
1775         case PERF_TYPE_HW_CACHE:
1776                 break;
1777
1778         default:
1779                 return -ENOENT;
1780         }
1781
1782         err = __x86_pmu_event_init(event);
1783         if (!err) {
1784                 /*
1785                  * we temporarily connect the event to its pmu
1786                  * such that validate_group() can classify
1787                  * it as an x86 event using is_x86_event()
1788                  */
1789                 tmp = event->pmu;
1790                 event->pmu = &pmu;
1791
1792                 if (event->group_leader != event)
1793                         err = validate_group(event);
1794                 else
1795                         err = validate_event(event);
1796
1797                 event->pmu = tmp;
1798         }
1799         if (err) {
1800                 if (event->destroy)
1801                         event->destroy(event);
1802         }
1803
1804         return err;
1805 }
1806
1807 static struct pmu pmu = {
1808         .pmu_enable     = x86_pmu_enable,
1809         .pmu_disable    = x86_pmu_disable,
1810
1811         .event_init     = x86_pmu_event_init,
1812
1813         .add            = x86_pmu_add,
1814         .del            = x86_pmu_del,
1815         .start          = x86_pmu_start,
1816         .stop           = x86_pmu_stop,
1817         .read           = x86_pmu_read,
1818
1819         .start_txn      = x86_pmu_start_txn,
1820         .cancel_txn     = x86_pmu_cancel_txn,
1821         .commit_txn     = x86_pmu_commit_txn,
1822 };
1823
1824 /*
1825  * callchain support
1826  */
1827
1828 static int backtrace_stack(void *data, char *name)
1829 {
1830         return 0;
1831 }
1832
1833 static void backtrace_address(void *data, unsigned long addr, int reliable)
1834 {
1835         struct perf_callchain_entry *entry = data;
1836
1837         perf_callchain_store(entry, addr);
1838 }
1839
1840 static const struct stacktrace_ops backtrace_ops = {
1841         .stack                  = backtrace_stack,
1842         .address                = backtrace_address,
1843         .walk_stack             = print_context_stack_bp,
1844 };
1845
1846 void
1847 perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
1848 {
1849         if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1850                 /* TODO: We don't support guest OS callchains yet */
1851                 return;
1852         }
1853
1854         perf_callchain_store(entry, regs->ip);
1855
1856         dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
1857 }
1858
1859 #ifdef CONFIG_COMPAT
1860 static inline int
1861 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1862 {
1863         /* 32-bit process in 64-bit kernel. */
1864         struct stack_frame_ia32 frame;
1865         const void __user *fp;
1866
1867         if (!test_thread_flag(TIF_IA32))
1868                 return 0;
1869
1870         fp = compat_ptr(regs->bp);
1871         while (entry->nr < PERF_MAX_STACK_DEPTH) {
1872                 unsigned long bytes;
1873                 frame.next_frame     = 0;
1874                 frame.return_address = 0;
1875
1876                 bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
1877                 if (bytes != sizeof(frame))
1878                         break;
1879
1880                 if (fp < compat_ptr(regs->sp))
1881                         break;
1882
1883                 perf_callchain_store(entry, frame.return_address);
1884                 fp = compat_ptr(frame.next_frame);
1885         }
1886         return 1;
1887 }
1888 #else
1889 static inline int
1890 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1891 {
1892         return 0;
1893 }
1894 #endif
1895
1896 void
1897 perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
1898 {
1899         struct stack_frame frame;
1900         const void __user *fp;
1901
1902         if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1903                 /* TODO: We don't support guest OS callchains yet */
1904                 return;
1905         }
1906
1907         fp = (void __user *)regs->bp;
1908
1909         perf_callchain_store(entry, regs->ip);
1910
1911         if (perf_callchain_user32(regs, entry))
1912                 return;
1913
1914         while (entry->nr < PERF_MAX_STACK_DEPTH) {
1915                 unsigned long bytes;
1916                 frame.next_frame     = NULL;
1917                 frame.return_address = 0;
1918
1919                 bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
1920                 if (bytes != sizeof(frame))
1921                         break;
1922
1923                 if ((unsigned long)fp < regs->sp)
1924                         break;
1925
1926                 perf_callchain_store(entry, frame.return_address);
1927                 fp = frame.next_frame;
1928         }
1929 }
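
/*
 * Sketch of the frame layout the two walkers above rely on; the
 * authoritative definitions come from <asm/stacktrace.h>.  Each saved
 * frame pointer read with copy_from_user_nmi() is expected to hold a
 * pointer to the caller's frame followed by the return address.
 */
#if 0
struct stack_frame {                    /* native frames */
        struct stack_frame      *next_frame;
        unsigned long           return_address;
};

struct stack_frame_ia32 {               /* compat (32-bit) frames */
        u32                     next_frame;
        u32                     return_address;
};
#endif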
1930
1931 unsigned long perf_instruction_pointer(struct pt_regs *regs)
1932 {
1933         unsigned long ip;
1934
1935         if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
1936                 ip = perf_guest_cbs->get_guest_ip();
1937         else
1938                 ip = instruction_pointer(regs);
1939
1940         return ip;
1941 }
1942
1943 unsigned long perf_misc_flags(struct pt_regs *regs)
1944 {
1945         int misc = 0;
1946
1947         if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1948                 if (perf_guest_cbs->is_user_mode())
1949                         misc |= PERF_RECORD_MISC_GUEST_USER;
1950                 else
1951                         misc |= PERF_RECORD_MISC_GUEST_KERNEL;
1952         } else {
1953                 if (user_mode(regs))
1954                         misc |= PERF_RECORD_MISC_USER;
1955                 else
1956                         misc |= PERF_RECORD_MISC_KERNEL;
1957         }
1958
1959         if (regs->flags & PERF_EFLAGS_EXACT)
1960                 misc |= PERF_RECORD_MISC_EXACT_IP;
1961
1962         return misc;
1963 }
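
/*
 * Example of the resulting combinations: a sample taken in guest user mode
 * with PERF_EFLAGS_EXACT set ends up tagged
 * PERF_RECORD_MISC_GUEST_USER | PERF_RECORD_MISC_EXACT_IP, while the same
 * kind of sample taken in host kernel mode is tagged
 * PERF_RECORD_MISC_KERNEL | PERF_RECORD_MISC_EXACT_IP.
 */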