perf/x86/intel: Support task events with Intel CQM

author Matt Fleming <matt.fleming@intel.com>

Fri, 23 Jan 2015 18:45:46 +0000 (18:45 +0000)

committer Ingo Molnar <mingo@kernel.org>

Wed, 25 Feb 2015 12:53:34 +0000 (13:53 +0100)
author Matt Fleming <matt.fleming@intel.com>
Fri, 23 Jan 2015 18:45:46 +0000 (18:45 +0000)
committer Ingo Molnar <mingo@kernel.org>
Wed, 25 Feb 2015 12:53:34 +0000 (13:53 +0100)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c

index b5d9d746dbc0a9a779c7bfbac4a2d0e267c5d1ea..8003d87afd898dbaab46de178f77902cf302035d 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -182,23 +182,124 @@ fail:
  
  /*
   * Determine if @a and @b measure the same set of tasks.
+ *
+ * If @a and @b measure the same set of tasks then we want to share a
+ * single RMID.
   */
  static bool __match_event(struct perf_event *a, struct perf_event *b)
  {
+       /* Per-cpu and task events don't mix */
         if ((a->attach_state & PERF_ATTACH_TASK) !=
             (b->attach_state & PERF_ATTACH_TASK))
                 return false;
  
-       /* not task */
+#ifdef CONFIG_CGROUP_PERF
+       if (a->cgrp != b->cgrp)
+               return false;
+#endif
+
+       /* If not task event, we're machine wide */
+       if (!(b->attach_state & PERF_ATTACH_TASK))
+               return true;
+
+       /*
+        * Events that target same task are placed into the same cache group.
+        */
+       if (a->hw.cqm_target == b->hw.cqm_target)
+               return true;
+
+       /*
+        * Are we an inherited event?
+        */
+       if (b->parent == a)
+               return true;
+
+       return false;
+}
+
+#ifdef CONFIG_CGROUP_PERF
+static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
+{
+       if (event->attach_state & PERF_ATTACH_TASK)
+               return perf_cgroup_from_task(event->hw.cqm_target);
  
-       return true; /* if not task, we're machine wide */
+       return event->cgrp;
  }
+#endif
  
  /*
   * Determine if @a's tasks intersect with @b's tasks
+ *
+ * There are combinations of events that we explicitly prohibit:
+ *
+ *                        PROHIBITS
+ *     system-wide    ->  cgroup and task
+ *     cgroup         ->  system-wide
+ *                    ->  task in cgroup
+ *     task           ->  system-wide
+ *                    ->  task in cgroup
+ *
+ * Call this function before allocating an RMID.
   */
  static bool __conflict_event(struct perf_event *a, struct perf_event *b)
  {
+#ifdef CONFIG_CGROUP_PERF
+       /*
+        * We can have any number of cgroups but only one system-wide
+        * event at a time.
+        */
+       if (a->cgrp && b->cgrp) {
+               struct perf_cgroup *ac = a->cgrp;
+               struct perf_cgroup *bc = b->cgrp;
+
+               /*
+                * This condition should have been caught in
+                * __match_event() and we should be sharing an RMID.
+                */
+               WARN_ON_ONCE(ac == bc);
+
+               if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+                   cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+                       return true;
+
+               return false;
+       }
+
+       if (a->cgrp || b->cgrp) {
+               struct perf_cgroup *ac, *bc;
+
+               /*
+                * cgroup and system-wide events are mutually exclusive
+                */
+               if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
+                   (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
+                       return true;
+
+               /*
+                * Ensure neither event is part of the other's cgroup
+                */
+               ac = event_to_cgroup(a);
+               bc = event_to_cgroup(b);
+               if (ac == bc)
+                       return true;
+
+               /*
+                * Must have cgroup and non-intersecting task events.
+                */
+               if (!ac || !bc)
+                       return false;
+
+               /*
+                * We have cgroup and task events, and the task belongs
+                * to a cgroup. Check for overlap.
+                */
+               if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+                   cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+                       return true;
+
+               return false;
+       }
+#endif
         /*
          * If one of them is not a task, same story as above with cgroups.
          */
@@ -245,9 +346,16 @@ static int intel_cqm_setup_event(struct perf_event *event,
  
  static void intel_cqm_event_read(struct perf_event *event)
  {
-       unsigned long rmid = event->hw.cqm_rmid;
+       unsigned long rmid;
         u64 val;
  
+       /*
+        * Task events are handled by intel_cqm_event_count().
+        */
+       if (event->cpu == -1)
+               return;
+
+       rmid = event->hw.cqm_rmid;
         val = __rmid_read(rmid);
  
         /*
@@ -259,6 +367,63 @@ static void intel_cqm_event_read(struct perf_event *event)
         local64_set(&event->count, val);
  }
  
+struct rmid_read {
+       unsigned int rmid;
+       atomic64_t value;
+};
+
+static void __intel_cqm_event_count(void *info)
+{
+       struct rmid_read *rr = info;
+       u64 val;
+
+       val = __rmid_read(rr->rmid);
+
+       if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+               return;
+
+       atomic64_add(val, &rr->value);
+}
+
+static inline bool cqm_group_leader(struct perf_event *event)
+{
+       return !list_empty(&event->hw.cqm_groups_entry);
+}
+
+static u64 intel_cqm_event_count(struct perf_event *event)
+{
+       struct rmid_read rr = {
+               .rmid = event->hw.cqm_rmid,
+               .value = ATOMIC64_INIT(0),
+       };
+
+       /*
+        * We only need to worry about task events. System-wide events
+        * are handled like usual, i.e. entirely with
+        * intel_cqm_event_read().
+        */
+       if (event->cpu != -1)
+               return __perf_event_count(event);
+
+       /*
+        * Only the group leader gets to report values. This stops us
+        * reporting duplicate values to userspace, and gives us a clear
+        * rule for which task gets to report the values.
+        *
+        * Note that it is impossible to attribute these values to
+        * specific packages - we forfeit that ability when we create
+        * task events.
+        */
+       if (!cqm_group_leader(event))
+               return 0;
+
+       on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+
+       local64_set(&event->count, atomic64_read(&rr.value));
+
+       return __perf_event_count(event);
+}
+
  static void intel_cqm_event_start(struct perf_event *event, int mode)
  {
         struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
@@ -344,7 +509,7 @@ static void intel_cqm_event_destroy(struct perf_event *event)
         /*
          * And we're the group leader..
          */
-       if (!list_empty(&event->hw.cqm_groups_entry)) {
+       if (cqm_group_leader(event)) {
                 /*
                  * If there was a group_other, make that leader, otherwise
                  * destroy the group and return the RMID.
@@ -365,17 +530,6 @@ static void intel_cqm_event_destroy(struct perf_event *event)
  
  static struct pmu intel_cqm_pmu;
  
-/*
- * XXX there's a bit of a problem in that we cannot simply do the one
- * event per node as one would want, since that one event would one get
- * scheduled on the one cpu. But we want to 'schedule' the RMID on all
- * CPUs.
- *
- * This means we want events for each CPU, however, that generates a lot
- * of duplicate values out to userspace -- this is not to be helped
- * unless we want to change the core code in some way. Fore more info,
- * see intel_cqm_event_read().
- */
  static int intel_cqm_event_init(struct perf_event *event)
  {
         struct perf_event *group = NULL;
@@ -387,9 +541,6 @@ static int intel_cqm_event_init(struct perf_event *event)
         if (event->attr.config & ~QOS_EVENT_MASK)
                 return -EINVAL;
  
-       if (event->cpu == -1)
-               return -EINVAL;
-
         /* unsupported modes and filters */
         if (event->attr.exclude_user   ||
             event->attr.exclude_kernel ||
@@ -407,7 +558,8 @@ static int intel_cqm_event_init(struct perf_event *event)
  
         mutex_lock(&cache_mutex);
  
-       err = intel_cqm_setup_event(event, &group); /* will also set rmid */
+       /* Will also set rmid */
+       err = intel_cqm_setup_event(event, &group);
         if (err)
                 goto out;
  
@@ -470,6 +622,7 @@ static struct pmu intel_cqm_pmu = {
         .start          = intel_cqm_event_start,
         .stop           = intel_cqm_event_stop,
         .read           = intel_cqm_event_read,
+       .count          = intel_cqm_event_count,
  };
  
  static inline void cqm_pick_event_reader(int cpu)
@@ -599,8 +752,8 @@ static int __init intel_cqm_init(void)
  
         __perf_cpu_notifier(intel_cqm_cpu_notifier);
  
-       ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
-
+       ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm",
+                               PERF_TYPE_INTEL_CQM);
         if (ret)
                 pr_err("Intel CQM perf registration failed: %d\n", ret);
         else
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h

index ca5504c48f4f9778c2cb530420fd861524a43cd7..dac4c2831d821038a337a9450f7f0dfc73cef8a0 100644 (file)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -129,6 +129,7 @@ struct hw_perf_event {
                         struct list_head        cqm_events_entry;
                         struct list_head        cqm_groups_entry;
                         struct list_head        cqm_group_entry;
+                       struct task_struct      *cqm_target;
                 };
  #ifdef CONFIG_HAVE_HW_BREAKPOINT
                 struct { /* breakpoint */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h

index 1e3cd07cf76e29269756c7a162369eb3a4fcdf14..3c8b45de57eccaaf63b979c3a82c004a47c10e69 100644 (file)
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -32,6 +32,7 @@ enum perf_type_id {
         PERF_TYPE_HW_CACHE                      = 3,
         PERF_TYPE_RAW                           = 4,
         PERF_TYPE_BREAKPOINT                    = 5,
+       PERF_TYPE_INTEL_CQM                     = 6,
  
         PERF_TYPE_MAX,                          /* non-ABI */
  };
diff --git a/kernel/events/core.c b/kernel/events/core.c

index 1fc3bae5904ac77ede9889db98ba40f8a3881409..71109a045450665f72926db828e6b716c7dbd9f4 100644 (file)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7181,6 +7181,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
                 else if (attr->type == PERF_TYPE_BREAKPOINT)
                         event->hw.bp_target = task;
  #endif
+               else if (attr->type == PERF_TYPE_INTEL_CQM)
+                       event->hw.cqm_target = task;
         }
  
         if (!overflow_handler && parent_event) {
author	Matt Fleming <matt.fleming@intel.com>
	Fri, 23 Jan 2015 18:45:46 +0000 (18:45 +0000)
committer	Ingo Molnar <mingo@kernel.org>
	Wed, 25 Feb 2015 12:53:34 +0000 (13:53 +0100)
arch/x86/kernel/cpu/perf_event_intel_cqm.c		patch \| blob \| history
include/linux/perf_event.h		patch \| blob \| history
include/uapi/linux/perf_event.h		patch \| blob \| history
kernel/events/core.c		patch \| blob \| history