Merge remote-tracking branch 'cgroup/for-next'

[karo-tx-linux.git] / kernel / events / core.c
diff --git a/kernel/events/core.c b/kernel/events/core.c

index e9874949c78734d2c662845aedb69d79ccd5b839..1a734e0adfa78259dac6cae57239a2d5f992a9ac 100644 (file)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -196,7 +196,7 @@ static int perf_sample_period_ns __read_mostly      = DEFAULT_SAMPLE_PERIOD_NS;
  static int perf_sample_allowed_ns __read_mostly =
         DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
  
-void update_perf_cpu_limits(void)
+static void update_perf_cpu_limits(void)
  {
         u64 tmp = perf_sample_period_ns;
  
@@ -472,7 +472,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
   * mode SWOUT : schedule out everything
   * mode SWIN : schedule in based on cgroup for next
   */
-void perf_cgroup_switch(struct task_struct *task, int mode)
+static void perf_cgroup_switch(struct task_struct *task, int mode)
  {
         struct perf_cpu_context *cpuctx;
         struct pmu *pmu;
@@ -1243,11 +1243,7 @@ static inline void perf_event__state_init(struct perf_event *event)
                                               PERF_EVENT_STATE_INACTIVE;
  }
  
-/*
- * Called at perf_event creation and when events are attached/detached from a
- * group.
- */
-static void perf_event__read_size(struct perf_event *event)
+static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
  {
         int entry = sizeof(u64); /* value */
         int size = 0;
@@ -1263,7 +1259,7 @@ static void perf_event__read_size(struct perf_event *event)
                 entry += sizeof(u64);
  
         if (event->attr.read_format & PERF_FORMAT_GROUP) {
-               nr += event->group_leader->nr_siblings;
+               nr += nr_siblings;
                 size += sizeof(u64);
         }
  
@@ -1271,14 +1267,11 @@ static void perf_event__read_size(struct perf_event *event)
         event->read_size = size;
  }
  
-static void perf_event__header_size(struct perf_event *event)
+static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
  {
         struct perf_sample_data *data;
-       u64 sample_type = event->attr.sample_type;
         u16 size = 0;
  
-       perf_event__read_size(event);
-
         if (sample_type & PERF_SAMPLE_IP)
                 size += sizeof(data->ip);
  
@@ -1303,6 +1296,17 @@ static void perf_event__header_size(struct perf_event *event)
         event->header_size = size;
  }
  
+/*
+ * Recompute read_size (for the leader's current sibling count) and
+ * header_size; called at event creation and on group attach/detach.
+ */
+static void perf_event__header_size(struct perf_event *event)
+{
+       __perf_event_read_size(event,
+                              event->group_leader->nr_siblings);
+       __perf_event_header_size(event, event->attr.sample_type);
+}
+
  static void perf_event__id_header_size(struct perf_event *event)
  {
         struct perf_sample_data *data;
@@ -1330,6 +1334,27 @@ static void perf_event__id_header_size(struct perf_event *event)
         event->id_header_size = size;
  }
  
+static bool perf_event_validate_size(struct perf_event *event)
+{
+       /*
+        * Pre-check the record size assuming one more sibling than the leader
+        * has now; these values are over-written when we attach the event.
+        */
+       __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
+       __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
+       perf_event__id_header_size(event);
+
+       /*
+        * Sum the lot; records are limited to 64k, so reject at a conservative
+        * 16k to leave room for callchains and other variable fields.
+        */
+       if (event->read_size + event->header_size +
+           event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
+               return false;
+
+       return true;
+}
+
  static void perf_group_attach(struct perf_event *event)
  {
         struct perf_event *group_leader = event->group_leader, *pos;
@@ -1914,7 +1939,7 @@ group_sched_in(struct perf_event *group_event,
         if (group_event->state == PERF_EVENT_STATE_OFF)
                 return 0;
  
-       pmu->start_txn(pmu);
+       pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
  
         if (event_sched_in(group_event, cpuctx, ctx)) {
                 pmu->cancel_txn(pmu);
@@ -3184,14 +3209,22 @@ void perf_event_exec(void)
         rcu_read_unlock();
  }
  
+struct perf_read_data {
+       struct perf_event *event;       /* event to read */
+       bool group;                     /* also read @event's siblings */
+       int ret;                        /* 0, or ->commit_txn() result (group) */
+};
+
  /*
   * Cross CPU call to read the hardware event
   */
  static void __perf_event_read(void *info)
  {
-       struct perf_event *event = info;
+       struct perf_read_data *data = info;
+       struct perf_event *sub, *event = data->event;
         struct perf_event_context *ctx = event->ctx;
         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct pmu *pmu = event->pmu;
  
         /*
          * If this is a task context, we need to check whether it is
@@ -3208,9 +3241,35 @@ static void __perf_event_read(void *info)
                 update_context_time(ctx);
                 update_cgrp_time_from_event(event);
         }
+
         update_event_times(event);
-       if (event->state == PERF_EVENT_STATE_ACTIVE)
-               event->pmu->read(event);
+       if (event->state != PERF_EVENT_STATE_ACTIVE)
+               goto unlock;
+
+       if (!data->group) {
+               pmu->read(event);
+               data->ret = 0;
+               goto unlock;
+       }
+
+       pmu->start_txn(pmu, PERF_PMU_TXN_READ);
+
+       pmu->read(event);
+
+       list_for_each_entry(sub, &event->sibling_list, group_entry) {
+               update_event_times(sub);
+               if (sub->state == PERF_EVENT_STATE_ACTIVE) {
+                       /*
+                        * Use the sibling's PMU rather than @event's, since
+                        * a sibling may be on a different (e.g. software) PMU.
+                        */
+                       sub->pmu->read(sub);
+               }
+       }
+
+       data->ret = pmu->commit_txn(pmu);
+
+unlock:
         raw_spin_unlock(&ctx->lock);
  }
  
@@ -3275,15 +3334,23 @@ u64 perf_event_read_local(struct perf_event *event)
         return val;
  }
  
-static u64 perf_event_read(struct perf_event *event)
+static int perf_event_read(struct perf_event *event, bool group)
  {
+       int ret = 0;
+
         /*
          * If event is enabled and currently active on a CPU, update the
          * value in the event structure:
          */
         if (event->state == PERF_EVENT_STATE_ACTIVE) {
+               struct perf_read_data data = {
+                       .event = event,
+                       .group = group,
+                       .ret = 0,
+               };
                 smp_call_function_single(event->oncpu,
-                                        __perf_event_read, event, 1);
+                                        __perf_event_read, &data, 1);
+               ret = data.ret;
         } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
                 struct perf_event_context *ctx = event->ctx;
                 unsigned long flags;
@@ -3298,11 +3365,14 @@ static u64 perf_event_read(struct perf_event *event)
                         update_context_time(ctx);
                         update_cgrp_time_from_event(event);
                 }
-               update_event_times(event);
+               if (group)
+                       update_group_times(event);
+               else
+                       update_event_times(event);
                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
         }
  
-       return perf_event_count(event);
+       return ret;
  }
  
  /*
@@ -3744,7 +3814,7 @@ static void put_event(struct perf_event *event)
          *     see the comment there.
          *
          *  2) there is a lock-inversion with mmap_sem through
-        *     perf_event_read_group(), which takes faults while
+        *     perf_read_group(), which takes faults while
          *     holding ctx->mutex, however this is called after
          *     the last filedesc died, so there is no possibility
          *     to trigger the AB-BA case.
@@ -3818,14 +3888,18 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
         *running = 0;
  
         mutex_lock(&event->child_mutex);
-       total += perf_event_read(event);
+
+       (void)perf_event_read(event, false);
+       total += perf_event_count(event);
+
         *enabled += event->total_time_enabled +
                         atomic64_read(&event->child_total_time_enabled);
         *running += event->total_time_running +
                         atomic64_read(&event->child_total_time_running);
  
         list_for_each_entry(child, &event->child_list, child_list) {
-               total += perf_event_read(child);
+               (void)perf_event_read(child, false);
+               total += perf_event_count(child);
                 *enabled += child->total_time_enabled;
                 *running += child->total_time_running;
         }
@@ -3835,55 +3909,95 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
  }
  EXPORT_SYMBOL_GPL(perf_event_read_value);
  
-static int perf_event_read_group(struct perf_event *event,
-                                  u64 read_format, char __user *buf)
+static int __perf_read_group_add(struct perf_event *leader,
+                                       u64 read_format, u64 *values)
  {
-       struct perf_event *leader = event->group_leader, *sub;
-       struct perf_event_context *ctx = leader->ctx;
-       int n = 0, size = 0, ret;
-       u64 count, enabled, running;
-       u64 values[5];
+       struct perf_event *sub;
+       int n = 1; /* skip @nr */
+       int ret;
  
-       lockdep_assert_held(&ctx->mutex);
+       ret = perf_event_read(leader, true);
+       if (ret)
+               return ret;
  
-       count = perf_event_read_value(leader, &enabled, &running);
+       /*
+        * Since we co-schedule groups, {enabled,running} times of siblings
+        * will be identical to those of the leader, so we only publish one
+        * set.
+        */
+       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+               values[n++] += leader->total_time_enabled +
+                       atomic64_read(&leader->child_total_time_enabled);
+       }
  
-       values[n++] = 1 + leader->nr_siblings;
-       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-               values[n++] = enabled;
-       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-               values[n++] = running;
-       values[n++] = count;
+       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+               values[n++] += leader->total_time_running +
+                       atomic64_read(&leader->child_total_time_running);
+       }
+
+       /*
+        * Write {count,id} tuples for the leader and every sibling.
+        */
+       values[n++] += perf_event_count(leader);
         if (read_format & PERF_FORMAT_ID)
                 values[n++] = primary_event_id(leader);
  
-       size = n * sizeof(u64);
+       list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+               values[n++] += perf_event_count(sub);
+               if (read_format & PERF_FORMAT_ID)
+                       values[n++] = primary_event_id(sub);
+       }
  
-       if (copy_to_user(buf, values, size))
-               return -EFAULT;
+       return 0;
+}
  
-       ret = size;
+static int perf_read_group(struct perf_event *event,
+                                  u64 read_format, char __user *buf)
+{
+       struct perf_event *leader = event->group_leader, *child;
+       struct perf_event_context *ctx = leader->ctx;
+       int ret;
+       u64 *values;
  
-       list_for_each_entry(sub, &leader->sibling_list, group_entry) {
-               n = 0;
+       lockdep_assert_held(&ctx->mutex);
  
-               values[n++] = perf_event_read_value(sub, &enabled, &running);
-               if (read_format & PERF_FORMAT_ID)
-                       values[n++] = primary_event_id(sub);
+       values = kzalloc(event->read_size, GFP_KERNEL);
+       if (!values)
+               return -ENOMEM;
  
-               size = n * sizeof(u64);
+       values[0] = 1 + leader->nr_siblings;
  
-               if (copy_to_user(buf + ret, values, size)) {
-                       return -EFAULT;
-               }
+       /*
+        * By locking the child_mutex of the leader we effectively
+        * lock the child list of all siblings.. XXX explain how.
+        */
+       mutex_lock(&leader->child_mutex);
+
+       ret = __perf_read_group_add(leader, read_format, values);
+       if (ret)
+               goto unlock;
  
-               ret += size;
+       list_for_each_entry(child, &leader->child_list, child_list) {
+               ret = __perf_read_group_add(child, read_format, values);
+               if (ret)
+                       goto unlock;
         }
  
+       mutex_unlock(&leader->child_mutex);
+
+       ret = event->read_size;
+       if (copy_to_user(buf, values, event->read_size))
+               ret = -EFAULT;
+       goto out;
+
+unlock:
+       mutex_unlock(&leader->child_mutex);
+out:
+       kfree(values);
         return ret;
  }
  
-static int perf_event_read_one(struct perf_event *event,
+static int perf_read_one(struct perf_event *event,
                                  u64 read_format, char __user *buf)
  {
         u64 enabled, running;
@@ -3921,7 +4035,7 @@ static bool is_event_hup(struct perf_event *event)
   * Read the performance event - simple non blocking version for now
   */
  static ssize_t
-perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
+__perf_read(struct perf_event *event, char __user *buf, size_t count)
  {
         u64 read_format = event->attr.read_format;
         int ret;
@@ -3939,9 +4053,9 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
  
         WARN_ON_ONCE(event->ctx->parent_ctx);
         if (read_format & PERF_FORMAT_GROUP)
-               ret = perf_event_read_group(event, read_format, buf);
+               ret = perf_read_group(event, read_format, buf);
         else
-               ret = perf_event_read_one(event, read_format, buf);
+               ret = perf_read_one(event, read_format, buf);
  
         return ret;
  }
@@ -3954,7 +4068,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
         int ret;
  
         ctx = perf_event_ctx_lock(event);
-       ret = perf_read_hw(event, buf, count);
+       ret = __perf_read(event, buf, count);
         perf_event_ctx_unlock(event, ctx);
  
         return ret;
@@ -3985,7 +4099,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
  
  static void _perf_event_reset(struct perf_event *event)
  {
-       (void)perf_event_read(event);
+       (void)perf_event_read(event, false);
         local64_set(&event->count, 0);
         perf_event_update_userpage(event);
  }
@@ -5261,9 +5375,15 @@ void perf_output_sample(struct perf_output_handle *handle,
  
         if (sample_type & PERF_SAMPLE_RAW) {
                 if (data->raw) {
-                       perf_output_put(handle, data->raw->size);
-                       __output_copy(handle, data->raw->data,
-                                          data->raw->size);
+                       u32 raw_size = data->raw->size;
+                       u32 real_size = round_up(raw_size + sizeof(u32),
+                                                sizeof(u64)) - sizeof(u32);
+                       u64 zero = 0;
+
+                       perf_output_put(handle, real_size);
+                       __output_copy(handle, data->raw->data, raw_size);
+                       if (real_size - raw_size)
+                               __output_copy(handle, &zero, real_size - raw_size);
                 } else {
                         struct {
                                 u32     size;
@@ -5395,8 +5515,7 @@ void perf_prepare_sample(struct perf_event_header *header,
                 else
                         size += sizeof(u32);
  
-               WARN_ON_ONCE(size & (sizeof(u64)-1));
-               header->size += size;
+               header->size += round_up(size, sizeof(u64));
         }
  
         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@ -7267,24 +7386,49 @@ static void perf_pmu_nop_void(struct pmu *pmu)
  {
  }
  
+static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
+{      /* no-op ->start_txn(): @pmu and @flags are ignored */
+}
+
  static int perf_pmu_nop_int(struct pmu *pmu)
  {
         return 0;
  }
  
-static void perf_pmu_start_txn(struct pmu *pmu)
+static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
+
+static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
  {
+       __this_cpu_write(nop_txn_flags, flags);
+
+       if (flags & ~PERF_PMU_TXN_ADD)
+               return;
+
         perf_pmu_disable(pmu);
  }
  
  static int perf_pmu_commit_txn(struct pmu *pmu)
  {
+       unsigned int flags = __this_cpu_read(nop_txn_flags);
+
+       __this_cpu_write(nop_txn_flags, 0);
+
+       if (flags & ~PERF_PMU_TXN_ADD)
+               return 0;
+
         perf_pmu_enable(pmu);
         return 0;
  }
  
  static void perf_pmu_cancel_txn(struct pmu *pmu)
  {
+       unsigned int flags =  __this_cpu_read(nop_txn_flags);
+
+       __this_cpu_write(nop_txn_flags, 0);
+
+       if (flags & ~PERF_PMU_TXN_ADD)
+               return;
+
         perf_pmu_enable(pmu);
  }
  
@@ -7523,7 +7667,7 @@ got_cpu_context:
                         pmu->commit_txn = perf_pmu_commit_txn;
                         pmu->cancel_txn = perf_pmu_cancel_txn;
                 } else {
-                       pmu->start_txn  = perf_pmu_nop_void;
+                       pmu->start_txn  = perf_pmu_nop_txn;
                         pmu->commit_txn = perf_pmu_nop_int;
                         pmu->cancel_txn = perf_pmu_nop_void;
                 }
@@ -7611,7 +7755,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
         return ret;
  }
  
-struct pmu *perf_init_event(struct perf_event *event)
+static struct pmu *perf_init_event(struct perf_event *event)
  {
         struct pmu *pmu = NULL;
         int idx;
@@ -8297,13 +8441,35 @@ SYSCALL_DEFINE5(perf_event_open,
  
         if (move_group) {
                 gctx = group_leader->ctx;
+               mutex_lock_double(&gctx->mutex, &ctx->mutex);
+       } else {
+               mutex_lock(&ctx->mutex);
+       }
  
+       if (!perf_event_validate_size(event)) {
+               err = -E2BIG;
+               goto err_locked;
+       }
+
+       /*
+        * Must be under the same ctx::mutex as perf_install_in_context(),
+        * because we need to serialize with concurrent event creation.
+        */
+       if (!exclusive_event_installable(event, ctx)) {
+               /* exclusive and group stuff are assumed mutually exclusive */
+               WARN_ON_ONCE(move_group);
+
+               err = -EBUSY;
+               goto err_locked;
+       }
+
+       WARN_ON_ONCE(ctx->parent_ctx);
+
+       if (move_group) {
                 /*
                  * See perf_event_ctx_lock() for comments on the details
                  * of swizzling perf_event::ctx.
                  */
-               mutex_lock_double(&gctx->mutex, &ctx->mutex);
-
                 perf_remove_from_context(group_leader, false);
  
                 list_for_each_entry(sibling, &group_leader->sibling_list,
@@ -8311,13 +8477,7 @@ SYSCALL_DEFINE5(perf_event_open,
                         perf_remove_from_context(sibling, false);
                         put_ctx(gctx);
                 }
-       } else {
-               mutex_lock(&ctx->mutex);
-       }
  
-       WARN_ON_ONCE(ctx->parent_ctx);
-
-       if (move_group) {
                 /*
                  * Wait for everybody to stop referencing the events through
                  * the old lists, before installing it on new lists.
@@ -8349,22 +8509,29 @@ SYSCALL_DEFINE5(perf_event_open,
                 perf_event__state_init(group_leader);
                 perf_install_in_context(ctx, group_leader, group_leader->cpu);
                 get_ctx(ctx);
-       }
  
-       if (!exclusive_event_installable(event, ctx)) {
-               err = -EBUSY;
-               mutex_unlock(&ctx->mutex);
-               fput(event_file);
-               goto err_context;
+               /*
+                * Now that all events are installed in @ctx, nothing
+                * references @gctx anymore, so drop the last reference we have
+                * on it.
+                */
+               put_ctx(gctx);
         }
  
+       /*
+        * Precalculate sample_data sizes; do this while holding ctx::mutex so
+        * that we're serialized against further additions, and before
+        * perf_install_in_context(), which is the point at which the event
+        * becomes active and can use these values.
+        */
+       perf_event__header_size(event);
+       perf_event__id_header_size(event);
+
         perf_install_in_context(ctx, event, event->cpu);
         perf_unpin_context(ctx);
  
-       if (move_group) {
+       if (move_group)
                 mutex_unlock(&gctx->mutex);
-               put_ctx(gctx);
-       }
         mutex_unlock(&ctx->mutex);
  
         put_online_cpus();
@@ -8375,12 +8542,6 @@ SYSCALL_DEFINE5(perf_event_open,
         list_add_tail(&event->owner_entry, &current->perf_event_list);
         mutex_unlock(&current->perf_event_mutex);
  
-       /*
-        * Precalculate sample_data sizes
-        */
-       perf_event__header_size(event);
-       perf_event__id_header_size(event);
-
         /*
          * Drop the reference on the group_event after placing the
          * new event on the sibling_list. This ensures destruction
@@ -8391,6 +8552,12 @@ SYSCALL_DEFINE5(perf_event_open,
         fd_install(event_fd, event_file);
         return event_fd;
  
+err_locked:
+       if (move_group)
+               mutex_unlock(&gctx->mutex);
+       mutex_unlock(&ctx->mutex);
+/* err_file: */
+       fput(event_file);
  err_context:
         perf_unpin_context(ctx);
         put_ctx(ctx);