Merge commit 'v2.6.29-rc4' into sched/core

[karo-tx-linux.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 8ee437a5ec1d5186bc411b146a513280533ff998..fcc3483e99556bc136bd92303556389642d44a8c 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -467,11 +467,17 @@ struct rt_rq {
         struct rt_prio_array active;
         unsigned long rt_nr_running;
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-       int highest_prio; /* highest queued rt task prio */
+       struct {
+               int curr; /* highest queued rt task prio */
+#ifdef CONFIG_SMP
+               int next; /* next highest */
+#endif
+       } highest_prio;
  #endif
  #ifdef CONFIG_SMP
         unsigned long rt_nr_migratory;
         int overloaded;
+       struct plist_head pushable_tasks;
  #endif
         int rt_throttled;
         u64 rt_time;
@@ -1610,21 +1616,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
  
  #endif
  
+#ifdef CONFIG_PREEMPT
+
  /*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ * way at the expense of forcing extra atomic operations in all
+ * invocations.  This assures that the double_lock is acquired using the
+ * same underlying policy as the spinlock_t on this architecture, which
+ * reduces latency compared to the unfair variant below.  However, it
+ * also adds more overhead and therefore may reduce throughput.
   */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(this_rq->lock)
+       __acquires(busiest->lock)
+       __acquires(this_rq->lock)
+{
+       spin_unlock(&this_rq->lock);
+       double_rq_lock(this_rq, busiest);
+
+       return 1;
+}
+
+#else
+/*
+ * Unfair double_lock_balance: Optimizes throughput at the expense of
+ * latency by eliminating extra atomic operations when the locks are
+ * already in proper order on entry.  This favors lower cpu-ids and will
+ * grant the double lock to lower cpus over higher ids under contention,
+ * regardless of entry order into the function.
+ */
+static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
         __releases(this_rq->lock)
         __acquires(busiest->lock)
         __acquires(this_rq->lock)
  {
         int ret = 0;
  
-       if (unlikely(!irqs_disabled())) {
-               /* printk() doesn't work good under rq->lock */
-               spin_unlock(&this_rq->lock);
-               BUG_ON(1);
-       }
         if (unlikely(!spin_trylock(&busiest->lock))) {
                 if (busiest < this_rq) {
                         spin_unlock(&this_rq->lock);
@@ -1637,6 +1664,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
         return ret;
  }
  
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+{
+       if (unlikely(!irqs_disabled())) {
+               /* printk() doesn't work good under rq->lock */
+               spin_unlock(&this_rq->lock);
+               BUG_ON(1);
+       }
+
+       return _double_lock_balance(this_rq, busiest);
+}
+
  static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
         __releases(busiest->lock)
  {
@@ -1705,6 +1748,9 @@ static void update_avg(u64 *avg, u64 sample)
  
  static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
  {
+       if (wakeup)
+               p->se.start_runtime = p->se.sum_exec_runtime;
+
         sched_info_queued(p);
         p->sched_class->enqueue_task(rq, p, wakeup);
         p->se.on_rq = 1;
@@ -1712,10 +1758,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
  
  static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
  {
-       if (sleep && p->se.last_wakeup) {
-               update_avg(&p->se.avg_overlap,
-                          p->se.sum_exec_runtime - p->se.last_wakeup);
-               p->se.last_wakeup = 0;
+       if (sleep) {
+               if (p->se.last_wakeup) {
+                       update_avg(&p->se.avg_overlap,
+                               p->se.sum_exec_runtime - p->se.last_wakeup);
+                       p->se.last_wakeup = 0;
+               } else {
+                       update_avg(&p->se.avg_wakeup,
+                               sysctl_sched_wakeup_granularity);
+               }
         }
  
         sched_info_dequeued(p);
@@ -2355,6 +2406,22 @@ out_activate:
         activate_task(rq, p, 1);
         success = 1;
  
+       /*
+        * Only attribute actual wakeups done by this task.
+        */
+       if (!in_interrupt()) {
+               struct sched_entity *se = &current->se;
+               u64 sample = se->sum_exec_runtime;
+
+               if (se->last_wakeup)
+                       sample -= se->last_wakeup;
+               else
+                       sample -= se->start_runtime;
+               update_avg(&se->avg_wakeup, sample);
+
+               se->last_wakeup = se->sum_exec_runtime;
+       }
+
  out_running:
         trace_sched_wakeup(rq, p, success);
         check_preempt_curr(rq, p, sync);
@@ -2365,8 +2432,6 @@ out_running:
                 p->sched_class->task_wake_up(rq, p);
  #endif
  out:
-       current->se.last_wakeup = current->se.sum_exec_runtime;
-
         task_rq_unlock(rq, &flags);
  
         return success;
@@ -2396,6 +2461,8 @@ static void __sched_fork(struct task_struct *p)
         p->se.prev_sum_exec_runtime     = 0;
         p->se.last_wakeup               = 0;
         p->se.avg_overlap               = 0;
+       p->se.start_runtime             = 0;
+       p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
  
  #ifdef CONFIG_SCHEDSTATS
         p->se.wait_start                = 0;
@@ -2458,6 +2525,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
         /* Want to start with kernel preemption disabled. */
         task_thread_info(p)->preempt_count = 1;
  #endif
+       plist_node_init(&p->pushable_tasks, MAX_PRIO);
+
         put_cpu();
  }
  
@@ -2598,6 +2667,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
  {
         struct mm_struct *mm = rq->prev_mm;
         long prev_state;
+#ifdef CONFIG_SMP
+       int post_schedule = 0;
+
+       if (current->sched_class->needs_post_schedule)
+               post_schedule = current->sched_class->needs_post_schedule(rq);
+#endif
  
         rq->prev_mm = NULL;
  
@@ -2616,7 +2691,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
         finish_arch_switch(prev);
         finish_lock_switch(rq, prev);
  #ifdef CONFIG_SMP
-       if (current->sched_class->post_schedule)
+       if (post_schedule)
                 current->sched_class->post_schedule(rq);
  #endif
  
@@ -2997,6 +3072,16 @@ next:
         pulled++;
         rem_load_move -= p->se.load.weight;
  
+#ifdef CONFIG_PREEMPT
+       /*
+        * NEWIDLE balancing is a source of latency, so preemptible kernels
+        * will stop after the first task is pulled to minimize the critical
+        * section.
+        */
+       if (idle == CPU_NEWLY_IDLE)
+               goto out;
+#endif
+
         /*
          * We only want to steal up to the prescribed amount of weighted load.
          */
@@ -3043,9 +3128,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
                                 sd, idle, all_pinned, &this_best_prio);
                 class = class->next;
  
+#ifdef CONFIG_PREEMPT
+               /*
+                * NEWIDLE balancing is a source of latency, so preemptible
+                * kernels will stop after the first task is pulled to minimize
+                * the critical section.
+                */
                 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
                         break;
-
+#endif
         } while (class && max_load_move > total_load_moved);
  
         return total_load_moved > 0;
@@ -3890,19 +3981,24 @@ int select_nohz_load_balancer(int stop_tick)
         int cpu = smp_processor_id();
  
         if (stop_tick) {
-               cpumask_set_cpu(cpu, nohz.cpu_mask);
                 cpu_rq(cpu)->in_nohz_recently = 1;
  
-               /*
-                * If we are going offline and still the leader, give up!
-                */
-               if (!cpu_active(cpu) &&
-                   atomic_read(&nohz.load_balancer) == cpu) {
+               if (!cpu_active(cpu)) {
+                       if (atomic_read(&nohz.load_balancer) != cpu)
+                               return 0;
+
+                       /*
+                        * If we are going offline and still the leader,
+                        * give up!
+                        */
                         if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
                                 BUG();
+
                         return 0;
                 }
  
+               cpumask_set_cpu(cpu, nohz.cpu_mask);
+
                 /* time for ilb owner also to sleep */
                 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                         if (atomic_read(&nohz.load_balancer) == cpu)
@@ -8214,11 +8310,15 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
         __set_bit(MAX_RT_PRIO, array->bitmap);
  
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-       rt_rq->highest_prio = MAX_RT_PRIO;
+       rt_rq->highest_prio.curr = MAX_RT_PRIO;
+#ifdef CONFIG_SMP
+       rt_rq->highest_prio.next = MAX_RT_PRIO;
+#endif
  #endif
  #ifdef CONFIG_SMP
         rt_rq->rt_nr_migratory = 0;
         rt_rq->overloaded = 0;
+       plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
  #endif
  
         rt_rq->rt_time = 0;