[PATCH] export cpu_online_map

[karo-tx-linux.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 103f705b245c751e41541a94f807c36d676806b9..1e5cafdf4e27619d0320f03dd51b324850b7b077 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -294,6 +294,10 @@ static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
  
  static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
  {
+#ifdef CONFIG_DEBUG_SPINLOCK
+       /* this is a valid case when another task releases the spinlock */
+       rq->lock.owner = current;
+#endif
         spin_unlock_irq(&rq->lock);
  }
  
@@ -1529,10 +1533,6 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
          *              Manfred Spraul <manfred@colorfullife.com>
          */
         prev_task_flags = prev->flags;
-#ifdef CONFIG_DEBUG_SPINLOCK
-       /* this is a valid case when another task releases the spinlock */
-       rq->lock.owner = current;
-#endif
         finish_arch_switch(prev);
         finish_lock_switch(rq, prev);
         if (mm)
@@ -1910,6 +1910,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
  {
         struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
         unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+       unsigned long max_pull;
         int load_idx;
  
         max_load = this_load = total_load = total_pwr = 0;
@@ -1959,7 +1960,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                 group = group->next;
         } while (group != sd->groups);
  
-       if (!busiest || this_load >= max_load)
+       if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
                 goto out_balanced;
  
         avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
@@ -1979,8 +1980,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
          * by pulling tasks to us.  Be careful of negative numbers as they'll
          * appear as very large values with unsigned longs.
          */
+
+       /* Don't want to pull so many tasks that a group would go idle */
+       max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
+
         /* How much load to actually move to equalise the imbalance */
-       *imbalance = min((max_load - avg_load) * busiest->cpu_power,
+       *imbalance = min(max_pull * busiest->cpu_power,
                                 (avg_load - this_load) * this->cpu_power)
                         / SCHED_LOAD_SCALE;
  
@@ -2125,6 +2130,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
  
                         spin_lock(&busiest->lock);
+
+                       /* don't kick the migration_thread, if the curr
+                        * task on busiest cpu can't be moved to this_cpu
+                        */
+                       if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+                               spin_unlock(&busiest->lock);
+                               all_pinned = 1;
+                               goto out_one_pinned;
+                       }
+
                         if (!busiest->active_balance) {
                                 busiest->active_balance = 1;
                                 busiest->push_cpu = this_cpu;
@@ -2165,6 +2180,8 @@ out_balanced:
         schedstat_inc(sd, lb_balanced[idle]);
  
         sd->nr_balance_failed = 0;
+
+out_one_pinned:
         /* tune up the balancing interval */
         if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
                         (sd->balance_interval < sd->max_interval))
@@ -2357,7 +2374,8 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
  
                 if (j - sd->last_balance >= interval) {
                         if (load_balance(this_cpu, this_rq, sd, idle)) {
-                               /* We've pulled tasks over so either we're no
+                               /*
+                                * We've pulled tasks over so either we're no
                                  * longer idle, or one of our SMT siblings is
                                  * not idle.
                                  */
@@ -3861,6 +3879,7 @@ EXPORT_SYMBOL(cpu_present_map);
  
  #ifndef CONFIG_SMP
  cpumask_t cpu_online_map = CPU_MASK_ALL;
+EXPORT_SYMBOL_GPL(cpu_online_map);
  cpumask_t cpu_possible_map = CPU_MASK_ALL;
  #endif
  
@@ -5584,3 +5603,47 @@ void normalize_rt_tasks(void)
  }
  
  #endif /* CONFIG_MAGIC_SYSRQ */
+
+#ifdef CONFIG_IA64
+/*
+ * These functions are only useful for the IA64 MCA handling.
+ *
+ * They can only be called when the whole system has been
+ * stopped - every CPU needs to be quiescent, and no scheduling
+ * activity can take place. Using them for anything else would
+ * be a serious bug, and as a result, they aren't even visible
+ * under any other configuration.
+ */
+
+/**
+ * curr_task - return the current task for a given cpu.
+ * @cpu: the processor in question.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+task_t *curr_task(int cpu)
+{
+       return cpu_curr(cpu);
+}
+
+/**
+ * set_curr_task - set the current task for a given cpu.
+ * @cpu: the processor in question.
+ * @p: the task pointer to set.
+ *
+ * Description: This function must only be used when non-maskable interrupts
+ * are serviced on a separate stack.  It allows the architecture to switch the
+ * notion of the current task on a cpu in a non-blocking manner.  This function
+ * must be called with all CPUs synchronized, and interrupts disabled, and
+ * the caller must save the original value of the current task (see
+ * curr_task() above) and restore that value before reenabling interrupts and
+ * re-starting the system.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+void set_curr_task(int cpu, task_t *p)
+{
+       cpu_curr(cpu) = p;
+}
+
+#endif