Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bcd214e4b4d630eb3304f6000a220dc881bafc24..aa5973220ad213a960092012bf4493f296dab90b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -817,7 +817,7 @@ static void set_load_weight(struct task_struct *p)
        /*
         * SCHED_IDLE tasks get minimal weight:
         */
-       if (p->policy == SCHED_IDLE) {
+       if (idle_policy(p->policy)) {
                load->weight = scale_load(WEIGHT_IDLEPRIO);
                load->inv_weight = WMULT_IDLEPRIO;
                return;
@@ -827,17 +827,19 @@ static void set_load_weight(struct task_struct *p)
        load->inv_weight = prio_to_wmult[prio];
 }
 
-static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
        update_rq_clock(rq);
-       sched_info_queued(rq, p);
+       if (!(flags & ENQUEUE_RESTORE))
+               sched_info_queued(rq, p);
        p->sched_class->enqueue_task(rq, p, flags);
 }
 
-static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
        update_rq_clock(rq);
-       sched_info_dequeued(rq, p);
+       if (!(flags & DEQUEUE_SAVE))
+               sched_info_dequeued(rq, p);
        p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -1178,7 +1180,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
                 * holding rq->lock.
                 */
                lockdep_assert_held(&rq->lock);
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
        }
        if (running)
                put_prev_task(rq, p);
@@ -1188,7 +1190,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued)
-               enqueue_task(rq, p, 0);
+               enqueue_task(rq, p, ENQUEUE_RESTORE);
 }
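
The DEQUEUE_SAVE/ENQUEUE_RESTORE pair marks dequeue/enqueue cycles that merely park a task while one of its attributes is changed, so the sched_info accounting in the enqueue_task()/dequeue_task() hunk above does not count them as a real sleep and wakeup. A minimal sketch of that recurring pattern, built only from helpers visible in this diff; change_task_attribute() is a hypothetical wrapper, not part of the patch:

/* Hypothetical illustration only -- not part of this patch. */
static void change_task_attribute(struct rq *rq, struct task_struct *p)
{
	bool queued = task_on_rq_queued(p);
	bool running = task_current(rq, p);

	if (queued)
		dequeue_task(rq, p, DEQUEUE_SAVE);	/* not a real sleep */
	if (running)
		put_prev_task(rq, p);

	/* ... modify priority / policy / affinity / cgroup here ... */

	if (running)
		p->sched_class->set_curr_task(rq);
	if (queued)
		enqueue_task(rq, p, ENQUEUE_RESTORE);	/* not a real wakeup */
}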
 
 /*
@@ -1292,7 +1294,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
        if (task_cpu(p) != new_cpu) {
                if (p->sched_class->migrate_task_rq)
-                       p->sched_class->migrate_task_rq(p, new_cpu);
+                       p->sched_class->migrate_task_rq(p);
                p->se.nr_migrations++;
                perf_event_task_migrate(p);
        }
@@ -1333,12 +1335,16 @@ static int migrate_swap_stop(void *data)
        struct rq *src_rq, *dst_rq;
        int ret = -EAGAIN;
 
+       if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
+               return -EAGAIN;
+
        src_rq = cpu_rq(arg->src_cpu);
        dst_rq = cpu_rq(arg->dst_cpu);
 
        double_raw_lock(&arg->src_task->pi_lock,
                        &arg->dst_task->pi_lock);
        double_rq_lock(src_rq, dst_rq);
+
        if (task_cpu(arg->dst_task) != arg->dst_cpu)
                goto unlock;
 
@@ -1574,13 +1580,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
                        goto out;
                }
 
+               /* No more Mr. Nice Guy. */
                switch (state) {
                case cpuset:
-                       /* No more Mr. Nice Guy. */
-                       cpuset_cpus_allowed_fallback(p);
-                       state = possible;
-                       break;
-
+                       if (IS_ENABLED(CONFIG_CPUSETS)) {
+                               cpuset_cpus_allowed_fallback(p);
+                               state = possible;
+                               break;
+                       }
+                       /* fall-through */
                case possible:
                        do_set_cpus_allowed(p, cpu_possible_mask);
                        state = fail;
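
The cpuset case above uses IS_ENABLED() instead of an #ifdef: the macro expands to a compile-time 0 or 1, so with CONFIG_CPUSETS=n the guarded block is dead code the compiler drops and control falls through to the possible case; a declaration or stub of the called function must still be visible so the call compiles. A rough sketch of the idiom with a hypothetical option and helper (names are illustrative, not from this patch):

#include <linux/kconfig.h>

#ifdef CONFIG_FOO
void foo_fallback(void);
#else
static inline void foo_fallback(void) { }	/* stub keeps the call compilable when =n */
#endif

static int advance_state(int state)
{
	switch (state) {
	case 0:
		if (IS_ENABLED(CONFIG_FOO)) {	/* expands to a constant 1 or 0 */
			foo_fallback();
			return 1;
		}
		/* fall-through when CONFIG_FOO=n */
	case 1:
		return 2;
	default:
		return state;
	}
}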
@@ -1692,7 +1700,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 #endif /* CONFIG_SCHEDSTATS */
 }
 
-static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 {
        activate_task(rq, p, en_flags);
        p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2114,23 +2122,17 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 #endif /* CONFIG_NUMA_BALANCING */
 }
 
+DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
+
 #ifdef CONFIG_NUMA_BALANCING
-#ifdef CONFIG_SCHED_DEBUG
+
 void set_numabalancing_state(bool enabled)
 {
        if (enabled)
-               sched_feat_set("NUMA");
+               static_branch_enable(&sched_numa_balancing);
        else
-               sched_feat_set("NO_NUMA");
+               static_branch_disable(&sched_numa_balancing);
 }
-#else
-__read_mostly bool numabalancing_enabled;
-
-void set_numabalancing_state(bool enabled)
-{
-       numabalancing_enabled = enabled;
-}
-#endif /* CONFIG_SCHED_DEBUG */
 
 #ifdef CONFIG_PROC_SYSCTL
 int sysctl_numa_balancing(struct ctl_table *table, int write,
@@ -2138,7 +2140,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
 {
        struct ctl_table t;
        int err;
-       int state = numabalancing_enabled;
+       int state = static_branch_likely(&sched_numa_balancing);
 
        if (write && !capable(CAP_SYS_ADMIN))
                return -EPERM;
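
sched_numa_balancing is now a static key: DEFINE_STATIC_KEY_FALSE() starts it disabled, static_branch_likely()/static_branch_unlikely() compile the test down to a patchable jump in the fast path, and static_branch_enable()/static_branch_disable() flip it from slow-path code such as the sysctl handler above. A minimal sketch of the pattern with an illustrative key and a hypothetical hook, assuming only the jump-label API used in these hunks:

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(my_feature_key);	/* illustrative key, starts disabled */

static void do_extra_work(void) { }		/* hypothetical hook */

static void hot_path(void)
{
	/* A nop until the key is enabled; then patched to a jump. */
	if (static_branch_unlikely(&my_feature_key))
		do_extra_work();
}

static void set_feature_state(bool enabled)
{
	if (enabled)
		static_branch_enable(&my_feature_key);
	else
		static_branch_disable(&my_feature_key);
}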
@@ -2349,6 +2351,8 @@ void wake_up_new_task(struct task_struct *p)
        struct rq *rq;
 
        raw_spin_lock_irqsave(&p->pi_lock, flags);
+       /* Initialize new task's runnable average */
+       init_entity_runnable_average(&p->se);
 #ifdef CONFIG_SMP
        /*
         * Fork balancing, do it here and not earlier because:
@@ -2358,8 +2362,6 @@ void wake_up_new_task(struct task_struct *p)
        set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
 
-       /* Initialize new task's runnable average */
-       init_entity_runnable_average(&p->se);
        rq = __task_rq_lock(p);
        activate_task(rq, p, 0);
        p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2483,7 +2485,6 @@ static inline void
 prepare_task_switch(struct rq *rq, struct task_struct *prev,
                    struct task_struct *next)
 {
-       trace_sched_switch(prev, next);
        sched_info_switch(rq, prev, next);
        perf_event_task_sched_out(prev, next);
        fire_sched_out_preempt_notifiers(prev, next);
@@ -2517,6 +2518,22 @@ static struct rq *finish_task_switch(struct task_struct *prev)
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
 
+       /*
+        * The previous task will have left us with a preempt_count of 2
+        * because it left us after:
+        *
+        *      schedule()
+        *        preempt_disable();                    // 1
+        *        __schedule()
+        *          raw_spin_lock_irq(&rq->lock)        // 2
+        *
+        * Also, see FORK_PREEMPT_COUNT.
+        */
+       if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
+                     "corrupted preempt_count: %s/%d/0x%x\n",
+                     current->comm, current->pid, preempt_count()))
+               preempt_count_set(FORK_PREEMPT_COUNT);
+
        rq->prev_mm = NULL;
 
        /*
@@ -2601,8 +2618,15 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 {
        struct rq *rq;
 
-       /* finish_task_switch() drops rq->lock and enables preemtion */
-       preempt_disable();
+       /*
+        * New tasks start with FORK_PREEMPT_COUNT, see there and
+        * finish_task_switch() for details.
+        *
+        * finish_task_switch() will drop rq->lock() and lower preempt_count
+        * and the preempt_enable() will end up enabling preemption (on
+        * PREEMPT_COUNT kernels).
+        */
+
        rq = finish_task_switch(prev);
        balance_callback(rq);
        preempt_enable();
@@ -2960,15 +2984,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
 static inline void schedule_debug(struct task_struct *prev)
 {
 #ifdef CONFIG_SCHED_STACK_END_CHECK
-       BUG_ON(unlikely(task_stack_end_corrupted(prev)));
+       BUG_ON(task_stack_end_corrupted(prev));
 #endif
-       /*
-        * Test if we are atomic. Since do_exit() needs to call into
-        * schedule() atomically, we ignore that path. Otherwise whine
-        * if we are scheduling when we should not.
-        */
-       if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
+
+       if (unlikely(in_atomic_preempt_off())) {
                __schedule_bug(prev);
+               preempt_count_set(PREEMPT_DISABLED);
+       }
        rcu_sleep_check();
 
        profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -3054,7 +3076,7 @@ again:
  *
  * WARNING: must be called with preemption disabled!
  */
-static void __sched __schedule(void)
+static void __sched notrace __schedule(bool preempt)
 {
        struct task_struct *prev, *next;
        unsigned long *switch_count;
@@ -3066,6 +3088,17 @@ static void __sched __schedule(void)
        rcu_note_context_switch();
        prev = rq->curr;
 
+       /*
+        * do_exit() calls schedule() with preemption disabled as an exception;
+        * however we must fix that up, otherwise the next task will see an
+        * inconsistent (higher) preempt count.
+        *
+        * It also avoids the below schedule_debug() test from complaining
+        * about this.
+        */
+       if (unlikely(prev->state == TASK_DEAD))
+               preempt_enable_no_resched_notrace();
+
        schedule_debug(prev);
 
        if (sched_feat(HRTICK))
@@ -3083,7 +3116,7 @@ static void __sched __schedule(void)
        rq->clock_skip_update <<= 1; /* promote REQ to ACT */
 
        switch_count = &prev->nivcsw;
-       if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+       if (!preempt && prev->state) {
                if (unlikely(signal_pending_state(prev->state, prev))) {
                        prev->state = TASK_RUNNING;
                } else {
@@ -3119,6 +3152,7 @@ static void __sched __schedule(void)
                rq->curr = next;
                ++*switch_count;
 
+               trace_sched_switch(preempt, prev, next);
                rq = context_switch(rq, prev, next); /* unlocks the rq */
                cpu = cpu_of(rq);
        } else {
@@ -3148,7 +3182,7 @@ asmlinkage __visible void __sched schedule(void)
        sched_submit_work(tsk);
        do {
                preempt_disable();
-               __schedule();
+               __schedule(false);
                sched_preempt_enable_no_resched();
        } while (need_resched());
 }
@@ -3188,9 +3222,9 @@ void __sched schedule_preempt_disabled(void)
 static void __sched notrace preempt_schedule_common(void)
 {
        do {
-               preempt_active_enter();
-               __schedule();
-               preempt_active_exit();
+               preempt_disable_notrace();
+               __schedule(true);
+               preempt_enable_no_resched_notrace();
 
                /*
                 * Check again in case we missed a preemption opportunity
@@ -3241,24 +3275,17 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
                return;
 
        do {
-               /*
-                * Use raw __prempt_count() ops that don't call function.
-                * We can't call functions before disabling preemption which
-                * disarm preemption tracing recursions.
-                */
-               __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
-               barrier();
+               preempt_disable_notrace();
                /*
                 * Needs preempt disabled in case user_exit() is traced
                 * and the tracer calls preempt_enable_notrace() causing
                 * an infinite recursion.
                 */
                prev_ctx = exception_enter();
-               __schedule();
+               __schedule(true);
                exception_exit(prev_ctx);
 
-               barrier();
-               __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+               preempt_enable_no_resched_notrace();
        } while (need_resched());
 }
 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
@@ -3281,11 +3308,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
        prev_state = exception_enter();
 
        do {
-               preempt_active_enter();
+               preempt_disable();
                local_irq_enable();
-               __schedule();
+               __schedule(true);
                local_irq_disable();
-               preempt_active_exit();
+               sched_preempt_enable_no_resched();
        } while (need_resched());
 
        exception_exit(prev_state);
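
Taken together, the hunks above replace the PREEMPT_ACTIVE bookkeeping with an explicit argument: __schedule(false) is a voluntary call that may deactivate a sleeping prev, while __schedule(true) is a preemption that must keep prev on the runqueue, and each caller now brackets the call with a plain preempt_disable()/enable pair. A condensed sketch of the two call shapes; the wrapper names are illustrative and the bodies mirror schedule() and preempt_schedule_common() above:

/* Illustrative wrappers only -- the real entry points are shown in the hunks above. */
static void voluntary_schedule(void)
{
	do {
		preempt_disable();
		__schedule(false);		/* prev->state is honoured: may block */
		sched_preempt_enable_no_resched();
	} while (need_resched());
}

static void notrace preemption_schedule(void)
{
	do {
		preempt_disable_notrace();
		__schedule(true);		/* prev stays runnable regardless of ->state */
		preempt_enable_no_resched_notrace();
	} while (need_resched());
}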
@@ -3313,7 +3340,7 @@ EXPORT_SYMBOL(default_wake_function);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-       int oldprio, queued, running, enqueue_flag = 0;
+       int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
        struct rq *rq;
        const struct sched_class *prev_class;
 
@@ -3345,7 +3372,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);
 
@@ -3363,7 +3390,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                if (!dl_prio(p->normal_prio) ||
                    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                        p->dl.dl_boosted = 1;
-                       enqueue_flag = ENQUEUE_REPLENISH;
+                       enqueue_flag |= ENQUEUE_REPLENISH;
                } else
                        p->dl.dl_boosted = 0;
                p->sched_class = &dl_sched_class;
@@ -3371,7 +3398,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                if (dl_prio(oldprio))
                        p->dl.dl_boosted = 0;
                if (oldprio < prio)
-                       enqueue_flag = ENQUEUE_HEAD;
+                       enqueue_flag |= ENQUEUE_HEAD;
                p->sched_class = &rt_sched_class;
        } else {
                if (dl_prio(oldprio))
@@ -3423,7 +3450,7 @@ void set_user_nice(struct task_struct *p, long nice)
        }
        queued = task_on_rq_queued(p);
        if (queued)
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
 
        p->static_prio = NICE_TO_PRIO(nice);
        set_load_weight(p);
@@ -3432,7 +3459,7 @@ void set_user_nice(struct task_struct *p, long nice)
        delta = p->prio - old_prio;
 
        if (queued) {
-               enqueue_task(rq, p, 0);
+               enqueue_task(rq, p, ENQUEUE_RESTORE);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@ -3753,10 +3780,7 @@ recheck:
        } else {
                reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
 
-               if (policy != SCHED_DEADLINE &&
-                               policy != SCHED_FIFO && policy != SCHED_RR &&
-                               policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-                               policy != SCHED_IDLE)
+               if (!valid_policy(policy))
                        return -EINVAL;
        }
 
@@ -3812,7 +3836,7 @@ recheck:
                 * Treat SCHED_IDLE as nice 20. Only allow a switch to
                 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
                 */
-               if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+               if (idle_policy(p->policy) && !idle_policy(policy)) {
                        if (!can_nice(p, task_nice(p)))
                                return -EPERM;
                }
@@ -3937,7 +3961,7 @@ change:
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);
 
@@ -3947,11 +3971,15 @@ change:
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued) {
+               int enqueue_flags = ENQUEUE_RESTORE;
                /*
                 * We enqueue to tail when the priority of a task is
                 * increased (user space view).
                 */
-               enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+               if (oldprio <= p->prio)
+                       enqueue_flags |= ENQUEUE_HEAD;
+
+               enqueue_task(rq, p, enqueue_flags);
        }
 
        check_class_changed(rq, p, prev_class, oldprio);
@@ -4029,6 +4057,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
 {
        return _sched_setscheduler(p, policy, param, false);
 }
+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
 
 static int
 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
@@ -5100,7 +5129,7 @@ void sched_setnuma(struct task_struct *p, int nid)
        running = task_current(rq, p);
 
        if (queued)
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);
 
@@ -5109,7 +5138,7 @@ void sched_setnuma(struct task_struct *p, int nid)
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued)
-               enqueue_task(rq, p, 0);
+               enqueue_task(rq, p, ENQUEUE_RESTORE);
        task_rq_unlock(rq, p, &flags);
 }
 #endif /* CONFIG_NUMA_BALANCING */
@@ -5530,21 +5559,27 @@ static void set_cpu_rq_start_time(void)
 static int sched_cpu_active(struct notifier_block *nfb,
                                      unsigned long action, void *hcpu)
 {
+       int cpu = (long)hcpu;
+
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_STARTING:
                set_cpu_rq_start_time();
                return NOTIFY_OK;
+
        case CPU_ONLINE:
                /*
                 * At this point a starting CPU has marked itself as online via
                 * set_cpu_online(). But it might not yet have marked itself
                 * as active, which is essential from here on.
-                *
-                * Thus, fall-through and help the starting CPU along.
                 */
+               set_cpu_active(cpu, true);
+               stop_machine_unpark(cpu);
+               return NOTIFY_OK;
+
        case CPU_DOWN_FAILED:
-               set_cpu_active((long)hcpu, true);
+               set_cpu_active(cpu, true);
                return NOTIFY_OK;
+
        default:
                return NOTIFY_DONE;
        }
@@ -6476,7 +6511,8 @@ static struct sched_domain_topology_level default_topology[] = {
        { NULL, },
 };
 
-struct sched_domain_topology_level *sched_domain_topology = default_topology;
+static struct sched_domain_topology_level *sched_domain_topology =
+       default_topology;
 
 #define for_each_sd_topology(tl)                       \
        for (tl = sched_domain_topology; tl->mask; tl++)
@@ -7477,7 +7513,7 @@ void __init sched_init(void)
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
-       int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+       int nested = preempt_count() + rcu_preempt_depth();
 
        return (nested == preempt_offset);
 }
@@ -7724,7 +7760,7 @@ void sched_move_task(struct task_struct *tsk)
        queued = task_on_rq_queued(tsk);
 
        if (queued)
-               dequeue_task(rq, tsk, 0);
+               dequeue_task(rq, tsk, DEQUEUE_SAVE);
        if (unlikely(running))
                put_prev_task(rq, tsk);
 
@@ -7740,7 +7776,7 @@ void sched_move_task(struct task_struct *tsk)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
        if (tsk->sched_class->task_move_group)
-               tsk->sched_class->task_move_group(tsk, queued);
+               tsk->sched_class->task_move_group(tsk);
        else
 #endif
                set_task_rq(tsk, task_cpu(tsk));
@@ -7748,7 +7784,7 @@ void sched_move_task(struct task_struct *tsk)
        if (unlikely(running))
                tsk->sched_class->set_curr_task(rq);
        if (queued)
-               enqueue_task(rq, tsk, 0);
+               enqueue_task(rq, tsk, ENQUEUE_RESTORE);
 
        task_rq_unlock(rq, tsk, &flags);
 }
@@ -8212,14 +8248,6 @@ static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
                            struct cgroup_subsys_state *old_css,
                            struct task_struct *task)
 {
-       /*
-        * cgroup_exit() is called in the copy_process() failure path.
-        * Ignore this case since the task hasn't ran yet, this avoids
-        * trying to poke a half freed task state from generic code.
-        */
-       if (!(task->flags & PF_EXITING))
-               return;
-
        sched_move_task(task);
 }