Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[karo-tx-linux.git] / kernel / sched / core.c
index f7402f7..aa59732 100644 (file)
@@ -817,7 +817,7 @@ static void set_load_weight(struct task_struct *p)
        /*
         * SCHED_IDLE tasks get minimal weight:
         */
-       if (p->policy == SCHED_IDLE) {
+       if (idle_policy(p->policy)) {
                load->weight = scale_load(WEIGHT_IDLEPRIO);
                load->inv_weight = WMULT_IDLEPRIO;
                return;
@@ -827,17 +827,19 @@ static void set_load_weight(struct task_struct *p)
        load->inv_weight = prio_to_wmult[prio];
 }
 
-static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
        update_rq_clock(rq);
-       sched_info_queued(rq, p);
+       if (!(flags & ENQUEUE_RESTORE))
+               sched_info_queued(rq, p);
        p->sched_class->enqueue_task(rq, p, flags);
 }
 
-static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
        update_rq_clock(rq);
-       sched_info_dequeued(rq, p);
+       if (!(flags & DEQUEUE_SAVE))
+               sched_info_dequeued(rq, p);
        p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -1178,7 +1180,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
                 * holding rq->lock.
                 */
                lockdep_assert_held(&rq->lock);
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
        }
        if (running)
                put_prev_task(rq, p);
@@ -1188,7 +1190,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued)
-               enqueue_task(rq, p, 0);
+               enqueue_task(rq, p, ENQUEUE_RESTORE);
 }
 
 /*
@@ -1292,7 +1294,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
        if (task_cpu(p) != new_cpu) {
                if (p->sched_class->migrate_task_rq)
-                       p->sched_class->migrate_task_rq(p, new_cpu);
+                       p->sched_class->migrate_task_rq(p);
                p->se.nr_migrations++;
                perf_event_task_migrate(p);
        }
@@ -1333,12 +1335,16 @@ static int migrate_swap_stop(void *data)
        struct rq *src_rq, *dst_rq;
        int ret = -EAGAIN;
 
+       if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
+               return -EAGAIN;
+
        src_rq = cpu_rq(arg->src_cpu);
        dst_rq = cpu_rq(arg->dst_cpu);
 
        double_raw_lock(&arg->src_task->pi_lock,
                        &arg->dst_task->pi_lock);
        double_rq_lock(src_rq, dst_rq);
+
        if (task_cpu(arg->dst_task) != arg->dst_cpu)
                goto unlock;
 
@@ -1574,13 +1580,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
                        goto out;
                }
 
+               /* No more Mr. Nice Guy. */
                switch (state) {
                case cpuset:
-                       /* No more Mr. Nice Guy. */
-                       cpuset_cpus_allowed_fallback(p);
-                       state = possible;
-                       break;
-
+                       if (IS_ENABLED(CONFIG_CPUSETS)) {
+                               cpuset_cpus_allowed_fallback(p);
+                               state = possible;
+                               break;
+                       }
+                       /* fall-through */
                case possible:
                        do_set_cpus_allowed(p, cpu_possible_mask);
                        state = fail;
@@ -1692,7 +1700,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 #endif /* CONFIG_SCHEDSTATS */
 }
 
-static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 {
        activate_task(rq, p, en_flags);
        p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2114,23 +2122,17 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 #endif /* CONFIG_NUMA_BALANCING */
 }
 
+DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
+
 #ifdef CONFIG_NUMA_BALANCING
-#ifdef CONFIG_SCHED_DEBUG
+
 void set_numabalancing_state(bool enabled)
 {
        if (enabled)
-               sched_feat_set("NUMA");
+               static_branch_enable(&sched_numa_balancing);
        else
-               sched_feat_set("NO_NUMA");
+               static_branch_disable(&sched_numa_balancing);
 }
-#else
-__read_mostly bool numabalancing_enabled;
-
-void set_numabalancing_state(bool enabled)
-{
-       numabalancing_enabled = enabled;
-}
-#endif /* CONFIG_SCHED_DEBUG */
 
 #ifdef CONFIG_PROC_SYSCTL
 int sysctl_numa_balancing(struct ctl_table *table, int write,
@@ -2138,7 +2140,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
 {
        struct ctl_table t;
        int err;
-       int state = numabalancing_enabled;
+       int state = static_branch_likely(&sched_numa_balancing);
 
        if (write && !capable(CAP_SYS_ADMIN))
                return -EPERM;
@@ -2349,6 +2351,8 @@ void wake_up_new_task(struct task_struct *p)
        struct rq *rq;
 
        raw_spin_lock_irqsave(&p->pi_lock, flags);
+       /* Initialize new task's runnable average */
+       init_entity_runnable_average(&p->se);
 #ifdef CONFIG_SMP
        /*
         * Fork balancing, do it here and not earlier because:
@@ -2358,8 +2362,6 @@ void wake_up_new_task(struct task_struct *p)
        set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
 
-       /* Initialize new task's runnable average */
-       init_entity_runnable_average(&p->se);
        rq = __task_rq_lock(p);
        activate_task(rq, p, 0);
        p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2483,7 +2485,6 @@ static inline void
 prepare_task_switch(struct rq *rq, struct task_struct *prev,
                    struct task_struct *next)
 {
-       trace_sched_switch(prev, next);
        sched_info_switch(rq, prev, next);
        perf_event_task_sched_out(prev, next);
        fire_sched_out_preempt_notifiers(prev, next);
@@ -2517,6 +2518,22 @@ static struct rq *finish_task_switch(struct task_struct *prev)
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
 
+       /*
+        * The previous task will have left us with a preempt_count of 2
+        * because it left us after:
+        *
+        *      schedule()
+        *        preempt_disable();                    // 1
+        *        __schedule()
+        *          raw_spin_lock_irq(&rq->lock)        // 2
+        *
+        * Also, see FORK_PREEMPT_COUNT.
+        */
+       if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
+                     "corrupted preempt_count: %s/%d/0x%x\n",
+                     current->comm, current->pid, preempt_count()))
+               preempt_count_set(FORK_PREEMPT_COUNT);
+
        rq->prev_mm = NULL;
 
        /*
@@ -2601,8 +2618,15 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 {
        struct rq *rq;
 
-       /* finish_task_switch() drops rq->lock and enables preemtion */
-       preempt_disable();
+       /*
+        * New tasks start with FORK_PREEMPT_COUNT, see there and
+        * finish_task_switch() for details.
+        *
+        * finish_task_switch() will drop rq->lock() and lower preempt_count
+        * and the preempt_enable() will end up enabling preemption (on
+        * PREEMPT_COUNT kernels).
+        */
+
        rq = finish_task_switch(prev);
        balance_callback(rq);
        preempt_enable();
@@ -2960,15 +2984,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
 static inline void schedule_debug(struct task_struct *prev)
 {
 #ifdef CONFIG_SCHED_STACK_END_CHECK
-       BUG_ON(unlikely(task_stack_end_corrupted(prev)));
+       BUG_ON(task_stack_end_corrupted(prev));
 #endif
-       /*
-        * Test if we are atomic. Since do_exit() needs to call into
-        * schedule() atomically, we ignore that path. Otherwise whine
-        * if we are scheduling when we should not.
-        */
-       if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
+
+       if (unlikely(in_atomic_preempt_off())) {
                __schedule_bug(prev);
+               preempt_count_set(PREEMPT_DISABLED);
+       }
        rcu_sleep_check();
 
        profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -3054,7 +3076,7 @@ again:
  *
  * WARNING: must be called with preemption disabled!
  */
-static void __sched __schedule(void)
+static void __sched notrace __schedule(bool preempt)
 {
        struct task_struct *prev, *next;
        unsigned long *switch_count;
@@ -3066,6 +3088,17 @@ static void __sched __schedule(void)
        rcu_note_context_switch();
        prev = rq->curr;
 
+       /*
+        * do_exit() calls schedule() with preemption disabled as an exception;
+        * however we must fix that up, otherwise the next task will see an
+        * inconsistent (higher) preempt count.
+        *
+        * It also avoids the below schedule_debug() test from complaining
+        * about this.
+        */
+       if (unlikely(prev->state == TASK_DEAD))
+               preempt_enable_no_resched_notrace();
+
        schedule_debug(prev);
 
        if (sched_feat(HRTICK))
@@ -3083,7 +3116,7 @@ static void __sched __schedule(void)
        rq->clock_skip_update <<= 1; /* promote REQ to ACT */
 
        switch_count = &prev->nivcsw;
-       if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+       if (!preempt && prev->state) {
                if (unlikely(signal_pending_state(prev->state, prev))) {
                        prev->state = TASK_RUNNING;
                } else {
@@ -3119,6 +3152,7 @@ static void __sched __schedule(void)
                rq->curr = next;
                ++*switch_count;
 
+               trace_sched_switch(preempt, prev, next);
                rq = context_switch(rq, prev, next); /* unlocks the rq */
                cpu = cpu_of(rq);
        } else {
@@ -3148,7 +3182,7 @@ asmlinkage __visible void __sched schedule(void)
        sched_submit_work(tsk);
        do {
                preempt_disable();
-               __schedule();
+               __schedule(false);
                sched_preempt_enable_no_resched();
        } while (need_resched());
 }
@@ -3188,9 +3222,9 @@ void __sched schedule_preempt_disabled(void)
 static void __sched notrace preempt_schedule_common(void)
 {
        do {
-               preempt_active_enter();
-               __schedule();
-               preempt_active_exit();
+               preempt_disable_notrace();
+               __schedule(true);
+               preempt_enable_no_resched_notrace();
 
                /*
                 * Check again in case we missed a preemption opportunity
@@ -3241,24 +3275,17 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
                return;
 
        do {
-               /*
-                * Use raw __prempt_count() ops that don't call function.
-                * We can't call functions before disabling preemption which
-                * disarm preemption tracing recursions.
-                */
-               __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
-               barrier();
+               preempt_disable_notrace();
                /*
                 * Needs preempt disabled in case user_exit() is traced
                 * and the tracer calls preempt_enable_notrace() causing
                 * an infinite recursion.
                 */
                prev_ctx = exception_enter();
-               __schedule();
+               __schedule(true);
                exception_exit(prev_ctx);
 
-               barrier();
-               __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+               preempt_enable_no_resched_notrace();
        } while (need_resched());
 }
 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
@@ -3281,11 +3308,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
        prev_state = exception_enter();
 
        do {
-               preempt_active_enter();
+               preempt_disable();
                local_irq_enable();
-               __schedule();
+               __schedule(true);
                local_irq_disable();
-               preempt_active_exit();
+               sched_preempt_enable_no_resched();
        } while (need_resched());
 
        exception_exit(prev_state);
@@ -3313,7 +3340,7 @@ EXPORT_SYMBOL(default_wake_function);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-       int oldprio, queued, running, enqueue_flag = 0;
+       int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
        struct rq *rq;
        const struct sched_class *prev_class;
 
@@ -3345,7 +3372,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);
 
@@ -3363,7 +3390,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                if (!dl_prio(p->normal_prio) ||
                    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                        p->dl.dl_boosted = 1;
-                       enqueue_flag = ENQUEUE_REPLENISH;
+                       enqueue_flag |= ENQUEUE_REPLENISH;
                } else
                        p->dl.dl_boosted = 0;
                p->sched_class = &dl_sched_class;
@@ -3371,7 +3398,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                if (dl_prio(oldprio))
                        p->dl.dl_boosted = 0;
                if (oldprio < prio)
-                       enqueue_flag = ENQUEUE_HEAD;
+                       enqueue_flag |= ENQUEUE_HEAD;
                p->sched_class = &rt_sched_class;
        } else {
                if (dl_prio(oldprio))
@@ -3423,7 +3450,7 @@ void set_user_nice(struct task_struct *p, long nice)
        }
        queued = task_on_rq_queued(p);
        if (queued)
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
 
        p->static_prio = NICE_TO_PRIO(nice);
        set_load_weight(p);
@@ -3432,7 +3459,7 @@ void set_user_nice(struct task_struct *p, long nice)
        delta = p->prio - old_prio;
 
        if (queued) {
-               enqueue_task(rq, p, 0);
+               enqueue_task(rq, p, ENQUEUE_RESTORE);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@ -3753,10 +3780,7 @@ recheck:
        } else {
                reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
 
-               if (policy != SCHED_DEADLINE &&
-                               policy != SCHED_FIFO && policy != SCHED_RR &&
-                               policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-                               policy != SCHED_IDLE)
+               if (!valid_policy(policy))
                        return -EINVAL;
        }
 
@@ -3812,7 +3836,7 @@ recheck:
                 * Treat SCHED_IDLE as nice 20. Only allow a switch to
                 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
                 */
-               if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+               if (idle_policy(p->policy) && !idle_policy(policy)) {
                        if (!can_nice(p, task_nice(p)))
                                return -EPERM;
                }
@@ -3937,7 +3961,7 @@ change:
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);
 
@@ -3947,11 +3971,15 @@ change:
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued) {
+               int enqueue_flags = ENQUEUE_RESTORE;
                /*
                 * We enqueue to tail when the priority of a task is
                 * increased (user space view).
                 */
-               enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+               if (oldprio <= p->prio)
+                       enqueue_flags |= ENQUEUE_HEAD;
+
+               enqueue_task(rq, p, enqueue_flags);
        }
 
        check_class_changed(rq, p, prev_class, oldprio);
@@ -5101,7 +5129,7 @@ void sched_setnuma(struct task_struct *p, int nid)
        running = task_current(rq, p);
 
        if (queued)
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);
 
@@ -5110,7 +5138,7 @@ void sched_setnuma(struct task_struct *p, int nid)
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued)
-               enqueue_task(rq, p, 0);
+               enqueue_task(rq, p, ENQUEUE_RESTORE);
        task_rq_unlock(rq, p, &flags);
 }
 #endif /* CONFIG_NUMA_BALANCING */
@@ -5531,21 +5559,27 @@ static void set_cpu_rq_start_time(void)
 static int sched_cpu_active(struct notifier_block *nfb,
                                      unsigned long action, void *hcpu)
 {
+       int cpu = (long)hcpu;
+
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_STARTING:
                set_cpu_rq_start_time();
                return NOTIFY_OK;
+
        case CPU_ONLINE:
                /*
                 * At this point a starting CPU has marked itself as online via
                 * set_cpu_online(). But it might not yet have marked itself
                 * as active, which is essential from here on.
-                *
-                * Thus, fall-through and help the starting CPU along.
                 */
+               set_cpu_active(cpu, true);
+               stop_machine_unpark(cpu);
+               return NOTIFY_OK;
+
        case CPU_DOWN_FAILED:
-               set_cpu_active((long)hcpu, true);
+               set_cpu_active(cpu, true);
                return NOTIFY_OK;
+
        default:
                return NOTIFY_DONE;
        }
@@ -6477,7 +6511,8 @@ static struct sched_domain_topology_level default_topology[] = {
        { NULL, },
 };
 
-struct sched_domain_topology_level *sched_domain_topology = default_topology;
+static struct sched_domain_topology_level *sched_domain_topology =
+       default_topology;
 
 #define for_each_sd_topology(tl)                       \
        for (tl = sched_domain_topology; tl->mask; tl++)
@@ -7478,7 +7513,7 @@ void __init sched_init(void)
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
-       int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+       int nested = preempt_count() + rcu_preempt_depth();
 
        return (nested == preempt_offset);
 }
@@ -7725,7 +7760,7 @@ void sched_move_task(struct task_struct *tsk)
        queued = task_on_rq_queued(tsk);
 
        if (queued)
-               dequeue_task(rq, tsk, 0);
+               dequeue_task(rq, tsk, DEQUEUE_SAVE);
        if (unlikely(running))
                put_prev_task(rq, tsk);
 
@@ -7741,7 +7776,7 @@ void sched_move_task(struct task_struct *tsk)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
        if (tsk->sched_class->task_move_group)
-               tsk->sched_class->task_move_group(tsk, queued);
+               tsk->sched_class->task_move_group(tsk);
        else
 #endif
                set_task_rq(tsk, task_cpu(tsk));
@@ -7749,7 +7784,7 @@ void sched_move_task(struct task_struct *tsk)
        if (unlikely(running))
                tsk->sched_class->set_curr_task(rq);
        if (queued)
-               enqueue_task(rq, tsk, 0);
+               enqueue_task(rq, tsk, ENQUEUE_RESTORE);
 
        task_rq_unlock(rq, tsk, &flags);
 }
@@ -8213,14 +8248,6 @@ static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
                            struct cgroup_subsys_state *old_css,
                            struct task_struct *task)
 {
-       /*
-        * cgroup_exit() is called in the copy_process() failure path.
-        * Ignore this case since the task hasn't ran yet, this avoids
-        * trying to poke a half freed task state from generic code.
-        */
-       if (!(task->flags & PF_EXITING))
-               return;
-
        sched_move_task(task);
 }