Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
author     Linus Torvalds <torvalds@linux-foundation.org>
           Wed, 4 Nov 2015 02:03:50 +0000 (18:03 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Wed, 4 Nov 2015 02:03:50 +0000 (18:03 -0800)
Pull scheduler changes from Ingo Molnar:
 "The main changes in this cycle were:

   - sched/fair load tracking fixes and cleanups (Byungchul Park)

   - Make load tracking frequency scale invariant (Dietmar Eggemann)

   - sched/deadline updates (Juri Lelli)

   - stop machine fixes, cleanups and enhancements for bugs triggered by
     CPU hotplug stress testing (Oleg Nesterov)

   - scheduler preemption code rework: remove PREEMPT_ACTIVE and related
     cleanups (Peter Zijlstra)

   - Rework the sched_info::run_delay code to fix races (Peter Zijlstra)

   - Optimize per entity utilization tracking (Peter Zijlstra)

   - ... misc other fixes, cleanups and smaller updates"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (57 commits)
  sched: Don't scan all-offline ->cpus_allowed twice if !CONFIG_CPUSETS
  sched: Move cpu_active() tests from stop_two_cpus() into migrate_swap_stop()
  sched: Start stopper early
  stop_machine: Kill cpu_stop_threads->setup() and cpu_stop_unpark()
  stop_machine: Kill smp_hotplug_thread->pre_unpark, introduce stop_machine_unpark()
  stop_machine: Change cpu_stop_queue_two_works() to rely on stopper->enabled
  stop_machine: Introduce __cpu_stop_queue_work() and cpu_stop_queue_two_works()
  stop_machine: Ensure that a queued callback will be called before cpu_stop_park()
  sched/x86: Fix typo in __switch_to() comments
  sched/core: Remove a parameter in the migrate_task_rq() function
  sched/core: Drop unlikely behind BUG_ON()
  sched/core: Fix task and run queue sched_info::run_delay inconsistencies
  sched/numa: Fix task_tick_fair() from disabling numa_balancing
  sched/core: Add preempt_count invariant check
  sched/core: More notrace annotations
  sched/core: Kill PREEMPT_ACTIVE
  sched/core, sched/x86: Kill thread_info::saved_preempt_count
  sched/core: Simplify preempt_count tests
  sched/core: Robustify preemption leak checks
  sched/core: Stop setting PREEMPT_ACTIVE
  ...

include/linux/sched.h
kernel/cpu.c
kernel/exit.c
kernel/locking/rtmutex.c
kernel/sched/core.c

diff --combined include/linux/sched.h
index 56667292d1e444df2e9f21fc8281caf32e44f3f5,23ca455d9582693173bd85b9468f9f487164a86c..9e1e06c3ce051e63862a22e7edb5897e7027591e
@@@ -599,36 -599,39 +599,42 @@@ struct task_cputime_atomic 
                .sum_exec_runtime = ATOMIC64_INIT(0),           \
        }
  
- #ifdef CONFIG_PREEMPT_COUNT
- #define PREEMPT_DISABLED      (1 + PREEMPT_ENABLED)
- #else
- #define PREEMPT_DISABLED      PREEMPT_ENABLED
- #endif
+ #define PREEMPT_DISABLED      (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
+ /*
+  * Disable preemption until the scheduler is running -- use an unconditional
+  * value so that it also works on !PREEMPT_COUNT kernels.
+  *
+  * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
+  */
+ #define INIT_PREEMPT_COUNT    PREEMPT_OFFSET
  
  /*
-  * Disable preemption until the scheduler is running.
-  * Reset by start_kernel()->sched_init()->init_idle().
+  * Initial preempt_count value; reflects the preempt_count schedule invariant
+  * which states that during context switches:
   *
-  * We include PREEMPT_ACTIVE to avoid cond_resched() from working
-  * before the scheduler is active -- see should_resched().
+  *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
+  *
+  * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
+  * Note: See finish_task_switch().
   */
- #define INIT_PREEMPT_COUNT    (PREEMPT_DISABLED + PREEMPT_ACTIVE)
+ #define FORK_PREEMPT_COUNT    (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
  
  /**
   * struct thread_group_cputimer - thread group interval timer counts
   * @cputime_atomic:   atomic thread group interval timers.
 - * @running:          non-zero when there are timers running and
 - *                    @cputime receives updates.
 + * @running:          true when there are timers running and
 + *                    @cputime_atomic receives updates.
 + * @checking_timer:   true when a thread in the group is in the
 + *                    process of checking for thread group timers.
   *
   * This structure contains the version of task_cputime, above, that is
   * used for thread group CPU timer calculations.
   */
  struct thread_group_cputimer {
        struct task_cputime_atomic cputime_atomic;
 -      int running;
 +      bool running;
 +      bool checking_timer;
  };
  
  #include <linux/rwsem.h>
@@@ -1142,8 -1145,6 +1148,6 @@@ struct sched_domain_topology_level 
  #endif
  };
  
- extern struct sched_domain_topology_level *sched_domain_topology;
  extern void set_sched_topology(struct sched_domain_topology_level *tl);
  extern void wake_up_if_idle(int cpu);
  
@@@ -1192,10 -1193,10 +1196,10 @@@ struct load_weight 
  
  /*
   * The load_avg/util_avg accumulates an infinite geometric series.
-  * 1) load_avg factors the amount of time that a sched_entity is
-  * runnable on a rq into its weight. For cfs_rq, it is the aggregated
-  * such weights of all runnable and blocked sched_entities.
-  * 2) util_avg factors frequency scaling into the amount of time
+  * 1) load_avg factors frequency scaling into the amount of time that a
+  * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
+  * aggregated such weights of all runnable and blocked sched_entities.
+  * 2) util_avg factors frequency and cpu scaling into the amount of time
   * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
   * For cfs_rq, it is the aggregated such times of all runnable and
   * blocked sched_entities.
@@@ -1345,12 -1346,10 +1349,12 @@@ struct sched_dl_entity 
  
  union rcu_special {
        struct {
 -              bool blocked;
 -              bool need_qs;
 -      } b;
 -      short s;
 +              u8 blocked;
 +              u8 need_qs;
 +              u8 exp_need_qs;
 +              u8 pad; /* Otherwise the compiler can store garbage here. */
 +      } b; /* Bits. */
 +      u32 s; /* Set of bits. */
  };
  struct rcu_node;
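
The new INIT_PREEMPT_COUNT/FORK_PREEMPT_COUNT comments above encode a simple invariant: across a context switch preempt_count() is 2*PREEMPT_DISABLE_OFFSET, one level from schedule()'s preempt_disable() and one from rq->lock. Below is a standalone sketch of that arithmetic, assuming the generic values PREEMPT_ENABLED == 0 and PREEMPT_DISABLE_OFFSET == PREEMPT_OFFSET == 1 for CONFIG_PREEMPT_COUNT kernels (0 otherwise); it is not kernel code, just the bookkeeping spelled out.

#include <assert.h>
#include <stdio.h>

#define PREEMPT_ENABLED		0	/* generic case; arch-specific bits ignored */
#define PREEMPT_OFFSET		1
#define PREEMPT_DISABLE_OFFSET	PREEMPT_OFFSET	/* 0 on !CONFIG_PREEMPT_COUNT */

#define FORK_PREEMPT_COUNT	(2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)

static int preempt_count;

int main(void)
{
	/* schedule(): preempt_disable()               -> count == 1 */
	preempt_count += PREEMPT_DISABLE_OFFSET;
	/* __schedule(): raw_spin_lock_irq(&rq->lock)  -> count == 2 */
	preempt_count += PREEMPT_DISABLE_OFFSET;

	/* What finish_task_switch() expects from the previous task, and
	 * what a freshly forked task starts with (FORK_PREEMPT_COUNT). */
	assert(preempt_count == 2*PREEMPT_DISABLE_OFFSET);
	assert(preempt_count == FORK_PREEMPT_COUNT);

	printf("context-switch preempt_count: %d\n", preempt_count);
	return 0;
}

On !PREEMPT_COUNT kernels PREEMPT_DISABLE_OFFSET is 0, so both increments vanish and the same check still holds, which is why FORK_PREEMPT_COUNT can be defined unconditionally.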
  
diff --combined kernel/cpu.c
index 14a9cdf8abe9e806083ca3041c2861f6ef0713b3,c85df2775b735895d0637c416c712f507828dd9d..85ff5e26e23b45b34201120c758082599f995b7e
@@@ -102,6 -102,19 +102,6 @@@ void get_online_cpus(void
  }
  EXPORT_SYMBOL_GPL(get_online_cpus);
  
 -bool try_get_online_cpus(void)
 -{
 -      if (cpu_hotplug.active_writer == current)
 -              return true;
 -      if (!mutex_trylock(&cpu_hotplug.lock))
 -              return false;
 -      cpuhp_lock_acquire_tryread();
 -      atomic_inc(&cpu_hotplug.refcount);
 -      mutex_unlock(&cpu_hotplug.lock);
 -      return true;
 -}
 -EXPORT_SYMBOL_GPL(try_get_online_cpus);
 -
  void put_online_cpus(void)
  {
        int refcount;
@@@ -291,8 -304,8 +291,8 @@@ static inline void check_for_tasks(int 
  {
        struct task_struct *g, *p;
  
-       read_lock_irq(&tasklist_lock);
-       do_each_thread(g, p) {
+       read_lock(&tasklist_lock);
+       for_each_process_thread(g, p) {
                if (!p->on_rq)
                        continue;
                /*
  
                pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
                        p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
-       } while_each_thread(g, p);
-       read_unlock_irq(&tasklist_lock);
+       }
+       read_unlock(&tasklist_lock);
  }
  
  struct take_cpu_down_param {
@@@ -331,7 -344,7 +331,7 @@@ static int take_cpu_down(void *_param
        /* Give up timekeeping duties */
        tick_handover_do_timer();
        /* Park the stopper thread */
-       kthread_park(current);
+       stop_machine_park((long)param->hcpu);
        return 0;
  }
  
diff --combined kernel/exit.c
index 0e93b63bbc59292815f49ddc95c0eceafa48b7e4,443677c8efe6ec17ac11dc2fe699f0fcb61fa107..07110c6020a04ea37c04bc18bd0b9287cd0466dc
@@@ -706,10 -706,12 +706,12 @@@ void do_exit(long code
        smp_mb();
        raw_spin_unlock_wait(&tsk->pi_lock);
  
-       if (unlikely(in_atomic()))
+       if (unlikely(in_atomic())) {
                pr_info("note: %s[%d] exited with preempt_count %d\n",
                        current->comm, task_pid_nr(current),
                        preempt_count());
+               preempt_count_set(PREEMPT_ENABLED);
+       }
  
        /* sync mm's RSS info before statistics gathering */
        if (tsk->mm)
         */
        flush_ptrace_hw_breakpoint(tsk);
  
 +      TASKS_RCU(preempt_disable());
        TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
 +      TASKS_RCU(preempt_enable());
        exit_notify(tsk, group_dead);
        proc_exit_connector(tsk);
  #ifdef CONFIG_NUMA
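
The do_exit() hunk above no longer just warns when a task exits in atomic context; it also resets the leaked count with preempt_count_set(PREEMPT_ENABLED), so the final schedule() sees a consistent value. Below is a toy userspace analogue of that detect-and-reset pattern; the helper names mimic the kernel ones but are purely local stand-ins.

#include <stdio.h>

static int preempt_count;		/* stands in for the per-task count */

static void preempt_disable(void)	{ preempt_count++; }
static void preempt_enable(void)	{ preempt_count--; }

static void balanced_path(void)
{
	preempt_disable();
	/* ... */
	preempt_enable();
}

static void buggy_exit_path(void)
{
	preempt_disable();
	/* ... error path returns without the matching preempt_enable() ... */
}

static void toy_do_exit(void)
{
	if (preempt_count != 0) {	/* in_atomic() in the kernel */
		fprintf(stderr, "note: exited with preempt_count %d\n",
			preempt_count);
		preempt_count = 0;	/* preempt_count_set(PREEMPT_ENABLED) */
	}
	/* ... the rest of exit can now block with a sane count ... */
}

int main(void)
{
	balanced_path();
	buggy_exit_path();
	toy_do_exit();
	return 0;
}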
diff --combined kernel/locking/rtmutex.c
index bbb72b4f64a148de2bcf186acf5b2e08d2d77207,35e9bfcc6ad940d90e9790544106b87912a05195..8251e75dd9c0bd67337754baf6f03fb2cf1f956f
@@@ -74,23 -74,14 +74,23 @@@ static void fixup_rt_mutex_waiters(stru
   * set up.
   */
  #ifndef CONFIG_DEBUG_RT_MUTEXES
 -# define rt_mutex_cmpxchg(l,c,n)      (cmpxchg(&l->owner, c, n) == c)
 +# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c)
 +# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c)
 +# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c)
 +
 +/*
 + * Callers must hold the ->wait_lock -- which is the whole purpose as we force
 + * all future threads that attempt to [Rmw] the lock to the slowpath. As such
 + * relaxed semantics suffice.
 + */
  static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
  {
        unsigned long owner, *p = (unsigned long *) &lock->owner;
  
        do {
                owner = *p;
 -      } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
 +      } while (cmpxchg_relaxed(p, owner,
 +                               owner | RT_MUTEX_HAS_WAITERS) != owner);
  }
  
  /*
@@@ -130,14 -121,11 +130,14 @@@ static inline bool unlock_rt_mutex_safe
         *                                      lock(wait_lock);
         *                                      acquire(lock);
         */
 -      return rt_mutex_cmpxchg(lock, owner, NULL);
 +      return rt_mutex_cmpxchg_release(lock, owner, NULL);
  }
  
  #else
 -# define rt_mutex_cmpxchg(l,c,n)      (0)
 +# define rt_mutex_cmpxchg_relaxed(l,c,n)      (0)
 +# define rt_mutex_cmpxchg_acquire(l,c,n)      (0)
 +# define rt_mutex_cmpxchg_release(l,c,n)      (0)
 +
  static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
  {
        lock->owner = (struct task_struct *)
@@@ -170,7 -158,8 +170,8 @@@ rt_mutex_waiter_less(struct rt_mutex_wa
         * then right waiter has a dl_prio() too.
         */
        if (dl_prio(left->prio))
-               return (left->task->dl.deadline < right->task->dl.deadline);
+               return dl_time_before(left->task->dl.deadline,
+                                     right->task->dl.deadline);
  
        return 0;
  }
@@@ -1333,7 -1322,7 +1334,7 @@@ rt_mutex_fastlock(struct rt_mutex *lock
                                struct hrtimer_sleeper *timeout,
                                enum rtmutex_chainwalk chwalk))
  {
 -      if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
 +      if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
                rt_mutex_deadlock_account_lock(lock, current);
                return 0;
        } else
@@@ -1349,7 -1338,7 +1350,7 @@@ rt_mutex_timed_fastlock(struct rt_mute
                                      enum rtmutex_chainwalk chwalk))
  {
        if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
 -          likely(rt_mutex_cmpxchg(lock, NULL, current))) {
 +          likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
                rt_mutex_deadlock_account_lock(lock, current);
                return 0;
        } else
@@@ -1360,7 -1349,7 +1361,7 @@@ static inline in
  rt_mutex_fasttrylock(struct rt_mutex *lock,
                     int (*slowfn)(struct rt_mutex *lock))
  {
 -      if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
 +      if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
                rt_mutex_deadlock_account_lock(lock, current);
                return 1;
        }
@@@ -1374,7 -1363,7 +1375,7 @@@ rt_mutex_fastunlock(struct rt_mutex *lo
  {
        WAKE_Q(wake_q);
  
 -      if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
 +      if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
                rt_mutex_deadlock_account_unlock(current);
  
        } else {
@@@ -1496,7 -1485,7 +1497,7 @@@ EXPORT_SYMBOL_GPL(rt_mutex_unlock)
  bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
                                   struct wake_q_head *wqh)
  {
 -      if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
 +      if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
                rt_mutex_deadlock_account_unlock(current);
                return false;
        }
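
The rtmutex fastpaths above split the old plain rt_mutex_cmpxchg() into _acquire, _release and _relaxed variants. A minimal C11 sketch of that ordering choice follows; it is a userspace toy illustrating the memory-ordering intent, not the kernel implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct toy_rtmutex {
	_Atomic(void *) owner;		/* NULL == unlocked */
};

/* Fastpath lock: rt_mutex_cmpxchg_acquire(lock, NULL, current) */
static bool toy_trylock(struct toy_rtmutex *lock, void *me)
{
	void *expected = NULL;

	return atomic_compare_exchange_strong_explicit(&lock->owner,
			&expected, me,
			memory_order_acquire, memory_order_relaxed);
}

/* Fastpath unlock: rt_mutex_cmpxchg_release(lock, current, NULL) */
static bool toy_unlock(struct toy_rtmutex *lock, void *me)
{
	void *expected = me;

	return atomic_compare_exchange_strong_explicit(&lock->owner,
			&expected, NULL,
			memory_order_release, memory_order_relaxed);
}

int main(void)
{
	struct toy_rtmutex lock = { .owner = NULL };
	int task;			/* stands in for "current" */

	if (toy_trylock(&lock, &task)) {
		/* critical section: cannot float above the acquire ... */
		toy_unlock(&lock, &task);	/* ... or below the release */
	}
	return 0;
}

Acquire on the locking cmpxchg pairs with release on the unlocking one, so the critical section cannot leak past either end; mark_rt_mutex_waiters() can stay fully relaxed because, as the new comment notes, its callers already hold ->wait_lock.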
diff --combined kernel/sched/core.c
index f7402f7eb44803a6659b5f2b6337d64624dd8d0a,b4d263db52a6f6d7b09dab05508c91fb526af929..aa5973220ad213a960092012bf4493f296dab90b
@@@ -817,7 -817,7 +817,7 @@@ static void set_load_weight(struct task
        /*
         * SCHED_IDLE tasks get minimal weight:
         */
-       if (p->policy == SCHED_IDLE) {
+       if (idle_policy(p->policy)) {
                load->weight = scale_load(WEIGHT_IDLEPRIO);
                load->inv_weight = WMULT_IDLEPRIO;
                return;
        load->inv_weight = prio_to_wmult[prio];
  }
  
- static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
  {
        update_rq_clock(rq);
-       sched_info_queued(rq, p);
+       if (!(flags & ENQUEUE_RESTORE))
+               sched_info_queued(rq, p);
        p->sched_class->enqueue_task(rq, p, flags);
  }
  
- static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
  {
        update_rq_clock(rq);
-       sched_info_dequeued(rq, p);
+       if (!(flags & DEQUEUE_SAVE))
+               sched_info_dequeued(rq, p);
        p->sched_class->dequeue_task(rq, p, flags);
  }
  
@@@ -1178,7 -1180,7 +1180,7 @@@ void do_set_cpus_allowed(struct task_st
                 * holding rq->lock.
                 */
                lockdep_assert_held(&rq->lock);
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
        }
        if (running)
                put_prev_task(rq, p);
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued)
-               enqueue_task(rq, p, 0);
+               enqueue_task(rq, p, ENQUEUE_RESTORE);
  }
  
  /*
@@@ -1292,7 -1294,7 +1294,7 @@@ void set_task_cpu(struct task_struct *p
  
        if (task_cpu(p) != new_cpu) {
                if (p->sched_class->migrate_task_rq)
-                       p->sched_class->migrate_task_rq(p, new_cpu);
+                       p->sched_class->migrate_task_rq(p);
                p->se.nr_migrations++;
                perf_event_task_migrate(p);
        }
@@@ -1333,12 -1335,16 +1335,16 @@@ static int migrate_swap_stop(void *data
        struct rq *src_rq, *dst_rq;
        int ret = -EAGAIN;
  
+       if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
+               return -EAGAIN;
        src_rq = cpu_rq(arg->src_cpu);
        dst_rq = cpu_rq(arg->dst_cpu);
  
        double_raw_lock(&arg->src_task->pi_lock,
                        &arg->dst_task->pi_lock);
        double_rq_lock(src_rq, dst_rq);
        if (task_cpu(arg->dst_task) != arg->dst_cpu)
                goto unlock;
  
@@@ -1574,13 -1580,15 +1580,15 @@@ static int select_fallback_rq(int cpu, 
                        goto out;
                }
  
+               /* No more Mr. Nice Guy. */
                switch (state) {
                case cpuset:
-                       /* No more Mr. Nice Guy. */
-                       cpuset_cpus_allowed_fallback(p);
-                       state = possible;
-                       break;
+                       if (IS_ENABLED(CONFIG_CPUSETS)) {
+                               cpuset_cpus_allowed_fallback(p);
+                               state = possible;
+                               break;
+                       }
+                       /* fall-through */
                case possible:
                        do_set_cpus_allowed(p, cpu_possible_mask);
                        state = fail;
@@@ -1692,7 -1700,7 +1700,7 @@@ ttwu_stat(struct task_struct *p, int cp
  #endif /* CONFIG_SCHEDSTATS */
  }
  
- static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
  {
        activate_task(rq, p, en_flags);
        p->on_rq = TASK_ON_RQ_QUEUED;
@@@ -2114,23 -2122,17 +2122,17 @@@ static void __sched_fork(unsigned long 
  #endif /* CONFIG_NUMA_BALANCING */
  }
  
+ DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
  #ifdef CONFIG_NUMA_BALANCING
- #ifdef CONFIG_SCHED_DEBUG
  void set_numabalancing_state(bool enabled)
  {
        if (enabled)
-               sched_feat_set("NUMA");
+               static_branch_enable(&sched_numa_balancing);
        else
-               sched_feat_set("NO_NUMA");
+               static_branch_disable(&sched_numa_balancing);
  }
- #else
- __read_mostly bool numabalancing_enabled;
- void set_numabalancing_state(bool enabled)
- {
-       numabalancing_enabled = enabled;
- }
- #endif /* CONFIG_SCHED_DEBUG */
  
  #ifdef CONFIG_PROC_SYSCTL
  int sysctl_numa_balancing(struct ctl_table *table, int write,
  {
        struct ctl_table t;
        int err;
-       int state = numabalancing_enabled;
+       int state = static_branch_likely(&sched_numa_balancing);
  
        if (write && !capable(CAP_SYS_ADMIN))
                return -EPERM;
@@@ -2349,6 -2351,8 +2351,8 @@@ void wake_up_new_task(struct task_struc
        struct rq *rq;
  
        raw_spin_lock_irqsave(&p->pi_lock, flags);
+       /* Initialize new task's runnable average */
+       init_entity_runnable_average(&p->se);
  #ifdef CONFIG_SMP
        /*
         * Fork balancing, do it here and not earlier because:
        set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
  #endif
  
-       /* Initialize new task's runnable average */
-       init_entity_runnable_average(&p->se);
        rq = __task_rq_lock(p);
        activate_task(rq, p, 0);
        p->on_rq = TASK_ON_RQ_QUEUED;
        trace_sched_wakeup_new(p);
        check_preempt_curr(rq, p, WF_FORK);
  #ifdef CONFIG_SMP
 -      if (p->sched_class->task_woken)
 +      if (p->sched_class->task_woken) {
 +              /*
 +               * Nothing relies on rq->lock after this, so its fine to
 +               * drop it.
 +               */
 +              lockdep_unpin_lock(&rq->lock);
                p->sched_class->task_woken(rq, p);
 +              lockdep_pin_lock(&rq->lock);
 +      }
  #endif
        task_rq_unlock(rq, p, &flags);
  }
@@@ -2483,7 -2478,6 +2485,6 @@@ static inline voi
  prepare_task_switch(struct rq *rq, struct task_struct *prev,
                    struct task_struct *next)
  {
-       trace_sched_switch(prev, next);
        sched_info_switch(rq, prev, next);
        perf_event_task_sched_out(prev, next);
        fire_sched_out_preempt_notifiers(prev, next);
@@@ -2517,6 -2511,22 +2518,22 @@@ static struct rq *finish_task_switch(st
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
  
+       /*
+        * The previous task will have left us with a preempt_count of 2
+        * because it left us after:
+        *
+        *      schedule()
+        *        preempt_disable();                    // 1
+        *        __schedule()
+        *          raw_spin_lock_irq(&rq->lock)        // 2
+        *
+        * Also, see FORK_PREEMPT_COUNT.
+        */
+       if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
+                     "corrupted preempt_count: %s/%d/0x%x\n",
+                     current->comm, current->pid, preempt_count()))
+               preempt_count_set(FORK_PREEMPT_COUNT);
        rq->prev_mm = NULL;
  
        /*
@@@ -2601,8 -2611,15 +2618,15 @@@ asmlinkage __visible void schedule_tail
  {
        struct rq *rq;
  
-       /* finish_task_switch() drops rq->lock and enables preemtion */
-       preempt_disable();
+       /*
+        * New tasks start with FORK_PREEMPT_COUNT, see there and
+        * finish_task_switch() for details.
+        *
+        * finish_task_switch() will drop rq->lock() and lower preempt_count
+        * and the preempt_enable() will end up enabling preemption (on
+        * PREEMPT_COUNT kernels).
+        */
        rq = finish_task_switch(prev);
        balance_callback(rq);
        preempt_enable();
@@@ -2960,15 -2977,13 +2984,13 @@@ static noinline void __schedule_bug(str
  static inline void schedule_debug(struct task_struct *prev)
  {
  #ifdef CONFIG_SCHED_STACK_END_CHECK
-       BUG_ON(unlikely(task_stack_end_corrupted(prev)));
+       BUG_ON(task_stack_end_corrupted(prev));
  #endif
-       /*
-        * Test if we are atomic. Since do_exit() needs to call into
-        * schedule() atomically, we ignore that path. Otherwise whine
-        * if we are scheduling when we should not.
-        */
-       if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
+       if (unlikely(in_atomic_preempt_off())) {
                __schedule_bug(prev);
+               preempt_count_set(PREEMPT_DISABLED);
+       }
        rcu_sleep_check();
  
        profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@@ -3054,7 -3069,7 +3076,7 @@@ again
   *
   * WARNING: must be called with preemption disabled!
   */
- static void __sched __schedule(void)
+ static void __sched notrace __schedule(bool preempt)
  {
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        rcu_note_context_switch();
        prev = rq->curr;
  
+       /*
+        * do_exit() calls schedule() with preemption disabled as an exception;
+        * however we must fix that up, otherwise the next task will see an
+        * inconsistent (higher) preempt count.
+        *
+        * It also avoids the below schedule_debug() test from complaining
+        * about this.
+        */
+       if (unlikely(prev->state == TASK_DEAD))
+               preempt_enable_no_resched_notrace();
        schedule_debug(prev);
  
        if (sched_feat(HRTICK))
        rq->clock_skip_update <<= 1; /* promote REQ to ACT */
  
        switch_count = &prev->nivcsw;
-       if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+       if (!preempt && prev->state) {
                if (unlikely(signal_pending_state(prev->state, prev))) {
                        prev->state = TASK_RUNNING;
                } else {
                rq->curr = next;
                ++*switch_count;
  
+               trace_sched_switch(preempt, prev, next);
                rq = context_switch(rq, prev, next); /* unlocks the rq */
                cpu = cpu_of(rq);
        } else {
@@@ -3148,7 -3175,7 +3182,7 @@@ asmlinkage __visible void __sched sched
        sched_submit_work(tsk);
        do {
                preempt_disable();
-               __schedule();
+               __schedule(false);
                sched_preempt_enable_no_resched();
        } while (need_resched());
  }
@@@ -3188,9 -3215,9 +3222,9 @@@ void __sched schedule_preempt_disabled(
  static void __sched notrace preempt_schedule_common(void)
  {
        do {
-               preempt_active_enter();
-               __schedule();
-               preempt_active_exit();
+               preempt_disable_notrace();
+               __schedule(true);
+               preempt_enable_no_resched_notrace();
  
                /*
                 * Check again in case we missed a preemption opportunity
@@@ -3241,24 -3268,17 +3275,17 @@@ asmlinkage __visible void __sched notra
                return;
  
        do {
-               /*
-                * Use raw __prempt_count() ops that don't call function.
-                * We can't call functions before disabling preemption which
-                * disarm preemption tracing recursions.
-                */
-               __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
-               barrier();
+               preempt_disable_notrace();
                /*
                 * Needs preempt disabled in case user_exit() is traced
                 * and the tracer calls preempt_enable_notrace() causing
                 * an infinite recursion.
                 */
                prev_ctx = exception_enter();
-               __schedule();
+               __schedule(true);
                exception_exit(prev_ctx);
  
-               barrier();
-               __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+               preempt_enable_no_resched_notrace();
        } while (need_resched());
  }
  EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
@@@ -3281,11 -3301,11 +3308,11 @@@ asmlinkage __visible void __sched preem
        prev_state = exception_enter();
  
        do {
-               preempt_active_enter();
+               preempt_disable();
                local_irq_enable();
-               __schedule();
+               __schedule(true);
                local_irq_disable();
-               preempt_active_exit();
+               sched_preempt_enable_no_resched();
        } while (need_resched());
  
        exception_exit(prev_state);
@@@ -3313,7 -3333,7 +3340,7 @@@ EXPORT_SYMBOL(default_wake_function)
   */
  void rt_mutex_setprio(struct task_struct *p, int prio)
  {
-       int oldprio, queued, running, enqueue_flag = 0;
+       int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
        struct rq *rq;
        const struct sched_class *prev_class;
  
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);
  
                if (!dl_prio(p->normal_prio) ||
                    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                        p->dl.dl_boosted = 1;
-                       enqueue_flag = ENQUEUE_REPLENISH;
+                       enqueue_flag |= ENQUEUE_REPLENISH;
                } else
                        p->dl.dl_boosted = 0;
                p->sched_class = &dl_sched_class;
                if (dl_prio(oldprio))
                        p->dl.dl_boosted = 0;
                if (oldprio < prio)
-                       enqueue_flag = ENQUEUE_HEAD;
+                       enqueue_flag |= ENQUEUE_HEAD;
                p->sched_class = &rt_sched_class;
        } else {
                if (dl_prio(oldprio))
@@@ -3423,7 -3443,7 +3450,7 @@@ void set_user_nice(struct task_struct *
        }
        queued = task_on_rq_queued(p);
        if (queued)
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
  
        p->static_prio = NICE_TO_PRIO(nice);
        set_load_weight(p);
        delta = p->prio - old_prio;
  
        if (queued) {
-               enqueue_task(rq, p, 0);
+               enqueue_task(rq, p, ENQUEUE_RESTORE);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@@ -3753,10 -3773,7 +3780,7 @@@ recheck
        } else {
                reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
  
-               if (policy != SCHED_DEADLINE &&
-                               policy != SCHED_FIFO && policy != SCHED_RR &&
-                               policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-                               policy != SCHED_IDLE)
+               if (!valid_policy(policy))
                        return -EINVAL;
        }
  
                 * Treat SCHED_IDLE as nice 20. Only allow a switch to
                 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
                 */
-               if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+               if (idle_policy(p->policy) && !idle_policy(policy)) {
                        if (!can_nice(p, task_nice(p)))
                                return -EPERM;
                }
@@@ -3937,7 -3954,7 +3961,7 @@@ change
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);
  
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued) {
+               int enqueue_flags = ENQUEUE_RESTORE;
                /*
                 * We enqueue to tail when the priority of a task is
                 * increased (user space view).
                 */
-               enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+               if (oldprio <= p->prio)
+                       enqueue_flags |= ENQUEUE_HEAD;
+               enqueue_task(rq, p, enqueue_flags);
        }
  
        check_class_changed(rq, p, prev_class, oldprio);
@@@ -4029,7 -4050,6 +4057,7 @@@ int sched_setscheduler_nocheck(struct t
  {
        return _sched_setscheduler(p, policy, param, false);
  }
 +EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
  
  static int
  do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
@@@ -5101,7 -5121,7 +5129,7 @@@ void sched_setnuma(struct task_struct *
        running = task_current(rq, p);
  
        if (queued)
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);
  
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued)
-               enqueue_task(rq, p, 0);
+               enqueue_task(rq, p, ENQUEUE_RESTORE);
        task_rq_unlock(rq, p, &flags);
  }
  #endif /* CONFIG_NUMA_BALANCING */
@@@ -5531,21 -5551,27 +5559,27 @@@ static void set_cpu_rq_start_time(void
  static int sched_cpu_active(struct notifier_block *nfb,
                                      unsigned long action, void *hcpu)
  {
+       int cpu = (long)hcpu;
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_STARTING:
                set_cpu_rq_start_time();
                return NOTIFY_OK;
        case CPU_ONLINE:
                /*
                 * At this point a starting CPU has marked itself as online via
                 * set_cpu_online(). But it might not yet have marked itself
                 * as active, which is essential from here on.
-                *
-                * Thus, fall-through and help the starting CPU along.
                 */
+               set_cpu_active(cpu, true);
+               stop_machine_unpark(cpu);
+               return NOTIFY_OK;
        case CPU_DOWN_FAILED:
-               set_cpu_active((long)hcpu, true);
+               set_cpu_active(cpu, true);
                return NOTIFY_OK;
        default:
                return NOTIFY_DONE;
        }
@@@ -6477,7 -6503,8 +6511,8 @@@ static struct sched_domain_topology_lev
        { NULL, },
  };
  
- struct sched_domain_topology_level *sched_domain_topology = default_topology;
+ static struct sched_domain_topology_level *sched_domain_topology =
+       default_topology;
  
  #define for_each_sd_topology(tl)                      \
        for (tl = sched_domain_topology; tl->mask; tl++)
@@@ -7478,7 -7505,7 +7513,7 @@@ void __init sched_init(void
  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
  static inline int preempt_count_equals(int preempt_offset)
  {
-       int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+       int nested = preempt_count() + rcu_preempt_depth();
  
        return (nested == preempt_offset);
  }
@@@ -7725,7 -7752,7 +7760,7 @@@ void sched_move_task(struct task_struc
        queued = task_on_rq_queued(tsk);
  
        if (queued)
-               dequeue_task(rq, tsk, 0);
+               dequeue_task(rq, tsk, DEQUEUE_SAVE);
        if (unlikely(running))
                put_prev_task(rq, tsk);
  
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
        if (tsk->sched_class->task_move_group)
-               tsk->sched_class->task_move_group(tsk, queued);
+               tsk->sched_class->task_move_group(tsk);
        else
  #endif
                set_task_rq(tsk, task_cpu(tsk));
        if (unlikely(running))
                tsk->sched_class->set_curr_task(rq);
        if (queued)
-               enqueue_task(rq, tsk, 0);
+               enqueue_task(rq, tsk, ENQUEUE_RESTORE);
  
        task_rq_unlock(rq, tsk, &flags);
  }
@@@ -8213,14 -8240,6 +8248,6 @@@ static void cpu_cgroup_exit(struct cgro
                            struct cgroup_subsys_state *old_css,
                            struct task_struct *task)
  {
-       /*
-        * cgroup_exit() is called in the copy_process() failure path.
-        * Ignore this case since the task hasn't ran yet, this avoids
-        * trying to poke a half freed task state from generic code.
-        */
-       if (!(task->flags & PF_EXITING))
-               return;
        sched_move_task(task);
  }
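
Many hunks in kernel/sched/core.c above convert the temporary dequeue/enqueue pairs around attribute changes to DEQUEUE_SAVE/ENQUEUE_RESTORE, so sched_info::run_delay is not charged for them. Below is a compact, self-contained sketch of that pattern; the flag values, struct layout and helper names are illustrative, not copied from the kernel headers.

#include <stdio.h>

#define DEQUEUE_SAVE		0x01	/* made-up values for the sketch */
#define ENQUEUE_RESTORE		0x01

struct toy_task {
	int on_rq;
	unsigned long sched_info_events;	/* proxy for run_delay updates */
};

static void sched_info_dequeued(struct toy_task *p) { p->sched_info_events++; }
static void sched_info_queued(struct toy_task *p)   { p->sched_info_events++; }

static void dequeue_task(struct toy_task *p, int flags)
{
	if (!(flags & DEQUEUE_SAVE))	/* account real removals only */
		sched_info_dequeued(p);
	p->on_rq = 0;
}

static void enqueue_task(struct toy_task *p, int flags)
{
	if (!(flags & ENQUEUE_RESTORE))	/* account real insertions only */
		sched_info_queued(p);
	p->on_rq = 1;
}

int main(void)
{
	struct toy_task p = { .on_rq = 1, .sched_info_events = 0 };

	/* set_user_nice()/rt_mutex_setprio()-style attribute change: */
	dequeue_task(&p, DEQUEUE_SAVE);
	/* ... change priority, weight, cgroup, NUMA node ... */
	enqueue_task(&p, ENQUEUE_RESTORE);

	/* The save/restore cycle left the accounting untouched. */
	printf("sched_info events charged: %lu\n", p.sched_info_events);
	return 0;
}

Real wakeups and sleeps still pass flags without these bits, so genuine queueing delay keeps being recorded; only the internal off/on dance around an attribute change is exempted, which is the point of the sched_info::run_delay fix in this pull.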