Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
author     Linus Torvalds <torvalds@linux-foundation.org>
           Wed, 4 Nov 2015 02:03:50 +0000 (18:03 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Wed, 4 Nov 2015 02:03:50 +0000 (18:03 -0800)
Pull scheduler changes from Ingo Molnar:
 "The main changes in this cycle were:

   - sched/fair load tracking fixes and cleanups (Byungchul Park)

   - Make load tracking frequency scale invariant (Dietmar Eggemann)

   - sched/deadline updates (Juri Lelli)

   - stop machine fixes, cleanups and enhancements for bugs triggered by
     CPU hotplug stress testing (Oleg Nesterov)

   - scheduler preemption code rework: remove PREEMPT_ACTIVE and related
     cleanups (Peter Zijlstra)

   - Rework the sched_info::run_delay code to fix races (Peter Zijlstra)

   - Optimize per entity utilization tracking (Peter Zijlstra)

   - ... misc other fixes, cleanups and smaller updates"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (57 commits)
  sched: Don't scan all-offline ->cpus_allowed twice if !CONFIG_CPUSETS
  sched: Move cpu_active() tests from stop_two_cpus() into migrate_swap_stop()
  sched: Start stopper early
  stop_machine: Kill cpu_stop_threads->setup() and cpu_stop_unpark()
  stop_machine: Kill smp_hotplug_thread->pre_unpark, introduce stop_machine_unpark()
  stop_machine: Change cpu_stop_queue_two_works() to rely on stopper->enabled
  stop_machine: Introduce __cpu_stop_queue_work() and cpu_stop_queue_two_works()
  stop_machine: Ensure that a queued callback will be called before cpu_stop_park()
  sched/x86: Fix typo in __switch_to() comments
  sched/core: Remove a parameter in the migrate_task_rq() function
  sched/core: Drop unlikely behind BUG_ON()
  sched/core: Fix task and run queue sched_info::run_delay inconsistencies
  sched/numa: Fix task_tick_fair() from disabling numa_balancing
  sched/core: Add preempt_count invariant check
  sched/core: More notrace annotations
  sched/core: Kill PREEMPT_ACTIVE
  sched/core, sched/x86: Kill thread_info::saved_preempt_count
  sched/core: Simplify preempt_count tests
  sched/core: Robustify preemption leak checks
  sched/core: Stop setting PREEMPT_ACTIVE
  ...

include/linux/sched.h
kernel/cpu.c
kernel/exit.c
kernel/locking/rtmutex.c
kernel/sched/core.c

diff --combined include/linux/sched.h
index 56667292d1e444df2e9f21fc8281caf32e44f3f5,23ca455d9582693173bd85b9468f9f487164a86c..9e1e06c3ce051e63862a22e7edb5897e7027591e
@@@ -599,36 -599,39 +599,42 @@@ struct task_cputime_atomic 
                .sum_exec_runtime = ATOMIC64_INIT(0),           \
        }
  
- #ifdef CONFIG_PREEMPT_COUNT
- #define PREEMPT_DISABLED      (1 + PREEMPT_ENABLED)
- #else
- #define PREEMPT_DISABLED      PREEMPT_ENABLED
- #endif
+ #define PREEMPT_DISABLED      (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
+ /*
+  * Disable preemption until the scheduler is running -- use an unconditional
+  * value so that it also works on !PREEMPT_COUNT kernels.
+  *
+  * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
+  */
+ #define INIT_PREEMPT_COUNT    PREEMPT_OFFSET
  
  /*
-  * Disable preemption until the scheduler is running.
-  * Reset by start_kernel()->sched_init()->init_idle().
+  * Initial preempt_count value; reflects the preempt_count schedule invariant
+  * which states that during context switches:
   *
-  * We include PREEMPT_ACTIVE to avoid cond_resched() from working
-  * before the scheduler is active -- see should_resched().
+  *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
+  *
+  * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
+  * Note: See finish_task_switch().
   */
- #define INIT_PREEMPT_COUNT    (PREEMPT_DISABLED + PREEMPT_ACTIVE)
+ #define FORK_PREEMPT_COUNT    (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
  
  /**
   * struct thread_group_cputimer - thread group interval timer counts
   * @cputime_atomic:   atomic thread group interval timers.
 - * @running:          non-zero when there are timers running and
 - *                    @cputime receives updates.
 + * @running:          true when there are timers running and
 + *                    @cputime_atomic receives updates.
 + * @checking_timer:   true when a thread in the group is in the
 + *                    process of checking for thread group timers.
   *
   * This structure contains the version of task_cputime, above, that is
   * used for thread group CPU timer calculations.
   */
  struct thread_group_cputimer {
        struct task_cputime_atomic cputime_atomic;
 -      int running;
 +      bool running;
 +      bool checking_timer;
  };
  
  #include <linux/rwsem.h>
@@@ -1142,8 -1145,6 +1148,6 @@@ struct sched_domain_topology_level 
  #endif
  };
  
- extern struct sched_domain_topology_level *sched_domain_topology;
  extern void set_sched_topology(struct sched_domain_topology_level *tl);
  extern void wake_up_if_idle(int cpu);
  
@@@ -1192,10 -1193,10 +1196,10 @@@ struct load_weight 
  
  /*
   * The load_avg/util_avg accumulates an infinite geometric series.
-  * 1) load_avg factors the amount of time that a sched_entity is
-  * runnable on a rq into its weight. For cfs_rq, it is the aggregated
-  * such weights of all runnable and blocked sched_entities.
-  * 2) util_avg factors frequency scaling into the amount of time
+  * 1) load_avg factors frequency scaling into the amount of time that a
+  * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
+  * aggregated such weights of all runnable and blocked sched_entities.
+  * 2) util_avg factors frequency and cpu scaling into the amount of time
   * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
   * For cfs_rq, it is the aggregated such times of all runnable and
   * blocked sched_entities.
@@@ -1345,12 -1346,10 +1349,12 @@@ struct sched_dl_entity 
  
  union rcu_special {
        struct {
 -              bool blocked;
 -              bool need_qs;
 -      } b;
 -      short s;
 +              u8 blocked;
 +              u8 need_qs;
 +              u8 exp_need_qs;
 +              u8 pad; /* Otherwise the compiler can store garbage here. */
 +      } b; /* Bits. */
 +      u32 s; /* Set of bits. */
  };
  struct rcu_node;
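
The new INIT_PREEMPT_COUNT/FORK_PREEMPT_COUNT comments above encode a simple invariant: across a context switch preempt_count() is 2*PREEMPT_DISABLE_OFFSET, one level from schedule()'s preempt_disable() and one from rq->lock. Below is a standalone sketch of that arithmetic, assuming the generic values PREEMPT_ENABLED == 0 and PREEMPT_DISABLE_OFFSET == PREEMPT_OFFSET == 1 for CONFIG_PREEMPT_COUNT kernels (0 otherwise); it is not kernel code, just the bookkeeping spelled out.

#include <assert.h>
#include <stdio.h>

#define PREEMPT_ENABLED		0	/* generic case; arch-specific bits ignored */
#define PREEMPT_OFFSET		1
#define PREEMPT_DISABLE_OFFSET	PREEMPT_OFFSET	/* 0 on !CONFIG_PREEMPT_COUNT */

#define FORK_PREEMPT_COUNT	(2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)

static int preempt_count;

int main(void)
{
	/* schedule(): preempt_disable()               -> count == 1 */
	preempt_count += PREEMPT_DISABLE_OFFSET;
	/* __schedule(): raw_spin_lock_irq(&rq->lock)  -> count == 2 */
	preempt_count += PREEMPT_DISABLE_OFFSET;

	/* What finish_task_switch() expects from the previous task, and
	 * what a freshly forked task starts with (FORK_PREEMPT_COUNT). */
	assert(preempt_count == 2*PREEMPT_DISABLE_OFFSET);
	assert(preempt_count == FORK_PREEMPT_COUNT);

	printf("context-switch preempt_count: %d\n", preempt_count);
	return 0;
}

On !PREEMPT_COUNT kernels PREEMPT_DISABLE_OFFSET is 0, so both increments vanish and the same check still holds, which is why FORK_PREEMPT_COUNT can be defined unconditionally.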
  
diff --combined kernel/cpu.c
index 14a9cdf8abe9e806083ca3041c2861f6ef0713b3,c85df2775b735895d0637c416c712f507828dd9d..85ff5e26e23b45b34201120c758082599f995b7e
@@@ -102,6 -102,19 +102,6 @@@ void get_online_cpus(void
  }
  EXPORT_SYMBOL_GPL(get_online_cpus);
  
 -bool try_get_online_cpus(void)
 -{
 -      if (cpu_hotplug.active_writer == current)
 -              return true;
 -      if (!mutex_trylock(&cpu_hotplug.lock))
 -              return false;
 -      cpuhp_lock_acquire_tryread();
 -      atomic_inc(&cpu_hotplug.refcount);
 -      mutex_unlock(&cpu_hotplug.lock);
 -      return true;
 -}
 -EXPORT_SYMBOL_GPL(try_get_online_cpus);
 -
  void put_online_cpus(void)
  {
        int refcount;
@@@ -291,8 -304,8 +291,8 @@@ static inline void check_for_tasks(int 
  {
        struct task_struct *g, *p;
  
-       read_lock_irq(&tasklist_lock);
-       do_each_thread(g, p) {
+       read_lock(&tasklist_lock);
+       for_each_process_thread(g, p) {
                if (!p->on_rq)
                        continue;
                /*
  
                pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
                        p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
-       } while_each_thread(g, p);
-       read_unlock_irq(&tasklist_lock);
+       }
+       read_unlock(&tasklist_lock);
  }
  
  struct take_cpu_down_param {
@@@ -331,7 -344,7 +331,7 @@@ static int take_cpu_down(void *_param
        /* Give up timekeeping duties */
        tick_handover_do_timer();
        /* Park the stopper thread */
-       kthread_park(current);
+       stop_machine_park((long)param->hcpu);
        return 0;
  }
  
diff --combined kernel/exit.c
index 0e93b63bbc59292815f49ddc95c0eceafa48b7e4,443677c8efe6ec17ac11dc2fe699f0fcb61fa107..07110c6020a04ea37c04bc18bd0b9287cd0466dc
@@@ -706,10 -706,12 +706,12 @@@ void do_exit(long code
        smp_mb();
        raw_spin_unlock_wait(&tsk->pi_lock);
  
-       if (unlikely(in_atomic()))
+       if (unlikely(in_atomic())) {
                pr_info("note: %s[%d] exited with preempt_count %d\n",
                        current->comm, task_pid_nr(current),
                        preempt_count());
+               preempt_count_set(PREEMPT_ENABLED);
+       }
  
        /* sync mm's RSS info before statistics gathering */
        if (tsk->mm)
         */
        flush_ptrace_hw_breakpoint(tsk);
  
 +      TASKS_RCU(preempt_disable());
        TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
 +      TASKS_RCU(preempt_enable());
        exit_notify(tsk, group_dead);
        proc_exit_connector(tsk);
  #ifdef CONFIG_NUMA
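
The do_exit() hunk above no longer just warns when a task exits in atomic context; it also resets the leaked count with preempt_count_set(PREEMPT_ENABLED), so the final schedule() sees a consistent value. Below is a toy userspace analogue of that detect-and-reset pattern; the helper names mimic the kernel ones but are purely local stand-ins.

#include <stdio.h>

static int preempt_count;		/* stands in for the per-task count */

static void preempt_disable(void)	{ preempt_count++; }
static void preempt_enable(void)	{ preempt_count--; }

static void balanced_path(void)
{
	preempt_disable();
	/* ... */
	preempt_enable();
}

static void buggy_exit_path(void)
{
	preempt_disable();
	/* ... error path returns without the matching preempt_enable() ... */
}

static void toy_do_exit(void)
{
	if (preempt_count != 0) {	/* in_atomic() in the kernel */
		fprintf(stderr, "note: exited with preempt_count %d\n",
			preempt_count);
		preempt_count = 0;	/* preempt_count_set(PREEMPT_ENABLED) */
	}
	/* ... the rest of exit can now block with a sane count ... */
}

int main(void)
{
	balanced_path();
	buggy_exit_path();
	toy_do_exit();
	return 0;
}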
diff --combined kernel/locking/rtmutex.c
index bbb72b4f64a148de2bcf186acf5b2e08d2d77207,35e9bfcc6ad940d90e9790544106b87912a05195..8251e75dd9c0bd67337754baf6f03fb2cf1f956f
@@@ -74,23 -74,14 +74,23 @@@ static void fixup_rt_mutex_waiters(stru
   * set up.
   */
  #ifndef CONFIG_DEBUG_RT_MUTEXES
 -# define rt_mutex_cmpxchg(l,c,n)      (cmpxchg(&l->owner, c, n) == c)
 +# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c)
 +# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c)
 +# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c)
 +
 +/*
 + * Callers must hold the ->wait_lock -- which is the whole purpose as we force
 + * all future threads that attempt to [Rmw] the lock to the slowpath. As such
 + * relaxed semantics suffice.
 + */
  static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
  {
        unsigned long owner, *p = (unsigned long *) &lock->owner;
  
        do {
                owner = *p;
 -      } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
 +      } while (cmpxchg_relaxed(p, owner,
 +                               owner | RT_MUTEX_HAS_WAITERS) != owner);
  }
  
  /*
@@@ -130,14 -121,11 +130,14 @@@ static inline bool unlock_rt_mutex_safe
         *                                      lock(wait_lock);
         *                                      acquire(lock);
         */
 -      return rt_mutex_cmpxchg(lock, owner, NULL);
 +      return rt_mutex_cmpxchg_release(lock, owner, NULL);
  }
  
  #else
 -# define rt_mutex_cmpxchg(l,c,n)      (0)
 +# define rt_mutex_cmpxchg_relaxed(l,c,n)      (0)
 +# define rt_mutex_cmpxchg_acquire(l,c,n)      (0)
 +# define rt_mutex_cmpxchg_release(l,c,n)      (0)
 +
  static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
  {
        lock->owner = (struct task_struct *)
@@@ -170,7 -158,8 +170,8 @@@ rt_mutex_waiter_less(struct rt_mutex_wa
         * then right waiter has a dl_prio() too.
         */
        if (dl_prio(left->prio))
-               return (left->task->dl.deadline < right->task->dl.deadline);
+               return dl_time_before(left->task->dl.deadline,
+                                     right->task->dl.deadline);
  
        return 0;
  }
@@@ -1333,7 -1322,7 +1334,7 @@@ rt_mutex_fastlock(struct rt_mutex *lock
                                struct hrtimer_sleeper *timeout,
                                enum rtmutex_chainwalk chwalk))
  {
 -      if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
 +      if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
                rt_mutex_deadlock_account_lock(lock, current);
                return 0;
        } else
@@@ -1349,7 -1338,7 +1350,7 @@@ rt_mutex_timed_fastlock(struct rt_mute
                                      enum rtmutex_chainwalk chwalk))
  {
        if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
 -          likely(rt_mutex_cmpxchg(lock, NULL, current))) {
 +          likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
                rt_mutex_deadlock_account_lock(lock, current);
                return 0;
        } else
@@@ -1360,7 -1349,7 +1361,7 @@@ static inline in
  rt_mutex_fasttrylock(struct rt_mutex *lock,
                     int (*slowfn)(struct rt_mutex *lock))
  {
 -      if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
 +      if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
                rt_mutex_deadlock_account_lock(lock, current);
                return 1;
        }
@@@ -1374,7 -1363,7 +1375,7 @@@ rt_mutex_fastunlock(struct rt_mutex *lo
  {
        WAKE_Q(wake_q);
  
 -      if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
 +      if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
                rt_mutex_deadlock_account_unlock(current);
  
        } else {
@@@ -1496,7 -1485,7 +1497,7 @@@ EXPORT_SYMBOL_GPL(rt_mutex_unlock)
  bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
                                   struct wake_q_head *wqh)
  {
 -      if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
 +      if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
                rt_mutex_deadlock_account_unlock(current);
                return false;
        }
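
The rtmutex fastpaths above split the old plain rt_mutex_cmpxchg() into _acquire, _release and _relaxed variants. A minimal C11 sketch of that ordering choice follows; it is a userspace toy illustrating the memory-ordering intent, not the kernel implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct toy_rtmutex {
	_Atomic(void *) owner;		/* NULL == unlocked */
};

/* Fastpath lock: rt_mutex_cmpxchg_acquire(lock, NULL, current) */
static bool toy_trylock(struct toy_rtmutex *lock, void *me)
{
	void *expected = NULL;

	return atomic_compare_exchange_strong_explicit(&lock->owner,
			&expected, me,
			memory_order_acquire, memory_order_relaxed);
}

/* Fastpath unlock: rt_mutex_cmpxchg_release(lock, current, NULL) */
static bool toy_unlock(struct toy_rtmutex *lock, void *me)
{
	void *expected = me;

	return atomic_compare_exchange_strong_explicit(&lock->owner,
			&expected, NULL,
			memory_order_release, memory_order_relaxed);
}

int main(void)
{
	struct toy_rtmutex lock = { .owner = NULL };
	int task;			/* stands in for "current" */

	if (toy_trylock(&lock, &task)) {
		/* critical section: cannot float above the acquire ... */
		toy_unlock(&lock, &task);	/* ... or below the release */
	}
	return 0;
}

Acquire on the locking cmpxchg pairs with release on the unlocking one, so the critical section cannot leak past either end; mark_rt_mutex_waiters() can stay fully relaxed because, as the new comment notes, its callers already hold ->wait_lock.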
diff --combined kernel/sched/core.c
index f7402f7eb44803a6659b5f2b6337d64624dd8d0a,b4d263db52a6f6d7b09dab05508c91fb526af929..aa5973220ad213a960092012bf4493f296dab90b
@@@ -817,7 -817,7 +817,7 @@@ static void set_load_weight(struct task
        /*
         * SCHED_IDLE tasks get minimal weight:
         */
-       if (p->policy == SCHED_IDLE) {
+       if (idle_policy(p->policy)) {
                load->weight = scale_load(WEIGHT_IDLEPRIO);
                load->inv_weight = WMULT_IDLEPRIO;
                return;
        load->inv_weight = prio_to_wmult[prio];
  }
  
- static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
  {
        update_rq_clock(rq);
-       sched_info_queued(rq, p);
+       if (!(flags & ENQUEUE_RESTORE))
+               sched_info_queued(rq, p);
        p->sched_class->enqueue_task(rq, p, flags);
  }
  
- static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
  {
        update_rq_clock(rq);
-       sched_info_dequeued(rq, p);
+       if (!(flags & DEQUEUE_SAVE))
+               sched_info_dequeued(rq, p);
        p->sched_class->dequeue_task(rq, p, flags);
  }
  
@@@ -1178,7 -1180,7 +1180,7 @@@ void do_set_cpus_allowed(struct task_st
                 * holding rq->lock.
                 */
                lockdep_assert_held(&rq->lock);
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
        }
        if (running)
                put_prev_task(rq, p);
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued)
-               enqueue_task(rq, p, 0);
+               enqueue_task(rq, p, ENQUEUE_RESTORE);
  }
  
  /*
@@@ -1292,7 -1294,7 +1294,7 @@@ void set_task_cpu(struct task_struct *p
  
        if (task_cpu(p) != new_cpu) {
                if (p->sched_class->migrate_task_rq)
-                       p->sched_class->migrate_task_rq(p, new_cpu);
+                       p->sched_class->migrate_task_rq(p);
                p->se.nr_migrations++;
                perf_event_task_migrate(p);
        }
@@@ -1333,12 -1335,16 +1335,16 @@@ static int migrate_swap_stop(void *data
        struct rq *src_rq, *dst_rq;
        int ret = -EAGAIN;
  
+       if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
+               return -EAGAIN;
        src_rq = cpu_rq(arg->src_cpu);
        dst_rq = cpu_rq(arg->dst_cpu);
  
        double_raw_lock(&arg->src_task->pi_lock,
                        &arg->dst_task->pi_lock);
        double_rq_lock(src_rq, dst_rq);
        if (task_cpu(arg->dst_task) != arg->dst_cpu)
                goto unlock;
  
@@@ -1574,13 -1580,15 +1580,15 @@@ static int select_fallback_rq(int cpu, 
                        goto out;
                }
  
+               /* No more Mr. Nice Guy. */
                switch (state) {
                case cpuset:
-                       /* No more Mr. Nice Guy. */
-                       cpuset_cpus_allowed_fallback(p);
-                       state = possible;
-                       break;
+                       if (IS_ENABLED(CONFIG_CPUSETS)) {
+                               cpuset_cpus_allowed_fallback(p);
+                               state = possible;
+                               break;
+                       }
+                       /* fall-through */
                case possible:
                        do_set_cpus_allowed(p, cpu_possible_mask);
                        state = fail;
@@@ -1692,7 -1700,7 +1700,7 @@@ ttwu_stat(struct task_struct *p, int cp
  #endif /* CONFIG_SCHEDSTATS */
  }
  
- static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
  {
        activate_task(rq, p, en_flags);
        p->on_rq = TASK_ON_RQ_QUEUED;
@@@ -2114,23 -2122,17 +2122,17 @@@ static void __sched_fork(unsigned long 
  #endif /* CONFIG_NUMA_BALANCING */
  }
  
+ DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
  #ifdef CONFIG_NUMA_BALANCING
- #ifdef CONFIG_SCHED_DEBUG
  void set_numabalancing_state(bool enabled)
  {
        if (enabled)
-               sched_feat_set("NUMA");
+               static_branch_enable(&sched_numa_balancing);
        else
-               sched_feat_set("NO_NUMA");
+               static_branch_disable(&sched_numa_balancing);
  }
- #else
- __read_mostly bool numabalancing_enabled;
- void set_numabalancing_state(bool enabled)
- {
-       numabalancing_enabled = enabled;
- }
- #endif /* CONFIG_SCHED_DEBUG */
  
  #ifdef CONFIG_PROC_SYSCTL
  int sysctl_numa_balancing(struct ctl_table *table, int write,
  {
        struct ctl_table t;
        int err;
-       int state = numabalancing_enabled;
+       int state = static_branch_likely(&sched_numa_balancing);
  
        if (write && !capable(CAP_SYS_ADMIN))
                return -EPERM;
@@@ -2349,6 -2351,8 +2351,8 @@@ void wake_up_new_task(struct task_struc
        struct rq *rq;
  
        raw_spin_lock_irqsave(&p->pi_lock, flags);
+       /* Initialize new task's runnable average */
+       init_entity_runnable_average(&p->se);
  #ifdef CONFIG_SMP
        /*
         * Fork balancing, do it here and not earlier because:
        set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
  #endif
  
-       /* Initialize new task's runnable average */
-       init_entity_runnable_average(&p->se);
        rq = __task_rq_lock(p);
        activate_task(rq, p, 0);
        p->on_rq = TASK_ON_RQ_QUEUED;
        trace_sched_wakeup_new(p);
        check_preempt_curr(rq, p, WF_FORK);
  #ifdef CONFIG_SMP
 -      if (p->sched_class->task_woken)
 +      if (p->sched_class->task_woken) {
 +              /*
 +               * Nothing relies on rq->lock after this, so its fine to
 +               * drop it.
 +               */
 +              lockdep_unpin_lock(&rq->lock);
                p->sched_class->task_woken(rq, p);
 +              lockdep_pin_lock(&rq->lock);
 +      }
  #endif
        task_rq_unlock(rq, p, &flags);
  }
@@@ -2483,7 -2478,6 +2485,6 @@@ static inline voi
  prepare_task_switch(struct rq *rq, struct task_struct *prev,
                    struct task_struct *next)
  {
-       trace_sched_switch(prev, next);
        sched_info_switch(rq, prev, next);
        perf_event_task_sched_out(prev, next);
        fire_sched_out_preempt_notifiers(prev, next);
@@@ -2517,6 -2511,22 +2518,22 @@@ static struct rq *finish_task_switch(st
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
  
+       /*
+        * The previous task will have left us with a preempt_count of 2
+        * because it left us after:
+        *
+        *      schedule()
+        *        preempt_disable();                    // 1
+        *        __schedule()
+        *          raw_spin_lock_irq(&rq->lock)        // 2
+        *
+        * Also, see FORK_PREEMPT_COUNT.
+        */
+       if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
+                     "corrupted preempt_count: %s/%d/0x%x\n",
+                     current->comm, current->pid, preempt_count()))
+               preempt_count_set(FORK_PREEMPT_COUNT);
        rq->prev_mm = NULL;
  
        /*
@@@ -2601,8 -2611,15 +2618,15 @@@ asmlinkage __visible void schedule_tail
  {
        struct rq *rq;
  
-       /* finish_task_switch() drops rq->lock and enables preemtion */
-       preempt_disable();
+       /*
+        * New tasks start with FORK_PREEMPT_COUNT, see there and
+        * finish_task_switch() for details.
+        *
+        * finish_task_switch() will drop rq->lock() and lower preempt_count
+        * and the preempt_enable() will end up enabling preemption (on
+        * PREEMPT_COUNT kernels).
+        */
        rq = finish_task_switch(prev);
        balance_callback(rq);
        preempt_enable();
@@@ -2960,15 -2977,13 +2984,13 @@@ static noinline void __schedule_bug(str
  static inline void schedule_debug(struct task_struct *prev)
  {
  #ifdef CONFIG_SCHED_STACK_END_CHECK
-       BUG_ON(unlikely(task_stack_end_corrupted(prev)));
+       BUG_ON(task_stack_end_corrupted(prev));
  #endif
-       /*
-        * Test if we are atomic. Since do_exit() needs to call into
-        * schedule() atomically, we ignore that path. Otherwise whine
-        * if we are scheduling when we should not.
-        */
-       if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
+       if (unlikely(in_atomic_preempt_off())) {
                __schedule_bug(prev);
+               preempt_count_set(PREEMPT_DISABLED);
+       }
        rcu_sleep_check();
  
        profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@@ -3054,7 -3069,7 +3076,7 @@@ again
   *
   * WARNING: must be called with preemption disabled!
   */
- static void __sched __schedule(void)
+ static void __sched notrace __schedule(bool preempt)
  {
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        rcu_note_context_switch();
        prev = rq->curr;
  
+       /*
+        * do_exit() calls schedule() with preemption disabled as an exception;
+        * however we must fix that up, otherwise the next task will see an
+        * inconsistent (higher) preempt count.
+        *
+        * It also avoids the below schedule_debug() test from complaining
+        * about this.
+        */
+       if (unlikely(prev->state == TASK_DEAD))
+               preempt_enable_no_resched_notrace();
        schedule_debug(prev);
  
        if (sched_feat(HRTICK))
        rq->clock_skip_update <<= 1; /* promote REQ to ACT */
  
        switch_count = &prev->nivcsw;
-       if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+       if (!preempt && prev->state) {
                if (unlikely(signal_pending_state(prev->state, prev))) {
                        prev->state = TASK_RUNNING;
                } else {
                rq->curr = next;
                ++*switch_count;
  
+               trace_sched_switch(preempt, prev, next);
                rq = context_switch(rq, prev, next); /* unlocks the rq */
                cpu = cpu_of(rq);
        } else {
@@@ -3148,7 -3175,7 +3182,7 @@@ asmlinkage __visible void __sched sched
        sched_submit_work(tsk);
        do {
                preempt_disable();
-               __schedule();
+               __schedule(false);
                sched_preempt_enable_no_resched();
        } while (need_resched());
  }
@@@ -3188,9 -3215,9 +3222,9 @@@ void __sched schedule_preempt_disabled(
  static void __sched notrace preempt_schedule_common(void)
  {
        do {
-               preempt_active_enter();
-               __schedule();
-               preempt_active_exit();
+               preempt_disable_notrace();
+               __schedule(true);
+               preempt_enable_no_resched_notrace();
  
                /*
                 * Check again in case we missed a preemption opportunity
@@@ -3241,24 -3268,17 +3275,17 @@@ asmlinkage __visible void __sched notra
                return;
  
        do {
-               /*
-                * Use raw __prempt_count() ops that don't call function.
-                * We can't call functions before disabling preemption which
-                * disarm preemption tracing recursions.
-                */
-               __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
-               barrier();
+               preempt_disable_notrace();
                /*
                 * Needs preempt disabled in case user_exit() is traced
                 * and the tracer calls preempt_enable_notrace() causing
                 * an infinite recursion.
                 */
                prev_ctx = exception_enter();
-               __schedule();
+               __schedule(true);
                exception_exit(prev_ctx);
  
-               barrier();
-               __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+               preempt_enable_no_resched_notrace();
        } while (need_resched());
  }
  EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
@@@ -3281,11 -3301,11 +3308,11 @@@ asmlinkage __visible void __sched preem
        prev_state = exception_enter();
  
        do {
-               preempt_active_enter();
+               preempt_disable();
                local_irq_enable();
-               __schedule();
+               __schedule(true);
                local_irq_disable();
-               preempt_active_exit();
+               sched_preempt_enable_no_resched();
        } while (need_resched());
  
        exception_exit(prev_state);
@@@ -3313,7 -3333,7 +3340,7 @@@ EXPORT_SYMBOL(default_wake_function)
   */
  void rt_mutex_setprio(struct task_struct *p, int prio)
  {
-       int oldprio, queued, running, enqueue_flag = 0;
+       int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
        struct rq *rq;
        const struct sched_class *prev_class;
  
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);
  
                if (!dl_prio(p->normal_prio) ||
                    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                        p->dl.dl_boosted = 1;
-                       enqueue_flag = ENQUEUE_REPLENISH;
+                       enqueue_flag |= ENQUEUE_REPLENISH;
                } else
                        p->dl.dl_boosted = 0;
                p->sched_class = &dl_sched_class;
                if (dl_prio(oldprio))
                        p->dl.dl_boosted = 0;
                if (oldprio < prio)
-                       enqueue_flag = ENQUEUE_HEAD;
+                       enqueue_flag |= ENQUEUE_HEAD;
                p->sched_class = &rt_sched_class;
        } else {
                if (dl_prio(oldprio))
@@@ -3423,7 -3443,7 +3450,7 @@@ void set_user_nice(struct task_struct *
        }
        queued = task_on_rq_queued(p);
        if (queued)
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
  
        p->static_prio = NICE_TO_PRIO(nice);
        set_load_weight(p);
        delta = p->prio - old_prio;
  
        if (queued) {
-               enqueue_task(rq, p, 0);
+               enqueue_task(rq, p, ENQUEUE_RESTORE);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@@ -3753,10 -3773,7 +3780,7 @@@ recheck
        } else {
                reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
  
-               if (policy != SCHED_DEADLINE &&
-                               policy != SCHED_FIFO && policy != SCHED_RR &&
-                               policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-                               policy != SCHED_IDLE)
+               if (!valid_policy(policy))
                        return -EINVAL;
        }
  
                 * Treat SCHED_IDLE as nice 20. Only allow a switch to
                 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
                 */
-               if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+               if (idle_policy(p->policy) && !idle_policy(policy)) {
                        if (!can_nice(p, task_nice(p)))
                                return -EPERM;
                }
@@@ -3937,7 -3954,7 +3961,7 @@@ change
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);
  
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued) {
+               int enqueue_flags = ENQUEUE_RESTORE;
                /*
                 * We enqueue to tail when the priority of a task is
                 * increased (user space view).
                 */
-               enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+               if (oldprio <= p->prio)
+                       enqueue_flags |= ENQUEUE_HEAD;
+               enqueue_task(rq, p, enqueue_flags);
        }
  
        check_class_changed(rq, p, prev_class, oldprio);
@@@ -4029,7 -4050,6 +4057,7 @@@ int sched_setscheduler_nocheck(struct t
  {
        return _sched_setscheduler(p, policy, param, false);
  }
 +EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
  
  static int
  do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
@@@ -5101,7 -5121,7 +5129,7 @@@ void sched_setnuma(struct task_struct *
        running = task_current(rq, p);
  
        if (queued)
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);
  
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued)
-               enqueue_task(rq, p, 0);
+               enqueue_task(rq, p, ENQUEUE_RESTORE);
        task_rq_unlock(rq, p, &flags);
  }
  #endif /* CONFIG_NUMA_BALANCING */
@@@ -5531,21 -5551,27 +5559,27 @@@ static void set_cpu_rq_start_time(void
  static int sched_cpu_active(struct notifier_block *nfb,
                                      unsigned long action, void *hcpu)
  {
+       int cpu = (long)hcpu;
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_STARTING:
                set_cpu_rq_start_time();
                return NOTIFY_OK;
        case CPU_ONLINE:
                /*
                 * At this point a starting CPU has marked itself as online via
                 * set_cpu_online(). But it might not yet have marked itself
                 * as active, which is essential from here on.
-                *
-                * Thus, fall-through and help the starting CPU along.
                 */
+               set_cpu_active(cpu, true);
+               stop_machine_unpark(cpu);
+               return NOTIFY_OK;
        case CPU_DOWN_FAILED:
-               set_cpu_active((long)hcpu, true);
+               set_cpu_active(cpu, true);
                return NOTIFY_OK;
        default:
                return NOTIFY_DONE;
        }
@@@ -6477,7 -6503,8 +6511,8 @@@ static struct sched_domain_topology_lev
        { NULL, },
  };
  
- struct sched_domain_topology_level *sched_domain_topology = default_topology;
+ static struct sched_domain_topology_level *sched_domain_topology =
+       default_topology;
  
  #define for_each_sd_topology(tl)                      \
        for (tl = sched_domain_topology; tl->mask; tl++)
@@@ -7478,7 -7505,7 +7513,7 @@@ void __init sched_init(void
  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
  static inline int preempt_count_equals(int preempt_offset)
  {
-       int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+       int nested = preempt_count() + rcu_preempt_depth();
  
        return (nested == preempt_offset);
  }
@@@ -7725,7 -7752,7 +7760,7 @@@ void sched_move_task(struct task_struc
        queued = task_on_rq_queued(tsk);
  
        if (queued)
-               dequeue_task(rq, tsk, 0);
+               dequeue_task(rq, tsk, DEQUEUE_SAVE);
        if (unlikely(running))
                put_prev_task(rq, tsk);
  
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
        if (tsk->sched_class->task_move_group)
-               tsk->sched_class->task_move_group(tsk, queued);
+               tsk->sched_class->task_move_group(tsk);
        else
  #endif
                set_task_rq(tsk, task_cpu(tsk));
        if (unlikely(running))
                tsk->sched_class->set_curr_task(rq);
        if (queued)
-               enqueue_task(rq, tsk, 0);
+               enqueue_task(rq, tsk, ENQUEUE_RESTORE);
  
        task_rq_unlock(rq, tsk, &flags);
  }
@@@ -8213,14 -8240,6 +8248,6 @@@ static void cpu_cgroup_exit(struct cgro
                            struct cgroup_subsys_state *old_css,
                            struct task_struct *task)
  {
-       /*
-        * cgroup_exit() is called in the copy_process() failure path.
-        * Ignore this case since the task hasn't ran yet, this avoids
-        * trying to poke a half freed task state from generic code.
-        */
-       if (!(task->flags & PF_EXITING))
-               return;
        sched_move_task(task);
  }
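
Many hunks in kernel/sched/core.c above convert the temporary dequeue/enqueue pairs around attribute changes to DEQUEUE_SAVE/ENQUEUE_RESTORE, so sched_info::run_delay is not charged for them. Below is a compact, self-contained sketch of that pattern; the flag values, struct layout and helper names are illustrative, not copied from the kernel headers.

#include <stdio.h>

#define DEQUEUE_SAVE		0x01	/* made-up values for the sketch */
#define ENQUEUE_RESTORE		0x01

struct toy_task {
	int on_rq;
	unsigned long sched_info_events;	/* proxy for run_delay updates */
};

static void sched_info_dequeued(struct toy_task *p) { p->sched_info_events++; }
static void sched_info_queued(struct toy_task *p)   { p->sched_info_events++; }

static void dequeue_task(struct toy_task *p, int flags)
{
	if (!(flags & DEQUEUE_SAVE))	/* account real removals only */
		sched_info_dequeued(p);
	p->on_rq = 0;
}

static void enqueue_task(struct toy_task *p, int flags)
{
	if (!(flags & ENQUEUE_RESTORE))	/* account real insertions only */
		sched_info_queued(p);
	p->on_rq = 1;
}

int main(void)
{
	struct toy_task p = { .on_rq = 1, .sched_info_events = 0 };

	/* set_user_nice()/rt_mutex_setprio()-style attribute change: */
	dequeue_task(&p, DEQUEUE_SAVE);
	/* ... change priority, weight, cgroup, NUMA node ... */
	enqueue_task(&p, ENQUEUE_RESTORE);

	/* The save/restore cycle left the accounting untouched. */
	printf("sched_info events charged: %lu\n", p.sched_info_events);
	return 0;
}

Real wakeups and sleeps still pass flags without these bits, so genuine queueing delay keeps being recorded; only the internal off/on dance around an attribute change is exempted, which is the point of the sched_info::run_delay fix in this pull.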