sched: Replace normalize_task() with __sched_setscheduler()
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 123673291ffbb160734ed889b934d557611a1cf1..b610ef9e522f03b7d0ff4ccbce22aa8710e205d6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
-void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
-{
-       unsigned long delta;
-       ktime_t soft, hard, now;
-
-       for (;;) {
-               if (hrtimer_active(period_timer))
-                       break;
-
-               now = hrtimer_cb_get_time(period_timer);
-               hrtimer_forward(period_timer, now, period);
-
-               soft = hrtimer_get_softexpires(period_timer);
-               hard = hrtimer_get_expires(period_timer);
-               delta = ktime_to_ns(ktime_sub(hard, soft));
-               __hrtimer_start_range_ns(period_timer, soft, delta,
-                                        HRTIMER_MODE_ABS_PINNED, 0);
-       }
-}
-
 DEFINE_MUTEX(sched_domains_mutex);
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
@@ -355,12 +335,11 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 
 #ifdef CONFIG_SMP
 
-static int __hrtick_restart(struct rq *rq)
+static void __hrtick_restart(struct rq *rq)
 {
        struct hrtimer *timer = &rq->hrtick_timer;
-       ktime_t time = hrtimer_get_softexpires(timer);
 
-       return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
+       hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
 }
 
 /*
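
__hrtick_restart() now simply re-arms the timer on the expiry it already carries, instead of reading the soft expiry back out and open-coding __hrtimer_start_range_ns(). For context (not part of this patch), hrtimer_start_expires() from include/linux/hrtimer.h looks roughly like the sketch below; treat it as an approximation of that helper as of this kernel series:

/*
 * Sketch of hrtimer_start_expires(): re-arm a timer on the soft/hard
 * expiry range it already stores, so callers need not pass it back in.
 */
static inline void hrtimer_start_expires(struct hrtimer *timer,
					 enum hrtimer_mode mode)
{
	u64 delta;
	ktime_t soft, hard;

	soft = hrtimer_get_softexpires(timer);
	hard = hrtimer_get_expires(timer);
	delta = ktime_to_ns(ktime_sub(hard, soft));
	hrtimer_start_range_ns(timer, soft, delta, mode);
}
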
@@ -440,8 +419,8 @@ void hrtick_start(struct rq *rq, u64 delay)
         * doesn't make sense. Rely on vruntime for fairness.
         */
        delay = max_t(u64, delay, 10000LL);
-       __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
-                       HRTIMER_MODE_REL_PINNED, 0);
+       hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
+                     HRTIMER_MODE_REL_PINNED);
 }
 
 static inline void init_hrtick(void)
@@ -511,7 +490,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
 static bool set_nr_if_polling(struct task_struct *p)
 {
        struct thread_info *ti = task_thread_info(p);
-       typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+       typeof(ti->flags) old, val = READ_ONCE(ti->flags);
 
        for (;;) {
                if (!(val & _TIF_POLLING_NRFLAG))
@@ -541,6 +520,52 @@ static bool set_nr_if_polling(struct task_struct *p)
 #endif
 #endif
 
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+       struct wake_q_node *node = &task->wake_q;
+
+       /*
+        * Atomically grab the task, if ->wake_q is !nil already it means
+        * it's already queued (either by us or someone else) and will get the
+        * wakeup due to that.
+        *
+        * This cmpxchg() implies a full barrier, which pairs with the write
+        * barrier implied by the wakeup in wake_up_q().
+        */
+       if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
+               return;
+
+       get_task_struct(task);
+
+       /*
+        * The head is context local, there can be no concurrency.
+        */
+       *head->lastp = node;
+       head->lastp = &node->next;
+}
+
+void wake_up_q(struct wake_q_head *head)
+{
+       struct wake_q_node *node = head->first;
+
+       while (node != WAKE_Q_TAIL) {
+               struct task_struct *task;
+
+               task = container_of(node, struct task_struct, wake_q);
+               BUG_ON(!task);
+               /* task can safely be re-inserted now */
+               node = node->next;
+               task->wake_q.next = NULL;
+
+               /*
+                * wake_up_process() implies a wmb() to pair with the queueing
+                * in wake_q_add() so as not to miss wakeups.
+                */
+               wake_up_process(task);
+               put_task_struct(task);
+       }
+}
+
 /*
  * resched_curr - mark rq's current task 'to be rescheduled now'.
  *
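
The wake_q_add()/wake_up_q() pair added above implements lockless wakeup queueing: wakeups are collected on a caller-local list while a lock is held and are only issued after the lock has been dropped, so wake_up_process() never runs under the caller's spinlock. A minimal usage sketch follows; it assumes the WAKE_Q() on-stack initializer that accompanies these helpers in include/linux/sched.h, and the my_waiter/my_release_all names are made up for illustration:

struct my_waiter {
	struct list_head	list;
	struct task_struct	*task;
};

static void my_release_all(spinlock_t *lock, struct list_head *waiters)
{
	WAKE_Q(wake_q);			/* on-stack wake queue head */
	struct my_waiter *w, *tmp;

	spin_lock(lock);
	list_for_each_entry_safe(w, tmp, waiters, list) {
		list_del(&w->list);
		wake_q_add(&wake_q, w->task);	/* takes a task reference */
	}
	spin_unlock(lock);

	/* wakeups happen here, lock-free; the task references are dropped */
	wake_up_q(&wake_q);
}
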
@@ -2252,23 +2277,35 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 #ifdef CONFIG_SMP
 
 /* rq->lock is NOT held, but preemption is disabled */
-static inline void post_schedule(struct rq *rq)
+static void __balance_callback(struct rq *rq)
 {
-       if (rq->post_schedule) {
-               unsigned long flags;
+       struct callback_head *head, *next;
+       void (*func)(struct rq *rq);
+       unsigned long flags;
 
-               raw_spin_lock_irqsave(&rq->lock, flags);
-               if (rq->curr->sched_class->post_schedule)
-                       rq->curr->sched_class->post_schedule(rq);
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
+       raw_spin_lock_irqsave(&rq->lock, flags);
+       head = rq->balance_callback;
+       rq->balance_callback = NULL;
+       while (head) {
+               func = (void (*)(struct rq *))head->func;
+               next = head->next;
+               head->next = NULL;
+               head = next;
 
-               rq->post_schedule = 0;
+               func(rq);
        }
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static inline void balance_callback(struct rq *rq)
+{
+       if (unlikely(rq->balance_callback))
+               __balance_callback(rq);
 }
 
 #else
 
-static inline void post_schedule(struct rq *rq)
+static inline void balance_callback(struct rq *rq)
 {
 }
 
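
The rq->post_schedule flag is replaced by rq->balance_callback, a singly-linked list of callback_head entries that __balance_callback() above drains with rq->lock held. Scheduling classes queue work onto that list through a small helper; the sketch below approximates the queue_balance_callback() helper that the same series adds to kernel/sched/sched.h (shown for context, not part of this hunk):

static inline void
queue_balance_callback(struct rq *rq,
		       struct callback_head *head,
		       void (*func)(struct rq *rq))
{
	lockdep_assert_held(&rq->lock);

	/* already queued on this rq, nothing to do */
	if (unlikely(head->next))
		return;

	head->func = (void (*)(struct callback_head *))func;
	head->next = rq->balance_callback;
	rq->balance_callback = head;
}

Because the queued callbacks only run from balance_callback() once rq->lock has been retaken, a class such as RT or deadline can queue its push/pull work without dropping the lock itself.
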
@@ -2286,7 +2323,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
        /* finish_task_switch() drops rq->lock and enables preemption */
        preempt_disable();
        rq = finish_task_switch(prev);
-       post_schedule(rq);
+       balance_callback(rq);
        preempt_enable();
 
        if (current->set_child_tid)
@@ -2397,9 +2434,9 @@ unsigned long nr_iowait_cpu(int cpu)
 
 void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
 {
-       struct rq *this = this_rq();
-       *nr_waiters = atomic_read(&this->nr_iowait);
-       *load = this->cpu_load[0];
+       struct rq *rq = this_rq();
+       *nr_waiters = atomic_read(&rq->nr_iowait);
+       *load = rq->load.weight;
 }
 
 #ifdef CONFIG_SMP
@@ -2497,6 +2534,7 @@ void scheduler_tick(void)
        update_rq_clock(rq);
        curr->sched_class->task_tick(rq, curr, 0);
        update_cpu_load_active(rq);
+       calc_global_load_tick(rq);
        raw_spin_unlock(&rq->lock);
 
        perf_event_task_tick();
@@ -2525,7 +2563,7 @@ void scheduler_tick(void)
 u64 scheduler_tick_max_deferment(void)
 {
        struct rq *rq = this_rq();
-       unsigned long next, now = ACCESS_ONCE(jiffies);
+       unsigned long next, now = READ_ONCE(jiffies);
 
        next = rq->last_sched_tick + HZ;
 
@@ -2726,9 +2764,7 @@ again:
  *          - return from syscall or exception to user-space
  *          - return from interrupt-handler to user-space
  *
- * WARNING: all callers must re-check need_resched() afterward and reschedule
- * accordingly in case an event triggered the need for rescheduling (such as
- * an interrupt waking up a task) while preemption was disabled in __schedule().
+ * WARNING: must be called with preemption disabled!
  */
 static void __sched __schedule(void)
 {
@@ -2737,7 +2773,6 @@ static void __sched __schedule(void)
        struct rq *rq;
        int cpu;
 
-       preempt_disable();
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        rcu_note_context_switch();
@@ -2800,9 +2835,7 @@ static void __sched __schedule(void)
        } else
                raw_spin_unlock_irq(&rq->lock);
 
-       post_schedule(rq);
-
-       sched_preempt_enable_no_resched();
+       balance_callback(rq);
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
@@ -2823,7 +2856,9 @@ asmlinkage __visible void __sched schedule(void)
 
        sched_submit_work(tsk);
        do {
+               preempt_disable();
                __schedule();
+               sched_preempt_enable_no_resched();
        } while (need_resched());
 }
 EXPORT_SYMBOL(schedule);
@@ -2862,15 +2897,14 @@ void __sched schedule_preempt_disabled(void)
 static void __sched notrace preempt_schedule_common(void)
 {
        do {
-               __preempt_count_add(PREEMPT_ACTIVE);
+               preempt_active_enter();
                __schedule();
-               __preempt_count_sub(PREEMPT_ACTIVE);
+               preempt_active_exit();
 
                /*
                 * Check again in case we missed a preemption opportunity
                 * between schedule and now.
                 */
-               barrier();
        } while (need_resched());
 }
 
@@ -2894,9 +2928,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
 NOKPROBE_SYMBOL(preempt_schedule);
 EXPORT_SYMBOL(preempt_schedule);
 
-#ifdef CONFIG_CONTEXT_TRACKING
 /**
- * preempt_schedule_context - preempt_schedule called by tracing
+ * preempt_schedule_notrace - preempt_schedule called by tracing
  *
  * The tracing infrastructure uses preempt_enable_notrace to prevent
  * recursion and tracing preempt enabling caused by the tracing
@@ -2909,7 +2942,7 @@ EXPORT_SYMBOL(preempt_schedule);
  * instead of preempt_schedule() to exit user context if needed before
  * calling the scheduler.
  */
-asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 {
        enum ctx_state prev_ctx;
 
@@ -2917,7 +2950,13 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
                return;
 
        do {
-               __preempt_count_add(PREEMPT_ACTIVE);
+               /*
+                * Use raw __preempt_count() ops that don't call functions.
+                * We can't call functions before preemption is disabled,
+                * since disabled preemption is what prevents preemption
+                * tracing from recursing into this path.
+                */
+               __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+               barrier();
                /*
                 * Needs preempt disabled in case user_exit() is traced
                 * and the tracer calls preempt_enable_notrace() causing
@@ -2927,12 +2966,11 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
                __schedule();
                exception_exit(prev_ctx);
 
-               __preempt_count_sub(PREEMPT_ACTIVE);
                barrier();
+               __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
        } while (need_resched());
 }
-EXPORT_SYMBOL_GPL(preempt_schedule_context);
-#endif /* CONFIG_CONTEXT_TRACKING */
+EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
 
 #endif /* CONFIG_PREEMPT */
 
@@ -2952,17 +2990,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
        prev_state = exception_enter();
 
        do {
-               __preempt_count_add(PREEMPT_ACTIVE);
+               preempt_active_enter();
                local_irq_enable();
                __schedule();
                local_irq_disable();
-               __preempt_count_sub(PREEMPT_ACTIVE);
-
-               /*
-                * Check again in case we missed a preemption opportunity
-                * between schedule and now.
-                */
-               barrier();
+               preempt_active_exit();
        } while (need_resched());
 
        exception_exit(prev_state);
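
The open-coded PREEMPT_ACTIVE add/sub pairs around __schedule() are folded into preempt_active_enter()/preempt_active_exit(). As a rough sketch of the definitions this series adds to include/linux/preempt.h (an assumption about that header, not shown in this diff), they expand to:

/* Enter PREEMPT_ACTIVE and disable preemption in a single count update. */
#define preempt_active_enter() \
do { \
	preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
	barrier(); \
} while (0)

/* Leave PREEMPT_ACTIVE; the barrier() replaces the one that was open-coded. */
#define preempt_active_exit() \
do { \
	barrier(); \
	preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
} while (0)

Adding PREEMPT_DISABLE_OFFSET keeps preemption counted as disabled across __schedule(), and the barrier()s order the count update against the surrounding code, which is why the explicit barrier() after the old __preempt_count_sub() could be dropped.
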
@@ -3406,7 +3438,7 @@ static bool dl_param_changed(struct task_struct *p,
 
 static int __sched_setscheduler(struct task_struct *p,
                                const struct sched_attr *attr,
-                               bool user)
+                               bool user, bool pi)
 {
        int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
                      MAX_RT_PRIO - 1 - attr->sched_priority;
@@ -3592,18 +3624,20 @@ change:
        p->sched_reset_on_fork = reset_on_fork;
        oldprio = p->prio;
 
-       /*
-        * Take priority boosted tasks into account. If the new
-        * effective priority is unchanged, we just store the new
-        * normal parameters and do not touch the scheduler class and
-        * the runqueue. This will be done when the task deboost
-        * itself.
-        */
-       new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
-       if (new_effective_prio == oldprio) {
-               __setscheduler_params(p, attr);
-               task_rq_unlock(rq, p, &flags);
-               return 0;
+       if (pi) {
+               /*
+                * Take priority boosted tasks into account. If the new
+                * effective priority is unchanged, we just store the new
+                * normal parameters and do not touch the scheduler class and
+                * the runqueue. This will be done when the task deboosts
+                * itself.
+                */
+               new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+               if (new_effective_prio == oldprio) {
+                       __setscheduler_params(p, attr);
+                       task_rq_unlock(rq, p, &flags);
+                       return 0;
+               }
        }
 
        queued = task_on_rq_queued(p);
@@ -3614,7 +3648,7 @@ change:
                put_prev_task(rq, p);
 
        prev_class = p->sched_class;
-       __setscheduler(rq, p, attr, true);
+       __setscheduler(rq, p, attr, pi);
 
        if (running)
                p->sched_class->set_curr_task(rq);
@@ -3629,7 +3663,8 @@ change:
        check_class_changed(rq, p, prev_class, oldprio);
        task_rq_unlock(rq, p, &flags);
 
-       rt_mutex_adjust_pi(p);
+       if (pi)
+               rt_mutex_adjust_pi(p);
 
        return 0;
 }
@@ -3650,7 +3685,7 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
                attr.sched_policy = policy;
        }
 
-       return __sched_setscheduler(p, &attr, check);
+       return __sched_setscheduler(p, &attr, check, true);
 }
 /**
  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
@@ -3671,7 +3706,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
 
 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
 {
-       return __sched_setscheduler(p, attr, true);
+       return __sched_setscheduler(p, attr, true, true);
 }
 EXPORT_SYMBOL_GPL(sched_setattr);
 
@@ -5314,7 +5349,7 @@ static struct notifier_block migration_notifier = {
        .priority = CPU_PRI_MIGRATION,
 };
 
-static void __cpuinit set_cpu_rq_start_time(void)
+static void set_cpu_rq_start_time(void)
 {
        int cpu = smp_processor_id();
        struct rq *rq = cpu_rq(cpu);
@@ -7199,7 +7234,7 @@ void __init sched_init(void)
                rq->sd = NULL;
                rq->rd = NULL;
                rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
-               rq->post_schedule = 0;
+               rq->balance_callback = NULL;
                rq->active_balance = 0;
                rq->next_balance = jiffies;
                rq->push_cpu = 0;
@@ -7329,32 +7364,12 @@ EXPORT_SYMBOL(___might_sleep);
 #endif
 
 #ifdef CONFIG_MAGIC_SYSRQ
-static void normalize_task(struct rq *rq, struct task_struct *p)
+void normalize_rt_tasks(void)
 {
-       const struct sched_class *prev_class = p->sched_class;
+       struct task_struct *g, *p;
        struct sched_attr attr = {
                .sched_policy = SCHED_NORMAL,
        };
-       int old_prio = p->prio;
-       int queued;
-
-       queued = task_on_rq_queued(p);
-       if (queued)
-               dequeue_task(rq, p, 0);
-       __setscheduler(rq, p, &attr, false);
-       if (queued) {
-               enqueue_task(rq, p, 0);
-               resched_curr(rq);
-       }
-
-       check_class_changed(rq, p, prev_class, old_prio);
-}
-
-void normalize_rt_tasks(void)
-{
-       struct task_struct *g, *p;
-       unsigned long flags;
-       struct rq *rq;
 
        read_lock(&tasklist_lock);
        for_each_process_thread(g, p) {
@@ -7381,9 +7396,7 @@ void normalize_rt_tasks(void)
                        continue;
                }
 
-               rq = task_rq_lock(p, &flags);
-               normalize_task(rq, p);
-               task_rq_unlock(rq, p, &flags);
+               __sched_setscheduler(p, &attr, false, false);
        }
        read_unlock(&tasklist_lock);
 }
@@ -7734,11 +7747,11 @@ static long sched_group_rt_runtime(struct task_group *tg)
        return rt_runtime_us;
 }
 
-static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
 {
        u64 rt_runtime, rt_period;
 
-       rt_period = (u64)rt_period_us * NSEC_PER_USEC;
+       rt_period = rt_period_us * NSEC_PER_USEC;
        rt_runtime = tg->rt_bandwidth.rt_runtime;
 
        return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
@@ -8105,10 +8118,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 
        __refill_cfs_bandwidth_runtime(cfs_b);
        /* restart the period timer (if active) to handle new period expiry */
-       if (runtime_enabled && cfs_b->timer_active) {
-               /* force a reprogram */
-               __start_cfs_bandwidth(cfs_b, true);
-       }
+       if (runtime_enabled)
+               start_cfs_bandwidth(cfs_b);
        raw_spin_unlock_irq(&cfs_b->lock);
 
        for_each_online_cpu(i) {