From: Linus Torvalds Date: Wed, 4 Nov 2015 02:03:50 +0000 (-0800) Subject: Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel... X-Git-Tag: v4.4-rc1~153 X-Git-Url: https://git.kernelconcepts.de/?p=karo-tx-linux.git;a=commitdiff_plain;h=53528695ff6d8b77011bc818407c13e30914a946;hp=b831ef2cad979912850e34f82415c0c5d59de8cb Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Pull scheduler changes from Ingo Molnar: "The main changes in this cycle were: - sched/fair load tracking fixes and cleanups (Byungchul Park) - Make load tracking frequency scale invariant (Dietmar Eggemann) - sched/deadline updates (Juri Lelli) - stop machine fixes, cleanups and enhancements for bugs triggered by CPU hotplug stress testing (Oleg Nesterov) - scheduler preemption code rework: remove PREEMPT_ACTIVE and related cleanups (Peter Zijlstra) - Rework the sched_info::run_delay code to fix races (Peter Zijlstra) - Optimize per entity utilization tracking (Peter Zijlstra) - ... misc other fixes, cleanups and smaller updates" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (57 commits) sched: Don't scan all-offline ->cpus_allowed twice if !CONFIG_CPUSETS sched: Move cpu_active() tests from stop_two_cpus() into migrate_swap_stop() sched: Start stopper early stop_machine: Kill cpu_stop_threads->setup() and cpu_stop_unpark() stop_machine: Kill smp_hotplug_thread->pre_unpark, introduce stop_machine_unpark() stop_machine: Change cpu_stop_queue_two_works() to rely on stopper->enabled stop_machine: Introduce __cpu_stop_queue_work() and cpu_stop_queue_two_works() stop_machine: Ensure that a queued callback will be called before cpu_stop_park() sched/x86: Fix typo in __switch_to() comments sched/core: Remove a parameter in the migrate_task_rq() function sched/core: Drop unlikely behind BUG_ON() sched/core: Fix task and run queue sched_info::run_delay inconsistencies sched/numa: Fix task_tick_fair() from disabling numa_balancing sched/core: Add preempt_count invariant check sched/core: More notrace annotations sched/core: Kill PREEMPT_ACTIVE sched/core, sched/x86: Kill thread_info::saved_preempt_count sched/core: Simplify preempt_count tests sched/core: Robustify preemption leak checks sched/core: Stop setting PREEMPT_ACTIVE ... --- diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index b12f81022a6b..01bcde84d3e4 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -30,12 +30,9 @@ static __always_inline void preempt_count_set(int pc) /* * must be macros to avoid header recursion hell */ -#define init_task_preempt_count(p) do { \ - task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \ -} while (0) +#define init_task_preempt_count(p) do { } while (0) #define init_idle_preempt_count(p, cpu) do { \ - task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \ per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \ } while (0) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 8afdc3e44247..809877e9030b 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -57,7 +57,6 @@ struct thread_info { __u32 flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ - int saved_preempt_count; mm_segment_t addr_limit; void __user *sysenter_return; unsigned int sig_on_uaccess_error:1; @@ -69,7 +68,6 @@ struct thread_info { .task = &tsk, \ .flags = 0, \ .cpu = 0, \ - .saved_preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 737527b40e5b..9f950917528b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -279,14 +279,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl)) set_iopl_mask(next->iopl); - /* - * If it were not for PREEMPT_ACTIVE we could guarantee that the - * preempt_count of all tasks was equal here and this would not be - * needed. - */ - task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); - this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); - /* * Now maybe handle debug registers and/or IO bitmaps */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b35921a670b2..e835d263a33b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -332,7 +332,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch FS and GS. * - * These are even more complicated than FS and GS: they have + * These are even more complicated than DS and ES: they have * 64-bit bases are that controlled by arch_prctl. Those bases * only differ from the values in the GDT or LDT if the selector * is 0. @@ -401,14 +401,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ this_cpu_write(current_task, next_p); - /* - * If it were not for PREEMPT_ACTIVE we could guarantee that the - * preempt_count of all tasks was equal here and this would not be - * needed. - */ - task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); - this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); - /* Reload esp0 and ss1. This changes current_thread_info(). */ load_sp0(tss, next); diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index 0bec580a4885..5d8ffa3e6f8c 100644 --- a/include/asm-generic/preempt.h +++ b/include/asm-generic/preempt.h @@ -24,7 +24,7 @@ static __always_inline void preempt_count_set(int pc) * must be macros to avoid header recursion hell */ #define init_task_preempt_count(p) do { \ - task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \ + task_thread_info(p)->preempt_count = FORK_PREEMPT_COUNT; \ } while (0) #define init_idle_preempt_count(p, cpu) do { \ diff --git a/include/linux/preempt.h b/include/linux/preempt.h index bea8dd8ff5e0..75e4e30677f1 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -26,7 +26,6 @@ * SOFTIRQ_MASK: 0x0000ff00 * HARDIRQ_MASK: 0x000f0000 * NMI_MASK: 0x00100000 - * PREEMPT_ACTIVE: 0x00200000 * PREEMPT_NEED_RESCHED: 0x80000000 */ #define PREEMPT_BITS 8 @@ -53,10 +52,6 @@ #define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) -#define PREEMPT_ACTIVE_BITS 1 -#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) -#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT) - /* We use the MSB mostly because its available */ #define PREEMPT_NEED_RESCHED 0x80000000 @@ -126,8 +121,7 @@ * Check whether we were atomic before we did preempt_disable(): * (used by the scheduler) */ -#define in_atomic_preempt_off() \ - ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET) +#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET) #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void preempt_count_add(int val); @@ -146,18 +140,6 @@ extern void preempt_count_sub(int val); #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) -#define preempt_active_enter() \ -do { \ - preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \ - barrier(); \ -} while (0) - -#define preempt_active_exit() \ -do { \ - barrier(); \ - preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \ -} while (0) - #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 56667292d1e4..9e1e06c3ce05 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -599,20 +599,26 @@ struct task_cputime_atomic { .sum_exec_runtime = ATOMIC64_INIT(0), \ } -#ifdef CONFIG_PREEMPT_COUNT -#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED) -#else -#define PREEMPT_DISABLED PREEMPT_ENABLED -#endif +#define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) + +/* + * Disable preemption until the scheduler is running -- use an unconditional + * value so that it also works on !PREEMPT_COUNT kernels. + * + * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count(). + */ +#define INIT_PREEMPT_COUNT PREEMPT_OFFSET /* - * Disable preemption until the scheduler is running. - * Reset by start_kernel()->sched_init()->init_idle(). + * Initial preempt_count value; reflects the preempt_count schedule invariant + * which states that during context switches: * - * We include PREEMPT_ACTIVE to avoid cond_resched() from working - * before the scheduler is active -- see should_resched(). + * preempt_count() == 2*PREEMPT_DISABLE_OFFSET + * + * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels. + * Note: See finish_task_switch(). */ -#define INIT_PREEMPT_COUNT (PREEMPT_DISABLED + PREEMPT_ACTIVE) +#define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) /** * struct thread_group_cputimer - thread group interval timer counts @@ -1142,8 +1148,6 @@ struct sched_domain_topology_level { #endif }; -extern struct sched_domain_topology_level *sched_domain_topology; - extern void set_sched_topology(struct sched_domain_topology_level *tl); extern void wake_up_if_idle(int cpu); @@ -1192,10 +1196,10 @@ struct load_weight { /* * The load_avg/util_avg accumulates an infinite geometric series. - * 1) load_avg factors the amount of time that a sched_entity is - * runnable on a rq into its weight. For cfs_rq, it is the aggregated - * such weights of all runnable and blocked sched_entities. - * 2) util_avg factors frequency scaling into the amount of time + * 1) load_avg factors frequency scaling into the amount of time that a + * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the + * aggregated such weights of all runnable and blocked sched_entities. + * 2) util_avg factors frequency and cpu scaling into the amount of time * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE]. * For cfs_rq, it is the aggregated such times of all runnable and * blocked sched_entities. diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 9d303b8847df..9089a2ae913d 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -21,4 +21,9 @@ static inline int dl_task(struct task_struct *p) return dl_prio(p->prio); } +static inline bool dl_time_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + #endif /* _SCHED_DEADLINE_H */ diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h index e6109a6cd8f6..12910cf19869 100644 --- a/include/linux/smpboot.h +++ b/include/linux/smpboot.h @@ -24,9 +24,6 @@ struct smpboot_thread_data; * parked (cpu offline) * @unpark: Optional unpark function, called when the thread is * unparked (cpu online) - * @pre_unpark: Optional unpark function, called before the thread is - * unparked (cpu online). This is not guaranteed to be - * called on the target cpu of the thread. Careful! * @cpumask: Internal state. To update which threads are unparked, * call smpboot_update_cpumask_percpu_thread(). * @selfparking: Thread is not parked by the park function. @@ -42,7 +39,6 @@ struct smp_hotplug_thread { void (*cleanup)(unsigned int cpu, bool online); void (*park)(unsigned int cpu); void (*unpark)(unsigned int cpu); - void (*pre_unpark)(unsigned int cpu); cpumask_var_t cpumask; bool selfparking; const char *thread_comm; diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 414d924318ce..0adedca24c5b 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -33,6 +33,8 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf); int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); +void stop_machine_park(int cpu); +void stop_machine_unpark(int cpu); #else /* CONFIG_SMP */ diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 539d6bc3216a..9b90c57517a9 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -104,22 +104,17 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new, TP_ARGS(p)); #ifdef CREATE_TRACE_POINTS -static inline long __trace_sched_switch_state(struct task_struct *p) +static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p) { - long state = p->state; - -#ifdef CONFIG_PREEMPT #ifdef CONFIG_SCHED_DEBUG BUG_ON(p != current); #endif /* CONFIG_SCHED_DEBUG */ + /* - * For all intents and purposes a preempted task is a running task. + * Preemption ignores task state, therefore preempted tasks are always + * RUNNING (we will not have dequeued if state != RUNNING). */ - if (preempt_count() & PREEMPT_ACTIVE) - state = TASK_RUNNING | TASK_STATE_MAX; -#endif /* CONFIG_PREEMPT */ - - return state; + return preempt ? TASK_RUNNING | TASK_STATE_MAX : p->state; } #endif /* CREATE_TRACE_POINTS */ @@ -128,10 +123,11 @@ static inline long __trace_sched_switch_state(struct task_struct *p) */ TRACE_EVENT(sched_switch, - TP_PROTO(struct task_struct *prev, + TP_PROTO(bool preempt, + struct task_struct *prev, struct task_struct *next), - TP_ARGS(prev, next), + TP_ARGS(preempt, prev, next), TP_STRUCT__entry( __array( char, prev_comm, TASK_COMM_LEN ) @@ -147,7 +143,7 @@ TRACE_EVENT(sched_switch, memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); __entry->prev_pid = prev->pid; __entry->prev_prio = prev->prio; - __entry->prev_state = __trace_sched_switch_state(prev); + __entry->prev_state = __trace_sched_switch_state(preempt, prev); memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); __entry->next_pid = next->pid; __entry->next_prio = next->prio; diff --git a/kernel/cpu.c b/kernel/cpu.c index 14a9cdf8abe9..85ff5e26e23b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -291,8 +291,8 @@ static inline void check_for_tasks(int dead_cpu) { struct task_struct *g, *p; - read_lock_irq(&tasklist_lock); - do_each_thread(g, p) { + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { if (!p->on_rq) continue; /* @@ -307,8 +307,8 @@ static inline void check_for_tasks(int dead_cpu) pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); - } while_each_thread(g, p); - read_unlock_irq(&tasklist_lock); + } + read_unlock(&tasklist_lock); } struct take_cpu_down_param { @@ -331,7 +331,7 @@ static int take_cpu_down(void *_param) /* Give up timekeeping duties */ tick_handover_do_timer(); /* Park the stopper thread */ - kthread_park(current); + stop_machine_park((long)param->hcpu); return 0; } diff --git a/kernel/exit.c b/kernel/exit.c index 0e93b63bbc59..07110c6020a0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -706,10 +706,12 @@ void do_exit(long code) smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); - if (unlikely(in_atomic())) + if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), preempt_count()); + preempt_count_set(PREEMPT_ENABLED); + } /* sync mm's RSS info before statistics gathering */ if (tsk->mm) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index bbb72b4f64a1..8251e75dd9c0 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -170,7 +170,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, * then right waiter has a dl_prio() too. */ if (dl_prio(left->prio)) - return (left->task->dl.deadline < right->task->dl.deadline); + return dl_time_before(left->task->dl.deadline, + right->task->dl.deadline); return 0; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f7402f7eb448..aa5973220ad2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -817,7 +817,7 @@ static void set_load_weight(struct task_struct *p) /* * SCHED_IDLE tasks get minimal weight: */ - if (p->policy == SCHED_IDLE) { + if (idle_policy(p->policy)) { load->weight = scale_load(WEIGHT_IDLEPRIO); load->inv_weight = WMULT_IDLEPRIO; return; @@ -827,17 +827,19 @@ static void set_load_weight(struct task_struct *p) load->inv_weight = prio_to_wmult[prio]; } -static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) +static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_queued(rq, p); + if (!(flags & ENQUEUE_RESTORE)) + sched_info_queued(rq, p); p->sched_class->enqueue_task(rq, p, flags); } -static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) +static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_dequeued(rq, p); + if (!(flags & DEQUEUE_SAVE)) + sched_info_dequeued(rq, p); p->sched_class->dequeue_task(rq, p, flags); } @@ -1178,7 +1180,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) * holding rq->lock. */ lockdep_assert_held(&rq->lock); - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); } if (running) put_prev_task(rq, p); @@ -1188,7 +1190,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) if (running) p->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, p, 0); + enqueue_task(rq, p, ENQUEUE_RESTORE); } /* @@ -1292,7 +1294,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_cpu(p) != new_cpu) { if (p->sched_class->migrate_task_rq) - p->sched_class->migrate_task_rq(p, new_cpu); + p->sched_class->migrate_task_rq(p); p->se.nr_migrations++; perf_event_task_migrate(p); } @@ -1333,12 +1335,16 @@ static int migrate_swap_stop(void *data) struct rq *src_rq, *dst_rq; int ret = -EAGAIN; + if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu)) + return -EAGAIN; + src_rq = cpu_rq(arg->src_cpu); dst_rq = cpu_rq(arg->dst_cpu); double_raw_lock(&arg->src_task->pi_lock, &arg->dst_task->pi_lock); double_rq_lock(src_rq, dst_rq); + if (task_cpu(arg->dst_task) != arg->dst_cpu) goto unlock; @@ -1574,13 +1580,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p) goto out; } + /* No more Mr. Nice Guy. */ switch (state) { case cpuset: - /* No more Mr. Nice Guy. */ - cpuset_cpus_allowed_fallback(p); - state = possible; - break; - + if (IS_ENABLED(CONFIG_CPUSETS)) { + cpuset_cpus_allowed_fallback(p); + state = possible; + break; + } + /* fall-through */ case possible: do_set_cpus_allowed(p, cpu_possible_mask); state = fail; @@ -1692,7 +1700,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) #endif /* CONFIG_SCHEDSTATS */ } -static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) +static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) { activate_task(rq, p, en_flags); p->on_rq = TASK_ON_RQ_QUEUED; @@ -2114,23 +2122,17 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif /* CONFIG_NUMA_BALANCING */ } +DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); + #ifdef CONFIG_NUMA_BALANCING -#ifdef CONFIG_SCHED_DEBUG + void set_numabalancing_state(bool enabled) { if (enabled) - sched_feat_set("NUMA"); + static_branch_enable(&sched_numa_balancing); else - sched_feat_set("NO_NUMA"); + static_branch_disable(&sched_numa_balancing); } -#else -__read_mostly bool numabalancing_enabled; - -void set_numabalancing_state(bool enabled) -{ - numabalancing_enabled = enabled; -} -#endif /* CONFIG_SCHED_DEBUG */ #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, @@ -2138,7 +2140,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, { struct ctl_table t; int err; - int state = numabalancing_enabled; + int state = static_branch_likely(&sched_numa_balancing); if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -2349,6 +2351,8 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, flags); + /* Initialize new task's runnable average */ + init_entity_runnable_average(&p->se); #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: @@ -2358,8 +2362,6 @@ void wake_up_new_task(struct task_struct *p) set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif - /* Initialize new task's runnable average */ - init_entity_runnable_average(&p->se); rq = __task_rq_lock(p); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; @@ -2483,7 +2485,6 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - trace_sched_switch(prev, next); sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); @@ -2517,6 +2518,22 @@ static struct rq *finish_task_switch(struct task_struct *prev) struct mm_struct *mm = rq->prev_mm; long prev_state; + /* + * The previous task will have left us with a preempt_count of 2 + * because it left us after: + * + * schedule() + * preempt_disable(); // 1 + * __schedule() + * raw_spin_lock_irq(&rq->lock) // 2 + * + * Also, see FORK_PREEMPT_COUNT. + */ + if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, + "corrupted preempt_count: %s/%d/0x%x\n", + current->comm, current->pid, preempt_count())) + preempt_count_set(FORK_PREEMPT_COUNT); + rq->prev_mm = NULL; /* @@ -2601,8 +2618,15 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) { struct rq *rq; - /* finish_task_switch() drops rq->lock and enables preemtion */ - preempt_disable(); + /* + * New tasks start with FORK_PREEMPT_COUNT, see there and + * finish_task_switch() for details. + * + * finish_task_switch() will drop rq->lock() and lower preempt_count + * and the preempt_enable() will end up enabling preemption (on + * PREEMPT_COUNT kernels). + */ + rq = finish_task_switch(prev); balance_callback(rq); preempt_enable(); @@ -2960,15 +2984,13 @@ static noinline void __schedule_bug(struct task_struct *prev) static inline void schedule_debug(struct task_struct *prev) { #ifdef CONFIG_SCHED_STACK_END_CHECK - BUG_ON(unlikely(task_stack_end_corrupted(prev))); + BUG_ON(task_stack_end_corrupted(prev)); #endif - /* - * Test if we are atomic. Since do_exit() needs to call into - * schedule() atomically, we ignore that path. Otherwise whine - * if we are scheduling when we should not. - */ - if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) + + if (unlikely(in_atomic_preempt_off())) { __schedule_bug(prev); + preempt_count_set(PREEMPT_DISABLED); + } rcu_sleep_check(); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -3054,7 +3076,7 @@ again: * * WARNING: must be called with preemption disabled! */ -static void __sched __schedule(void) +static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; @@ -3066,6 +3088,17 @@ static void __sched __schedule(void) rcu_note_context_switch(); prev = rq->curr; + /* + * do_exit() calls schedule() with preemption disabled as an exception; + * however we must fix that up, otherwise the next task will see an + * inconsistent (higher) preempt count. + * + * It also avoids the below schedule_debug() test from complaining + * about this. + */ + if (unlikely(prev->state == TASK_DEAD)) + preempt_enable_no_resched_notrace(); + schedule_debug(prev); if (sched_feat(HRTICK)) @@ -3083,7 +3116,7 @@ static void __sched __schedule(void) rq->clock_skip_update <<= 1; /* promote REQ to ACT */ switch_count = &prev->nivcsw; - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + if (!preempt && prev->state) { if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; } else { @@ -3119,6 +3152,7 @@ static void __sched __schedule(void) rq->curr = next; ++*switch_count; + trace_sched_switch(preempt, prev, next); rq = context_switch(rq, prev, next); /* unlocks the rq */ cpu = cpu_of(rq); } else { @@ -3148,7 +3182,7 @@ asmlinkage __visible void __sched schedule(void) sched_submit_work(tsk); do { preempt_disable(); - __schedule(); + __schedule(false); sched_preempt_enable_no_resched(); } while (need_resched()); } @@ -3188,9 +3222,9 @@ void __sched schedule_preempt_disabled(void) static void __sched notrace preempt_schedule_common(void) { do { - preempt_active_enter(); - __schedule(); - preempt_active_exit(); + preempt_disable_notrace(); + __schedule(true); + preempt_enable_no_resched_notrace(); /* * Check again in case we missed a preemption opportunity @@ -3241,24 +3275,17 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) return; do { - /* - * Use raw __prempt_count() ops that don't call function. - * We can't call functions before disabling preemption which - * disarm preemption tracing recursions. - */ - __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); - barrier(); + preempt_disable_notrace(); /* * Needs preempt disabled in case user_exit() is traced * and the tracer calls preempt_enable_notrace() causing * an infinite recursion. */ prev_ctx = exception_enter(); - __schedule(); + __schedule(true); exception_exit(prev_ctx); - barrier(); - __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); + preempt_enable_no_resched_notrace(); } while (need_resched()); } EXPORT_SYMBOL_GPL(preempt_schedule_notrace); @@ -3281,11 +3308,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) prev_state = exception_enter(); do { - preempt_active_enter(); + preempt_disable(); local_irq_enable(); - __schedule(); + __schedule(true); local_irq_disable(); - preempt_active_exit(); + sched_preempt_enable_no_resched(); } while (need_resched()); exception_exit(prev_state); @@ -3313,7 +3340,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, queued, running, enqueue_flag = 0; + int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; struct rq *rq; const struct sched_class *prev_class; @@ -3345,7 +3372,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); if (running) put_prev_task(rq, p); @@ -3363,7 +3390,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; - enqueue_flag = ENQUEUE_REPLENISH; + enqueue_flag |= ENQUEUE_REPLENISH; } else p->dl.dl_boosted = 0; p->sched_class = &dl_sched_class; @@ -3371,7 +3398,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (dl_prio(oldprio)) p->dl.dl_boosted = 0; if (oldprio < prio) - enqueue_flag = ENQUEUE_HEAD; + enqueue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) @@ -3423,7 +3450,7 @@ void set_user_nice(struct task_struct *p, long nice) } queued = task_on_rq_queued(p); if (queued) - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -3432,7 +3459,7 @@ void set_user_nice(struct task_struct *p, long nice) delta = p->prio - old_prio; if (queued) { - enqueue_task(rq, p, 0); + enqueue_task(rq, p, ENQUEUE_RESTORE); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -3753,10 +3780,7 @@ recheck: } else { reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); - if (policy != SCHED_DEADLINE && - policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) + if (!valid_policy(policy)) return -EINVAL; } @@ -3812,7 +3836,7 @@ recheck: * Treat SCHED_IDLE as nice 20. Only allow a switch to * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. */ - if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { + if (idle_policy(p->policy) && !idle_policy(policy)) { if (!can_nice(p, task_nice(p))) return -EPERM; } @@ -3937,7 +3961,7 @@ change: queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); if (running) put_prev_task(rq, p); @@ -3947,11 +3971,15 @@ change: if (running) p->sched_class->set_curr_task(rq); if (queued) { + int enqueue_flags = ENQUEUE_RESTORE; /* * We enqueue to tail when the priority of a task is * increased (user space view). */ - enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); + if (oldprio <= p->prio) + enqueue_flags |= ENQUEUE_HEAD; + + enqueue_task(rq, p, enqueue_flags); } check_class_changed(rq, p, prev_class, oldprio); @@ -5101,7 +5129,7 @@ void sched_setnuma(struct task_struct *p, int nid) running = task_current(rq, p); if (queued) - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); if (running) put_prev_task(rq, p); @@ -5110,7 +5138,7 @@ void sched_setnuma(struct task_struct *p, int nid) if (running) p->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, p, 0); + enqueue_task(rq, p, ENQUEUE_RESTORE); task_rq_unlock(rq, p, &flags); } #endif /* CONFIG_NUMA_BALANCING */ @@ -5531,21 +5559,27 @@ static void set_cpu_rq_start_time(void) static int sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { + int cpu = (long)hcpu; + switch (action & ~CPU_TASKS_FROZEN) { case CPU_STARTING: set_cpu_rq_start_time(); return NOTIFY_OK; + case CPU_ONLINE: /* * At this point a starting CPU has marked itself as online via * set_cpu_online(). But it might not yet have marked itself * as active, which is essential from here on. - * - * Thus, fall-through and help the starting CPU along. */ + set_cpu_active(cpu, true); + stop_machine_unpark(cpu); + return NOTIFY_OK; + case CPU_DOWN_FAILED: - set_cpu_active((long)hcpu, true); + set_cpu_active(cpu, true); return NOTIFY_OK; + default: return NOTIFY_DONE; } @@ -6477,7 +6511,8 @@ static struct sched_domain_topology_level default_topology[] = { { NULL, }, }; -struct sched_domain_topology_level *sched_domain_topology = default_topology; +static struct sched_domain_topology_level *sched_domain_topology = + default_topology; #define for_each_sd_topology(tl) \ for (tl = sched_domain_topology; tl->mask; tl++) @@ -7478,7 +7513,7 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { - int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); + int nested = preempt_count() + rcu_preempt_depth(); return (nested == preempt_offset); } @@ -7725,7 +7760,7 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, 0); + dequeue_task(rq, tsk, DEQUEUE_SAVE); if (unlikely(running)) put_prev_task(rq, tsk); @@ -7741,7 +7776,7 @@ void sched_move_task(struct task_struct *tsk) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk, queued); + tsk->sched_class->task_move_group(tsk); else #endif set_task_rq(tsk, task_cpu(tsk)); @@ -7749,7 +7784,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, tsk, 0); + enqueue_task(rq, tsk, ENQUEUE_RESTORE); task_rq_unlock(rq, tsk, &flags); } @@ -8213,14 +8248,6 @@ static void cpu_cgroup_exit(struct cgroup_subsys_state *css, struct cgroup_subsys_state *old_css, struct task_struct *task) { - /* - * cgroup_exit() is called in the copy_process() failure path. - * Ignore this case since the task hasn't ran yet, this avoids - * trying to poke a half freed task state from generic code. - */ - if (!(task->flags & PF_EXITING)) - return; - sched_move_task(task); } diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index c6acb07466bb..5a75b08cfd85 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -31,11 +31,6 @@ static inline int right_child(int i) return (i << 1) + 2; } -static inline int dl_time_before(u64 a, u64 b) -{ - return (s64)(a - b) < 0; -} - static void cpudl_exchange(struct cpudl *cp, int a, int b) { int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 1a0a6ef2fbe1..fcbdf83fed7e 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -2,6 +2,7 @@ #define _LINUX_CPUDL_H #include +#include #define IDX_INVALID -1 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9a5e60fe721a..824aa9f501a3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -661,11 +661,12 @@ static unsigned long task_h_load(struct task_struct *p); /* * We choose a half-life close to 1 scheduling period. - * Note: The tables below are dependent on this value. + * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are + * dependent on this value. */ #define LOAD_AVG_PERIOD 32 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */ -#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */ /* Give new sched_entity start runnable values to heavy its load in infant time */ void init_entity_runnable_average(struct sched_entity *se) @@ -682,7 +683,7 @@ void init_entity_runnable_average(struct sched_entity *se) sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); - sa->util_sum = LOAD_AVG_MAX; + sa->util_sum = sa->util_avg * LOAD_AVG_MAX; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } @@ -2069,7 +2070,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) int local = !!(flags & TNF_FAULT_LOCAL); int priv; - if (!numabalancing_enabled) + if (!static_branch_likely(&sched_numa_balancing)) return; /* for example, ksmd faulting in a user's mm */ @@ -2157,7 +2158,7 @@ void task_numa_work(struct callback_head *work) struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; - long pages; + long pages, virtpages; WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); @@ -2203,9 +2204,11 @@ void task_numa_work(struct callback_head *work) start = mm->numa_scan_offset; pages = sysctl_numa_balancing_scan_size; pages <<= 20 - PAGE_SHIFT; /* MB in pages */ + virtpages = pages * 8; /* Scan up to this much virtual space */ if (!pages) return; + down_read(&mm->mmap_sem); vma = find_vma(mm, start); if (!vma) { @@ -2240,18 +2243,22 @@ void task_numa_work(struct callback_head *work) start = max(start, vma->vm_start); end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); end = min(end, vma->vm_end); - nr_pte_updates += change_prot_numa(vma, start, end); + nr_pte_updates = change_prot_numa(vma, start, end); /* - * Scan sysctl_numa_balancing_scan_size but ensure that - * at least one PTE is updated so that unused virtual - * address space is quickly skipped. + * Try to scan sysctl_numa_balancing_size worth of + * hpages that have at least one present PTE that + * is not already pte-numa. If the VMA contains + * areas that are unused or already full of prot_numa + * PTEs, scan up to virtpages, to skip through those + * areas faster. */ if (nr_pte_updates) pages -= (end - start) >> PAGE_SHIFT; + virtpages -= (end - start) >> PAGE_SHIFT; start = end; - if (pages <= 0) + if (pages <= 0 || virtpages <= 0) goto out; cond_resched(); @@ -2515,6 +2522,12 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } +#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10 +#error "load tracking assumes 2^10 as unit" +#endif + +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + /* * We can represent the historical contribution to runnable average as the * coefficients of a geometric series. To do this we sub-divide our runnable @@ -2547,10 +2560,10 @@ static __always_inline int __update_load_avg(u64 now, int cpu, struct sched_avg *sa, unsigned long weight, int running, struct cfs_rq *cfs_rq) { - u64 delta, periods; + u64 delta, scaled_delta, periods; u32 contrib; - int delta_w, decayed = 0; - unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); + unsigned int delta_w, scaled_delta_w, decayed = 0; + unsigned long scale_freq, scale_cpu; delta = now - sa->last_update_time; /* @@ -2571,6 +2584,9 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, return 0; sa->last_update_time = now; + scale_freq = arch_scale_freq_capacity(NULL, cpu); + scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + /* delta_w is the amount already accumulated against our next period */ delta_w = sa->period_contrib; if (delta + delta_w >= 1024) { @@ -2585,13 +2601,16 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, * period and accrue it. */ delta_w = 1024 - delta_w; + scaled_delta_w = cap_scale(delta_w, scale_freq); if (weight) { - sa->load_sum += weight * delta_w; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * delta_w; + sa->load_sum += weight * scaled_delta_w; + if (cfs_rq) { + cfs_rq->runnable_load_sum += + weight * scaled_delta_w; + } } if (running) - sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT; + sa->util_sum += scaled_delta_w * scale_cpu; delta -= delta_w; @@ -2608,23 +2627,25 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, /* Efficiently calculate \sum (1..n_period) 1024*y^i */ contrib = __compute_runnable_contrib(periods); + contrib = cap_scale(contrib, scale_freq); if (weight) { sa->load_sum += weight * contrib; if (cfs_rq) cfs_rq->runnable_load_sum += weight * contrib; } if (running) - sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT; + sa->util_sum += contrib * scale_cpu; } /* Remainder of delta accrued against u_0` */ + scaled_delta = cap_scale(delta, scale_freq); if (weight) { - sa->load_sum += weight * delta; + sa->load_sum += weight * scaled_delta; if (cfs_rq) - cfs_rq->runnable_load_sum += weight * delta; + cfs_rq->runnable_load_sum += weight * scaled_delta; } if (running) - sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT; + sa->util_sum += scaled_delta * scale_cpu; sa->period_contrib += delta; @@ -2634,7 +2655,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, cfs_rq->runnable_load_avg = div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); } - sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX; + sa->util_avg = sa->util_sum / LOAD_AVG_MAX; } return decayed; @@ -2677,8 +2698,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) if (atomic_long_read(&cfs_rq->removed_util_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); sa->util_avg = max_t(long, sa->util_avg - r, 0); - sa->util_sum = max_t(s32, sa->util_sum - - ((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0); + sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -2696,33 +2716,70 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) static inline void update_load_avg(struct sched_entity *se, int update_tg) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - int cpu = cpu_of(rq_of(cfs_rq)); u64 now = cfs_rq_clock_task(cfs_rq); + int cpu = cpu_of(rq_of(cfs_rq)); /* * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ __update_load_avg(now, cpu, &se->avg, - se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); + se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) update_tg_load_avg(cfs_rq, 0); } +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (!sched_feat(ATTACH_AGE_LOAD)) + goto skip_aging; + + /* + * If we got migrated (either between CPUs or between cgroups) we'll + * have aged the average right before clearing @last_update_time. + */ + if (se->avg.last_update_time) { + __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), + &se->avg, 0, 0, NULL); + + /* + * XXX: we could have just aged the entire load away if we've been + * absent from the fair class for too long. + */ + } + +skip_aging: + se->avg.last_update_time = cfs_rq->avg.last_update_time; + cfs_rq->avg.load_avg += se->avg.load_avg; + cfs_rq->avg.load_sum += se->avg.load_sum; + cfs_rq->avg.util_avg += se->avg.util_avg; + cfs_rq->avg.util_sum += se->avg.util_sum; +} + +static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), + &se->avg, se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); + + cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); + cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); + cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); + cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); +} + /* Add the load generated by se into cfs_rq's load average */ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_avg *sa = &se->avg; u64 now = cfs_rq_clock_task(cfs_rq); - int migrated = 0, decayed; + int migrated, decayed; - if (sa->last_update_time == 0) { - sa->last_update_time = now; - migrated = 1; - } - else { + migrated = !sa->last_update_time; + if (!migrated) { __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); @@ -2733,12 +2790,8 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; - if (migrated) { - cfs_rq->avg.load_avg += sa->load_avg; - cfs_rq->avg.load_sum += sa->load_sum; - cfs_rq->avg.util_avg += sa->util_avg; - cfs_rq->avg.util_sum += sa->util_sum; - } + if (migrated) + attach_entity_load_avg(cfs_rq, se); if (decayed || migrated) update_tg_load_avg(cfs_rq, 0); @@ -2753,7 +2806,7 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->runnable_load_avg = max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); cfs_rq->runnable_load_sum = - max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); + max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } /* @@ -2821,6 +2874,11 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void remove_entity_load_avg(struct sched_entity *se) {} +static inline void +attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void +detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} + static inline int idle_balance(struct rq *rq) { return 0; @@ -4817,32 +4875,39 @@ next: done: return target; } + /* - * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS + * cpu_util returns the amount of capacity of a CPU that is used by CFS * tasks. The unit of the return value must be the one of capacity so we can - * compare the usage with the capacity of the CPU that is available for CFS - * task (ie cpu_capacity). - * cfs.avg.util_avg is the sum of running time of runnable tasks on a - * CPU. It represents the amount of utilization of a CPU in the range - * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full - * capacity of the CPU because it's about the running time on this CPU. - * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE - * because of unfortunate rounding in util_avg or just - * after migrating tasks until the average stabilizes with the new running - * time. So we need to check that the usage stays into the range - * [0..cpu_capacity_orig] and cap if necessary. - * Without capping the usage, a group could be seen as overloaded (CPU0 usage - * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity + * compare the utilization with the capacity of the CPU that is available for + * CFS task (ie cpu_capacity). + * + * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on a CPU. It represents + * the amount of utilization of a CPU in the range [0..capacity_orig] where + * capacity_orig is the cpu_capacity available at the highest frequency + * (arch_scale_freq_capacity()). + * The utilization of a CPU converges towards a sum equal to or less than the + * current capacity (capacity_curr <= capacity_orig) of the CPU because it is + * the running time on this CPU scaled by capacity_curr. + * + * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even + * higher than capacity_orig because of unfortunate rounding in + * cfs.avg.util_avg or just after migrating tasks and new task wakeups until + * the average stabilizes with the new running time. We need to check that the + * utilization stays within the range of [0..capacity_orig] and cap it if + * necessary. Without utilization capping, a group could be seen as overloaded + * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. We allow utilization to overshoot capacity_curr (but not + * capacity_orig) as it useful for predicting the capacity required after task + * migrations (scheduler-driven DVFS). */ -static int get_cpu_usage(int cpu) +static int cpu_util(int cpu) { - unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); - if (usage >= SCHED_LOAD_SCALE) - return capacity; - - return (usage * capacity) >> SCHED_LOAD_SHIFT; + return (util >= capacity) ? capacity : util; } /* @@ -4945,7 +5010,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f * previous cpu. However, the caller only guarantees p->pi_lock is held; no * other assumptions, including the state of rq->lock, should be made. */ -static void migrate_task_rq_fair(struct task_struct *p, int next_cpu) +static void migrate_task_rq_fair(struct task_struct *p) { /* * We are supposed to update the task to "current" time, then its up to date @@ -5525,10 +5590,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) unsigned long src_faults, dst_faults; int src_nid, dst_nid; - if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) + if (!static_branch_likely(&sched_numa_balancing)) return -1; - if (!sched_feat(NUMA)) + if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) return -1; src_nid = cpu_to_node(env->src_cpu); @@ -5934,7 +5999,7 @@ struct sg_lb_stats { unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; unsigned long group_capacity; - unsigned long group_usage; /* Total usage of the group */ + unsigned long group_util; /* Total utilization of the group */ unsigned int sum_nr_running; /* Nr tasks running in the group */ unsigned int idle_cpus; unsigned int group_weight; @@ -6010,19 +6075,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) -{ - if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) - return sd->smt_gain / sd->span_weight; - - return SCHED_CAPACITY_SCALE; -} - -unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) -{ - return default_scale_cpu_capacity(sd, cpu); -} - static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -6052,16 +6104,9 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = SCHED_CAPACITY_SCALE; + unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); struct sched_group *sdg = sd->groups; - if (sched_feat(ARCH_CAPACITY)) - capacity *= arch_scale_cpu_capacity(sd, cpu); - else - capacity *= default_scale_cpu_capacity(sd, cpu); - - capacity >>= SCHED_CAPACITY_SHIFT; - cpu_rq(cpu)->cpu_capacity_orig = capacity; capacity *= scale_rt_capacity(cpu); @@ -6187,8 +6232,8 @@ static inline int sg_imbalanced(struct sched_group *group) * group_has_capacity returns true if the group has spare capacity that could * be used by some tasks. * We consider that a group has spare capacity if the * number of task is - * smaller than the number of CPUs or if the usage is lower than the available - * capacity for CFS tasks. + * smaller than the number of CPUs or if the utilization is lower than the + * available capacity for CFS tasks. * For the latter, we use a threshold to stabilize the state, to take into * account the variance of the tasks' load and to return true if the available * capacity in meaningful for the load balancer. @@ -6202,7 +6247,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) return true; if ((sgs->group_capacity * 100) > - (sgs->group_usage * env->sd->imbalance_pct)) + (sgs->group_util * env->sd->imbalance_pct)) return true; return false; @@ -6223,15 +6268,15 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) return false; if ((sgs->group_capacity * 100) < - (sgs->group_usage * env->sd->imbalance_pct)) + (sgs->group_util * env->sd->imbalance_pct)) return true; return false; } -static enum group_type group_classify(struct lb_env *env, - struct sched_group *group, - struct sg_lb_stats *sgs) +static inline enum +group_type group_classify(struct sched_group *group, + struct sg_lb_stats *sgs) { if (sgs->group_no_capacity) return group_overloaded; @@ -6271,7 +6316,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = source_load(i, load_idx); sgs->group_load += load; - sgs->group_usage += get_cpu_usage(i); + sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; if (rq->nr_running > 1) @@ -6296,7 +6341,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; sgs->group_no_capacity = group_is_overloaded(env, sgs); - sgs->group_type = group_classify(env, group, sgs); + sgs->group_type = group_classify(group, sgs); } /** @@ -6430,7 +6475,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd group_has_capacity(env, &sds->local_stat) && (sgs->sum_nr_running > 1)) { sgs->group_no_capacity = 1; - sgs->group_type = group_overloaded; + sgs->group_type = group_classify(sg, sgs); } if (update_sd_pick_busiest(env, sds, sg, sgs)) { @@ -7610,8 +7655,22 @@ out: * When the cpu is attached to null domain for ex, it will not be * updated. */ - if (likely(update_next_balance)) + if (likely(update_next_balance)) { rq->next_balance = next_balance; + +#ifdef CONFIG_NO_HZ_COMMON + /* + * If this CPU has been elected to perform the nohz idle + * balance. Other idle CPUs have already rebalanced with + * nohz_idle_balance() and nohz.next_balance has been + * updated accordingly. This CPU is now running the idle load + * balance for itself and we need to update the + * nohz.next_balance accordingly. + */ + if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) + nohz.next_balance = rq->next_balance; +#endif + } } #ifdef CONFIG_NO_HZ_COMMON @@ -7624,6 +7683,9 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) int this_cpu = this_rq->cpu; struct rq *rq; int balance_cpu; + /* Earliest time when we have to do rebalance again */ + unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; if (idle != CPU_IDLE || !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) @@ -7655,10 +7717,19 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) rebalance_domains(rq, CPU_IDLE); } - if (time_after(this_rq->next_balance, rq->next_balance)) - this_rq->next_balance = rq->next_balance; + if (time_after(next_balance, rq->next_balance)) { + next_balance = rq->next_balance; + update_next_balance = 1; + } } - nohz.next_balance = this_rq->next_balance; + + /* + * next_balance will be updated only when there is a need. + * When the CPU is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + nohz.next_balance = next_balance; end: clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); } @@ -7811,7 +7882,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } - if (numabalancing_enabled) + if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); } @@ -7887,21 +7958,39 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) check_preempt_curr(rq, p, 0); } -static void switched_from_fair(struct rq *rq, struct task_struct *p) +static inline bool vruntime_normalized(struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Ensure the task's vruntime is normalized, so that when it's - * switched back to the fair class the enqueue_entity(.flags=0) will - * do the right thing. + * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases, + * the dequeue_entity(.flags=0) will already have normalized the + * vruntime. + */ + if (p->on_rq) + return true; + + /* + * When !on_rq, vruntime of the task has usually NOT been normalized. + * But there are some cases where it has already been normalized: * - * If it's queued, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it's !queued, then only when - * the task is sleeping will it still have non-normalized vruntime. + * - A forked child which is waiting for being woken up by + * wake_up_new_task(). + * - A task which has been woken up by try_to_wake_up() and + * waiting for actually being woken up by sched_ttwu_pending(). */ - if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { + if (!se->sum_exec_runtime || p->state == TASK_WAKING) + return true; + + return false; +} + +static void detach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!vruntime_normalized(p)) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. @@ -7910,28 +7999,14 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) se->vruntime -= cfs_rq->min_vruntime; } -#ifdef CONFIG_SMP /* Catch up with the cfs_rq and remove our load when we leave */ - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg, - se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); - - cfs_rq->avg.load_avg = - max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); - cfs_rq->avg.load_sum = - max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); - cfs_rq->avg.util_avg = - max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); - cfs_rq->avg.util_sum = - max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); -#endif + detach_entity_load_avg(cfs_rq, se); } -/* - * We switched to the sched_fair class. - */ -static void switched_to_fair(struct rq *rq, struct task_struct *p) +static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -7941,31 +8016,33 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) se->depth = se->parent ? se->parent->depth + 1 : 0; #endif - if (!task_on_rq_queued(p)) { + /* Synchronize task with its cfs_rq */ + attach_entity_load_avg(cfs_rq, se); + if (!vruntime_normalized(p)) + se->vruntime += cfs_rq->min_vruntime; +} + +static void switched_from_fair(struct rq *rq, struct task_struct *p) +{ + detach_task_cfs_rq(p); +} + +static void switched_to_fair(struct rq *rq, struct task_struct *p) +{ + attach_task_cfs_rq(p); + + if (task_on_rq_queued(p)) { /* - * Ensure the task has a non-normalized vruntime when it is switched - * back to the fair class with !queued, so that enqueue_entity() at - * wake-up time will do the right thing. - * - * If it's queued, then the enqueue_entity(.flags=0) makes the task - * has non-normalized vruntime, if it's !queued, then it still has - * normalized vruntime. + * We were most likely switched from sched_rt, so + * kick off the schedule if running, otherwise just see + * if we can still preempt the current task. */ - if (p->state != TASK_RUNNING) - se->vruntime += cfs_rq_of(se)->min_vruntime; - return; + if (rq->curr == p) + resched_curr(rq); + else + check_preempt_curr(rq, p, 0); } - - /* - * We were most likely switched from sched_rt, so - * kick off the schedule if running, otherwise just see - * if we can still preempt the current task. - */ - if (rq->curr == p) - resched_curr(rq); - else - check_preempt_curr(rq, p, 0); } /* Account for a task changing its policy or group. @@ -8000,56 +8077,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void task_move_group_fair(struct task_struct *p, int queued) +static void task_move_group_fair(struct task_struct *p) { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq; - - /* - * If the task was not on the rq at the time of this cgroup movement - * it must have been asleep, sleeping tasks keep their ->vruntime - * absolute on their old rq until wakeup (needed for the fair sleeper - * bonus in place_entity()). - * - * If it was on the rq, we've just 'preempted' it, which does convert - * ->vruntime to a relative base. - * - * Make sure both cases convert their relative position when migrating - * to another cgroup's rq. This does somewhat interfere with the - * fair sleeper stuff for the first placement, but who cares. - */ - /* - * When !queued, vruntime of the task has usually NOT been normalized. - * But there are some cases where it has already been normalized: - * - * - Moving a forked child which is waiting for being woken up by - * wake_up_new_task(). - * - Moving a task which has been woken up by try_to_wake_up() and - * waiting for actually being woken up by sched_ttwu_pending(). - * - * To prevent boost or penalty in the new cfs_rq caused by delta - * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. - */ - if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) - queued = 1; - - if (!queued) - se->vruntime -= cfs_rq_of(se)->min_vruntime; + detach_task_cfs_rq(p); set_task_rq(p, task_cpu(p)); - se->depth = se->parent ? se->parent->depth + 1 : 0; - if (!queued) { - cfs_rq = cfs_rq_of(se); - se->vruntime += cfs_rq->min_vruntime; #ifdef CONFIG_SMP - /* Virtually synchronize task with its new cfs_rq */ - p->se.avg.last_update_time = cfs_rq->avg.last_update_time; - cfs_rq->avg.load_avg += p->se.avg.load_avg; - cfs_rq->avg.load_sum += p->se.avg.load_sum; - cfs_rq->avg.util_avg += p->se.avg.util_avg; - cfs_rq->avg.util_sum += p->se.avg.util_sum; + /* Tell se's cfs_rq has been changed -- migrated */ + p->se.avg.last_update_time = 0; #endif - } + attach_task_cfs_rq(p); } void free_fair_sched_group(struct task_group *tg) diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 83a50e7ca533..69631fa46c2f 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -36,11 +36,6 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) */ SCHED_FEAT(WAKEUP_PREEMPTION, true) -/* - * Use arch dependent cpu capacity functions - */ -SCHED_FEAT(ARCH_CAPACITY, true) - SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) @@ -72,19 +67,5 @@ SCHED_FEAT(RT_PUSH_IPI, true) SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) +SCHED_FEAT(ATTACH_AGE_LOAD, true) -/* - * Apply the automatic NUMA scheduling policy. Enabled automatically - * at runtime if running on a NUMA machine. Can be controlled via - * numa_balancing= - */ -#ifdef CONFIG_NUMA_BALANCING - -/* - * NUMA will favor moving tasks towards nodes where a higher number of - * hinting faults are recorded during active load balancing. It will - * resist moving tasks towards nodes where a lower number of hinting - * faults have been recorded. - */ -SCHED_FEAT(NUMA, true) -#endif diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d2ea59364a1c..e3cc16312046 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -635,11 +635,11 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) /* * We ran out of runtime, see if we can borrow some from our neighbours. */ -static int do_balance_runtime(struct rt_rq *rt_rq) +static void do_balance_runtime(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd; - int i, weight, more = 0; + int i, weight; u64 rt_period; weight = cpumask_weight(rd->span); @@ -673,7 +673,6 @@ static int do_balance_runtime(struct rt_rq *rt_rq) diff = rt_period - rt_rq->rt_runtime; iter->rt_runtime -= diff; rt_rq->rt_runtime += diff; - more = 1; if (rt_rq->rt_runtime == rt_period) { raw_spin_unlock(&iter->rt_runtime_lock); break; @@ -683,8 +682,6 @@ next: raw_spin_unlock(&iter->rt_runtime_lock); } raw_spin_unlock(&rt_b->rt_runtime_lock); - - return more; } /* @@ -796,26 +793,19 @@ static void __enable_runtime(struct rq *rq) } } -static int balance_runtime(struct rt_rq *rt_rq) +static void balance_runtime(struct rt_rq *rt_rq) { - int more = 0; - if (!sched_feat(RT_RUNTIME_SHARE)) - return more; + return; if (rt_rq->rt_time > rt_rq->rt_runtime) { raw_spin_unlock(&rt_rq->rt_runtime_lock); - more = do_balance_runtime(rt_rq); + do_balance_runtime(rt_rq); raw_spin_lock(&rt_rq->rt_runtime_lock); } - - return more; } #else /* !CONFIG_SMP */ -static inline int balance_runtime(struct rt_rq *rt_rq) -{ - return 0; -} +static inline void balance_runtime(struct rt_rq *rt_rq) {} #endif /* CONFIG_SMP */ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6d2a119c7ad9..efd3bfc7e347 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -84,6 +84,10 @@ static inline void update_cpu_load_active(struct rq *this_rq) { } */ #define RUNTIME_INF ((u64)~0ULL) +static inline int idle_policy(int policy) +{ + return policy == SCHED_IDLE; +} static inline int fair_policy(int policy) { return policy == SCHED_NORMAL || policy == SCHED_BATCH; @@ -98,6 +102,11 @@ static inline int dl_policy(int policy) { return policy == SCHED_DEADLINE; } +static inline bool valid_policy(int policy) +{ + return idle_policy(policy) || fair_policy(policy) || + rt_policy(policy) || dl_policy(policy); +} static inline int task_has_rt_policy(struct task_struct *p) { @@ -109,11 +118,6 @@ static inline int task_has_dl_policy(struct task_struct *p) return dl_policy(p->policy); } -static inline bool dl_time_before(u64 a, u64 b) -{ - return (s64)(a - b) < 0; -} - /* * Tells if entity @a should preempt entity @b. */ @@ -1003,17 +1007,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ -#ifdef CONFIG_NUMA_BALANCING -#define sched_feat_numa(x) sched_feat(x) -#ifdef CONFIG_SCHED_DEBUG -#define numabalancing_enabled sched_feat_numa(NUMA) -#else -extern bool numabalancing_enabled; -#endif /* CONFIG_SCHED_DEBUG */ -#else -#define sched_feat_numa(x) (0) -#define numabalancing_enabled (0) -#endif /* CONFIG_NUMA_BALANCING */ +extern struct static_key_false sched_numa_balancing; static inline u64 global_rt_period(void) { @@ -1157,16 +1151,18 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -#define ENQUEUE_WAKEUP 1 -#define ENQUEUE_HEAD 2 +#define ENQUEUE_WAKEUP 0x01 +#define ENQUEUE_HEAD 0x02 #ifdef CONFIG_SMP -#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ +#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ #else -#define ENQUEUE_WAKING 0 +#define ENQUEUE_WAKING 0x00 #endif -#define ENQUEUE_REPLENISH 8 +#define ENQUEUE_REPLENISH 0x08 +#define ENQUEUE_RESTORE 0x10 -#define DEQUEUE_SLEEP 1 +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 #define RETRY_TASK ((void *)-1UL) @@ -1194,7 +1190,7 @@ struct sched_class { #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); - void (*migrate_task_rq)(struct task_struct *p, int next_cpu); + void (*migrate_task_rq)(struct task_struct *p); void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); @@ -1227,7 +1223,7 @@ struct sched_class { void (*update_curr) (struct rq *rq); #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_move_group) (struct task_struct *p, int on_rq); + void (*task_move_group) (struct task_struct *p); #endif }; @@ -1405,6 +1401,17 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) } #endif +#ifndef arch_scale_cpu_capacity +static __always_inline +unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) +{ + if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) + return sd->smt_gain / sd->span_weight; + + return SCHED_CAPACITY_SCALE; +} +#endif + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index a818cbc73e14..d264f59bff56 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -222,9 +222,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp { struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); - if (ht->pre_unpark) - ht->pre_unpark(cpu); - kthread_unpark(tsk); + if (!ht->selfparking) + kthread_unpark(tsk); } void smpboot_unpark_threads(unsigned int cpu) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 12484e5d5c88..867bc20e1ef1 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -73,21 +73,24 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) } } +static void __cpu_stop_queue_work(struct cpu_stopper *stopper, + struct cpu_stop_work *work) +{ + list_add_tail(&work->list, &stopper->works); + wake_up_process(stopper->thread); +} + /* queue @work to @stopper. if offline, @work is completed immediately */ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - unsigned long flags; spin_lock_irqsave(&stopper->lock, flags); - - if (stopper->enabled) { - list_add_tail(&work->list, &stopper->works); - wake_up_process(stopper->thread); - } else + if (stopper->enabled) + __cpu_stop_queue_work(stopper, work); + else cpu_stop_signal_done(work->done, false); - spin_unlock_irqrestore(&stopper->lock, flags); } @@ -213,6 +216,31 @@ static int multi_cpu_stop(void *data) return err; } +static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, + int cpu2, struct cpu_stop_work *work2) +{ + struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); + struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); + int err; + + lg_double_lock(&stop_cpus_lock, cpu1, cpu2); + spin_lock_irq(&stopper1->lock); + spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); + + err = -ENOENT; + if (!stopper1->enabled || !stopper2->enabled) + goto unlock; + + err = 0; + __cpu_stop_queue_work(stopper1, work1); + __cpu_stop_queue_work(stopper2, work2); +unlock: + spin_unlock(&stopper2->lock); + spin_unlock_irq(&stopper1->lock); + lg_double_unlock(&stop_cpus_lock, cpu1, cpu2); + + return err; +} /** * stop_two_cpus - stops two cpus * @cpu1: the cpu to stop @@ -247,24 +275,13 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * cpu_stop_init_done(&done, 2); set_state(&msdata, MULTI_STOP_PREPARE); - /* - * If we observe both CPUs active we know _cpu_down() cannot yet have - * queued its stop_machine works and therefore ours will get executed - * first. Or its not either one of our CPUs that's getting unplugged, - * in which case we don't care. - * - * This relies on the stopper workqueues to be FIFO. - */ - if (!cpu_active(cpu1) || !cpu_active(cpu2)) { + if (cpu1 > cpu2) + swap(cpu1, cpu2); + if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) { preempt_enable(); return -ENOENT; } - lg_double_lock(&stop_cpus_lock, cpu1, cpu2); - cpu_stop_queue_work(cpu1, &work1); - cpu_stop_queue_work(cpu2, &work2); - lg_double_unlock(&stop_cpus_lock, cpu1, cpu2); - preempt_enable(); wait_for_completion(&done.completion); @@ -452,6 +469,18 @@ repeat: } } +void stop_machine_park(int cpu) +{ + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + /* + * Lockless. cpu_stopper_thread() will take stopper->lock and flush + * the pending works before it parks, until then it is fine to queue + * the new works. + */ + stopper->enabled = false; + kthread_park(stopper->thread); +} + extern void sched_set_stop_task(int cpu, struct task_struct *stop); static void cpu_stop_create(unsigned int cpu) @@ -462,26 +491,16 @@ static void cpu_stop_create(unsigned int cpu) static void cpu_stop_park(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - struct cpu_stop_work *work, *tmp; - unsigned long flags; - /* drain remaining works */ - spin_lock_irqsave(&stopper->lock, flags); - list_for_each_entry_safe(work, tmp, &stopper->works, list) { - list_del_init(&work->list); - cpu_stop_signal_done(work->done, false); - } - stopper->enabled = false; - spin_unlock_irqrestore(&stopper->lock, flags); + WARN_ON(!list_empty(&stopper->works)); } -static void cpu_stop_unpark(unsigned int cpu) +void stop_machine_unpark(int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - spin_lock_irq(&stopper->lock); stopper->enabled = true; - spin_unlock_irq(&stopper->lock); + kthread_unpark(stopper->thread); } static struct smp_hotplug_thread cpu_stop_threads = { @@ -490,9 +509,7 @@ static struct smp_hotplug_thread cpu_stop_threads = { .thread_fn = cpu_stopper_thread, .thread_comm = "migration/%u", .create = cpu_stop_create, - .setup = cpu_stop_unpark, .park = cpu_stop_park, - .pre_unpark = cpu_stop_unpark, .selfparking = true, }; @@ -508,6 +525,7 @@ static int __init cpu_stop_init(void) } BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads)); + stop_machine_unpark(raw_smp_processor_id()); stop_machine_initialized = true; return 0; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b0623ac785a2..00611e95a8ee 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5697,7 +5697,7 @@ free: } static void -ftrace_graph_probe_sched_switch(void *ignore, +ftrace_graph_probe_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next) { unsigned long long timestamp; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index f270088e9929..4c896a0101bd 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -16,7 +16,8 @@ static int sched_ref; static DEFINE_MUTEX(sched_register_mutex); static void -probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) +probe_sched_switch(void *ignore, bool preempt, + struct task_struct *prev, struct task_struct *next) { if (unlikely(!sched_ref)) return; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 12cbe77b4136..4bcfbac289ff 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -420,7 +420,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, } static void notrace -probe_wakeup_sched_switch(void *ignore, +probe_wakeup_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next) { struct trace_array_cpu *data;