sched: Move the loadavg code to a more obvious location

author Peter Zijlstra <peterz@infradead.org>

Tue, 14 Apr 2015 11:19:42 +0000 (13:19 +0200)

committer Ingo Molnar <mingo@kernel.org>

Fri, 8 May 2015 10:04:12 +0000 (12:04 +0200)
author Peter Zijlstra <peterz@infradead.org>
Tue, 14 Apr 2015 11:19:42 +0000 (13:19 +0200)
committer Ingo Molnar <mingo@kernel.org>
Fri, 8 May 2015 10:04:12 +0000 (12:04 +0200)
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 26a2e6122734f8237ac44d47fb6bf4e96cca124b..85cf253bc366b3082eff289bfd5729b4a2d1a505 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -173,7 +173,12 @@ extern unsigned long nr_iowait_cpu(int cpu);
  extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
  
  extern void calc_global_load(unsigned long ticks);
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
  extern void update_cpu_load_nohz(void);
+#else
+static inline void update_cpu_load_nohz(void) { }
+#endif
  
  extern unsigned long get_parent_ip(unsigned long addr);
  
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile

index 46be8702487561cd88a7895fea8c6401d72e9ce6..67687973ce80d63d3f52698fb4b738b76964b896 100644 (file)
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
  CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
  endif
  
-obj-y += core.o proc.o clock.o cputime.o
+obj-y += core.o loadavg.o clock.o cputime.o
  obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
  obj-y += wait.o completion.o idle.o
  obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index fdf972d56f65aba3b45675ff1b8d1e3c31685e8e..527fc28a737a75ce1e8c7542453afd672cc72d13 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2397,9 +2397,9 @@ unsigned long nr_iowait_cpu(int cpu)
  
  void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
  {
-       struct rq *this = this_rq();
-       *nr_waiters = atomic_read(&this->nr_iowait);
-       *load = this->cpu_load[0];
+       struct rq *rq = this_rq();
+       *nr_waiters = atomic_read(&rq->nr_iowait);
+       *load = rq->load.weight;
  }
  
  #ifdef CONFIG_SMP
@@ -2497,6 +2497,7 @@ void scheduler_tick(void)
         update_rq_clock(rq);
         curr->sched_class->task_tick(rq, curr, 0);
         update_cpu_load_active(rq);
+       calc_global_load_tick(rq);
         raw_spin_unlock(&rq->lock);
  
         perf_event_task_tick();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index ffeaa4105e48a36105ecaea8967082e1e7a7af98..4bc6013886ecaac3ed1f6b1d214142f14ae20214 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4323,6 +4323,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
  }
  
  #ifdef CONFIG_SMP
+
+/*
+ * per rq 'load' array crap; XXX kill this.
+ */
+
+/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT          7
+static const unsigned char
+               degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+               degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+                                       {0, 0, 0, 0, 0, 0, 0, 0},
+                                       {64, 32, 8, 0, 0, 0, 0, 0},
+                                       {96, 72, 40, 12, 1, 0, 0},
+                                       {112, 98, 75, 43, 15, 1, 0},
+                                       {120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+       int j = 0;
+
+       if (!missed_updates)
+               return load;
+
+       if (missed_updates >= degrade_zero_ticks[idx])
+               return 0;
+
+       if (idx == 1)
+               return load >> missed_updates;
+
+       while (missed_updates) {
+               if (missed_updates % 2)
+                       load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+               missed_updates >>= 1;
+               j++;
+       }
+       return load;
+}
+
+/*
+ * Update rq->cpu_load[] statistics. This function is usually called every
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
+ */
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+                             unsigned long pending_updates)
+{
+       int i, scale;
+
+       this_rq->nr_load_updates++;
+
+       /* Update our load: */
+       this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+       for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+               unsigned long old_load, new_load;
+
+               /* scale is effectively 1 << i now, and >> i divides by scale */
+
+               old_load = this_rq->cpu_load[i];
+               old_load = decay_load_missed(old_load, pending_updates - 1, i);
+               new_load = this_load;
+               /*
+                * Round up the averaging division if load is increasing. This
+                * prevents us from getting stuck on 9 if the load is 10, for
+                * example.
+                */
+               if (new_load > old_load)
+                       new_load += scale - 1;
+
+               this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
+       }
+
+       sched_avg_update(this_rq);
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+static void update_idle_cpu_load(struct rq *this_rq)
+{
+       unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+       unsigned long load = this_rq->cfs.runnable_load_avg;
+       unsigned long pending_updates;
+
+       /*
+        * bail if there's load or we're actually up-to-date.
+        */
+       if (load || curr_jiffies == this_rq->last_load_update_tick)
+               return;
+
+       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+       this_rq->last_load_update_tick = curr_jiffies;
+
+       __update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+       struct rq *this_rq = this_rq();
+       unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+       unsigned long pending_updates;
+
+       if (curr_jiffies == this_rq->last_load_update_tick)
+               return;
+
+       raw_spin_lock(&this_rq->lock);
+       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+       if (pending_updates) {
+               this_rq->last_load_update_tick = curr_jiffies;
+               /*
+                * We were idle, this means load 0, the current load might be
+                * !0 due to remote wakeups and the sort.
+                */
+               __update_cpu_load(this_rq, 0, pending_updates);
+       }
+       raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * Called from scheduler_tick()
+ */
+void update_cpu_load_active(struct rq *this_rq)
+{
+       unsigned long load = this_rq->cfs.runnable_load_avg;
+       /*
+        * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+        */
+       this_rq->last_load_update_tick = jiffies;
+       __update_cpu_load(this_rq, load, 1);
+}
+
  /* Used instead of source_load when we know the type == 0 */
  static unsigned long weighted_cpuload(const int cpu)
  {
diff --git a/kernel/sched/proc.c b/kernel/sched/loadavg.c

similarity index 62%

rename from kernel/sched/proc.c

rename to kernel/sched/loadavg.c

index 8ecd552fe4f2229eacabe8d1e9bb1941078fb577..ef7159012cf366f5a724e3cc3d66186d99e97cc2 100644 (file)
--- a/kernel/sched/proc.c
+++ b/kernel/sched/loadavg.c
@@ -1,7 +1,9 @@
  /*
- *  kernel/sched/proc.c
+ * kernel/sched/loadavg.c
   *
- *  Kernel load calculations, forked from sched/core.c
+ * This file contains the magic bits required to compute the global loadavg
+ * figure. It's a silly number but people think it's important. We go through
+ * great pains to make it work on big machines and tickless kernels.
   */
  
  #include <linux/export.h>
@@ -81,7 +83,7 @@ long calc_load_fold_active(struct rq *this_rq)
         long nr_active, delta = 0;
  
         nr_active = this_rq->nr_running;
-       nr_active += (long) this_rq->nr_uninterruptible;
+       nr_active += (long)this_rq->nr_uninterruptible;
  
         if (nr_active != this_rq->calc_load_active) {
                 delta = nr_active - this_rq->calc_load_active;
@@ -186,6 +188,7 @@ void calc_load_enter_idle(void)
         delta = calc_load_fold_active(this_rq);
         if (delta) {
                 int idx = calc_load_write_idx();
+
                 atomic_long_add(delta, &calc_load_idle[idx]);
         }
  }
@@ -241,18 +244,20 @@ fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
  {
         unsigned long result = 1UL << frac_bits;
  
-       if (n) for (;;) {
-               if (n & 1) {
-                       result *= x;
-                       result += 1UL << (frac_bits - 1);
-                       result >>= frac_bits;
+       if (n) {
+               for (;;) {
+                       if (n & 1) {
+                               result *= x;
+                               result += 1UL << (frac_bits - 1);
+                               result >>= frac_bits;
+                       }
+                       n >>= 1;
+                       if (!n)
+                               break;
+                       x *= x;
+                       x += 1UL << (frac_bits - 1);
+                       x >>= frac_bits;
                 }
-               n >>= 1;
-               if (!n)
-                       break;
-               x *= x;
-               x += 1UL << (frac_bits - 1);
-               x >>= frac_bits;
         }
  
         return result;
@@ -285,7 +290,6 @@ static unsigned long
  calc_load_n(unsigned long load, unsigned long exp,
             unsigned long active, unsigned int n)
  {
-
         return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
  }
  
@@ -339,6 +343,8 @@ static inline void calc_global_nohz(void) { }
  /*
   * calc_load - update the avenrun load estimates 10 ticks after the
   * CPUs have updated calc_load_tasks.
+ *
+ * Called from the global timer code.
   */
  void calc_global_load(unsigned long ticks)
  {
@@ -370,10 +376,10 @@ void calc_global_load(unsigned long ticks)
  }
  
  /*
- * Called from update_cpu_load() to periodically update this CPU's
+ * Called from scheduler_tick() to periodically update this CPU's
   * active count.
   */
-static void calc_load_account_active(struct rq *this_rq)
+void calc_global_load_tick(struct rq *this_rq)
  {
         long delta;
  
@@ -386,199 +392,3 @@ static void calc_load_account_active(struct rq *this_rq)
  
         this_rq->calc_load_update += LOAD_FREQ;
  }
-
-/*
- * End of global load-average stuff
- */
-
-/*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
- *
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
- *
- * decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
- *
- * The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
- */
-#define DEGRADE_SHIFT          7
-static const unsigned char
-               degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
-               degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
-                                       {0, 0, 0, 0, 0, 0, 0, 0},
-                                       {64, 32, 8, 0, 0, 0, 0, 0},
-                                       {96, 72, 40, 12, 1, 0, 0},
-                                       {112, 98, 75, 43, 15, 1, 0},
-                                       {120, 112, 98, 76, 45, 16, 2} };
-
-/*
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
- * would be when CPU is idle and so we just decay the old load without
- * adding any new load.
- */
-static unsigned long
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
-{
-       int j = 0;
-
-       if (!missed_updates)
-               return load;
-
-       if (missed_updates >= degrade_zero_ticks[idx])
-               return 0;
-
-       if (idx == 1)
-               return load >> missed_updates;
-
-       while (missed_updates) {
-               if (missed_updates % 2)
-                       load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
-
-               missed_updates >>= 1;
-               j++;
-       }
-       return load;
-}
-
-/*
- * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
- */
-static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
-                             unsigned long pending_updates)
-{
-       int i, scale;
-
-       this_rq->nr_load_updates++;
-
-       /* Update our load: */
-       this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
-       for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
-               unsigned long old_load, new_load;
-
-               /* scale is effectively 1 << i now, and >> i divides by scale */
-
-               old_load = this_rq->cpu_load[i];
-               old_load = decay_load_missed(old_load, pending_updates - 1, i);
-               new_load = this_load;
-               /*
-                * Round up the averaging division if load is increasing. This
-                * prevents us from getting stuck on 9 if the load is 10, for
-                * example.
-                */
-               if (new_load > old_load)
-                       new_load += scale - 1;
-
-               this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
-       }
-
-       sched_avg_update(this_rq);
-}
-
-#ifdef CONFIG_SMP
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
-       return rq->cfs.runnable_load_avg;
-}
-#else
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
-       return rq->load.weight;
-}
-#endif
-
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we cannot use the delta approach from the regular tick since that
- * would seriously skew the load calculation. However we'll make do for those
- * updates happening while idle (nohz_idle_balance) or coming out of idle
- * (tick_nohz_idle_exit).
- *
- * This means we might still be one tick off for nohz periods.
- */
-
-/*
- * Called from nohz_idle_balance() to update the load ratings before doing the
- * idle balance.
- */
-void update_idle_cpu_load(struct rq *this_rq)
-{
-       unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
-       unsigned long load = get_rq_runnable_load(this_rq);
-       unsigned long pending_updates;
-
-       /*
-        * bail if there's load or we're actually up-to-date.
-        */
-       if (load || curr_jiffies == this_rq->last_load_update_tick)
-               return;
-
-       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-       this_rq->last_load_update_tick = curr_jiffies;
-
-       __update_cpu_load(this_rq, load, pending_updates);
-}
-
-/*
- * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
- */
-void update_cpu_load_nohz(void)
-{
-       struct rq *this_rq = this_rq();
-       unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
-       unsigned long pending_updates;
-
-       if (curr_jiffies == this_rq->last_load_update_tick)
-               return;
-
-       raw_spin_lock(&this_rq->lock);
-       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-       if (pending_updates) {
-               this_rq->last_load_update_tick = curr_jiffies;
-               /*
-                * We were idle, this means load 0, the current load might be
-                * !0 due to remote wakeups and the sort.
-                */
-               __update_cpu_load(this_rq, 0, pending_updates);
-       }
-       raw_spin_unlock(&this_rq->lock);
-}
-#endif /* CONFIG_NO_HZ */
-
-/*
- * Called from scheduler_tick()
- */
-void update_cpu_load_active(struct rq *this_rq)
-{
-       unsigned long load = get_rq_runnable_load(this_rq);
-       /*
-        * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
-        */
-       this_rq->last_load_update_tick = jiffies;
-       __update_cpu_load(this_rq, load, 1);
-
-       calc_load_account_active(this_rq);
-}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index e0e1299939588ac47f08b13b45f1a6e2e9cf4d7f..09ed26a89f31186ae1ce8a8d9c95c59c37af4401 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -26,8 +26,14 @@ extern __read_mostly int scheduler_running;
  extern unsigned long calc_load_update;
  extern atomic_long_t calc_load_tasks;
  
+extern void calc_global_load_tick(struct rq *this_rq);
  extern long calc_load_fold_active(struct rq *this_rq);
+
+#ifdef CONFIG_SMP
  extern void update_cpu_load_active(struct rq *this_rq);
+#else
+static inline void update_cpu_load_active(struct rq *this_rq) { }
+#endif
  
  /*
   * Helpers for converting nanosecond timing to jiffy resolution
@@ -1298,8 +1304,6 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
  
  unsigned long to_ratio(u64 period, u64 runtime);
  
-extern void update_idle_cpu_load(struct rq *this_rq);
-
  extern void init_task_runnable_average(struct task_struct *p);
  
  static inline void add_nr_running(struct rq *rq, unsigned count)
author	Peter Zijlstra <peterz@infradead.org>
	Tue, 14 Apr 2015 11:19:42 +0000 (13:19 +0200)
committer	Ingo Molnar <mingo@kernel.org>
	Fri, 8 May 2015 10:04:12 +0000 (12:04 +0200)
include/linux/sched.h		patch \| blob \| history
kernel/sched/Makefile		patch \| blob \| history
kernel/sched/core.c		patch \| blob \| history
kernel/sched/fair.c		patch \| blob \| history
kernel/sched/loadavg.c	[moved from kernel/sched/proc.c with 62% similarity]	patch \| blob \| history
kernel/sched/sched.h		patch \| blob \| history