git.kernelconcepts.de Git - karo-tx-linux.git/commitdiff
Merge branch 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
author    Linus Torvalds <torvalds@linux-foundation.org>
          Tue, 23 Jun 2015 01:57:44 +0000 (18:57 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Tue, 23 Jun 2015 01:57:44 +0000 (18:57 -0700)
Pull timer updates from Thomas Gleixner:
 "A rather largish update for everything time and timer related:

   - Cache footprint optimizations for both hrtimers and timer wheel

   - Lower the NOHZ impact on systems which have NOHZ or timer migration
     disabled at runtime.

   - Optimize run time overhead of hrtimer interrupt by making the clock
     offset updates smarter

   - hrtimer cleanups and removal of restrictions to tackle some
     problems in sched/perf

   - Some more leap second tweaks

   - Another round of changes addressing the 2038 problem

   - First step to change the internals of clock event devices by
     introducing the necessary infrastructure

   - Allow constant folding for usecs/msecs_to_jiffies()

   - The usual pile of clockevent/clocksource driver updates

  The hrtimer changes contain updates to sched, perf and x86 as they
  depend on them, plus changes all over the tree to clean up API changes
  and redundant code, which got copied all over the place.  The y2038
  changes touch s390 to remove the last non-2038-safe code related to
  the boot/persistent clock"
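
A note on the "Allow constant folding for usecs/msecs_to_jiffies()" item in
the message above: the idea is that a conversion called with a
compile-time-constant argument should collapse to a constant instead of going
through the out-of-line runtime helper. Below is a minimal, self-contained
sketch of that pattern only; the names, the SKETCH_HZ value and the rounding
formula are illustrative and are not the kernel's actual implementation.

/*
 * Sketch of the constant-folding pattern: __builtin_constant_p() selects
 * the branch the compiler can evaluate at build time, so constant
 * arguments reduce to a literal and only variable arguments reach the
 * runtime helper.
 */
#include <stdio.h>

#define SKETCH_HZ 250   /* illustrative tick rate, not CONFIG_HZ */

/* runtime fallback, standing in for the kernel's out-of-line helper */
static unsigned long sketch_msecs_to_jiffies_rt(unsigned int m)
{
        return (m + (1000 / SKETCH_HZ) - 1) / (1000 / SKETCH_HZ);
}

/* constant path: with a literal argument this folds to a constant */
#define sketch_msecs_to_jiffies(m)                                      \
        (__builtin_constant_p(m)                                        \
                ? (unsigned long)(((m) + (1000 / SKETCH_HZ) - 1) /      \
                                  (1000 / SKETCH_HZ))                   \
                : sketch_msecs_to_jiffies_rt(m))

int main(void)
{
        unsigned int msecs = 1500;

        printf("%lu\n", sketch_msecs_to_jiffies(100));   /* folds to 25 */
        printf("%lu\n", sketch_msecs_to_jiffies(msecs)); /* runtime path */
        return 0;
}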

* 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (114 commits)
  clocksource: Increase dependencies of timer-stm32 to limit build wreckage
  timer: Minimize nohz off overhead
  timer: Reduce timer migration overhead if disabled
  timer: Stats: Simplify the flags handling
  timer: Replace timer base by a cpu index
  timer: Use hlist for the timer wheel hash buckets
  timer: Remove FIFO "guarantee"
  timers: Sanitize catchup_timer_jiffies() usage
  hrtimer: Allow hrtimer::function() to free the timer
  seqcount: Introduce raw_write_seqcount_barrier()
  seqcount: Rename write_seqcount_barrier()
  hrtimer: Fix hrtimer_is_queued() hole
  hrtimer: Remove HRTIMER_STATE_MIGRATE
  selftest: Timers: Avoid signal deadlock in leap-a-day
  timekeeping: Copy the shadow-timekeeper over the real timekeeper last
  clockevents: Check state instead of mode in suspend/resume path
  selftests: timers: Add leap-second timer edge testing to leap-a-day.c
  ntp: Do leapsecond adjustment in adjtimex read path
  time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
  ntp: Introduce and use SECS_PER_DAY macro instead of 86400
  ...

18 files changed:
arch/x86/kernel/cpu/perf_event_intel_uncore.c
fs/dcache.c
include/linux/perf_event.h
include/linux/rcupdate.h
include/linux/rcutree.h
include/linux/sched.h
kernel/events/core.c
kernel/futex.c
kernel/locking/rtmutex.c
kernel/rcu/tree_plugin.h
kernel/sched/core.c
kernel/sched/deadline.c
kernel/sched/debug.c
kernel/sched/fair.c
kernel/sched/rt.c
kernel/sched/sched.h
kernel/time/hrtimer.c
net/sched/sch_api.c

diff --combined arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 7c1de16101782b37e2ace385c55d0f7941a0d90d,7c411f0e58fd5fa0eb7368dd4bf9100c9eae3cbe..21b5e38c921b7a78102a2adbabf06328b56dbf9b
@@@ -233,9 -233,8 +233,8 @@@ static enum hrtimer_restart uncore_pmu_
  
  void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
  {
-       __hrtimer_start_range_ns(&box->hrtimer,
-                       ns_to_ktime(box->hrtimer_duration), 0,
-                       HRTIMER_MODE_REL_PINNED, 0);
+       hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration),
+                     HRTIMER_MODE_REL_PINNED);
  }
  
  void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
@@@ -365,8 -364,9 +364,8 @@@ static int uncore_assign_events(struct 
        bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
  
        for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
 -              hwc = &box->event_list[i]->hw;
                c = uncore_get_event_constraint(box, box->event_list[i]);
 -              hwc->constraint = c;
 +              box->event_constraint[i] = c;
                wmin = min(wmin, c->weight);
                wmax = max(wmax, c->weight);
        }
        /* fastpath, try to reuse previous register */
        for (i = 0; i < n; i++) {
                hwc = &box->event_list[i]->hw;
 -              c = hwc->constraint;
 +              c = box->event_constraint[i];
  
                /* never assigned */
                if (hwc->idx == -1)
        }
        /* slow path */
        if (i != n)
 -              ret = perf_assign_events(box->event_list, n,
 -                                       wmin, wmax, assign);
 +              ret = perf_assign_events(box->event_constraint, n,
 +                                       wmin, wmax, n, assign);
  
        if (!assign || ret) {
                for (i = 0; i < n; i++)
@@@ -839,7 -839,6 +838,7 @@@ static int uncore_pci_probe(struct pci_
        box->phys_id = phys_id;
        box->pci_dev = pdev;
        box->pmu = pmu;
 +      uncore_box_init(box);
        pci_set_drvdata(pdev, box);
  
        raw_spin_lock(&uncore_box_lock);
@@@ -922,9 -921,6 +921,9 @@@ static int __init uncore_pci_init(void
        case 69: /* Haswell Celeron */
                ret = hsw_uncore_pci_init();
                break;
 +      case 61: /* Broadwell */
 +              ret = bdw_uncore_pci_init();
 +              break;
        default:
                return 0;
        }
@@@ -1006,10 -1002,8 +1005,10 @@@ static int uncore_cpu_starting(int cpu
                        pmu = &type->pmus[j];
                        box = *per_cpu_ptr(pmu->box, cpu);
                        /* called by uncore_cpu_init? */
 -                      if (box && box->phys_id >= 0)
 +                      if (box && box->phys_id >= 0) {
 +                              uncore_box_init(box);
                                continue;
 +                      }
  
                        for_each_online_cpu(k) {
                                exist = *per_cpu_ptr(pmu->box, k);
                                }
                        }
  
 -                      if (box)
 +                      if (box) {
                                box->phys_id = phys_id;
 +                              uncore_box_init(box);
 +                      }
                }
        }
        return 0;
diff --combined fs/dcache.c
index 37b5afdaf6989e211151cc55a7fa656a6addd364,b43a1694d2caebd475148ebafbd68d751cdc0b99..592c4b582495b515c52a2aa3458be3422953c111
@@@ -322,17 -322,17 +322,17 @@@ static void dentry_free(struct dentry *
  }
  
  /**
-  * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
+  * dentry_rcuwalk_invalidate - invalidate in-progress rcu-walk lookups
   * @dentry: the target dentry
   * After this call, in-progress rcu-walk path lookup will fail. This
   * should be called after unhashing, and after changing d_inode (if
   * the dentry has not already been unhashed).
   */
- static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
+ static inline void dentry_rcuwalk_invalidate(struct dentry *dentry)
  {
-       assert_spin_locked(&dentry->d_lock);
-       /* Go through a barrier */
-       write_seqcount_barrier(&dentry->d_seq);
+       lockdep_assert_held(&dentry->d_lock);
+       /* Go through an invalidation barrier */
+       write_seqcount_invalidate(&dentry->d_seq);
  }
  
  /*
@@@ -372,7 -372,7 +372,7 @@@ static void dentry_unlink_inode(struct 
        struct inode *inode = dentry->d_inode;
        __d_clear_type_and_inode(dentry);
        hlist_del_init(&dentry->d_u.d_alias);
-       dentry_rcuwalk_barrier(dentry);
+       dentry_rcuwalk_invalidate(dentry);
        spin_unlock(&dentry->d_lock);
        spin_unlock(&inode->i_lock);
        if (!inode->i_nlink)
@@@ -494,7 -494,7 +494,7 @@@ void __d_drop(struct dentry *dentry
                __hlist_bl_del(&dentry->d_hash);
                dentry->d_hash.pprev = NULL;
                hlist_bl_unlock(b);
-               dentry_rcuwalk_barrier(dentry);
+               dentry_rcuwalk_invalidate(dentry);
        }
  }
  EXPORT_SYMBOL(__d_drop);
@@@ -1239,13 -1239,13 +1239,13 @@@ ascend
                /* might go back up the wrong parent if we have had a rename. */
                if (need_seqretry(&rename_lock, seq))
                        goto rename_retry;
 -              next = child->d_child.next;
 -              while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)) {
 +              /* go into the first sibling still alive */
 +              do {
 +                      next = child->d_child.next;
                        if (next == &this_parent->d_subdirs)
                                goto ascend;
                        child = list_entry(next, struct dentry, d_child);
 -                      next = next->next;
 -              }
 +              } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
                rcu_read_unlock();
                goto resume;
        }
@@@ -1752,7 -1752,7 +1752,7 @@@ static void __d_instantiate(struct dent
        if (inode)
                hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
        __d_set_inode_and_type(dentry, inode, add_flags);
-       dentry_rcuwalk_barrier(dentry);
+       dentry_rcuwalk_invalidate(dentry);
        spin_unlock(&dentry->d_lock);
        fsnotify_d_instantiate(dentry, inode);
  }
diff --combined include/linux/perf_event.h
index a204d5266f5f0fc6ba6c175e9580d9b00c9621dc,cf3342a8ad807c7583d0b2ea1a72bc34735f3e5b..1b82d44b0a02d278f980acb0ae2158d2462fb329
@@@ -92,6 -92,8 +92,6 @@@ struct hw_perf_event_extra 
        int             idx;    /* index in shared_regs->regs[] */
  };
  
 -struct event_constraint;
 -
  /**
   * struct hw_perf_event - performance event hardware details:
   */
@@@ -110,6 -112,8 +110,6 @@@ struct hw_perf_event 
  
                        struct hw_perf_event_extra extra_reg;
                        struct hw_perf_event_extra branch_reg;
 -
 -                      struct event_constraint *constraint;
                };
                struct { /* software */
                        struct hrtimer  hrtimer;
                };
                struct { /* intel_cqm */
                        int                     cqm_state;
 -                      int                     cqm_rmid;
 +                      u32                     cqm_rmid;
                        struct list_head        cqm_events_entry;
                        struct list_head        cqm_groups_entry;
                        struct list_head        cqm_group_entry;
@@@ -562,8 -566,12 +562,12 @@@ struct perf_cpu_context 
        struct perf_event_context       *task_ctx;
        int                             active_oncpu;
        int                             exclusive;
+       raw_spinlock_t                  hrtimer_lock;
        struct hrtimer                  hrtimer;
        ktime_t                         hrtimer_interval;
+       unsigned int                    hrtimer_active;
        struct pmu                      *unique_pmu;
        struct perf_cgroup              *cgrp;
  };
@@@ -730,22 -738,6 +734,22 @@@ extern int perf_event_overflow(struct p
                                 struct perf_sample_data *data,
                                 struct pt_regs *regs);
  
 +extern void perf_event_output(struct perf_event *event,
 +                              struct perf_sample_data *data,
 +                              struct pt_regs *regs);
 +
 +extern void
 +perf_event_header__init_id(struct perf_event_header *header,
 +                         struct perf_sample_data *data,
 +                         struct perf_event *event);
 +extern void
 +perf_event__output_id_sample(struct perf_event *event,
 +                           struct perf_output_handle *handle,
 +                           struct perf_sample_data *sample);
 +
 +extern void
 +perf_log_lost_samples(struct perf_event *event, u64 lost);
 +
  static inline bool is_sampling_event(struct perf_event *event)
  {
        return event->attr.sample_period != 0;
@@@ -810,33 -802,11 +814,33 @@@ perf_sw_event_sched(u32 event_id, u64 n
  
  extern struct static_key_deferred perf_sched_events;
  
 +static __always_inline bool
 +perf_sw_migrate_enabled(void)
 +{
 +      if (static_key_false(&perf_swevent_enabled[PERF_COUNT_SW_CPU_MIGRATIONS]))
 +              return true;
 +      return false;
 +}
 +
 +static inline void perf_event_task_migrate(struct task_struct *task)
 +{
 +      if (perf_sw_migrate_enabled())
 +              task->sched_migrated = 1;
 +}
 +
  static inline void perf_event_task_sched_in(struct task_struct *prev,
                                            struct task_struct *task)
  {
        if (static_key_false(&perf_sched_events.key))
                __perf_event_task_sched_in(prev, task);
 +
 +      if (perf_sw_migrate_enabled() && task->sched_migrated) {
 +              struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
 +
 +              perf_fetch_caller_regs(regs);
 +              ___perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, regs, 0);
 +              task->sched_migrated = 0;
 +      }
  }
  
  static inline void perf_event_task_sched_out(struct task_struct *prev,
@@@ -959,8 -929,6 +963,8 @@@ perf_aux_output_skip(struct perf_output
  static inline void *
  perf_get_aux(struct perf_output_handle *handle)                               { return NULL; }
  static inline void
 +perf_event_task_migrate(struct task_struct *task)                     { }
 +static inline void
  perf_event_task_sched_in(struct task_struct *prev,
                         struct task_struct *task)                      { }
  static inline void
diff --combined include/linux/rcupdate.h
index 03a899aabd1762c74bb86b4b807dbf29651fad42,0627a447c589dedd3c0c82aa6ca8d39c86bad037..33a056bb886faeedeb9690faefd3a4adeeedd14b
@@@ -44,6 -44,8 +44,8 @@@
  #include <linux/debugobjects.h>
  #include <linux/bug.h>
  #include <linux/compiler.h>
+ #include <linux/ktime.h>
  #include <asm/barrier.h>
  
  extern int rcu_expedited; /* for sysctl */
@@@ -292,6 -294,10 +294,6 @@@ void rcu_sched_qs(void)
  void rcu_bh_qs(void);
  void rcu_check_callbacks(int user);
  struct notifier_block;
 -void rcu_idle_enter(void);
 -void rcu_idle_exit(void);
 -void rcu_irq_enter(void);
 -void rcu_irq_exit(void);
  int rcu_cpu_notify(struct notifier_block *self,
                   unsigned long action, void *hcpu);
  
@@@ -360,8 -366,8 +362,8 @@@ extern struct srcu_struct tasks_rcu_exi
  #define rcu_note_voluntary_context_switch(t) \
        do { \
                rcu_all_qs(); \
 -              if (ACCESS_ONCE((t)->rcu_tasks_holdout)) \
 -                      ACCESS_ONCE((t)->rcu_tasks_holdout) = false; \
 +              if (READ_ONCE((t)->rcu_tasks_holdout)) \
 +                      WRITE_ONCE((t)->rcu_tasks_holdout, false); \
        } while (0)
  #else /* #ifdef CONFIG_TASKS_RCU */
  #define TASKS_RCU(x) do { } while (0)
@@@ -605,7 -611,7 +607,7 @@@ static inline void rcu_preempt_sleep_ch
  
  #define __rcu_access_pointer(p, space) \
  ({ \
 -      typeof(*p) *_________p1 = (typeof(*p) *__force)ACCESS_ONCE(p); \
 +      typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \
        rcu_dereference_sparse(p, space); \
        ((typeof(*p) __force __kernel *)(_________p1)); \
  })
        ((typeof(*p) __force __kernel *)(p)); \
  })
  
 -#define __rcu_access_index(p, space) \
 -({ \
 -      typeof(p) _________p1 = ACCESS_ONCE(p); \
 -      rcu_dereference_sparse(p, space); \
 -      (_________p1); \
 -})
 -#define __rcu_dereference_index_check(p, c) \
 -({ \
 -      /* Dependency order vs. p above. */ \
 -      typeof(p) _________p1 = lockless_dereference(p); \
 -      rcu_lockdep_assert(c, \
 -                         "suspicious rcu_dereference_index_check() usage"); \
 -      (_________p1); \
 -})
 -
  /**
   * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
   * @v: The value to statically initialize with.
   */
  #define lockless_dereference(p) \
  ({ \
 -      typeof(p) _________p1 = ACCESS_ONCE(p); \
 +      typeof(p) _________p1 = READ_ONCE(p); \
        smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
        (_________p1); \
  })
   * @p: The pointer to read
   *
   * Return the value of the specified RCU-protected pointer, but omit the
 - * smp_read_barrier_depends() and keep the ACCESS_ONCE().  This is useful
 + * smp_read_barrier_depends() and keep the READ_ONCE().  This is useful
   * when the value of this pointer is accessed, but the pointer is not
   * dereferenced, for example, when testing an RCU-protected pointer against
   * NULL.  Although rcu_access_pointer() may also be used in cases where
   */
  #define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu)
  
 -/**
 - * rcu_access_index() - fetch RCU index with no dereferencing
 - * @p: The index to read
 - *
 - * Return the value of the specified RCU-protected index, but omit the
 - * smp_read_barrier_depends() and keep the ACCESS_ONCE().  This is useful
 - * when the value of this index is accessed, but the index is not
 - * dereferenced, for example, when testing an RCU-protected index against
 - * -1.  Although rcu_access_index() may also be used in cases where
 - * update-side locks prevent the value of the index from changing, you
 - * should instead use rcu_dereference_index_protected() for this use case.
 - */
 -#define rcu_access_index(p) __rcu_access_index((p), __rcu)
 -
 -/**
 - * rcu_dereference_index_check() - rcu_dereference for indices with debug checking
 - * @p: The pointer to read, prior to dereferencing
 - * @c: The conditions under which the dereference will take place
 - *
 - * Similar to rcu_dereference_check(), but omits the sparse checking.
 - * This allows rcu_dereference_index_check() to be used on integers,
 - * which can then be used as array indices.  Attempting to use
 - * rcu_dereference_check() on an integer will give compiler warnings
 - * because the sparse address-space mechanism relies on dereferencing
 - * the RCU-protected pointer.  Dereferencing integers is not something
 - * that even gcc will put up with.
 - *
 - * Note that this function does not implicitly check for RCU read-side
 - * critical sections.  If this function gains lots of uses, it might
 - * make sense to provide versions for each flavor of RCU, but it does
 - * not make sense as of early 2010.
 - */
 -#define rcu_dereference_index_check(p, c) \
 -      __rcu_dereference_index_check((p), (c))
 -
  /**
   * rcu_dereference_protected() - fetch RCU pointer when updates prevented
   * @p: The pointer to read, prior to dereferencing
   * @c: The conditions under which the dereference will take place
   *
   * Return the value of the specified RCU-protected pointer, but omit
 - * both the smp_read_barrier_depends() and the ACCESS_ONCE().  This
 + * both the smp_read_barrier_depends() and the READ_ONCE().  This
   * is useful in cases where update-side locks prevent the value of the
   * pointer from changing.  Please note that this primitive does -not-
   * prevent the compiler from repeating this reference or combining it
@@@ -1099,13 -1155,13 +1101,13 @@@ static inline notrace void rcu_read_unl
  #define kfree_rcu(ptr, rcu_head)                                      \
        __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))
  
 -#if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL)
 +#ifdef CONFIG_TINY_RCU
- static inline int rcu_needs_cpu(unsigned long *delta_jiffies)
+ static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt)
  {
-       *delta_jiffies = ULONG_MAX;
+       *nextevt = KTIME_MAX;
        return 0;
  }
 -#endif /* #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) */
 +#endif /* #ifdef CONFIG_TINY_RCU */
  
  #if defined(CONFIG_RCU_NOCB_CPU_ALL)
  static inline bool rcu_is_nocb_cpu(int cpu) { return true; }
diff --combined include/linux/rcutree.h
index 3fa4a43ab4150b0b8c956ebacc5de47dace12462,db2e31beaae7c5f179972ec1a49051c240a90329..456879143f89f9db45d0f79315f728f50a9f9d0c
@@@ -31,7 -31,9 +31,7 @@@
  #define __LINUX_RCUTREE_H
  
  void rcu_note_context_switch(void);
- int rcu_needs_cpu(unsigned long *delta_jiffies);
 -#ifndef CONFIG_RCU_NOCB_CPU_ALL
+ int rcu_needs_cpu(u64 basem, u64 *nextevt);
 -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
  void rcu_cpu_stall_reset(void);
  
  /*
@@@ -91,11 -93,6 +91,11 @@@ void rcu_force_quiescent_state(void)
  void rcu_bh_force_quiescent_state(void);
  void rcu_sched_force_quiescent_state(void);
  
 +void rcu_idle_enter(void);
 +void rcu_idle_exit(void);
 +void rcu_irq_enter(void);
 +void rcu_irq_exit(void);
 +
  void exit_rcu(void);
  
  void rcu_scheduler_starting(void);
diff --combined include/linux/sched.h
index d4193d5613cf594108390e5d953451fd73087de5,d7151460b0cfc98d437211a347524164a5d3bc0f..30364cb58b1fa8348bec5169741bf5263b3ab15a
@@@ -25,7 -25,7 +25,7 @@@ struct sched_param 
  #include <linux/errno.h>
  #include <linux/nodemask.h>
  #include <linux/mm_types.h>
 -#include <linux/preempt_mask.h>
 +#include <linux/preempt.h>
  
  #include <asm/page.h>
  #include <asm/ptrace.h>
@@@ -132,7 -132,6 +132,7 @@@ struct fs_struct
  struct perf_event_context;
  struct blk_plug;
  struct filename;
 +struct nameidata;
  
  #define VMACACHE_BITS 2
  #define VMACACHE_SIZE (1U << VMACACHE_BITS)
@@@ -174,12 -173,7 +174,12 @@@ extern unsigned long nr_iowait_cpu(int 
  extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
  
  extern void calc_global_load(unsigned long ticks);
 +
 +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
  extern void update_cpu_load_nohz(void);
 +#else
 +static inline void update_cpu_load_nohz(void) { }
 +#endif
  
  extern unsigned long get_parent_ip(unsigned long addr);
  
@@@ -219,10 -213,9 +219,10 @@@ print_cfs_rq(struct seq_file *m, int cp
  #define TASK_WAKEKILL         128
  #define TASK_WAKING           256
  #define TASK_PARKED           512
 -#define TASK_STATE_MAX                1024
 +#define TASK_NOLOAD           1024
 +#define TASK_STATE_MAX                2048
  
 -#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWP"
 +#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN"
  
  extern char ___assert_task_state[1 - 2*!!(
                sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
  #define TASK_STOPPED          (TASK_WAKEKILL | __TASK_STOPPED)
  #define TASK_TRACED           (TASK_WAKEKILL | __TASK_TRACED)
  
 +#define TASK_IDLE             (TASK_UNINTERRUPTIBLE | TASK_NOLOAD)
 +
  /* Convenience macros for the sake of wake_up */
  #define TASK_NORMAL           (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
  #define TASK_ALL              (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
                        ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
  #define task_contributes_to_load(task)        \
                                ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
 -                               (task->flags & PF_FROZEN) == 0)
 +                               (task->flags & PF_FROZEN) == 0 && \
 +                               (task->state & TASK_NOLOAD) == 0)
  
  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
  
  #define set_task_state(tsk, state_value)                      \
        do {                                                    \
                (tsk)->task_state_change = _THIS_IP_;           \
 -              set_mb((tsk)->state, (state_value));            \
 +              smp_store_mb((tsk)->state, (state_value));              \
        } while (0)
  
  /*
  #define set_current_state(state_value)                                \
        do {                                                    \
                current->task_state_change = _THIS_IP_;         \
 -              set_mb(current->state, (state_value));          \
 +              smp_store_mb(current->state, (state_value));            \
        } while (0)
  
  #else
  #define __set_task_state(tsk, state_value)            \
        do { (tsk)->state = (state_value); } while (0)
  #define set_task_state(tsk, state_value)              \
 -      set_mb((tsk)->state, (state_value))
 +      smp_store_mb((tsk)->state, (state_value))
  
  /*
   * set_current_state() includes a barrier so that the write of current->state
  #define __set_current_state(state_value)              \
        do { current->state = (state_value); } while (0)
  #define set_current_state(state_value)                        \
 -      set_mb(current->state, (state_value))
 +      smp_store_mb(current->state, (state_value))
  
  #endif
  
@@@ -345,14 -335,10 +345,10 @@@ extern int runqueue_is_locked(int cpu)
  #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
  extern void nohz_balance_enter_idle(int cpu);
  extern void set_cpu_sd_state_idle(void);
- extern int get_nohz_timer_target(int pinned);
+ extern int get_nohz_timer_target(void);
  #else
  static inline void nohz_balance_enter_idle(int cpu) { }
  static inline void set_cpu_sd_state_idle(void) { }
- static inline int get_nohz_timer_target(int pinned)
- {
-       return smp_processor_id();
- }
  #endif
  
  /*
@@@ -577,23 -563,6 +573,23 @@@ struct task_cputime 
                .sum_exec_runtime = 0,                          \
        }
  
 +/*
 + * This is the atomic variant of task_cputime, which can be used for
 + * storing and updating task_cputime statistics without locking.
 + */
 +struct task_cputime_atomic {
 +      atomic64_t utime;
 +      atomic64_t stime;
 +      atomic64_t sum_exec_runtime;
 +};
 +
 +#define INIT_CPUTIME_ATOMIC \
 +      (struct task_cputime_atomic) {                          \
 +              .utime = ATOMIC64_INIT(0),                      \
 +              .stime = ATOMIC64_INIT(0),                      \
 +              .sum_exec_runtime = ATOMIC64_INIT(0),           \
 +      }
 +
  #ifdef CONFIG_PREEMPT_COUNT
  #define PREEMPT_DISABLED      (1 + PREEMPT_ENABLED)
  #else
  
  /**
   * struct thread_group_cputimer - thread group interval timer counts
 - * @cputime:          thread group interval timers.
 + * @cputime_atomic:   atomic thread group interval timers.
   * @running:          non-zero when there are timers running and
   *                    @cputime receives updates.
 - * @lock:             lock for fields in this struct.
   *
   * This structure contains the version of task_cputime, above, that is
   * used for thread group CPU timer calculations.
   */
  struct thread_group_cputimer {
 -      struct task_cputime cputime;
 +      struct task_cputime_atomic cputime_atomic;
        int running;
 -      raw_spinlock_t lock;
  };
  
  #include <linux/rwsem.h>
@@@ -924,50 -895,6 +920,50 @@@ enum cpu_idle_type 
  #define SCHED_CAPACITY_SHIFT  10
  #define SCHED_CAPACITY_SCALE  (1L << SCHED_CAPACITY_SHIFT)
  
 +/*
 + * Wake-queues are lists of tasks with a pending wakeup, whose
 + * callers have already marked the task as woken internally,
 + * and can thus carry on. A common use case is being able to
 + * do the wakeups once the corresponding user lock has been
 + * released.
 + *
 + * We hold reference to each task in the list across the wakeup,
 + * thus guaranteeing that the memory is still valid by the time
 + * the actual wakeups are performed in wake_up_q().
 + *
 + * One per task suffices, because there's never a need for a task to be
 + * in two wake queues simultaneously; it is forbidden to abandon a task
 + * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is
 + * already in a wake queue, the wakeup will happen soon and the second
 + * waker can just skip it.
 + *
 + * The WAKE_Q macro declares and initializes the list head.
 + * wake_up_q() does NOT reinitialize the list; it's expected to be
 + * called near the end of a function, where the fact that the queue is
 + * not used again will be easy to see by inspection.
 + *
 + * Note that this can cause spurious wakeups. schedule() callers
 + * must ensure the call is done inside a loop, confirming that the
 + * wakeup condition has in fact occurred.
 + */
 +struct wake_q_node {
 +      struct wake_q_node *next;
 +};
 +
 +struct wake_q_head {
 +      struct wake_q_node *first;
 +      struct wake_q_node **lastp;
 +};
 +
 +#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
 +
 +#define WAKE_Q(name)                                  \
 +      struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
 +
 +extern void wake_q_add(struct wake_q_head *head,
 +                     struct task_struct *task);
 +extern void wake_up_q(struct wake_q_head *head);
 +
  /*
   * sched-domains (multiprocessor balancing) declarations:
   */
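
The wake_q comment and declarations in the hunk above introduce the new
deferred-wakeup helpers; the kernel/futex.c hunks later in this commit use
them. Below is a minimal sketch of the calling pattern only: the waiter
structure, its list and the spinlock are hypothetical, and just WAKE_Q(),
wake_q_add() and wake_up_q() come from the interface added above.

#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/list.h>

/* Hypothetical waiter kept on a lock-protected list. */
struct my_waiter {
        struct task_struct      *task;
        struct list_head        entry;
};

static void wake_all_waiters(spinlock_t *lock, struct list_head *waiters)
{
        struct my_waiter *w, *tmp;
        WAKE_Q(wake_q);                 /* on-stack wake queue */

        spin_lock(lock);
        list_for_each_entry_safe(w, tmp, waiters, entry) {
                /* only mark the task under the lock; no wakeup yet */
                wake_q_add(&wake_q, w->task);
                list_del_init(&w->entry);
        }
        spin_unlock(lock);

        /* perform the actual wakeups after the lock is dropped */
        wake_up_q(&wake_q);
}

The futex hunks below apply the same split around hb->lock: mark_wake_futex()
only calls wake_q_add(), and wake_up_q() runs after spin_unlock(&hb->lock).
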
@@@ -1403,6 -1330,8 +1399,6 @@@ struct task_struct 
        int rcu_read_lock_nesting;
        union rcu_special rcu_read_unlock_special;
        struct list_head rcu_node_entry;
 -#endif /* #ifdef CONFIG_PREEMPT_RCU */
 -#ifdef CONFIG_PREEMPT_RCU
        struct rcu_node *rcu_blocked_node;
  #endif /* #ifdef CONFIG_PREEMPT_RCU */
  #ifdef CONFIG_TASKS_RCU
  #endif
  
        struct mm_struct *mm, *active_mm;
 -#ifdef CONFIG_COMPAT_BRK
 -      unsigned brk_randomized:1;
 -#endif
        /* per-thread vma caching */
        u32 vmacache_seqnum;
        struct vm_area_struct *vmacache[VMACACHE_SIZE];
        int exit_state;
        int exit_code, exit_signal;
        int pdeath_signal;  /*  The signal sent when the parent dies  */
 -      unsigned int jobctl;    /* JOBCTL_*, siglock protected */
 +      unsigned long jobctl;   /* JOBCTL_*, siglock protected */
  
        /* Used for emulating ABI behavior of previous Linux versions */
        unsigned int personality;
        /* Revert to default priority/policy when forking */
        unsigned sched_reset_on_fork:1;
        unsigned sched_contributes_to_load:1;
 +      unsigned sched_migrated:1;
  
  #ifdef CONFIG_MEMCG_KMEM
        unsigned memcg_kmem_skip_account:1;
  #endif
 +#ifdef CONFIG_COMPAT_BRK
 +      unsigned brk_randomized:1;
 +#endif
  
        unsigned long atomic_flags; /* Flags needing atomic access. */
  
                                       it with task_lock())
                                     - initialized normally by setup_new_exec */
  /* file system info */
 -      int link_count, total_link_count;
 +      struct nameidata *nameidata;
  #ifdef CONFIG_SYSVIPC
  /* ipc stuff */
        struct sysv_sem sysvsem;
        /* Protection of the PI data structures: */
        raw_spinlock_t pi_lock;
  
 +      struct wake_q_node wake_q;
 +
  #ifdef CONFIG_RT_MUTEXES
        /* PI waiters blocked on a rt_mutex held by this task */
        struct rb_root pi_waiters;
  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
        unsigned long   task_state_change;
  #endif
 +      int pagefault_disabled;
  };
  
  /* Future-safe accessor for struct task_struct's cpus_allowed. */
@@@ -2148,22 -2073,22 +2144,22 @@@ TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab
  #define JOBCTL_TRAPPING_BIT   21      /* switching to TRACED */
  #define JOBCTL_LISTENING_BIT  22      /* ptracer is listening for events */
  
 -#define JOBCTL_STOP_DEQUEUED  (1 << JOBCTL_STOP_DEQUEUED_BIT)
 -#define JOBCTL_STOP_PENDING   (1 << JOBCTL_STOP_PENDING_BIT)
 -#define JOBCTL_STOP_CONSUME   (1 << JOBCTL_STOP_CONSUME_BIT)
 -#define JOBCTL_TRAP_STOP      (1 << JOBCTL_TRAP_STOP_BIT)
 -#define JOBCTL_TRAP_NOTIFY    (1 << JOBCTL_TRAP_NOTIFY_BIT)
 -#define JOBCTL_TRAPPING               (1 << JOBCTL_TRAPPING_BIT)
 -#define JOBCTL_LISTENING      (1 << JOBCTL_LISTENING_BIT)
 +#define JOBCTL_STOP_DEQUEUED  (1UL << JOBCTL_STOP_DEQUEUED_BIT)
 +#define JOBCTL_STOP_PENDING   (1UL << JOBCTL_STOP_PENDING_BIT)
 +#define JOBCTL_STOP_CONSUME   (1UL << JOBCTL_STOP_CONSUME_BIT)
 +#define JOBCTL_TRAP_STOP      (1UL << JOBCTL_TRAP_STOP_BIT)
 +#define JOBCTL_TRAP_NOTIFY    (1UL << JOBCTL_TRAP_NOTIFY_BIT)
 +#define JOBCTL_TRAPPING               (1UL << JOBCTL_TRAPPING_BIT)
 +#define JOBCTL_LISTENING      (1UL << JOBCTL_LISTENING_BIT)
  
  #define JOBCTL_TRAP_MASK      (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
  #define JOBCTL_PENDING_MASK   (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
  
  extern bool task_set_jobctl_pending(struct task_struct *task,
 -                                  unsigned int mask);
 +                                  unsigned long mask);
  extern void task_clear_jobctl_trapping(struct task_struct *task);
  extern void task_clear_jobctl_pending(struct task_struct *task,
 -                                    unsigned int mask);
 +                                    unsigned long mask);
  
  static inline void rcu_copy_process(struct task_struct *p)
  {
@@@ -3033,6 -2958,11 +3029,6 @@@ static __always_inline bool need_resche
  void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
  void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
  
 -static inline void thread_group_cputime_init(struct signal_struct *sig)
 -{
 -      raw_spin_lock_init(&sig->cputimer.lock);
 -}
 -
  /*
   * Reevaluate whether the task has signals pending delivery.
   * Wake the task if so.
@@@ -3146,13 -3076,13 +3142,13 @@@ static inline void mm_update_next_owner
  static inline unsigned long task_rlimit(const struct task_struct *tsk,
                unsigned int limit)
  {
 -      return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur);
 +      return READ_ONCE(tsk->signal->rlim[limit].rlim_cur);
  }
  
  static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
                unsigned int limit)
  {
 -      return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max);
 +      return READ_ONCE(tsk->signal->rlim[limit].rlim_max);
  }
  
  static inline unsigned long rlimit(unsigned int limit)
diff --combined kernel/events/core.c
index f2003b97ddc99d726cf5cc145b134b128671f17c,1c6c2826af1ee0bfa6e8d1c9ba1284fb065550c3..8e13f3e54ec369f26d52e52081f013a6aa29fd23
  
  static struct workqueue_struct *perf_wq;
  
+ typedef int (*remote_function_f)(void *);
  struct remote_function_call {
        struct task_struct      *p;
-       int                     (*func)(void *info);
+       remote_function_f       func;
        void                    *info;
        int                     ret;
  };
@@@ -86,7 -88,7 +88,7 @@@ static void remote_function(void *data
   *        -EAGAIN - when the process moved away
   */
  static int
- task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
+ task_function_call(struct task_struct *p, remote_function_f func, void *info)
  {
        struct remote_function_call data = {
                .p      = p,
   *
   * returns: @func return value or -ENXIO when the cpu is offline
   */
- static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
+ static int cpu_function_call(int cpu, remote_function_f func, void *info)
  {
        struct remote_function_call data = {
                .p      = NULL,
@@@ -747,62 -749,31 +749,31 @@@ perf_cgroup_mark_enabled(struct perf_ev
  /*
   * function must be called with interrupts disabled
   */
- static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
  {
        struct perf_cpu_context *cpuctx;
-       enum hrtimer_restart ret = HRTIMER_NORESTART;
        int rotations = 0;
  
        WARN_ON(!irqs_disabled());
  
        cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
        rotations = perf_rotate_context(cpuctx);
  
-       /*
-        * arm timer if needed
-        */
-       if (rotations) {
+       raw_spin_lock(&cpuctx->hrtimer_lock);
+       if (rotations)
                hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
-               ret = HRTIMER_RESTART;
-       }
-       return ret;
- }
- /* CPU is going down */
- void perf_cpu_hrtimer_cancel(int cpu)
- {
-       struct perf_cpu_context *cpuctx;
-       struct pmu *pmu;
-       unsigned long flags;
-       if (WARN_ON(cpu != smp_processor_id()))
-               return;
-       local_irq_save(flags);
-       rcu_read_lock();
-       list_for_each_entry_rcu(pmu, &pmus, entry) {
-               cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-               if (pmu->task_ctx_nr == perf_sw_context)
-                       continue;
-               hrtimer_cancel(&cpuctx->hrtimer);
-       }
-       rcu_read_unlock();
+       else
+               cpuctx->hrtimer_active = 0;
+       raw_spin_unlock(&cpuctx->hrtimer_lock);
  
-       local_irq_restore(flags);
+       return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
  }
  
- static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
  {
-       struct hrtimer *hr = &cpuctx->hrtimer;
+       struct hrtimer *timer = &cpuctx->hrtimer;
        struct pmu *pmu = cpuctx->ctx.pmu;
-       int timer;
+       u64 interval;
  
        /* no multiplexing needed for SW PMU */
        if (pmu->task_ctx_nr == perf_sw_context)
         * check default is sane, if not set then force to
         * default interval (1/tick)
         */
-       timer = pmu->hrtimer_interval_ms;
-       if (timer < 1)
-               timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
+       interval = pmu->hrtimer_interval_ms;
+       if (interval < 1)
+               interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
  
-       cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+       cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
  
-       hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
-       hr->function = perf_cpu_hrtimer_handler;
+       raw_spin_lock_init(&cpuctx->hrtimer_lock);
+       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+       timer->function = perf_mux_hrtimer_handler;
  }
  
- static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+ static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
  {
-       struct hrtimer *hr = &cpuctx->hrtimer;
+       struct hrtimer *timer = &cpuctx->hrtimer;
        struct pmu *pmu = cpuctx->ctx.pmu;
+       unsigned long flags;
  
        /* not for SW PMU */
        if (pmu->task_ctx_nr == perf_sw_context)
-               return;
+               return 0;
  
-       if (hrtimer_active(hr))
-               return;
+       raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
+       if (!cpuctx->hrtimer_active) {
+               cpuctx->hrtimer_active = 1;
+               hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+               hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+       }
+       raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
  
-       if (!hrtimer_callback_running(hr))
-               __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
-                                        0, HRTIMER_MODE_REL_PINNED, 0);
+       return 0;
  }
  
  void perf_pmu_disable(struct pmu *pmu)
@@@ -1935,7 -1911,7 +1911,7 @@@ group_sched_in(struct perf_event *group
  
        if (event_sched_in(group_event, cpuctx, ctx)) {
                pmu->cancel_txn(pmu);
-               perf_cpu_hrtimer_restart(cpuctx);
+               perf_mux_hrtimer_restart(cpuctx);
                return -EAGAIN;
        }
  
@@@ -1982,7 -1958,7 +1958,7 @@@ group_error
  
        pmu->cancel_txn(pmu);
  
-       perf_cpu_hrtimer_restart(cpuctx);
+       perf_mux_hrtimer_restart(cpuctx);
  
        return -EAGAIN;
  }
@@@ -2255,7 -2231,7 +2231,7 @@@ static int __perf_event_enable(void *in
                 */
                if (leader != event) {
                        group_sched_out(leader, cpuctx, ctx);
-                       perf_cpu_hrtimer_restart(cpuctx);
+                       perf_mux_hrtimer_restart(cpuctx);
                }
                if (leader->attr.pinned) {
                        update_group_times(leader);
@@@ -3442,6 -3418,7 +3418,6 @@@ static void free_event_rcu(struct rcu_h
        if (event->ns)
                put_pid_ns(event->ns);
        perf_event_free_filter(event);
 -      perf_event_free_bpf_prog(event);
        kfree(event);
  }
  
@@@ -3572,8 -3549,6 +3548,8 @@@ static void __free_event(struct perf_ev
                        put_callchain_buffers();
        }
  
 +      perf_event_free_bpf_prog(event);
 +
        if (event->destroy)
                event->destroy(event);
  
@@@ -4331,20 -4306,20 +4307,20 @@@ static void ring_buffer_attach(struct p
                WARN_ON_ONCE(event->rcu_pending);
  
                old_rb = event->rb;
 -              event->rcu_batches = get_state_synchronize_rcu();
 -              event->rcu_pending = 1;
 -
                spin_lock_irqsave(&old_rb->event_lock, flags);
                list_del_rcu(&event->rb_entry);
                spin_unlock_irqrestore(&old_rb->event_lock, flags);
 -      }
  
 -      if (event->rcu_pending && rb) {
 -              cond_synchronize_rcu(event->rcu_batches);
 -              event->rcu_pending = 0;
 +              event->rcu_batches = get_state_synchronize_rcu();
 +              event->rcu_pending = 1;
        }
  
        if (rb) {
 +              if (event->rcu_pending) {
 +                      cond_synchronize_rcu(event->rcu_batches);
 +                      event->rcu_pending = 0;
 +              }
 +
                spin_lock_irqsave(&rb->event_lock, flags);
                list_add_rcu(&event->rb_entry, &rb->event_list);
                spin_unlock_irqrestore(&rb->event_lock, flags);
@@@ -5381,9 -5356,9 +5357,9 @@@ void perf_prepare_sample(struct perf_ev
        }
  }
  
 -static void perf_event_output(struct perf_event *event,
 -                              struct perf_sample_data *data,
 -                              struct pt_regs *regs)
 +void perf_event_output(struct perf_event *event,
 +                      struct perf_sample_data *data,
 +                      struct pt_regs *regs)
  {
        struct perf_output_handle handle;
        struct perf_event_header header;
@@@ -5974,39 -5949,6 +5950,39 @@@ void perf_event_aux_event(struct perf_e
        perf_output_end(&handle);
  }
  
 +/*
 + * Lost/dropped samples logging
 + */
 +void perf_log_lost_samples(struct perf_event *event, u64 lost)
 +{
 +      struct perf_output_handle handle;
 +      struct perf_sample_data sample;
 +      int ret;
 +
 +      struct {
 +              struct perf_event_header        header;
 +              u64                             lost;
 +      } lost_samples_event = {
 +              .header = {
 +                      .type = PERF_RECORD_LOST_SAMPLES,
 +                      .misc = 0,
 +                      .size = sizeof(lost_samples_event),
 +              },
 +              .lost           = lost,
 +      };
 +
 +      perf_event_header__init_id(&lost_samples_event.header, &sample, event);
 +
 +      ret = perf_output_begin(&handle, event,
 +                              lost_samples_event.header.size);
 +      if (ret)
 +              return;
 +
 +      perf_output_put(&handle, lost_samples_event);
 +      perf_event__output_id_sample(event, &handle, &sample);
 +      perf_output_end(&handle);
 +}
 +
  /*
   * IRQ throttle logging
   */
@@@ -6897,9 -6839,8 +6873,8 @@@ static void perf_swevent_start_hrtimer(
        } else {
                period = max_t(u64, 10000, hwc->sample_period);
        }
-       __hrtimer_start_range_ns(&hwc->hrtimer,
-                               ns_to_ktime(period), 0,
-                               HRTIMER_MODE_REL_PINNED, 0);
+       hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
+                     HRTIMER_MODE_REL_PINNED);
  }
  
  static void perf_swevent_cancel_hrtimer(struct perf_event *event)
@@@ -7200,6 -7141,8 +7175,8 @@@ perf_event_mux_interval_ms_show(struct 
        return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
  }
  
+ static DEFINE_MUTEX(mux_interval_mutex);
  static ssize_t
  perf_event_mux_interval_ms_store(struct device *dev,
                                 struct device_attribute *attr,
        if (timer == pmu->hrtimer_interval_ms)
                return count;
  
+       mutex_lock(&mux_interval_mutex);
        pmu->hrtimer_interval_ms = timer;
  
        /* update all cpuctx for this PMU */
-       for_each_possible_cpu(cpu) {
+       get_online_cpus();
+       for_each_online_cpu(cpu) {
                struct perf_cpu_context *cpuctx;
                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
                cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
  
-               if (hrtimer_active(&cpuctx->hrtimer))
-                       hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+               cpu_function_call(cpu,
+                       (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
        }
+       put_online_cpus();
+       mutex_unlock(&mux_interval_mutex);
  
        return count;
  }
@@@ -7334,7 -7281,7 +7315,7 @@@ skip_type
                lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
                cpuctx->ctx.pmu = pmu;
  
-               __perf_cpu_hrtimer_init(cpuctx, cpu);
+               __perf_mux_hrtimer_init(cpuctx, cpu);
  
                cpuctx->unique_pmu = pmu;
        }
diff --combined kernel/futex.c
index aacc706f85fcc1d7a1e8dd6c4e753c5b3495e1b0,720eacff6b581d3d16638a86aaa42dcc0be5dce7..ea6ca0bca52570b8cd88a9c428016cd54cf55a0a
@@@ -1090,11 -1090,9 +1090,11 @@@ static void __unqueue_futex(struct fute
  
  /*
   * The hash bucket lock must be held when this is called.
 - * Afterwards, the futex_q must not be accessed.
 + * Afterwards, the futex_q must not be accessed. Callers
 + * must ensure to later call wake_up_q() for the actual
 + * wakeups to occur.
   */
 -static void wake_futex(struct futex_q *q)
 +static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
  {
        struct task_struct *p = q->task;
  
                return;
  
        /*
 -       * We set q->lock_ptr = NULL _before_ we wake up the task. If
 -       * a non-futex wake up happens on another CPU then the task
 -       * might exit and p would dereference a non-existing task
 -       * struct. Prevent this by holding a reference on p across the
 -       * wake up.
 +       * Queue the task for later wakeup for after we've released
 +       * the hb->lock. wake_q_add() grabs reference to p.
         */
 -      get_task_struct(p);
 -
 +      wake_q_add(wake_q, p);
        __unqueue_futex(q);
        /*
         * The waiting task can free the futex_q as soon as
         */
        smp_wmb();
        q->lock_ptr = NULL;
 -
 -      wake_up_state(p, TASK_NORMAL);
 -      put_task_struct(p);
  }
  
  static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@@ -1212,7 -1217,6 +1212,7 @@@ futex_wake(u32 __user *uaddr, unsigned 
        struct futex_q *this, *next;
        union futex_key key = FUTEX_KEY_INIT;
        int ret;
 +      WAKE_Q(wake_q);
  
        if (!bitset)
                return -EINVAL;
                        if (!(this->bitset & bitset))
                                continue;
  
 -                      wake_futex(this);
 +                      mark_wake_futex(&wake_q, this);
                        if (++ret >= nr_wake)
                                break;
                }
        }
  
        spin_unlock(&hb->lock);
 +      wake_up_q(&wake_q);
  out_put_key:
        put_futex_key(&key);
  out:
@@@ -1266,7 -1269,6 +1266,7 @@@ futex_wake_op(u32 __user *uaddr1, unsig
        struct futex_hash_bucket *hb1, *hb2;
        struct futex_q *this, *next;
        int ret, op_ret;
 +      WAKE_Q(wake_q);
  
  retry:
        ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
@@@ -1318,7 -1320,7 +1318,7 @@@ retry_private
                                ret = -EINVAL;
                                goto out_unlock;
                        }
 -                      wake_futex(this);
 +                      mark_wake_futex(&wake_q, this);
                        if (++ret >= nr_wake)
                                break;
                }
                                        ret = -EINVAL;
                                        goto out_unlock;
                                }
 -                              wake_futex(this);
 +                              mark_wake_futex(&wake_q, this);
                                if (++op_ret >= nr_wake2)
                                        break;
                        }
  
  out_unlock:
        double_unlock_hb(hb1, hb2);
 +      wake_up_q(&wake_q);
  out_put_keys:
        put_futex_key(&key2);
  out_put_key1:
@@@ -1502,7 -1503,6 +1502,7 @@@ static int futex_requeue(u32 __user *ua
        struct futex_pi_state *pi_state = NULL;
        struct futex_hash_bucket *hb1, *hb2;
        struct futex_q *this, *next;
 +      WAKE_Q(wake_q);
  
        if (requeue_pi) {
                /*
@@@ -1679,7 -1679,7 +1679,7 @@@ retry_private
                 * woken by futex_unlock_pi().
                 */
                if (++task_count <= nr_wake && !requeue_pi) {
 -                      wake_futex(this);
 +                      mark_wake_futex(&wake_q, this);
                        continue;
                }
  
  out_unlock:
        free_pi_state(pi_state);
        double_unlock_hb(hb1, hb2);
 +      wake_up_q(&wake_q);
        hb_waiters_dec(hb2);
  
        /*
@@@ -2056,7 -2055,7 +2056,7 @@@ static void futex_wait_queue_me(struct 
  {
        /*
         * The task state is guaranteed to be set before another task can
 -       * wake it. set_current_state() is implemented using set_mb() and
 +       * wake it. set_current_state() is implemented using smp_store_mb() and
         * queue_me() calls spin_unlock() upon completion, both serializing
         * access to the hash list and forcing another memory barrier.
         */
        queue_me(q, hb);
  
        /* Arm the timer */
-       if (timeout) {
+       if (timeout)
                hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-               if (!hrtimer_active(&timeout->timer))
-                       timeout->task = NULL;
-       }
  
        /*
         * If we have been removed from the hash list, then another task
diff --combined kernel/locking/rtmutex.c
index 30ec5b46cd8c789a1276d9b0abda01fdabd7dfa2,8b678cac7fbe389553272a417a3d82c2ddb39406..36573e96a47761c6cd3fc17463651f3e11028d59
@@@ -70,10 -70,10 +70,10 @@@ static void fixup_rt_mutex_waiters(stru
  }
  
  /*
 - * We can speed up the acquire/release, if the architecture
 - * supports cmpxchg and if there's no debugging state to be set up
 + * We can speed up the acquire/release, if there's no debugging state to be
 + * set up.
   */
 -#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
 +#ifndef CONFIG_DEBUG_RT_MUTEXES
  # define rt_mutex_cmpxchg(l,c,n)      (cmpxchg(&l->owner, c, n) == c)
  static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
  {
@@@ -1182,11 -1182,8 +1182,8 @@@ rt_mutex_slowlock(struct rt_mutex *lock
        set_current_state(state);
  
        /* Setup the timer, when timeout != NULL */
-       if (unlikely(timeout)) {
+       if (unlikely(timeout))
                hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-               if (!hrtimer_active(&timeout->timer))
-                       timeout->task = NULL;
-       }
  
        ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
  
@@@ -1443,17 -1440,10 +1440,17 @@@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock)
   *
   * @lock:     the rt_mutex to be locked
   *
 + * This function can only be called in thread context. It's safe to
 + * call it from atomic regions, but not from hard interrupt or soft
 + * interrupt context.
 + *
   * Returns 1 on success and 0 on contention
   */
  int __sched rt_mutex_trylock(struct rt_mutex *lock)
  {
 +      if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq()))
 +              return 0;
 +
        return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
  }
  EXPORT_SYMBOL_GPL(rt_mutex_trylock);
diff --combined kernel/rcu/tree_plugin.h
index 32664347091a1a6b7e04e2bf6ae8128a3411fc42,d72fa24f23128a640a386525070676a5446de2cb..013485fb2b06b9f499d0673a36bf8f62d5e72607
@@@ -43,17 -43,7 +43,17 @@@ DEFINE_PER_CPU(unsigned int, rcu_cpu_kt
  DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
  DEFINE_PER_CPU(char, rcu_cpu_has_work);
  
 -#endif /* #ifdef CONFIG_RCU_BOOST */
 +#else /* #ifdef CONFIG_RCU_BOOST */
 +
 +/*
 + * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
 + * all uses are in dead code.  Provide a definition to keep the compiler
 + * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
 + * This probably needs to be excluded from -rt builds.
 + */
 +#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
 +
 +#endif /* #else #ifdef CONFIG_RCU_BOOST */
  
  #ifdef CONFIG_RCU_NOCB_CPU
  static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
@@@ -70,11 -60,11 +70,11 @@@ static void __init rcu_bootup_announce_
  {
        if (IS_ENABLED(CONFIG_RCU_TRACE))
                pr_info("\tRCU debugfs-based tracing is enabled.\n");
 -      if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) ||
 -          (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32))
 +      if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
 +          (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
                pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
 -                     CONFIG_RCU_FANOUT);
 -      if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT))
 +                     RCU_FANOUT);
 +      if (rcu_fanout_exact)
                pr_info("\tHierarchical RCU autobalancing is disabled.\n");
        if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
                pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
                pr_info("\tAdditional per-CPU info printed with stalls.\n");
        if (NUM_RCU_LVL_4 != 0)
                pr_info("\tFour-level hierarchy is enabled.\n");
 -      if (CONFIG_RCU_FANOUT_LEAF != 16)
 +      if (RCU_FANOUT_LEAF != 16)
                pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
 -                      CONFIG_RCU_FANOUT_LEAF);
 -      if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
 +                      RCU_FANOUT_LEAF);
 +      if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
                pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
        if (nr_cpu_ids != NR_CPUS)
                pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
  #ifdef CONFIG_PREEMPT_RCU
  
  RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
 -static struct rcu_state *rcu_state_p = &rcu_preempt_state;
 +static struct rcu_state *const rcu_state_p = &rcu_preempt_state;
 +static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data;
  
  static int rcu_preempted_readers_exp(struct rcu_node *rnp);
  static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
@@@ -127,11 -116,11 +127,11 @@@ static void __init rcu_bootup_announce(
   */
  static void rcu_preempt_qs(void)
  {
 -      if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) {
 +      if (!__this_cpu_read(rcu_data_p->passed_quiesce)) {
                trace_rcu_grace_period(TPS("rcu_preempt"),
 -                                     __this_cpu_read(rcu_preempt_data.gpnum),
 +                                     __this_cpu_read(rcu_data_p->gpnum),
                                       TPS("cpuqs"));
 -              __this_cpu_write(rcu_preempt_data.passed_quiesce, 1);
 +              __this_cpu_write(rcu_data_p->passed_quiesce, 1);
                barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
                current->rcu_read_unlock_special.b.need_qs = false;
        }
@@@ -161,7 -150,7 +161,7 @@@ static void rcu_preempt_note_context_sw
            !t->rcu_read_unlock_special.b.blocked) {
  
                /* Possibly blocking in an RCU read-side critical section. */
 -              rdp = this_cpu_ptr(rcu_preempt_state.rda);
 +              rdp = this_cpu_ptr(rcu_state_p->rda);
                rnp = rdp->mynode;
                raw_spin_lock_irqsave(&rnp->lock, flags);
                smp_mb__after_unlock_lock();
                if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
                        list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
                        rnp->gp_tasks = &t->rcu_node_entry;
 -#ifdef CONFIG_RCU_BOOST
 -                      if (rnp->boost_tasks != NULL)
 +                      if (IS_ENABLED(CONFIG_RCU_BOOST) &&
 +                          rnp->boost_tasks != NULL)
                                rnp->boost_tasks = rnp->gp_tasks;
 -#endif /* #ifdef CONFIG_RCU_BOOST */
                } else {
                        list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
                        if (rnp->qsmask & rdp->grpmask)
@@@ -273,7 -263,9 +273,7 @@@ void rcu_read_unlock_special(struct tas
        bool empty_exp_now;
        unsigned long flags;
        struct list_head *np;
 -#ifdef CONFIG_RCU_BOOST
        bool drop_boost_mutex = false;
 -#endif /* #ifdef CONFIG_RCU_BOOST */
        struct rcu_node *rnp;
        union rcu_special special;
  
                t->rcu_read_unlock_special.b.blocked = false;
  
                /*
 -               * Remove this task from the list it blocked on.  The
 -               * task can migrate while we acquire the lock, but at
 -               * most one time.  So at most two passes through loop.
 +               * Remove this task from the list it blocked on.  The task
 +               * now remains queued on the rcu_node corresponding to
 +               * the CPU it first blocked on, so the first attempt to
 +               * acquire the task's rcu_node's ->lock will succeed.
 +               * Keep the loop and add a WARN_ON() out of sheer paranoia.
                 */
                for (;;) {
                        rnp = t->rcu_blocked_node;
                        smp_mb__after_unlock_lock();
                        if (rnp == t->rcu_blocked_node)
                                break;
 +                      WARN_ON_ONCE(1);
                        raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
                }
                empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
                        rnp->gp_tasks = np;
                if (&t->rcu_node_entry == rnp->exp_tasks)
                        rnp->exp_tasks = np;
 -#ifdef CONFIG_RCU_BOOST
 -              if (&t->rcu_node_entry == rnp->boost_tasks)
 -                      rnp->boost_tasks = np;
 -              /* Snapshot ->boost_mtx ownership with rcu_node lock held. */
 -              drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
 -#endif /* #ifdef CONFIG_RCU_BOOST */
 +              if (IS_ENABLED(CONFIG_RCU_BOOST)) {
 +                      if (&t->rcu_node_entry == rnp->boost_tasks)
 +                              rnp->boost_tasks = np;
 +                      /* Snapshot ->boost_mtx ownership w/rnp->lock held. */
 +                      drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
 +              }
  
                /*
                 * If this was the last task on the current list, and if
                                                         rnp->grplo,
                                                         rnp->grphi,
                                                         !!rnp->gp_tasks);
 -                      rcu_report_unblock_qs_rnp(&rcu_preempt_state,
 -                                                rnp, flags);
 +                      rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags);
                } else {
                        raw_spin_unlock_irqrestore(&rnp->lock, flags);
                }
  
 -#ifdef CONFIG_RCU_BOOST
                /* Unboost if we were boosted. */
 -              if (drop_boost_mutex)
 +              if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
                        rt_mutex_unlock(&rnp->boost_mtx);
 -#endif /* #ifdef CONFIG_RCU_BOOST */
  
                /*
                 * If this was the last task on the expedited lists,
                 * then we need to report up the rcu_node hierarchy.
                 */
                if (!empty_exp && empty_exp_now)
 -                      rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
 +                      rcu_report_exp_rnp(rcu_state_p, rnp, true);
        } else {
                local_irq_restore(flags);
        }
@@@ -398,7 -390,7 +398,7 @@@ static void rcu_print_detail_task_stall
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
                return;
        }
 -      t = list_entry(rnp->gp_tasks,
 +      t = list_entry(rnp->gp_tasks->prev,
                       struct task_struct, rcu_node_entry);
        list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
                sched_show_task(t);
@@@ -455,7 -447,7 +455,7 @@@ static int rcu_print_task_stall(struct 
        if (!rcu_preempt_blocked_readers_cgp(rnp))
                return 0;
        rcu_print_task_stall_begin(rnp);
 -      t = list_entry(rnp->gp_tasks,
 +      t = list_entry(rnp->gp_tasks->prev,
                       struct task_struct, rcu_node_entry);
        list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
                pr_cont(" P%d", t->pid);
@@@ -499,8 -491,8 +499,8 @@@ static void rcu_preempt_check_callbacks
                return;
        }
        if (t->rcu_read_lock_nesting > 0 &&
 -          __this_cpu_read(rcu_preempt_data.qs_pending) &&
 -          !__this_cpu_read(rcu_preempt_data.passed_quiesce))
 +          __this_cpu_read(rcu_data_p->qs_pending) &&
 +          !__this_cpu_read(rcu_data_p->passed_quiesce))
                t->rcu_read_unlock_special.b.need_qs = true;
  }
  
  
  static void rcu_preempt_do_callbacks(void)
  {
 -      rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
 +      rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
  }
  
  #endif /* #ifdef CONFIG_RCU_BOOST */
   */
  void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
  {
 -      __call_rcu(head, func, &rcu_preempt_state, -1, 0);
 +      __call_rcu(head, func, rcu_state_p, -1, 0);
  }
  EXPORT_SYMBOL_GPL(call_rcu);
  
@@@ -578,7 -570,7 +578,7 @@@ static int rcu_preempted_readers_exp(st
  static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
  {
        return !rcu_preempted_readers_exp(rnp) &&
 -             ACCESS_ONCE(rnp->expmask) == 0;
 +             READ_ONCE(rnp->expmask) == 0;
  }
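Much of the churn in the hunks below is the mechanical ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() conversion. A minimal before/after sketch of the idiom (shared is a hypothetical variable):

    #include <linux/compiler.h>

    static unsigned long shared;    /* hypothetical shared counter */

    static void snapshot_and_bump(void)
    {
            unsigned long snap;

            /* Old style: ACCESS_ONCE() doubles as rvalue and lvalue. */
            snap = ACCESS_ONCE(shared);
            ACCESS_ONCE(shared) = snap + 1;

            /* New style: separate one-shot read and write helpers. */
            snap = READ_ONCE(shared);
            WRITE_ONCE(shared, snap + 1);
    }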
  
  /*
@@@ -719,12 -711,12 +719,12 @@@ sync_rcu_preempt_exp_init2(struct rcu_s
  void synchronize_rcu_expedited(void)
  {
        struct rcu_node *rnp;
 -      struct rcu_state *rsp = &rcu_preempt_state;
 +      struct rcu_state *rsp = rcu_state_p;
        unsigned long snap;
        int trycount = 0;
  
        smp_mb(); /* Caller's modifications seen first by other CPUs. */
 -      snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
 +      snap = READ_ONCE(sync_rcu_preempt_exp_count) + 1;
        smp_mb(); /* Above access cannot bleed into critical section. */
  
        /*
         */
        while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
                if (ULONG_CMP_LT(snap,
 -                  ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
 +                  READ_ONCE(sync_rcu_preempt_exp_count))) {
                        put_online_cpus();
                        goto mb_ret; /* Others did our work for us. */
                }
                        return;
                }
        }
 -      if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
 +      if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) {
                put_online_cpus();
                goto unlock_mb_ret; /* Others did our work for us. */
        }
  
        /* Clean up and exit. */
        smp_mb(); /* ensure expedited GP seen before counter increment. */
 -      ACCESS_ONCE(sync_rcu_preempt_exp_count) =
 -                                      sync_rcu_preempt_exp_count + 1;
 +      WRITE_ONCE(sync_rcu_preempt_exp_count, sync_rcu_preempt_exp_count + 1);
  unlock_mb_ret:
        mutex_unlock(&sync_rcu_preempt_exp_mutex);
  mb_ret:
@@@ -806,7 -799,7 +806,7 @@@ EXPORT_SYMBOL_GPL(synchronize_rcu_exped
   */
  void rcu_barrier(void)
  {
 -      _rcu_barrier(&rcu_preempt_state);
 +      _rcu_barrier(rcu_state_p);
  }
  EXPORT_SYMBOL_GPL(rcu_barrier);
  
   */
  static void __init __rcu_init_preempt(void)
  {
 -      rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
 +      rcu_init_one(rcu_state_p, rcu_data_p);
  }
  
  /*
@@@ -838,8 -831,7 +838,8 @@@ void exit_rcu(void
  
  #else /* #ifdef CONFIG_PREEMPT_RCU */
  
 -static struct rcu_state *rcu_state_p = &rcu_sched_state;
 +static struct rcu_state *const rcu_state_p = &rcu_sched_state;
 +static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data;
  
  /*
   * Tell them what RCU they are running.
@@@ -1002,8 -994,8 +1002,8 @@@ static int rcu_boost(struct rcu_node *r
        struct task_struct *t;
        struct list_head *tb;
  
 -      if (ACCESS_ONCE(rnp->exp_tasks) == NULL &&
 -          ACCESS_ONCE(rnp->boost_tasks) == NULL)
 +      if (READ_ONCE(rnp->exp_tasks) == NULL &&
 +          READ_ONCE(rnp->boost_tasks) == NULL)
                return 0;  /* Nothing left to boost. */
  
        raw_spin_lock_irqsave(&rnp->lock, flags);
        rt_mutex_lock(&rnp->boost_mtx);
        rt_mutex_unlock(&rnp->boost_mtx);  /* Then keep lockdep happy. */
  
 -      return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
 -             ACCESS_ONCE(rnp->boost_tasks) != NULL;
 +      return READ_ONCE(rnp->exp_tasks) != NULL ||
 +             READ_ONCE(rnp->boost_tasks) != NULL;
  }
  
  /*
@@@ -1181,7 -1173,7 +1181,7 @@@ static int rcu_spawn_one_boost_kthread(
        struct sched_param sp;
        struct task_struct *t;
  
 -      if (&rcu_preempt_state != rsp)
 +      if (rcu_state_p != rsp)
                return 0;
  
        if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
@@@ -1375,12 -1367,13 +1375,12 @@@ static void rcu_prepare_kthreads(int cp
   * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
   * any flavor of RCU.
   */
- int rcu_needs_cpu(unsigned long *delta_jiffies)
 -#ifndef CONFIG_RCU_NOCB_CPU_ALL
+ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
  {
-       *delta_jiffies = ULONG_MAX;
+       *nextevt = KTIME_MAX;
 -      return rcu_cpu_has_callbacks(NULL);
 +      return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
 +             ? 0 : rcu_cpu_has_callbacks(NULL);
  }
 -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
  
  /*
   * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
@@@ -1439,8 -1432,6 +1439,6 @@@ module_param(rcu_idle_gp_delay, int, 06
  static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
  module_param(rcu_idle_lazy_gp_delay, int, 0644);
  
- extern int tick_nohz_active;
  /*
   * Try to advance callbacks for all flavors of RCU on the current CPU, but
   * only if it has been awhile since the last time we did so.  Afterwards,
@@@ -1469,7 -1460,7 +1467,7 @@@ static bool __maybe_unused rcu_try_adva
                 * callbacks not yet ready to invoke.
                 */
                if ((rdp->completed != rnp->completed ||
 -                   unlikely(ACCESS_ONCE(rdp->gpwrap))) &&
 +                   unlikely(READ_ONCE(rdp->gpwrap))) &&
                    rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
                        note_gp_changes(rsp, rdp);
  
   *
   * The caller must have disabled interrupts.
   */
- int rcu_needs_cpu(unsigned long *dj)
 -#ifndef CONFIG_RCU_NOCB_CPU_ALL
+ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
  {
        struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+       unsigned long dj;
  
-               *dj = ULONG_MAX;
 +      if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) {
++              *nextevt = KTIME_MAX;
 +              return 0;
 +      }
 +
        /* Snapshot to detect later posting of non-lazy callback. */
        rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
  
        /* If no callbacks, RCU doesn't need the CPU. */
        if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
-               *dj = ULONG_MAX;
+               *nextevt = KTIME_MAX;
                return 0;
        }
  
  
        /* Request timer delay depending on laziness, and round. */
        if (!rdtp->all_lazy) {
-               *dj = round_up(rcu_idle_gp_delay + jiffies,
+               dj = round_up(rcu_idle_gp_delay + jiffies,
                               rcu_idle_gp_delay) - jiffies;
        } else {
-               *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
+               dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
        }
+       *nextevt = basemono + dj * TICK_NSEC;
        return 0;
  }
 -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
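With the new prototype, rcu_needs_cpu() hands the tick code an absolute expiry in nanoseconds instead of a jiffies delta. As a worked example (assuming HZ=1000, so TICK_NSEC is 1,000,000 ns): a computed dj of 6 jiffies becomes *nextevt = basemono + 6,000,000 ns, while KTIME_MAX still means RCU imposes no limit on the next event.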
  
  /*
   * Prepare a CPU for idle from an RCU perspective.  The first major task
   */
  static void rcu_prepare_for_idle(void)
  {
 -#ifndef CONFIG_RCU_NOCB_CPU_ALL
        bool needwake;
        struct rcu_data *rdp;
        struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
        struct rcu_state *rsp;
        int tne;
  
 +      if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL))
 +              return;
 +
        /* Handle nohz enablement switches conservatively. */
 -      tne = ACCESS_ONCE(tick_nohz_active);
 +      tne = READ_ONCE(tick_nohz_active);
        if (tne != rdtp->tick_nohz_enabled_snap) {
                if (rcu_cpu_has_callbacks(NULL))
                        invoke_rcu_core(); /* force nohz to see update. */
                if (needwake)
                        rcu_gp_kthread_wake(rsp);
        }
 -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
  }
  
  /*
   */
  static void rcu_cleanup_after_idle(void)
  {
 -#ifndef CONFIG_RCU_NOCB_CPU_ALL
 -      if (rcu_is_nocb_cpu(smp_processor_id()))
 +      if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
 +          rcu_is_nocb_cpu(smp_processor_id()))
                return;
        if (rcu_try_advance_all_cbs())
                invoke_rcu_core();
 -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
  }
  
  /*
@@@ -1770,7 -1760,7 +1770,7 @@@ static void print_cpu_stall_info(struc
               atomic_read(&rdtp->dynticks) & 0xfff,
               rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
               rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
 -             ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
 +             READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
               fast_no_hz);
  }
  
@@@ -1908,11 -1898,11 +1908,11 @@@ static void wake_nocb_leader(struct rcu
  {
        struct rcu_data *rdp_leader = rdp->nocb_leader;
  
 -      if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
 +      if (!READ_ONCE(rdp_leader->nocb_kthread))
                return;
 -      if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
 +      if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
                /* Prior smp_mb__after_atomic() orders against prior enqueue. */
 -              ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
 +              WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
                wake_up(&rdp_leader->nocb_wq);
        }
  }
@@@ -1944,14 -1934,14 +1944,14 @@@ static bool rcu_nocb_cpu_needs_barrier(
        ret = atomic_long_read(&rdp->nocb_q_count);
  
  #ifdef CONFIG_PROVE_RCU
 -      rhp = ACCESS_ONCE(rdp->nocb_head);
 +      rhp = READ_ONCE(rdp->nocb_head);
        if (!rhp)
 -              rhp = ACCESS_ONCE(rdp->nocb_gp_head);
 +              rhp = READ_ONCE(rdp->nocb_gp_head);
        if (!rhp)
 -              rhp = ACCESS_ONCE(rdp->nocb_follower_head);
 +              rhp = READ_ONCE(rdp->nocb_follower_head);
  
        /* Having no rcuo kthread but CBs after scheduler starts is bad! */
 -      if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp &&
 +      if (!READ_ONCE(rdp->nocb_kthread) && rhp &&
            rcu_scheduler_fully_active) {
                /* RCU callback enqueued before CPU first came online??? */
                pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
@@@ -1985,12 -1975,12 +1985,12 @@@ static void __call_rcu_nocb_enqueue(str
        atomic_long_add(rhcount, &rdp->nocb_q_count);
        /* rcu_barrier() relies on ->nocb_q_count add before xchg. */
        old_rhpp = xchg(&rdp->nocb_tail, rhtp);
 -      ACCESS_ONCE(*old_rhpp) = rhp;
 +      WRITE_ONCE(*old_rhpp, rhp);
        atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
        smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
  
        /* If we are not being polled and there is a kthread, awaken it ... */
 -      t = ACCESS_ONCE(rdp->nocb_kthread);
 +      t = READ_ONCE(rdp->nocb_kthread);
        if (rcu_nocb_poll || !t) {
                trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
                                    TPS("WakeNotPoll"));
@@@ -2128,7 -2118,7 +2128,7 @@@ static void rcu_nocb_wait_gp(struct rcu
        for (;;) {
                wait_event_interruptible(
                        rnp->nocb_gp_wq[c & 0x1],
 -                      (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
 +                      (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
                if (likely(d))
                        break;
                WARN_ON(signal_pending(current));
@@@ -2155,7 -2145,7 +2155,7 @@@ wait_again
        if (!rcu_nocb_poll) {
                trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
                wait_event_interruptible(my_rdp->nocb_wq,
 -                              !ACCESS_ONCE(my_rdp->nocb_leader_sleep));
 +                              !READ_ONCE(my_rdp->nocb_leader_sleep));
                /* Memory barrier handled by smp_mb() calls below and repoll. */
        } else if (firsttime) {
                firsttime = false; /* Don't drown trace log with "Poll"! */
         */
        gotcbs = false;
        for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
 -              rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head);
 +              rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
                if (!rdp->nocb_gp_head)
                        continue;  /* No CBs here, try next follower. */
  
                /* Move callbacks to wait-for-GP list, which is empty. */
 -              ACCESS_ONCE(rdp->nocb_head) = NULL;
 +              WRITE_ONCE(rdp->nocb_head, NULL);
                rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
                gotcbs = true;
        }
                my_rdp->nocb_leader_sleep = true;
                smp_mb();  /* Ensure _sleep true before scan. */
                for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
 -                      if (ACCESS_ONCE(rdp->nocb_head)) {
 +                      if (READ_ONCE(rdp->nocb_head)) {
                                /* Found CB, so short-circuit next wait. */
                                my_rdp->nocb_leader_sleep = false;
                                break;
  
        /* Each pass through the following loop wakes a follower, if needed. */
        for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
 -              if (ACCESS_ONCE(rdp->nocb_head))
 +              if (READ_ONCE(rdp->nocb_head))
                        my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
                if (!rdp->nocb_gp_head)
                        continue; /* No CBs, so no need to wake follower. */
@@@ -2251,7 -2241,7 +2251,7 @@@ static void nocb_follower_wait(struct r
                        trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
                                            "FollowerSleep");
                        wait_event_interruptible(rdp->nocb_wq,
 -                                               ACCESS_ONCE(rdp->nocb_follower_head));
 +                                               READ_ONCE(rdp->nocb_follower_head));
                } else if (firsttime) {
                        /* Don't drown trace log with "Poll"! */
                        firsttime = false;
@@@ -2292,10 -2282,10 +2292,10 @@@ static int rcu_nocb_kthread(void *arg
                        nocb_follower_wait(rdp);
  
                /* Pull the ready-to-invoke callbacks onto local list. */
 -              list = ACCESS_ONCE(rdp->nocb_follower_head);
 +              list = READ_ONCE(rdp->nocb_follower_head);
                BUG_ON(!list);
                trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
 -              ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
 +              WRITE_ONCE(rdp->nocb_follower_head, NULL);
                tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
  
                /* Each pass through the following loop invokes a callback. */
  /* Is a deferred wakeup of rcu_nocb_kthread() required? */
  static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
  {
 -      return ACCESS_ONCE(rdp->nocb_defer_wakeup);
 +      return READ_ONCE(rdp->nocb_defer_wakeup);
  }
  
  /* Do a deferred wakeup of rcu_nocb_kthread(). */
@@@ -2344,8 -2334,8 +2344,8 @@@ static void do_nocb_deferred_wakeup(str
  
        if (!rcu_nocb_need_deferred_wakeup(rdp))
                return;
 -      ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup);
 -      ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT;
 +      ndw = READ_ONCE(rdp->nocb_defer_wakeup);
 +      WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT);
        wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
        trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
  }
@@@ -2458,7 -2448,7 +2458,7 @@@ static void rcu_spawn_one_nocb_kthread(
        t = kthread_run(rcu_nocb_kthread, rdp_spawn,
                        "rcuo%c/%d", rsp->abbr, cpu);
        BUG_ON(IS_ERR(t));
 -      ACCESS_ONCE(rdp_spawn->nocb_kthread) = t;
 +      WRITE_ONCE(rdp_spawn->nocb_kthread, t);
  }
  
  /*
@@@ -2673,7 -2663,7 +2673,7 @@@ static void rcu_sysidle_enter(int irq
  
        /* Record start of fully idle period. */
        j = jiffies;
 -      ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
 +      WRITE_ONCE(rdtp->dynticks_idle_jiffies, j);
        smp_mb__before_atomic();
        atomic_inc(&rdtp->dynticks_idle);
        smp_mb__after_atomic();
   */
  void rcu_sysidle_force_exit(void)
  {
 -      int oldstate = ACCESS_ONCE(full_sysidle_state);
 +      int oldstate = READ_ONCE(full_sysidle_state);
        int newoldstate;
  
        /*
@@@ -2804,7 -2794,7 +2804,7 @@@ static void rcu_sysidle_check_cpu(struc
        smp_mb(); /* Read counters before timestamps. */
  
        /* Pick up timestamps. */
 -      j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
 +      j = READ_ONCE(rdtp->dynticks_idle_jiffies);
        /* If this CPU entered idle more recently, update maxj timestamp. */
        if (ULONG_CMP_LT(*maxj, j))
                *maxj = j;
@@@ -2841,11 -2831,11 +2841,11 @@@ static unsigned long rcu_sysidle_delay(
  static void rcu_sysidle(unsigned long j)
  {
        /* Check the current state. */
 -      switch (ACCESS_ONCE(full_sysidle_state)) {
 +      switch (READ_ONCE(full_sysidle_state)) {
        case RCU_SYSIDLE_NOT:
  
                /* First time all are idle, so note a short idle period. */
 -              ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
 +              WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT);
                break;
  
        case RCU_SYSIDLE_SHORT:
@@@ -2883,7 -2873,7 +2883,7 @@@ static void rcu_sysidle_cancel(void
  {
        smp_mb();
        if (full_sysidle_state > RCU_SYSIDLE_SHORT)
 -              ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
 +              WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT);
  }
  
  /*
@@@ -2935,7 -2925,7 +2935,7 @@@ static void rcu_sysidle_cb(struct rcu_h
        smp_mb();  /* grace period precedes setting inuse. */
  
        rshp = container_of(rhp, struct rcu_sysidle_head, rh);
 -      ACCESS_ONCE(rshp->inuse) = 0;
 +      WRITE_ONCE(rshp->inuse, 0);
  }
  
  /*
  bool rcu_sys_is_idle(void)
  {
        static struct rcu_sysidle_head rsh;
 -      int rss = ACCESS_ONCE(full_sysidle_state);
 +      int rss = READ_ONCE(full_sysidle_state);
  
        if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
                return false;
                        }
                        rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
                        oldrss = rss;
 -                      rss = ACCESS_ONCE(full_sysidle_state);
 +                      rss = READ_ONCE(full_sysidle_state);
                }
        }
  
@@@ -3058,10 -3048,10 +3058,10 @@@ static bool rcu_nohz_full_cpu(struct rc
  #ifdef CONFIG_NO_HZ_FULL
        if (tick_nohz_full_cpu(smp_processor_id()) &&
            (!rcu_gp_in_progress(rsp) ||
 -           ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ)))
 -              return 1;
 +           ULONG_CMP_LT(jiffies, READ_ONCE(rsp->gp_start) + HZ)))
 +              return true;
  #endif /* #ifdef CONFIG_NO_HZ_FULL */
 -      return 0;
 +      return false;
  }
  
  /*
@@@ -3087,7 -3077,7 +3087,7 @@@ static void rcu_bind_gp_kthread(void
  static void rcu_dynticks_task_enter(void)
  {
  #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
 -      ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id();
 +      WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
  #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
  }
  
  static void rcu_dynticks_task_exit(void)
  {
  #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
 -      ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1;
 +      WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
  #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
  }
diff --combined kernel/sched/core.c
index f89ca9bcf42a5f582e2c276dc2ef21338b8bd402,e9f25ce70c77396d312335552428e43535ce1df4..c9a707b593317d7936182d3db18b8ece8a3ac159
  #define CREATE_TRACE_POINTS
  #include <trace/events/sched.h>
  
- void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
- {
-       unsigned long delta;
-       ktime_t soft, hard, now;
-       for (;;) {
-               if (hrtimer_active(period_timer))
-                       break;
-               now = hrtimer_cb_get_time(period_timer);
-               hrtimer_forward(period_timer, now, period);
-               soft = hrtimer_get_softexpires(period_timer);
-               hard = hrtimer_get_expires(period_timer);
-               delta = ktime_to_ns(ktime_sub(hard, soft));
-               __hrtimer_start_range_ns(period_timer, soft, delta,
-                                        HRTIMER_MODE_ABS_PINNED, 0);
-       }
- }
  DEFINE_MUTEX(sched_domains_mutex);
  DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
  
@@@ -355,12 -335,11 +335,11 @@@ static enum hrtimer_restart hrtick(stru
  
  #ifdef CONFIG_SMP
  
- static int __hrtick_restart(struct rq *rq)
+ static void __hrtick_restart(struct rq *rq)
  {
        struct hrtimer *timer = &rq->hrtick_timer;
-       ktime_t time = hrtimer_get_softexpires(timer);
  
-       return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
+       hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
  }
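hrtimer_start_expires() re-arms the timer at the expiry value already stored on it, presumably recorded via hrtimer_set_expires() in hrtick_start() (not shown in this hunk), so the old soft/hard-expiry juggling collapses into one call. A minimal sketch of the assumed pairing:

    #include <linux/hrtimer.h>

    static struct hrtimer my_timer;     /* hypothetical timer */

    static void my_arm_at(ktime_t expires)
    {
            /* Record the absolute expiry on the timer itself ... */
            hrtimer_set_expires(&my_timer, expires);
            /* ... then (re)start it at exactly that stored value. */
            hrtimer_start_expires(&my_timer, HRTIMER_MODE_ABS_PINNED);
    }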
  
  /*
@@@ -440,8 -419,8 +419,8 @@@ void hrtick_start(struct rq *rq, u64 de
         * doesn't make sense. Rely on vruntime for fairness.
         */
        delay = max_t(u64, delay, 10000LL);
-       __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
-                       HRTIMER_MODE_REL_PINNED, 0);
+       hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
+                     HRTIMER_MODE_REL_PINNED);
  }
  
  static inline void init_hrtick(void)
@@@ -511,7 -490,7 +490,7 @@@ static bool set_nr_and_not_polling(stru
  static bool set_nr_if_polling(struct task_struct *p)
  {
        struct thread_info *ti = task_thread_info(p);
 -      typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
 +      typeof(ti->flags) old, val = READ_ONCE(ti->flags);
  
        for (;;) {
                if (!(val & _TIF_POLLING_NRFLAG))
@@@ -541,52 -520,6 +520,52 @@@ static bool set_nr_if_polling(struct ta
  #endif
  #endif
  
 +void wake_q_add(struct wake_q_head *head, struct task_struct *task)
 +{
 +      struct wake_q_node *node = &task->wake_q;
 +
 +      /*
 +       * Atomically grab the task; if ->wake_q is !nil already it means
 +       * it's already queued (either by us or someone else) and will get the
 +       * wakeup due to that.
 +       *
 +       * This cmpxchg() implies a full barrier, which pairs with the write
 +       * barrier implied by the wakeup in wake_up_q().
 +       */
 +      if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
 +              return;
 +
 +      get_task_struct(task);
 +
 +      /*
 +       * The head is context-local, so there can be no concurrency.
 +       */
 +      *head->lastp = node;
 +      head->lastp = &node->next;
 +}
 +
 +void wake_up_q(struct wake_q_head *head)
 +{
 +      struct wake_q_node *node = head->first;
 +
 +      while (node != WAKE_Q_TAIL) {
 +              struct task_struct *task;
 +
 +              task = container_of(node, struct task_struct, wake_q);
 +              BUG_ON(!task);
 +              /* task can safely be re-inserted now */
 +              node = node->next;
 +              task->wake_q.next = NULL;
 +
 +              /*
 +               * wake_up_process() implies a wmb() to pair with the queueing
 +               * in wake_q_add() so as not to miss wakeups.
 +               */
 +              wake_up_process(task);
 +              put_task_struct(task);
 +      }
 +}
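The wake_q helpers above let a path collect wakeups while holding a lock and issue them only after the lock is dropped. A minimal usage sketch, assuming the WAKE_Q() on-stack initializer from the matching sched.h change; my_lock and pick_waiter() are hypothetical:

    static DEFINE_RAW_SPINLOCK(my_lock);           /* hypothetical lock */
    static struct task_struct *pick_waiter(void);  /* hypothetical selection */

    static void wake_one_waiter(void)
    {
            struct task_struct *p;
            WAKE_Q(wake_q);                  /* context-local wake queue */

            raw_spin_lock(&my_lock);
            p = pick_waiter();               /* choose whom to wake, under the lock */
            if (p)
                    wake_q_add(&wake_q, p);  /* grabs a task reference, no wakeup yet */
            raw_spin_unlock(&my_lock);

            wake_up_q(&wake_q);              /* the real wake_up_process() calls */
    }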
 +
  /*
   * resched_curr - mark rq's current task 'to be rescheduled now'.
   *
@@@ -639,13 -572,12 +618,12 @@@ void resched_cpu(int cpu
   * selecting an idle cpu will add more delays to the timers than intended
   * (as that cpu's timer base may not be uptodate wrt jiffies etc).
   */
- int get_nohz_timer_target(int pinned)
+ int get_nohz_timer_target(void)
  {
-       int cpu = smp_processor_id();
-       int i;
+       int i, cpu = smp_processor_id();
        struct sched_domain *sd;
  
-       if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+       if (!idle_cpu(cpu))
                return cpu;
  
        rcu_read_lock();
@@@ -1095,7 -1027,7 +1073,7 @@@ void set_task_cpu(struct task_struct *p
                if (p->sched_class->migrate_task_rq)
                        p->sched_class->migrate_task_rq(p, new_cpu);
                p->se.nr_migrations++;
 -              perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
 +              perf_event_task_migrate(p);
        }
  
        __set_task_cpu(p, new_cpu);
@@@ -2151,15 -2083,12 +2129,15 @@@ void wake_up_new_task(struct task_struc
  
  #ifdef CONFIG_PREEMPT_NOTIFIERS
  
 +static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
 +
  /**
   * preempt_notifier_register - tell me when current is being preempted & rescheduled
   * @notifier: notifier struct to register
   */
  void preempt_notifier_register(struct preempt_notifier *notifier)
  {
 +      static_key_slow_inc(&preempt_notifier_key);
        hlist_add_head(&notifier->link, &current->preempt_notifiers);
  }
  EXPORT_SYMBOL_GPL(preempt_notifier_register);
   * preempt_notifier_unregister - no longer interested in preemption notifications
   * @notifier: notifier struct to unregister
   *
 - * This is safe to call from within a preemption notifier.
 + * This is *not* safe to call from within a preemption notifier.
   */
  void preempt_notifier_unregister(struct preempt_notifier *notifier)
  {
        hlist_del(&notifier->link);
 +      static_key_slow_dec(&preempt_notifier_key);
  }
  EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
  
 -static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 +static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
  {
        struct preempt_notifier *notifier;
  
                notifier->ops->sched_in(notifier, raw_smp_processor_id());
  }
  
 +static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 +{
 +      if (static_key_false(&preempt_notifier_key))
 +              __fire_sched_in_preempt_notifiers(curr);
 +}
 +
  static void
 -fire_sched_out_preempt_notifiers(struct task_struct *curr,
 -                               struct task_struct *next)
 +__fire_sched_out_preempt_notifiers(struct task_struct *curr,
 +                                 struct task_struct *next)
  {
        struct preempt_notifier *notifier;
  
                notifier->ops->sched_out(notifier, next);
  }
  
 +static __always_inline void
 +fire_sched_out_preempt_notifiers(struct task_struct *curr,
 +                               struct task_struct *next)
 +{
 +      if (static_key_false(&preempt_notifier_key))
 +              __fire_sched_out_preempt_notifiers(curr, next);
 +}
 +
  #else /* !CONFIG_PREEMPT_NOTIFIERS */
  
 -static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 +static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
  {
  }
  
 -static void
 +static inline void
  fire_sched_out_preempt_notifiers(struct task_struct *curr,
                                 struct task_struct *next)
  {
@@@ -2461,9 -2375,9 +2439,9 @@@ unsigned long nr_iowait_cpu(int cpu
  
  void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
  {
 -      struct rq *this = this_rq();
 -      *nr_waiters = atomic_read(&this->nr_iowait);
 -      *load = this->cpu_load[0];
 +      struct rq *rq = this_rq();
 +      *nr_waiters = atomic_read(&rq->nr_iowait);
 +      *load = rq->load.weight;
  }
  
  #ifdef CONFIG_SMP
@@@ -2561,7 -2475,6 +2539,7 @@@ void scheduler_tick(void
        update_rq_clock(rq);
        curr->sched_class->task_tick(rq, curr, 0);
        update_cpu_load_active(rq);
 +      calc_global_load_tick(rq);
        raw_spin_unlock(&rq->lock);
  
        perf_event_task_tick();
  u64 scheduler_tick_max_deferment(void)
  {
        struct rq *rq = this_rq();
 -      unsigned long next, now = ACCESS_ONCE(jiffies);
 +      unsigned long next, now = READ_ONCE(jiffies);
  
        next = rq->last_sched_tick + HZ;
  
@@@ -2791,7 -2704,9 +2769,7 @@@ again
   *          - return from syscall or exception to user-space
   *          - return from interrupt-handler to user-space
   *
 - * WARNING: all callers must re-check need_resched() afterward and reschedule
 - * accordingly in case an event triggered the need for rescheduling (such as
 - * an interrupt waking up a task) while preemption was disabled in __schedule().
 + * WARNING: must be called with preemption disabled!
   */
  static void __sched __schedule(void)
  {
        struct rq *rq;
        int cpu;
  
 -      preempt_disable();
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        rcu_note_context_switch();
                raw_spin_unlock_irq(&rq->lock);
  
        post_schedule(rq);
 -
 -      sched_preempt_enable_no_resched();
  }
  
  static inline void sched_submit_work(struct task_struct *tsk)
@@@ -2883,9 -2801,7 +2861,9 @@@ asmlinkage __visible void __sched sched
  
        sched_submit_work(tsk);
        do {
 +              preempt_disable();
                __schedule();
 +              sched_preempt_enable_no_resched();
        } while (need_resched());
  }
  EXPORT_SYMBOL(schedule);
@@@ -2924,14 -2840,15 +2902,14 @@@ void __sched schedule_preempt_disabled(
  static void __sched notrace preempt_schedule_common(void)
  {
        do {
 -              __preempt_count_add(PREEMPT_ACTIVE);
 +              preempt_active_enter();
                __schedule();
 -              __preempt_count_sub(PREEMPT_ACTIVE);
 +              preempt_active_exit();
  
                /*
                 * Check again in case we missed a preemption opportunity
                 * between schedule and now.
                 */
 -              barrier();
        } while (need_resched());
  }
  
@@@ -2955,8 -2872,9 +2933,8 @@@ asmlinkage __visible void __sched notra
  NOKPROBE_SYMBOL(preempt_schedule);
  EXPORT_SYMBOL(preempt_schedule);
  
 -#ifdef CONFIG_CONTEXT_TRACKING
  /**
 - * preempt_schedule_context - preempt_schedule called by tracing
 + * preempt_schedule_notrace - preempt_schedule called by tracing
   *
   * The tracing infrastructure uses preempt_enable_notrace to prevent
   * recursion and tracing preempt enabling caused by the tracing
   * instead of preempt_schedule() to exit user context if needed before
   * calling the scheduler.
   */
 -asmlinkage __visible void __sched notrace preempt_schedule_context(void)
 +asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
  {
        enum ctx_state prev_ctx;
  
                return;
  
        do {
 -              __preempt_count_add(PREEMPT_ACTIVE);
 +              /*
 +               * Use raw __preempt_count() ops that don't call functions.
 +               * We can't call functions before disabling preemption, which
 +               * disarms preemption tracing recursion.
 +               */
 +              __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
 +              barrier();
                /*
                 * Needs preempt disabled in case user_exit() is traced
                 * and the tracer calls preempt_enable_notrace() causing
                __schedule();
                exception_exit(prev_ctx);
  
 -              __preempt_count_sub(PREEMPT_ACTIVE);
                barrier();
 +              __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
        } while (need_resched());
  }
 -EXPORT_SYMBOL_GPL(preempt_schedule_context);
 -#endif /* CONFIG_CONTEXT_TRACKING */
 +EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
  
  #endif /* CONFIG_PREEMPT */
  
@@@ -3017,11 -2930,17 +2995,11 @@@ asmlinkage __visible void __sched preem
        prev_state = exception_enter();
  
        do {
 -              __preempt_count_add(PREEMPT_ACTIVE);
 +              preempt_active_enter();
                local_irq_enable();
                __schedule();
                local_irq_disable();
 -              __preempt_count_sub(PREEMPT_ACTIVE);
 -
 -              /*
 -               * Check again in case we missed a preemption opportunity
 -               * between schedule and now.
 -               */
 -              barrier();
 +              preempt_active_exit();
        } while (need_resched());
  
        exception_exit(prev_state);
@@@ -3099,6 -3018,7 +3077,6 @@@ void rt_mutex_setprio(struct task_struc
                if (!dl_prio(p->normal_prio) ||
                    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                        p->dl.dl_boosted = 1;
 -                      p->dl.dl_throttled = 0;
                        enqueue_flag = ENQUEUE_REPLENISH;
                } else
                        p->dl.dl_boosted = 0;
@@@ -4447,7 -4367,10 +4425,7 @@@ long __sched io_schedule_timeout(long t
        long ret;
  
        current->in_iowait = 1;
 -      if (old_iowait)
 -              blk_schedule_flush_plug(current);
 -      else
 -              blk_flush_plug(current);
 +      blk_schedule_flush_plug(current);
  
        delayacct_blkio_start();
        rq = raw_rq();
@@@ -5372,7 -5295,7 +5350,7 @@@ static struct notifier_block migration_
        .priority = CPU_PRI_MIGRATION,
  };
  
 -static void __cpuinit set_cpu_rq_start_time(void)
 +static void set_cpu_rq_start_time(void)
  {
        int cpu = smp_processor_id();
        struct rq *rq = cpu_rq(cpu);
@@@ -7126,8 -7049,6 +7104,6 @@@ void __init sched_init_smp(void
  }
  #endif /* CONFIG_SMP */
  
- const_debug unsigned int sysctl_timer_migration = 1;
  int in_sched_functions(unsigned long addr)
  {
        return in_lock_functions(addr) ||
@@@ -7792,11 -7713,11 +7768,11 @@@ static long sched_group_rt_runtime(stru
        return rt_runtime_us;
  }
  
 -static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
 +static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
  {
        u64 rt_runtime, rt_period;
  
 -      rt_period = (u64)rt_period_us * NSEC_PER_USEC;
 +      rt_period = rt_period_us * NSEC_PER_USEC;
        rt_runtime = tg->rt_bandwidth.rt_runtime;
  
        return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
@@@ -8163,10 -8084,8 +8139,8 @@@ static int tg_set_cfs_bandwidth(struct 
  
        __refill_cfs_bandwidth_runtime(cfs_b);
        /* restart the period timer (if active) to handle new period expiry */
-       if (runtime_enabled && cfs_b->timer_active) {
-               /* force a reprogram */
-               __start_cfs_bandwidth(cfs_b, true);
-       }
+       if (runtime_enabled)
+               start_cfs_bandwidth(cfs_b);
        raw_spin_unlock_irq(&cfs_b->lock);
  
        for_each_online_cpu(i) {
diff --combined kernel/sched/deadline.c
index 392e8fb94db36ef32aad026510d3ebfe3d89f6ef,21d6907d2b9fd07c47d9e9c9125d6d5b47f1b499..eac20c557a55cc83f8e9d7e62578868ba9436aff
@@@ -503,8 -503,6 +503,6 @@@ static int start_dl_timer(struct sched_
        struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
        struct rq *rq = rq_of_dl_rq(dl_rq);
        ktime_t now, act;
-       ktime_t soft, hard;
-       unsigned long range;
        s64 delta;
  
        if (boosted)
        if (ktime_us_delta(act, now) < 0)
                return 0;
  
-       hrtimer_set_expires(&dl_se->dl_timer, act);
+       hrtimer_start(&dl_se->dl_timer, act, HRTIMER_MODE_ABS);
  
-       soft = hrtimer_get_softexpires(&dl_se->dl_timer);
-       hard = hrtimer_get_expires(&dl_se->dl_timer);
-       range = ktime_to_ns(ktime_sub(hard, soft));
-       __hrtimer_start_range_ns(&dl_se->dl_timer, soft,
-                                range, HRTIMER_MODE_ABS, 0);
-       return hrtimer_active(&dl_se->dl_timer);
+       return 1;
  }
  
  /*
@@@ -640,7 -632,7 +632,7 @@@ void init_dl_task_timer(struct sched_dl
  }
  
  static
 -int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
 +int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
  {
        return (dl_se->runtime <= 0);
  }
@@@ -684,7 -676,7 +676,7 @@@ static void update_curr_dl(struct rq *r
        sched_rt_avg_update(rq, delta_exec);
  
        dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
 -      if (dl_runtime_exceeded(rq, dl_se)) {
 +      if (dl_runtime_exceeded(dl_se)) {
                dl_se->dl_throttled = 1;
                __dequeue_task_dl(rq, curr, 0);
                if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
@@@ -995,7 -987,7 +987,7 @@@ select_task_rq_dl(struct task_struct *p
        rq = cpu_rq(cpu);
  
        rcu_read_lock();
 -      curr = ACCESS_ONCE(rq->curr); /* unlocked access */
 +      curr = READ_ONCE(rq->curr); /* unlocked access */
  
        /*
         * If we are dealing with a -deadline task, we must
            (p->nr_cpus_allowed > 1)) {
                int target = find_later_rq(p);
  
 -              if (target != -1)
 +              if (target != -1 &&
 +                              dl_time_before(p->dl.deadline,
 +                                      cpu_rq(target)->dl.earliest_dl.curr))
                        cpu = target;
        }
        rcu_read_unlock();
@@@ -1232,32 -1222,6 +1224,32 @@@ next_node
        return NULL;
  }
  
 +/*
 + * Return the earliest pushable task on this rq, provided it is suitable to
 + * be executed on the given CPU; NULL otherwise:
 + */
 +static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
 +{
 +      struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost;
 +      struct task_struct *p = NULL;
 +
 +      if (!has_pushable_dl_tasks(rq))
 +              return NULL;
 +
 +next_node:
 +      if (next_node) {
 +              p = rb_entry(next_node, struct task_struct, pushable_dl_tasks);
 +
 +              if (pick_dl_task(rq, p, cpu))
 +                      return p;
 +
 +              next_node = rb_next(next_node);
 +              goto next_node;
 +      }
 +
 +      return NULL;
 +}
 +
  static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
  
  static int find_later_rq(struct task_struct *task)
@@@ -1361,17 -1325,6 +1353,17 @@@ static struct rq *find_lock_later_rq(st
  
                later_rq = cpu_rq(cpu);
  
 +              if (!dl_time_before(task->dl.deadline,
 +                                      later_rq->dl.earliest_dl.curr)) {
 +                      /*
 +                       * Target rq has tasks of equal or earlier deadline,
 +                       * retrying does not release any lock and is unlikely
 +                       * to yield a different result.
 +                       */
 +                      later_rq = NULL;
 +                      break;
 +              }
 +
                /* Retry if something changed. */
                if (double_lock_balance(rq, later_rq)) {
                        if (unlikely(task_rq(task) != rq ||
@@@ -1553,7 -1506,7 +1545,7 @@@ static int pull_dl_task(struct rq *this
                if (src_rq->dl.dl_nr_running <= 1)
                        goto skip;
  
 -              p = pick_next_earliest_dl_task(src_rq, this_cpu);
 +              p = pick_earliest_pushable_dl_task(src_rq, this_cpu);
  
                /*
                 * We found a task to be pulled if:
@@@ -1698,7 -1651,7 +1690,7 @@@ static void rq_offline_dl(struct rq *rq
        cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
  }
  
 -void init_sched_dl_class(void)
 +void __init init_sched_dl_class(void)
  {
        unsigned int i;
  
diff --combined kernel/sched/debug.c
index 704683cc90422d096d8a591326db9a880574ed23,f94724eda407ecc656217c4b6ce0ebeb656f8e8b..315c68e015d955d6227a83b6b951482cffd8a68e
@@@ -132,14 -132,12 +132,14 @@@ print_task(struct seq_file *m, struct r
                p->prio);
  #ifdef CONFIG_SCHEDSTATS
        SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
 -              SPLIT_NS(p->se.vruntime),
 +              SPLIT_NS(p->se.statistics.wait_sum),
                SPLIT_NS(p->se.sum_exec_runtime),
                SPLIT_NS(p->se.statistics.sum_sleep_runtime));
  #else
 -      SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
 -              0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
 +      SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
 +              0LL, 0L,
 +              SPLIT_NS(p->se.sum_exec_runtime),
 +              0LL, 0L);
  #endif
  #ifdef CONFIG_NUMA_BALANCING
        SEQ_printf(m, " %d", task_node(p));
@@@ -158,7 -156,7 +158,7 @@@ static void print_rq(struct seq_file *m
        SEQ_printf(m,
        "\nrunnable tasks:\n"
        "            task   PID         tree-key  switches  prio"
 -      "     exec-runtime         sum-exec        sum-sleep\n"
 +      "     wait-time             sum-exec        sum-sleep\n"
        "------------------------------------------------------"
        "----------------------------------------------------\n");
  
@@@ -232,8 -230,6 +232,6 @@@ void print_cfs_rq(struct seq_file *m, i
  #endif
  #endif
  #ifdef CONFIG_CFS_BANDWIDTH
-       SEQ_printf(m, "  .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
-                       cfs_rq->tg->cfs_bandwidth.timer_active);
        SEQ_printf(m, "  .%-30s: %d\n", "throttled",
                        cfs_rq->throttled);
        SEQ_printf(m, "  .%-30s: %d\n", "throttle_count",
@@@ -584,7 -580,6 +582,7 @@@ void proc_sched_show_task(struct task_s
        nr_switches = p->nvcsw + p->nivcsw;
  
  #ifdef CONFIG_SCHEDSTATS
 +      PN(se.statistics.sum_sleep_runtime);
        PN(se.statistics.wait_start);
        PN(se.statistics.sleep_start);
        PN(se.statistics.block_start);
diff --combined kernel/sched/fair.c
index 433061d984eac6ce5322714ab29bd3beb148eaf4,69be2825262d5df3e4d859e3faabefe3064863e2..40a7fcbf491eb7d1f5735d0e9efd25c23d4d60a9
@@@ -141,9 -141,9 +141,9 @@@ static inline void update_load_set(stru
   *
   * This idea comes from the SD scheduler of Con Kolivas:
   */
 -static int get_update_sysctl_factor(void)
 +static unsigned int get_update_sysctl_factor(void)
  {
 -      unsigned int cpus = min_t(int, num_online_cpus(), 8);
 +      unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
        unsigned int factor;
  
        switch (sysctl_sched_tunable_scaling) {
@@@ -576,7 -576,7 +576,7 @@@ int sched_proc_update_handler(struct ct
                loff_t *ppos)
  {
        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 -      int factor = get_update_sysctl_factor();
 +      unsigned int factor = get_update_sysctl_factor();
  
        if (ret || !write)
                return ret;
@@@ -834,7 -834,7 +834,7 @@@ static unsigned int task_nr_scan_window
  
  static unsigned int task_scan_min(struct task_struct *p)
  {
 -      unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
 +      unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
        unsigned int scan, floor;
        unsigned int windows = 1;
  
@@@ -1198,9 -1198,11 +1198,9 @@@ static void task_numa_assign(struct tas
  static bool load_too_imbalanced(long src_load, long dst_load,
                                struct task_numa_env *env)
  {
 +      long imb, old_imb;
 +      long orig_src_load, orig_dst_load;
        long src_capacity, dst_capacity;
 -      long orig_src_load;
 -      long load_a, load_b;
 -      long moved_load;
 -      long imb;
  
        /*
         * The load is corrected for the CPU capacity available on each node.
        dst_capacity = env->dst_stats.compute_capacity;
  
        /* We care about the slope of the imbalance, not the direction. */
 -      load_a = dst_load;
 -      load_b = src_load;
 -      if (load_a < load_b)
 -              swap(load_a, load_b);
 +      if (dst_load < src_load)
 +              swap(dst_load, src_load);
  
        /* Is the difference below the threshold? */
 -      imb = load_a * src_capacity * 100 -
 -              load_b * dst_capacity * env->imbalance_pct;
 +      imb = dst_load * src_capacity * 100 -
 +            src_load * dst_capacity * env->imbalance_pct;
        if (imb <= 0)
                return false;
  
        /*
         * The imbalance is above the allowed threshold.
 -       * Allow a move that brings us closer to a balanced situation,
 -       * without moving things past the point of balance.
 +       * Compare it with the old imbalance.
         */
        orig_src_load = env->src_stats.load;
 +      orig_dst_load = env->dst_stats.load;
  
 -      /*
 -       * In a task swap, there will be one load moving from src to dst,
 -       * and another moving back. This is the net sum of both moves.
 -       * A simple task move will always have a positive value.
 -       * Allow the move if it brings the system closer to a balanced
 -       * situation, without crossing over the balance point.
 -       */
 -      moved_load = orig_src_load - src_load;
 +      if (orig_dst_load < orig_src_load)
 +              swap(orig_dst_load, orig_src_load);
  
 -      if (moved_load > 0)
 -              /* Moving src -> dst. Did we overshoot balance? */
 -              return src_load * dst_capacity < dst_load * src_capacity;
 -      else
 -              /* Moving dst -> src. Did we overshoot balance? */
 -              return dst_load * src_capacity < src_load * dst_capacity;
 +      old_imb = orig_dst_load * src_capacity * 100 -
 +                orig_src_load * dst_capacity * env->imbalance_pct;
 +
 +      /* Would this change make things worse? */
 +      return (imb > old_imb);
  }
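A worked example of the new check, with hypothetical numbers, equal capacities C on both nodes and imbalance_pct = 125: for src_load = 100 and dst_load = 130, imb = 130*C*100 - 100*C*125 = 500*C > 0, so the proposed placement is over the threshold. If the original loads were 90 and 140, old_imb = 140*C*100 - 90*C*125 = 2750*C; since imb < old_imb the move still shrinks the imbalance and is therefore allowed.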
  
  /*
@@@ -1398,30 -1409,6 +1398,30 @@@ static void task_numa_find_cpu(struct t
        }
  }
  
 +/* Only move tasks to a NUMA node less busy than the current node. */
 +static bool numa_has_capacity(struct task_numa_env *env)
 +{
 +      struct numa_stats *src = &env->src_stats;
 +      struct numa_stats *dst = &env->dst_stats;
 +
 +      if (src->has_free_capacity && !dst->has_free_capacity)
 +              return false;
 +
 +      /*
 +       * Only consider a task move if the source has a higher load
 +       * than the destination, corrected for CPU capacity on each node.
 +       *
 +       *      src->load                dst->load
 +       * --------------------- vs ---------------------
 +       * src->compute_capacity    dst->compute_capacity
 +       */
 +      if (src->load * dst->compute_capacity >
 +          dst->load * src->compute_capacity)
 +              return true;
 +
 +      return false;
 +}
 +
  static int task_numa_migrate(struct task_struct *p)
  {
        struct task_numa_env env = {
        update_numa_stats(&env.dst_stats, env.dst_nid);
  
        /* Try to find a spot on the preferred nid. */
 -      task_numa_find_cpu(&env, taskimp, groupimp);
 +      if (numa_has_capacity(&env))
 +              task_numa_find_cpu(&env, taskimp, groupimp);
  
        /*
         * Look at other nodes in these cases:
                        env.dist = dist;
                        env.dst_nid = nid;
                        update_numa_stats(&env.dst_stats, env.dst_nid);
 -                      task_numa_find_cpu(&env, taskimp, groupimp);
 +                      if (numa_has_capacity(&env))
 +                              task_numa_find_cpu(&env, taskimp, groupimp);
                }
        }
  
@@@ -1809,12 -1794,7 +1809,12 @@@ static void task_numa_placement(struct 
        u64 runtime, period;
        spinlock_t *group_lock = NULL;
  
 -      seq = ACCESS_ONCE(p->mm->numa_scan_seq);
 +      /*
 +       * The p->mm->numa_scan_seq field gets updated without
 +       * exclusive access. Use READ_ONCE() here to ensure
 +       * that the field is read in a single access:
 +       */
 +      seq = READ_ONCE(p->mm->numa_scan_seq);
        if (p->numa_scan_seq == seq)
                return;
        p->numa_scan_seq = seq;
@@@ -1958,7 -1938,7 +1958,7 @@@ static void task_numa_group(struct task
        }
  
        rcu_read_lock();
 -      tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
 +      tsk = READ_ONCE(cpu_rq(cpu)->curr);
  
        if (!cpupid_match_pid(tsk, cpupid))
                goto no_join;
@@@ -2127,15 -2107,7 +2127,15 @@@ void task_numa_fault(int last_cpupid, i
  
  static void reset_ptenuma_scan(struct task_struct *p)
  {
 -      ACCESS_ONCE(p->mm->numa_scan_seq)++;
 +      /*
 +       * We only did a read acquisition of the mmap sem, so
 +       * p->mm->numa_scan_seq is written to without exclusive access
 +       * and the update is not guaranteed to be atomic. That's not
 +       * much of an issue though, since this is just used for
 +       * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
 +       * expensive, to avoid any form of compiler optimizations:
 +       */
 +      WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
        p->mm->numa_scan_offset = 0;
  }
  
@@@ -2209,7 -2181,7 +2209,7 @@@ void task_numa_work(struct callback_hea
        }
        for (; vma; vma = vma->vm_next) {
                if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
 -                      is_vm_hugetlb_page(vma)) {
 +                      is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
                        continue;
                }
  
@@@ -3504,16 -3476,7 +3504,7 @@@ static int assign_cfs_rq_runtime(struc
        if (cfs_b->quota == RUNTIME_INF)
                amount = min_amount;
        else {
-               /*
-                * If the bandwidth pool has become inactive, then at least one
-                * period must have elapsed since the last consumption.
-                * Refresh the global state and ensure bandwidth timer becomes
-                * active.
-                */
-               if (!cfs_b->timer_active) {
-                       __refill_cfs_bandwidth_runtime(cfs_b);
-                       __start_cfs_bandwidth(cfs_b, false);
-               }
+               start_cfs_bandwidth(cfs_b);
  
                if (cfs_b->runtime > 0) {
                        amount = min(cfs_b->runtime, min_amount);
@@@ -3662,6 -3625,7 +3653,7 @@@ static void throttle_cfs_rq(struct cfs_
        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
        struct sched_entity *se;
        long task_delta, dequeue = 1;
+       bool empty;
  
        se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
  
        cfs_rq->throttled = 1;
        cfs_rq->throttled_clock = rq_clock(rq);
        raw_spin_lock(&cfs_b->lock);
+       empty = list_empty(&cfs_rq->throttled_list);
        /*
         * Add to the _head_ of the list, so that an already-started
         * distribute_cfs_runtime will not see us
         */
        list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-       if (!cfs_b->timer_active)
-               __start_cfs_bandwidth(cfs_b, false);
+       /*
+        * If we're the first throttled task, make sure the bandwidth
+        * timer is running.
+        */
+       if (empty)
+               start_cfs_bandwidth(cfs_b);
        raw_spin_unlock(&cfs_b->lock);
  }
  
@@@ -3812,13 -3784,6 +3812,6 @@@ static int do_sched_cfs_period_timer(st
        if (cfs_b->idle && !throttled)
                goto out_deactivate;
  
-       /*
-        * if we have relooped after returning idle once, we need to update our
-        * status as actually running, so that other cpus doing
-        * __start_cfs_bandwidth will stop trying to cancel us.
-        */
-       cfs_b->timer_active = 1;
        __refill_cfs_bandwidth_runtime(cfs_b);
  
        if (!throttled) {
        return 0;
  
  out_deactivate:
-       cfs_b->timer_active = 0;
        return 1;
  }
  
@@@ -3878,7 -3842,7 +3870,7 @@@ static const u64 cfs_bandwidth_slack_pe
   * Are we near the end of the current quota period?
   *
   * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
-  * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
+  * hrtimer base being cleared by hrtimer_start. In the case of
   * migrate_hrtimers, base is never cleared, so we are fine.
   */
  static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
@@@ -3906,8 -3870,9 +3898,9 @@@ static void start_cfs_slack_bandwidth(s
        if (runtime_refresh_within(cfs_b, min_left))
                return;
  
-       start_bandwidth_timer(&cfs_b->slack_timer,
-                               ns_to_ktime(cfs_bandwidth_slack_period));
+       hrtimer_start(&cfs_b->slack_timer,
+                       ns_to_ktime(cfs_bandwidth_slack_period),
+                       HRTIMER_MODE_REL);
  }
  
  /* we know any runtime found here is valid as update_curr() precedes return */
@@@ -4027,6 -3992,7 +4020,7 @@@ static enum hrtimer_restart sched_cfs_s
  {
        struct cfs_bandwidth *cfs_b =
                container_of(timer, struct cfs_bandwidth, slack_timer);
        do_sched_cfs_slack_timer(cfs_b);
  
        return HRTIMER_NORESTART;
@@@ -4036,20 -4002,19 +4030,19 @@@ static enum hrtimer_restart sched_cfs_p
  {
        struct cfs_bandwidth *cfs_b =
                container_of(timer, struct cfs_bandwidth, period_timer);
-       ktime_t now;
        int overrun;
        int idle = 0;
  
        raw_spin_lock(&cfs_b->lock);
        for (;;) {
-               now = hrtimer_cb_get_time(timer);
-               overrun = hrtimer_forward(timer, now, cfs_b->period);
+               overrun = hrtimer_forward_now(timer, cfs_b->period);
                if (!overrun)
                        break;
  
                idle = do_sched_cfs_period_timer(cfs_b, overrun);
        }
+       if (idle)
+               cfs_b->period_active = 0;
        raw_spin_unlock(&cfs_b->lock);
  
        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
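
The period timer above keeps calling hrtimer_forward_now() until the expiry is back in the future, handling each batch of overruns on the way. A standalone sketch of that arithmetic and loop shape; forward_now() here is a userspace re-creation under those assumptions, not the kernel helper:

#include <stdio.h>

/* Userspace re-creation of the arithmetic behind hrtimer_forward_now():
 * push *expires past now in whole period steps and return how many
 * periods were skipped (the overrun count). */
static long long forward_now(long long *expires, long long now, long long period)
{
	long long delta = now - *expires;

	if (delta < 0)
		return 0;			/* expiry still in the future */

	long long overrun = delta / period + 1;
	*expires += overrun * period;
	return overrun;
}

int main(void)
{
	long long expires = 100, period = 100;
	long long now = 350;			/* we are 2.5 periods late */
	long long overrun;

	/* Same shape as the period timer callback: forward until the
	 * expiry is in the future again, handling each batch of overruns. */
	while ((overrun = forward_now(&expires, now, period)))
		printf("handled %lld overrun(s), next expiry %lld\n",
		       overrun, expires);
	return 0;
}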
@@@ -4063,7 -4028,7 +4056,7 @@@ void init_cfs_bandwidth(struct cfs_band
        cfs_b->period = ns_to_ktime(default_cfs_period());
  
        INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
-       hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
        cfs_b->period_timer.function = sched_cfs_period_timer;
        hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        cfs_b->slack_timer.function = sched_cfs_slack_timer;
@@@ -4075,28 -4040,15 +4068,15 @@@ static void init_cfs_rq_runtime(struct 
        INIT_LIST_HEAD(&cfs_rq->throttled_list);
  }
  
- /* requires cfs_b->lock, may release to reprogram timer */
- void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
+ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
  {
-       /*
-        * The timer may be active because we're trying to set a new bandwidth
-        * period or because we're racing with the tear-down path
-        * (timer_active==0 becomes visible before the hrtimer call-back
-        * terminates).  In either case we ensure that it's re-programmed
-        */
-       while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
-              hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
-               /* bounce the lock to allow do_sched_cfs_period_timer to run */
-               raw_spin_unlock(&cfs_b->lock);
-               cpu_relax();
-               raw_spin_lock(&cfs_b->lock);
-               /* if someone else restarted the timer then we're done */
-               if (!force && cfs_b->timer_active)
-                       return;
-       }
+       lockdep_assert_held(&cfs_b->lock);
  
-       cfs_b->timer_active = 1;
-       start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+       if (!cfs_b->period_active) {
+               cfs_b->period_active = 1;
+               hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+               hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
+       }
  }
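
A userspace analogue of the new start_cfs_bandwidth() shape: arm the period timer only if it is not already active, with the caller holding the bandwidth lock. The timer itself is stubbed out and the names are illustrative:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct bandwidth {
	pthread_mutex_t lock;
	bool period_active;
};

static void arm_period_timer(struct bandwidth *b)
{
	/* stands in for hrtimer_forward_now() + hrtimer_start_expires() */
	printf("period timer armed\n");
}

static void start_bandwidth(struct bandwidth *b)
{
	/* caller must hold b->lock, mirroring lockdep_assert_held() */
	if (!b->period_active) {
		b->period_active = true;
		arm_period_timer(b);
	}
}

int main(void)
{
	struct bandwidth b = { .lock = PTHREAD_MUTEX_INITIALIZER,
			       .period_active = false };

	pthread_mutex_lock(&b.lock);
	start_bandwidth(&b);	/* arms the timer */
	start_bandwidth(&b);	/* no-op: already active */
	pthread_mutex_unlock(&b.lock);
	return 0;
}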
  
  static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@@ -4351,189 -4303,6 +4331,189 @@@ static void dequeue_task_fair(struct r
  }
  
  #ifdef CONFIG_SMP
 +
 +/*
 + * per rq 'load' array crap; XXX kill this.
 + */
 +
 +/*
 + * The exact cpuload at various idx values, calculated at every tick would be
 + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
 + *
 + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
 + * on nth tick when cpu may be busy, then we have:
 + * load = ((2^idx - 1) / 2^idx)^(n-1) * load
 + * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
 + *
 + * decay_load_missed() below does efficient calculation of
 + * load = ((2^idx - 1) / 2^idx)^(n-1) * load
 + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
 + *
 + * The calculation is approximated on a 128 point scale.
 + * degrade_zero_ticks is the number of ticks after which load at any
 + * particular idx is approximated to be zero.
 + * degrade_factor is a precomputed table, a row for each load idx.
 + * Each column corresponds to degradation factor for a power of two ticks,
 + * based on 128 point scale.
 + * Example:
 + * row 2, col 3 (=12) says that the degradation at load idx 2 after
 + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
 + *
 + * With this power of 2 load factors, we can degrade the load n times
 + * by looking at 1 bits in n and doing as many mult/shift instead of
 + * n mult/shifts needed by the exact degradation.
 + */
 +#define DEGRADE_SHIFT         7
 +static const unsigned char
 +              degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
 +static const unsigned char
 +              degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
 +                                      {0, 0, 0, 0, 0, 0, 0, 0},
 +                                      {64, 32, 8, 0, 0, 0, 0, 0},
 +                                      {96, 72, 40, 12, 1, 0, 0},
 +                                      {112, 98, 75, 43, 15, 1, 0},
 +                                      {120, 112, 98, 76, 45, 16, 2} };
 +
 +/*
 + * Update cpu_load for any missed ticks due to tickless idle. The backlog
 + * builds up while the CPU is idle, so we just decay the old load without
 + * adding any new load.
 + */
 +static unsigned long
 +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
 +{
 +      int j = 0;
 +
 +      if (!missed_updates)
 +              return load;
 +
 +      if (missed_updates >= degrade_zero_ticks[idx])
 +              return 0;
 +
 +      if (idx == 1)
 +              return load >> missed_updates;
 +
 +      while (missed_updates) {
 +              if (missed_updates % 2)
 +                      load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
 +
 +              missed_updates >>= 1;
 +              j++;
 +      }
 +      return load;
 +}
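
A throwaway check of the worked example in the comment above (row 2, col 3 = 12): the table entry 12/128 is meant to approximate the exact decay factor (3/4)^8 on the 128-point scale.

#include <math.h>
#include <stdio.h>

int main(void)
{
	int idx = 2, ticks = 8, table_entry = 12;
	/* exact per-tick factor is (2^idx - 1) / 2^idx, applied 'ticks' times */
	double exact = pow((double)((1 << idx) - 1) / (1 << idx), ticks);

	printf("exact (3/4)^8 = %.4f\n", exact);		/* ~0.1001 */
	printf("table 12/128  = %.4f\n", table_entry / 128.0);	/* ~0.0938 */
	return 0;
}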
 +
 +/*
 + * Update rq->cpu_load[] statistics. This function is usually called every
 + * scheduler tick (TICK_NSEC). With tickless idle this will not be called
 + * every tick. We fix it up based on jiffies.
 + */
 +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
 +                            unsigned long pending_updates)
 +{
 +      int i, scale;
 +
 +      this_rq->nr_load_updates++;
 +
 +      /* Update our load: */
 +      this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
 +      for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
 +              unsigned long old_load, new_load;
 +
 +              /* scale is effectively 1 << i now, and >> i divides by scale */
 +
 +              old_load = this_rq->cpu_load[i];
 +              old_load = decay_load_missed(old_load, pending_updates - 1, i);
 +              new_load = this_load;
 +              /*
 +               * Round up the averaging division if load is increasing. This
 +               * prevents us from getting stuck on 9 if the load is 10, for
 +               * example.
 +               */
 +              if (new_load > old_load)
 +                      new_load += scale - 1;
 +
 +              this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
 +      }
 +
 +      sched_avg_update(this_rq);
 +}
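
The round-up in the averaging above is what lets cpu_load[] converge onto a higher steady load instead of stalling one below it. A small userspace demonstration of the same formula at idx 1 (scale 2), with and without the round-up:

#include <stdio.h>

/* Apply one step of cpu_load[i] = (old*(scale-1) + new) >> i,
 * optionally rounding up when the load is increasing. */
static unsigned long avg(unsigned long old, unsigned long new, int i, int round_up)
{
	unsigned long scale = 1UL << i;

	if (round_up && new > old)
		new += scale - 1;
	return (old * (scale - 1) + new) >> i;
}

int main(void)
{
	unsigned long plain = 0, rounded = 0;

	for (int tick = 0; tick < 64; tick++) {
		plain = avg(plain, 10, 1, 0);
		rounded = avg(rounded, 10, 1, 1);
	}
	printf("after 64 ticks of load 10: plain=%lu rounded=%lu\n",
	       plain, rounded);	/* plain stalls at 9, rounded reaches 10 */
	return 0;
}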
 +
 +#ifdef CONFIG_NO_HZ_COMMON
 +/*
 + * There is no sane way to deal with nohz on smp when using jiffies because the
 + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
 + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
 + *
 + * Therefore we cannot use the delta approach from the regular tick since that
 + * would seriously skew the load calculation. However we'll make do for those
 + * updates happening while idle (nohz_idle_balance) or coming out of idle
 + * (tick_nohz_idle_exit).
 + *
 + * This means we might still be one tick off for nohz periods.
 + */
 +
 +/*
 + * Called from nohz_idle_balance() to update the load ratings before doing the
 + * idle balance.
 + */
 +static void update_idle_cpu_load(struct rq *this_rq)
 +{
 +      unsigned long curr_jiffies = READ_ONCE(jiffies);
 +      unsigned long load = this_rq->cfs.runnable_load_avg;
 +      unsigned long pending_updates;
 +
 +      /*
 +       * bail if there's load or we're actually up-to-date.
 +       */
 +      if (load || curr_jiffies == this_rq->last_load_update_tick)
 +              return;
 +
 +      pending_updates = curr_jiffies - this_rq->last_load_update_tick;
 +      this_rq->last_load_update_tick = curr_jiffies;
 +
 +      __update_cpu_load(this_rq, load, pending_updates);
 +}
 +
 +/*
 + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
 + */
 +void update_cpu_load_nohz(void)
 +{
 +      struct rq *this_rq = this_rq();
 +      unsigned long curr_jiffies = READ_ONCE(jiffies);
 +      unsigned long pending_updates;
 +
 +      if (curr_jiffies == this_rq->last_load_update_tick)
 +              return;
 +
 +      raw_spin_lock(&this_rq->lock);
 +      pending_updates = curr_jiffies - this_rq->last_load_update_tick;
 +      if (pending_updates) {
 +              this_rq->last_load_update_tick = curr_jiffies;
 +              /*
 +               * We were idle, which means a load of 0; the current load
 +               * might be !0 due to remote wakeups and the like.
 +               */
 +              __update_cpu_load(this_rq, 0, pending_updates);
 +      }
 +      raw_spin_unlock(&this_rq->lock);
 +}
 +#endif /* CONFIG_NO_HZ_COMMON */
 +
 +/*
 + * Called from scheduler_tick()
 + */
 +void update_cpu_load_active(struct rq *this_rq)
 +{
 +      unsigned long load = this_rq->cfs.runnable_load_avg;
 +      /*
 +       * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
 +       */
 +      this_rq->last_load_update_tick = jiffies;
 +      __update_cpu_load(this_rq, load, 1);
 +}
 +
  /* Used instead of source_load when we know the type == 0 */
  static unsigned long weighted_cpuload(const int cpu)
  {
@@@ -4586,7 -4355,7 +4566,7 @@@ static unsigned long capacity_orig_of(i
  static unsigned long cpu_avg_load_per_task(int cpu)
  {
        struct rq *rq = cpu_rq(cpu);
 -      unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
 +      unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
        unsigned long load_avg = rq->cfs.runnable_load_avg;
  
        if (nr_running)
@@@ -5337,21 -5106,18 +5317,21 @@@ again
                 * entity, update_curr() will update its vruntime, otherwise
                 * forget we've ever seen it.
                 */
 -              if (curr && curr->on_rq)
 -                      update_curr(cfs_rq);
 -              else
 -                      curr = NULL;
 +              if (curr) {
 +                      if (curr->on_rq)
 +                              update_curr(cfs_rq);
 +                      else
 +                              curr = NULL;
  
 -              /*
 -               * This call to check_cfs_rq_runtime() will do the throttle and
 -               * dequeue its entity in the parent(s). Therefore the 'simple'
 -               * nr_running test will indeed be correct.
 -               */
 -              if (unlikely(check_cfs_rq_runtime(cfs_rq)))
 -                      goto simple;
 +                      /*
 +                       * This call to check_cfs_rq_runtime() will do the
 +                       * throttle and dequeue its entity in the parent(s).
 +                       * Therefore the 'simple' nr_running test will indeed
 +                       * be correct.
 +                       */
 +                      if (unlikely(check_cfs_rq_runtime(cfs_rq)))
 +                              goto simple;
 +              }
  
                se = pick_next_entity(cfs_rq, curr);
                cfs_rq = group_cfs_rq(se);
@@@ -5681,15 -5447,10 +5661,15 @@@ static int task_hot(struct task_struct 
  }
  
  #ifdef CONFIG_NUMA_BALANCING
 -/* Returns true if the destination node has incurred more faults */
 +/*
 + * Returns true if the destination node is the preferred node.
 + * Needs to match fbq_classify_rq(): if there is a runnable task
 + * that is not on its preferred node, we should identify it.
 + */
  static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
  {
        struct numa_group *numa_group = rcu_dereference(p->numa_group);
 +      unsigned long src_faults, dst_faults;
        int src_nid, dst_nid;
  
        if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
        if (src_nid == dst_nid)
                return false;
  
 -      if (numa_group) {
 -              /* Task is already in the group's interleave set. */
 -              if (node_isset(src_nid, numa_group->active_nodes))
 -                      return false;
 -
 -              /* Task is moving into the group's interleave set. */
 -              if (node_isset(dst_nid, numa_group->active_nodes))
 -                      return true;
 -
 -              return group_faults(p, dst_nid) > group_faults(p, src_nid);
 -      }
 -
        /* Encourage migration to the preferred node. */
        if (dst_nid == p->numa_preferred_nid)
                return true;
  
 -      return task_faults(p, dst_nid) > task_faults(p, src_nid);
 +      /* Migrating away from the preferred node is bad. */
 +      if (src_nid == p->numa_preferred_nid)
 +              return false;
 +
 +      if (numa_group) {
 +              src_faults = group_faults(p, src_nid);
 +              dst_faults = group_faults(p, dst_nid);
 +      } else {
 +              src_faults = task_faults(p, src_nid);
 +              dst_faults = task_faults(p, dst_nid);
 +      }
 +
 +      return dst_faults > src_faults;
  }
  
  
  static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
  {
        struct numa_group *numa_group = rcu_dereference(p->numa_group);
 +      unsigned long src_faults, dst_faults;
        int src_nid, dst_nid;
  
        if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
        if (src_nid == dst_nid)
                return false;
  
 -      if (numa_group) {
 -              /* Task is moving within/into the group's interleave set. */
 -              if (node_isset(dst_nid, numa_group->active_nodes))
 -                      return false;
 +      /* Migrating away from the preferred node is bad. */
 +      if (src_nid == p->numa_preferred_nid)
 +              return true;
  
 -              /* Task is moving out of the group's interleave set. */
 -              if (node_isset(src_nid, numa_group->active_nodes))
 -                      return true;
 +      /* Encourage migration to the preferred node. */
 +      if (dst_nid == p->numa_preferred_nid)
 +              return false;
  
 -              return group_faults(p, dst_nid) < group_faults(p, src_nid);
 +      if (numa_group) {
 +              src_faults = group_faults(p, src_nid);
 +              dst_faults = group_faults(p, dst_nid);
 +      } else {
 +              src_faults = task_faults(p, src_nid);
 +              dst_faults = task_faults(p, dst_nid);
        }
  
 -      /* Migrating away from the preferred node is always bad. */
 -      if (src_nid == p->numa_preferred_nid)
 -              return true;
 -
 -      return task_faults(p, dst_nid) < task_faults(p, src_nid);
 +      return dst_faults < src_faults;
  }
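
The rewritten pair of helpers above boils down to one symmetric rule: the preferred node decides first, otherwise compare fault counts. A distilled, userspace version of that decision (all names illustrative):

#include <stdbool.h>
#include <stdio.h>

static bool improves_locality(int src_nid, int dst_nid, int preferred_nid,
			      unsigned long src_faults, unsigned long dst_faults)
{
	if (src_nid == dst_nid)
		return false;
	if (dst_nid == preferred_nid)		/* moving onto the preferred node */
		return true;
	if (src_nid == preferred_nid)		/* moving off the preferred node */
		return false;
	return dst_faults > src_faults;		/* otherwise follow the faults */
}

int main(void)
{
	printf("%d\n", improves_locality(0, 1, 1, 100, 10));	/* 1: dst is preferred */
	printf("%d\n", improves_locality(1, 0, 1, 10, 100));	/* 0: src is preferred */
	printf("%d\n", improves_locality(0, 2, 3, 10, 100));	/* 1: more faults on dst */
	return 0;
}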
  
  #else
@@@ -6257,8 -6017,8 +6237,8 @@@ static unsigned long scale_rt_capacity(
         * Since we're reading these variables without serialization make sure
         * we read them once before doing sanity checks on them.
         */
 -      age_stamp = ACCESS_ONCE(rq->age_stamp);
 -      avg = ACCESS_ONCE(rq->rt_avg);
 +      age_stamp = READ_ONCE(rq->age_stamp);
 +      avg = READ_ONCE(rq->rt_avg);
        delta = __rq_clock_broken(rq) - age_stamp;
  
        if (unlikely(delta < 0))
diff --combined kernel/sched/rt.c
index 560d2fa623c311c9aa5ad51ead007e1b27c6fa6c,e43da5391dcdd785ed39d335f3d8056889924d87..7d7093c51f8d169cea027c78a9ca0321c8f15932
@@@ -18,19 -18,22 +18,22 @@@ static enum hrtimer_restart sched_rt_pe
  {
        struct rt_bandwidth *rt_b =
                container_of(timer, struct rt_bandwidth, rt_period_timer);
-       ktime_t now;
-       int overrun;
        int idle = 0;
+       int overrun;
  
+       raw_spin_lock(&rt_b->rt_runtime_lock);
        for (;;) {
-               now = hrtimer_cb_get_time(timer);
-               overrun = hrtimer_forward(timer, now, rt_b->rt_period);
+               overrun = hrtimer_forward_now(timer, rt_b->rt_period);
                if (!overrun)
                        break;
  
+               raw_spin_unlock(&rt_b->rt_runtime_lock);
                idle = do_sched_rt_period_timer(rt_b, overrun);
+               raw_spin_lock(&rt_b->rt_runtime_lock);
        }
+       if (idle)
+               rt_b->rt_period_active = 0;
+       raw_spin_unlock(&rt_b->rt_runtime_lock);
  
        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
  }
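
Note how the loop above drops rt_runtime_lock around the expensive do_sched_rt_period_timer() call and retakes it for the loop bookkeeping. The same shape in a tiny pthread-based toy, with the overrun values hard-coded for illustration:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static int heavy_work(int overrun)
{
	printf("handling %d overrun(s) without the lock held\n", overrun);
	return 1;		/* pretend we went idle */
}

int main(void)
{
	int idle = 0;
	int overruns[] = { 2, 1, 0 };	/* simulated hrtimer_forward_now() results */

	pthread_mutex_lock(&lock);
	for (int i = 0; ; i++) {
		int overrun = overruns[i];

		if (!overrun)
			break;
		pthread_mutex_unlock(&lock);	/* don't hold the lock across the work */
		idle = heavy_work(overrun);
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);
	printf("idle = %d\n", idle);
	return 0;
}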
@@@ -52,11 -55,12 +55,12 @@@ static void start_rt_bandwidth(struct r
        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
                return;
  
-       if (hrtimer_active(&rt_b->rt_period_timer))
-               return;
        raw_spin_lock(&rt_b->rt_runtime_lock);
-       start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
+       if (!rt_b->rt_period_active) {
+               rt_b->rt_period_active = 1;
+               hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period);
+               hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
+       }
        raw_spin_unlock(&rt_b->rt_runtime_lock);
  }
  
@@@ -1323,7 -1327,7 +1327,7 @@@ select_task_rq_rt(struct task_struct *p
        rq = cpu_rq(cpu);
  
        rcu_read_lock();
 -      curr = ACCESS_ONCE(rq->curr); /* unlocked access */
 +      curr = READ_ONCE(rq->curr); /* unlocked access */
  
        /*
         * If the current task on @p's runqueue is an RT task, then
diff --combined kernel/sched/sched.h
index d62b2882232b7a3017eda9c873de01c7ffcfc4a1,f9a58ef373b4b2240f521882d0b0d29da71d58b7..aea7c1f393cb3c983b3fd01e7df53155e4859ef3
@@@ -26,14 -26,8 +26,14 @@@ extern __read_mostly int scheduler_runn
  extern unsigned long calc_load_update;
  extern atomic_long_t calc_load_tasks;
  
 +extern void calc_global_load_tick(struct rq *this_rq);
  extern long calc_load_fold_active(struct rq *this_rq);
 +
 +#ifdef CONFIG_SMP
  extern void update_cpu_load_active(struct rq *this_rq);
 +#else
 +static inline void update_cpu_load_active(struct rq *this_rq) { }
 +#endif
  
  /*
   * Helpers for converting nanosecond timing to jiffy resolution
@@@ -137,6 -131,7 +137,7 @@@ struct rt_bandwidth 
        ktime_t                 rt_period;
        u64                     rt_runtime;
        struct hrtimer          rt_period_timer;
+       unsigned int            rt_period_active;
  };
  
  void __dl_clear_params(struct task_struct *p);
@@@ -221,7 -216,7 +222,7 @@@ struct cfs_bandwidth 
        s64 hierarchical_quota;
        u64 runtime_expires;
  
-       int idle, timer_active;
+       int idle, period_active;
        struct hrtimer period_timer, slack_timer;
        struct list_head throttled_cfs_rq;
  
@@@ -312,7 -307,7 +313,7 @@@ extern void init_cfs_bandwidth(struct c
  extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
  
  extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
- extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
+ extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
  extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
  
  extern void free_rt_sched_group(struct task_group *tg);
@@@ -713,7 -708,7 +714,7 @@@ DECLARE_PER_CPU_SHARED_ALIGNED(struct r
  
  static inline u64 __rq_clock_broken(struct rq *rq)
  {
 -      return ACCESS_ONCE(rq->clock);
 +      return READ_ONCE(rq->clock);
  }
  
  static inline u64 rq_clock(struct rq *rq)
@@@ -1290,6 -1285,7 +1291,6 @@@ extern void update_max_interval(void)
  extern void init_sched_dl_class(void);
  extern void init_sched_rt_class(void);
  extern void init_sched_fair_class(void);
 -extern void init_sched_dl_class(void);
  
  extern void resched_curr(struct rq *rq);
  extern void resched_cpu(int cpu);
@@@ -1303,6 -1299,8 +1304,6 @@@ extern void init_dl_task_timer(struct s
  
  unsigned long to_ratio(u64 period, u64 runtime);
  
 -extern void update_idle_cpu_load(struct rq *this_rq);
 -
  extern void init_task_runnable_average(struct task_struct *p);
  
  static inline void add_nr_running(struct rq *rq, unsigned count)
@@@ -1409,8 -1407,6 +1410,6 @@@ static inline void sched_rt_avg_update(
  static inline void sched_avg_update(struct rq *rq) { }
  #endif
  
- extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
  /*
   * __task_rq_lock - lock the rq @p resides on.
   */
diff --combined kernel/time/hrtimer.c
index 93ef7190bdeaadbf99efe07954cca3bee6399d07,db5c9508ed9500b056adedd71467f358d0d9ab13..5c7ae4b641c44aca69393a704507630a652381bf
   */
  DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
  {
        .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
+       .seq = SEQCNT_ZERO(hrtimer_bases.seq),
        .clock_base =
        {
                {
                        .index = HRTIMER_BASE_MONOTONIC,
                        .clockid = CLOCK_MONOTONIC,
                        .get_time = &ktime_get,
-                       .resolution = KTIME_LOW_RES,
                },
                {
                        .index = HRTIMER_BASE_REALTIME,
                        .clockid = CLOCK_REALTIME,
                        .get_time = &ktime_get_real,
-                       .resolution = KTIME_LOW_RES,
                },
                {
                        .index = HRTIMER_BASE_BOOTTIME,
                        .clockid = CLOCK_BOOTTIME,
                        .get_time = &ktime_get_boottime,
-                       .resolution = KTIME_LOW_RES,
                },
                {
                        .index = HRTIMER_BASE_TAI,
                        .clockid = CLOCK_TAI,
                        .get_time = &ktime_get_clocktai,
-                       .resolution = KTIME_LOW_RES,
                },
        }
  };
@@@ -109,33 -105,24 +105,24 @@@ static inline int hrtimer_clockid_to_ba
        return hrtimer_clock_to_base_table[clock_id];
  }
  
- /*
-  * Get the coarse grained time at the softirq based on xtime and
-  * wall_to_monotonic.
-  */
- static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
- {
-       ktime_t xtim, mono, boot, tai;
-       ktime_t off_real, off_boot, off_tai;
-       mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
-       boot = ktime_add(mono, off_boot);
-       xtim = ktime_add(mono, off_real);
-       tai = ktime_add(mono, off_tai);
-       base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
-       base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
-       base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
-       base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai;
- }
  /*
   * Functions and macros which are different for UP/SMP systems are kept in a
   * single place
   */
  #ifdef CONFIG_SMP
  
+ /*
+  * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
+  * such that hrtimer_callback_running() can unconditionally dereference
+  * timer->base->cpu_base
+  */
+ static struct hrtimer_cpu_base migration_cpu_base = {
+       .seq = SEQCNT_ZERO(migration_cpu_base),
+       .clock_base = { { .cpu_base = &migration_cpu_base, }, },
+ };
+ #define migration_base        migration_cpu_base.clock_base[0]
  /*
   * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
   * means that all timers which are tied to this base via timer->base are
   * be found on the lists/queues.
   *
   * When the timer's base is locked, and the timer removed from list, it is
-  * possible to set timer->base = NULL and drop the lock: the timer remains
-  * locked.
+  * possible to set timer->base = &migration_base and drop the lock: the timer
+  * remains locked.
   */
  static
  struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
  
        for (;;) {
                base = timer->base;
-               if (likely(base != NULL)) {
+               if (likely(base != &migration_base)) {
                        raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
                        if (likely(base == timer->base))
                                return base;
@@@ -190,6 -177,24 +177,24 @@@ hrtimer_check_target(struct hrtimer *ti
  #endif
  }
  
+ #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+ static inline
+ struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
+                                        int pinned)
+ {
+       if (pinned || !base->migration_enabled)
+               return this_cpu_ptr(&hrtimer_bases);
+       return &per_cpu(hrtimer_bases, get_nohz_timer_target());
+ }
+ #else
+ static inline
+ struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
+                                        int pinned)
+ {
+       return this_cpu_ptr(&hrtimer_bases);
+ }
+ #endif
  /*
   * Switch the timer base to the current CPU when possible.
   */
@@@ -197,14 -202,13 +202,13 @@@ static inline struct hrtimer_clock_bas
  switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
                    int pinned)
  {
+       struct hrtimer_cpu_base *new_cpu_base, *this_base;
        struct hrtimer_clock_base *new_base;
-       struct hrtimer_cpu_base *new_cpu_base;
-       int this_cpu = smp_processor_id();
-       int cpu = get_nohz_timer_target(pinned);
        int basenum = base->index;
  
+       this_base = this_cpu_ptr(&hrtimer_bases);
+       new_cpu_base = get_target_base(this_base, pinned);
  again:
-       new_cpu_base = &per_cpu(hrtimer_bases, cpu);
        new_base = &new_cpu_base->clock_base[basenum];
  
        if (base != new_base) {
                if (unlikely(hrtimer_callback_running(timer)))
                        return base;
  
-               /* See the comment in lock_timer_base() */
-               timer->base = NULL;
+               /* See the comment in lock_hrtimer_base() */
+               timer->base = &migration_base;
                raw_spin_unlock(&base->cpu_base->lock);
                raw_spin_lock(&new_base->cpu_base->lock);
  
-               if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
-                       cpu = this_cpu;
+               if (new_cpu_base != this_base &&
+                   hrtimer_check_target(timer, new_base)) {
                        raw_spin_unlock(&new_base->cpu_base->lock);
                        raw_spin_lock(&base->cpu_base->lock);
+                       new_cpu_base = this_base;
                        timer->base = base;
                        goto again;
                }
                timer->base = new_base;
        } else {
-               if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
-                       cpu = this_cpu;
+               if (new_cpu_base != this_base &&
+                   hrtimer_check_target(timer, new_base)) {
+                       new_cpu_base = this_base;
                        goto again;
                }
        }
@@@ -266,23 -272,21 +272,23 @@@ lock_hrtimer_base(const struct hrtimer 
  /*
   * Divide a ktime value by a nanosecond value
   */
 -u64 __ktime_divns(const ktime_t kt, s64 div)
 +s64 __ktime_divns(const ktime_t kt, s64 div)
  {
 -      u64 dclc;
        int sft = 0;
 +      s64 dclc;
 +      u64 tmp;
  
        dclc = ktime_to_ns(kt);
 +      tmp = dclc < 0 ? -dclc : dclc;
 +
        /* Make sure the divisor is less than 2^32: */
        while (div >> 32) {
                sft++;
                div >>= 1;
        }
 -      dclc >>= sft;
 -      do_div(dclc, (unsigned long) div);
 -
 -      return dclc;
 +      tmp >>= sft;
 +      do_div(tmp, (unsigned long) div);
 +      return dclc < 0 ? -tmp : tmp;
  }
  EXPORT_SYMBOL_GPL(__ktime_divns);
  #endif /* BITS_PER_LONG >= 64 */
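
The point of the __ktime_divns() change is that do_div() only handles unsigned dividends, so negative ktime values must be divided on their absolute value and re-signed. A userspace re-creation of that logic using plain 64-bit division:

#include <stdint.h>
#include <stdio.h>

static int64_t ktime_divns(int64_t kt_ns, int64_t div)
{
	uint64_t tmp = kt_ns < 0 ? -kt_ns : kt_ns;
	int sft = 0;

	/* Make sure the divisor fits in 32 bits, as the original does. */
	while (div >> 32) {
		sft++;
		div >>= 1;
	}
	tmp >>= sft;
	tmp /= (uint64_t)div;
	return kt_ns < 0 ? -(int64_t)tmp : (int64_t)tmp;
}

int main(void)
{
	printf("%lld\n", (long long)ktime_divns(-1500000000LL, 1000000000LL)); /* -1 */
	printf("%lld\n", (long long)ktime_divns(1500000000LL, 1000000000LL));  /*  1 */
	return 0;
}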
@@@ -443,24 -447,35 +449,35 @@@ static inline void debug_deactivate(str
  }
  
  #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
+ static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
+                                            struct hrtimer *timer)
+ {
+ #ifdef CONFIG_HIGH_RES_TIMERS
+       cpu_base->next_timer = timer;
+ #endif
+ }
  static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
  {
        struct hrtimer_clock_base *base = cpu_base->clock_base;
        ktime_t expires, expires_next = { .tv64 = KTIME_MAX };
-       int i;
+       unsigned int active = cpu_base->active_bases;
  
-       for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
+       hrtimer_update_next_timer(cpu_base, NULL);
+       for (; active; base++, active >>= 1) {
                struct timerqueue_node *next;
                struct hrtimer *timer;
  
-               next = timerqueue_getnext(&base->active);
-               if (!next)
+               if (!(active & 0x01))
                        continue;
  
+               next = timerqueue_getnext(&base->active);
                timer = container_of(next, struct hrtimer, node);
                expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
-               if (expires.tv64 < expires_next.tv64)
+               if (expires.tv64 < expires_next.tv64) {
                        expires_next = expires;
+                       hrtimer_update_next_timer(cpu_base, timer);
+               }
        }
        /*
         * clock_was_set() might have changed base->offset of any of
  }
  #endif
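
__hrtimer_get_next_event() now walks only the clock bases whose bit is set in cpu_base->active_bases instead of scanning all of them. The bit-walk in isolation, as a small standalone program:

#include <stdio.h>

int main(void)
{
	const char *base_name[] = { "MONOTONIC", "REALTIME", "BOOTTIME", "TAI" };
	unsigned int active = 0x5;	/* bases 0 and 2 have queued timers */

	for (int i = 0; active; i++, active >>= 1) {
		if (!(active & 0x01))
			continue;	/* this base has no queued timers */
		printf("scan base %d (%s)\n", i, base_name[i]);
	}
	return 0;
}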
  
+ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
+ {
+       ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
+       ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
+       ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
+       return ktime_get_update_offsets_now(&base->clock_was_set_seq,
+                                           offs_real, offs_boot, offs_tai);
+ }
  /* High resolution timer related functions */
  #ifdef CONFIG_HIGH_RES_TIMERS
  
   * High resolution timer enabled ?
   */
  static int hrtimer_hres_enabled __read_mostly  = 1;
+ unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
+ EXPORT_SYMBOL_GPL(hrtimer_resolution);
  
  /*
   * Enable / Disable high resolution mode
@@@ -508,9 -535,14 +537,14 @@@ static inline int hrtimer_is_hres_enabl
  /*
   * Is the high resolution mode active ?
   */
+ static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
+ {
+       return cpu_base->hres_active;
+ }
  static inline int hrtimer_hres_active(void)
  {
-       return __this_cpu_read(hrtimer_bases.hres_active);
+       return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
  }
  
  /*
  static void
  hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
  {
-       ktime_t expires_next = __hrtimer_get_next_event(cpu_base);
+       ktime_t expires_next;
+       if (!cpu_base->hres_active)
+               return;
+       expires_next = __hrtimer_get_next_event(cpu_base);
  
        if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
                return;
        if (cpu_base->hang_detected)
                return;
  
-       if (cpu_base->expires_next.tv64 != KTIME_MAX)
-               tick_program_event(cpu_base->expires_next, 1);
+       tick_program_event(cpu_base->expires_next, 1);
  }
  
  /*
-  * Shared reprogramming for clock_realtime and clock_monotonic
-  *
   * When a timer is enqueued and expires earlier than the already enqueued
   * timers, we have to check, whether it expires earlier than the timer for
   * which the clock event device was armed.
   *
-  * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming
-  * and no expiry check happens. The timer gets enqueued into the rbtree. The
-  * reprogramming and expiry check is done in the hrtimer_interrupt or in the
-  * softirq.
-  *
   * Called with interrupts disabled and base->cpu_base.lock held
   */
- static int hrtimer_reprogram(struct hrtimer *timer,
-                            struct hrtimer_clock_base *base)
+ static void hrtimer_reprogram(struct hrtimer *timer,
+                             struct hrtimer_clock_base *base)
  {
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
-       int res;
  
        WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
  
        /*
-        * When the callback is running, we do not reprogram the clock event
-        * device. The timer callback is either running on a different CPU or
-        * the callback is executed in the hrtimer_interrupt context. The
-        * reprogramming is handled either by the softirq, which called the
-        * callback or at the end of the hrtimer_interrupt.
+        * If the timer is not on the current cpu, we cannot reprogram
+        * the other cpus clock event device.
         */
-       if (hrtimer_callback_running(timer))
-               return 0;
+       if (base->cpu_base != cpu_base)
+               return;
+       /*
+        * If the hrtimer interrupt is running, then it will
+        * reevaluate the clock bases and reprogram the clock event
+        * device. The callbacks are always executed in hard interrupt
+        * context so we don't need an extra check for a running
+        * callback.
+        */
+       if (cpu_base->in_hrtirq)
+               return;
  
        /*
         * CLOCK_REALTIME timer might be requested with an absolute
-        * expiry time which is less than base->offset. Nothing wrong
-        * about that, just avoid to call into the tick code, which
-        * has now objections against negative expiry values.
+        * expiry time which is less than base->offset. Set it to 0.
         */
        if (expires.tv64 < 0)
-               return -ETIME;
+               expires.tv64 = 0;
  
        if (expires.tv64 >= cpu_base->expires_next.tv64)
-               return 0;
+               return;
  
-       /*
-        * When the target cpu of the timer is currently executing
-        * hrtimer_interrupt(), then we do not touch the clock event
-        * device. hrtimer_interrupt() will reevaluate all clock bases
-        * before reprogramming the device.
-        */
-       if (cpu_base->in_hrtirq)
-               return 0;
+       /* Update the pointer to the next expiring timer */
+       cpu_base->next_timer = timer;
  
        /*
         * If a hang was detected in the last timer interrupt then we
         * to make progress.
         */
        if (cpu_base->hang_detected)
-               return 0;
+               return;
  
        /*
-        * Clockevents returns -ETIME, when the event was in the past.
+        * Program the timer hardware. We enforce the expiry for
+        * events which are already in the past.
         */
-       res = tick_program_event(expires, 0);
-       if (!IS_ERR_VALUE(res))
-               cpu_base->expires_next = expires;
-       return res;
+       cpu_base->expires_next = expires;
+       tick_program_event(expires, 1);
  }
  
  /*
@@@ -630,15 -656,6 +658,6 @@@ static inline void hrtimer_init_hres(st
        base->hres_active = 0;
  }
  
- static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
- {
-       ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
-       ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
-       ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
-       return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai);
- }
  /*
   * Retrigger next event is called after clock was set
   *
@@@ -648,7 -665,7 +667,7 @@@ static void retrigger_next_event(void *
  {
        struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
  
-       if (!hrtimer_hres_active())
+       if (!base->hres_active)
                return;
  
        raw_spin_lock(&base->lock);
   */
  static int hrtimer_switch_to_hres(void)
  {
-       int i, cpu = smp_processor_id();
-       struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
-       unsigned long flags;
-       if (base->hres_active)
-               return 1;
-       local_irq_save(flags);
+       struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
  
        if (tick_init_highres()) {
-               local_irq_restore(flags);
                printk(KERN_WARNING "Could not switch to high resolution "
-                                   "mode on CPU %d\n", cpu);
+                                   "mode on CPU %d\n", base->cpu);
                return 0;
        }
        base->hres_active = 1;
-       for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
-               base->clock_base[i].resolution = KTIME_HIGH_RES;
+       hrtimer_resolution = HIGH_RES_NSEC;
  
        tick_setup_sched_timer();
        /* "Retrigger" the interrupt to get things going */
        retrigger_next_event(NULL);
-       local_irq_restore(flags);
        return 1;
  }
  
@@@ -706,6 -713,7 +715,7 @@@ void clock_was_set_delayed(void
  
  #else
  
+ static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
  static inline int hrtimer_hres_active(void) { return 0; }
  static inline int hrtimer_is_hres_enabled(void) { return 0; }
  static inline int hrtimer_switch_to_hres(void) { return 0; }
@@@ -803,6 -811,14 +813,14 @@@ void unlock_hrtimer_base(const struct h
   *
   * Forward the timer expiry so it will expire in the future.
   * Returns the number of overruns.
+  *
+  * Can be safely called from the callback function of @timer. If
+  * called from other contexts, @timer must neither be enqueued nor
+  * running the callback, and the caller needs to take care of
+  * serialization.
+  *
+  * Note: This only updates the timer expiry value and does not requeue
+  * the timer.
   */
  u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
  {
        if (delta.tv64 < 0)
                return 0;
  
-       if (interval.tv64 < timer->base->resolution.tv64)
-               interval.tv64 = timer->base->resolution.tv64;
+       if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
+               return 0;
+       if (interval.tv64 < hrtimer_resolution)
+               interval.tv64 = hrtimer_resolution;
  
        if (unlikely(delta.tv64 >= interval.tv64)) {
                s64 incr = ktime_to_ns(interval);
@@@ -849,16 -868,11 +870,11 @@@ static int enqueue_hrtimer(struct hrtim
  {
        debug_activate(timer);
  
-       timerqueue_add(&base->active, &timer->node);
        base->cpu_base->active_bases |= 1 << base->index;
  
-       /*
-        * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
-        * state of a possibly running callback.
-        */
-       timer->state |= HRTIMER_STATE_ENQUEUED;
+       timer->state = HRTIMER_STATE_ENQUEUED;
  
-       return (&timer->node == base->active.next);
+       return timerqueue_add(&base->active, &timer->node);
  }
  
  /*
@@@ -875,39 -889,38 +891,38 @@@ static void __remove_hrtimer(struct hrt
                             struct hrtimer_clock_base *base,
                             unsigned long newstate, int reprogram)
  {
-       struct timerqueue_node *next_timer;
-       if (!(timer->state & HRTIMER_STATE_ENQUEUED))
-               goto out;
+       struct hrtimer_cpu_base *cpu_base = base->cpu_base;
+       unsigned int state = timer->state;
+       timer->state = newstate;
+       if (!(state & HRTIMER_STATE_ENQUEUED))
+               return;
+       if (!timerqueue_del(&base->active, &timer->node))
+               cpu_base->active_bases &= ~(1 << base->index);
  
-       next_timer = timerqueue_getnext(&base->active);
-       timerqueue_del(&base->active, &timer->node);
-       if (&timer->node == next_timer) {
  #ifdef CONFIG_HIGH_RES_TIMERS
-               /* Reprogram the clock event device. if enabled */
-               if (reprogram && hrtimer_hres_active()) {
-                       ktime_t expires;
-                       expires = ktime_sub(hrtimer_get_expires(timer),
-                                           base->offset);
-                       if (base->cpu_base->expires_next.tv64 == expires.tv64)
-                               hrtimer_force_reprogram(base->cpu_base, 1);
-               }
+       /*
+        * Note: If reprogram is false we do not update
+        * cpu_base->next_timer. This happens when we remove the first
+        * timer on a remote cpu. No harm as we never dereference
+        * cpu_base->next_timer. So the worst that can happen is
+        * a superfluous call to hrtimer_force_reprogram() on the
+        * remote cpu later on if the same timer gets enqueued again.
+        */
+       if (reprogram && timer == cpu_base->next_timer)
+               hrtimer_force_reprogram(cpu_base, 1);
  #endif
-       }
-       if (!timerqueue_getnext(&base->active))
-               base->cpu_base->active_bases &= ~(1 << base->index);
- out:
-       timer->state = newstate;
  }
  
  /*
   * remove hrtimer, called with base lock held
   */
  static inline int
- remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
+ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
  {
        if (hrtimer_is_queued(timer)) {
-               unsigned long state;
+               unsigned long state = timer->state;
                int reprogram;
  
                /*
                debug_deactivate(timer);
                timer_stats_hrtimer_clear_start_info(timer);
                reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
-               /*
-                * We must preserve the CALLBACK state flag here,
-                * otherwise we could move the timer base in
-                * switch_hrtimer_base.
-                */
-               state = timer->state & HRTIMER_STATE_CALLBACK;
+               if (!restart)
+                       state = HRTIMER_STATE_INACTIVE;
                __remove_hrtimer(timer, base, state, reprogram);
                return 1;
        }
        return 0;
  }
  
- int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
-               unsigned long delta_ns, const enum hrtimer_mode mode,
-               int wakeup)
+ /**
+  * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
+  * @timer:    the timer to be added
+  * @tim:      expiry time
+  * @delta_ns: "slack" range for the timer
+  * @mode:     expiry mode: absolute (HRTIMER_MODE_ABS) or
+  *            relative (HRTIMER_MODE_REL)
+  */
+ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+                           unsigned long delta_ns, const enum hrtimer_mode mode)
  {
        struct hrtimer_clock_base *base, *new_base;
        unsigned long flags;
-       int ret, leftmost;
+       int leftmost;
  
        base = lock_hrtimer_base(timer, &flags);
  
        /* Remove an active timer from the queue: */
-       ret = remove_hrtimer(timer, base);
+       remove_hrtimer(timer, base, true);
  
        if (mode & HRTIMER_MODE_REL) {
                tim = ktime_add_safe(tim, base->get_time());
                 * timeouts. This will go away with the GTOD framework.
                 */
  #ifdef CONFIG_TIME_LOW_RES
-               tim = ktime_add_safe(tim, base->resolution);
+               tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution));
  #endif
        }
  
        timer_stats_hrtimer_set_start_info(timer);
  
        leftmost = enqueue_hrtimer(timer, new_base);
-       if (!leftmost) {
-               unlock_hrtimer_base(timer, &flags);
-               return ret;
-       }
+       if (!leftmost)
+               goto unlock;
  
        if (!hrtimer_is_hres_active(timer)) {
                /*
                 * Kick to reschedule the next tick to handle the new timer
                 * on dynticks target.
                 */
-               wake_up_nohz_cpu(new_base->cpu_base->cpu);
-       } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) &&
-                       hrtimer_reprogram(timer, new_base)) {
-               /*
-                * Only allow reprogramming if the new base is on this CPU.
-                * (it might still be on another CPU if the timer was pending)
-                *
-                * XXX send_remote_softirq() ?
-                */
-               if (wakeup) {
-                       /*
-                        * We need to drop cpu_base->lock to avoid a
-                        * lock ordering issue vs. rq->lock.
-                        */
-                       raw_spin_unlock(&new_base->cpu_base->lock);
-                       raise_softirq_irqoff(HRTIMER_SOFTIRQ);
-                       local_irq_restore(flags);
-                       return ret;
-               } else {
-                       __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
-               }
+               if (new_base->cpu_base->nohz_active)
+                       wake_up_nohz_cpu(new_base->cpu_base->cpu);
+       } else {
+               hrtimer_reprogram(timer, new_base);
        }
+ unlock:
        unlock_hrtimer_base(timer, &flags);
-       return ret;
- }
- EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns);
- /**
-  * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
-  * @timer:    the timer to be added
-  * @tim:      expiry time
-  * @delta_ns: "slack" range for the timer
-  * @mode:     expiry mode: absolute (HRTIMER_MODE_ABS) or
-  *            relative (HRTIMER_MODE_REL)
-  *
-  * Returns:
-  *  0 on success
-  *  1 when the timer was active
-  */
- int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
-               unsigned long delta_ns, const enum hrtimer_mode mode)
- {
-       return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
  }
  EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
  
- /**
-  * hrtimer_start - (re)start an hrtimer on the current CPU
-  * @timer:    the timer to be added
-  * @tim:      expiry time
-  * @mode:     expiry mode: absolute (HRTIMER_MODE_ABS) or
-  *            relative (HRTIMER_MODE_REL)
-  *
-  * Returns:
-  *  0 on success
-  *  1 when the timer was active
-  */
- int
- hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
- {
-       return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
- }
- EXPORT_SYMBOL_GPL(hrtimer_start);
  /**
   * hrtimer_try_to_cancel - try to deactivate a timer
   * @timer:    hrtimer to stop
@@@ -1062,10 -1020,19 +1022,19 @@@ int hrtimer_try_to_cancel(struct hrtime
        unsigned long flags;
        int ret = -1;
  
+       /*
+        * Check lockless first. If the timer is not active (neither
+        * enqueued nor running the callback), nothing to do here. The
+        * base lock does not serialize against a concurrent enqueue,
+        * so we can avoid taking it.
+        */
+       if (!hrtimer_active(timer))
+               return 0;
        base = lock_hrtimer_base(timer, &flags);
  
        if (!hrtimer_callback_running(timer))
-               ret = remove_hrtimer(timer, base);
+               ret = remove_hrtimer(timer, base, false);
  
        unlock_hrtimer_base(timer, &flags);
  
@@@ -1115,26 -1082,22 +1084,22 @@@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining
  /**
   * hrtimer_get_next_event - get the time until next expiry event
   *
-  * Returns the delta to the next expiry event or KTIME_MAX if no timer
-  * is pending.
+  * Returns the next expiry time or KTIME_MAX if no timer is pending.
   */
- ktime_t hrtimer_get_next_event(void)
+ u64 hrtimer_get_next_event(void)
  {
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-       ktime_t mindelta = { .tv64 = KTIME_MAX };
+       u64 expires = KTIME_MAX;
        unsigned long flags;
  
        raw_spin_lock_irqsave(&cpu_base->lock, flags);
  
-       if (!hrtimer_hres_active())
-               mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base),
-                                    ktime_get());
+       if (!__hrtimer_hres_active(cpu_base))
+               expires = __hrtimer_get_next_event(cpu_base).tv64;
  
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
  
-       if (mindelta.tv64 < 0)
-               mindelta.tv64 = 0;
-       return mindelta;
+       return expires;
  }
  #endif
  
@@@ -1176,37 -1139,73 +1141,73 @@@ void hrtimer_init(struct hrtimer *timer
  }
  EXPORT_SYMBOL_GPL(hrtimer_init);
  
- /**
-  * hrtimer_get_res - get the timer resolution for a clock
-  * @which_clock: which clock to query
-  * @tp:                pointer to timespec variable to store the resolution
+ /*
+  * A timer is active when it is enqueued into the rbtree, when its
+  * callback function is running, or when it is in the process of being
+  * migrated to another cpu.
   *
-  * Store the resolution of the clock selected by @which_clock in the
-  * variable pointed to by @tp.
+  * It is important for this function to not return a false negative.
   */
- int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
+ bool hrtimer_active(const struct hrtimer *timer)
  {
        struct hrtimer_cpu_base *cpu_base;
-       int base = hrtimer_clockid_to_base(which_clock);
+       unsigned int seq;
  
-       cpu_base = raw_cpu_ptr(&hrtimer_bases);
-       *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
+       do {
+               cpu_base = READ_ONCE(timer->base->cpu_base);
+               seq = raw_read_seqcount_begin(&cpu_base->seq);
  
-       return 0;
+               if (timer->state != HRTIMER_STATE_INACTIVE ||
+                   cpu_base->running == timer)
+                       return true;
+       } while (read_seqcount_retry(&cpu_base->seq, seq) ||
+                cpu_base != READ_ONCE(timer->base->cpu_base));
+       return false;
  }
- EXPORT_SYMBOL_GPL(hrtimer_get_res);
+ EXPORT_SYMBOL_GPL(hrtimer_active);
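
hrtimer_active() above pairs a seqcount with the timer state and cpu_base->running so the lockless reader never reports a false negative. A single-threaded toy that mirrors only the control flow of that retry loop; the real code additionally relies on seqcount primitives and memory barriers:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_base {
	unsigned int seq;	/* bumped by the writer around state changes */
	const void *running;	/* timer whose callback is executing, or NULL */
};

struct toy_timer {
	struct toy_base *base;
	int state;		/* 0 == HRTIMER_STATE_INACTIVE */
};

static bool toy_timer_active(const struct toy_timer *timer)
{
	struct toy_base *base;
	unsigned int seq;

	do {
		base = timer->base;
		seq = base->seq;

		if (timer->state != 0 || base->running == timer)
			return true;	/* enqueued or callback running */
	} while (seq != base->seq || base != timer->base);	/* retry on races */

	return false;
}

int main(void)
{
	struct toy_base cpu_base = { .seq = 0, .running = NULL };
	struct toy_timer t = { .base = &cpu_base, .state = 0 };

	printf("inactive: %d\n", toy_timer_active(&t));	/* 0 */
	cpu_base.running = &t;
	printf("running:  %d\n", toy_timer_active(&t));	/* 1 */
	return 0;
}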
  
- static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
+ /*
+  * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
+  * distinct sections:
+  *
+  *  - queued: the timer is queued
+  *  - callback:       the timer is being run
+  *  - post:   the timer is inactive or (re)queued
+  *
+  * On the read side we ensure we observe timer->state and cpu_base->running
+  * from the same section, if anything changed while we looked at it, we retry.
+  * This includes timer->base changing because sequence numbers alone are
+  * insufficient for that.
+  *
+  * The sequence numbers are required because otherwise we could still observe
+  * a false negative if the read side got smeared over multiple consecutive
+  * __run_hrtimer() invocations.
+  */
+ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
+                         struct hrtimer_clock_base *base,
+                         struct hrtimer *timer, ktime_t *now)
  {
-       struct hrtimer_clock_base *base = timer->base;
-       struct hrtimer_cpu_base *cpu_base = base->cpu_base;
        enum hrtimer_restart (*fn)(struct hrtimer *);
        int restart;
  
-       WARN_ON(!irqs_disabled());
+       lockdep_assert_held(&cpu_base->lock);
  
        debug_deactivate(timer);
-       __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+       cpu_base->running = timer;
+       /*
+        * Separate the ->running assignment from the ->state assignment.
+        *
+        * As with a regular write barrier, this ensures the read side in
+        * hrtimer_active() cannot observe cpu_base->running == NULL &&
+        * timer->state == INACTIVE.
+        */
+       raw_write_seqcount_barrier(&cpu_base->seq);
+       __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
        timer_stats_account_hrtimer(timer);
        fn = timer->function;
  
        raw_spin_lock(&cpu_base->lock);
  
        /*
-        * Note: We clear the CALLBACK bit after enqueue_hrtimer and
+        * Note: We clear the running state after enqueue_hrtimer and
         * we do not reprogramm the event hardware. Happens either in
         * hrtimer_start_range_ns() or in hrtimer_interrupt()
+        *
+        * Note: Because we dropped the cpu_base->lock above,
+        * hrtimer_start_range_ns() can have popped in and enqueued the timer
+        * for us already.
         */
-       if (restart != HRTIMER_NORESTART) {
-               BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
+       if (restart != HRTIMER_NORESTART &&
+           !(timer->state & HRTIMER_STATE_ENQUEUED))
                enqueue_hrtimer(timer, base);
-       }
  
-       WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+       /*
+        * Separate the ->running assignment from the ->state assignment.
+        *
+        * As with a regular write barrier, this ensures the read side in
+        * hrtimer_active() cannot observe cpu_base->running == NULL &&
+        * timer->state == INACTIVE.
+        */
+       raw_write_seqcount_barrier(&cpu_base->seq);
  
-       timer->state &= ~HRTIMER_STATE_CALLBACK;
+       WARN_ON_ONCE(cpu_base->running != timer);
+       cpu_base->running = NULL;
  }
  
- #ifdef CONFIG_HIGH_RES_TIMERS
- /*
-  * High resolution timer interrupt
-  * Called with interrupts disabled
-  */
- void hrtimer_interrupt(struct clock_event_device *dev)
+ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
  {
-       struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-       ktime_t expires_next, now, entry_time, delta;
-       int i, retries = 0;
-       BUG_ON(!cpu_base->hres_active);
-       cpu_base->nr_events++;
-       dev->next_event.tv64 = KTIME_MAX;
-       raw_spin_lock(&cpu_base->lock);
-       entry_time = now = hrtimer_update_base(cpu_base);
- retry:
-       cpu_base->in_hrtirq = 1;
-       /*
-        * We set expires_next to KTIME_MAX here with cpu_base->lock
-        * held to prevent that a timer is enqueued in our queue via
-        * the migration code. This does not affect enqueueing of
-        * timers which run their callback and need to be requeued on
-        * this CPU.
-        */
-       cpu_base->expires_next.tv64 = KTIME_MAX;
+       struct hrtimer_clock_base *base = cpu_base->clock_base;
+       unsigned int active = cpu_base->active_bases;
  
-       for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
-               struct hrtimer_clock_base *base;
+       for (; active; base++, active >>= 1) {
                struct timerqueue_node *node;
                ktime_t basenow;
  
-               if (!(cpu_base->active_bases & (1 << i)))
+               if (!(active & 0x01))
                        continue;
  
-               base = cpu_base->clock_base + i;
                basenow = ktime_add(now, base->offset);
  
                while ((node = timerqueue_getnext(&base->active))) {
                        if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
                                break;
  
-                       __run_hrtimer(timer, &basenow);
+                       __run_hrtimer(cpu_base, base, timer, &basenow);
                }
        }
+ }
+ #ifdef CONFIG_HIGH_RES_TIMERS
+ /*
+  * High resolution timer interrupt
+  * Called with interrupts disabled
+  */
+ void hrtimer_interrupt(struct clock_event_device *dev)
+ {
+       struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+       ktime_t expires_next, now, entry_time, delta;
+       int retries = 0;
+       BUG_ON(!cpu_base->hres_active);
+       cpu_base->nr_events++;
+       dev->next_event.tv64 = KTIME_MAX;
+       raw_spin_lock(&cpu_base->lock);
+       entry_time = now = hrtimer_update_base(cpu_base);
+ retry:
+       cpu_base->in_hrtirq = 1;
+       /*
+        * We set expires_next to KTIME_MAX here with cpu_base->lock
+        * held to prevent a timer from being enqueued in our queue via
+        * the migration code. This does not affect the enqueueing of
+        * timers which run their callback and need to be requeued on
+        * this CPU.
+        */
+       cpu_base->expires_next.tv64 = KTIME_MAX;
+       __hrtimer_run_queues(cpu_base, now);
        /* Reevaluate the clock bases for the next expiry */
        expires_next = __hrtimer_get_next_event(cpu_base);
        /*
        raw_spin_unlock(&cpu_base->lock);
  
        /* Reprogramming necessary ? */
-       if (expires_next.tv64 == KTIME_MAX ||
-           !tick_program_event(expires_next, 0)) {
+       if (!tick_program_event(expires_next, 0)) {
                cpu_base->hang_detected = 0;
                return;
        }
        cpu_base->hang_detected = 1;
        raw_spin_unlock(&cpu_base->lock);
        delta = ktime_sub(now, entry_time);
-       if (delta.tv64 > cpu_base->max_hang_time.tv64)
-               cpu_base->max_hang_time = delta;
+       if ((unsigned int)delta.tv64 > cpu_base->max_hang_time)
+               cpu_base->max_hang_time = (unsigned int) delta.tv64;
        /*
         * Limit it to a sensible value as we enforce a longer
         * delay. Give the CPU at least 100ms to catch up.
   * local version of hrtimer_peek_ahead_timers() called with interrupts
   * disabled.
   */
- static void __hrtimer_peek_ahead_timers(void)
+ static inline void __hrtimer_peek_ahead_timers(void)
  {
        struct tick_device *td;
  
                hrtimer_interrupt(td->evtdev);
  }
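
Back in hrtimer_interrupt() above: when tick_program_event() keeps failing
because the computed expiry already lies in the past, the handler retries a
bounded number of times (the retries counter and the retry: label in the hunk
above), then flags hang_detected, records how long it has been looping in
max_hang_time and programs a deliberately later event instead of spinning
forever. A small standalone sketch of that bounded-retry-then-back-off shape,
with hypothetical numbers and names rather than the kernel's exact policy:

    #include <stdio.h>

    enum { MAX_RETRIES = 3 };

    /* Pretend reprogramming fails while the requested expiry is in the past. */
    static int program_event(long long expires_ns, long long now_ns)
    {
            return expires_ns > now_ns;         /* non-zero == success */
    }

    int main(void)
    {
            long long now = 1000000, next = 999000;  /* expiry already passed */
            long long entry = now;
            int retries;

            for (retries = 0; retries < MAX_RETRIES; retries++) {
                    /* ... expire whatever became due, recompute next ... */
                    if (program_event(next, now))
                            return 0;           /* reprogrammed, done */
                    now += 250000;              /* expiring took more time */
            }

            /* Hang: stop retrying and back off by a bounded amount so the
             * CPU gets a chance to catch up before the next interrupt. */
            {
                    long long spent = now - entry;
                    long long cap = 100000000;  /* 100ms in ns */
                    long long backoff = spent > cap ? cap : spent;

                    printf("hang detected after %lldns, next event at %lld\n",
                           spent, now + backoff);
            }
            return 1;
    }
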
  
- /**
-  * hrtimer_peek_ahead_timers -- run soft-expired timers now
-  *
-  * hrtimer_peek_ahead_timers will peek at the timer queue of
-  * the current cpu and check if there are any timers for which
-  * the soft expires time has passed. If any such timers exist,
-  * they are run immediately and then removed from the timer queue.
-  *
-  */
- void hrtimer_peek_ahead_timers(void)
- {
-       unsigned long flags;
-       local_irq_save(flags);
-       __hrtimer_peek_ahead_timers();
-       local_irq_restore(flags);
- }
- static void run_hrtimer_softirq(struct softirq_action *h)
- {
-       hrtimer_peek_ahead_timers();
- }
  #else /* CONFIG_HIGH_RES_TIMERS */
  
  static inline void __hrtimer_peek_ahead_timers(void) { }
  #endif        /* !CONFIG_HIGH_RES_TIMERS */
  
  /*
-  * Called from timer softirq every jiffy, expire hrtimers:
-  *
-  * For HRT its the fall back code to run the softirq in the timer
-  * softirq context in case the hrtimer initialization failed or has
-  * not been done yet.
+  * Called from run_local_timers in hardirq context every jiffy
   */
- void hrtimer_run_pending(void)
+ void hrtimer_run_queues(void)
  {
-       if (hrtimer_hres_active())
+       struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+       ktime_t now;
+       if (__hrtimer_hres_active(cpu_base))
                return;
  
        /*
-        * This _is_ ugly: We have to check in the softirq context,
-        * whether we can switch to highres and / or nohz mode. The
-        * clocksource switch happens in the timer interrupt with
-        * xtime_lock held. Notification from there only sets the
-        * check bit in the tick_oneshot code, otherwise we might
-        * deadlock vs. xtime_lock.
+        * This _is_ ugly: We have to check periodically whether we can
+        * switch to highres and/or nohz mode. The clocksource
+        * switch happens with xtime_lock held. Notification from
+        * there only sets the check bit in the tick_oneshot code,
+        * otherwise we might deadlock vs. xtime_lock.
         */
-       if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
+       if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
                hrtimer_switch_to_hres();
- }
- /*
-  * Called from hardirq context every jiffy
-  */
- void hrtimer_run_queues(void)
- {
-       struct timerqueue_node *node;
-       struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-       struct hrtimer_clock_base *base;
-       int index, gettime = 1;
-       if (hrtimer_hres_active())
                return;
-       for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
-               base = &cpu_base->clock_base[index];
-               if (!timerqueue_getnext(&base->active))
-                       continue;
-               if (gettime) {
-                       hrtimer_get_softirq_time(cpu_base);
-                       gettime = 0;
-               }
-               raw_spin_lock(&cpu_base->lock);
-               while ((node = timerqueue_getnext(&base->active))) {
-                       struct hrtimer *timer;
-                       timer = container_of(node, struct hrtimer, node);
-                       if (base->softirq_time.tv64 <=
-                                       hrtimer_get_expires_tv64(timer))
-                               break;
-                       __run_hrtimer(timer, &base->softirq_time);
-               }
-               raw_spin_unlock(&cpu_base->lock);
        }
+       raw_spin_lock(&cpu_base->lock);
+       now = hrtimer_update_base(cpu_base);
+       __hrtimer_run_queues(cpu_base, now);
+       raw_spin_unlock(&cpu_base->lock);
  }
  
  /*
@@@ -1497,8 -1456,6 +1458,6 @@@ static int __sched do_nanosleep(struct 
        do {
                set_current_state(TASK_INTERRUPTIBLE);
                hrtimer_start_expires(&t->timer, mode);
-               if (!hrtimer_active(&t->timer))
-                       t->task = NULL;
  
                if (likely(t->task))
                        freezable_schedule();
@@@ -1642,11 -1599,11 +1601,11 @@@ static void migrate_hrtimer_list(struc
                debug_deactivate(timer);
  
                /*
-                * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+                * Mark it as ENQUEUED, not INACTIVE, otherwise the
                 * timer could be seen as !active and just vanish away
                 * under us on another CPU
                 */
-               __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
+               __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
                timer->base = new_base;
                /*
                 * Enqueue the timers on the new cpu. This does not
                 * event device.
                 */
                enqueue_hrtimer(timer, new_base);
-               /* Clear the migration state bit */
-               timer->state &= ~HRTIMER_STATE_MIGRATE;
        }
  }
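
With STATE_MIGRATE folded into ENQUEUED here, and the CALLBACK bit replaced by
the cpu_base->running pointer earlier in this file, the per-timer state is
effectively reduced to a single "is it queued" bit. Illustrative summary only;
the authoritative definitions live in include/linux/hrtimer.h:

    #define HRTIMER_STATE_INACTIVE  0x00    /* not queued on any clock base */
    #define HRTIMER_STATE_ENQUEUED  0x01    /* queued on a clock base       */
    /* "running its callback" is no longer a timer->state bit at all; it is
     * tracked as cpu_base->running == timer, as __run_hrtimer() shows above. */
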
  
@@@ -1731,9 -1685,6 +1687,6 @@@ void __init hrtimers_init(void
        hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
                          (void *)(long)smp_processor_id());
        register_cpu_notifier(&hrtimers_nb);
- #ifdef CONFIG_HIGH_RES_TIMERS
-       open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
- #endif
  }
  
  /**
@@@ -1772,8 -1723,6 +1725,6 @@@ schedule_hrtimeout_range_clock(ktime_t 
        hrtimer_init_sleeper(&t, current);
  
        hrtimer_start_expires(&t.timer, mode);
-       if (!hrtimer_active(&t.timer))
-               t.task = NULL;
  
        if (likely(t.task))
                schedule();
diff --combined net/sched/sch_api.c
index 1e1c89e51a118e79610c49412e335191fc3ba834,45bc63ae18e3ae9a5fe2a80e9de4763cac39cebd..73a123daa2cc5c4c43c69120d1fecd273df76c17
@@@ -815,8 -815,10 +815,8 @@@ static int qdisc_graft(struct net_devic
                if (dev->flags & IFF_UP)
                        dev_deactivate(dev);
  
 -              if (new && new->ops->attach) {
 -                      new->ops->attach(new);
 -                      num_q = 0;
 -              }
 +              if (new && new->ops->attach)
 +                      goto skip;
  
                for (i = 0; i < num_q; i++) {
                        struct netdev_queue *dev_queue = dev_ingress_queue(dev);
                                qdisc_destroy(old);
                }
  
 +skip:
                if (!ingress) {
                        notify_and_destroy(net, skb, n, classid,
                                           dev->qdisc, new);
                        if (new && !new->ops->attach)
                                atomic_inc(&new->refcnt);
                        dev->qdisc = new ? : &noop_qdisc;
 +
 +                      if (new && new->ops->attach)
 +                              new->ops->attach(new);
                } else {
                        notify_and_destroy(net, skb, n, classid, old, new);
                }
@@@ -1885,13 -1883,10 +1885,10 @@@ EXPORT_SYMBOL(tcf_destroy_chain)
  #ifdef CONFIG_PROC_FS
  static int psched_show(struct seq_file *seq, void *v)
  {
-       struct timespec ts;
-       hrtimer_get_res(CLOCK_MONOTONIC, &ts);
        seq_printf(seq, "%08x %08x %08x %08x\n",
                   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
                   1000000,
-                  (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
+                  (u32)NSEC_PER_SEC / hrtimer_resolution);
  
        return 0;
  }
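
With hrtimer_get_res() gone, the fourth field of /proc/net/psched is simply
NSEC_PER_SEC / hrtimer_resolution. A small userspace reader, assuming only the
four-hex-field format that psched_show() prints above:

    #include <stdio.h>

    int main(void)
    {
            unsigned int t2us, us2t, clk_res, hres;
            FILE *f = fopen("/proc/net/psched", "r");

            if (!f || fscanf(f, "%x %x %x %x",
                             &t2us, &us2t, &clk_res, &hres) != 4) {
                    fprintf(stderr, "could not parse /proc/net/psched\n");
                    return 1;
            }
            fclose(f);

            /* The fourth field counts hrtimer ticks per second; invert it to
             * recover the resolution in nanoseconds. */
            if (hres)
                    printf("hrtimer resolution: %u ns\n", 1000000000u / hres);
            return 0;
    }

Utilities such as tc derive their clock parameters from this file, which is why
the output keeps its historic shape even though the kernel side now reads a
plain global instead of querying the clock.
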