git.kernelconcepts.de Git - karo-tx-linux.git/commitdiff
Merge branch 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
author    Linus Torvalds <torvalds@linux-foundation.org>
          Tue, 23 Jun 2015 01:57:44 +0000 (18:57 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Tue, 23 Jun 2015 01:57:44 +0000 (18:57 -0700)
Pull timer updates from Thomas Gleixner:
 "A rather largish update for everything time and timer related:

   - Cache footprint optimizations for both hrtimers and timer wheel

   - Lower the NOHZ impact on systems which have NOHZ or timer migration
     disabled at runtime.

   - Optimize run time overhead of hrtimer interrupt by making the clock
     offset updates smarter

   - hrtimer cleanups and removal of restrictions to tackle some
     problems in sched/perf

   - Some more leap second tweaks

   - Another round of changes addressing the 2038 problem

   - First step to change the internals of clock event devices by
     introducing the necessary infrastructure

   - Allow constant folding for usecs/msecs_to_jiffies()

   - The usual pile of clockevent/clocksource driver updates

  The hrtimer changes contain updates to sched, perf and x86 as they
  depend on them, plus changes all over the tree to clean up API changes
  and redundant code, which got copied all over the place.  The y2038
  changes touch s390 to remove the last non-2038-safe code related to
  the boot/persistent clock"
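
A note on the "Allow constant folding for usecs/msecs_to_jiffies()" item in
the message above: the idea is that a conversion called with a
compile-time-constant argument should collapse to a constant instead of going
through the out-of-line runtime helper. Below is a minimal, self-contained
sketch of that pattern only; the names, the SKETCH_HZ value and the rounding
formula are illustrative and are not the kernel's actual implementation.

/*
 * Sketch of the constant-folding pattern: __builtin_constant_p() selects
 * the branch the compiler can evaluate at build time, so constant
 * arguments reduce to a literal and only variable arguments reach the
 * runtime helper.
 */
#include <stdio.h>

#define SKETCH_HZ 250   /* illustrative tick rate, not CONFIG_HZ */

/* runtime fallback, standing in for the kernel's out-of-line helper */
static unsigned long sketch_msecs_to_jiffies_rt(unsigned int m)
{
        return (m + (1000 / SKETCH_HZ) - 1) / (1000 / SKETCH_HZ);
}

/* constant path: with a literal argument this folds to a constant */
#define sketch_msecs_to_jiffies(m)                                      \
        (__builtin_constant_p(m)                                        \
                ? (unsigned long)(((m) + (1000 / SKETCH_HZ) - 1) /      \
                                  (1000 / SKETCH_HZ))                   \
                : sketch_msecs_to_jiffies_rt(m))

int main(void)
{
        unsigned int msecs = 1500;

        printf("%lu\n", sketch_msecs_to_jiffies(100));   /* folds to 25 */
        printf("%lu\n", sketch_msecs_to_jiffies(msecs)); /* runtime path */
        return 0;
}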

* 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (114 commits)
  clocksource: Increase dependencies of timer-stm32 to limit build wreckage
  timer: Minimize nohz off overhead
  timer: Reduce timer migration overhead if disabled
  timer: Stats: Simplify the flags handling
  timer: Replace timer base by a cpu index
  timer: Use hlist for the timer wheel hash buckets
  timer: Remove FIFO "guarantee"
  timers: Sanitize catchup_timer_jiffies() usage
  hrtimer: Allow hrtimer::function() to free the timer
  seqcount: Introduce raw_write_seqcount_barrier()
  seqcount: Rename write_seqcount_barrier()
  hrtimer: Fix hrtimer_is_queued() hole
  hrtimer: Remove HRTIMER_STATE_MIGRATE
  selftest: Timers: Avoid signal deadlock in leap-a-day
  timekeeping: Copy the shadow-timekeeper over the real timekeeper last
  clockevents: Check state instead of mode in suspend/resume path
  selftests: timers: Add leap-second timer edge testing to leap-a-day.c
  ntp: Do leapsecond adjustment in adjtimex read path
  time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
  ntp: Introduce and use SECS_PER_DAY macro instead of 86400
  ...

18 files changed:
arch/x86/kernel/cpu/perf_event_intel_uncore.c
fs/dcache.c
include/linux/perf_event.h
include/linux/rcupdate.h
include/linux/rcutree.h
include/linux/sched.h
kernel/events/core.c
kernel/futex.c
kernel/locking/rtmutex.c
kernel/rcu/tree_plugin.h
kernel/sched/core.c
kernel/sched/deadline.c
kernel/sched/debug.c
kernel/sched/fair.c
kernel/sched/rt.c
kernel/sched/sched.h
kernel/time/hrtimer.c
net/sched/sch_api.c

diff --combined arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 7c1de16101782b37e2ace385c55d0f7941a0d90d,7c411f0e58fd5fa0eb7368dd4bf9100c9eae3cbe..21b5e38c921b7a78102a2adbabf06328b56dbf9b
@@@ -233,9 -233,8 +233,8 @@@ static enum hrtimer_restart uncore_pmu_
  
  void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
  {
-       __hrtimer_start_range_ns(&box->hrtimer,
-                       ns_to_ktime(box->hrtimer_duration), 0,
-                       HRTIMER_MODE_REL_PINNED, 0);
+       hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration),
+                     HRTIMER_MODE_REL_PINNED);
  }
  
  void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
@@@ -365,8 -364,9 +364,8 @@@ static int uncore_assign_events(struct 
        bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
  
        for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
 -              hwc = &box->event_list[i]->hw;
                c = uncore_get_event_constraint(box, box->event_list[i]);
 -              hwc->constraint = c;
 +              box->event_constraint[i] = c;
                wmin = min(wmin, c->weight);
                wmax = max(wmax, c->weight);
        }
        /* fastpath, try to reuse previous register */
        for (i = 0; i < n; i++) {
                hwc = &box->event_list[i]->hw;
 -              c = hwc->constraint;
 +              c = box->event_constraint[i];
  
                /* never assigned */
                if (hwc->idx == -1)
        }
        /* slow path */
        if (i != n)
 -              ret = perf_assign_events(box->event_list, n,
 -                                       wmin, wmax, assign);
 +              ret = perf_assign_events(box->event_constraint, n,
 +                                       wmin, wmax, n, assign);
  
        if (!assign || ret) {
                for (i = 0; i < n; i++)
@@@ -839,7 -839,6 +838,7 @@@ static int uncore_pci_probe(struct pci_
        box->phys_id = phys_id;
        box->pci_dev = pdev;
        box->pmu = pmu;
 +      uncore_box_init(box);
        pci_set_drvdata(pdev, box);
  
        raw_spin_lock(&uncore_box_lock);
@@@ -922,9 -921,6 +921,9 @@@ static int __init uncore_pci_init(void
        case 69: /* Haswell Celeron */
                ret = hsw_uncore_pci_init();
                break;
 +      case 61: /* Broadwell */
 +              ret = bdw_uncore_pci_init();
 +              break;
        default:
                return 0;
        }
@@@ -1006,10 -1002,8 +1005,10 @@@ static int uncore_cpu_starting(int cpu
                        pmu = &type->pmus[j];
                        box = *per_cpu_ptr(pmu->box, cpu);
                        /* called by uncore_cpu_init? */
 -                      if (box && box->phys_id >= 0)
 +                      if (box && box->phys_id >= 0) {
 +                              uncore_box_init(box);
                                continue;
 +                      }
  
                        for_each_online_cpu(k) {
                                exist = *per_cpu_ptr(pmu->box, k);
                                }
                        }
  
 -                      if (box)
 +                      if (box) {
                                box->phys_id = phys_id;
 +                              uncore_box_init(box);
 +                      }
                }
        }
        return 0;
diff --combined fs/dcache.c
index 37b5afdaf6989e211151cc55a7fa656a6addd364,b43a1694d2caebd475148ebafbd68d751cdc0b99..592c4b582495b515c52a2aa3458be3422953c111
@@@ -322,17 -322,17 +322,17 @@@ static void dentry_free(struct dentry *
  }
  
  /**
-  * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
+  * dentry_rcuwalk_invalidate - invalidate in-progress rcu-walk lookups
   * @dentry: the target dentry
   * After this call, in-progress rcu-walk path lookup will fail. This
   * should be called after unhashing, and after changing d_inode (if
   * the dentry has not already been unhashed).
   */
- static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
+ static inline void dentry_rcuwalk_invalidate(struct dentry *dentry)
  {
-       assert_spin_locked(&dentry->d_lock);
-       /* Go through a barrier */
-       write_seqcount_barrier(&dentry->d_seq);
+       lockdep_assert_held(&dentry->d_lock);
+       /* Go through an invalidation barrier */
+       write_seqcount_invalidate(&dentry->d_seq);
  }
  
  /*
@@@ -372,7 -372,7 +372,7 @@@ static void dentry_unlink_inode(struct 
        struct inode *inode = dentry->d_inode;
        __d_clear_type_and_inode(dentry);
        hlist_del_init(&dentry->d_u.d_alias);
-       dentry_rcuwalk_barrier(dentry);
+       dentry_rcuwalk_invalidate(dentry);
        spin_unlock(&dentry->d_lock);
        spin_unlock(&inode->i_lock);
        if (!inode->i_nlink)
@@@ -494,7 -494,7 +494,7 @@@ void __d_drop(struct dentry *dentry
                __hlist_bl_del(&dentry->d_hash);
                dentry->d_hash.pprev = NULL;
                hlist_bl_unlock(b);
-               dentry_rcuwalk_barrier(dentry);
+               dentry_rcuwalk_invalidate(dentry);
        }
  }
  EXPORT_SYMBOL(__d_drop);
@@@ -1239,13 -1239,13 +1239,13 @@@ ascend
                /* might go back up the wrong parent if we have had a rename. */
                if (need_seqretry(&rename_lock, seq))
                        goto rename_retry;
 -              next = child->d_child.next;
 -              while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)) {
 +              /* go into the first sibling still alive */
 +              do {
 +                      next = child->d_child.next;
                        if (next == &this_parent->d_subdirs)
                                goto ascend;
                        child = list_entry(next, struct dentry, d_child);
 -                      next = next->next;
 -              }
 +              } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
                rcu_read_unlock();
                goto resume;
        }
@@@ -1752,7 -1752,7 +1752,7 @@@ static void __d_instantiate(struct dent
        if (inode)
                hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
        __d_set_inode_and_type(dentry, inode, add_flags);
-       dentry_rcuwalk_barrier(dentry);
+       dentry_rcuwalk_invalidate(dentry);
        spin_unlock(&dentry->d_lock);
        fsnotify_d_instantiate(dentry, inode);
  }
diff --combined include/linux/perf_event.h
index a204d5266f5f0fc6ba6c175e9580d9b00c9621dc,cf3342a8ad807c7583d0b2ea1a72bc34735f3e5b..1b82d44b0a02d278f980acb0ae2158d2462fb329
@@@ -92,6 -92,8 +92,6 @@@ struct hw_perf_event_extra 
        int             idx;    /* index in shared_regs->regs[] */
  };
  
 -struct event_constraint;
 -
  /**
   * struct hw_perf_event - performance event hardware details:
   */
@@@ -110,6 -112,8 +110,6 @@@ struct hw_perf_event 
  
                        struct hw_perf_event_extra extra_reg;
                        struct hw_perf_event_extra branch_reg;
 -
 -                      struct event_constraint *constraint;
                };
                struct { /* software */
                        struct hrtimer  hrtimer;
                };
                struct { /* intel_cqm */
                        int                     cqm_state;
 -                      int                     cqm_rmid;
 +                      u32                     cqm_rmid;
                        struct list_head        cqm_events_entry;
                        struct list_head        cqm_groups_entry;
                        struct list_head        cqm_group_entry;
@@@ -562,8 -566,12 +562,12 @@@ struct perf_cpu_context 
        struct perf_event_context       *task_ctx;
        int                             active_oncpu;
        int                             exclusive;
+       raw_spinlock_t                  hrtimer_lock;
        struct hrtimer                  hrtimer;
        ktime_t                         hrtimer_interval;
+       unsigned int                    hrtimer_active;
        struct pmu                      *unique_pmu;
        struct perf_cgroup              *cgrp;
  };
@@@ -730,22 -738,6 +734,22 @@@ extern int perf_event_overflow(struct p
                                 struct perf_sample_data *data,
                                 struct pt_regs *regs);
  
 +extern void perf_event_output(struct perf_event *event,
 +                              struct perf_sample_data *data,
 +                              struct pt_regs *regs);
 +
 +extern void
 +perf_event_header__init_id(struct perf_event_header *header,
 +                         struct perf_sample_data *data,
 +                         struct perf_event *event);
 +extern void
 +perf_event__output_id_sample(struct perf_event *event,
 +                           struct perf_output_handle *handle,
 +                           struct perf_sample_data *sample);
 +
 +extern void
 +perf_log_lost_samples(struct perf_event *event, u64 lost);
 +
  static inline bool is_sampling_event(struct perf_event *event)
  {
        return event->attr.sample_period != 0;
@@@ -810,33 -802,11 +814,33 @@@ perf_sw_event_sched(u32 event_id, u64 n
  
  extern struct static_key_deferred perf_sched_events;
  
 +static __always_inline bool
 +perf_sw_migrate_enabled(void)
 +{
 +      if (static_key_false(&perf_swevent_enabled[PERF_COUNT_SW_CPU_MIGRATIONS]))
 +              return true;
 +      return false;
 +}
 +
 +static inline void perf_event_task_migrate(struct task_struct *task)
 +{
 +      if (perf_sw_migrate_enabled())
 +              task->sched_migrated = 1;
 +}
 +
  static inline void perf_event_task_sched_in(struct task_struct *prev,
                                            struct task_struct *task)
  {
        if (static_key_false(&perf_sched_events.key))
                __perf_event_task_sched_in(prev, task);
 +
 +      if (perf_sw_migrate_enabled() && task->sched_migrated) {
 +              struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
 +
 +              perf_fetch_caller_regs(regs);
 +              ___perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, regs, 0);
 +              task->sched_migrated = 0;
 +      }
  }
  
  static inline void perf_event_task_sched_out(struct task_struct *prev,
@@@ -959,8 -929,6 +963,8 @@@ perf_aux_output_skip(struct perf_output
  static inline void *
  perf_get_aux(struct perf_output_handle *handle)                               { return NULL; }
  static inline void
 +perf_event_task_migrate(struct task_struct *task)                     { }
 +static inline void
  perf_event_task_sched_in(struct task_struct *prev,
                         struct task_struct *task)                      { }
  static inline void
diff --combined include/linux/rcupdate.h
index 03a899aabd1762c74bb86b4b807dbf29651fad42,0627a447c589dedd3c0c82aa6ca8d39c86bad037..33a056bb886faeedeb9690faefd3a4adeeedd14b
@@@ -44,6 -44,8 +44,8 @@@
  #include <linux/debugobjects.h>
  #include <linux/bug.h>
  #include <linux/compiler.h>
+ #include <linux/ktime.h>
  #include <asm/barrier.h>
  
  extern int rcu_expedited; /* for sysctl */
@@@ -292,6 -294,10 +294,6 @@@ void rcu_sched_qs(void)
  void rcu_bh_qs(void);
  void rcu_check_callbacks(int user);
  struct notifier_block;
 -void rcu_idle_enter(void);
 -void rcu_idle_exit(void);
 -void rcu_irq_enter(void);
 -void rcu_irq_exit(void);
  int rcu_cpu_notify(struct notifier_block *self,
                   unsigned long action, void *hcpu);
  
@@@ -360,8 -366,8 +362,8 @@@ extern struct srcu_struct tasks_rcu_exi
  #define rcu_note_voluntary_context_switch(t) \
        do { \
                rcu_all_qs(); \
 -              if (ACCESS_ONCE((t)->rcu_tasks_holdout)) \
 -                      ACCESS_ONCE((t)->rcu_tasks_holdout) = false; \
 +              if (READ_ONCE((t)->rcu_tasks_holdout)) \
 +                      WRITE_ONCE((t)->rcu_tasks_holdout, false); \
        } while (0)
  #else /* #ifdef CONFIG_TASKS_RCU */
  #define TASKS_RCU(x) do { } while (0)
@@@ -605,7 -611,7 +607,7 @@@ static inline void rcu_preempt_sleep_ch
  
  #define __rcu_access_pointer(p, space) \
  ({ \
 -      typeof(*p) *_________p1 = (typeof(*p) *__force)ACCESS_ONCE(p); \
 +      typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \
        rcu_dereference_sparse(p, space); \
        ((typeof(*p) __force __kernel *)(_________p1)); \
  })
        ((typeof(*p) __force __kernel *)(p)); \
  })
  
 -#define __rcu_access_index(p, space) \
 -({ \
 -      typeof(p) _________p1 = ACCESS_ONCE(p); \
 -      rcu_dereference_sparse(p, space); \
 -      (_________p1); \
 -})
 -#define __rcu_dereference_index_check(p, c) \
 -({ \
 -      /* Dependency order vs. p above. */ \
 -      typeof(p) _________p1 = lockless_dereference(p); \
 -      rcu_lockdep_assert(c, \
 -                         "suspicious rcu_dereference_index_check() usage"); \
 -      (_________p1); \
 -})
 -
  /**
   * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
   * @v: The value to statically initialize with.
   */
  #define lockless_dereference(p) \
  ({ \
 -      typeof(p) _________p1 = ACCESS_ONCE(p); \
 +      typeof(p) _________p1 = READ_ONCE(p); \
        smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
        (_________p1); \
  })
   * @p: The pointer to read
   *
   * Return the value of the specified RCU-protected pointer, but omit the
 - * smp_read_barrier_depends() and keep the ACCESS_ONCE().  This is useful
 + * smp_read_barrier_depends() and keep the READ_ONCE().  This is useful
   * when the value of this pointer is accessed, but the pointer is not
   * dereferenced, for example, when testing an RCU-protected pointer against
   * NULL.  Although rcu_access_pointer() may also be used in cases where
   */
  #define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu)
  
 -/**
 - * rcu_access_index() - fetch RCU index with no dereferencing
 - * @p: The index to read
 - *
 - * Return the value of the specified RCU-protected index, but omit the
 - * smp_read_barrier_depends() and keep the ACCESS_ONCE().  This is useful
 - * when the value of this index is accessed, but the index is not
 - * dereferenced, for example, when testing an RCU-protected index against
 - * -1.  Although rcu_access_index() may also be used in cases where
 - * update-side locks prevent the value of the index from changing, you
 - * should instead use rcu_dereference_index_protected() for this use case.
 - */
 -#define rcu_access_index(p) __rcu_access_index((p), __rcu)
 -
 -/**
 - * rcu_dereference_index_check() - rcu_dereference for indices with debug checking
 - * @p: The pointer to read, prior to dereferencing
 - * @c: The conditions under which the dereference will take place
 - *
 - * Similar to rcu_dereference_check(), but omits the sparse checking.
 - * This allows rcu_dereference_index_check() to be used on integers,
 - * which can then be used as array indices.  Attempting to use
 - * rcu_dereference_check() on an integer will give compiler warnings
 - * because the sparse address-space mechanism relies on dereferencing
 - * the RCU-protected pointer.  Dereferencing integers is not something
 - * that even gcc will put up with.
 - *
 - * Note that this function does not implicitly check for RCU read-side
 - * critical sections.  If this function gains lots of uses, it might
 - * make sense to provide versions for each flavor of RCU, but it does
 - * not make sense as of early 2010.
 - */
 -#define rcu_dereference_index_check(p, c) \
 -      __rcu_dereference_index_check((p), (c))
 -
  /**
   * rcu_dereference_protected() - fetch RCU pointer when updates prevented
   * @p: The pointer to read, prior to dereferencing
   * @c: The conditions under which the dereference will take place
   *
   * Return the value of the specified RCU-protected pointer, but omit
 - * both the smp_read_barrier_depends() and the ACCESS_ONCE().  This
 + * both the smp_read_barrier_depends() and the READ_ONCE().  This
   * is useful in cases where update-side locks prevent the value of the
   * pointer from changing.  Please note that this primitive does -not-
   * prevent the compiler from repeating this reference or combining it
@@@ -1099,13 -1155,13 +1101,13 @@@ static inline notrace void rcu_read_unl
  #define kfree_rcu(ptr, rcu_head)                                      \
        __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))
  
 -#if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL)
 +#ifdef CONFIG_TINY_RCU
- static inline int rcu_needs_cpu(unsigned long *delta_jiffies)
+ static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt)
  {
-       *delta_jiffies = ULONG_MAX;
+       *nextevt = KTIME_MAX;
        return 0;
  }
 -#endif /* #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) */
 +#endif /* #ifdef CONFIG_TINY_RCU */
  
  #if defined(CONFIG_RCU_NOCB_CPU_ALL)
  static inline bool rcu_is_nocb_cpu(int cpu) { return true; }
diff --combined include/linux/rcutree.h
index 3fa4a43ab4150b0b8c956ebacc5de47dace12462,db2e31beaae7c5f179972ec1a49051c240a90329..456879143f89f9db45d0f79315f728f50a9f9d0c
@@@ -31,7 -31,9 +31,7 @@@
  #define __LINUX_RCUTREE_H
  
  void rcu_note_context_switch(void);
- int rcu_needs_cpu(unsigned long *delta_jiffies);
 -#ifndef CONFIG_RCU_NOCB_CPU_ALL
+ int rcu_needs_cpu(u64 basem, u64 *nextevt);
 -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
  void rcu_cpu_stall_reset(void);
  
  /*
@@@ -91,11 -93,6 +91,11 @@@ void rcu_force_quiescent_state(void)
  void rcu_bh_force_quiescent_state(void);
  void rcu_sched_force_quiescent_state(void);
  
 +void rcu_idle_enter(void);
 +void rcu_idle_exit(void);
 +void rcu_irq_enter(void);
 +void rcu_irq_exit(void);
 +
  void exit_rcu(void);
  
  void rcu_scheduler_starting(void);
diff --combined include/linux/sched.h
index d4193d5613cf594108390e5d953451fd73087de5,d7151460b0cfc98d437211a347524164a5d3bc0f..30364cb58b1fa8348bec5169741bf5263b3ab15a
@@@ -25,7 -25,7 +25,7 @@@ struct sched_param 
  #include <linux/errno.h>
  #include <linux/nodemask.h>
  #include <linux/mm_types.h>
 -#include <linux/preempt_mask.h>
 +#include <linux/preempt.h>
  
  #include <asm/page.h>
  #include <asm/ptrace.h>
@@@ -132,7 -132,6 +132,7 @@@ struct fs_struct
  struct perf_event_context;
  struct blk_plug;
  struct filename;
 +struct nameidata;
  
  #define VMACACHE_BITS 2
  #define VMACACHE_SIZE (1U << VMACACHE_BITS)
@@@ -174,12 -173,7 +174,12 @@@ extern unsigned long nr_iowait_cpu(int 
  extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
  
  extern void calc_global_load(unsigned long ticks);
 +
 +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
  extern void update_cpu_load_nohz(void);
 +#else
 +static inline void update_cpu_load_nohz(void) { }
 +#endif
  
  extern unsigned long get_parent_ip(unsigned long addr);
  
@@@ -219,10 -213,9 +219,10 @@@ print_cfs_rq(struct seq_file *m, int cp
  #define TASK_WAKEKILL         128
  #define TASK_WAKING           256
  #define TASK_PARKED           512
 -#define TASK_STATE_MAX                1024
 +#define TASK_NOLOAD           1024
 +#define TASK_STATE_MAX                2048
  
 -#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWP"
 +#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN"
  
  extern char ___assert_task_state[1 - 2*!!(
                sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
  #define TASK_STOPPED          (TASK_WAKEKILL | __TASK_STOPPED)
  #define TASK_TRACED           (TASK_WAKEKILL | __TASK_TRACED)
  
 +#define TASK_IDLE             (TASK_UNINTERRUPTIBLE | TASK_NOLOAD)
 +
  /* Convenience macros for the sake of wake_up */
  #define TASK_NORMAL           (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
  #define TASK_ALL              (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
                        ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
  #define task_contributes_to_load(task)        \
                                ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
 -                               (task->flags & PF_FROZEN) == 0)
 +                               (task->flags & PF_FROZEN) == 0 && \
 +                               (task->state & TASK_NOLOAD) == 0)
  
  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
  
  #define set_task_state(tsk, state_value)                      \
        do {                                                    \
                (tsk)->task_state_change = _THIS_IP_;           \
 -              set_mb((tsk)->state, (state_value));            \
 +              smp_store_mb((tsk)->state, (state_value));              \
        } while (0)
  
  /*
  #define set_current_state(state_value)                                \
        do {                                                    \
                current->task_state_change = _THIS_IP_;         \
 -              set_mb(current->state, (state_value));          \
 +              smp_store_mb(current->state, (state_value));            \
        } while (0)
  
  #else
  #define __set_task_state(tsk, state_value)            \
        do { (tsk)->state = (state_value); } while (0)
  #define set_task_state(tsk, state_value)              \
 -      set_mb((tsk)->state, (state_value))
 +      smp_store_mb((tsk)->state, (state_value))
  
  /*
   * set_current_state() includes a barrier so that the write of current->state
  #define __set_current_state(state_value)              \
        do { current->state = (state_value); } while (0)
  #define set_current_state(state_value)                        \
 -      set_mb(current->state, (state_value))
 +      smp_store_mb(current->state, (state_value))
  
  #endif
  
@@@ -345,14 -335,10 +345,10 @@@ extern int runqueue_is_locked(int cpu)
  #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
  extern void nohz_balance_enter_idle(int cpu);
  extern void set_cpu_sd_state_idle(void);
- extern int get_nohz_timer_target(int pinned);
+ extern int get_nohz_timer_target(void);
  #else
  static inline void nohz_balance_enter_idle(int cpu) { }
  static inline void set_cpu_sd_state_idle(void) { }
- static inline int get_nohz_timer_target(int pinned)
- {
-       return smp_processor_id();
- }
  #endif
  
  /*
@@@ -577,23 -563,6 +573,23 @@@ struct task_cputime 
                .sum_exec_runtime = 0,                          \
        }
  
 +/*
 + * This is the atomic variant of task_cputime, which can be used for
 + * storing and updating task_cputime statistics without locking.
 + */
 +struct task_cputime_atomic {
 +      atomic64_t utime;
 +      atomic64_t stime;
 +      atomic64_t sum_exec_runtime;
 +};
 +
 +#define INIT_CPUTIME_ATOMIC \
 +      (struct task_cputime_atomic) {                          \
 +              .utime = ATOMIC64_INIT(0),                      \
 +              .stime = ATOMIC64_INIT(0),                      \
 +              .sum_exec_runtime = ATOMIC64_INIT(0),           \
 +      }
 +
  #ifdef CONFIG_PREEMPT_COUNT
  #define PREEMPT_DISABLED      (1 + PREEMPT_ENABLED)
  #else
  
  /**
   * struct thread_group_cputimer - thread group interval timer counts
 - * @cputime:          thread group interval timers.
 + * @cputime_atomic:   atomic thread group interval timers.
   * @running:          non-zero when there are timers running and
   *                    @cputime receives updates.
 - * @lock:             lock for fields in this struct.
   *
   * This structure contains the version of task_cputime, above, that is
   * used for thread group CPU timer calculations.
   */
  struct thread_group_cputimer {
 -      struct task_cputime cputime;
 +      struct task_cputime_atomic cputime_atomic;
        int running;
 -      raw_spinlock_t lock;
  };
  
  #include <linux/rwsem.h>
@@@ -924,50 -895,6 +920,50 @@@ enum cpu_idle_type 
  #define SCHED_CAPACITY_SHIFT  10
  #define SCHED_CAPACITY_SCALE  (1L << SCHED_CAPACITY_SHIFT)
  
 +/*
 + * Wake-queues are lists of tasks with a pending wakeup, whose
 + * callers have already marked the task as woken internally,
 + * and can thus carry on. A common use case is being able to
 + * do the wakeups once the corresponding user lock has been
 + * released.
 + *
 + * We hold reference to each task in the list across the wakeup,
 + * thus guaranteeing that the memory is still valid by the time
 + * the actual wakeups are performed in wake_up_q().
 + *
 + * One per task suffices, because there's never a need for a task to be
 + * in two wake queues simultaneously; it is forbidden to abandon a task
 + * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is
 + * already in a wake queue, the wakeup will happen soon and the second
 + * waker can just skip it.
 + *
 + * The WAKE_Q macro declares and initializes the list head.
 + * wake_up_q() does NOT reinitialize the list; it's expected to be
 + * called near the end of a function, where the fact that the queue is
 + * not used again will be easy to see by inspection.
 + *
 + * Note that this can cause spurious wakeups. schedule() callers
 + * must ensure the call is done inside a loop, confirming that the
 + * wakeup condition has in fact occurred.
 + */
 +struct wake_q_node {
 +      struct wake_q_node *next;
 +};
 +
 +struct wake_q_head {
 +      struct wake_q_node *first;
 +      struct wake_q_node **lastp;
 +};
 +
 +#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
 +
 +#define WAKE_Q(name)                                  \
 +      struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
 +
 +extern void wake_q_add(struct wake_q_head *head,
 +                     struct task_struct *task);
 +extern void wake_up_q(struct wake_q_head *head);
 +
  /*
   * sched-domains (multiprocessor balancing) declarations:
   */
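
The wake_q comment and declarations in the hunk above introduce the new
deferred-wakeup helpers; the kernel/futex.c hunks later in this commit use
them. Below is a minimal sketch of the calling pattern only: the waiter
structure, its list and the spinlock are hypothetical, and just WAKE_Q(),
wake_q_add() and wake_up_q() come from the interface added above.

#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/list.h>

/* Hypothetical waiter kept on a lock-protected list. */
struct my_waiter {
        struct task_struct      *task;
        struct list_head        entry;
};

static void wake_all_waiters(spinlock_t *lock, struct list_head *waiters)
{
        struct my_waiter *w, *tmp;
        WAKE_Q(wake_q);                 /* on-stack wake queue */

        spin_lock(lock);
        list_for_each_entry_safe(w, tmp, waiters, entry) {
                /* only mark the task under the lock; no wakeup yet */
                wake_q_add(&wake_q, w->task);
                list_del_init(&w->entry);
        }
        spin_unlock(lock);

        /* perform the actual wakeups after the lock is dropped */
        wake_up_q(&wake_q);
}

The futex hunks below apply the same split around hb->lock: mark_wake_futex()
only calls wake_q_add(), and wake_up_q() runs after spin_unlock(&hb->lock).
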
@@@ -1403,6 -1330,8 +1399,6 @@@ struct task_struct 
        int rcu_read_lock_nesting;
        union rcu_special rcu_read_unlock_special;
        struct list_head rcu_node_entry;
 -#endif /* #ifdef CONFIG_PREEMPT_RCU */
 -#ifdef CONFIG_PREEMPT_RCU
        struct rcu_node *rcu_blocked_node;
  #endif /* #ifdef CONFIG_PREEMPT_RCU */
  #ifdef CONFIG_TASKS_RCU
  #endif
  
        struct mm_struct *mm, *active_mm;
 -#ifdef CONFIG_COMPAT_BRK
 -      unsigned brk_randomized:1;
 -#endif
        /* per-thread vma caching */
        u32 vmacache_seqnum;
        struct vm_area_struct *vmacache[VMACACHE_SIZE];
        int exit_state;
        int exit_code, exit_signal;
        int pdeath_signal;  /*  The signal sent when the parent dies  */
 -      unsigned int jobctl;    /* JOBCTL_*, siglock protected */
 +      unsigned long jobctl;   /* JOBCTL_*, siglock protected */
  
        /* Used for emulating ABI behavior of previous Linux versions */
        unsigned int personality;
        /* Revert to default priority/policy when forking */
        unsigned sched_reset_on_fork:1;
        unsigned sched_contributes_to_load:1;
 +      unsigned sched_migrated:1;
  
  #ifdef CONFIG_MEMCG_KMEM
        unsigned memcg_kmem_skip_account:1;
  #endif
 +#ifdef CONFIG_COMPAT_BRK
 +      unsigned brk_randomized:1;
 +#endif
  
        unsigned long atomic_flags; /* Flags needing atomic access. */
  
                                       it with task_lock())
                                     - initialized normally by setup_new_exec */
  /* file system info */
 -      int link_count, total_link_count;
 +      struct nameidata *nameidata;
  #ifdef CONFIG_SYSVIPC
  /* ipc stuff */
        struct sysv_sem sysvsem;
        /* Protection of the PI data structures: */
        raw_spinlock_t pi_lock;
  
 +      struct wake_q_node wake_q;
 +
  #ifdef CONFIG_RT_MUTEXES
        /* PI waiters blocked on a rt_mutex held by this task */
        struct rb_root pi_waiters;
  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
        unsigned long   task_state_change;
  #endif
 +      int pagefault_disabled;
  };
  
  /* Future-safe accessor for struct task_struct's cpus_allowed. */
@@@ -2148,22 -2073,22 +2144,22 @@@ TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab
  #define JOBCTL_TRAPPING_BIT   21      /* switching to TRACED */
  #define JOBCTL_LISTENING_BIT  22      /* ptracer is listening for events */
  
 -#define JOBCTL_STOP_DEQUEUED  (1 << JOBCTL_STOP_DEQUEUED_BIT)
 -#define JOBCTL_STOP_PENDING   (1 << JOBCTL_STOP_PENDING_BIT)
 -#define JOBCTL_STOP_CONSUME   (1 << JOBCTL_STOP_CONSUME_BIT)
 -#define JOBCTL_TRAP_STOP      (1 << JOBCTL_TRAP_STOP_BIT)
 -#define JOBCTL_TRAP_NOTIFY    (1 << JOBCTL_TRAP_NOTIFY_BIT)
 -#define JOBCTL_TRAPPING               (1 << JOBCTL_TRAPPING_BIT)
 -#define JOBCTL_LISTENING      (1 << JOBCTL_LISTENING_BIT)
 +#define JOBCTL_STOP_DEQUEUED  (1UL << JOBCTL_STOP_DEQUEUED_BIT)
 +#define JOBCTL_STOP_PENDING   (1UL << JOBCTL_STOP_PENDING_BIT)
 +#define JOBCTL_STOP_CONSUME   (1UL << JOBCTL_STOP_CONSUME_BIT)
 +#define JOBCTL_TRAP_STOP      (1UL << JOBCTL_TRAP_STOP_BIT)
 +#define JOBCTL_TRAP_NOTIFY    (1UL << JOBCTL_TRAP_NOTIFY_BIT)
 +#define JOBCTL_TRAPPING               (1UL << JOBCTL_TRAPPING_BIT)
 +#define JOBCTL_LISTENING      (1UL << JOBCTL_LISTENING_BIT)
  
  #define JOBCTL_TRAP_MASK      (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
  #define JOBCTL_PENDING_MASK   (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
  
  extern bool task_set_jobctl_pending(struct task_struct *task,
 -                                  unsigned int mask);
 +                                  unsigned long mask);
  extern void task_clear_jobctl_trapping(struct task_struct *task);
  extern void task_clear_jobctl_pending(struct task_struct *task,
 -                                    unsigned int mask);
 +                                    unsigned long mask);
  
  static inline void rcu_copy_process(struct task_struct *p)
  {
@@@ -3033,6 -2958,11 +3029,6 @@@ static __always_inline bool need_resche
  void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
  void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
  
 -static inline void thread_group_cputime_init(struct signal_struct *sig)
 -{
 -      raw_spin_lock_init(&sig->cputimer.lock);
 -}
 -
  /*
   * Reevaluate whether the task has signals pending delivery.
   * Wake the task if so.
@@@ -3146,13 -3076,13 +3142,13 @@@ static inline void mm_update_next_owner
  static inline unsigned long task_rlimit(const struct task_struct *tsk,
                unsigned int limit)
  {
 -      return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur);
 +      return READ_ONCE(tsk->signal->rlim[limit].rlim_cur);
  }
  
  static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
                unsigned int limit)
  {
 -      return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max);
 +      return READ_ONCE(tsk->signal->rlim[limit].rlim_max);
  }
  
  static inline unsigned long rlimit(unsigned int limit)
diff --combined kernel/events/core.c
index f2003b97ddc99d726cf5cc145b134b128671f17c,1c6c2826af1ee0bfa6e8d1c9ba1284fb065550c3..8e13f3e54ec369f26d52e52081f013a6aa29fd23
  
  static struct workqueue_struct *perf_wq;
  
+ typedef int (*remote_function_f)(void *);
  struct remote_function_call {
        struct task_struct      *p;
-       int                     (*func)(void *info);
+       remote_function_f       func;
        void                    *info;
        int                     ret;
  };
@@@ -86,7 -88,7 +88,7 @@@ static void remote_function(void *data
   *        -EAGAIN - when the process moved away
   */
  static int
- task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
+ task_function_call(struct task_struct *p, remote_function_f func, void *info)
  {
        struct remote_function_call data = {
                .p      = p,
   *
   * returns: @func return value or -ENXIO when the cpu is offline
   */
- static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
+ static int cpu_function_call(int cpu, remote_function_f func, void *info)
  {
        struct remote_function_call data = {
                .p      = NULL,
@@@ -747,62 -749,31 +749,31 @@@ perf_cgroup_mark_enabled(struct perf_ev
  /*
   * function must be called with interrupts disabled
   */
- static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
  {
        struct perf_cpu_context *cpuctx;
-       enum hrtimer_restart ret = HRTIMER_NORESTART;
        int rotations = 0;
  
        WARN_ON(!irqs_disabled());
  
        cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
        rotations = perf_rotate_context(cpuctx);
  
-       /*
-        * arm timer if needed
-        */
-       if (rotations) {
+       raw_spin_lock(&cpuctx->hrtimer_lock);
+       if (rotations)
                hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
-               ret = HRTIMER_RESTART;
-       }
-       return ret;
- }
- /* CPU is going down */
- void perf_cpu_hrtimer_cancel(int cpu)
- {
-       struct perf_cpu_context *cpuctx;
-       struct pmu *pmu;
-       unsigned long flags;
-       if (WARN_ON(cpu != smp_processor_id()))
-               return;
-       local_irq_save(flags);
-       rcu_read_lock();
-       list_for_each_entry_rcu(pmu, &pmus, entry) {
-               cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-               if (pmu->task_ctx_nr == perf_sw_context)
-                       continue;
-               hrtimer_cancel(&cpuctx->hrtimer);
-       }
-       rcu_read_unlock();
+       else
+               cpuctx->hrtimer_active = 0;
+       raw_spin_unlock(&cpuctx->hrtimer_lock);
  
-       local_irq_restore(flags);
+       return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
  }
  
- static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
  {
-       struct hrtimer *hr = &cpuctx->hrtimer;
+       struct hrtimer *timer = &cpuctx->hrtimer;
        struct pmu *pmu = cpuctx->ctx.pmu;
-       int timer;
+       u64 interval;
  
        /* no multiplexing needed for SW PMU */
        if (pmu->task_ctx_nr == perf_sw_context)
         * check default is sane, if not set then force to
         * default interval (1/tick)
         */
-       timer = pmu->hrtimer_interval_ms;
-       if (timer < 1)
-               timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
+       interval = pmu->hrtimer_interval_ms;
+       if (interval < 1)
+               interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
  
-       cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+       cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
  
-       hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
-       hr->function = perf_cpu_hrtimer_handler;
+       raw_spin_lock_init(&cpuctx->hrtimer_lock);
+       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+       timer->function = perf_mux_hrtimer_handler;
  }
  
- static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+ static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
  {
-       struct hrtimer *hr = &cpuctx->hrtimer;
+       struct hrtimer *timer = &cpuctx->hrtimer;
        struct pmu *pmu = cpuctx->ctx.pmu;
+       unsigned long flags;
  
        /* not for SW PMU */
        if (pmu->task_ctx_nr == perf_sw_context)
-               return;
+               return 0;
  
-       if (hrtimer_active(hr))
-               return;
+       raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
+       if (!cpuctx->hrtimer_active) {
+               cpuctx->hrtimer_active = 1;
+               hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+               hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+       }
+       raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
  
-       if (!hrtimer_callback_running(hr))
-               __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
-                                        0, HRTIMER_MODE_REL_PINNED, 0);
+       return 0;
  }
  
  void perf_pmu_disable(struct pmu *pmu)
@@@ -1935,7 -1911,7 +1911,7 @@@ group_sched_in(struct perf_event *group
  
        if (event_sched_in(group_event, cpuctx, ctx)) {
                pmu->cancel_txn(pmu);
-               perf_cpu_hrtimer_restart(cpuctx);
+               perf_mux_hrtimer_restart(cpuctx);
                return -EAGAIN;
        }
  
@@@ -1982,7 -1958,7 +1958,7 @@@ group_error
  
        pmu->cancel_txn(pmu);
  
-       perf_cpu_hrtimer_restart(cpuctx);
+       perf_mux_hrtimer_restart(cpuctx);
  
        return -EAGAIN;
  }
@@@ -2255,7 -2231,7 +2231,7 @@@ static int __perf_event_enable(void *in
                 */
                if (leader != event) {
                        group_sched_out(leader, cpuctx, ctx);
-                       perf_cpu_hrtimer_restart(cpuctx);
+                       perf_mux_hrtimer_restart(cpuctx);
                }
                if (leader->attr.pinned) {
                        update_group_times(leader);
@@@ -3442,6 -3418,7 +3418,6 @@@ static void free_event_rcu(struct rcu_h
        if (event->ns)
                put_pid_ns(event->ns);
        perf_event_free_filter(event);
 -      perf_event_free_bpf_prog(event);
        kfree(event);
  }
  
@@@ -3572,8 -3549,6 +3548,8 @@@ static void __free_event(struct perf_ev
                        put_callchain_buffers();
        }
  
 +      perf_event_free_bpf_prog(event);
 +
        if (event->destroy)
                event->destroy(event);
  
@@@ -4331,20 -4306,20 +4307,20 @@@ static void ring_buffer_attach(struct p
                WARN_ON_ONCE(event->rcu_pending);
  
                old_rb = event->rb;
 -              event->rcu_batches = get_state_synchronize_rcu();
 -              event->rcu_pending = 1;
 -
                spin_lock_irqsave(&old_rb->event_lock, flags);
                list_del_rcu(&event->rb_entry);
                spin_unlock_irqrestore(&old_rb->event_lock, flags);
 -      }
  
 -      if (event->rcu_pending && rb) {
 -              cond_synchronize_rcu(event->rcu_batches);
 -              event->rcu_pending = 0;
 +              event->rcu_batches = get_state_synchronize_rcu();
 +              event->rcu_pending = 1;
        }
  
        if (rb) {
 +              if (event->rcu_pending) {
 +                      cond_synchronize_rcu(event->rcu_batches);
 +                      event->rcu_pending = 0;
 +              }
 +
                spin_lock_irqsave(&rb->event_lock, flags);
                list_add_rcu(&event->rb_entry, &rb->event_list);
                spin_unlock_irqrestore(&rb->event_lock, flags);
@@@ -5381,9 -5356,9 +5357,9 @@@ void perf_prepare_sample(struct perf_ev
        }
  }
  
 -static void perf_event_output(struct perf_event *event,
 -                              struct perf_sample_data *data,
 -                              struct pt_regs *regs)
 +void perf_event_output(struct perf_event *event,
 +                      struct perf_sample_data *data,
 +                      struct pt_regs *regs)
  {
        struct perf_output_handle handle;
        struct perf_event_header header;
@@@ -5974,39 -5949,6 +5950,39 @@@ void perf_event_aux_event(struct perf_e
        perf_output_end(&handle);
  }
  
 +/*
 + * Lost/dropped samples logging
 + */
 +void perf_log_lost_samples(struct perf_event *event, u64 lost)
 +{
 +      struct perf_output_handle handle;
 +      struct perf_sample_data sample;
 +      int ret;
 +
 +      struct {
 +              struct perf_event_header        header;
 +              u64                             lost;
 +      } lost_samples_event = {
 +              .header = {
 +                      .type = PERF_RECORD_LOST_SAMPLES,
 +                      .misc = 0,
 +                      .size = sizeof(lost_samples_event),
 +              },
 +              .lost           = lost,
 +      };
 +
 +      perf_event_header__init_id(&lost_samples_event.header, &sample, event);
 +
 +      ret = perf_output_begin(&handle, event,
 +                              lost_samples_event.header.size);
 +      if (ret)
 +              return;
 +
 +      perf_output_put(&handle, lost_samples_event);
 +      perf_event__output_id_sample(event, &handle, &sample);
 +      perf_output_end(&handle);
 +}
 +
  /*
   * IRQ throttle logging
   */
@@@ -6897,9 -6839,8 +6873,8 @@@ static void perf_swevent_start_hrtimer(
        } else {
                period = max_t(u64, 10000, hwc->sample_period);
        }
-       __hrtimer_start_range_ns(&hwc->hrtimer,
-                               ns_to_ktime(period), 0,
-                               HRTIMER_MODE_REL_PINNED, 0);
+       hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
+                     HRTIMER_MODE_REL_PINNED);
  }
  
  static void perf_swevent_cancel_hrtimer(struct perf_event *event)
@@@ -7200,6 -7141,8 +7175,8 @@@ perf_event_mux_interval_ms_show(struct 
        return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
  }
  
+ static DEFINE_MUTEX(mux_interval_mutex);
  static ssize_t
  perf_event_mux_interval_ms_store(struct device *dev,
                                 struct device_attribute *attr,
        if (timer == pmu->hrtimer_interval_ms)
                return count;
  
+       mutex_lock(&mux_interval_mutex);
        pmu->hrtimer_interval_ms = timer;
  
        /* update all cpuctx for this PMU */
-       for_each_possible_cpu(cpu) {
+       get_online_cpus();
+       for_each_online_cpu(cpu) {
                struct perf_cpu_context *cpuctx;
                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
                cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
  
-               if (hrtimer_active(&cpuctx->hrtimer))
-                       hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+               cpu_function_call(cpu,
+                       (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
        }
+       put_online_cpus();
+       mutex_unlock(&mux_interval_mutex);
  
        return count;
  }
@@@ -7334,7 -7281,7 +7315,7 @@@ skip_type
                lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
                cpuctx->ctx.pmu = pmu;
  
-               __perf_cpu_hrtimer_init(cpuctx, cpu);
+               __perf_mux_hrtimer_init(cpuctx, cpu);
  
                cpuctx->unique_pmu = pmu;
        }
diff --combined kernel/futex.c
index aacc706f85fcc1d7a1e8dd6c4e753c5b3495e1b0,720eacff6b581d3d16638a86aaa42dcc0be5dce7..ea6ca0bca52570b8cd88a9c428016cd54cf55a0a
@@@ -1090,11 -1090,9 +1090,11 @@@ static void __unqueue_futex(struct fute
  
  /*
   * The hash bucket lock must be held when this is called.
 - * Afterwards, the futex_q must not be accessed.
 + * Afterwards, the futex_q must not be accessed. Callers
 + * must ensure to later call wake_up_q() for the actual
 + * wakeups to occur.
   */
 -static void wake_futex(struct futex_q *q)
 +static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
  {
        struct task_struct *p = q->task;
  
                return;
  
        /*
 -       * We set q->lock_ptr = NULL _before_ we wake up the task. If
 -       * a non-futex wake up happens on another CPU then the task
 -       * might exit and p would dereference a non-existing task
 -       * struct. Prevent this by holding a reference on p across the
 -       * wake up.
 +       * Queue the task for later wakeup for after we've released
 +       * the hb->lock. wake_q_add() grabs reference to p.
         */
 -      get_task_struct(p);
 -
 +      wake_q_add(wake_q, p);
        __unqueue_futex(q);
        /*
         * The waiting task can free the futex_q as soon as
         */
        smp_wmb();
        q->lock_ptr = NULL;
 -
 -      wake_up_state(p, TASK_NORMAL);
 -      put_task_struct(p);
  }
  
  static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@@ -1212,7 -1217,6 +1212,7 @@@ futex_wake(u32 __user *uaddr, unsigned 
        struct futex_q *this, *next;
        union futex_key key = FUTEX_KEY_INIT;
        int ret;
 +      WAKE_Q(wake_q);
  
        if (!bitset)
                return -EINVAL;
                        if (!(this->bitset & bitset))
                                continue;
  
 -                      wake_futex(this);
 +                      mark_wake_futex(&wake_q, this);
                        if (++ret >= nr_wake)
                                break;
                }
        }
  
        spin_unlock(&hb->lock);
 +      wake_up_q(&wake_q);
  out_put_key:
        put_futex_key(&key);
  out:
@@@ -1266,7 -1269,6 +1266,7 @@@ futex_wake_op(u32 __user *uaddr1, unsig
        struct futex_hash_bucket *hb1, *hb2;
        struct futex_q *this, *next;
        int ret, op_ret;
 +      WAKE_Q(wake_q);
  
  retry:
        ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
@@@ -1318,7 -1320,7 +1318,7 @@@ retry_private
                                ret = -EINVAL;
                                goto out_unlock;
                        }
 -                      wake_futex(this);
 +                      mark_wake_futex(&wake_q, this);
                        if (++ret >= nr_wake)
                                break;
                }
                                        ret = -EINVAL;
                                        goto out_unlock;
                                }
 -                              wake_futex(this);
 +                              mark_wake_futex(&wake_q, this);
                                if (++op_ret >= nr_wake2)
                                        break;
                        }
  
  out_unlock:
        double_unlock_hb(hb1, hb2);
 +      wake_up_q(&wake_q);
  out_put_keys:
        put_futex_key(&key2);
  out_put_key1:
@@@ -1502,7 -1503,6 +1502,7 @@@ static int futex_requeue(u32 __user *ua
        struct futex_pi_state *pi_state = NULL;
        struct futex_hash_bucket *hb1, *hb2;
        struct futex_q *this, *next;
 +      WAKE_Q(wake_q);
  
        if (requeue_pi) {
                /*
@@@ -1679,7 -1679,7 +1679,7 @@@ retry_private
                 * woken by futex_unlock_pi().
                 */
                if (++task_count <= nr_wake && !requeue_pi) {
 -                      wake_futex(this);
 +                      mark_wake_futex(&wake_q, this);
                        continue;
                }
  
  out_unlock:
        free_pi_state(pi_state);
        double_unlock_hb(hb1, hb2);
 +      wake_up_q(&wake_q);
        hb_waiters_dec(hb2);
  
        /*
@@@ -2056,7 -2055,7 +2056,7 @@@ static void futex_wait_queue_me(struct 
  {
        /*
         * The task state is guaranteed to be set before another task can
 -       * wake it. set_current_state() is implemented using set_mb() and
 +       * wake it. set_current_state() is implemented using smp_store_mb() and
         * queue_me() calls spin_unlock() upon completion, both serializing
         * access to the hash list and forcing another memory barrier.
         */
        queue_me(q, hb);
  
        /* Arm the timer */
-       if (timeout) {
+       if (timeout)
                hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-               if (!hrtimer_active(&timeout->timer))
-                       timeout->task = NULL;
-       }
  
        /*
         * If we have been removed from the hash list, then another task
diff --combined kernel/locking/rtmutex.c
index 30ec5b46cd8c789a1276d9b0abda01fdabd7dfa2,8b678cac7fbe389553272a417a3d82c2ddb39406..36573e96a47761c6cd3fc17463651f3e11028d59
@@@ -70,10 -70,10 +70,10 @@@ static void fixup_rt_mutex_waiters(stru
  }
  
  /*
 - * We can speed up the acquire/release, if the architecture
 - * supports cmpxchg and if there's no debugging state to be set up
 + * We can speed up the acquire/release, if there's no debugging state to be
 + * set up.
   */
 -#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
 +#ifndef CONFIG_DEBUG_RT_MUTEXES
  # define rt_mutex_cmpxchg(l,c,n)      (cmpxchg(&l->owner, c, n) == c)
  static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
  {
@@@ -1182,11 -1182,8 +1182,8 @@@ rt_mutex_slowlock(struct rt_mutex *lock
        set_current_state(state);
  
        /* Setup the timer, when timeout != NULL */
-       if (unlikely(timeout)) {
+       if (unlikely(timeout))
                hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-               if (!hrtimer_active(&timeout->timer))
-                       timeout->task = NULL;
-       }
  
        ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
  
@@@ -1443,17 -1440,10 +1440,17 @@@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock)
   *
   * @lock:     the rt_mutex to be locked
   *
 + * This function can only be called in thread context. It's safe to
 + * call it from atomic regions, but not from hard interrupt or soft
 + * interrupt context.
 + *
   * Returns 1 on success and 0 on contention
   */
  int __sched rt_mutex_trylock(struct rt_mutex *lock)
  {
 +      if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq()))
 +              return 0;
 +
        return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
  }
  EXPORT_SYMBOL_GPL(rt_mutex_trylock);
diff --combined kernel/rcu/tree_plugin.h
index 32664347091a1a6b7e04e2bf6ae8128a3411fc42,d72fa24f23128a640a386525070676a5446de2cb..013485fb2b06b9f499d0673a36bf8f62d5e72607
@@@ -43,17 -43,7 +43,17 @@@ DEFINE_PER_CPU(unsigned int, rcu_cpu_kt
  DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
  DEFINE_PER_CPU(char, rcu_cpu_has_work);
  
 -#endif /* #ifdef CONFIG_RCU_BOOST */
 +#else /* #ifdef CONFIG_RCU_BOOST */
 +
 +/*
 + * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
 + * all uses are in dead code.  Provide a definition to keep the compiler
 + * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
 + * This probably needs to be excluded from -rt builds.
 + */
 +#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
 +
 +#endif /* #else #ifdef CONFIG_RCU_BOOST */
  
  #ifdef CONFIG_RCU_NOCB_CPU
  static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
@@@ -70,11 -60,11 +70,11 @@@ static void __init rcu_bootup_announce_
  {
        if (IS_ENABLED(CONFIG_RCU_TRACE))
                pr_info("\tRCU debugfs-based tracing is enabled.\n");
 -      if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) ||
 -          (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32))
 +      if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
 +          (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
                pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
 -                     CONFIG_RCU_FANOUT);
 -      if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT))
 +                     RCU_FANOUT);
 +      if (rcu_fanout_exact)
                pr_info("\tHierarchical RCU autobalancing is disabled.\n");
        if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
                pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
                pr_info("\tAdditional per-CPU info printed with stalls.\n");
        if (NUM_RCU_LVL_4 != 0)
                pr_info("\tFour-level hierarchy is enabled.\n");
 -      if (CONFIG_RCU_FANOUT_LEAF != 16)
 +      if (RCU_FANOUT_LEAF != 16)
                pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
 -                      CONFIG_RCU_FANOUT_LEAF);
 -      if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
 +                      RCU_FANOUT_LEAF);
 +      if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
                pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
        if (nr_cpu_ids != NR_CPUS)
                pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
  #ifdef CONFIG_PREEMPT_RCU
  
  RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
 -static struct rcu_state *rcu_state_p = &rcu_preempt_state;
 +static struct rcu_state *const rcu_state_p = &rcu_preempt_state;
 +static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data;
  
  static int rcu_preempted_readers_exp(struct rcu_node *rnp);
  static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
@@@ -127,11 -116,11 +127,11 @@@ static void __init rcu_bootup_announce(
   */
  static void rcu_preempt_qs(void)
  {
 -      if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) {
 +      if (!__this_cpu_read(rcu_data_p->passed_quiesce)) {
                trace_rcu_grace_period(TPS("rcu_preempt"),
 -                                     __this_cpu_read(rcu_preempt_data.gpnum),
 +                                     __this_cpu_read(rcu_data_p->gpnum),
                                       TPS("cpuqs"));
 -              __this_cpu_write(rcu_preempt_data.passed_quiesce, 1);
 +              __this_cpu_write(rcu_data_p->passed_quiesce, 1);
                barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
                current->rcu_read_unlock_special.b.need_qs = false;
        }
@@@ -161,7 -150,7 +161,7 @@@ static void rcu_preempt_note_context_sw
            !t->rcu_read_unlock_special.b.blocked) {
  
                /* Possibly blocking in an RCU read-side critical section. */
 -              rdp = this_cpu_ptr(rcu_preempt_state.rda);
 +              rdp = this_cpu_ptr(rcu_state_p->rda);
                rnp = rdp->mynode;
                raw_spin_lock_irqsave(&rnp->lock, flags);
                smp_mb__after_unlock_lock();
                if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
                        list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
                        rnp->gp_tasks = &t->rcu_node_entry;
 -#ifdef CONFIG_RCU_BOOST
 -                      if (rnp->boost_tasks != NULL)
 +                      if (IS_ENABLED(CONFIG_RCU_BOOST) &&
 +                          rnp->boost_tasks != NULL)
                                rnp->boost_tasks = rnp->gp_tasks;
 -#endif /* #ifdef CONFIG_RCU_BOOST */
                } else {
                        list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
                        if (rnp->qsmask & rdp->grpmask)
@@@ -273,7 -263,9 +273,7 @@@ void rcu_read_unlock_special(struct tas
        bool empty_exp_now;
        unsigned long flags;
        struct list_head *np;
 -#ifdef CONFIG_RCU_BOOST
        bool drop_boost_mutex = false;
 -#endif /* #ifdef CONFIG_RCU_BOOST */
        struct rcu_node *rnp;
        union rcu_special special;
  
                t->rcu_read_unlock_special.b.blocked = false;
  
                /*
 -               * Remove this task from the list it blocked on.  The
 -               * task can migrate while we acquire the lock, but at
 -               * most one time.  So at most two passes through loop.
 +               * Remove this task from the list it blocked on.  The task
 +               * now remains queued on the rcu_node corresponding to
 +               * the CPU it first blocked on, so the first attempt to
 +               * acquire the task's rcu_node's ->lock will succeed.
 +               * Keep the loop and add a WARN_ON() out of sheer paranoia.
                 */
                for (;;) {
                        rnp = t->rcu_blocked_node;
                        smp_mb__after_unlock_lock();
                        if (rnp == t->rcu_blocked_node)
                                break;
 +                      WARN_ON_ONCE(1);
                        raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
                }
                empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
                        rnp->gp_tasks = np;
                if (&t->rcu_node_entry == rnp->exp_tasks)
                        rnp->exp_tasks = np;
 -#ifdef CONFIG_RCU_BOOST
 -              if (&t->rcu_node_entry == rnp->boost_tasks)
 -                      rnp->boost_tasks = np;
 -              /* Snapshot ->boost_mtx ownership with rcu_node lock held. */
 -              drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
 -#endif /* #ifdef CONFIG_RCU_BOOST */
 +              if (IS_ENABLED(CONFIG_RCU_BOOST)) {
 +                      if (&t->rcu_node_entry == rnp->boost_tasks)
 +                              rnp->boost_tasks = np;
 +                      /* Snapshot ->boost_mtx ownership w/rnp->lock held. */
 +                      drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
 +              }
  
                /*
                 * If this was the last task on the current list, and if
                                                         rnp->grplo,
                                                         rnp->grphi,
                                                         !!rnp->gp_tasks);
 -                      rcu_report_unblock_qs_rnp(&rcu_preempt_state,
 -                                                rnp, flags);
 +                      rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags);
                } else {
                        raw_spin_unlock_irqrestore(&rnp->lock, flags);
                }
  
 -#ifdef CONFIG_RCU_BOOST
                /* Unboost if we were boosted. */
 -              if (drop_boost_mutex)
 +              if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
                        rt_mutex_unlock(&rnp->boost_mtx);
 -#endif /* #ifdef CONFIG_RCU_BOOST */
  
                /*
                 * If this was the last task on the expedited lists,
                 * then we need to report up the rcu_node hierarchy.
                 */
                if (!empty_exp && empty_exp_now)
 -                      rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
 +                      rcu_report_exp_rnp(rcu_state_p, rnp, true);
        } else {
                local_irq_restore(flags);
        }
@@@ -398,7 -390,7 +398,7 @@@ static void rcu_print_detail_task_stall
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
                return;
        }
 -      t = list_entry(rnp->gp_tasks,
 +      t = list_entry(rnp->gp_tasks->prev,
                       struct task_struct, rcu_node_entry);
        list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
                sched_show_task(t);
@@@ -455,7 -447,7 +455,7 @@@ static int rcu_print_task_stall(struct 
        if (!rcu_preempt_blocked_readers_cgp(rnp))
                return 0;
        rcu_print_task_stall_begin(rnp);
 -      t = list_entry(rnp->gp_tasks,
 +      t = list_entry(rnp->gp_tasks->prev,
                       struct task_struct, rcu_node_entry);
        list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
                pr_cont(" P%d", t->pid);
@@@ -499,8 -491,8 +499,8 @@@ static void rcu_preempt_check_callbacks
                return;
        }
        if (t->rcu_read_lock_nesting > 0 &&
 -          __this_cpu_read(rcu_preempt_data.qs_pending) &&
 -          !__this_cpu_read(rcu_preempt_data.passed_quiesce))
 +          __this_cpu_read(rcu_data_p->qs_pending) &&
 +          !__this_cpu_read(rcu_data_p->passed_quiesce))
                t->rcu_read_unlock_special.b.need_qs = true;
  }
  
  
  static void rcu_preempt_do_callbacks(void)
  {
 -      rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
 +      rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
  }
  
  #endif /* #ifdef CONFIG_RCU_BOOST */
   */
  void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
  {
 -      __call_rcu(head, func, &rcu_preempt_state, -1, 0);
 +      __call_rcu(head, func, rcu_state_p, -1, 0);
  }
  EXPORT_SYMBOL_GPL(call_rcu);
  
@@@ -578,7 -570,7 +578,7 @@@ static int rcu_preempted_readers_exp(st
  static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
  {
        return !rcu_preempted_readers_exp(rnp) &&
 -             ACCESS_ONCE(rnp->expmask) == 0;
 +             READ_ONCE(rnp->expmask) == 0;
  }
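Much of the churn in the hunks below is the mechanical ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() conversion. A minimal before/after sketch of the idiom (shared is a hypothetical variable):

    #include <linux/compiler.h>

    static unsigned long shared;    /* hypothetical shared counter */

    static void snapshot_and_bump(void)
    {
            unsigned long snap;

            /* Old style: ACCESS_ONCE() doubles as rvalue and lvalue. */
            snap = ACCESS_ONCE(shared);
            ACCESS_ONCE(shared) = snap + 1;

            /* New style: separate one-shot read and write helpers. */
            snap = READ_ONCE(shared);
            WRITE_ONCE(shared, snap + 1);
    }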
  
  /*
@@@ -719,12 -711,12 +719,12 @@@ sync_rcu_preempt_exp_init2(struct rcu_s
  void synchronize_rcu_expedited(void)
  {
        struct rcu_node *rnp;
 -      struct rcu_state *rsp = &rcu_preempt_state;
 +      struct rcu_state *rsp = rcu_state_p;
        unsigned long snap;
        int trycount = 0;
  
        smp_mb(); /* Caller's modifications seen first by other CPUs. */
 -      snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
 +      snap = READ_ONCE(sync_rcu_preempt_exp_count) + 1;
        smp_mb(); /* Above access cannot bleed into critical section. */
  
        /*
         */
        while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
                if (ULONG_CMP_LT(snap,
 -                  ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
 +                  READ_ONCE(sync_rcu_preempt_exp_count))) {
                        put_online_cpus();
                        goto mb_ret; /* Others did our work for us. */
                }
                        return;
                }
        }
 -      if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
 +      if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) {
                put_online_cpus();
                goto unlock_mb_ret; /* Others did our work for us. */
        }
  
        /* Clean up and exit. */
        smp_mb(); /* ensure expedited GP seen before counter increment. */
 -      ACCESS_ONCE(sync_rcu_preempt_exp_count) =
 -                                      sync_rcu_preempt_exp_count + 1;
 +      WRITE_ONCE(sync_rcu_preempt_exp_count, sync_rcu_preempt_exp_count + 1);
  unlock_mb_ret:
        mutex_unlock(&sync_rcu_preempt_exp_mutex);
  mb_ret:
@@@ -806,7 -799,7 +806,7 @@@ EXPORT_SYMBOL_GPL(synchronize_rcu_exped
   */
  void rcu_barrier(void)
  {
 -      _rcu_barrier(&rcu_preempt_state);
 +      _rcu_barrier(rcu_state_p);
  }
  EXPORT_SYMBOL_GPL(rcu_barrier);
  
   */
  static void __init __rcu_init_preempt(void)
  {
 -      rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
 +      rcu_init_one(rcu_state_p, rcu_data_p);
  }
  
  /*
@@@ -838,8 -831,7 +838,8 @@@ void exit_rcu(void
  
  #else /* #ifdef CONFIG_PREEMPT_RCU */
  
 -static struct rcu_state *rcu_state_p = &rcu_sched_state;
 +static struct rcu_state *const rcu_state_p = &rcu_sched_state;
 +static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data;
  
  /*
   * Tell them what RCU they are running.
@@@ -1002,8 -994,8 +1002,8 @@@ static int rcu_boost(struct rcu_node *r
        struct task_struct *t;
        struct list_head *tb;
  
 -      if (ACCESS_ONCE(rnp->exp_tasks) == NULL &&
 -          ACCESS_ONCE(rnp->boost_tasks) == NULL)
 +      if (READ_ONCE(rnp->exp_tasks) == NULL &&
 +          READ_ONCE(rnp->boost_tasks) == NULL)
                return 0;  /* Nothing left to boost. */
  
        raw_spin_lock_irqsave(&rnp->lock, flags);
        rt_mutex_lock(&rnp->boost_mtx);
        rt_mutex_unlock(&rnp->boost_mtx);  /* Then keep lockdep happy. */
  
 -      return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
 -             ACCESS_ONCE(rnp->boost_tasks) != NULL;
 +      return READ_ONCE(rnp->exp_tasks) != NULL ||
 +             READ_ONCE(rnp->boost_tasks) != NULL;
  }
  
  /*
@@@ -1181,7 -1173,7 +1181,7 @@@ static int rcu_spawn_one_boost_kthread(
        struct sched_param sp;
        struct task_struct *t;
  
 -      if (&rcu_preempt_state != rsp)
 +      if (rcu_state_p != rsp)
                return 0;
  
        if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
@@@ -1375,12 -1367,13 +1375,12 @@@ static void rcu_prepare_kthreads(int cp
   * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
   * any flavor of RCU.
   */
- int rcu_needs_cpu(unsigned long *delta_jiffies)
 -#ifndef CONFIG_RCU_NOCB_CPU_ALL
+ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
  {
-       *delta_jiffies = ULONG_MAX;
+       *nextevt = KTIME_MAX;
 -      return rcu_cpu_has_callbacks(NULL);
 +      return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
 +             ? 0 : rcu_cpu_has_callbacks(NULL);
  }
 -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
  
  /*
   * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
@@@ -1439,8 -1432,6 +1439,6 @@@ module_param(rcu_idle_gp_delay, int, 06
  static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
  module_param(rcu_idle_lazy_gp_delay, int, 0644);
  
- extern int tick_nohz_active;
  /*
   * Try to advance callbacks for all flavors of RCU on the current CPU, but
   * only if it has been awhile since the last time we did so.  Afterwards,
@@@ -1469,7 -1460,7 +1467,7 @@@ static bool __maybe_unused rcu_try_adva
                 * callbacks not yet ready to invoke.
                 */
                if ((rdp->completed != rnp->completed ||
 -                   unlikely(ACCESS_ONCE(rdp->gpwrap))) &&
 +                   unlikely(READ_ONCE(rdp->gpwrap))) &&
                    rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
                        note_gp_changes(rsp, rdp);
  
   *
   * The caller must have disabled interrupts.
   */
- int rcu_needs_cpu(unsigned long *dj)
 -#ifndef CONFIG_RCU_NOCB_CPU_ALL
+ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
  {
        struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+       unsigned long dj;
  
-               *dj = ULONG_MAX;
 +      if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) {
++              *nextevt = KTIME_MAX;
 +              return 0;
 +      }
 +
        /* Snapshot to detect later posting of non-lazy callback. */
        rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
  
        /* If no callbacks, RCU doesn't need the CPU. */
        if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
-               *dj = ULONG_MAX;
+               *nextevt = KTIME_MAX;
                return 0;
        }
  
  
        /* Request timer delay depending on laziness, and round. */
        if (!rdtp->all_lazy) {
-               *dj = round_up(rcu_idle_gp_delay + jiffies,
+               dj = round_up(rcu_idle_gp_delay + jiffies,
                               rcu_idle_gp_delay) - jiffies;
        } else {
-               *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
+               dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
        }
+       *nextevt = basemono + dj * TICK_NSEC;
        return 0;
  }
 -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
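With the new prototype, rcu_needs_cpu() hands the tick code an absolute expiry in nanoseconds instead of a jiffies delta. As a worked example (assuming HZ=1000, so TICK_NSEC is 1,000,000 ns): a computed dj of 6 jiffies becomes *nextevt = basemono + 6,000,000 ns, while KTIME_MAX still means RCU imposes no limit on the next event.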
  
  /*
   * Prepare a CPU for idle from an RCU perspective.  The first major task
   */
  static void rcu_prepare_for_idle(void)
  {
 -#ifndef CONFIG_RCU_NOCB_CPU_ALL
        bool needwake;
        struct rcu_data *rdp;
        struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
        struct rcu_state *rsp;
        int tne;
  
 +      if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL))
 +              return;
 +
        /* Handle nohz enablement switches conservatively. */
 -      tne = ACCESS_ONCE(tick_nohz_active);
 +      tne = READ_ONCE(tick_nohz_active);
        if (tne != rdtp->tick_nohz_enabled_snap) {
                if (rcu_cpu_has_callbacks(NULL))
                        invoke_rcu_core(); /* force nohz to see update. */
                if (needwake)
                        rcu_gp_kthread_wake(rsp);
        }
 -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
  }
  
  /*
   */
  static void rcu_cleanup_after_idle(void)
  {
 -#ifndef CONFIG_RCU_NOCB_CPU_ALL
 -      if (rcu_is_nocb_cpu(smp_processor_id()))
 +      if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
 +          rcu_is_nocb_cpu(smp_processor_id()))
                return;
        if (rcu_try_advance_all_cbs())
                invoke_rcu_core();
 -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
  }
  
  /*
@@@ -1770,7 -1760,7 +1770,7 @@@ static void print_cpu_stall_info(struc
               atomic_read(&rdtp->dynticks) & 0xfff,
               rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
               rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
 -             ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
 +             READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
               fast_no_hz);
  }
  
@@@ -1908,11 -1898,11 +1908,11 @@@ static void wake_nocb_leader(struct rcu
  {
        struct rcu_data *rdp_leader = rdp->nocb_leader;
  
 -      if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
 +      if (!READ_ONCE(rdp_leader->nocb_kthread))
                return;
 -      if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
 +      if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
                /* Prior smp_mb__after_atomic() orders against prior enqueue. */
 -              ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
 +              WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
                wake_up(&rdp_leader->nocb_wq);
        }
  }
@@@ -1944,14 -1934,14 +1944,14 @@@ static bool rcu_nocb_cpu_needs_barrier(
        ret = atomic_long_read(&rdp->nocb_q_count);
  
  #ifdef CONFIG_PROVE_RCU
 -      rhp = ACCESS_ONCE(rdp->nocb_head);
 +      rhp = READ_ONCE(rdp->nocb_head);
        if (!rhp)
 -              rhp = ACCESS_ONCE(rdp->nocb_gp_head);
 +              rhp = READ_ONCE(rdp->nocb_gp_head);
        if (!rhp)
 -              rhp = ACCESS_ONCE(rdp->nocb_follower_head);
 +              rhp = READ_ONCE(rdp->nocb_follower_head);
  
        /* Having no rcuo kthread but CBs after scheduler starts is bad! */
 -      if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp &&
 +      if (!READ_ONCE(rdp->nocb_kthread) && rhp &&
            rcu_scheduler_fully_active) {
                /* RCU callback enqueued before CPU first came online??? */
                pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
@@@ -1985,12 -1975,12 +1985,12 @@@ static void __call_rcu_nocb_enqueue(str
        atomic_long_add(rhcount, &rdp->nocb_q_count);
        /* rcu_barrier() relies on ->nocb_q_count add before xchg. */
        old_rhpp = xchg(&rdp->nocb_tail, rhtp);
 -      ACCESS_ONCE(*old_rhpp) = rhp;
 +      WRITE_ONCE(*old_rhpp, rhp);
        atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
        smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
  
        /* If we are not being polled and there is a kthread, awaken it ... */
 -      t = ACCESS_ONCE(rdp->nocb_kthread);
 +      t = READ_ONCE(rdp->nocb_kthread);
        if (rcu_nocb_poll || !t) {
                trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
                                    TPS("WakeNotPoll"));
@@@ -2128,7 -2118,7 +2128,7 @@@ static void rcu_nocb_wait_gp(struct rcu
        for (;;) {
                wait_event_interruptible(
                        rnp->nocb_gp_wq[c & 0x1],
 -                      (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
 +                      (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
                if (likely(d))
                        break;
                WARN_ON(signal_pending(current));
@@@ -2155,7 -2145,7 +2155,7 @@@ wait_again
        if (!rcu_nocb_poll) {
                trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
                wait_event_interruptible(my_rdp->nocb_wq,
 -                              !ACCESS_ONCE(my_rdp->nocb_leader_sleep));
 +                              !READ_ONCE(my_rdp->nocb_leader_sleep));
                /* Memory barrier handled by smp_mb() calls below and repoll. */
        } else if (firsttime) {
                firsttime = false; /* Don't drown trace log with "Poll"! */
         */
        gotcbs = false;
        for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
 -              rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head);
 +              rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
                if (!rdp->nocb_gp_head)
                        continue;  /* No CBs here, try next follower. */
  
                /* Move callbacks to wait-for-GP list, which is empty. */
 -              ACCESS_ONCE(rdp->nocb_head) = NULL;
 +              WRITE_ONCE(rdp->nocb_head, NULL);
                rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
                gotcbs = true;
        }
                my_rdp->nocb_leader_sleep = true;
                smp_mb();  /* Ensure _sleep true before scan. */
                for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
 -                      if (ACCESS_ONCE(rdp->nocb_head)) {
 +                      if (READ_ONCE(rdp->nocb_head)) {
                                /* Found CB, so short-circuit next wait. */
                                my_rdp->nocb_leader_sleep = false;
                                break;
  
        /* Each pass through the following loop wakes a follower, if needed. */
        for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
 -              if (ACCESS_ONCE(rdp->nocb_head))
 +              if (READ_ONCE(rdp->nocb_head))
                        my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
                if (!rdp->nocb_gp_head)
                        continue; /* No CBs, so no need to wake follower. */
@@@ -2251,7 -2241,7 +2251,7 @@@ static void nocb_follower_wait(struct r
                        trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
                                            "FollowerSleep");
                        wait_event_interruptible(rdp->nocb_wq,
 -                                               ACCESS_ONCE(rdp->nocb_follower_head));
 +                                               READ_ONCE(rdp->nocb_follower_head));
                } else if (firsttime) {
                        /* Don't drown trace log with "Poll"! */
                        firsttime = false;
@@@ -2292,10 -2282,10 +2292,10 @@@ static int rcu_nocb_kthread(void *arg
                        nocb_follower_wait(rdp);
  
                /* Pull the ready-to-invoke callbacks onto local list. */
 -              list = ACCESS_ONCE(rdp->nocb_follower_head);
 +              list = READ_ONCE(rdp->nocb_follower_head);
                BUG_ON(!list);
                trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
 -              ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
 +              WRITE_ONCE(rdp->nocb_follower_head, NULL);
                tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
  
                /* Each pass through the following loop invokes a callback. */
  /* Is a deferred wakeup of rcu_nocb_kthread() required? */
  static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
  {
 -      return ACCESS_ONCE(rdp->nocb_defer_wakeup);
 +      return READ_ONCE(rdp->nocb_defer_wakeup);
  }
  
  /* Do a deferred wakeup of rcu_nocb_kthread(). */
@@@ -2344,8 -2334,8 +2344,8 @@@ static void do_nocb_deferred_wakeup(str
  
        if (!rcu_nocb_need_deferred_wakeup(rdp))
                return;
 -      ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup);
 -      ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT;
 +      ndw = READ_ONCE(rdp->nocb_defer_wakeup);
 +      WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT);
        wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
        trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
  }
@@@ -2458,7 -2448,7 +2458,7 @@@ static void rcu_spawn_one_nocb_kthread(
        t = kthread_run(rcu_nocb_kthread, rdp_spawn,
                        "rcuo%c/%d", rsp->abbr, cpu);
        BUG_ON(IS_ERR(t));
 -      ACCESS_ONCE(rdp_spawn->nocb_kthread) = t;
 +      WRITE_ONCE(rdp_spawn->nocb_kthread, t);
  }
  
  /*
@@@ -2673,7 -2663,7 +2673,7 @@@ static void rcu_sysidle_enter(int irq
  
        /* Record start of fully idle period. */
        j = jiffies;
 -      ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
 +      WRITE_ONCE(rdtp->dynticks_idle_jiffies, j);
        smp_mb__before_atomic();
        atomic_inc(&rdtp->dynticks_idle);
        smp_mb__after_atomic();
   */
  void rcu_sysidle_force_exit(void)
  {
 -      int oldstate = ACCESS_ONCE(full_sysidle_state);
 +      int oldstate = READ_ONCE(full_sysidle_state);
        int newoldstate;
  
        /*
@@@ -2804,7 -2794,7 +2804,7 @@@ static void rcu_sysidle_check_cpu(struc
        smp_mb(); /* Read counters before timestamps. */
  
        /* Pick up timestamps. */
 -      j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
 +      j = READ_ONCE(rdtp->dynticks_idle_jiffies);
        /* If this CPU entered idle more recently, update maxj timestamp. */
        if (ULONG_CMP_LT(*maxj, j))
                *maxj = j;
@@@ -2841,11 -2831,11 +2841,11 @@@ static unsigned long rcu_sysidle_delay(
  static void rcu_sysidle(unsigned long j)
  {
        /* Check the current state. */
 -      switch (ACCESS_ONCE(full_sysidle_state)) {
 +      switch (READ_ONCE(full_sysidle_state)) {
        case RCU_SYSIDLE_NOT:
  
                /* First time all are idle, so note a short idle period. */
 -              ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
 +              WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT);
                break;
  
        case RCU_SYSIDLE_SHORT:
@@@ -2883,7 -2873,7 +2883,7 @@@ static void rcu_sysidle_cancel(void
  {
        smp_mb();
        if (full_sysidle_state > RCU_SYSIDLE_SHORT)
 -              ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
 +              WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT);
  }
  
  /*
@@@ -2935,7 -2925,7 +2935,7 @@@ static void rcu_sysidle_cb(struct rcu_h
        smp_mb();  /* grace period precedes setting inuse. */
  
        rshp = container_of(rhp, struct rcu_sysidle_head, rh);
 -      ACCESS_ONCE(rshp->inuse) = 0;
 +      WRITE_ONCE(rshp->inuse, 0);
  }
  
  /*
  bool rcu_sys_is_idle(void)
  {
        static struct rcu_sysidle_head rsh;
 -      int rss = ACCESS_ONCE(full_sysidle_state);
 +      int rss = READ_ONCE(full_sysidle_state);
  
        if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
                return false;
                        }
                        rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
                        oldrss = rss;
 -                      rss = ACCESS_ONCE(full_sysidle_state);
 +                      rss = READ_ONCE(full_sysidle_state);
                }
        }
  
@@@ -3058,10 -3048,10 +3058,10 @@@ static bool rcu_nohz_full_cpu(struct rc
  #ifdef CONFIG_NO_HZ_FULL
        if (tick_nohz_full_cpu(smp_processor_id()) &&
            (!rcu_gp_in_progress(rsp) ||
 -           ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ)))
 -              return 1;
 +           ULONG_CMP_LT(jiffies, READ_ONCE(rsp->gp_start) + HZ)))
 +              return true;
  #endif /* #ifdef CONFIG_NO_HZ_FULL */
 -      return 0;
 +      return false;
  }
  
  /*
@@@ -3087,7 -3077,7 +3087,7 @@@ static void rcu_bind_gp_kthread(void
  static void rcu_dynticks_task_enter(void)
  {
  #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
 -      ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id();
 +      WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
  #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
  }
  
  static void rcu_dynticks_task_exit(void)
  {
  #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
 -      ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1;
 +      WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
  #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
  }
diff --combined kernel/sched/core.c
index f89ca9bcf42a5f582e2c276dc2ef21338b8bd402,e9f25ce70c77396d312335552428e43535ce1df4..c9a707b593317d7936182d3db18b8ece8a3ac159
  #define CREATE_TRACE_POINTS
  #include <trace/events/sched.h>
  
- void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
- {
-       unsigned long delta;
-       ktime_t soft, hard, now;
-       for (;;) {
-               if (hrtimer_active(period_timer))
-                       break;
-               now = hrtimer_cb_get_time(period_timer);
-               hrtimer_forward(period_timer, now, period);
-               soft = hrtimer_get_softexpires(period_timer);
-               hard = hrtimer_get_expires(period_timer);
-               delta = ktime_to_ns(ktime_sub(hard, soft));
-               __hrtimer_start_range_ns(period_timer, soft, delta,
-                                        HRTIMER_MODE_ABS_PINNED, 0);
-       }
- }
  DEFINE_MUTEX(sched_domains_mutex);
  DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
  
@@@ -355,12 -335,11 +335,11 @@@ static enum hrtimer_restart hrtick(stru
  
  #ifdef CONFIG_SMP
  
- static int __hrtick_restart(struct rq *rq)
+ static void __hrtick_restart(struct rq *rq)
  {
        struct hrtimer *timer = &rq->hrtick_timer;
-       ktime_t time = hrtimer_get_softexpires(timer);
  
-       return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
+       hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
  }
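hrtimer_start_expires() re-arms the timer at the expiry value already stored on it, presumably recorded via hrtimer_set_expires() in hrtick_start() (not shown in this hunk), so the old soft/hard-expiry juggling collapses into one call. A minimal sketch of the assumed pairing:

    #include <linux/hrtimer.h>

    static struct hrtimer my_timer;     /* hypothetical timer */

    static void my_arm_at(ktime_t expires)
    {
            /* Record the absolute expiry on the timer itself ... */
            hrtimer_set_expires(&my_timer, expires);
            /* ... then (re)start it at exactly that stored value. */
            hrtimer_start_expires(&my_timer, HRTIMER_MODE_ABS_PINNED);
    }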
  
  /*
@@@ -440,8 -419,8 +419,8 @@@ void hrtick_start(struct rq *rq, u64 de
         * doesn't make sense. Rely on vruntime for fairness.
         */
        delay = max_t(u64, delay, 10000LL);
-       __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
-                       HRTIMER_MODE_REL_PINNED, 0);
+       hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
+                     HRTIMER_MODE_REL_PINNED);
  }
  
  static inline void init_hrtick(void)
@@@ -511,7 -490,7 +490,7 @@@ static bool set_nr_and_not_polling(stru
  static bool set_nr_if_polling(struct task_struct *p)
  {
        struct thread_info *ti = task_thread_info(p);
 -      typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
 +      typeof(ti->flags) old, val = READ_ONCE(ti->flags);
  
        for (;;) {
                if (!(val & _TIF_POLLING_NRFLAG))
@@@ -541,52 -520,6 +520,52 @@@ static bool set_nr_if_polling(struct ta
  #endif
  #endif
  
 +void wake_q_add(struct wake_q_head *head, struct task_struct *task)
 +{
 +      struct wake_q_node *node = &task->wake_q;
 +
 +      /*
 +       * Atomically grab the task; if ->wake_q is !nil already it means
 +       * it's already queued (either by us or someone else) and will get the
 +       * wakeup due to that.
 +       *
 +       * This cmpxchg() implies a full barrier, which pairs with the write
 +       * barrier implied by the wakeup in wake_up_q().
 +       */
 +      if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
 +              return;
 +
 +      get_task_struct(task);
 +
 +      /*
 +       * The head is context-local, so there can be no concurrency.
 +       */
 +      *head->lastp = node;
 +      head->lastp = &node->next;
 +}
 +
 +void wake_up_q(struct wake_q_head *head)
 +{
 +      struct wake_q_node *node = head->first;
 +
 +      while (node != WAKE_Q_TAIL) {
 +              struct task_struct *task;
 +
 +              task = container_of(node, struct task_struct, wake_q);
 +              BUG_ON(!task);
 +              /* task can safely be re-inserted now */
 +              node = node->next;
 +              task->wake_q.next = NULL;
 +
 +              /*
 +               * wake_up_process() implies a wmb() to pair with the queueing
 +               * in wake_q_add() so as not to miss wakeups.
 +               */
 +              wake_up_process(task);
 +              put_task_struct(task);
 +      }
 +}
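The wake_q helpers above let a path collect wakeups while holding a lock and issue them only after the lock is dropped. A minimal usage sketch, assuming the WAKE_Q() on-stack initializer from the matching sched.h change; my_lock and pick_waiter() are hypothetical:

    static DEFINE_RAW_SPINLOCK(my_lock);           /* hypothetical lock */
    static struct task_struct *pick_waiter(void);  /* hypothetical selection */

    static void wake_one_waiter(void)
    {
            struct task_struct *p;
            WAKE_Q(wake_q);                  /* context-local wake queue */

            raw_spin_lock(&my_lock);
            p = pick_waiter();               /* choose whom to wake, under the lock */
            if (p)
                    wake_q_add(&wake_q, p);  /* grabs a task reference, no wakeup yet */
            raw_spin_unlock(&my_lock);

            wake_up_q(&wake_q);              /* the real wake_up_process() calls */
    }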
 +
  /*
   * resched_curr - mark rq's current task 'to be rescheduled now'.
   *
@@@ -639,13 -572,12 +618,12 @@@ void resched_cpu(int cpu
   * selecting an idle cpu will add more delays to the timers than intended
   * (as that cpu's timer base may not be uptodate wrt jiffies etc).
   */
- int get_nohz_timer_target(int pinned)
+ int get_nohz_timer_target(void)
  {
-       int cpu = smp_processor_id();
-       int i;
+       int i, cpu = smp_processor_id();
        struct sched_domain *sd;
  
-       if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+       if (!idle_cpu(cpu))
                return cpu;
  
        rcu_read_lock();
@@@ -1095,7 -1027,7 +1073,7 @@@ void set_task_cpu(struct task_struct *p
                if (p->sched_class->migrate_task_rq)
                        p->sched_class->migrate_task_rq(p, new_cpu);
                p->se.nr_migrations++;
 -              perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
 +              perf_event_task_migrate(p);
        }
  
        __set_task_cpu(p, new_cpu);
@@@ -2151,15 -2083,12 +2129,15 @@@ void wake_up_new_task(struct task_struc
  
  #ifdef CONFIG_PREEMPT_NOTIFIERS
  
 +static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
 +
  /**
   * preempt_notifier_register - tell me when current is being preempted & rescheduled
   * @notifier: notifier struct to register
   */
  void preempt_notifier_register(struct preempt_notifier *notifier)
  {
 +      static_key_slow_inc(&preempt_notifier_key);
        hlist_add_head(&notifier->link, &current->preempt_notifiers);
  }
  EXPORT_SYMBOL_GPL(preempt_notifier_register);
   * preempt_notifier_unregister - no longer interested in preemption notifications
   * @notifier: notifier struct to unregister
   *
 - * This is safe to call from within a preemption notifier.
 + * This is *not* safe to call from within a preemption notifier.
   */
  void preempt_notifier_unregister(struct preempt_notifier *notifier)
  {
        hlist_del(&notifier->link);
 +      static_key_slow_dec(&preempt_notifier_key);
  }
  EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
  
 -static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 +static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
  {
        struct preempt_notifier *notifier;
  
                notifier->ops->sched_in(notifier, raw_smp_processor_id());
  }
  
 +static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 +{
 +      if (static_key_false(&preempt_notifier_key))
 +              __fire_sched_in_preempt_notifiers(curr);
 +}
 +
  static void
 -fire_sched_out_preempt_notifiers(struct task_struct *curr,
 -                               struct task_struct *next)
 +__fire_sched_out_preempt_notifiers(struct task_struct *curr,
 +                                 struct task_struct *next)
  {
        struct preempt_notifier *notifier;
  
                notifier->ops->sched_out(notifier, next);
  }
  
 +static __always_inline void
 +fire_sched_out_preempt_notifiers(struct task_struct *curr,
 +                               struct task_struct *next)
 +{
 +      if (static_key_false(&preempt_notifier_key))
 +              __fire_sched_out_preempt_notifiers(curr, next);
 +}
 +
  #else /* !CONFIG_PREEMPT_NOTIFIERS */
  
 -static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 +static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
  {
  }
  
 -static void
 +static inline void
  fire_sched_out_preempt_notifiers(struct task_struct *curr,
                                 struct task_struct *next)
  {
@@@ -2461,9 -2375,9 +2439,9 @@@ unsigned long nr_iowait_cpu(int cpu
  
  void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
  {
 -      struct rq *this = this_rq();
 -      *nr_waiters = atomic_read(&this->nr_iowait);
 -      *load = this->cpu_load[0];
 +      struct rq *rq = this_rq();
 +      *nr_waiters = atomic_read(&rq->nr_iowait);
 +      *load = rq->load.weight;
  }
  
  #ifdef CONFIG_SMP
@@@ -2561,7 -2475,6 +2539,7 @@@ void scheduler_tick(void
        update_rq_clock(rq);
        curr->sched_class->task_tick(rq, curr, 0);
        update_cpu_load_active(rq);
 +      calc_global_load_tick(rq);
        raw_spin_unlock(&rq->lock);
  
        perf_event_task_tick();
  u64 scheduler_tick_max_deferment(void)
  {
        struct rq *rq = this_rq();
 -      unsigned long next, now = ACCESS_ONCE(jiffies);
 +      unsigned long next, now = READ_ONCE(jiffies);
  
        next = rq->last_sched_tick + HZ;
  
@@@ -2791,7 -2704,9 +2769,7 @@@ again
   *          - return from syscall or exception to user-space
   *          - return from interrupt-handler to user-space
   *
 - * WARNING: all callers must re-check need_resched() afterward and reschedule
 - * accordingly in case an event triggered the need for rescheduling (such as
 - * an interrupt waking up a task) while preemption was disabled in __schedule().
 + * WARNING: must be called with preemption disabled!
   */
  static void __sched __schedule(void)
  {
        struct rq *rq;
        int cpu;
  
 -      preempt_disable();
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        rcu_note_context_switch();
                raw_spin_unlock_irq(&rq->lock);
  
        post_schedule(rq);
 -
 -      sched_preempt_enable_no_resched();
  }
  
  static inline void sched_submit_work(struct task_struct *tsk)
@@@ -2883,9 -2801,7 +2861,9 @@@ asmlinkage __visible void __sched sched
  
        sched_submit_work(tsk);
        do {
 +              preempt_disable();
                __schedule();
 +              sched_preempt_enable_no_resched();
        } while (need_resched());
  }
  EXPORT_SYMBOL(schedule);
@@@ -2924,14 -2840,15 +2902,14 @@@ void __sched schedule_preempt_disabled(
  static void __sched notrace preempt_schedule_common(void)
  {
        do {
 -              __preempt_count_add(PREEMPT_ACTIVE);
 +              preempt_active_enter();
                __schedule();
 -              __preempt_count_sub(PREEMPT_ACTIVE);
 +              preempt_active_exit();
  
                /*
                 * Check again in case we missed a preemption opportunity
                 * between schedule and now.
                 */
 -              barrier();
        } while (need_resched());
  }
  
@@@ -2955,8 -2872,9 +2933,8 @@@ asmlinkage __visible void __sched notra
  NOKPROBE_SYMBOL(preempt_schedule);
  EXPORT_SYMBOL(preempt_schedule);
  
 -#ifdef CONFIG_CONTEXT_TRACKING
  /**
 - * preempt_schedule_context - preempt_schedule called by tracing
 + * preempt_schedule_notrace - preempt_schedule called by tracing
   *
   * The tracing infrastructure uses preempt_enable_notrace to prevent
   * recursion and tracing preempt enabling caused by the tracing
   * instead of preempt_schedule() to exit user context if needed before
   * calling the scheduler.
   */
 -asmlinkage __visible void __sched notrace preempt_schedule_context(void)
 +asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
  {
        enum ctx_state prev_ctx;
  
                return;
  
        do {
 -              __preempt_count_add(PREEMPT_ACTIVE);
 +              /*
 +               * Use raw __preempt_count() ops that don't call functions.
 +               * We can't call functions before disabling preemption, which
 +               * disarms preemption tracing recursion.
 +               */
 +              __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
 +              barrier();
                /*
                 * Needs preempt disabled in case user_exit() is traced
                 * and the tracer calls preempt_enable_notrace() causing
                __schedule();
                exception_exit(prev_ctx);
  
 -              __preempt_count_sub(PREEMPT_ACTIVE);
                barrier();
 +              __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
        } while (need_resched());
  }
 -EXPORT_SYMBOL_GPL(preempt_schedule_context);
 -#endif /* CONFIG_CONTEXT_TRACKING */
 +EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
  
  #endif /* CONFIG_PREEMPT */
  
@@@ -3017,11 -2930,17 +2995,11 @@@ asmlinkage __visible void __sched preem
        prev_state = exception_enter();
  
        do {
 -              __preempt_count_add(PREEMPT_ACTIVE);
 +              preempt_active_enter();
                local_irq_enable();
                __schedule();
                local_irq_disable();
 -              __preempt_count_sub(PREEMPT_ACTIVE);
 -
 -              /*
 -               * Check again in case we missed a preemption opportunity
 -               * between schedule and now.
 -               */
 -              barrier();
 +              preempt_active_exit();
        } while (need_resched());
  
        exception_exit(prev_state);
@@@ -3099,6 -3018,7 +3077,6 @@@ void rt_mutex_setprio(struct task_struc
                if (!dl_prio(p->normal_prio) ||
                    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                        p->dl.dl_boosted = 1;
 -                      p->dl.dl_throttled = 0;
                        enqueue_flag = ENQUEUE_REPLENISH;
                } else
                        p->dl.dl_boosted = 0;
@@@ -4447,7 -4367,10 +4425,7 @@@ long __sched io_schedule_timeout(long t
        long ret;
  
        current->in_iowait = 1;
 -      if (old_iowait)
 -              blk_schedule_flush_plug(current);
 -      else
 -              blk_flush_plug(current);
 +      blk_schedule_flush_plug(current);
  
        delayacct_blkio_start();
        rq = raw_rq();
@@@ -5372,7 -5295,7 +5350,7 @@@ static struct notifier_block migration_
        .priority = CPU_PRI_MIGRATION,
  };
  
 -static void __cpuinit set_cpu_rq_start_time(void)
 +static void set_cpu_rq_start_time(void)
  {
        int cpu = smp_processor_id();
        struct rq *rq = cpu_rq(cpu);
@@@ -7126,8 -7049,6 +7104,6 @@@ void __init sched_init_smp(void
  }
  #endif /* CONFIG_SMP */
  
- const_debug unsigned int sysctl_timer_migration = 1;
  int in_sched_functions(unsigned long addr)
  {
        return in_lock_functions(addr) ||
@@@ -7792,11 -7713,11 +7768,11 @@@ static long sched_group_rt_runtime(stru
        return rt_runtime_us;
  }
  
 -static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
 +static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
  {
        u64 rt_runtime, rt_period;
  
 -      rt_period = (u64)rt_period_us * NSEC_PER_USEC;
 +      rt_period = rt_period_us * NSEC_PER_USEC;
        rt_runtime = tg->rt_bandwidth.rt_runtime;
  
        return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
@@@ -8163,10 -8084,8 +8139,8 @@@ static int tg_set_cfs_bandwidth(struct 
  
        __refill_cfs_bandwidth_runtime(cfs_b);
        /* restart the period timer (if active) to handle new period expiry */
-       if (runtime_enabled && cfs_b->timer_active) {
-               /* force a reprogram */
-               __start_cfs_bandwidth(cfs_b, true);
-       }
+       if (runtime_enabled)
+               start_cfs_bandwidth(cfs_b);
        raw_spin_unlock_irq(&cfs_b->lock);
  
        for_each_online_cpu(i) {
diff --combined kernel/sched/deadline.c
index 392e8fb94db36ef32aad026510d3ebfe3d89f6ef,21d6907d2b9fd07c47d9e9c9125d6d5b47f1b499..eac20c557a55cc83f8e9d7e62578868ba9436aff
@@@ -503,8 -503,6 +503,6 @@@ static int start_dl_timer(struct sched_
        struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
        struct rq *rq = rq_of_dl_rq(dl_rq);
        ktime_t now, act;
-       ktime_t soft, hard;
-       unsigned long range;
        s64 delta;
  
        if (boosted)
        if (ktime_us_delta(act, now) < 0)
                return 0;
  
-       hrtimer_set_expires(&dl_se->dl_timer, act);
+       hrtimer_start(&dl_se->dl_timer, act, HRTIMER_MODE_ABS);
  
-       soft = hrtimer_get_softexpires(&dl_se->dl_timer);
-       hard = hrtimer_get_expires(&dl_se->dl_timer);
-       range = ktime_to_ns(ktime_sub(hard, soft));
-       __hrtimer_start_range_ns(&dl_se->dl_timer, soft,
-                                range, HRTIMER_MODE_ABS, 0);
-       return hrtimer_active(&dl_se->dl_timer);
+       return 1;
  }
  
  /*
@@@ -640,7 -632,7 +632,7 @@@ void init_dl_task_timer(struct sched_dl
  }
  
  static
 -int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
 +int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
  {
        return (dl_se->runtime <= 0);
  }
@@@ -684,7 -676,7 +676,7 @@@ static void update_curr_dl(struct rq *r
        sched_rt_avg_update(rq, delta_exec);
  
        dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
 -      if (dl_runtime_exceeded(rq, dl_se)) {
 +      if (dl_runtime_exceeded(dl_se)) {
                dl_se->dl_throttled = 1;
                __dequeue_task_dl(rq, curr, 0);
                if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
@@@ -995,7 -987,7 +987,7 @@@ select_task_rq_dl(struct task_struct *p
        rq = cpu_rq(cpu);
  
        rcu_read_lock();
 -      curr = ACCESS_ONCE(rq->curr); /* unlocked access */
 +      curr = READ_ONCE(rq->curr); /* unlocked access */
  
        /*
         * If we are dealing with a -deadline task, we must
            (p->nr_cpus_allowed > 1)) {
                int target = find_later_rq(p);
  
 -              if (target != -1)
 +              if (target != -1 &&
 +                              dl_time_before(p->dl.deadline,
 +                                      cpu_rq(target)->dl.earliest_dl.curr))
                        cpu = target;
        }
        rcu_read_unlock();
@@@ -1232,32 -1222,6 +1224,32 @@@ next_node
        return NULL;
  }
  
 +/*
 + * Return the earliest pushable task on this rq, provided it is suitable to
 + * be executed on the given CPU; NULL otherwise:
 + */
 +static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
 +{
 +      struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost;
 +      struct task_struct *p = NULL;
 +
 +      if (!has_pushable_dl_tasks(rq))
 +              return NULL;
 +
 +next_node:
 +      if (next_node) {
 +              p = rb_entry(next_node, struct task_struct, pushable_dl_tasks);
 +
 +              if (pick_dl_task(rq, p, cpu))
 +                      return p;
 +
 +              next_node = rb_next(next_node);
 +              goto next_node;
 +      }
 +
 +      return NULL;
 +}
 +
  static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
  
  static int find_later_rq(struct task_struct *task)
@@@ -1361,17 -1325,6 +1353,17 @@@ static struct rq *find_lock_later_rq(st
  
                later_rq = cpu_rq(cpu);
  
 +              if (!dl_time_before(task->dl.deadline,
 +                                      later_rq->dl.earliest_dl.curr)) {
 +                      /*
 +                       * Target rq has tasks of equal or earlier deadline,
 +                       * retrying does not release any lock and is unlikely
 +                       * to yield a different result.
 +                       */
 +                      later_rq = NULL;
 +                      break;
 +              }
 +
                /* Retry if something changed. */
                if (double_lock_balance(rq, later_rq)) {
                        if (unlikely(task_rq(task) != rq ||
@@@ -1553,7 -1506,7 +1545,7 @@@ static int pull_dl_task(struct rq *this
                if (src_rq->dl.dl_nr_running <= 1)
                        goto skip;
  
 -              p = pick_next_earliest_dl_task(src_rq, this_cpu);
 +              p = pick_earliest_pushable_dl_task(src_rq, this_cpu);
  
                /*
                 * We found a task to be pulled if:
@@@ -1698,7 -1651,7 +1690,7 @@@ static void rq_offline_dl(struct rq *rq
        cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
  }
  
 -void init_sched_dl_class(void)
 +void __init init_sched_dl_class(void)
  {
        unsigned int i;
  
diff --combined kernel/sched/debug.c
index 704683cc90422d096d8a591326db9a880574ed23,f94724eda407ecc656217c4b6ce0ebeb656f8e8b..315c68e015d955d6227a83b6b951482cffd8a68e
@@@ -132,14 -132,12 +132,14 @@@ print_task(struct seq_file *m, struct r
                p->prio);
  #ifdef CONFIG_SCHEDSTATS
        SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
 -              SPLIT_NS(p->se.vruntime),
 +              SPLIT_NS(p->se.statistics.wait_sum),
                SPLIT_NS(p->se.sum_exec_runtime),
                SPLIT_NS(p->se.statistics.sum_sleep_runtime));
  #else
 -      SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
 -              0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
 +      SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
 +              0LL, 0L,
 +              SPLIT_NS(p->se.sum_exec_runtime),
 +              0LL, 0L);
  #endif
  #ifdef CONFIG_NUMA_BALANCING
        SEQ_printf(m, " %d", task_node(p));
@@@ -158,7 -156,7 +158,7 @@@ static void print_rq(struct seq_file *m
        SEQ_printf(m,
        "\nrunnable tasks:\n"
        "            task   PID         tree-key  switches  prio"
 -      "     exec-runtime         sum-exec        sum-sleep\n"
 +      "     wait-time             sum-exec        sum-sleep\n"
        "------------------------------------------------------"
        "----------------------------------------------------\n");
  
@@@ -232,8 -230,6 +232,6 @@@ void print_cfs_rq(struct seq_file *m, i
  #endif
  #endif
  #ifdef CONFIG_CFS_BANDWIDTH
-       SEQ_printf(m, "  .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
-                       cfs_rq->tg->cfs_bandwidth.timer_active);
        SEQ_printf(m, "  .%-30s: %d\n", "throttled",
                        cfs_rq->throttled);
        SEQ_printf(m, "  .%-30s: %d\n", "throttle_count",
@@@ -584,7 -580,6 +582,7 @@@ void proc_sched_show_task(struct task_s
        nr_switches = p->nvcsw + p->nivcsw;
  
  #ifdef CONFIG_SCHEDSTATS
 +      PN(se.statistics.sum_sleep_runtime);
        PN(se.statistics.wait_start);
        PN(se.statistics.sleep_start);
        PN(se.statistics.block_start);
diff --combined kernel/sched/fair.c
index 433061d984eac6ce5322714ab29bd3beb148eaf4,69be2825262d5df3e4d859e3faabefe3064863e2..40a7fcbf491eb7d1f5735d0e9efd25c23d4d60a9
@@@ -141,9 -141,9 +141,9 @@@ static inline void update_load_set(stru
   *
   * This idea comes from the SD scheduler of Con Kolivas:
   */
 -static int get_update_sysctl_factor(void)
 +static unsigned int get_update_sysctl_factor(void)
  {
 -      unsigned int cpus = min_t(int, num_online_cpus(), 8);
 +      unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
        unsigned int factor;
  
        switch (sysctl_sched_tunable_scaling) {
@@@ -576,7 -576,7 +576,7 @@@ int sched_proc_update_handler(struct ct
                loff_t *ppos)
  {
        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 -      int factor = get_update_sysctl_factor();
 +      unsigned int factor = get_update_sysctl_factor();
  
        if (ret || !write)
                return ret;
@@@ -834,7 -834,7 +834,7 @@@ static unsigned int task_nr_scan_window
  
  static unsigned int task_scan_min(struct task_struct *p)
  {
 -      unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
 +      unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
        unsigned int scan, floor;
        unsigned int windows = 1;
  
@@@ -1198,9 -1198,11 +1198,9 @@@ static void task_numa_assign(struct tas
  static bool load_too_imbalanced(long src_load, long dst_load,
                                struct task_numa_env *env)
  {
 +      long imb, old_imb;
 +      long orig_src_load, orig_dst_load;
        long src_capacity, dst_capacity;
 -      long orig_src_load;
 -      long load_a, load_b;
 -      long moved_load;
 -      long imb;
  
        /*
         * The load is corrected for the CPU capacity available on each node.
        dst_capacity = env->dst_stats.compute_capacity;
  
        /* We care about the slope of the imbalance, not the direction. */
 -      load_a = dst_load;
 -      load_b = src_load;
 -      if (load_a < load_b)
 -              swap(load_a, load_b);
 +      if (dst_load < src_load)
 +              swap(dst_load, src_load);
  
        /* Is the difference below the threshold? */
 -      imb = load_a * src_capacity * 100 -
 -              load_b * dst_capacity * env->imbalance_pct;
 +      imb = dst_load * src_capacity * 100 -
 +            src_load * dst_capacity * env->imbalance_pct;
        if (imb <= 0)
                return false;
  
        /*
         * The imbalance is above the allowed threshold.
 -       * Allow a move that brings us closer to a balanced situation,
 -       * without moving things past the point of balance.
 +       * Compare it with the old imbalance.
         */
        orig_src_load = env->src_stats.load;
 +      orig_dst_load = env->dst_stats.load;
  
 -      /*
 -       * In a task swap, there will be one load moving from src to dst,
 -       * and another moving back. This is the net sum of both moves.
 -       * A simple task move will always have a positive value.
 -       * Allow the move if it brings the system closer to a balanced
 -       * situation, without crossing over the balance point.
 -       */
 -      moved_load = orig_src_load - src_load;
 +      if (orig_dst_load < orig_src_load)
 +              swap(orig_dst_load, orig_src_load);
  
 -      if (moved_load > 0)
 -              /* Moving src -> dst. Did we overshoot balance? */
 -              return src_load * dst_capacity < dst_load * src_capacity;
 -      else
 -              /* Moving dst -> src. Did we overshoot balance? */
 -              return dst_load * src_capacity < src_load * dst_capacity;
 +      old_imb = orig_dst_load * src_capacity * 100 -
 +                orig_src_load * dst_capacity * env->imbalance_pct;
 +
 +      /* Would this change make things worse? */
 +      return (imb > old_imb);
  }
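A worked example of the new check, with hypothetical numbers, equal capacities C on both nodes and imbalance_pct = 125: for src_load = 100 and dst_load = 130, imb = 130*C*100 - 100*C*125 = 500*C > 0, so the proposed placement is over the threshold. If the original loads were 90 and 140, old_imb = 140*C*100 - 90*C*125 = 2750*C; since imb < old_imb the move still shrinks the imbalance and is therefore allowed.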
  
  /*
@@@ -1398,30 -1409,6 +1398,30 @@@ static void task_numa_find_cpu(struct t
        }
  }
  
 +/* Only move tasks to a NUMA node less busy than the current node. */
 +static bool numa_has_capacity(struct task_numa_env *env)
 +{
 +      struct numa_stats *src = &env->src_stats;
 +      struct numa_stats *dst = &env->dst_stats;
 +
 +      if (src->has_free_capacity && !dst->has_free_capacity)
 +              return false;
 +
 +      /*
 +       * Only consider a task move if the source has a higher load
 +       * than the destination, corrected for CPU capacity on each node.
 +       *
 +       *      src->load                dst->load
 +       * --------------------- vs ---------------------
 +       * src->compute_capacity    dst->compute_capacity
 +       */
 +      if (src->load * dst->compute_capacity >
 +          dst->load * src->compute_capacity)
 +              return true;
 +
 +      return false;
 +}
 +
  static int task_numa_migrate(struct task_struct *p)
  {
        struct task_numa_env env = {
        update_numa_stats(&env.dst_stats, env.dst_nid);
  
        /* Try to find a spot on the preferred nid. */
 -      task_numa_find_cpu(&env, taskimp, groupimp);
 +      if (numa_has_capacity(&env))
 +              task_numa_find_cpu(&env, taskimp, groupimp);
  
        /*
         * Look at other nodes in these cases:
                        env.dist = dist;
                        env.dst_nid = nid;
                        update_numa_stats(&env.dst_stats, env.dst_nid);
 -                      task_numa_find_cpu(&env, taskimp, groupimp);
 +                      if (numa_has_capacity(&env))
 +                              task_numa_find_cpu(&env, taskimp, groupimp);
                }
        }
  
@@@ -1809,12 -1794,7 +1809,12 @@@ static void task_numa_placement(struct 
        u64 runtime, period;
        spinlock_t *group_lock = NULL;
  
 -      seq = ACCESS_ONCE(p->mm->numa_scan_seq);
 +      /*
 +       * The p->mm->numa_scan_seq field gets updated without
 +       * exclusive access. Use READ_ONCE() here to ensure
 +       * that the field is read in a single access:
 +       */
 +      seq = READ_ONCE(p->mm->numa_scan_seq);
        if (p->numa_scan_seq == seq)
                return;
        p->numa_scan_seq = seq;
@@@ -1958,7 -1938,7 +1958,7 @@@ static void task_numa_group(struct task
        }
  
        rcu_read_lock();
 -      tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
 +      tsk = READ_ONCE(cpu_rq(cpu)->curr);
  
        if (!cpupid_match_pid(tsk, cpupid))
                goto no_join;
@@@ -2127,15 -2107,7 +2127,15 @@@ void task_numa_fault(int last_cpupid, i
  
  static void reset_ptenuma_scan(struct task_struct *p)
  {
 -      ACCESS_ONCE(p->mm->numa_scan_seq)++;
 +      /*
 +       * We only did a read acquisition of the mmap sem, so
 +       * p->mm->numa_scan_seq is written to without exclusive access
 +       * and the update is not guaranteed to be atomic. That's not
 +       * much of an issue though, since this is just used for
 +       * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
 +       * expensive, to avoid any form of compiler optimizations:
 +       */
 +      WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
        p->mm->numa_scan_offset = 0;
  }
  
@@@ -2209,7 -2181,7 +2209,7 @@@ void task_numa_work(struct callback_hea
        }
        for (; vma; vma = vma->vm_next) {
                if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
 -                      is_vm_hugetlb_page(vma)) {
 +                      is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
                        continue;
                }
  
@@@ -3504,16 -3476,7 +3504,7 @@@ static int assign_cfs_rq_runtime(struc
        if (cfs_b->quota == RUNTIME_INF)
                amount = min_amount;
        else {
-               /*
-                * If the bandwidth pool has become inactive, then at least one
-                * period must have elapsed since the last consumption.
-                * Refresh the global state and ensure bandwidth timer becomes
-                * active.
-                */
-               if (!cfs_b->timer_active) {
-                       __refill_cfs_bandwidth_runtime(cfs_b);
-                       __start_cfs_bandwidth(cfs_b, false);
-               }
+               start_cfs_bandwidth(cfs_b);
  
                if (cfs_b->runtime > 0) {
                        amount = min(cfs_b->runtime, min_amount);
@@@ -3662,6 -3625,7 +3653,7 @@@ static void throttle_cfs_rq(struct cfs_
        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
        struct sched_entity *se;
        long task_delta, dequeue = 1;
+       bool empty;
  
        se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
  
        cfs_rq->throttled = 1;
        cfs_rq->throttled_clock = rq_clock(rq);
        raw_spin_lock(&cfs_b->lock);
+       empty = list_empty(&cfs_rq->throttled_list);
        /*
         * Add to the _head_ of the list, so that an already-started
         * distribute_cfs_runtime will not see us
         */
        list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-       if (!cfs_b->timer_active)
-               __start_cfs_bandwidth(cfs_b, false);
+       /*
+        * If we're the first throttled task, make sure the bandwidth
+        * timer is running.
+        */
+       if (empty)
+               start_cfs_bandwidth(cfs_b);
        raw_spin_unlock(&cfs_b->lock);
  }
  
@@@ -3812,13 -3784,6 +3812,6 @@@ static int do_sched_cfs_period_timer(st
        if (cfs_b->idle && !throttled)
                goto out_deactivate;
  
-       /*
-        * if we have relooped after returning idle once, we need to update our
-        * status as actually running, so that other cpus doing
-        * __start_cfs_bandwidth will stop trying to cancel us.
-        */
-       cfs_b->timer_active = 1;
        __refill_cfs_bandwidth_runtime(cfs_b);
  
        if (!throttled) {
        return 0;
  
  out_deactivate:
-       cfs_b->timer_active = 0;
        return 1;
  }
  
@@@ -3878,7 -3842,7 +3870,7 @@@ static const u64 cfs_bandwidth_slack_pe
   * Are we near the end of the current quota period?
   *
   * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
-  * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
+  * hrtimer base being cleared by hrtimer_start. In the case of
   * migrate_hrtimers, base is never cleared, so we are fine.
   */
  static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
@@@ -3906,8 -3870,9 +3898,9 @@@ static void start_cfs_slack_bandwidth(s
        if (runtime_refresh_within(cfs_b, min_left))
                return;
  
-       start_bandwidth_timer(&cfs_b->slack_timer,
-                               ns_to_ktime(cfs_bandwidth_slack_period));
+       hrtimer_start(&cfs_b->slack_timer,
+                       ns_to_ktime(cfs_bandwidth_slack_period),
+                       HRTIMER_MODE_REL);
  }
  
  /* we know any runtime found here is valid as update_curr() precedes return */
@@@ -4027,6 -3992,7 +4020,7 @@@ static enum hrtimer_restart sched_cfs_s
  {
        struct cfs_bandwidth *cfs_b =
                container_of(timer, struct cfs_bandwidth, slack_timer);
        do_sched_cfs_slack_timer(cfs_b);
  
        return HRTIMER_NORESTART;
@@@ -4036,20 -4002,19 +4030,19 @@@ static enum hrtimer_restart sched_cfs_p
  {
        struct cfs_bandwidth *cfs_b =
                container_of(timer, struct cfs_bandwidth, period_timer);
-       ktime_t now;
        int overrun;
        int idle = 0;
  
        raw_spin_lock(&cfs_b->lock);
        for (;;) {
-               now = hrtimer_cb_get_time(timer);
-               overrun = hrtimer_forward(timer, now, cfs_b->period);
+               overrun = hrtimer_forward_now(timer, cfs_b->period);
                if (!overrun)
                        break;
  
                idle = do_sched_cfs_period_timer(cfs_b, overrun);
        }
+       if (idle)
+               cfs_b->period_active = 0;
        raw_spin_unlock(&cfs_b->lock);
  
        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
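
The period timer above keeps calling hrtimer_forward_now() until the expiry is back in the future, handling each batch of overruns on the way. A standalone sketch of that arithmetic and loop shape; forward_now() here is a userspace re-creation under those assumptions, not the kernel helper:

#include <stdio.h>

/* Userspace re-creation of the arithmetic behind hrtimer_forward_now():
 * push *expires past now in whole period steps and return how many
 * periods were skipped (the overrun count). */
static long long forward_now(long long *expires, long long now, long long period)
{
	long long delta = now - *expires;

	if (delta < 0)
		return 0;			/* expiry still in the future */

	long long overrun = delta / period + 1;
	*expires += overrun * period;
	return overrun;
}

int main(void)
{
	long long expires = 100, period = 100;
	long long now = 350;			/* we are 2.5 periods late */
	long long overrun;

	/* Same shape as the period timer callback: forward until the
	 * expiry is in the future again, handling each batch of overruns. */
	while ((overrun = forward_now(&expires, now, period)))
		printf("handled %lld overrun(s), next expiry %lld\n",
		       overrun, expires);
	return 0;
}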
@@@ -4063,7 -4028,7 +4056,7 @@@ void init_cfs_bandwidth(struct cfs_band
        cfs_b->period = ns_to_ktime(default_cfs_period());
  
        INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
-       hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
        cfs_b->period_timer.function = sched_cfs_period_timer;
        hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        cfs_b->slack_timer.function = sched_cfs_slack_timer;
@@@ -4075,28 -4040,15 +4068,15 @@@ static void init_cfs_rq_runtime(struct 
        INIT_LIST_HEAD(&cfs_rq->throttled_list);
  }
  
- /* requires cfs_b->lock, may release to reprogram timer */
- void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
+ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
  {
-       /*
-        * The timer may be active because we're trying to set a new bandwidth
-        * period or because we're racing with the tear-down path
-        * (timer_active==0 becomes visible before the hrtimer call-back
-        * terminates).  In either case we ensure that it's re-programmed
-        */
-       while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
-              hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
-               /* bounce the lock to allow do_sched_cfs_period_timer to run */
-               raw_spin_unlock(&cfs_b->lock);
-               cpu_relax();
-               raw_spin_lock(&cfs_b->lock);
-               /* if someone else restarted the timer then we're done */
-               if (!force && cfs_b->timer_active)
-                       return;
-       }
+       lockdep_assert_held(&cfs_b->lock);
  
-       cfs_b->timer_active = 1;
-       start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+       if (!cfs_b->period_active) {
+               cfs_b->period_active = 1;
+               hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+               hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
+       }
  }
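
A userspace analogue of the new start_cfs_bandwidth() shape: arm the period timer only if it is not already active, with the caller holding the bandwidth lock. The timer itself is stubbed out and the names are illustrative:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct bandwidth {
	pthread_mutex_t lock;
	bool period_active;
};

static void arm_period_timer(struct bandwidth *b)
{
	/* stands in for hrtimer_forward_now() + hrtimer_start_expires() */
	printf("period timer armed\n");
}

static void start_bandwidth(struct bandwidth *b)
{
	/* caller must hold b->lock, mirroring lockdep_assert_held() */
	if (!b->period_active) {
		b->period_active = true;
		arm_period_timer(b);
	}
}

int main(void)
{
	struct bandwidth b = { .lock = PTHREAD_MUTEX_INITIALIZER,
			       .period_active = false };

	pthread_mutex_lock(&b.lock);
	start_bandwidth(&b);	/* arms the timer */
	start_bandwidth(&b);	/* no-op: already active */
	pthread_mutex_unlock(&b.lock);
	return 0;
}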
  
  static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@@ -4351,189 -4303,6 +4331,189 @@@ static void dequeue_task_fair(struct r
  }
  
  #ifdef CONFIG_SMP
 +
 +/*
 + * per rq 'load' array crap; XXX kill this.
 + */
 +
 +/*
 + * The exact cpuload at various idx values, calculated at every tick would be
 + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
 + *
 + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
 + * on nth tick when cpu may be busy, then we have:
 + * load = ((2^idx - 1) / 2^idx)^(n-1) * load
 + * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
 + *
 + * decay_load_missed() below does efficient calculation of
 + * load = ((2^idx - 1) / 2^idx)^(n-1) * load
 + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
 + *
 + * The calculation is approximated on a 128 point scale.
 + * degrade_zero_ticks is the number of ticks after which load at any
 + * particular idx is approximated to be zero.
 + * degrade_factor is a precomputed table, a row for each load idx.
 + * Each column corresponds to degradation factor for a power of two ticks,
 + * based on 128 point scale.
 + * Example:
 + * row 2, col 3 (=12) says that the degradation at load idx 2 after
 + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
 + *
 + * With this power of 2 load factors, we can degrade the load n times
 + * by looking at 1 bits in n and doing as many mult/shift instead of
 + * n mult/shifts needed by the exact degradation.
 + */
 +#define DEGRADE_SHIFT         7
 +static const unsigned char
 +              degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
 +static const unsigned char
 +              degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
 +                                      {0, 0, 0, 0, 0, 0, 0, 0},
 +                                      {64, 32, 8, 0, 0, 0, 0, 0},
 +                                      {96, 72, 40, 12, 1, 0, 0},
 +                                      {112, 98, 75, 43, 15, 1, 0},
 +                                      {120, 112, 98, 76, 45, 16, 2} };
 +
 +/*
 + * Update cpu_load for any missed ticks due to tickless idle. The backlog
 + * builds up while the CPU is idle, so we just decay the old load without
 + * adding any new load.
 + */
 +static unsigned long
 +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
 +{
 +      int j = 0;
 +
 +      if (!missed_updates)
 +              return load;
 +
 +      if (missed_updates >= degrade_zero_ticks[idx])
 +              return 0;
 +
 +      if (idx == 1)
 +              return load >> missed_updates;
 +
 +      while (missed_updates) {
 +              if (missed_updates % 2)
 +                      load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
 +
 +              missed_updates >>= 1;
 +              j++;
 +      }
 +      return load;
 +}
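
A throwaway check of the worked example in the comment above (row 2, col 3 = 12): the table entry 12/128 is meant to approximate the exact decay factor (3/4)^8 on the 128-point scale.

#include <math.h>
#include <stdio.h>

int main(void)
{
	int idx = 2, ticks = 8, table_entry = 12;
	/* exact per-tick factor is (2^idx - 1) / 2^idx, applied 'ticks' times */
	double exact = pow((double)((1 << idx) - 1) / (1 << idx), ticks);

	printf("exact (3/4)^8 = %.4f\n", exact);		/* ~0.1001 */
	printf("table 12/128  = %.4f\n", table_entry / 128.0);	/* ~0.0938 */
	return 0;
}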
 +
 +/*
 + * Update rq->cpu_load[] statistics. This function is usually called every
 + * scheduler tick (TICK_NSEC). With tickless idle this will not be called
 + * every tick. We fix it up based on jiffies.
 + */
 +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
 +                            unsigned long pending_updates)
 +{
 +      int i, scale;
 +
 +      this_rq->nr_load_updates++;
 +
 +      /* Update our load: */
 +      this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
 +      for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
 +              unsigned long old_load, new_load;
 +
 +              /* scale is effectively 1 << i now, and >> i divides by scale */
 +
 +              old_load = this_rq->cpu_load[i];
 +              old_load = decay_load_missed(old_load, pending_updates - 1, i);
 +              new_load = this_load;
 +              /*
 +               * Round up the averaging division if load is increasing. This
 +               * prevents us from getting stuck on 9 if the load is 10, for
 +               * example.
 +               */
 +              if (new_load > old_load)
 +                      new_load += scale - 1;
 +
 +              this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
 +      }
 +
 +      sched_avg_update(this_rq);
 +}
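
The round-up in the averaging above is what lets cpu_load[] converge onto a higher steady load instead of stalling one below it. A small userspace demonstration of the same formula at idx 1 (scale 2), with and without the round-up:

#include <stdio.h>

/* Apply one step of cpu_load[i] = (old*(scale-1) + new) >> i,
 * optionally rounding up when the load is increasing. */
static unsigned long avg(unsigned long old, unsigned long new, int i, int round_up)
{
	unsigned long scale = 1UL << i;

	if (round_up && new > old)
		new += scale - 1;
	return (old * (scale - 1) + new) >> i;
}

int main(void)
{
	unsigned long plain = 0, rounded = 0;

	for (int tick = 0; tick < 64; tick++) {
		plain = avg(plain, 10, 1, 0);
		rounded = avg(rounded, 10, 1, 1);
	}
	printf("after 64 ticks of load 10: plain=%lu rounded=%lu\n",
	       plain, rounded);	/* plain stalls at 9, rounded reaches 10 */
	return 0;
}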
 +
 +#ifdef CONFIG_NO_HZ_COMMON
 +/*
 + * There is no sane way to deal with nohz on smp when using jiffies because the
 + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
 + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
 + *
 + * Therefore we cannot use the delta approach from the regular tick since that
 + * would seriously skew the load calculation. However we'll make do for those
 + * updates happening while idle (nohz_idle_balance) or coming out of idle
 + * (tick_nohz_idle_exit).
 + *
 + * This means we might still be one tick off for nohz periods.
 + */
 +
 +/*
 + * Called from nohz_idle_balance() to update the load ratings before doing the
 + * idle balance.
 + */
 +static void update_idle_cpu_load(struct rq *this_rq)
 +{
 +      unsigned long curr_jiffies = READ_ONCE(jiffies);
 +      unsigned long load = this_rq->cfs.runnable_load_avg;
 +      unsigned long pending_updates;
 +
 +      /*
 +       * bail if there's load or we're actually up-to-date.
 +       */
 +      if (load || curr_jiffies == this_rq->last_load_update_tick)
 +              return;
 +
 +      pending_updates = curr_jiffies - this_rq->last_load_update_tick;
 +      this_rq->last_load_update_tick = curr_jiffies;
 +
 +      __update_cpu_load(this_rq, load, pending_updates);
 +}
 +
 +/*
 + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
 + */
 +void update_cpu_load_nohz(void)
 +{
 +      struct rq *this_rq = this_rq();
 +      unsigned long curr_jiffies = READ_ONCE(jiffies);
 +      unsigned long pending_updates;
 +
 +      if (curr_jiffies == this_rq->last_load_update_tick)
 +              return;
 +
 +      raw_spin_lock(&this_rq->lock);
 +      pending_updates = curr_jiffies - this_rq->last_load_update_tick;
 +      if (pending_updates) {
 +              this_rq->last_load_update_tick = curr_jiffies;
 +              /*
 +               * We were idle, which means a load of 0; the current load
 +               * might be !0 due to remote wakeups and the like.
 +               */
 +              __update_cpu_load(this_rq, 0, pending_updates);
 +      }
 +      raw_spin_unlock(&this_rq->lock);
 +}
 +#endif /* CONFIG_NO_HZ_COMMON */
 +
 +/*
 + * Called from scheduler_tick()
 + */
 +void update_cpu_load_active(struct rq *this_rq)
 +{
 +      unsigned long load = this_rq->cfs.runnable_load_avg;
 +      /*
 +       * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
 +       */
 +      this_rq->last_load_update_tick = jiffies;
 +      __update_cpu_load(this_rq, load, 1);
 +}
 +
  /* Used instead of source_load when we know the type == 0 */
  static unsigned long weighted_cpuload(const int cpu)
  {
@@@ -4586,7 -4355,7 +4566,7 @@@ static unsigned long capacity_orig_of(i
  static unsigned long cpu_avg_load_per_task(int cpu)
  {
        struct rq *rq = cpu_rq(cpu);
 -      unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
 +      unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
        unsigned long load_avg = rq->cfs.runnable_load_avg;
  
        if (nr_running)
@@@ -5337,21 -5106,18 +5317,21 @@@ again
                 * entity, update_curr() will update its vruntime, otherwise
                 * forget we've ever seen it.
                 */
 -              if (curr && curr->on_rq)
 -                      update_curr(cfs_rq);
 -              else
 -                      curr = NULL;
 +              if (curr) {
 +                      if (curr->on_rq)
 +                              update_curr(cfs_rq);
 +                      else
 +                              curr = NULL;
  
 -              /*
 -               * This call to check_cfs_rq_runtime() will do the throttle and
 -               * dequeue its entity in the parent(s). Therefore the 'simple'
 -               * nr_running test will indeed be correct.
 -               */
 -              if (unlikely(check_cfs_rq_runtime(cfs_rq)))
 -                      goto simple;
 +                      /*
 +                       * This call to check_cfs_rq_runtime() will do the
 +                       * throttle and dequeue its entity in the parent(s).
 +                       * Therefore the 'simple' nr_running test will indeed
 +                       * be correct.
 +                       */
 +                      if (unlikely(check_cfs_rq_runtime(cfs_rq)))
 +                              goto simple;
 +              }
  
                se = pick_next_entity(cfs_rq, curr);
                cfs_rq = group_cfs_rq(se);
@@@ -5681,15 -5447,10 +5661,15 @@@ static int task_hot(struct task_struct 
  }
  
  #ifdef CONFIG_NUMA_BALANCING
 -/* Returns true if the destination node has incurred more faults */
 +/*
 + * Returns true if the destination node is the preferred node.
 + * Needs to match fbq_classify_rq(): if there is a runnable task
 + * that is not on its preferred node, we should identify it.
 + */
  static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
  {
        struct numa_group *numa_group = rcu_dereference(p->numa_group);
 +      unsigned long src_faults, dst_faults;
        int src_nid, dst_nid;
  
        if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
        if (src_nid == dst_nid)
                return false;
  
 -      if (numa_group) {
 -              /* Task is already in the group's interleave set. */
 -              if (node_isset(src_nid, numa_group->active_nodes))
 -                      return false;
 -
 -              /* Task is moving into the group's interleave set. */
 -              if (node_isset(dst_nid, numa_group->active_nodes))
 -                      return true;
 -
 -              return group_faults(p, dst_nid) > group_faults(p, src_nid);
 -      }
 -
        /* Encourage migration to the preferred node. */
        if (dst_nid == p->numa_preferred_nid)
                return true;
  
 -      return task_faults(p, dst_nid) > task_faults(p, src_nid);
 +      /* Migrating away from the preferred node is bad. */
 +      if (src_nid == p->numa_preferred_nid)
 +              return false;
 +
 +      if (numa_group) {
 +              src_faults = group_faults(p, src_nid);
 +              dst_faults = group_faults(p, dst_nid);
 +      } else {
 +              src_faults = task_faults(p, src_nid);
 +              dst_faults = task_faults(p, dst_nid);
 +      }
 +
 +      return dst_faults > src_faults;
  }
  
  
  static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
  {
        struct numa_group *numa_group = rcu_dereference(p->numa_group);
 +      unsigned long src_faults, dst_faults;
        int src_nid, dst_nid;
  
        if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
        if (src_nid == dst_nid)
                return false;
  
 -      if (numa_group) {
 -              /* Task is moving within/into the group's interleave set. */
 -              if (node_isset(dst_nid, numa_group->active_nodes))
 -                      return false;
 +      /* Migrating away from the preferred node is bad. */
 +      if (src_nid == p->numa_preferred_nid)
 +              return true;
  
 -              /* Task is moving out of the group's interleave set. */
 -              if (node_isset(src_nid, numa_group->active_nodes))
 -                      return true;
 +      /* Encourage migration to the preferred node. */
 +      if (dst_nid == p->numa_preferred_nid)
 +              return false;
  
 -              return group_faults(p, dst_nid) < group_faults(p, src_nid);
 +      if (numa_group) {
 +              src_faults = group_faults(p, src_nid);
 +              dst_faults = group_faults(p, dst_nid);
 +      } else {
 +              src_faults = task_faults(p, src_nid);
 +              dst_faults = task_faults(p, dst_nid);
        }
  
 -      /* Migrating away from the preferred node is always bad. */
 -      if (src_nid == p->numa_preferred_nid)
 -              return true;
 -
 -      return task_faults(p, dst_nid) < task_faults(p, src_nid);
 +      return dst_faults < src_faults;
  }
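
The rewritten pair of helpers above boils down to one symmetric rule: the preferred node decides first, otherwise compare fault counts. A distilled, userspace version of that decision (all names illustrative):

#include <stdbool.h>
#include <stdio.h>

static bool improves_locality(int src_nid, int dst_nid, int preferred_nid,
			      unsigned long src_faults, unsigned long dst_faults)
{
	if (src_nid == dst_nid)
		return false;
	if (dst_nid == preferred_nid)		/* moving onto the preferred node */
		return true;
	if (src_nid == preferred_nid)		/* moving off the preferred node */
		return false;
	return dst_faults > src_faults;		/* otherwise follow the faults */
}

int main(void)
{
	printf("%d\n", improves_locality(0, 1, 1, 100, 10));	/* 1: dst is preferred */
	printf("%d\n", improves_locality(1, 0, 1, 10, 100));	/* 0: src is preferred */
	printf("%d\n", improves_locality(0, 2, 3, 10, 100));	/* 1: more faults on dst */
	return 0;
}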
  
  #else
@@@ -6257,8 -6017,8 +6237,8 @@@ static unsigned long scale_rt_capacity(
         * Since we're reading these variables without serialization make sure
         * we read them once before doing sanity checks on them.
         */
 -      age_stamp = ACCESS_ONCE(rq->age_stamp);
 -      avg = ACCESS_ONCE(rq->rt_avg);
 +      age_stamp = READ_ONCE(rq->age_stamp);
 +      avg = READ_ONCE(rq->rt_avg);
        delta = __rq_clock_broken(rq) - age_stamp;
  
        if (unlikely(delta < 0))
diff --combined kernel/sched/rt.c
index 560d2fa623c311c9aa5ad51ead007e1b27c6fa6c,e43da5391dcdd785ed39d335f3d8056889924d87..7d7093c51f8d169cea027c78a9ca0321c8f15932
@@@ -18,19 -18,22 +18,22 @@@ static enum hrtimer_restart sched_rt_pe
  {
        struct rt_bandwidth *rt_b =
                container_of(timer, struct rt_bandwidth, rt_period_timer);
-       ktime_t now;
-       int overrun;
        int idle = 0;
+       int overrun;
  
+       raw_spin_lock(&rt_b->rt_runtime_lock);
        for (;;) {
-               now = hrtimer_cb_get_time(timer);
-               overrun = hrtimer_forward(timer, now, rt_b->rt_period);
+               overrun = hrtimer_forward_now(timer, rt_b->rt_period);
                if (!overrun)
                        break;
  
+               raw_spin_unlock(&rt_b->rt_runtime_lock);
                idle = do_sched_rt_period_timer(rt_b, overrun);
+               raw_spin_lock(&rt_b->rt_runtime_lock);
        }
+       if (idle)
+               rt_b->rt_period_active = 0;
+       raw_spin_unlock(&rt_b->rt_runtime_lock);
  
        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
  }
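
Note how the loop above drops rt_runtime_lock around the expensive do_sched_rt_period_timer() call and retakes it for the loop bookkeeping. The same shape in a tiny pthread-based toy, with the overrun values hard-coded for illustration:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static int heavy_work(int overrun)
{
	printf("handling %d overrun(s) without the lock held\n", overrun);
	return 1;		/* pretend we went idle */
}

int main(void)
{
	int idle = 0;
	int overruns[] = { 2, 1, 0 };	/* simulated hrtimer_forward_now() results */

	pthread_mutex_lock(&lock);
	for (int i = 0; ; i++) {
		int overrun = overruns[i];

		if (!overrun)
			break;
		pthread_mutex_unlock(&lock);	/* don't hold the lock across the work */
		idle = heavy_work(overrun);
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);
	printf("idle = %d\n", idle);
	return 0;
}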
@@@ -52,11 -55,12 +55,12 @@@ static void start_rt_bandwidth(struct r
        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
                return;
  
-       if (hrtimer_active(&rt_b->rt_period_timer))
-               return;
        raw_spin_lock(&rt_b->rt_runtime_lock);
-       start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
+       if (!rt_b->rt_period_active) {
+               rt_b->rt_period_active = 1;
+               hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period);
+               hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
+       }
        raw_spin_unlock(&rt_b->rt_runtime_lock);
  }
  
@@@ -1323,7 -1327,7 +1327,7 @@@ select_task_rq_rt(struct task_struct *p
        rq = cpu_rq(cpu);
  
        rcu_read_lock();
 -      curr = ACCESS_ONCE(rq->curr); /* unlocked access */
 +      curr = READ_ONCE(rq->curr); /* unlocked access */
  
        /*
         * If the current task on @p's runqueue is an RT task, then
diff --combined kernel/sched/sched.h
index d62b2882232b7a3017eda9c873de01c7ffcfc4a1,f9a58ef373b4b2240f521882d0b0d29da71d58b7..aea7c1f393cb3c983b3fd01e7df53155e4859ef3
@@@ -26,14 -26,8 +26,14 @@@ extern __read_mostly int scheduler_runn
  extern unsigned long calc_load_update;
  extern atomic_long_t calc_load_tasks;
  
 +extern void calc_global_load_tick(struct rq *this_rq);
  extern long calc_load_fold_active(struct rq *this_rq);
 +
 +#ifdef CONFIG_SMP
  extern void update_cpu_load_active(struct rq *this_rq);
 +#else
 +static inline void update_cpu_load_active(struct rq *this_rq) { }
 +#endif
  
  /*
   * Helpers for converting nanosecond timing to jiffy resolution
@@@ -137,6 -131,7 +137,7 @@@ struct rt_bandwidth 
        ktime_t                 rt_period;
        u64                     rt_runtime;
        struct hrtimer          rt_period_timer;
+       unsigned int            rt_period_active;
  };
  
  void __dl_clear_params(struct task_struct *p);
@@@ -221,7 -216,7 +222,7 @@@ struct cfs_bandwidth 
        s64 hierarchical_quota;
        u64 runtime_expires;
  
-       int idle, timer_active;
+       int idle, period_active;
        struct hrtimer period_timer, slack_timer;
        struct list_head throttled_cfs_rq;
  
@@@ -312,7 -307,7 +313,7 @@@ extern void init_cfs_bandwidth(struct c
  extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
  
  extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
- extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
+ extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
  extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
  
  extern void free_rt_sched_group(struct task_group *tg);
@@@ -713,7 -708,7 +714,7 @@@ DECLARE_PER_CPU_SHARED_ALIGNED(struct r
  
  static inline u64 __rq_clock_broken(struct rq *rq)
  {
 -      return ACCESS_ONCE(rq->clock);
 +      return READ_ONCE(rq->clock);
  }
  
  static inline u64 rq_clock(struct rq *rq)
@@@ -1290,6 -1285,7 +1291,6 @@@ extern void update_max_interval(void)
  extern void init_sched_dl_class(void);
  extern void init_sched_rt_class(void);
  extern void init_sched_fair_class(void);
 -extern void init_sched_dl_class(void);
  
  extern void resched_curr(struct rq *rq);
  extern void resched_cpu(int cpu);
@@@ -1303,6 -1299,8 +1304,6 @@@ extern void init_dl_task_timer(struct s
  
  unsigned long to_ratio(u64 period, u64 runtime);
  
 -extern void update_idle_cpu_load(struct rq *this_rq);
 -
  extern void init_task_runnable_average(struct task_struct *p);
  
  static inline void add_nr_running(struct rq *rq, unsigned count)
@@@ -1409,8 -1407,6 +1410,6 @@@ static inline void sched_rt_avg_update(
  static inline void sched_avg_update(struct rq *rq) { }
  #endif
  
- extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
  /*
   * __task_rq_lock - lock the rq @p resides on.
   */
diff --combined kernel/time/hrtimer.c
index 93ef7190bdeaadbf99efe07954cca3bee6399d07,db5c9508ed9500b056adedd71467f358d0d9ab13..5c7ae4b641c44aca69393a704507630a652381bf
   */
  DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
  {
        .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
+       .seq = SEQCNT_ZERO(hrtimer_bases.seq),
        .clock_base =
        {
                {
                        .index = HRTIMER_BASE_MONOTONIC,
                        .clockid = CLOCK_MONOTONIC,
                        .get_time = &ktime_get,
-                       .resolution = KTIME_LOW_RES,
                },
                {
                        .index = HRTIMER_BASE_REALTIME,
                        .clockid = CLOCK_REALTIME,
                        .get_time = &ktime_get_real,
-                       .resolution = KTIME_LOW_RES,
                },
                {
                        .index = HRTIMER_BASE_BOOTTIME,
                        .clockid = CLOCK_BOOTTIME,
                        .get_time = &ktime_get_boottime,
-                       .resolution = KTIME_LOW_RES,
                },
                {
                        .index = HRTIMER_BASE_TAI,
                        .clockid = CLOCK_TAI,
                        .get_time = &ktime_get_clocktai,
-                       .resolution = KTIME_LOW_RES,
                },
        }
  };
@@@ -109,33 -105,24 +105,24 @@@ static inline int hrtimer_clockid_to_ba
        return hrtimer_clock_to_base_table[clock_id];
  }
  
- /*
-  * Get the coarse grained time at the softirq based on xtime and
-  * wall_to_monotonic.
-  */
- static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
- {
-       ktime_t xtim, mono, boot, tai;
-       ktime_t off_real, off_boot, off_tai;
-       mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
-       boot = ktime_add(mono, off_boot);
-       xtim = ktime_add(mono, off_real);
-       tai = ktime_add(mono, off_tai);
-       base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
-       base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
-       base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
-       base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai;
- }
  /*
   * Functions and macros which are different for UP/SMP systems are kept in a
   * single place
   */
  #ifdef CONFIG_SMP
  
+ /*
+  * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
+  * such that hrtimer_callback_running() can unconditionally dereference
+  * timer->base->cpu_base
+  */
+ static struct hrtimer_cpu_base migration_cpu_base = {
+       .seq = SEQCNT_ZERO(migration_cpu_base),
+       .clock_base = { { .cpu_base = &migration_cpu_base, }, },
+ };
+ #define migration_base        migration_cpu_base.clock_base[0]
  /*
   * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
   * means that all timers which are tied to this base via timer->base are
   * be found on the lists/queues.
   *
   * When the timer's base is locked, and the timer removed from list, it is
-  * possible to set timer->base = NULL and drop the lock: the timer remains
-  * locked.
+  * possible to set timer->base = &migration_base and drop the lock: the timer
+  * remains locked.
   */
  static
  struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
  
        for (;;) {
                base = timer->base;
-               if (likely(base != NULL)) {
+               if (likely(base != &migration_base)) {
                        raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
                        if (likely(base == timer->base))
                                return base;
@@@ -190,6 -177,24 +177,24 @@@ hrtimer_check_target(struct hrtimer *ti
  #endif
  }
  
+ #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+ static inline
+ struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
+                                        int pinned)
+ {
+       if (pinned || !base->migration_enabled)
+               return this_cpu_ptr(&hrtimer_bases);
+       return &per_cpu(hrtimer_bases, get_nohz_timer_target());
+ }
+ #else
+ static inline
+ struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
+                                        int pinned)
+ {
+       return this_cpu_ptr(&hrtimer_bases);
+ }
+ #endif
  /*
   * Switch the timer base to the current CPU when possible.
   */
@@@ -197,14 -202,13 +202,13 @@@ static inline struct hrtimer_clock_bas
  switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
                    int pinned)
  {
+       struct hrtimer_cpu_base *new_cpu_base, *this_base;
        struct hrtimer_clock_base *new_base;
-       struct hrtimer_cpu_base *new_cpu_base;
-       int this_cpu = smp_processor_id();
-       int cpu = get_nohz_timer_target(pinned);
        int basenum = base->index;
  
+       this_base = this_cpu_ptr(&hrtimer_bases);
+       new_cpu_base = get_target_base(this_base, pinned);
  again:
-       new_cpu_base = &per_cpu(hrtimer_bases, cpu);
        new_base = &new_cpu_base->clock_base[basenum];
  
        if (base != new_base) {
                if (unlikely(hrtimer_callback_running(timer)))
                        return base;
  
-               /* See the comment in lock_timer_base() */
-               timer->base = NULL;
+               /* See the comment in lock_hrtimer_base() */
+               timer->base = &migration_base;
                raw_spin_unlock(&base->cpu_base->lock);
                raw_spin_lock(&new_base->cpu_base->lock);
  
-               if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
-                       cpu = this_cpu;
+               if (new_cpu_base != this_base &&
+                   hrtimer_check_target(timer, new_base)) {
                        raw_spin_unlock(&new_base->cpu_base->lock);
                        raw_spin_lock(&base->cpu_base->lock);
+                       new_cpu_base = this_base;
                        timer->base = base;
                        goto again;
                }
                timer->base = new_base;
        } else {
-               if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
-                       cpu = this_cpu;
+               if (new_cpu_base != this_base &&
+                   hrtimer_check_target(timer, new_base)) {
+                       new_cpu_base = this_base;
                        goto again;
                }
        }
@@@ -266,23 -272,21 +272,23 @@@ lock_hrtimer_base(const struct hrtimer 
  /*
   * Divide a ktime value by a nanosecond value
   */
 -u64 __ktime_divns(const ktime_t kt, s64 div)
 +s64 __ktime_divns(const ktime_t kt, s64 div)
  {
 -      u64 dclc;
        int sft = 0;
 +      s64 dclc;
 +      u64 tmp;
  
        dclc = ktime_to_ns(kt);
 +      tmp = dclc < 0 ? -dclc : dclc;
 +
        /* Make sure the divisor is less than 2^32: */
        while (div >> 32) {
                sft++;
                div >>= 1;
        }
 -      dclc >>= sft;
 -      do_div(dclc, (unsigned long) div);
 -
 -      return dclc;
 +      tmp >>= sft;
 +      do_div(tmp, (unsigned long) div);
 +      return dclc < 0 ? -tmp : tmp;
  }
  EXPORT_SYMBOL_GPL(__ktime_divns);
  #endif /* BITS_PER_LONG >= 64 */
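
The point of the __ktime_divns() change is that do_div() only handles unsigned dividends, so negative ktime values must be divided on their absolute value and re-signed. A userspace re-creation of that logic using plain 64-bit division:

#include <stdint.h>
#include <stdio.h>

static int64_t ktime_divns(int64_t kt_ns, int64_t div)
{
	uint64_t tmp = kt_ns < 0 ? -kt_ns : kt_ns;
	int sft = 0;

	/* Make sure the divisor fits in 32 bits, as the original does. */
	while (div >> 32) {
		sft++;
		div >>= 1;
	}
	tmp >>= sft;
	tmp /= (uint64_t)div;
	return kt_ns < 0 ? -(int64_t)tmp : (int64_t)tmp;
}

int main(void)
{
	printf("%lld\n", (long long)ktime_divns(-1500000000LL, 1000000000LL)); /* -1 */
	printf("%lld\n", (long long)ktime_divns(1500000000LL, 1000000000LL));  /*  1 */
	return 0;
}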
@@@ -443,24 -447,35 +449,35 @@@ static inline void debug_deactivate(str
  }
  
  #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
+ static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
+                                            struct hrtimer *timer)
+ {
+ #ifdef CONFIG_HIGH_RES_TIMERS
+       cpu_base->next_timer = timer;
+ #endif
+ }
  static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
  {
        struct hrtimer_clock_base *base = cpu_base->clock_base;
        ktime_t expires, expires_next = { .tv64 = KTIME_MAX };
-       int i;
+       unsigned int active = cpu_base->active_bases;
  
-       for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
+       hrtimer_update_next_timer(cpu_base, NULL);
+       for (; active; base++, active >>= 1) {
                struct timerqueue_node *next;
                struct hrtimer *timer;
  
-               next = timerqueue_getnext(&base->active);
-               if (!next)
+               if (!(active & 0x01))
                        continue;
  
+               next = timerqueue_getnext(&base->active);
                timer = container_of(next, struct hrtimer, node);
                expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
-               if (expires.tv64 < expires_next.tv64)
+               if (expires.tv64 < expires_next.tv64) {
                        expires_next = expires;
+                       hrtimer_update_next_timer(cpu_base, timer);
+               }
        }
        /*
         * clock_was_set() might have changed base->offset of any of
  }
  #endif
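
__hrtimer_get_next_event() now walks only the clock bases whose bit is set in cpu_base->active_bases instead of scanning all of them. The bit-walk in isolation, as a small standalone program:

#include <stdio.h>

int main(void)
{
	const char *base_name[] = { "MONOTONIC", "REALTIME", "BOOTTIME", "TAI" };
	unsigned int active = 0x5;	/* bases 0 and 2 have queued timers */

	for (int i = 0; active; i++, active >>= 1) {
		if (!(active & 0x01))
			continue;	/* this base has no queued timers */
		printf("scan base %d (%s)\n", i, base_name[i]);
	}
	return 0;
}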
  
+ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
+ {
+       ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
+       ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
+       ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
+       return ktime_get_update_offsets_now(&base->clock_was_set_seq,
+                                           offs_real, offs_boot, offs_tai);
+ }
  /* High resolution timer related functions */
  #ifdef CONFIG_HIGH_RES_TIMERS
  
   * High resolution timer enabled ?
   */
  static int hrtimer_hres_enabled __read_mostly  = 1;
+ unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
+ EXPORT_SYMBOL_GPL(hrtimer_resolution);
  
  /*
   * Enable / Disable high resolution mode
@@@ -508,9 -535,14 +537,14 @@@ static inline int hrtimer_is_hres_enabl
  /*
   * Is the high resolution mode active ?
   */
+ static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
+ {
+       return cpu_base->hres_active;
+ }
  static inline int hrtimer_hres_active(void)
  {
-       return __this_cpu_read(hrtimer_bases.hres_active);
+       return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
  }
  
  /*
  static void
  hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
  {
-       ktime_t expires_next = __hrtimer_get_next_event(cpu_base);
+       ktime_t expires_next;
+       if (!cpu_base->hres_active)
+               return;
+       expires_next = __hrtimer_get_next_event(cpu_base);
  
        if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
                return;
        if (cpu_base->hang_detected)
                return;
  
-       if (cpu_base->expires_next.tv64 != KTIME_MAX)
-               tick_program_event(cpu_base->expires_next, 1);
+       tick_program_event(cpu_base->expires_next, 1);
  }
  
  /*
-  * Shared reprogramming for clock_realtime and clock_monotonic
-  *
   * When a timer is enqueued and expires earlier than the already enqueued
   * timers, we have to check, whether it expires earlier than the timer for
   * which the clock event device was armed.
   *
-  * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming
-  * and no expiry check happens. The timer gets enqueued into the rbtree. The
-  * reprogramming and expiry check is done in the hrtimer_interrupt or in the
-  * softirq.
-  *
   * Called with interrupts disabled and base->cpu_base.lock held
   */
- static int hrtimer_reprogram(struct hrtimer *timer,
-                            struct hrtimer_clock_base *base)
+ static void hrtimer_reprogram(struct hrtimer *timer,
+                             struct hrtimer_clock_base *base)
  {
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
-       int res;
  
        WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
  
        /*
-        * When the callback is running, we do not reprogram the clock event
-        * device. The timer callback is either running on a different CPU or
-        * the callback is executed in the hrtimer_interrupt context. The
-        * reprogramming is handled either by the softirq, which called the
-        * callback or at the end of the hrtimer_interrupt.
+        * If the timer is not on the current cpu, we cannot reprogram
+        * the other cpus clock event device.
         */
-       if (hrtimer_callback_running(timer))
-               return 0;
+       if (base->cpu_base != cpu_base)
+               return;
+       /*
+        * If the hrtimer interrupt is running, then it will
+        * reevaluate the clock bases and reprogram the clock event
+        * device. The callbacks are always executed in hard interrupt
+        * context so we don't need an extra check for a running
+        * callback.
+        */
+       if (cpu_base->in_hrtirq)
+               return;
  
        /*
         * CLOCK_REALTIME timer might be requested with an absolute
-        * expiry time which is less than base->offset. Nothing wrong
-        * about that, just avoid to call into the tick code, which
-        * has now objections against negative expiry values.
+        * expiry time which is less than base->offset. Set it to 0.
         */
        if (expires.tv64 < 0)
-               return -ETIME;
+               expires.tv64 = 0;
  
        if (expires.tv64 >= cpu_base->expires_next.tv64)
-               return 0;
+               return;
  
-       /*
-        * When the target cpu of the timer is currently executing
-        * hrtimer_interrupt(), then we do not touch the clock event
-        * device. hrtimer_interrupt() will reevaluate all clock bases
-        * before reprogramming the device.
-        */
-       if (cpu_base->in_hrtirq)
-               return 0;
+       /* Update the pointer to the next expiring timer */
+       cpu_base->next_timer = timer;
  
        /*
         * If a hang was detected in the last timer interrupt then we
         * to make progress.
         */
        if (cpu_base->hang_detected)
-               return 0;
+               return;
  
        /*
-        * Clockevents returns -ETIME, when the event was in the past.
+        * Program the timer hardware. We enforce the expiry for
+        * events which are already in the past.
         */
-       res = tick_program_event(expires, 0);
-       if (!IS_ERR_VALUE(res))
-               cpu_base->expires_next = expires;
-       return res;
+       cpu_base->expires_next = expires;
+       tick_program_event(expires, 1);
  }
  
  /*
@@@ -630,15 -656,6 +658,6 @@@ static inline void hrtimer_init_hres(st
        base->hres_active = 0;
  }
  
- static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
- {
-       ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
-       ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
-       ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
-       return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai);
- }
  /*
   * Retrigger next event is called after clock was set
   *
@@@ -648,7 -665,7 +667,7 @@@ static void retrigger_next_event(void *
  {
        struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
  
-       if (!hrtimer_hres_active())
+       if (!base->hres_active)
                return;
  
        raw_spin_lock(&base->lock);
   */
  static int hrtimer_switch_to_hres(void)
  {
-       int i, cpu = smp_processor_id();
-       struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
-       unsigned long flags;
-       if (base->hres_active)
-               return 1;
-       local_irq_save(flags);
+       struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
  
        if (tick_init_highres()) {
-               local_irq_restore(flags);
                printk(KERN_WARNING "Could not switch to high resolution "
-                                   "mode on CPU %d\n", cpu);
+                                   "mode on CPU %d\n", base->cpu);
                return 0;
        }
        base->hres_active = 1;
-       for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
-               base->clock_base[i].resolution = KTIME_HIGH_RES;
+       hrtimer_resolution = HIGH_RES_NSEC;
  
        tick_setup_sched_timer();
        /* "Retrigger" the interrupt to get things going */
        retrigger_next_event(NULL);
-       local_irq_restore(flags);
        return 1;
  }
  
@@@ -706,6 -713,7 +715,7 @@@ void clock_was_set_delayed(void
  
  #else
  
+ static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
  static inline int hrtimer_hres_active(void) { return 0; }
  static inline int hrtimer_is_hres_enabled(void) { return 0; }
  static inline int hrtimer_switch_to_hres(void) { return 0; }
@@@ -803,6 -811,14 +813,14 @@@ void unlock_hrtimer_base(const struct h
   *
   * Forward the timer expiry so it will expire in the future.
   * Returns the number of overruns.
+  *
+  * Can be safely called from the callback function of @timer. If
+  * called from other contexts, @timer must neither be enqueued nor
+  * running the callback, and the caller needs to take care of
+  * serialization.
+  *
+  * Note: This only updates the timer expiry value and does not requeue
+  * the timer.
   */
  u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
  {
        if (delta.tv64 < 0)
                return 0;
  
-       if (interval.tv64 < timer->base->resolution.tv64)
-               interval.tv64 = timer->base->resolution.tv64;
+       if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
+               return 0;
+       if (interval.tv64 < hrtimer_resolution)
+               interval.tv64 = hrtimer_resolution;
  
        if (unlikely(delta.tv64 >= interval.tv64)) {
                s64 incr = ktime_to_ns(interval);
@@@ -849,16 -868,11 +870,11 @@@ static int enqueue_hrtimer(struct hrtim
  {
        debug_activate(timer);
  
-       timerqueue_add(&base->active, &timer->node);
        base->cpu_base->active_bases |= 1 << base->index;
  
-       /*
-        * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
-        * state of a possibly running callback.
-        */
-       timer->state |= HRTIMER_STATE_ENQUEUED;
+       timer->state = HRTIMER_STATE_ENQUEUED;
  
-       return (&timer->node == base->active.next);
+       return timerqueue_add(&base->active, &timer->node);
  }
  
  /*
@@@ -875,39 -889,38 +891,38 @@@ static void __remove_hrtimer(struct hrt
                             struct hrtimer_clock_base *base,
                             unsigned long newstate, int reprogram)
  {
-       struct timerqueue_node *next_timer;
-       if (!(timer->state & HRTIMER_STATE_ENQUEUED))
-               goto out;
+       struct hrtimer_cpu_base *cpu_base = base->cpu_base;
+       unsigned int state = timer->state;
+       timer->state = newstate;
+       if (!(state & HRTIMER_STATE_ENQUEUED))
+               return;
+       if (!timerqueue_del(&base->active, &timer->node))
+               cpu_base->active_bases &= ~(1 << base->index);
  
-       next_timer = timerqueue_getnext(&base->active);
-       timerqueue_del(&base->active, &timer->node);
-       if (&timer->node == next_timer) {
  #ifdef CONFIG_HIGH_RES_TIMERS
-               /* Reprogram the clock event device. if enabled */
-               if (reprogram && hrtimer_hres_active()) {
-                       ktime_t expires;
-                       expires = ktime_sub(hrtimer_get_expires(timer),
-                                           base->offset);
-                       if (base->cpu_base->expires_next.tv64 == expires.tv64)
-                               hrtimer_force_reprogram(base->cpu_base, 1);
-               }
+       /*
+        * Note: If reprogram is false we do not update
+        * cpu_base->next_timer. This happens when we remove the first
+        * timer on a remote cpu. No harm as we never dereference
+        * cpu_base->next_timer. So the worst that can happen is
+        * a superfluous call to hrtimer_force_reprogram() on the
+        * remote cpu later on if the same timer gets enqueued again.
+        */
+       if (reprogram && timer == cpu_base->next_timer)
+               hrtimer_force_reprogram(cpu_base, 1);
  #endif
-       }
-       if (!timerqueue_getnext(&base->active))
-               base->cpu_base->active_bases &= ~(1 << base->index);
- out:
-       timer->state = newstate;
  }
  
  /*
   * remove hrtimer, called with base lock held
   */
  static inline int
- remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
+ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
  {
        if (hrtimer_is_queued(timer)) {
-               unsigned long state;
+               unsigned long state = timer->state;
                int reprogram;
  
                /*
                debug_deactivate(timer);
                timer_stats_hrtimer_clear_start_info(timer);
                reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
-               /*
-                * We must preserve the CALLBACK state flag here,
-                * otherwise we could move the timer base in
-                * switch_hrtimer_base.
-                */
-               state = timer->state & HRTIMER_STATE_CALLBACK;
+               if (!restart)
+                       state = HRTIMER_STATE_INACTIVE;
                __remove_hrtimer(timer, base, state, reprogram);
                return 1;
        }
        return 0;
  }
  
- int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
-               unsigned long delta_ns, const enum hrtimer_mode mode,
-               int wakeup)
+ /**
+  * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
+  * @timer:    the timer to be added
+  * @tim:      expiry time
+  * @delta_ns: "slack" range for the timer
+  * @mode:     expiry mode: absolute (HRTIMER_MODE_ABS) or
+  *            relative (HRTIMER_MODE_REL)
+  */
+ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+                           unsigned long delta_ns, const enum hrtimer_mode mode)
  {
        struct hrtimer_clock_base *base, *new_base;
        unsigned long flags;
-       int ret, leftmost;
+       int leftmost;
  
        base = lock_hrtimer_base(timer, &flags);
  
        /* Remove an active timer from the queue: */
-       ret = remove_hrtimer(timer, base);
+       remove_hrtimer(timer, base, true);
  
        if (mode & HRTIMER_MODE_REL) {
                tim = ktime_add_safe(tim, base->get_time());
                 * timeouts. This will go away with the GTOD framework.
                 */
  #ifdef CONFIG_TIME_LOW_RES
-               tim = ktime_add_safe(tim, base->resolution);
+               tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution));
  #endif
        }
  
        timer_stats_hrtimer_set_start_info(timer);
  
        leftmost = enqueue_hrtimer(timer, new_base);
-       if (!leftmost) {
-               unlock_hrtimer_base(timer, &flags);
-               return ret;
-       }
+       if (!leftmost)
+               goto unlock;
  
        if (!hrtimer_is_hres_active(timer)) {
                /*
                 * Kick to reschedule the next tick to handle the new timer
                 * on dynticks target.
                 */
-               wake_up_nohz_cpu(new_base->cpu_base->cpu);
-       } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) &&
-                       hrtimer_reprogram(timer, new_base)) {
-               /*
-                * Only allow reprogramming if the new base is on this CPU.
-                * (it might still be on another CPU if the timer was pending)
-                *
-                * XXX send_remote_softirq() ?
-                */
-               if (wakeup) {
-                       /*
-                        * We need to drop cpu_base->lock to avoid a
-                        * lock ordering issue vs. rq->lock.
-                        */
-                       raw_spin_unlock(&new_base->cpu_base->lock);
-                       raise_softirq_irqoff(HRTIMER_SOFTIRQ);
-                       local_irq_restore(flags);
-                       return ret;
-               } else {
-                       __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
-               }
+               if (new_base->cpu_base->nohz_active)
+                       wake_up_nohz_cpu(new_base->cpu_base->cpu);
+       } else {
+               hrtimer_reprogram(timer, new_base);
        }
+ unlock:
        unlock_hrtimer_base(timer, &flags);
-       return ret;
- }
- EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns);
- /**
-  * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
-  * @timer:    the timer to be added
-  * @tim:      expiry time
-  * @delta_ns: "slack" range for the timer
-  * @mode:     expiry mode: absolute (HRTIMER_MODE_ABS) or
-  *            relative (HRTIMER_MODE_REL)
-  *
-  * Returns:
-  *  0 on success
-  *  1 when the timer was active
-  */
- int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
-               unsigned long delta_ns, const enum hrtimer_mode mode)
- {
-       return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
  }
  EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
  
- /**
-  * hrtimer_start - (re)start an hrtimer on the current CPU
-  * @timer:    the timer to be added
-  * @tim:      expiry time
-  * @mode:     expiry mode: absolute (HRTIMER_MODE_ABS) or
-  *            relative (HRTIMER_MODE_REL)
-  *
-  * Returns:
-  *  0 on success
-  *  1 when the timer was active
-  */
- int
- hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
- {
-       return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
- }
- EXPORT_SYMBOL_GPL(hrtimer_start);
  /**
   * hrtimer_try_to_cancel - try to deactivate a timer
   * @timer:    hrtimer to stop
@@@ -1062,10 -1020,19 +1022,19 @@@ int hrtimer_try_to_cancel(struct hrtime
        unsigned long flags;
        int ret = -1;
  
+       /*
+        * Check lockless first. If the timer is not active (neither
+        * enqueued nor running the callback), nothing to do here. The
+        * base lock does not serialize against a concurrent enqueue,
+        * so we can avoid taking it.
+        */
+       if (!hrtimer_active(timer))
+               return 0;
        base = lock_hrtimer_base(timer, &flags);
  
        if (!hrtimer_callback_running(timer))
-               ret = remove_hrtimer(timer, base);
+               ret = remove_hrtimer(timer, base, false);
  
        unlock_hrtimer_base(timer, &flags);
  
@@@ -1115,26 -1082,22 +1084,22 @@@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining
  /**
   * hrtimer_get_next_event - get the time until next expiry event
   *
-  * Returns the delta to the next expiry event or KTIME_MAX if no timer
-  * is pending.
+  * Returns the next expiry time or KTIME_MAX if no timer is pending.
   */
- ktime_t hrtimer_get_next_event(void)
+ u64 hrtimer_get_next_event(void)
  {
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-       ktime_t mindelta = { .tv64 = KTIME_MAX };
+       u64 expires = KTIME_MAX;
        unsigned long flags;
  
        raw_spin_lock_irqsave(&cpu_base->lock, flags);
  
-       if (!hrtimer_hres_active())
-               mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base),
-                                    ktime_get());
+       if (!__hrtimer_hres_active(cpu_base))
+               expires = __hrtimer_get_next_event(cpu_base).tv64;
  
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
  
-       if (mindelta.tv64 < 0)
-               mindelta.tv64 = 0;
-       return mindelta;
+       return expires;
  }
  #endif
  
@@@ -1176,37 -1139,73 +1141,73 @@@ void hrtimer_init(struct hrtimer *timer
  }
  EXPORT_SYMBOL_GPL(hrtimer_init);
  
- /**
-  * hrtimer_get_res - get the timer resolution for a clock
-  * @which_clock: which clock to query
-  * @tp:                pointer to timespec variable to store the resolution
+ /*
+  * A timer is active when it is enqueued into the rbtree, when its
+  * callback function is running, or when it is in the process of being
+  * migrated to another cpu.
   *
-  * Store the resolution of the clock selected by @which_clock in the
-  * variable pointed to by @tp.
+  * It is important for this function to not return a false negative.
   */
- int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
+ bool hrtimer_active(const struct hrtimer *timer)
  {
        struct hrtimer_cpu_base *cpu_base;
-       int base = hrtimer_clockid_to_base(which_clock);
+       unsigned int seq;
  
-       cpu_base = raw_cpu_ptr(&hrtimer_bases);
-       *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
+       do {
+               cpu_base = READ_ONCE(timer->base->cpu_base);
+               seq = raw_read_seqcount_begin(&cpu_base->seq);
  
-       return 0;
+               if (timer->state != HRTIMER_STATE_INACTIVE ||
+                   cpu_base->running == timer)
+                       return true;
+       } while (read_seqcount_retry(&cpu_base->seq, seq) ||
+                cpu_base != READ_ONCE(timer->base->cpu_base));
+       return false;
  }
- EXPORT_SYMBOL_GPL(hrtimer_get_res);
+ EXPORT_SYMBOL_GPL(hrtimer_active);
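
hrtimer_active() above pairs a seqcount with the timer state and cpu_base->running so the lockless reader never reports a false negative. A single-threaded toy that mirrors only the control flow of that retry loop; the real code additionally relies on seqcount primitives and memory barriers:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_base {
	unsigned int seq;	/* bumped by the writer around state changes */
	const void *running;	/* timer whose callback is executing, or NULL */
};

struct toy_timer {
	struct toy_base *base;
	int state;		/* 0 == HRTIMER_STATE_INACTIVE */
};

static bool toy_timer_active(const struct toy_timer *timer)
{
	struct toy_base *base;
	unsigned int seq;

	do {
		base = timer->base;
		seq = base->seq;

		if (timer->state != 0 || base->running == timer)
			return true;	/* enqueued or callback running */
	} while (seq != base->seq || base != timer->base);	/* retry on races */

	return false;
}

int main(void)
{
	struct toy_base cpu_base = { .seq = 0, .running = NULL };
	struct toy_timer t = { .base = &cpu_base, .state = 0 };

	printf("inactive: %d\n", toy_timer_active(&t));	/* 0 */
	cpu_base.running = &t;
	printf("running:  %d\n", toy_timer_active(&t));	/* 1 */
	return 0;
}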
  
- static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
+ /*
+  * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
+  * distinct sections:
+  *
+  *  - queued: the timer is queued
+  *  - callback:       the timer is being run
+  *  - post:   the timer is inactive or (re)queued
+  *
+  * On the read side we ensure we observe timer->state and cpu_base->running
+  * from the same section, if anything changed while we looked at it, we retry.
+  * This includes timer->base changing because sequence numbers alone are
+  * insufficient for that.
+  *
+  * The sequence numbers are required because otherwise we could still observe
+  * a false negative if the read side got smeared over multiple consecutive
+  * __run_hrtimer() invocations.
+  */
+ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
+                         struct hrtimer_clock_base *base,
+                         struct hrtimer *timer, ktime_t *now)
  {
-       struct hrtimer_clock_base *base = timer->base;
-       struct hrtimer_cpu_base *cpu_base = base->cpu_base;
        enum hrtimer_restart (*fn)(struct hrtimer *);
        int restart;
  
-       WARN_ON(!irqs_disabled());
+       lockdep_assert_held(&cpu_base->lock);
  
        debug_deactivate(timer);
-       __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+       cpu_base->running = timer;
+       /*
+        * Separate the ->running assignment from the ->state assignment.
+        *
+        * As with a regular write barrier, this ensures the read side in
+        * hrtimer_active() cannot observe cpu_base->running == NULL &&
+        * timer->state == INACTIVE.
+        */
+       raw_write_seqcount_barrier(&cpu_base->seq);
+       __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
        timer_stats_account_hrtimer(timer);
        fn = timer->function;
  
        raw_spin_lock(&cpu_base->lock);
  
        /*
-        * Note: We clear the CALLBACK bit after enqueue_hrtimer and
+        * Note: We clear the running state after enqueue_hrtimer and
         * we do not reprogramm the event hardware. Happens either in
         * hrtimer_start_range_ns() or in hrtimer_interrupt()
+        *
+        * Note: Because we dropped the cpu_base->lock above,
+        * hrtimer_start_range_ns() can have popped in and enqueued the timer
+        * for us already.
         */
-       if (restart != HRTIMER_NORESTART) {
-               BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
+       if (restart != HRTIMER_NORESTART &&
+           !(timer->state & HRTIMER_STATE_ENQUEUED))
                enqueue_hrtimer(timer, base);
-       }
  
-       WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+       /*
+        * Separate the ->running assignment from the ->state assignment.
+        *
+        * As with a regular write barrier, this ensures the read side in
+        * hrtimer_active() cannot observe cpu_base->running == NULL &&
+        * timer->state == INACTIVE.
+        */
+       raw_write_seqcount_barrier(&cpu_base->seq);
  
-       timer->state &= ~HRTIMER_STATE_CALLBACK;
+       WARN_ON_ONCE(cpu_base->running != timer);
+       cpu_base->running = NULL;
  }
  
- #ifdef CONFIG_HIGH_RES_TIMERS
- /*
-  * High resolution timer interrupt
-  * Called with interrupts disabled
-  */
- void hrtimer_interrupt(struct clock_event_device *dev)
+ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
  {
-       struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-       ktime_t expires_next, now, entry_time, delta;
-       int i, retries = 0;
-       BUG_ON(!cpu_base->hres_active);
-       cpu_base->nr_events++;
-       dev->next_event.tv64 = KTIME_MAX;
-       raw_spin_lock(&cpu_base->lock);
-       entry_time = now = hrtimer_update_base(cpu_base);
- retry:
-       cpu_base->in_hrtirq = 1;
-       /*
-        * We set expires_next to KTIME_MAX here with cpu_base->lock
-        * held to prevent that a timer is enqueued in our queue via
-        * the migration code. This does not affect enqueueing of
-        * timers which run their callback and need to be requeued on
-        * this CPU.
-        */
-       cpu_base->expires_next.tv64 = KTIME_MAX;
+       struct hrtimer_clock_base *base = cpu_base->clock_base;
+       unsigned int active = cpu_base->active_bases;
  
-       for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
-               struct hrtimer_clock_base *base;
+       for (; active; base++, active >>= 1) {
                struct timerqueue_node *node;
                ktime_t basenow;
  
-               if (!(cpu_base->active_bases & (1 << i)))
+               if (!(active & 0x01))
                        continue;
  
-               base = cpu_base->clock_base + i;
                basenow = ktime_add(now, base->offset);
  
                while ((node = timerqueue_getnext(&base->active))) {
                        if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
                                break;
  
-                       __run_hrtimer(timer, &basenow);
+                       __run_hrtimer(cpu_base, base, timer, &basenow);
                }
        }
+ }
+ #ifdef CONFIG_HIGH_RES_TIMERS
+ /*
+  * High resolution timer interrupt
+  * Called with interrupts disabled
+  */
+ void hrtimer_interrupt(struct clock_event_device *dev)
+ {
+       struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+       ktime_t expires_next, now, entry_time, delta;
+       int retries = 0;
+       BUG_ON(!cpu_base->hres_active);
+       cpu_base->nr_events++;
+       dev->next_event.tv64 = KTIME_MAX;
+       raw_spin_lock(&cpu_base->lock);
+       entry_time = now = hrtimer_update_base(cpu_base);
+ retry:
+       cpu_base->in_hrtirq = 1;
+       /*
+        * We set expires_next to KTIME_MAX here with cpu_base->lock
+        * held to prevent a timer from being enqueued in our queue via
+        * the migration code. This does not affect the enqueueing of
+        * timers which run their callback and need to be requeued on
+        * this CPU.
+        */
+       cpu_base->expires_next.tv64 = KTIME_MAX;
+       __hrtimer_run_queues(cpu_base, now);
        /* Reevaluate the clock bases for the next expiry */
        expires_next = __hrtimer_get_next_event(cpu_base);
        /*
        raw_spin_unlock(&cpu_base->lock);
  
        /* Reprogramming necessary ? */
-       if (expires_next.tv64 == KTIME_MAX ||
-           !tick_program_event(expires_next, 0)) {
+       if (!tick_program_event(expires_next, 0)) {
                cpu_base->hang_detected = 0;
                return;
        }
        cpu_base->hang_detected = 1;
        raw_spin_unlock(&cpu_base->lock);
        delta = ktime_sub(now, entry_time);
-       if (delta.tv64 > cpu_base->max_hang_time.tv64)
-               cpu_base->max_hang_time = delta;
+       if ((unsigned int)delta.tv64 > cpu_base->max_hang_time)
+               cpu_base->max_hang_time = (unsigned int) delta.tv64;
        /*
         * Limit it to a sensible value as we enforce a longer
         * delay. Give the CPU at least 100ms to catch up.
   * local version of hrtimer_peek_ahead_timers() called with interrupts
   * disabled.
   */
- static void __hrtimer_peek_ahead_timers(void)
+ static inline void __hrtimer_peek_ahead_timers(void)
  {
        struct tick_device *td;
  
                hrtimer_interrupt(td->evtdev);
  }
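
Back in hrtimer_interrupt() above: when tick_program_event() keeps failing
because the computed expiry already lies in the past, the handler retries a
bounded number of times (the retries counter and the retry: label in the hunk
above), then flags hang_detected, records how long it has been looping in
max_hang_time and programs a deliberately later event instead of spinning
forever. A small standalone sketch of that bounded-retry-then-back-off shape,
with hypothetical numbers and names rather than the kernel's exact policy:

    #include <stdio.h>

    enum { MAX_RETRIES = 3 };

    /* Pretend reprogramming fails while the requested expiry is in the past. */
    static int program_event(long long expires_ns, long long now_ns)
    {
            return expires_ns > now_ns;         /* non-zero == success */
    }

    int main(void)
    {
            long long now = 1000000, next = 999000;  /* expiry already passed */
            long long entry = now;
            int retries;

            for (retries = 0; retries < MAX_RETRIES; retries++) {
                    /* ... expire whatever became due, recompute next ... */
                    if (program_event(next, now))
                            return 0;           /* reprogrammed, done */
                    now += 250000;              /* expiring took more time */
            }

            /* Hang: stop retrying and back off by a bounded amount so the
             * CPU gets a chance to catch up before the next interrupt. */
            {
                    long long spent = now - entry;
                    long long cap = 100000000;  /* 100ms in ns */
                    long long backoff = spent > cap ? cap : spent;

                    printf("hang detected after %lldns, next event at %lld\n",
                           spent, now + backoff);
            }
            return 1;
    }
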
  
- /**
-  * hrtimer_peek_ahead_timers -- run soft-expired timers now
-  *
-  * hrtimer_peek_ahead_timers will peek at the timer queue of
-  * the current cpu and check if there are any timers for which
-  * the soft expires time has passed. If any such timers exist,
-  * they are run immediately and then removed from the timer queue.
-  *
-  */
- void hrtimer_peek_ahead_timers(void)
- {
-       unsigned long flags;
-       local_irq_save(flags);
-       __hrtimer_peek_ahead_timers();
-       local_irq_restore(flags);
- }
- static void run_hrtimer_softirq(struct softirq_action *h)
- {
-       hrtimer_peek_ahead_timers();
- }
  #else /* CONFIG_HIGH_RES_TIMERS */
  
  static inline void __hrtimer_peek_ahead_timers(void) { }
  #endif        /* !CONFIG_HIGH_RES_TIMERS */
  
  /*
-  * Called from timer softirq every jiffy, expire hrtimers:
-  *
-  * For HRT its the fall back code to run the softirq in the timer
-  * softirq context in case the hrtimer initialization failed or has
-  * not been done yet.
+  * Called from run_local_timers in hardirq context every jiffy
   */
- void hrtimer_run_pending(void)
+ void hrtimer_run_queues(void)
  {
-       if (hrtimer_hres_active())
+       struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+       ktime_t now;
+       if (__hrtimer_hres_active(cpu_base))
                return;
  
        /*
-        * This _is_ ugly: We have to check in the softirq context,
-        * whether we can switch to highres and / or nohz mode. The
-        * clocksource switch happens in the timer interrupt with
-        * xtime_lock held. Notification from there only sets the
-        * check bit in the tick_oneshot code, otherwise we might
-        * deadlock vs. xtime_lock.
+        * This _is_ ugly: We have to check periodically whether we can
+        * switch to highres and/or nohz mode. The clocksource
+        * switch happens with xtime_lock held. Notification from
+        * there only sets the check bit in the tick_oneshot code,
+        * otherwise we might deadlock vs. xtime_lock.
         */
-       if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
+       if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
                hrtimer_switch_to_hres();
- }
- /*
-  * Called from hardirq context every jiffy
-  */
- void hrtimer_run_queues(void)
- {
-       struct timerqueue_node *node;
-       struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-       struct hrtimer_clock_base *base;
-       int index, gettime = 1;
-       if (hrtimer_hres_active())
                return;
-       for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
-               base = &cpu_base->clock_base[index];
-               if (!timerqueue_getnext(&base->active))
-                       continue;
-               if (gettime) {
-                       hrtimer_get_softirq_time(cpu_base);
-                       gettime = 0;
-               }
-               raw_spin_lock(&cpu_base->lock);
-               while ((node = timerqueue_getnext(&base->active))) {
-                       struct hrtimer *timer;
-                       timer = container_of(node, struct hrtimer, node);
-                       if (base->softirq_time.tv64 <=
-                                       hrtimer_get_expires_tv64(timer))
-                               break;
-                       __run_hrtimer(timer, &base->softirq_time);
-               }
-               raw_spin_unlock(&cpu_base->lock);
        }
+       raw_spin_lock(&cpu_base->lock);
+       now = hrtimer_update_base(cpu_base);
+       __hrtimer_run_queues(cpu_base, now);
+       raw_spin_unlock(&cpu_base->lock);
  }
  
  /*
@@@ -1497,8 -1456,6 +1458,6 @@@ static int __sched do_nanosleep(struct 
        do {
                set_current_state(TASK_INTERRUPTIBLE);
                hrtimer_start_expires(&t->timer, mode);
-               if (!hrtimer_active(&t->timer))
-                       t->task = NULL;
  
                if (likely(t->task))
                        freezable_schedule();
@@@ -1642,11 -1599,11 +1601,11 @@@ static void migrate_hrtimer_list(struc
                debug_deactivate(timer);
  
                /*
-                * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+                * Mark it as ENQUEUED, not INACTIVE, otherwise the
                 * timer could be seen as !active and just vanish away
                 * under us on another CPU
                 */
-               __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
+               __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
                timer->base = new_base;
                /*
                 * Enqueue the timers on the new cpu. This does not
                 * event device.
                 */
                enqueue_hrtimer(timer, new_base);
-               /* Clear the migration state bit */
-               timer->state &= ~HRTIMER_STATE_MIGRATE;
        }
  }
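
With STATE_MIGRATE folded into ENQUEUED here, and the CALLBACK bit replaced by
the cpu_base->running pointer earlier in this file, the per-timer state is
effectively reduced to a single "is it queued" bit. Illustrative summary only;
the authoritative definitions live in include/linux/hrtimer.h:

    #define HRTIMER_STATE_INACTIVE  0x00    /* not queued on any clock base */
    #define HRTIMER_STATE_ENQUEUED  0x01    /* queued on a clock base       */
    /* "running its callback" is no longer a timer->state bit at all; it is
     * tracked as cpu_base->running == timer, as __run_hrtimer() shows above. */
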
  
@@@ -1731,9 -1685,6 +1687,6 @@@ void __init hrtimers_init(void
        hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
                          (void *)(long)smp_processor_id());
        register_cpu_notifier(&hrtimers_nb);
- #ifdef CONFIG_HIGH_RES_TIMERS
-       open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
- #endif
  }
  
  /**
@@@ -1772,8 -1723,6 +1725,6 @@@ schedule_hrtimeout_range_clock(ktime_t 
        hrtimer_init_sleeper(&t, current);
  
        hrtimer_start_expires(&t.timer, mode);
-       if (!hrtimer_active(&t.timer))
-               t.task = NULL;
  
        if (likely(t.task))
                schedule();
diff --combined net/sched/sch_api.c
index 1e1c89e51a118e79610c49412e335191fc3ba834,45bc63ae18e3ae9a5fe2a80e9de4763cac39cebd..73a123daa2cc5c4c43c69120d1fecd273df76c17
@@@ -815,8 -815,10 +815,8 @@@ static int qdisc_graft(struct net_devic
                if (dev->flags & IFF_UP)
                        dev_deactivate(dev);
  
 -              if (new && new->ops->attach) {
 -                      new->ops->attach(new);
 -                      num_q = 0;
 -              }
 +              if (new && new->ops->attach)
 +                      goto skip;
  
                for (i = 0; i < num_q; i++) {
                        struct netdev_queue *dev_queue = dev_ingress_queue(dev);
                                qdisc_destroy(old);
                }
  
 +skip:
                if (!ingress) {
                        notify_and_destroy(net, skb, n, classid,
                                           dev->qdisc, new);
                        if (new && !new->ops->attach)
                                atomic_inc(&new->refcnt);
                        dev->qdisc = new ? : &noop_qdisc;
 +
 +                      if (new && new->ops->attach)
 +                              new->ops->attach(new);
                } else {
                        notify_and_destroy(net, skb, n, classid, old, new);
                }
@@@ -1885,13 -1883,10 +1885,10 @@@ EXPORT_SYMBOL(tcf_destroy_chain)
  #ifdef CONFIG_PROC_FS
  static int psched_show(struct seq_file *seq, void *v)
  {
-       struct timespec ts;
-       hrtimer_get_res(CLOCK_MONOTONIC, &ts);
        seq_printf(seq, "%08x %08x %08x %08x\n",
                   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
                   1000000,
-                  (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
+                  (u32)NSEC_PER_SEC / hrtimer_resolution);
  
        return 0;
  }
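
With hrtimer_get_res() gone, the fourth field of /proc/net/psched is simply
NSEC_PER_SEC / hrtimer_resolution. A small userspace reader, assuming only the
four-hex-field format that psched_show() prints above:

    #include <stdio.h>

    int main(void)
    {
            unsigned int t2us, us2t, clk_res, hres;
            FILE *f = fopen("/proc/net/psched", "r");

            if (!f || fscanf(f, "%x %x %x %x",
                             &t2us, &us2t, &clk_res, &hres) != 4) {
                    fprintf(stderr, "could not parse /proc/net/psched\n");
                    return 1;
            }
            fclose(f);

            /* The fourth field counts hrtimer ticks per second; invert it to
             * recover the resolution in nanoseconds. */
            if (hres)
                    printf("hrtimer resolution: %u ns\n", 1000000000u / hres);
            return 0;
    }

Utilities such as tc derive their clock parameters from this file, which is why
the output keeps its historic shape even though the kernel side now reads a
plain global instead of querying the clock.
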