git.kernelconcepts.de Git - karo-tx-linux.git/commitdiff
Merge branch 'for-3.17' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 4 Aug 2014 17:09:27 +0000 (10:09 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 4 Aug 2014 17:09:27 +0000 (10:09 -0700)
Pull percpu updates from Tejun Heo:

 - Major reorganization of percpu header files which I think makes
   things a lot more readable and logical than before.

 - percpu-refcount is updated so that it requires explicit destruction
   and can be reinitialized if necessary.  This was pulled into the
   block tree to replace the custom percpu refcnting implemented in
   blk-mq.

 - In the process, percpu and percpu-refcount got cleaned up a bit

* 'for-3.17' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu: (21 commits)
  percpu-refcount: implement percpu_ref_reinit() and percpu_ref_is_zero()
  percpu-refcount: require percpu_ref to be exited explicitly
  percpu-refcount: use unsigned long for pcpu_count pointer
  percpu-refcount: add helpers for ->percpu_count accesses
  percpu-refcount: one bit is enough for REF_STATUS
  percpu-refcount, aio: use percpu_ref_cancel_init() in ioctx_alloc()
  workqueue: stronger test in process_one_work()
  workqueue: clear POOL_DISASSOCIATED in rebind_workers()
  percpu: Use ALIGN macro instead of hand coding alignment calculation
  percpu: invoke __verify_pcpu_ptr() from the generic part of accessors and operations
  percpu: preffity percpu header files
  percpu: use raw_cpu_*() to define __this_cpu_*()
  percpu: reorder macros in percpu header files
  percpu: move {raw|this}_cpu_*() definitions to include/linux/percpu-defs.h
  percpu: move generic {raw|this}_cpu_*_N() definitions to include/asm-generic/percpu.h
  percpu: only allow sized arch overrides for {raw|this}_cpu_*() ops
  percpu: reorganize include/linux/percpu-defs.h
  percpu: move accessors from include/linux/percpu.h to percpu-defs.h
  percpu: include/asm-generic/percpu.h should contain only arch-overridable parts
  percpu: introduce arch_raw_cpu_ptr()
  ...
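
The percpu-refcount rework summarized in the pull message above makes destruction explicit: a ref is no longer torn down implicitly once it drains to zero, and a drained (but not yet exited) ref can be brought back with percpu_ref_reinit(). Below is a minimal sketch of that lifecycle against the 3.17-era API; the foo structure, its release callback and the call sites are illustrative and not taken from this tree.

  #include <linux/percpu-refcount.h>
  #include <linux/slab.h>
  #include <linux/workqueue.h>

  struct foo {
          struct percpu_ref       ref;
          struct work_struct      free_work;
  };

  static void foo_free_work(struct work_struct *work)
  {
          struct foo *foo = container_of(work, struct foo, free_work);

          /*
           * New rule from this series: the percpu counter must be freed
           * explicitly with percpu_ref_exit() (cf. free_ioctx() in the
           * fs/aio.c hunks below).
           */
          percpu_ref_exit(&foo->ref);
          kfree(foo);
  }

  static void foo_ref_release(struct percpu_ref *ref)
  {
          struct foo *foo = container_of(ref, struct foo, ref);

          schedule_work(&foo->free_work);         /* last reference gone */
  }

  static struct foo *foo_create(void)
  {
          struct foo *foo = kzalloc(sizeof(*foo), GFP_KERNEL);

          if (!foo)
                  return NULL;

          INIT_WORK(&foo->free_work, foo_free_work);

          /* 3.17-era signature: just the ref and the release callback */
          if (percpu_ref_init(&foo->ref, foo_ref_release)) {
                  kfree(foo);
                  return NULL;
          }
          return foo;
  }

  static void foo_destroy(struct foo *foo)
  {
          /* switch to atomic mode and drop the base reference */
          percpu_ref_kill(&foo->ref);
  }

percpu_ref_reinit() complements this: a ref that has been killed and has drained to zero (checkable with percpu_ref_is_zero()) but has not been exited can be switched back to percpu mode, which is what the block tree relies on to replace the custom refcounting in blk-mq mentioned above.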

fs/aio.c
include/linux/percpu-defs.h
kernel/cgroup.c
kernel/workqueue.c

diff --combined fs/aio.c
index 1c9c5f0a9e2be991b0bcba9ab71bcdcc1ae8ddc4,ea1bc2e8f4f30c2c05d1fd597e0c32cb80bd0c41..bd7ec2cc2674d73ab71405769ce53b922a0c0082
+++ b/fs/aio.c
@@@ -506,6 -506,8 +506,8 @@@ static void free_ioctx(struct work_stru
  
        aio_free_ring(ctx);
        free_percpu(ctx->cpu);
+       percpu_ref_exit(&ctx->reqs);
+       percpu_ref_exit(&ctx->users);
        kmem_cache_free(kioctx_cachep, ctx);
  }
  
@@@ -715,8 -717,8 +717,8 @@@ err_ctx
  err:
        mutex_unlock(&ctx->ring_lock);
        free_percpu(ctx->cpu);
-       free_percpu(ctx->reqs.pcpu_count);
-       free_percpu(ctx->users.pcpu_count);
+       percpu_ref_exit(&ctx->reqs);
+       percpu_ref_exit(&ctx->users);
        kmem_cache_free(kioctx_cachep, ctx);
        pr_debug("error allocating ioctx %d\n", err);
        return ERR_PTR(err);
@@@ -830,20 -832,16 +832,20 @@@ void exit_aio(struct mm_struct *mm
  static void put_reqs_available(struct kioctx *ctx, unsigned nr)
  {
        struct kioctx_cpu *kcpu;
 +      unsigned long flags;
  
        preempt_disable();
        kcpu = this_cpu_ptr(ctx->cpu);
  
 +      local_irq_save(flags);
        kcpu->reqs_available += nr;
 +
        while (kcpu->reqs_available >= ctx->req_batch * 2) {
                kcpu->reqs_available -= ctx->req_batch;
                atomic_add(ctx->req_batch, &ctx->reqs_available);
        }
  
 +      local_irq_restore(flags);
        preempt_enable();
  }
  
@@@ -851,12 -849,10 +853,12 @@@ static bool get_reqs_available(struct k
  {
        struct kioctx_cpu *kcpu;
        bool ret = false;
 +      unsigned long flags;
  
        preempt_disable();
        kcpu = this_cpu_ptr(ctx->cpu);
  
 +      local_irq_save(flags);
        if (!kcpu->reqs_available) {
                int old, avail = atomic_read(&ctx->reqs_available);
  
        ret = true;
        kcpu->reqs_available--;
  out:
 +      local_irq_restore(flags);
        preempt_enable();
        return ret;
  }
@@@ -1028,7 -1023,6 +1030,7 @@@ void aio_complete(struct kiocb *iocb, l
  
        /* everything turned out well, dispose of the aiocb. */
        kiocb_free(iocb);
 +      put_reqs_available(ctx, 1);
  
        /*
         * We have to order our ring_info tail store above and test
@@@ -1070,9 -1064,6 +1072,9 @@@ static long aio_read_events_ring(struc
        if (head == tail)
                goto out;
  
 +      head %= ctx->nr_events;
 +      tail %= ctx->nr_events;
 +
        while (ret < nr) {
                long avail;
                struct io_event *ev;
        flush_dcache_page(ctx->ring_pages[0]);
  
        pr_debug("%li  h%u t%u\n", ret, head, tail);
 -
 -      put_reqs_available(ctx, ret);
  out:
        mutex_unlock(&ctx->ring_lock);
  
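The fs/aio.c hunks above do two related things: put_reqs_available() moves into aio_complete(), so it can now run from interrupt context, and both put_reqs_available() and get_reqs_available() therefore wrap their per-cpu batch counter in local_irq_save()/local_irq_restore() in addition to preempt_disable(). A standalone sketch of that pattern, with made-up wrapper names, might look like this:

  #include <linux/percpu.h>
  #include <linux/preempt.h>
  #include <linux/irqflags.h>
  #include <linux/atomic.h>

  /* hypothetical counter split into a global pool plus per-cpu batches */
  struct batch_counter {
          unsigned                reqs_available; /* this CPU's stash */
  };

  struct req_pool {
          struct batch_counter __percpu *cpu;
          atomic_t                reqs_available; /* global pool */
          unsigned                req_batch;
  };

  /* may now be reached from irq context, like put_reqs_available() above */
  static void pool_put(struct req_pool *p, unsigned nr)
  {
          struct batch_counter *kcpu;
          unsigned long flags;

          preempt_disable();
          kcpu = this_cpu_ptr(p->cpu);

          /*
           * irqs off: an interrupt on this CPU must not observe a
           * half-done read-modify-write of the same per-cpu slot.
           */
          local_irq_save(flags);
          kcpu->reqs_available += nr;
          while (kcpu->reqs_available >= p->req_batch * 2) {
                  kcpu->reqs_available -= p->req_batch;
                  atomic_add(p->req_batch, &p->reqs_available);
          }
          local_irq_restore(flags);

          preempt_enable();
  }

preempt_disable() only keeps the task on this CPU; once the same counter can also be modified from an interrupt handler there, the irq-save section is what makes the read-modify-write safe against it.
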
diff --combined include/linux/percpu-defs.h
index dec01d6c3f80088f046bfd8057f8d3488a891d52,c93fff16776c8be213368e05fc3d2d1a2198999a..cfd56046ecec3b07bf3f266617206a46c0b2c12e
@@@ -1,6 -1,40 +1,40 @@@
+ /*
+  * linux/percpu-defs.h - basic definitions for percpu areas
+  *
+  * DO NOT INCLUDE DIRECTLY OUTSIDE PERCPU IMPLEMENTATION PROPER.
+  *
+  * This file is separate from linux/percpu.h to avoid cyclic inclusion
+  * dependency from arch header files.  Only to be included from
+  * asm/percpu.h.
+  *
+  * This file includes macros necessary to declare percpu sections and
+  * variables, and definitions of percpu accessors and operations.  It
+  * should provide enough percpu features to arch header files even when
+  * they can only include asm/percpu.h to avoid cyclic inclusion dependency.
+  */
  #ifndef _LINUX_PERCPU_DEFS_H
  #define _LINUX_PERCPU_DEFS_H
  
+ #ifdef CONFIG_SMP
+ #ifdef MODULE
+ #define PER_CPU_SHARED_ALIGNED_SECTION ""
+ #define PER_CPU_ALIGNED_SECTION ""
+ #else
+ #define PER_CPU_SHARED_ALIGNED_SECTION "..shared_aligned"
+ #define PER_CPU_ALIGNED_SECTION "..shared_aligned"
+ #endif
+ #define PER_CPU_FIRST_SECTION "..first"
+ #else
+ #define PER_CPU_SHARED_ALIGNED_SECTION ""
+ #define PER_CPU_ALIGNED_SECTION "..shared_aligned"
+ #define PER_CPU_FIRST_SECTION ""
+ #endif
  /*
   * Base implementations of per-CPU variable declarations and definitions, where
   * the section in which the variable is to be placed is provided by the
  #define __PCPU_DUMMY_ATTRS                                            \
        __attribute__((section(".discard"), unused))
  
- /*
-  * Macro which verifies @ptr is a percpu pointer without evaluating
-  * @ptr.  This is to be used in percpu accessors to verify that the
-  * input parameter is a percpu pointer.
-  *
-  * + 0 is required in order to convert the pointer type from a
-  * potential array type to a pointer to a single item of the array.
-  */
- #define __verify_pcpu_ptr(ptr)        do {                                    \
-       const void __percpu *__vpp_verify = (typeof((ptr) + 0))NULL;    \
-       (void)__vpp_verify;                                             \
- } while (0)
  /*
   * s390 and alpha modules require percpu variables to be defined as
   * weak to force the compiler to generate GOT based external
   * Declaration/definition used for per-CPU variables that must be read mostly.
   */
  #define DECLARE_PER_CPU_READ_MOSTLY(type, name)                       \
 -      DECLARE_PER_CPU_SECTION(type, name, "..readmostly")
 +      DECLARE_PER_CPU_SECTION(type, name, "..read_mostly")
  
  #define DEFINE_PER_CPU_READ_MOSTLY(type, name)                                \
 -      DEFINE_PER_CPU_SECTION(type, name, "..readmostly")
 +      DEFINE_PER_CPU_SECTION(type, name, "..read_mostly")
  
  /*
   * Intermodule exports for per-CPU variables.  sparse forgets about
  #define EXPORT_PER_CPU_SYMBOL_GPL(var)
  #endif
  
+ /*
+  * Accessors and operations.
+  */
+ #ifndef __ASSEMBLY__
+ /*
+  * __verify_pcpu_ptr() verifies @ptr is a percpu pointer without evaluating
+  * @ptr and is invoked once before a percpu area is accessed by all
+  * accessors and operations.  This is performed in the generic part of
+  * percpu and arch overrides don't need to worry about it; however, if an
+  * arch wants to implement an arch-specific percpu accessor or operation,
+  * it may use __verify_pcpu_ptr() to verify the parameters.
+  *
+  * + 0 is required in order to convert the pointer type from a
+  * potential array type to a pointer to a single item of the array.
+  */
+ #define __verify_pcpu_ptr(ptr)                                                \
+ do {                                                                  \
+       const void __percpu *__vpp_verify = (typeof((ptr) + 0))NULL;    \
+       (void)__vpp_verify;                                             \
+ } while (0)
+ #ifdef CONFIG_SMP
+ /*
+  * Add an offset to a pointer but keep the pointer as-is.  Use RELOC_HIDE()
+  * to prevent the compiler from making incorrect assumptions about the
+  * pointer value.  The weird cast keeps both GCC and sparse happy.
+  */
+ #define SHIFT_PERCPU_PTR(__p, __offset)                                       \
+       RELOC_HIDE((typeof(*(__p)) __kernel __force *)(__p), (__offset))
+ #define per_cpu_ptr(ptr, cpu)                                         \
+ ({                                                                    \
+       __verify_pcpu_ptr(ptr);                                         \
+       SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)));                 \
+ })
+ #define raw_cpu_ptr(ptr)                                              \
+ ({                                                                    \
+       __verify_pcpu_ptr(ptr);                                         \
+       arch_raw_cpu_ptr(ptr);                                          \
+ })
+ #ifdef CONFIG_DEBUG_PREEMPT
+ #define this_cpu_ptr(ptr)                                             \
+ ({                                                                    \
+       __verify_pcpu_ptr(ptr);                                         \
+       SHIFT_PERCPU_PTR(ptr, my_cpu_offset);                           \
+ })
+ #else
+ #define this_cpu_ptr(ptr) raw_cpu_ptr(ptr)
+ #endif
+ #else /* CONFIG_SMP */
+ #define VERIFY_PERCPU_PTR(__p)                                                \
+ ({                                                                    \
+       __verify_pcpu_ptr(__p);                                         \
+       (typeof(*(__p)) __kernel __force *)(__p);                       \
+ })
+ #define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); VERIFY_PERCPU_PTR(ptr); })
+ #define raw_cpu_ptr(ptr)      per_cpu_ptr(ptr, 0)
+ #define this_cpu_ptr(ptr)     raw_cpu_ptr(ptr)
+ #endif        /* CONFIG_SMP */
+ #define per_cpu(var, cpu)     (*per_cpu_ptr(&(var), cpu))
+ #define __raw_get_cpu_var(var)        (*raw_cpu_ptr(&(var)))
+ #define __get_cpu_var(var)    (*this_cpu_ptr(&(var)))
+ /* keep until we have removed all uses of __this_cpu_ptr */
+ #define __this_cpu_ptr(ptr)   raw_cpu_ptr(ptr)
+ /*
+  * Must be an lvalue. Since @var must be a simple identifier,
+  * we force a syntax error here if it isn't.
+  */
+ #define get_cpu_var(var)                                              \
+ (*({                                                                  \
+       preempt_disable();                                              \
+       this_cpu_ptr(&var);                                             \
+ }))
+ /*
+  * The weird & is necessary because sparse considers (void)(var) to be
+  * a direct dereference of percpu variable (var).
+  */
+ #define put_cpu_var(var)                                              \
+ do {                                                                  \
+       (void)&(var);                                                   \
+       preempt_enable();                                               \
+ } while (0)
+ #define get_cpu_ptr(var)                                              \
+ ({                                                                    \
+       preempt_disable();                                              \
+       this_cpu_ptr(var);                                              \
+ })
+ #define put_cpu_ptr(var)                                              \
+ do {                                                                  \
+       (void)(var);                                                    \
+       preempt_enable();                                               \
+ } while (0)
+ /*
+  * Branching function to split up a function into a set of functions that
+  * are called for different scalar sizes of the objects handled.
+  */
+ extern void __bad_size_call_parameter(void);
+ #ifdef CONFIG_DEBUG_PREEMPT
+ extern void __this_cpu_preempt_check(const char *op);
+ #else
+ static inline void __this_cpu_preempt_check(const char *op) { }
+ #endif
+ #define __pcpu_size_call_return(stem, variable)                               \
+ ({                                                                    \
+       typeof(variable) pscr_ret__;                                    \
+       __verify_pcpu_ptr(&(variable));                                 \
+       switch(sizeof(variable)) {                                      \
+       case 1: pscr_ret__ = stem##1(variable); break;                  \
+       case 2: pscr_ret__ = stem##2(variable); break;                  \
+       case 4: pscr_ret__ = stem##4(variable); break;                  \
+       case 8: pscr_ret__ = stem##8(variable); break;                  \
+       default:                                                        \
+               __bad_size_call_parameter(); break;                     \
+       }                                                               \
+       pscr_ret__;                                                     \
+ })
+ #define __pcpu_size_call_return2(stem, variable, ...)                 \
+ ({                                                                    \
+       typeof(variable) pscr2_ret__;                                   \
+       __verify_pcpu_ptr(&(variable));                                 \
+       switch(sizeof(variable)) {                                      \
+       case 1: pscr2_ret__ = stem##1(variable, __VA_ARGS__); break;    \
+       case 2: pscr2_ret__ = stem##2(variable, __VA_ARGS__); break;    \
+       case 4: pscr2_ret__ = stem##4(variable, __VA_ARGS__); break;    \
+       case 8: pscr2_ret__ = stem##8(variable, __VA_ARGS__); break;    \
+       default:                                                        \
+               __bad_size_call_parameter(); break;                     \
+       }                                                               \
+       pscr2_ret__;                                                    \
+ })
+ /*
+  * Special handling for cmpxchg_double.  cmpxchg_double is passed two
+  * percpu variables.  The first has to be aligned to a double word
+  * boundary and the second has to follow directly thereafter.
+  * We enforce this on all architectures even if they don't support
+  * a double cmpxchg instruction, since it's a cheap requirement, and it
+  * avoids breaking the requirement for architectures with the instruction.
+  */
+ #define __pcpu_double_call_return_bool(stem, pcp1, pcp2, ...)         \
+ ({                                                                    \
+       bool pdcrb_ret__;                                               \
+       __verify_pcpu_ptr(&(pcp1));                                     \
+       BUILD_BUG_ON(sizeof(pcp1) != sizeof(pcp2));                     \
+       VM_BUG_ON((unsigned long)(&(pcp1)) % (2 * sizeof(pcp1)));       \
+       VM_BUG_ON((unsigned long)(&(pcp2)) !=                           \
+                 (unsigned long)(&(pcp1)) + sizeof(pcp1));             \
+       switch(sizeof(pcp1)) {                                          \
+       case 1: pdcrb_ret__ = stem##1(pcp1, pcp2, __VA_ARGS__); break;  \
+       case 2: pdcrb_ret__ = stem##2(pcp1, pcp2, __VA_ARGS__); break;  \
+       case 4: pdcrb_ret__ = stem##4(pcp1, pcp2, __VA_ARGS__); break;  \
+       case 8: pdcrb_ret__ = stem##8(pcp1, pcp2, __VA_ARGS__); break;  \
+       default:                                                        \
+               __bad_size_call_parameter(); break;                     \
+       }                                                               \
+       pdcrb_ret__;                                                    \
+ })
+ #define __pcpu_size_call(stem, variable, ...)                         \
+ do {                                                                  \
+       __verify_pcpu_ptr(&(variable));                                 \
+       switch(sizeof(variable)) {                                      \
+               case 1: stem##1(variable, __VA_ARGS__);break;           \
+               case 2: stem##2(variable, __VA_ARGS__);break;           \
+               case 4: stem##4(variable, __VA_ARGS__);break;           \
+               case 8: stem##8(variable, __VA_ARGS__);break;           \
+               default:                                                \
+                       __bad_size_call_parameter();break;              \
+       }                                                               \
+ } while (0)
+ /*
+  * this_cpu operations (C) 2008-2013 Christoph Lameter <cl@linux.com>
+  *
+  * Optimized manipulation for memory allocated through the per cpu
+  * allocator or for addresses of per cpu variables.
+  *
+  * These operation guarantee exclusivity of access for other operations
+  * on the *same* processor. The assumption is that per cpu data is only
+  * accessed by a single processor instance (the current one).
+  *
+  * The arch code can provide optimized implementation by defining macros
+  * for certain scalar sizes. F.e. provide this_cpu_add_2() to provide per
+  * cpu atomic operations for 2 byte sized RMW actions. If arch code does
+  * not provide operations for a scalar size then the fallback in the
+  * generic code will be used.
+  *
+  * cmpxchg_double replaces two adjacent scalars at once.  The first two
+  * parameters are per cpu variables which have to be of the same size.  A
+  * truth value is returned to indicate success or failure (since a double
+  * register result is difficult to handle).  There is very limited hardware
+  * support for these operations, so only certain sizes may work.
+  */
+ /*
+  * Operations for contexts where we do not want to do any checks for
+  * preemptions.  Unless strictly necessary, always use [__]this_cpu_*()
+  * instead.
+  *
+  * If there is no other protection through preempt disable and/or disabling
+  * interupts then one of these RMW operations can show unexpected behavior
+  * because the execution thread was rescheduled on another processor or an
+  * interrupt occurred and the same percpu variable was modified from the
+  * interrupt context.
+  */
+ #define raw_cpu_read(pcp)             __pcpu_size_call_return(raw_cpu_read_, pcp)
+ #define raw_cpu_write(pcp, val)               __pcpu_size_call(raw_cpu_write_, pcp, val)
+ #define raw_cpu_add(pcp, val)         __pcpu_size_call(raw_cpu_add_, pcp, val)
+ #define raw_cpu_and(pcp, val)         __pcpu_size_call(raw_cpu_and_, pcp, val)
+ #define raw_cpu_or(pcp, val)          __pcpu_size_call(raw_cpu_or_, pcp, val)
+ #define raw_cpu_add_return(pcp, val)  __pcpu_size_call_return2(raw_cpu_add_return_, pcp, val)
+ #define raw_cpu_xchg(pcp, nval)               __pcpu_size_call_return2(raw_cpu_xchg_, pcp, nval)
+ #define raw_cpu_cmpxchg(pcp, oval, nval) \
+       __pcpu_size_call_return2(raw_cpu_cmpxchg_, pcp, oval, nval)
+ #define raw_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
+       __pcpu_double_call_return_bool(raw_cpu_cmpxchg_double_, pcp1, pcp2, oval1, oval2, nval1, nval2)
+ #define raw_cpu_sub(pcp, val)         raw_cpu_add(pcp, -(val))
+ #define raw_cpu_inc(pcp)              raw_cpu_add(pcp, 1)
+ #define raw_cpu_dec(pcp)              raw_cpu_sub(pcp, 1)
+ #define raw_cpu_sub_return(pcp, val)  raw_cpu_add_return(pcp, -(typeof(pcp))(val))
+ #define raw_cpu_inc_return(pcp)               raw_cpu_add_return(pcp, 1)
+ #define raw_cpu_dec_return(pcp)               raw_cpu_add_return(pcp, -1)
+ /*
+  * Operations for contexts that are safe from preemption/interrupts.  These
+  * operations verify that preemption is disabled.
+  */
+ #define __this_cpu_read(pcp)                                          \
+ ({                                                                    \
+       __this_cpu_preempt_check("read");                               \
+       raw_cpu_read(pcp);                                              \
+ })
+ #define __this_cpu_write(pcp, val)                                    \
+ ({                                                                    \
+       __this_cpu_preempt_check("write");                              \
+       raw_cpu_write(pcp, val);                                        \
+ })
+ #define __this_cpu_add(pcp, val)                                      \
+ ({                                                                    \
+       __this_cpu_preempt_check("add");                                \
+       raw_cpu_add(pcp, val);                                          \
+ })
+ #define __this_cpu_and(pcp, val)                                      \
+ ({                                                                    \
+       __this_cpu_preempt_check("and");                                \
+       raw_cpu_and(pcp, val);                                          \
+ })
+ #define __this_cpu_or(pcp, val)                                               \
+ ({                                                                    \
+       __this_cpu_preempt_check("or");                                 \
+       raw_cpu_or(pcp, val);                                           \
+ })
+ #define __this_cpu_add_return(pcp, val)                                       \
+ ({                                                                    \
+       __this_cpu_preempt_check("add_return");                         \
+       raw_cpu_add_return(pcp, val);                                   \
+ })
+ #define __this_cpu_xchg(pcp, nval)                                    \
+ ({                                                                    \
+       __this_cpu_preempt_check("xchg");                               \
+       raw_cpu_xchg(pcp, nval);                                        \
+ })
+ #define __this_cpu_cmpxchg(pcp, oval, nval)                           \
+ ({                                                                    \
+       __this_cpu_preempt_check("cmpxchg");                            \
+       raw_cpu_cmpxchg(pcp, oval, nval);                               \
+ })
+ #define __this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
+ ({    __this_cpu_preempt_check("cmpxchg_double");                     \
+       raw_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2); \
+ })
+ #define __this_cpu_sub(pcp, val)      __this_cpu_add(pcp, -(typeof(pcp))(val))
+ #define __this_cpu_inc(pcp)           __this_cpu_add(pcp, 1)
+ #define __this_cpu_dec(pcp)           __this_cpu_sub(pcp, 1)
+ #define __this_cpu_sub_return(pcp, val)       __this_cpu_add_return(pcp, -(typeof(pcp))(val))
+ #define __this_cpu_inc_return(pcp)    __this_cpu_add_return(pcp, 1)
+ #define __this_cpu_dec_return(pcp)    __this_cpu_add_return(pcp, -1)
+ /*
+  * Operations with implied preemption protection.  These operations can be
+  * used without worrying about preemption.  Note that interrupts may still
+  * occur while an operation is in progress and if the interrupt modifies
+  * the variable too then RMW actions may not be reliable.
+  */
+ #define this_cpu_read(pcp)            __pcpu_size_call_return(this_cpu_read_, pcp)
+ #define this_cpu_write(pcp, val)      __pcpu_size_call(this_cpu_write_, pcp, val)
+ #define this_cpu_add(pcp, val)                __pcpu_size_call(this_cpu_add_, pcp, val)
+ #define this_cpu_and(pcp, val)                __pcpu_size_call(this_cpu_and_, pcp, val)
+ #define this_cpu_or(pcp, val)         __pcpu_size_call(this_cpu_or_, pcp, val)
+ #define this_cpu_add_return(pcp, val) __pcpu_size_call_return2(this_cpu_add_return_, pcp, val)
+ #define this_cpu_xchg(pcp, nval)      __pcpu_size_call_return2(this_cpu_xchg_, pcp, nval)
+ #define this_cpu_cmpxchg(pcp, oval, nval) \
+       __pcpu_size_call_return2(this_cpu_cmpxchg_, pcp, oval, nval)
+ #define this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
+       __pcpu_double_call_return_bool(this_cpu_cmpxchg_double_, pcp1, pcp2, oval1, oval2, nval1, nval2)
+ #define this_cpu_sub(pcp, val)                this_cpu_add(pcp, -(typeof(pcp))(val))
+ #define this_cpu_inc(pcp)             this_cpu_add(pcp, 1)
+ #define this_cpu_dec(pcp)             this_cpu_sub(pcp, 1)
+ #define this_cpu_sub_return(pcp, val) this_cpu_add_return(pcp, -(typeof(pcp))(val))
+ #define this_cpu_inc_return(pcp)      this_cpu_add_return(pcp, 1)
+ #define this_cpu_dec_return(pcp)      this_cpu_add_return(pcp, -1)
+ #endif /* __ASSEMBLY__ */
  #endif /* _LINUX_PERCPU_DEFS_H */
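
include/linux/percpu-defs.h now carries the accessors and {raw|this}_cpu operations that used to live in percpu.h and asm-generic/percpu.h. As a quick orientation, here is an illustrative user of the flavours defined above - this_cpu_*() (preemption-safe on its own), __this_cpu_*() (caller provides the protection, verified under CONFIG_DEBUG_PREEMPT) and the per_cpu() accessors - built around a hypothetical counter; regular code includes linux/percpu.h rather than this header directly:

  #include <linux/percpu.h>
  #include <linux/preempt.h>

  /* hypothetical per-cpu counter */
  static DEFINE_PER_CPU(unsigned long, pkt_count);

  /* this_cpu_*(): a single op that is preemption-safe on its own */
  static void count_packet(void)
  {
          this_cpu_inc(pkt_count);
  }

  /*
   * __this_cpu_*(): caller keeps preemption off; checked via
   * __this_cpu_preempt_check() when CONFIG_DEBUG_PREEMPT is enabled.
   */
  static void count_burst(unsigned long nr)
  {
          preempt_disable();
          __this_cpu_add(pkt_count, nr);
          __this_cpu_inc(pkt_count);
          preempt_enable();
  }

  /* reading another CPU's copy goes through per_cpu()/per_cpu_ptr() */
  static unsigned long count_total(void)
  {
          unsigned long sum = 0;
          int cpu;

          for_each_possible_cpu(cpu)
                  sum += per_cpu(pkt_count, cpu);
          return sum;
  }
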
diff --combined kernel/cgroup.c
index 70776aec2562b7180ddc2a39bef55603f584ea15,c06aa5e257a89ab3e1e9bbf777713636bf222661..aad41f06901b57cb69235e6141c8501c2365183a
@@@ -1638,7 -1638,7 +1638,7 @@@ destroy_root
  exit_root_id:
        cgroup_exit_root_id(root);
  cancel_ref:
-       percpu_ref_cancel_init(&root_cgrp->self.refcnt);
+       percpu_ref_exit(&root_cgrp->self.refcnt);
  out:
        free_cgrp_cset_links(&tmp_links);
        return ret;
@@@ -1648,13 -1648,10 +1648,13 @@@ static struct dentry *cgroup_mount(stru
                         int flags, const char *unused_dev_name,
                         void *data)
  {
 +      struct super_block *pinned_sb = NULL;
 +      struct cgroup_subsys *ss;
        struct cgroup_root *root;
        struct cgroup_sb_opts opts;
        struct dentry *dentry;
        int ret;
 +      int i;
        bool new_sb;
  
        /*
                goto out_unlock;
        }
  
 +      /*
 +       * Destruction of cgroup root is asynchronous, so subsystems may
 +       * still be dying after the previous unmount.  Let's drain the
 +       * dying subsystems.  We just need to ensure that the ones
 +       * unmounted previously finish dying and don't care about new ones
 +       * starting.  Testing ref liveliness is good enough.
 +       */
 +      for_each_subsys(ss, i) {
 +              if (!(opts.subsys_mask & (1 << i)) ||
 +                  ss->root == &cgrp_dfl_root)
 +                      continue;
 +
 +              if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
 +                      mutex_unlock(&cgroup_mutex);
 +                      msleep(10);
 +                      ret = restart_syscall();
 +                      goto out_free;
 +              }
 +              cgroup_put(&ss->root->cgrp);
 +      }
 +
        for_each_root(root) {
                bool name_match = false;
  
                }
  
                /*
 -               * A root's lifetime is governed by its root cgroup.
 -               * tryget_live failure indicate that the root is being
 -               * destroyed.  Wait for destruction to complete so that the
 -               * subsystems are free.  We can use wait_queue for the wait
 -               * but this path is super cold.  Let's just sleep for a bit
 -               * and retry.
 +               * We want to reuse @root whose lifetime is governed by its
 +               * ->cgrp.  Let's check whether @root is alive and keep it
 +               * that way.  As cgroup_kill_sb() can happen anytime, we
 +               * want to block it by pinning the sb so that @root doesn't
 +               * get killed before mount is complete.
 +               *
 +               * With the sb pinned, tryget_live can reliably indicate
 +               * whether @root can be reused.  If it's being killed,
 +               * drain it.  We can use wait_queue for the wait but this
 +               * path is super cold.  Let's just sleep a bit and retry.
                 */
 -              if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
 +              pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
 +              if (IS_ERR(pinned_sb) ||
 +                  !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
                        mutex_unlock(&cgroup_mutex);
 +                      if (!IS_ERR_OR_NULL(pinned_sb))
 +                              deactivate_super(pinned_sb);
                        msleep(10);
                        ret = restart_syscall();
                        goto out_free;
@@@ -1802,16 -1770,6 +1802,16 @@@ out_free
                                CGROUP_SUPER_MAGIC, &new_sb);
        if (IS_ERR(dentry) || !new_sb)
                cgroup_put(&root->cgrp);
 +
 +      /*
 +       * If @pinned_sb, we're reusing an existing root and holding an
 +       * extra ref on its sb.  Mount is complete.  Put the extra ref.
 +       */
 +      if (pinned_sb) {
 +              WARN_ON(new_sb);
 +              deactivate_super(pinned_sb);
 +      }
 +
        return dentry;
  }
  
@@@ -3370,7 -3328,7 +3370,7 @@@ bool css_has_online_children(struct cgr
  
        rcu_read_lock();
        css_for_each_child(child, css) {
 -              if (css->flags & CSS_ONLINE) {
 +              if (child->flags & CSS_ONLINE) {
                        ret = true;
                        break;
                }
@@@ -4175,6 -4133,8 +4175,8 @@@ static void css_free_work_fn(struct wor
                container_of(work, struct cgroup_subsys_state, destroy_work);
        struct cgroup *cgrp = css->cgroup;
  
+       percpu_ref_exit(&css->refcnt);
        if (css->ss) {
                /* css free path */
                if (css->parent)
@@@ -4372,7 -4332,7 +4374,7 @@@ err_list_del
  err_free_id:
        cgroup_idr_remove(&ss->css_idr, css->id);
  err_free_percpu_ref:
-       percpu_ref_cancel_init(&css->refcnt);
+       percpu_ref_exit(&css->refcnt);
  err_free_css:
        call_rcu(&css->rcu_head, css_free_rcu_fn);
        return err;
@@@ -4483,7 -4443,7 +4485,7 @@@ static int cgroup_mkdir(struct kernfs_n
  out_free_id:
        cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
  out_cancel_ref:
-       percpu_ref_cancel_init(&cgrp->self.refcnt);
+       percpu_ref_exit(&cgrp->self.refcnt);
  out_free_cgrp:
        kfree(cgrp);
  out_unlock:
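
The cgroup hunks above convert the error paths from percpu_ref_cancel_init() to the new percpu_ref_exit() and use percpu_ref_tryget_live() to detect roots that are still dying, draining them by sleeping briefly and restarting the syscall. A minimal sketch of that liveness-check idiom (the object and helper names are illustrative):

  #include <linux/percpu-refcount.h>

  struct foo {
          struct percpu_ref       ref;
          /* ... */
  };

  /* returns @foo with a reference held, or NULL once it is dying */
  static struct foo *foo_get_live(struct foo *foo)
  {
          /* fails as soon as percpu_ref_kill() has run on the ref */
          if (!percpu_ref_tryget_live(&foo->ref))
                  return NULL;
          return foo;
  }

  static void foo_put(struct foo *foo)
  {
          percpu_ref_put(&foo->ref);
  }

cgroup_mount() applies exactly this check to each root's ->self.refcnt; since that path is super cold, a failed check is handled with msleep() and restart_syscall() rather than a wait queue.
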
diff --combined kernel/workqueue.c
index 7a2e449a96b18b080b5d74e60a0c3b753ebac88e,a3021d63f62d531f51ff72e467075bab579e73c0..5dbe22aa3efd48f92b39e6ccd25b63880dc1015b
@@@ -265,6 -265,7 +265,6 @@@ struct workqueue_struct 
  
  static struct kmem_cache *pwq_cache;
  
 -static int wq_numa_tbl_len;           /* highest possible NUMA node id + 1 */
  static cpumask_var_t *wq_numa_possible_cpumask;
                                        /* possible CPUs of each node */
  
@@@ -757,6 -758,13 +757,6 @@@ static bool too_many_workers(struct wor
        int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
        int nr_busy = pool->nr_workers - nr_idle;
  
 -      /*
 -       * nr_idle and idle_list may disagree if idle rebinding is in
 -       * progress.  Never return %true if idle_list is empty.
 -       */
 -      if (list_empty(&pool->idle_list))
 -              return false;
 -
        return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
  }
  
@@@ -842,7 -850,7 +842,7 @@@ struct task_struct *wq_worker_sleeping(
        pool = worker->pool;
  
        /* this can only happen on the local cpu */
 -      if (WARN_ON_ONCE(cpu != raw_smp_processor_id()))
 +      if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
                return NULL;
  
        /*
   * worker_set_flags - set worker flags and adjust nr_running accordingly
   * @worker: self
   * @flags: flags to set
 - * @wakeup: wakeup an idle worker if necessary
   *
 - * Set @flags in @worker->flags and adjust nr_running accordingly.  If
 - * nr_running becomes zero and @wakeup is %true, an idle worker is
 - * woken up.
 + * Set @flags in @worker->flags and adjust nr_running accordingly.
   *
   * CONTEXT:
   * spin_lock_irq(pool->lock)
   */
 -static inline void worker_set_flags(struct worker *worker, unsigned int flags,
 -                                  bool wakeup)
 +static inline void worker_set_flags(struct worker *worker, unsigned int flags)
  {
        struct worker_pool *pool = worker->pool;
  
        WARN_ON_ONCE(worker->task != current);
  
 -      /*
 -       * If transitioning into NOT_RUNNING, adjust nr_running and
 -       * wake up an idle worker as necessary if requested by
 -       * @wakeup.
 -       */
 +      /* If transitioning into NOT_RUNNING, adjust nr_running. */
        if ((flags & WORKER_NOT_RUNNING) &&
            !(worker->flags & WORKER_NOT_RUNNING)) {
 -              if (wakeup) {
 -                      if (atomic_dec_and_test(&pool->nr_running) &&
 -                          !list_empty(&pool->worklist))
 -                              wake_up_worker(pool);
 -              } else
 -                      atomic_dec(&pool->nr_running);
 +              atomic_dec(&pool->nr_running);
        }
  
        worker->flags |= flags;
@@@ -1211,7 -1232,7 +1211,7 @@@ static int try_to_grab_pending(struct w
                        pwq_activate_delayed_work(work);
  
                list_del_init(&work->entry);
 -              pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work));
 +              pwq_dec_nr_in_flight(pwq, get_work_color(work));
  
                /* work->data points to pwq iff queued, point to pool */
                set_work_pool_and_keep_pending(work, pool->id);
@@@ -1539,7 -1560,7 +1539,7 @@@ static void worker_enter_idle(struct wo
                         (worker->hentry.next || worker->hentry.pprev)))
                return;
  
 -      /* can't use worker_set_flags(), also called from start_worker() */
 +      /* can't use worker_set_flags(), also called from create_worker() */
        worker->flags |= WORKER_IDLE;
        pool->nr_idle++;
        worker->last_active = jiffies;
@@@ -1581,11 -1602,11 +1581,11 @@@ static void worker_leave_idle(struct wo
        list_del_init(&worker->entry);
  }
  
 -static struct worker *alloc_worker(void)
 +static struct worker *alloc_worker(int node)
  {
        struct worker *worker;
  
 -      worker = kzalloc(sizeof(*worker), GFP_KERNEL);
 +      worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);
        if (worker) {
                INIT_LIST_HEAD(&worker->entry);
                INIT_LIST_HEAD(&worker->scheduled);
@@@ -1649,9 -1670,6 +1649,9 @@@ static void worker_detach_from_pool(str
                detach_completion = pool->detach_completion;
        mutex_unlock(&pool->attach_mutex);
  
 +      /* clear leftover flags without pool->lock after it is detached */
 +      worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
 +
        if (detach_completion)
                complete(detach_completion);
  }
   * create_worker - create a new workqueue worker
   * @pool: pool the new worker will belong to
   *
 - * Create a new worker which is attached to @pool.  The new worker must be
 - * started by start_worker().
 + * Create and start a new worker which is attached to @pool.
   *
   * CONTEXT:
   * Might sleep.  Does GFP_KERNEL allocations.
@@@ -1679,7 -1698,7 +1679,7 @@@ static struct worker *create_worker(str
        if (id < 0)
                goto fail;
  
 -      worker = alloc_worker();
 +      worker = alloc_worker(pool->node);
        if (!worker)
                goto fail;
  
        /* successful, attach the worker to the pool */
        worker_attach_to_pool(worker, pool);
  
 +      /* start the newly created worker */
 +      spin_lock_irq(&pool->lock);
 +      worker->pool->nr_workers++;
 +      worker_enter_idle(worker);
 +      wake_up_process(worker->task);
 +      spin_unlock_irq(&pool->lock);
 +
        return worker;
  
  fail:
        return NULL;
  }
  
 -/**
 - * start_worker - start a newly created worker
 - * @worker: worker to start
 - *
 - * Make the pool aware of @worker and start it.
 - *
 - * CONTEXT:
 - * spin_lock_irq(pool->lock).
 - */
 -static void start_worker(struct worker *worker)
 -{
 -      worker->pool->nr_workers++;
 -      worker_enter_idle(worker);
 -      wake_up_process(worker->task);
 -}
 -
 -/**
 - * create_and_start_worker - create and start a worker for a pool
 - * @pool: the target pool
 - *
 - * Grab the managership of @pool and create and start a new worker for it.
 - *
 - * Return: 0 on success. A negative error code otherwise.
 - */
 -static int create_and_start_worker(struct worker_pool *pool)
 -{
 -      struct worker *worker;
 -
 -      worker = create_worker(pool);
 -      if (worker) {
 -              spin_lock_irq(&pool->lock);
 -              start_worker(worker);
 -              spin_unlock_irq(&pool->lock);
 -      }
 -
 -      return worker ? 0 : -ENOMEM;
 -}
 -
  /**
   * destroy_worker - destroy a workqueue worker
   * @worker: worker to be destroyed
@@@ -1859,10 -1909,23 +1859,10 @@@ restart
        mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
  
        while (true) {
 -              struct worker *worker;
 -
 -              worker = create_worker(pool);
 -              if (worker) {
 -                      del_timer_sync(&pool->mayday_timer);
 -                      spin_lock_irq(&pool->lock);
 -                      start_worker(worker);
 -                      if (WARN_ON_ONCE(need_to_create_worker(pool)))
 -                              goto restart;
 -                      return true;
 -              }
 -
 -              if (!need_to_create_worker(pool))
 +              if (create_worker(pool) || !need_to_create_worker(pool))
                        break;
  
 -              __set_current_state(TASK_INTERRUPTIBLE);
 -              schedule_timeout(CREATE_COOLDOWN);
 +              schedule_timeout_interruptible(CREATE_COOLDOWN);
  
                if (!need_to_create_worker(pool))
                        break;
  
        del_timer_sync(&pool->mayday_timer);
        spin_lock_irq(&pool->lock);
 +      /*
 +       * This is necessary even after a new worker was just successfully
 +       * created as @pool->lock was dropped and the new worker might have
 +       * already become busy.
 +       */
        if (need_to_create_worker(pool))
                goto restart;
        return true;
@@@ -1962,6 -2020,7 +1962,7 @@@ __acquires(&pool->lock
  
        lockdep_copy_map(&lockdep_map, &work->lockdep_map);
  #endif
+       /* ensure we're on the correct CPU */
        WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
                     raw_smp_processor_id() != pool->cpu);
  
        list_del_init(&work->entry);
  
        /*
 -       * CPU intensive works don't participate in concurrency
 -       * management.  They're the scheduler's responsibility.
 +       * CPU intensive works don't participate in concurrency management.
 +       * They're the scheduler's responsibility.  This takes @worker out
 +       * of concurrency management and the next code block will chain
 +       * execution of the pending work items.
         */
        if (unlikely(cpu_intensive))
 -              worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
 +              worker_set_flags(worker, WORKER_CPU_INTENSIVE);
  
        /*
 -       * Unbound pool isn't concurrency managed and work items should be
 -       * executed ASAP.  Wake up another worker if necessary.
 +       * Wake up another worker if necessary.  The condition is always
 +       * false for normal per-cpu workers since nr_running would always
 +       * be >= 1 at this point.  This is used to chain execution of the
 +       * pending work items for WORKER_NOT_RUNNING workers such as the
 +       * UNBOUND and CPU_INTENSIVE ones.
         */
 -      if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
 +      if (need_more_worker(pool))
                wake_up_worker(pool);
  
        /*
@@@ -2159,7 -2213,7 +2160,7 @@@ recheck
                }
        } while (keep_working(pool));
  
 -      worker_set_flags(worker, WORKER_PREP, false);
 +      worker_set_flags(worker, WORKER_PREP);
  sleep:
        /*
         * pool->lock is held and there's no work to process and no need to
@@@ -2252,27 -2306,29 +2253,27 @@@ repeat
                                move_linked_works(work, scheduled, &n);
  
                process_scheduled_works(rescuer);
 -              spin_unlock_irq(&pool->lock);
 -
 -              worker_detach_from_pool(rescuer, pool);
 -
 -              spin_lock_irq(&pool->lock);
  
                /*
                 * Put the reference grabbed by send_mayday().  @pool won't
 -               * go away while we're holding its lock.
 +               * go away while we're still attached to it.
                 */
                put_pwq(pwq);
  
                /*
 -               * Leave this pool.  If keep_working() is %true, notify a
 +               * Leave this pool.  If need_more_worker() is %true, notify a
                 * regular worker; otherwise, we end up with 0 concurrency
                 * and stalling the execution.
                 */
 -              if (keep_working(pool))
 +              if (need_more_worker(pool))
                        wake_up_worker(pool);
  
                rescuer->pool = NULL;
 -              spin_unlock(&pool->lock);
 -              spin_lock(&wq_mayday_lock);
 +              spin_unlock_irq(&pool->lock);
 +
 +              worker_detach_from_pool(rescuer, pool);
 +
 +              spin_lock_irq(&wq_mayday_lock);
        }
  
        spin_unlock_irq(&wq_mayday_lock);
@@@ -3223,7 -3279,6 +3224,7 @@@ int workqueue_sysfs_register(struct wor
                }
        }
  
 +      dev_set_uevent_suppress(&wq_dev->dev, false);
        kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
        return 0;
  }
@@@ -3397,7 -3452,7 +3398,7 @@@ static void put_unbound_pool(struct wor
                return;
  
        /* sanity checks */
 -      if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) ||
 +      if (WARN_ON(!(pool->cpu < 0)) ||
            WARN_ON(!list_empty(&pool->worklist)))
                return;
  
@@@ -3463,7 -3518,7 +3464,7 @@@ static struct worker_pool *get_unbound_
        hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
                if (wqattrs_equal(pool->attrs, attrs)) {
                        pool->refcnt++;
 -                      goto out_unlock;
 +                      return pool;
                }
        }
  
                goto fail;
  
        /* create and start the initial worker */
 -      if (create_and_start_worker(pool) < 0)
 +      if (!create_worker(pool))
                goto fail;
  
        /* install */
        hash_add(unbound_pool_hash, &pool->hash_node, hash);
 -out_unlock:
 +
        return pool;
  fail:
        if (pool)
@@@ -3530,6 -3585,11 +3531,6 @@@ static void pwq_unbound_release_workfn(
        if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
                return;
  
 -      /*
 -       * Unlink @pwq.  Synchronization against wq->mutex isn't strictly
 -       * necessary on release but do it anyway.  It's easier to verify
 -       * and consistent with the linking path.
 -       */
        mutex_lock(&wq->mutex);
        list_del_rcu(&pwq->pwqs_node);
        is_last = list_empty(&wq->pwqs);
@@@ -3626,7 -3686,10 +3627,7 @@@ static void link_pwq(struct pool_workqu
        if (!list_empty(&pwq->pwqs_node))
                return;
  
 -      /*
 -       * Set the matching work_color.  This is synchronized with
 -       * wq->mutex to avoid confusing flush_workqueue().
 -       */
 +      /* set the matching work_color */
        pwq->work_color = wq->work_color;
  
        /* sync max_active to the current setting */
@@@ -3763,7 -3826,7 +3764,7 @@@ int apply_workqueue_attrs(struct workqu
        if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
                return -EINVAL;
  
 -      pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL);
 +      pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL);
        new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
        tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
        if (!pwq_tbl || !new_attrs || !tmp_attrs)
@@@ -4011,7 -4074,7 +4012,7 @@@ struct workqueue_struct *__alloc_workqu
  
        /* allocate wq and format name */
        if (flags & WQ_UNBOUND)
 -              tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
 +              tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);
  
        wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
        if (!wq)
        if (flags & WQ_MEM_RECLAIM) {
                struct worker *rescuer;
  
 -              rescuer = alloc_worker();
 +              rescuer = alloc_worker(NUMA_NO_NODE);
                if (!rescuer)
                        goto err_destroy;
  
@@@ -4401,6 -4464,8 +4402,6 @@@ static void wq_unbind_fn(struct work_st
        struct worker *worker;
  
        for_each_cpu_worker_pool(pool, cpu) {
 -              WARN_ON_ONCE(cpu != smp_processor_id());
 -
                mutex_lock(&pool->attach_mutex);
                spin_lock_irq(&pool->lock);
  
@@@ -4562,7 -4627,7 +4563,7 @@@ static int workqueue_cpu_up_callback(st
                for_each_cpu_worker_pool(pool, cpu) {
                        if (pool->nr_workers)
                                continue;
 -                      if (create_and_start_worker(pool) < 0)
 +                      if (!create_worker(pool))
                                return NOTIFY_BAD;
                }
                break;
                for_each_pool(pool, pi) {
                        mutex_lock(&pool->attach_mutex);
  
-                       if (pool->cpu == cpu) {
+                       if (pool->cpu == cpu)
                                rebind_workers(pool);
-                       } else if (pool->cpu < 0) {
+                       else if (pool->cpu < 0)
                                restore_unbound_workers_cpumask(pool, cpu);
-                       }
  
                        mutex_unlock(&pool->attach_mutex);
                }
@@@ -4782,6 -4846,10 +4782,6 @@@ static void __init wq_numa_init(void
        cpumask_var_t *tbl;
        int node, cpu;
  
 -      /* determine NUMA pwq table len - highest node id + 1 */
 -      for_each_node(node)
 -              wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1);
 -
        if (num_possible_nodes() <= 1)
                return;
  
         * available.  Build one from cpu_to_node() which should have been
         * fully initialized by now.
         */
 -      tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL);
 +      tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
        BUG_ON(!tbl);
  
        for_each_node(node)
 -              BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
 +              BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
                                node_online(node) ? node : NUMA_NO_NODE));
  
        for_each_possible_cpu(cpu) {
@@@ -4858,7 -4926,7 +4858,7 @@@ static int __init init_workqueues(void
  
                for_each_cpu_worker_pool(pool, cpu) {
                        pool->flags &= ~POOL_DISASSOCIATED;
 -                      BUG_ON(create_and_start_worker(pool) < 0);
 +                      BUG_ON(!create_worker(pool));
                }
        }
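
Among the workqueue changes above, the locally computed wq_numa_tbl_len is dropped in favour of the generic nr_node_ids, and the per-node cpumask table switches to zeroing, node-local allocations via zalloc_cpumask_var_node(). A hypothetical stand-alone version of that table setup could look like the following; the function name and error handling are illustrative (wq_numa_init() itself just BUG_ON()s on allocation failure):

  #include <linux/cpumask.h>
  #include <linux/nodemask.h>
  #include <linux/topology.h>
  #include <linux/slab.h>

  static cpumask_var_t *alloc_node_cpumasks(void)
  {
          cpumask_var_t *tbl;
          int node, cpu;

          /* nr_node_ids is already "highest possible node id + 1" */
          tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
          if (!tbl)
                  return NULL;

          for_each_node(node) {
                  /* zeroed, node-local allocation as in the hunk above */
                  if (!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
                                               node_online(node) ? node : NUMA_NO_NODE))
                          goto fail;
          }

          for_each_possible_cpu(cpu)
                  cpumask_set_cpu(cpu, tbl[cpu_to_node(cpu)]);

          return tbl;

  fail:
          for (node = 0; node < nr_node_ids; node++)
                  free_cpumask_var(tbl[node]);    /* safe for zeroed slots */
          kfree(tbl);
          return NULL;
  }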