]> git.kernelconcepts.de Git - karo-tx-linux.git/blobdiff - kernel/workqueue.c
Merge branch 'drm-fixes' of git://people.freedesktop.org/~airlied/linux
[karo-tx-linux.git] / kernel / workqueue.c
index 41ff75b478c60b443cd80626351b1e3f3030f94b..586ad91300b0f3924b41bd417cda9cbb076f1a9d 100644 (file)
@@ -159,6 +159,7 @@ struct worker_pool {
 
        /* see manage_workers() for details on the two manager mutexes */
        struct mutex            manager_arb;    /* manager arbitration */
+       struct worker           *manager;       /* L: purely informational */
        struct mutex            attach_mutex;   /* attach/detach exclusion */
        struct list_head        workers;        /* A: attached workers */
        struct completion       *detach_completion; /* all workers detached */
@@ -230,7 +231,7 @@ struct wq_device;
  */
 struct workqueue_struct {
        struct list_head        pwqs;           /* WR: all pwqs of this wq */
-       struct list_head        list;           /* PL: list of all workqueues */
+       struct list_head        list;           /* PR: list of all workqueues */
 
        struct mutex            mutex;          /* protects this wq */
        int                     work_color;     /* WQ: current work color */
@@ -257,6 +258,13 @@ struct workqueue_struct {
 #endif
        char                    name[WQ_NAME_LEN]; /* I: workqueue name */
 
+       /*
+        * Destruction of workqueue_struct is sched-RCU protected to allow
+        * walking the workqueues list without grabbing wq_pool_mutex.
+        * This is used to dump all workqueues from sysrq.
+        */
+       struct rcu_head         rcu;
+
        /* hot fields used during command issue, aligned to cacheline */
        unsigned int            flags ____cacheline_aligned; /* WQ: WQ_* flags */
        struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
@@ -288,7 +296,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
 static DEFINE_MUTEX(wq_pool_mutex);    /* protects pools and workqueues list */
 static DEFINE_SPINLOCK(wq_mayday_lock);        /* protects wq->maydays list */
 
-static LIST_HEAD(workqueues);          /* PL: list of all workqueues */
+static LIST_HEAD(workqueues);          /* PR: list of all workqueues */
 static bool workqueue_freezing;                /* PL: have wqs started freezing? */
 
 /* the per-cpu worker pools */
@@ -324,6 +332,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
 static int worker_thread(void *__worker);
 static void copy_workqueue_attrs(struct workqueue_attrs *to,
                                 const struct workqueue_attrs *from);
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/workqueue.h>
@@ -1911,9 +1920,11 @@ static bool manage_workers(struct worker *worker)
         */
        if (!mutex_trylock(&pool->manager_arb))
                return false;
+       pool->manager = worker;
 
        maybe_create_worker(pool);
 
+       pool->manager = NULL;
        mutex_unlock(&pool->manager_arb);
        return true;
 }
@@ -2303,6 +2314,7 @@ repeat:
 struct wq_barrier {
        struct work_struct      work;
        struct completion       done;
+       struct task_struct      *task;  /* purely informational */
 };
 
 static void wq_barrier_func(struct work_struct *work)
@@ -2351,6 +2363,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
        INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
        __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
        init_completion(&barr->done);
+       barr->task = current;
 
        /*
         * If @target is currently being executed, schedule the
@@ -2989,624 +3002,319 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew)
 }
 EXPORT_SYMBOL_GPL(execute_in_process_context);
 
-#ifdef CONFIG_SYSFS
-/*
- * Workqueues with WQ_SYSFS flag set is visible to userland via
- * /sys/bus/workqueue/devices/WQ_NAME.  All visible workqueues have the
- * following attributes.
- *
- *  per_cpu    RO bool : whether the workqueue is per-cpu or unbound
- *  max_active RW int  : maximum number of in-flight work items
- *
- * Unbound workqueues have the following extra attributes.
+/**
+ * free_workqueue_attrs - free a workqueue_attrs
+ * @attrs: workqueue_attrs to free
  *
- *  id         RO int  : the associated pool ID
- *  nice       RW int  : nice value of the workers
- *  cpumask    RW mask : bitmask of allowed CPUs for the workers
+ * Undo alloc_workqueue_attrs().
  */
-struct wq_device {
-       struct workqueue_struct         *wq;
-       struct device                   dev;
-};
-
-static struct workqueue_struct *dev_to_wq(struct device *dev)
+void free_workqueue_attrs(struct workqueue_attrs *attrs)
 {
-       struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
-
-       return wq_dev->wq;
+       if (attrs) {
+               free_cpumask_var(attrs->cpumask);
+               kfree(attrs);
+       }
 }
 
-static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
-                           char *buf)
+/**
+ * alloc_workqueue_attrs - allocate a workqueue_attrs
+ * @gfp_mask: allocation mask to use
+ *
+ * Allocate a new workqueue_attrs, initialize with default settings and
+ * return it.
+ *
+ * Return: The newly allocated workqueue_attrs on success. %NULL on failure.
+ */
+struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask)
 {
-       struct workqueue_struct *wq = dev_to_wq(dev);
+       struct workqueue_attrs *attrs;
 
-       return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
+       attrs = kzalloc(sizeof(*attrs), gfp_mask);
+       if (!attrs)
+               goto fail;
+       if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask))
+               goto fail;
+
+       cpumask_copy(attrs->cpumask, cpu_possible_mask);
+       return attrs;
+fail:
+       free_workqueue_attrs(attrs);
+       return NULL;
 }
-static DEVICE_ATTR_RO(per_cpu);
 
-static ssize_t max_active_show(struct device *dev,
-                              struct device_attribute *attr, char *buf)
+static void copy_workqueue_attrs(struct workqueue_attrs *to,
+                                const struct workqueue_attrs *from)
 {
-       struct workqueue_struct *wq = dev_to_wq(dev);
-
-       return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
+       to->nice = from->nice;
+       cpumask_copy(to->cpumask, from->cpumask);
+       /*
+        * Unlike hash and equality test, this function doesn't ignore
+        * ->no_numa as it is used for both pool and wq attrs.  Instead,
+        * get_unbound_pool() explicitly clears ->no_numa after copying.
+        */
+       to->no_numa = from->no_numa;
 }
 
-static ssize_t max_active_store(struct device *dev,
-                               struct device_attribute *attr, const char *buf,
-                               size_t count)
+/* hash value of the content of @attrs */
+static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
 {
-       struct workqueue_struct *wq = dev_to_wq(dev);
-       int val;
-
-       if (sscanf(buf, "%d", &val) != 1 || val <= 0)
-               return -EINVAL;
+       u32 hash = 0;
 
-       workqueue_set_max_active(wq, val);
-       return count;
+       hash = jhash_1word(attrs->nice, hash);
+       hash = jhash(cpumask_bits(attrs->cpumask),
+                    BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
+       return hash;
 }
-static DEVICE_ATTR_RW(max_active);
-
-static struct attribute *wq_sysfs_attrs[] = {
-       &dev_attr_per_cpu.attr,
-       &dev_attr_max_active.attr,
-       NULL,
-};
-ATTRIBUTE_GROUPS(wq_sysfs);
 
-static ssize_t wq_pool_ids_show(struct device *dev,
-                               struct device_attribute *attr, char *buf)
+/* content equality test */
+static bool wqattrs_equal(const struct workqueue_attrs *a,
+                         const struct workqueue_attrs *b)
 {
-       struct workqueue_struct *wq = dev_to_wq(dev);
-       const char *delim = "";
-       int node, written = 0;
-
-       rcu_read_lock_sched();
-       for_each_node(node) {
-               written += scnprintf(buf + written, PAGE_SIZE - written,
-                                    "%s%d:%d", delim, node,
-                                    unbound_pwq_by_node(wq, node)->pool->id);
-               delim = " ";
-       }
-       written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
-       rcu_read_unlock_sched();
-
-       return written;
+       if (a->nice != b->nice)
+               return false;
+       if (!cpumask_equal(a->cpumask, b->cpumask))
+               return false;
+       return true;
 }
 
-static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
-                           char *buf)
+/**
+ * init_worker_pool - initialize a newly zalloc'd worker_pool
+ * @pool: worker_pool to initialize
+ *
+ * Initialize a newly zalloc'd @pool.  It also allocates @pool->attrs.
+ *
+ * Return: 0 on success, -errno on failure.  Even on failure, all fields
+ * inside @pool proper are initialized and put_unbound_pool() can be called
+ * on @pool safely to release it.
+ */
+static int init_worker_pool(struct worker_pool *pool)
 {
-       struct workqueue_struct *wq = dev_to_wq(dev);
-       int written;
+       spin_lock_init(&pool->lock);
+       pool->id = -1;
+       pool->cpu = -1;
+       pool->node = NUMA_NO_NODE;
+       pool->flags |= POOL_DISASSOCIATED;
+       INIT_LIST_HEAD(&pool->worklist);
+       INIT_LIST_HEAD(&pool->idle_list);
+       hash_init(pool->busy_hash);
 
-       mutex_lock(&wq->mutex);
-       written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
-       mutex_unlock(&wq->mutex);
+       init_timer_deferrable(&pool->idle_timer);
+       pool->idle_timer.function = idle_worker_timeout;
+       pool->idle_timer.data = (unsigned long)pool;
 
-       return written;
-}
+       setup_timer(&pool->mayday_timer, pool_mayday_timeout,
+                   (unsigned long)pool);
 
-/* prepare workqueue_attrs for sysfs store operations */
-static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
-{
-       struct workqueue_attrs *attrs;
+       mutex_init(&pool->manager_arb);
+       mutex_init(&pool->attach_mutex);
+       INIT_LIST_HEAD(&pool->workers);
 
-       attrs = alloc_workqueue_attrs(GFP_KERNEL);
-       if (!attrs)
-               return NULL;
+       ida_init(&pool->worker_ida);
+       INIT_HLIST_NODE(&pool->hash_node);
+       pool->refcnt = 1;
 
-       mutex_lock(&wq->mutex);
-       copy_workqueue_attrs(attrs, wq->unbound_attrs);
-       mutex_unlock(&wq->mutex);
-       return attrs;
+       /* shouldn't fail above this point */
+       pool->attrs = alloc_workqueue_attrs(GFP_KERNEL);
+       if (!pool->attrs)
+               return -ENOMEM;
+       return 0;
 }
 
-static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
-                            const char *buf, size_t count)
+static void rcu_free_wq(struct rcu_head *rcu)
 {
-       struct workqueue_struct *wq = dev_to_wq(dev);
-       struct workqueue_attrs *attrs;
-       int ret;
-
-       attrs = wq_sysfs_prep_attrs(wq);
-       if (!attrs)
-               return -ENOMEM;
+       struct workqueue_struct *wq =
+               container_of(rcu, struct workqueue_struct, rcu);
 
-       if (sscanf(buf, "%d", &attrs->nice) == 1 &&
-           attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
-               ret = apply_workqueue_attrs(wq, attrs);
+       if (!(wq->flags & WQ_UNBOUND))
+               free_percpu(wq->cpu_pwqs);
        else
-               ret = -EINVAL;
+               free_workqueue_attrs(wq->unbound_attrs);
 
-       free_workqueue_attrs(attrs);
-       return ret ?: count;
+       kfree(wq->rescuer);
+       kfree(wq);
 }
 
-static ssize_t wq_cpumask_show(struct device *dev,
-                              struct device_attribute *attr, char *buf)
+static void rcu_free_pool(struct rcu_head *rcu)
 {
-       struct workqueue_struct *wq = dev_to_wq(dev);
-       int written;
+       struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
 
-       mutex_lock(&wq->mutex);
-       written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
-                           cpumask_pr_args(wq->unbound_attrs->cpumask));
-       mutex_unlock(&wq->mutex);
-       return written;
+       ida_destroy(&pool->worker_ida);
+       free_workqueue_attrs(pool->attrs);
+       kfree(pool);
 }
 
-static ssize_t wq_cpumask_store(struct device *dev,
-                               struct device_attribute *attr,
-                               const char *buf, size_t count)
+/**
+ * put_unbound_pool - put a worker_pool
+ * @pool: worker_pool to put
+ *
+ * Put @pool.  If its refcnt reaches zero, it gets destroyed in a sched-RCU
+ * safe manner.  get_unbound_pool() calls this function on its failure path
+ * and this function should be able to release pools which went through,
+ * successfully or not, init_worker_pool().
+ *
+ * Should be called with wq_pool_mutex held.
+ */
+static void put_unbound_pool(struct worker_pool *pool)
 {
-       struct workqueue_struct *wq = dev_to_wq(dev);
-       struct workqueue_attrs *attrs;
-       int ret;
+       DECLARE_COMPLETION_ONSTACK(detach_completion);
+       struct worker *worker;
 
-       attrs = wq_sysfs_prep_attrs(wq);
-       if (!attrs)
-               return -ENOMEM;
+       lockdep_assert_held(&wq_pool_mutex);
 
-       ret = cpumask_parse(buf, attrs->cpumask);
-       if (!ret)
-               ret = apply_workqueue_attrs(wq, attrs);
+       if (--pool->refcnt)
+               return;
 
-       free_workqueue_attrs(attrs);
-       return ret ?: count;
-}
+       /* sanity checks */
+       if (WARN_ON(!(pool->cpu < 0)) ||
+           WARN_ON(!list_empty(&pool->worklist)))
+               return;
 
-static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
-                           char *buf)
-{
-       struct workqueue_struct *wq = dev_to_wq(dev);
-       int written;
-
-       mutex_lock(&wq->mutex);
-       written = scnprintf(buf, PAGE_SIZE, "%d\n",
-                           !wq->unbound_attrs->no_numa);
-       mutex_unlock(&wq->mutex);
-
-       return written;
-}
-
-static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
-                            const char *buf, size_t count)
-{
-       struct workqueue_struct *wq = dev_to_wq(dev);
-       struct workqueue_attrs *attrs;
-       int v, ret;
-
-       attrs = wq_sysfs_prep_attrs(wq);
-       if (!attrs)
-               return -ENOMEM;
+       /* release id and unhash */
+       if (pool->id >= 0)
+               idr_remove(&worker_pool_idr, pool->id);
+       hash_del(&pool->hash_node);
 
-       ret = -EINVAL;
-       if (sscanf(buf, "%d", &v) == 1) {
-               attrs->no_numa = !v;
-               ret = apply_workqueue_attrs(wq, attrs);
-       }
+       /*
+        * Become the manager and destroy all workers.  Grabbing
+        * manager_arb prevents @pool's workers from blocking on
+        * attach_mutex.
+        */
+       mutex_lock(&pool->manager_arb);
 
-       free_workqueue_attrs(attrs);
-       return ret ?: count;
-}
+       spin_lock_irq(&pool->lock);
+       while ((worker = first_idle_worker(pool)))
+               destroy_worker(worker);
+       WARN_ON(pool->nr_workers || pool->nr_idle);
+       spin_unlock_irq(&pool->lock);
 
-static struct device_attribute wq_sysfs_unbound_attrs[] = {
-       __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
-       __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
-       __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
-       __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
-       __ATTR_NULL,
-};
+       mutex_lock(&pool->attach_mutex);
+       if (!list_empty(&pool->workers))
+               pool->detach_completion = &detach_completion;
+       mutex_unlock(&pool->attach_mutex);
 
-static struct bus_type wq_subsys = {
-       .name                           = "workqueue",
-       .dev_groups                     = wq_sysfs_groups,
-};
+       if (pool->detach_completion)
+               wait_for_completion(pool->detach_completion);
 
-static int __init wq_sysfs_init(void)
-{
-       return subsys_virtual_register(&wq_subsys, NULL);
-}
-core_initcall(wq_sysfs_init);
+       mutex_unlock(&pool->manager_arb);
 
-static void wq_device_release(struct device *dev)
-{
-       struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+       /* shut down the timers */
+       del_timer_sync(&pool->idle_timer);
+       del_timer_sync(&pool->mayday_timer);
 
-       kfree(wq_dev);
+       /* sched-RCU protected to allow dereferences from get_work_pool() */
+       call_rcu_sched(&pool->rcu, rcu_free_pool);
 }
 
 /**
- * workqueue_sysfs_register - make a workqueue visible in sysfs
- * @wq: the workqueue to register
+ * get_unbound_pool - get a worker_pool with the specified attributes
+ * @attrs: the attributes of the worker_pool to get
  *
- * Expose @wq in sysfs under /sys/bus/workqueue/devices.
- * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
- * which is the preferred method.
+ * Obtain a worker_pool which has the same attributes as @attrs, bump the
+ * reference count and return it.  If there already is a matching
+ * worker_pool, it will be used; otherwise, this function attempts to
+ * create a new one.
  *
- * Workqueue user should use this function directly iff it wants to apply
- * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
- * apply_workqueue_attrs() may race against userland updating the
- * attributes.
+ * Should be called with wq_pool_mutex held.
  *
- * Return: 0 on success, -errno on failure.
+ * Return: On success, a worker_pool with the same attributes as @attrs.
+ * On failure, %NULL.
  */
-int workqueue_sysfs_register(struct workqueue_struct *wq)
+static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
 {
-       struct wq_device *wq_dev;
-       int ret;
+       u32 hash = wqattrs_hash(attrs);
+       struct worker_pool *pool;
+       int node;
 
-       /*
-        * Adjusting max_active or creating new pwqs by applyting
-        * attributes breaks ordering guarantee.  Disallow exposing ordered
-        * workqueues.
-        */
-       if (WARN_ON(wq->flags & __WQ_ORDERED))
-               return -EINVAL;
+       lockdep_assert_held(&wq_pool_mutex);
 
-       wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
-       if (!wq_dev)
-               return -ENOMEM;
+       /* do we already have a matching pool? */
+       hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
+               if (wqattrs_equal(pool->attrs, attrs)) {
+                       pool->refcnt++;
+                       return pool;
+               }
+       }
 
-       wq_dev->wq = wq;
-       wq_dev->dev.bus = &wq_subsys;
-       wq_dev->dev.init_name = wq->name;
-       wq_dev->dev.release = wq_device_release;
+       /* nope, create a new one */
+       pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+       if (!pool || init_worker_pool(pool) < 0)
+               goto fail;
+
+       lockdep_set_subclass(&pool->lock, 1);   /* see put_pwq() */
+       copy_workqueue_attrs(pool->attrs, attrs);
 
        /*
-        * unbound_attrs are created separately.  Suppress uevent until
-        * everything is ready.
+        * no_numa isn't a worker_pool attribute, always clear it.  See
+        * 'struct workqueue_attrs' comments for detail.
         */
-       dev_set_uevent_suppress(&wq_dev->dev, true);
-
-       ret = device_register(&wq_dev->dev);
-       if (ret) {
-               kfree(wq_dev);
-               wq->wq_dev = NULL;
-               return ret;
-       }
-
-       if (wq->flags & WQ_UNBOUND) {
-               struct device_attribute *attr;
+       pool->attrs->no_numa = false;
 
-               for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
-                       ret = device_create_file(&wq_dev->dev, attr);
-                       if (ret) {
-                               device_unregister(&wq_dev->dev);
-                               wq->wq_dev = NULL;
-                               return ret;
+       /* if cpumask is contained inside a NUMA node, we belong to that node */
+       if (wq_numa_enabled) {
+               for_each_node(node) {
+                       if (cpumask_subset(pool->attrs->cpumask,
+                                          wq_numa_possible_cpumask[node])) {
+                               pool->node = node;
+                               break;
                        }
                }
        }
 
-       dev_set_uevent_suppress(&wq_dev->dev, false);
-       kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
-       return 0;
-}
+       if (worker_pool_assign_id(pool) < 0)
+               goto fail;
 
-/**
- * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
- * @wq: the workqueue to unregister
- *
- * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
- */
-static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
-{
-       struct wq_device *wq_dev = wq->wq_dev;
+       /* create and start the initial worker */
+       if (!create_worker(pool))
+               goto fail;
 
-       if (!wq->wq_dev)
-               return;
+       /* install */
+       hash_add(unbound_pool_hash, &pool->hash_node, hash);
 
-       wq->wq_dev = NULL;
-       device_unregister(&wq_dev->dev);
+       return pool;
+fail:
+       if (pool)
+               put_unbound_pool(pool);
+       return NULL;
 }
-#else  /* CONFIG_SYSFS */
-static void workqueue_sysfs_unregister(struct workqueue_struct *wq)    { }
-#endif /* CONFIG_SYSFS */
 
-/**
- * free_workqueue_attrs - free a workqueue_attrs
- * @attrs: workqueue_attrs to free
- *
- * Undo alloc_workqueue_attrs().
- */
-void free_workqueue_attrs(struct workqueue_attrs *attrs)
+static void rcu_free_pwq(struct rcu_head *rcu)
 {
-       if (attrs) {
-               free_cpumask_var(attrs->cpumask);
-               kfree(attrs);
-       }
+       kmem_cache_free(pwq_cache,
+                       container_of(rcu, struct pool_workqueue, rcu));
 }
 
-/**
- * alloc_workqueue_attrs - allocate a workqueue_attrs
- * @gfp_mask: allocation mask to use
- *
- * Allocate a new workqueue_attrs, initialize with default settings and
- * return it.
- *
- * Return: The allocated new workqueue_attr on success. %NULL on failure.
+/*
+ * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt
+ * and needs to be destroyed.
  */
-struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask)
+static void pwq_unbound_release_workfn(struct work_struct *work)
 {
-       struct workqueue_attrs *attrs;
+       struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
+                                                 unbound_release_work);
+       struct workqueue_struct *wq = pwq->wq;
+       struct worker_pool *pool = pwq->pool;
+       bool is_last;
 
-       attrs = kzalloc(sizeof(*attrs), gfp_mask);
-       if (!attrs)
-               goto fail;
-       if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask))
-               goto fail;
+       if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
+               return;
 
-       cpumask_copy(attrs->cpumask, cpu_possible_mask);
-       return attrs;
-fail:
-       free_workqueue_attrs(attrs);
-       return NULL;
-}
+       mutex_lock(&wq->mutex);
+       list_del_rcu(&pwq->pwqs_node);
+       is_last = list_empty(&wq->pwqs);
+       mutex_unlock(&wq->mutex);
+
+       mutex_lock(&wq_pool_mutex);
+       put_unbound_pool(pool);
+       mutex_unlock(&wq_pool_mutex);
+
+       call_rcu_sched(&pwq->rcu, rcu_free_pwq);
 
-static void copy_workqueue_attrs(struct workqueue_attrs *to,
-                                const struct workqueue_attrs *from)
-{
-       to->nice = from->nice;
-       cpumask_copy(to->cpumask, from->cpumask);
        /*
-        * Unlike hash and equality test, this function doesn't ignore
-        * ->no_numa as it is used for both pool and wq attrs.  Instead,
-        * get_unbound_pool() explicitly clears ->no_numa after copying.
+        * If we're the last pwq going away, @wq is already dead and no one
+        * is gonna access it anymore.  Schedule RCU free.
         */
-       to->no_numa = from->no_numa;
-}
-
-/* hash value of the content of @attr */
-static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
-{
-       u32 hash = 0;
-
-       hash = jhash_1word(attrs->nice, hash);
-       hash = jhash(cpumask_bits(attrs->cpumask),
-                    BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
-       return hash;
-}
-
-/* content equality test */
-static bool wqattrs_equal(const struct workqueue_attrs *a,
-                         const struct workqueue_attrs *b)
-{
-       if (a->nice != b->nice)
-               return false;
-       if (!cpumask_equal(a->cpumask, b->cpumask))
-               return false;
-       return true;
-}
-
-/**
- * init_worker_pool - initialize a newly zalloc'd worker_pool
- * @pool: worker_pool to initialize
- *
- * Initiailize a newly zalloc'd @pool.  It also allocates @pool->attrs.
- *
- * Return: 0 on success, -errno on failure.  Even on failure, all fields
- * inside @pool proper are initialized and put_unbound_pool() can be called
- * on @pool safely to release it.
- */
-static int init_worker_pool(struct worker_pool *pool)
-{
-       spin_lock_init(&pool->lock);
-       pool->id = -1;
-       pool->cpu = -1;
-       pool->node = NUMA_NO_NODE;
-       pool->flags |= POOL_DISASSOCIATED;
-       INIT_LIST_HEAD(&pool->worklist);
-       INIT_LIST_HEAD(&pool->idle_list);
-       hash_init(pool->busy_hash);
-
-       init_timer_deferrable(&pool->idle_timer);
-       pool->idle_timer.function = idle_worker_timeout;
-       pool->idle_timer.data = (unsigned long)pool;
-
-       setup_timer(&pool->mayday_timer, pool_mayday_timeout,
-                   (unsigned long)pool);
-
-       mutex_init(&pool->manager_arb);
-       mutex_init(&pool->attach_mutex);
-       INIT_LIST_HEAD(&pool->workers);
-
-       ida_init(&pool->worker_ida);
-       INIT_HLIST_NODE(&pool->hash_node);
-       pool->refcnt = 1;
-
-       /* shouldn't fail above this point */
-       pool->attrs = alloc_workqueue_attrs(GFP_KERNEL);
-       if (!pool->attrs)
-               return -ENOMEM;
-       return 0;
-}
-
-static void rcu_free_pool(struct rcu_head *rcu)
-{
-       struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
-
-       ida_destroy(&pool->worker_ida);
-       free_workqueue_attrs(pool->attrs);
-       kfree(pool);
-}
-
-/**
- * put_unbound_pool - put a worker_pool
- * @pool: worker_pool to put
- *
- * Put @pool.  If its refcnt reaches zero, it gets destroyed in sched-RCU
- * safe manner.  get_unbound_pool() calls this function on its failure path
- * and this function should be able to release pools which went through,
- * successfully or not, init_worker_pool().
- *
- * Should be called with wq_pool_mutex held.
- */
-static void put_unbound_pool(struct worker_pool *pool)
-{
-       DECLARE_COMPLETION_ONSTACK(detach_completion);
-       struct worker *worker;
-
-       lockdep_assert_held(&wq_pool_mutex);
-
-       if (--pool->refcnt)
-               return;
-
-       /* sanity checks */
-       if (WARN_ON(!(pool->cpu < 0)) ||
-           WARN_ON(!list_empty(&pool->worklist)))
-               return;
-
-       /* release id and unhash */
-       if (pool->id >= 0)
-               idr_remove(&worker_pool_idr, pool->id);
-       hash_del(&pool->hash_node);
-
-       /*
-        * Become the manager and destroy all workers.  Grabbing
-        * manager_arb prevents @pool's workers from blocking on
-        * attach_mutex.
-        */
-       mutex_lock(&pool->manager_arb);
-
-       spin_lock_irq(&pool->lock);
-       while ((worker = first_idle_worker(pool)))
-               destroy_worker(worker);
-       WARN_ON(pool->nr_workers || pool->nr_idle);
-       spin_unlock_irq(&pool->lock);
-
-       mutex_lock(&pool->attach_mutex);
-       if (!list_empty(&pool->workers))
-               pool->detach_completion = &detach_completion;
-       mutex_unlock(&pool->attach_mutex);
-
-       if (pool->detach_completion)
-               wait_for_completion(pool->detach_completion);
-
-       mutex_unlock(&pool->manager_arb);
-
-       /* shut down the timers */
-       del_timer_sync(&pool->idle_timer);
-       del_timer_sync(&pool->mayday_timer);
-
-       /* sched-RCU protected to allow dereferences from get_work_pool() */
-       call_rcu_sched(&pool->rcu, rcu_free_pool);
-}
-
-/**
- * get_unbound_pool - get a worker_pool with the specified attributes
- * @attrs: the attributes of the worker_pool to get
- *
- * Obtain a worker_pool which has the same attributes as @attrs, bump the
- * reference count and return it.  If there already is a matching
- * worker_pool, it will be used; otherwise, this function attempts to
- * create a new one.
- *
- * Should be called with wq_pool_mutex held.
- *
- * Return: On success, a worker_pool with the same attributes as @attrs.
- * On failure, %NULL.
- */
-static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
-{
-       u32 hash = wqattrs_hash(attrs);
-       struct worker_pool *pool;
-       int node;
-
-       lockdep_assert_held(&wq_pool_mutex);
-
-       /* do we already have a matching pool? */
-       hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
-               if (wqattrs_equal(pool->attrs, attrs)) {
-                       pool->refcnt++;
-                       return pool;
-               }
-       }
-
-       /* nope, create a new one */
-       pool = kzalloc(sizeof(*pool), GFP_KERNEL);
-       if (!pool || init_worker_pool(pool) < 0)
-               goto fail;
-
-       lockdep_set_subclass(&pool->lock, 1);   /* see put_pwq() */
-       copy_workqueue_attrs(pool->attrs, attrs);
-
-       /*
-        * no_numa isn't a worker_pool attribute, always clear it.  See
-        * 'struct workqueue_attrs' comments for detail.
-        */
-       pool->attrs->no_numa = false;
-
-       /* if cpumask is contained inside a NUMA node, we belong to that node */
-       if (wq_numa_enabled) {
-               for_each_node(node) {
-                       if (cpumask_subset(pool->attrs->cpumask,
-                                          wq_numa_possible_cpumask[node])) {
-                               pool->node = node;
-                               break;
-                       }
-               }
-       }
-
-       if (worker_pool_assign_id(pool) < 0)
-               goto fail;
-
-       /* create and start the initial worker */
-       if (!create_worker(pool))
-               goto fail;
-
-       /* install */
-       hash_add(unbound_pool_hash, &pool->hash_node, hash);
-
-       return pool;
-fail:
-       if (pool)
-               put_unbound_pool(pool);
-       return NULL;
-}
-
-static void rcu_free_pwq(struct rcu_head *rcu)
-{
-       kmem_cache_free(pwq_cache,
-                       container_of(rcu, struct pool_workqueue, rcu));
-}
-
-/*
- * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt
- * and needs to be destroyed.
- */
-static void pwq_unbound_release_workfn(struct work_struct *work)
-{
-       struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
-                                                 unbound_release_work);
-       struct workqueue_struct *wq = pwq->wq;
-       struct worker_pool *pool = pwq->pool;
-       bool is_last;
-
-       if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
-               return;
-
-       mutex_lock(&wq->mutex);
-       list_del_rcu(&pwq->pwqs_node);
-       is_last = list_empty(&wq->pwqs);
-       mutex_unlock(&wq->mutex);
-
-       mutex_lock(&wq_pool_mutex);
-       put_unbound_pool(pool);
-       mutex_unlock(&wq_pool_mutex);
-
-       call_rcu_sched(&pwq->rcu, rcu_free_pwq);
-
-       /*
-        * If we're the last pwq going away, @wq is already dead and no one
-        * is gonna access it anymore.  Free it.
-        */
-       if (is_last) {
-               free_workqueue_attrs(wq->unbound_attrs);
-               kfree(wq);
-       }
+       if (is_last)
+               call_rcu_sched(&wq->rcu, rcu_free_wq);
 }
 
 /**
@@ -4143,7 +3851,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
                pwq_adjust_max_active(pwq);
        mutex_unlock(&wq->mutex);
 
-       list_add(&wq->list, &workqueues);
+       list_add_tail_rcu(&wq->list, &workqueues);
 
        mutex_unlock(&wq_pool_mutex);
 
@@ -4199,24 +3907,20 @@ void destroy_workqueue(struct workqueue_struct *wq)
         * flushing is complete in case freeze races us.
         */
        mutex_lock(&wq_pool_mutex);
-       list_del_init(&wq->list);
+       list_del_rcu(&wq->list);
        mutex_unlock(&wq_pool_mutex);
 
        workqueue_sysfs_unregister(wq);
 
-       if (wq->rescuer) {
+       if (wq->rescuer)
                kthread_stop(wq->rescuer->task);
-               kfree(wq->rescuer);
-               wq->rescuer = NULL;
-       }
 
        if (!(wq->flags & WQ_UNBOUND)) {
                /*
                 * The base ref is never dropped on per-cpu pwqs.  Directly
-                * free the pwqs and wq.
+                * schedule RCU free.
                 */
-               free_percpu(wq->cpu_pwqs);
-               kfree(wq);
+               call_rcu_sched(&wq->rcu, rcu_free_wq);
        } else {
                /*
                 * We're the sole accessor of @wq at this point.  Directly
@@ -4437,13 +4141,173 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
        }
 }
 
-/*
- * CPU hotplug.
- *
- * There are two challenges in supporting CPU hotplug.  Firstly, there
- * are a lot of assumptions on strong associations among work, pwq and
- * pool which make migrating pending and scheduled works very
- * difficult to implement without impacting hot paths.  Secondly,
+static void pr_cont_pool_info(struct worker_pool *pool)
+{      /* continue the current log line with @pool's cpus, node, flags and nice */
+       pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
+       if (pool->node != NUMA_NO_NODE)         /* node= is omitted when no node is set */
+               pr_cont(" node=%d", pool->node);
+       pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
+}
+
+static void pr_cont_work(bool comma, struct work_struct *work)
+{      /* continue the log line with one work item, preceded by "," if @comma */
+       if (work->func == wq_barrier_func) {
+               struct wq_barrier *barr;
+
+               barr = container_of(work, struct wq_barrier, work);
+
+               pr_cont("%s BAR(%d)", comma ? "," : "",
+                       task_pid_nr(barr->task));       /* barrier: print pid of barr->task */
+       } else {
+               pr_cont("%s %pf", comma ? "," : "", work->func);        /* otherwise print the work function */
+       }
+}
+
+static void show_pwq(struct pool_workqueue *pwq)
+{      /* dump one pwq; show_workqueue_state() calls this with pwq->pool->lock held */
+       struct worker_pool *pool = pwq->pool;
+       struct work_struct *work;
+       struct worker *worker;
+       bool has_in_flight = false, has_pending = false;
+       int bkt;
+
+       pr_info("  pwq %d:", pool->id);
+       pr_cont_pool_info(pool);
+
+       pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
+               !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
+
+       hash_for_each(pool->busy_hash, bkt, worker, hentry) {   /* pass 1: any busy worker on this pwq? */
+               if (worker->current_pwq == pwq) {
+                       has_in_flight = true;
+                       break;
+               }
+       }
+       if (has_in_flight) {    /* pass 2: print the "in-flight:" line only if it has entries */
+               bool comma = false;
+
+               pr_info("    in-flight:");
+               hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+                       if (worker->current_pwq != pwq)
+                               continue;
+
+                       pr_cont("%s %d%s:%pf", comma ? "," : "",
+                               task_pid_nr(worker->task),
+                               worker == pwq->wq->rescuer ? "(RESCUER)" : "",
+                               worker->current_func);
+                       list_for_each_entry(work, &worker->scheduled, entry)    /* works on this worker's scheduled list */
+                               pr_cont_work(false, work);
+                       comma = true;
+               }
+               pr_cont("\n");
+       }
+
+       list_for_each_entry(work, &pool->worklist, entry) {     /* same two passes over pool->worklist */
+               if (get_work_pwq(work) == pwq) {
+                       has_pending = true;
+                       break;
+               }
+       }
+       if (has_pending) {
+               bool comma = false;
+
+               pr_info("    pending:");
+               list_for_each_entry(work, &pool->worklist, entry) {
+                       if (get_work_pwq(work) != pwq)
+                               continue;
+
+                       pr_cont_work(comma, work);
+                       comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);  /* no comma after a LINKED work */
+               }
+               pr_cont("\n");
+       }
+
+       if (!list_empty(&pwq->delayed_works)) { /* delayed works live on the pwq itself, no filtering needed */
+               bool comma = false;
+
+               pr_info("    delayed:");
+               list_for_each_entry(work, &pwq->delayed_works, entry) {
+                       pr_cont_work(comma, work);
+                       comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);  /* no comma after a LINKED work */
+               }
+               pr_cont("\n");
+       }
+}
+
+/**
+ * show_workqueue_state - dump workqueue state
+ *
+ * Called from a sysrq handler and prints out all busy workqueues and
+ * pools.
+ */
+void show_workqueue_state(void)
+{
+       struct workqueue_struct *wq;
+       struct worker_pool *pool;
+       unsigned long flags;
+       int pi;
+
+       rcu_read_lock_sched();  /* lets us walk the workqueues list without wq_pool_mutex */
+
+       pr_info("Showing busy workqueues and worker pools:\n");
+
+       list_for_each_entry_rcu(wq, &workqueues, list) {
+               struct pool_workqueue *pwq;
+               bool idle = true;
+
+               for_each_pwq(pwq, wq) { /* unlocked peek: skip wqs with nothing active or delayed */
+                       if (pwq->nr_active || !list_empty(&pwq->delayed_works)) {
+                               idle = false;
+                               break;
+                       }
+               }
+               if (idle)
+                       continue;
+
+               pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
+
+               for_each_pwq(pwq, wq) {
+                       spin_lock_irqsave(&pwq->pool->lock, flags);
+                       if (pwq->nr_active || !list_empty(&pwq->delayed_works)) /* recheck under pool->lock */
+                               show_pwq(pwq);
+                       spin_unlock_irqrestore(&pwq->pool->lock, flags);
+               }
+       }
+
+       for_each_pool(pool, pi) {
+               struct worker *worker;
+               bool first = true;
+
+               spin_lock_irqsave(&pool->lock, flags);
+               if (pool->nr_workers == pool->nr_idle)  /* every worker idle: nothing to report */
+                       goto next_pool;
+
+               pr_info("pool %d:", pool->id);
+               pr_cont_pool_info(pool);
+               pr_cont(" workers=%d", pool->nr_workers);
+               if (pool->manager)
+                       pr_cont(" manager: %d",
+                               task_pid_nr(pool->manager->task));
+               list_for_each_entry(worker, &pool->idle_list, entry) {
+                       pr_cont(" %s%d", first ? "idle: " : "", /* "idle: " label only before the first pid */
+                               task_pid_nr(worker->task));
+                       first = false;
+               }
+               pr_cont("\n");
+       next_pool:
+               spin_unlock_irqrestore(&pool->lock, flags);
+       }
+
+       rcu_read_unlock_sched();
+}
+
+/*
+ * CPU hotplug.
+ *
+ * There are two challenges in supporting CPU hotplug.  Firstly, there
+ * are a lot of assumptions on strong associations among work, pwq and
+ * pool which make migrating pending and scheduled works very
+ * difficult to implement without impacting hot paths.  Secondly,
  * worker pools serve mix of short, long and very long running works making
  * blocked draining impractical.
  *
@@ -4637,202 +4501,519 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
                        else if (pool->cpu < 0)
                                restore_unbound_workers_cpumask(pool, cpu);
 
-                       mutex_unlock(&pool->attach_mutex);
-               }
+                       mutex_unlock(&pool->attach_mutex);
+               }
+
+               /* update NUMA affinity of unbound workqueues */
+               list_for_each_entry(wq, &workqueues, list)
+                       wq_update_unbound_numa(wq, cpu, true);
+
+               mutex_unlock(&wq_pool_mutex);
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+/*
+ * Workqueues should be brought down after normal priority CPU notifiers.
+ * This will be registered as low priority CPU notifier.
+ */
+static int workqueue_cpu_down_callback(struct notifier_block *nfb,
+                                                unsigned long action,
+                                                void *hcpu)
+{
+       int cpu = (unsigned long)hcpu;
+       struct work_struct unbind_work;
+       struct workqueue_struct *wq;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_DOWN_PREPARE:
+               /* unbinding per-cpu workers should happen on the local CPU */
+               INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
+               queue_work_on(cpu, system_highpri_wq, &unbind_work);
+
+               /* update NUMA affinity of unbound workqueues */
+               mutex_lock(&wq_pool_mutex);
+               list_for_each_entry(wq, &workqueues, list)
+                       wq_update_unbound_numa(wq, cpu, false);
+               mutex_unlock(&wq_pool_mutex);
+
+               /* wait for per-cpu unbinding to finish */
+               flush_work(&unbind_work);
+               destroy_work_on_stack(&unbind_work);
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+#ifdef CONFIG_SMP
+
+struct work_for_cpu {
+       struct work_struct work;
+       long (*fn)(void *);
+       void *arg;
+       long ret;
+};
+
+static void work_for_cpu_fn(struct work_struct *work)
+{
+       struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
+
+       wfc->ret = wfc->fn(wfc->arg);
+}
+
+/**
+ * work_on_cpu - run a function in user context on a particular cpu
+ * @cpu: the cpu to run on
+ * @fn: the function to run
+ * @arg: the function arg
+ *
+ * It is up to the caller to ensure that the cpu doesn't go offline.
+ * The caller must not hold any locks which would prevent @fn from completing.
+ *
+ * Return: The value @fn returns.
+ */
+long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
+{
+       struct work_for_cpu wfc = { .fn = fn, .arg = arg };
+
+       INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
+       schedule_work_on(cpu, &wfc.work);
+       flush_work(&wfc.work);
+       destroy_work_on_stack(&wfc.work);
+       return wfc.ret;
+}
+EXPORT_SYMBOL_GPL(work_on_cpu);
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_FREEZER
+
+/**
+ * freeze_workqueues_begin - begin freezing workqueues
+ *
+ * Start freezing workqueues.  After this function returns, all freezable
+ * workqueues will queue new works to their delayed_works list instead of
+ * pool->worklist.
+ *
+ * CONTEXT:
+ * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
+ */
+void freeze_workqueues_begin(void)
+{
+       struct workqueue_struct *wq;
+       struct pool_workqueue *pwq;
+
+       mutex_lock(&wq_pool_mutex);
+
+       WARN_ON_ONCE(workqueue_freezing);
+       workqueue_freezing = true;
+
+       list_for_each_entry(wq, &workqueues, list) {
+               mutex_lock(&wq->mutex);
+               for_each_pwq(pwq, wq)
+                       pwq_adjust_max_active(pwq);
+               mutex_unlock(&wq->mutex);
+       }
+
+       mutex_unlock(&wq_pool_mutex);
+}
+
+/**
+ * freeze_workqueues_busy - are freezable workqueues still busy?
+ *
+ * Check whether freezing is complete.  This function must be called
+ * between freeze_workqueues_begin() and thaw_workqueues().
+ *
+ * CONTEXT:
+ * Grabs and releases wq_pool_mutex.
+ *
+ * Return:
+ * %true if some freezable workqueues are still busy.  %false if freezing
+ * is complete.
+ */
+bool freeze_workqueues_busy(void)
+{
+       bool busy = false;
+       struct workqueue_struct *wq;
+       struct pool_workqueue *pwq;
+
+       mutex_lock(&wq_pool_mutex);
+
+       WARN_ON_ONCE(!workqueue_freezing);
+
+       list_for_each_entry(wq, &workqueues, list) {
+               if (!(wq->flags & WQ_FREEZABLE))
+                       continue;
+               /*
+                * nr_active is monotonically decreasing.  It's safe
+                * to peek without lock.
+                */
+               rcu_read_lock_sched();
+               for_each_pwq(pwq, wq) {
+                       WARN_ON_ONCE(pwq->nr_active < 0);
+                       if (pwq->nr_active) {
+                               busy = true;
+                               rcu_read_unlock_sched();
+                               goto out_unlock;
+                       }
+               }
+               rcu_read_unlock_sched();
+       }
+out_unlock:
+       mutex_unlock(&wq_pool_mutex);
+       return busy;
+}
+
+/**
+ * thaw_workqueues - thaw workqueues
+ *
+ * Thaw workqueues.  Normal queueing is restored and all collected
+ * frozen works are transferred to their respective pool worklists.
+ *
+ * CONTEXT:
+ * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
+ */
+void thaw_workqueues(void)
+{
+       struct workqueue_struct *wq;
+       struct pool_workqueue *pwq;
+
+       mutex_lock(&wq_pool_mutex);
+
+       if (!workqueue_freezing)
+               goto out_unlock;
+
+       workqueue_freezing = false;
+
+       /* restore max_active and repopulate worklist */
+       list_for_each_entry(wq, &workqueues, list) {
+               mutex_lock(&wq->mutex);
+               for_each_pwq(pwq, wq)
+                       pwq_adjust_max_active(pwq);
+               mutex_unlock(&wq->mutex);
+       }
+
+out_unlock:
+       mutex_unlock(&wq_pool_mutex);
+}
+#endif /* CONFIG_FREEZER */
+
+#ifdef CONFIG_SYSFS
+/*
+ * Workqueues with WQ_SYSFS flag set are visible to userland via
+ * /sys/bus/workqueue/devices/WQ_NAME.  All visible workqueues have the
+ * following attributes.
+ *
+ *  per_cpu    RO bool : whether the workqueue is per-cpu or unbound
+ *  max_active RW int  : maximum number of in-flight work items
+ *
+ * Unbound workqueues have the following extra attributes.
+ *
+ *  pool_ids   RO str  : "node:pool_id" pair for each NUMA node
+ *  nice       RW int  : nice value of the workers
+ *  cpumask    RW mask : bitmask of allowed CPUs for the workers
+ */
+struct wq_device {
+       struct workqueue_struct         *wq;
+       struct device                   dev;
+};
+
+static struct workqueue_struct *dev_to_wq(struct device *dev)
+{
+       struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+       return wq_dev->wq;
+}
+
+static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
+                           char *buf)
+{
+       struct workqueue_struct *wq = dev_to_wq(dev);
+
+       return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
+}
+static DEVICE_ATTR_RO(per_cpu);
+
+static ssize_t max_active_show(struct device *dev,
+                              struct device_attribute *attr, char *buf)
+{
+       struct workqueue_struct *wq = dev_to_wq(dev);
+
+       return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
+}
+
+static ssize_t max_active_store(struct device *dev,
+                               struct device_attribute *attr, const char *buf,
+                               size_t count)
+{
+       struct workqueue_struct *wq = dev_to_wq(dev);
+       int val;
+
+       if (sscanf(buf, "%d", &val) != 1 || val <= 0)
+               return -EINVAL;
+
+       workqueue_set_max_active(wq, val);
+       return count;
+}
+static DEVICE_ATTR_RW(max_active);
+
+static struct attribute *wq_sysfs_attrs[] = {
+       &dev_attr_per_cpu.attr,
+       &dev_attr_max_active.attr,
+       NULL,
+};
+ATTRIBUTE_GROUPS(wq_sysfs);
+
+static ssize_t wq_pool_ids_show(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       struct workqueue_struct *wq = dev_to_wq(dev);
+       const char *delim = "";
+       int node, written = 0;
+
+       rcu_read_lock_sched();
+       for_each_node(node) {
+               written += scnprintf(buf + written, PAGE_SIZE - written,
+                                    "%s%d:%d", delim, node,
+                                    unbound_pwq_by_node(wq, node)->pool->id);
+               delim = " ";
+       }
+       written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
+       rcu_read_unlock_sched();
+
+       return written;
+}
+
+static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
+                           char *buf)
+{
+       struct workqueue_struct *wq = dev_to_wq(dev);
+       int written;
+
+       mutex_lock(&wq->mutex);
+       written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
+       mutex_unlock(&wq->mutex);
+
+       return written;
+}
+
+/* prepare workqueue_attrs for sysfs store operations */
+static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
+{
+       struct workqueue_attrs *attrs;
+
+       attrs = alloc_workqueue_attrs(GFP_KERNEL);
+       if (!attrs)
+               return NULL;
+
+       mutex_lock(&wq->mutex);
+       copy_workqueue_attrs(attrs, wq->unbound_attrs);
+       mutex_unlock(&wq->mutex);
+       return attrs;
+}
+
+static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
+                            const char *buf, size_t count)
+{
+       struct workqueue_struct *wq = dev_to_wq(dev);
+       struct workqueue_attrs *attrs;
+       int ret;
+
+       attrs = wq_sysfs_prep_attrs(wq);
+       if (!attrs)
+               return -ENOMEM;
+
+       if (sscanf(buf, "%d", &attrs->nice) == 1 &&
+           attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
+               ret = apply_workqueue_attrs(wq, attrs);
+       else
+               ret = -EINVAL;
+
+       free_workqueue_attrs(attrs);
+       return ret ?: count;
+}
+
+static ssize_t wq_cpumask_show(struct device *dev,
+                              struct device_attribute *attr, char *buf)
+{
+       struct workqueue_struct *wq = dev_to_wq(dev);
+       int written;
+
+       mutex_lock(&wq->mutex);
+       written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
+                           cpumask_pr_args(wq->unbound_attrs->cpumask));
+       mutex_unlock(&wq->mutex);
+       return written;
+}
+
+static ssize_t wq_cpumask_store(struct device *dev,
+                               struct device_attribute *attr,
+                               const char *buf, size_t count)
+{
+       struct workqueue_struct *wq = dev_to_wq(dev);
+       struct workqueue_attrs *attrs;
+       int ret;
+
+       attrs = wq_sysfs_prep_attrs(wq);
+       if (!attrs)
+               return -ENOMEM;
+
+       ret = cpumask_parse(buf, attrs->cpumask);
+       if (!ret)
+               ret = apply_workqueue_attrs(wq, attrs);
+
+       free_workqueue_attrs(attrs);
+       return ret ?: count;
+}
+
+static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
+                           char *buf)
+{
+       struct workqueue_struct *wq = dev_to_wq(dev);
+       int written;
 
-               /* update NUMA affinity of unbound workqueues */
-               list_for_each_entry(wq, &workqueues, list)
-                       wq_update_unbound_numa(wq, cpu, true);
+       mutex_lock(&wq->mutex);
+       written = scnprintf(buf, PAGE_SIZE, "%d\n",
+                           !wq->unbound_attrs->no_numa);
+       mutex_unlock(&wq->mutex);
 
-               mutex_unlock(&wq_pool_mutex);
-               break;
-       }
-       return NOTIFY_OK;
+       return written;
 }
 
-/*
- * Workqueues should be brought down after normal priority CPU notifiers.
- * This will be registered as low priority CPU notifier.
- */
-static int workqueue_cpu_down_callback(struct notifier_block *nfb,
-                                                unsigned long action,
-                                                void *hcpu)
+static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
+                            const char *buf, size_t count)
 {
-       int cpu = (unsigned long)hcpu;
-       struct work_struct unbind_work;
-       struct workqueue_struct *wq;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_DOWN_PREPARE:
-               /* unbinding per-cpu workers should happen on the local CPU */
-               INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
-               queue_work_on(cpu, system_highpri_wq, &unbind_work);
+       struct workqueue_struct *wq = dev_to_wq(dev);
+       struct workqueue_attrs *attrs;
+       int v, ret;
 
-               /* update NUMA affinity of unbound workqueues */
-               mutex_lock(&wq_pool_mutex);
-               list_for_each_entry(wq, &workqueues, list)
-                       wq_update_unbound_numa(wq, cpu, false);
-               mutex_unlock(&wq_pool_mutex);
+       attrs = wq_sysfs_prep_attrs(wq);
+       if (!attrs)
+               return -ENOMEM;
 
-               /* wait for per-cpu unbinding to finish */
-               flush_work(&unbind_work);
-               destroy_work_on_stack(&unbind_work);
-               break;
+       ret = -EINVAL;
+       if (sscanf(buf, "%d", &v) == 1) {
+               attrs->no_numa = !v;
+               ret = apply_workqueue_attrs(wq, attrs);
        }
-       return NOTIFY_OK;
+
+       free_workqueue_attrs(attrs);
+       return ret ?: count;
 }
 
-#ifdef CONFIG_SMP
+static struct device_attribute wq_sysfs_unbound_attrs[] = {
+       __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
+       __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
+       __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
+       __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
+       __ATTR_NULL,
+};
 
-struct work_for_cpu {
-       struct work_struct work;
-       long (*fn)(void *);
-       void *arg;
-       long ret;
+static struct bus_type wq_subsys = {
+       .name                           = "workqueue",
+       .dev_groups                     = wq_sysfs_groups,
 };
 
-static void work_for_cpu_fn(struct work_struct *work)
+static int __init wq_sysfs_init(void)
 {
-       struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
-
-       wfc->ret = wfc->fn(wfc->arg);
+       return subsys_virtual_register(&wq_subsys, NULL);
 }
+core_initcall(wq_sysfs_init);
 
-/**
- * work_on_cpu - run a function in user context on a particular cpu
- * @cpu: the cpu to run on
- * @fn: the function to run
- * @arg: the function arg
- *
- * It is up to the caller to ensure that the cpu doesn't go offline.
- * The caller must not hold any locks which would prevent @fn from completing.
- *
- * Return: The value @fn returns.
- */
-long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
+static void wq_device_release(struct device *dev)
 {
-       struct work_for_cpu wfc = { .fn = fn, .arg = arg };
+       struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
 
-       INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
-       schedule_work_on(cpu, &wfc.work);
-       flush_work(&wfc.work);
-       destroy_work_on_stack(&wfc.work);
-       return wfc.ret;
+       kfree(wq_dev);
 }
-EXPORT_SYMBOL_GPL(work_on_cpu);
-#endif /* CONFIG_SMP */
-
-#ifdef CONFIG_FREEZER
 
 /**
- * freeze_workqueues_begin - begin freezing workqueues
+ * workqueue_sysfs_register - make a workqueue visible in sysfs
+ * @wq: the workqueue to register
  *
- * Start freezing workqueues.  After this function returns, all freezable
- * workqueues will queue new works to their delayed_works list instead of
- * pool->worklist.
+ * Expose @wq in sysfs under /sys/bus/workqueue/devices.
+ * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
+ * which is the preferred method.
  *
- * CONTEXT:
- * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
+ * Workqueue user should use this function directly iff it wants to apply
+ * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
+ * apply_workqueue_attrs() may race against userland updating the
+ * attributes.
+ *
+ * Return: 0 on success, -errno on failure.
  */
-void freeze_workqueues_begin(void)
+int workqueue_sysfs_register(struct workqueue_struct *wq)
 {
-       struct workqueue_struct *wq;
-       struct pool_workqueue *pwq;
-
-       mutex_lock(&wq_pool_mutex);
+       struct wq_device *wq_dev;
+       int ret;
 
-       WARN_ON_ONCE(workqueue_freezing);
-       workqueue_freezing = true;
+       /*
+        * Adjusting max_active or creating new pwqs by applying
+        * attributes breaks ordering guarantee.  Disallow exposing ordered
+        * workqueues.
+        */
+       if (WARN_ON(wq->flags & __WQ_ORDERED))
+               return -EINVAL;
 
-       list_for_each_entry(wq, &workqueues, list) {
-               mutex_lock(&wq->mutex);
-               for_each_pwq(pwq, wq)
-                       pwq_adjust_max_active(pwq);
-               mutex_unlock(&wq->mutex);
-       }
+       wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
+       if (!wq_dev)
+               return -ENOMEM;
 
-       mutex_unlock(&wq_pool_mutex);
-}
+       wq_dev->wq = wq;
+       wq_dev->dev.bus = &wq_subsys;
+       wq_dev->dev.init_name = wq->name;
+       wq_dev->dev.release = wq_device_release;
 
-/**
- * freeze_workqueues_busy - are freezable workqueues still busy?
- *
- * Check whether freezing is complete.  This function must be called
- * between freeze_workqueues_begin() and thaw_workqueues().
- *
- * CONTEXT:
- * Grabs and releases wq_pool_mutex.
- *
- * Return:
- * %true if some freezable workqueues are still busy.  %false if freezing
- * is complete.
- */
-bool freeze_workqueues_busy(void)
-{
-       bool busy = false;
-       struct workqueue_struct *wq;
-       struct pool_workqueue *pwq;
+       /*
+        * unbound_attrs are created separately.  Suppress uevent until
+        * everything is ready.
+        */
+       dev_set_uevent_suppress(&wq_dev->dev, true);
 
-       mutex_lock(&wq_pool_mutex);
+       ret = device_register(&wq_dev->dev);
+       if (ret) {
+               put_device(&wq_dev->dev);       /* never kfree() after device_register(); ->release() frees wq_dev */
+               wq->wq_dev = NULL;
+               return ret;
+       }
 
-       WARN_ON_ONCE(!workqueue_freezing);
+       if (wq->flags & WQ_UNBOUND) {
+               struct device_attribute *attr;
 
-       list_for_each_entry(wq, &workqueues, list) {
-               if (!(wq->flags & WQ_FREEZABLE))
-                       continue;
-               /*
-                * nr_active is monotonically decreasing.  It's safe
-                * to peek without lock.
-                */
-               rcu_read_lock_sched();
-               for_each_pwq(pwq, wq) {
-                       WARN_ON_ONCE(pwq->nr_active < 0);
-                       if (pwq->nr_active) {
-                               busy = true;
-                               rcu_read_unlock_sched();
-                               goto out_unlock;
+               for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
+                       ret = device_create_file(&wq_dev->dev, attr);
+                       if (ret) {
+                               device_unregister(&wq_dev->dev);
+                               wq->wq_dev = NULL;
+                               return ret;
                        }
                }
-               rcu_read_unlock_sched();
        }
-out_unlock:
-       mutex_unlock(&wq_pool_mutex);
-       return busy;
+
+       dev_set_uevent_suppress(&wq_dev->dev, false);
+       kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
+       return 0;
 }
 
 /**
- * thaw_workqueues - thaw workqueues
- *
- * Thaw workqueues.  Normal queueing is restored and all collected
- * frozen works are transferred to their respective pool worklists.
+ * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
+ * @wq: the workqueue to unregister
  *
- * CONTEXT:
- * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
+ * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
  */
-void thaw_workqueues(void)
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
 {
-       struct workqueue_struct *wq;
-       struct pool_workqueue *pwq;
-
-       mutex_lock(&wq_pool_mutex);
-
-       if (!workqueue_freezing)
-               goto out_unlock;
-
-       workqueue_freezing = false;
+       struct wq_device *wq_dev = wq->wq_dev;
 
-       /* restore max_active and repopulate worklist */
-       list_for_each_entry(wq, &workqueues, list) {
-               mutex_lock(&wq->mutex);
-               for_each_pwq(pwq, wq)
-                       pwq_adjust_max_active(pwq);
-               mutex_unlock(&wq->mutex);
-       }
+       if (!wq->wq_dev)
+               return;
 
-out_unlock:
-       mutex_unlock(&wq_pool_mutex);
+       wq->wq_dev = NULL;
+       device_unregister(&wq_dev->dev);
 }
-#endif /* CONFIG_FREEZER */
+#else  /* CONFIG_SYSFS */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq)    { }
+#endif /* CONFIG_SYSFS */
 
 static void __init wq_numa_init(void)
 {