Merge branch 'for-3.19/core' of git://git.kernel.dk/linux-block
author     Linus Torvalds <torvalds@linux-foundation.org>  Sat, 13 Dec 2014 22:14:23 +0000 (14:14 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>  Sat, 13 Dec 2014 22:14:23 +0000 (14:14 -0800)
Pull block driver core update from Jens Axboe:
 "This is the pull request for the core block IO changes for 3.19.  Not
  a huge round this time, mostly lots of little good fixes:

   - Fix a bug in sysfs blktrace interface causing a NULL pointer
     dereference, when enabled/disabled through that API.  From Arianna
     Avanzini.

   - Various updates/fixes/improvements for blk-mq:

        - A set of updates from Bart, mostly fixing bugs in the tag
          handling.

        - Cleanup/code consolidation from Christoph.

        - Extend queue_rq API to be able to handle batching issues of IO
          requests. NVMe will utilize this shortly. From me.

        - A few tag and request handling updates from me.

        - Cleanup of the preempt handling for running queues from Paolo.

        - Prevent running of unmapped hardware queues from Ming Lei.

        - Move the kdump memory limiting check to be in the correct
          location, from Shaohua.

        - Initialize all software queues at init time from Takashi. This
          prevents a kobject warning when CPUs are brought online that
          weren't online when a queue was registered.

   - Single writeback fix for I_DIRTY clearing from Tejun.  Queued with
     the core IO changes, since it's just a single fix.

   - Version X of the __bio_add_page() segment addition retry from
     Maurizio.  Hope the Xth time is the charm.

   - Documentation fixup for IO scheduler merging from Jan.

   - Introduce (and use) generic IO stat accounting helpers for non-rq
     drivers, from Gu Zheng.  A rough usage sketch of these helpers
     follows the commit list below.

   - Kill off artificial limiting of max sectors in a request from
     Christoph"

* 'for-3.19/core' of git://git.kernel.dk/linux-block: (26 commits)
  bio: modify __bio_add_page() to accept pages that don't start a new segment
  blk-mq: Fix uninitialized kobject at CPU hotplugging
  blktrace: don't let the sysfs interface remove trace from running list
  blk-mq: Use all available hardware queues
  blk-mq: Micro-optimize bt_get()
  blk-mq: Fix a race between bt_clear_tag() and bt_get()
  blk-mq: Avoid that __bt_get_word() wraps multiple times
  blk-mq: Fix a use-after-free
  blk-mq: prevent unmapped hw queue from being scheduled
  blk-mq: re-check for available tags after running the hardware queue
  blk-mq: fix hang in bt_get()
  blk-mq: move the kdump check to blk_mq_alloc_tag_set
  blk-mq: cleanup tag free handling
  blk-mq: use 'nr_cpu_ids' as highest CPU ID count for hwq <-> cpu map
  blk: introduce generic io stat accounting help function
  blk-mq: handle the single queue case in blk_mq_hctx_next_cpu
  genhd: check for int overflow in disk_expand_part_tbl()
  blk-mq: add blk_mq_free_hctx_request()
  blk-mq: export blk_mq_free_request()
  blk-mq: use get_cpu/put_cpu instead of preempt_disable/preempt_enable
  ...
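
The "generic io stat accounting" helpers referenced above are aimed at
bio-based (non-request) drivers that previously open-coded their own
part_stat bookkeeping.  The sketch below shows the intended call pattern;
it assumes the helpers landed as generic_start_io_acct() and
generic_end_io_acct() with roughly these signatures, and my_disk /
my_make_request() are illustrative names only, not part of the series.

/* Rough sketch only -- assumes <linux/bio.h>/<linux/genhd.h> and the
 * helpers from "blk: introduce generic io stat accounting help function". */
static void my_make_request(struct request_queue *q, struct bio *bio)
{
	struct hd_struct *part = &my_disk->part0;	/* illustrative */
	unsigned long start = jiffies;
	int rw = bio_data_dir(bio);

	generic_start_io_acct(rw, bio_sectors(bio), part);

	/* ... service the bio ... */

	generic_end_io_acct(rw, part, start);
	bio_endio(bio, 0);
}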

Documentation/block/biodoc.txt
block/blk-core.c
block/blk-mq-tag.c
block/blk-mq.c
drivers/block/null_blk.c
drivers/block/virtio_blk.c
drivers/scsi/scsi_lib.c
include/linux/blk-mq.h
include/linux/blkdev.h
kernel/trace/blktrace.c

diff --combined Documentation/block/biodoc.txt
index 6b972b287795b73d84fa7a390d34aaec821163e1,f1323c6b7ed275c25fae0154e22a644377ae4d95..5aabc08de811d49ba898d550a0c2ff98ab3964a5
@@@ -827,6 -827,10 +827,6 @@@ but in the event of any barrier request
  that requests are restarted in the order they were queue. This may happen
  if the driver needs to use blk_queue_invalidate_tags().
  
 -Tagging also defines a new request flag, REQ_QUEUED. This is set whenever
 -a request is currently tagged. You should not use this flag directly,
 -blk_rq_tagged(rq) is the portable way to do so.
 -
  3.3 I/O Submission
  
  The routine submit_bio() is used to submit a single io. Higher level i/o
@@@ -942,7 -946,11 +942,11 @@@ elevator_allow_merge_fn          called wheneve
                                request safely. The io scheduler may still
                                want to stop a merge at this point if it
                                results in some sort of conflict internally,
-                               this hook allows it to do that.
+                               this hook allows it to do that. Note however
+                               that two *requests* can still be merged at later
+                               time. Currently the io scheduler has no way to
+                               prevent that. It can only learn about the fact
+                               from elevator_merge_req_fn callback.
  
  elevator_dispatch_fn*         fills the dispatch queue with ready requests.
                                I/O schedulers are free to postpone requests by
diff --combined block/blk-core.c
index ea1c4d0d7a44ea7e353b1edb709883651f4f1109,93f9152fc2718299e7bb01f9110f8990cc760f48..30f6153a40c27c7154dd453301807c7d39d1451c
@@@ -525,6 -525,9 +525,9 @@@ void blk_cleanup_queue(struct request_q
        del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
        blk_sync_queue(q);
  
+       if (q->mq_ops)
+               blk_mq_free_queue(q);
        spin_lock_irq(lock);
        if (q->queue_lock != &q->__queue_lock)
                q->queue_lock = &q->__queue_lock;
@@@ -1266,7 -1269,7 +1269,7 @@@ void blk_requeue_request(struct request
        blk_clear_rq_complete(rq);
        trace_block_rq_requeue(q, rq);
  
 -      if (blk_rq_tagged(rq))
 +      if (rq->cmd_flags & REQ_QUEUED)
                blk_queue_end_tag(q, rq);
  
        BUG_ON(blk_queued_rq(rq));
@@@ -1325,7 -1328,7 +1328,7 @@@ void part_round_stats(int cpu, struct h
  }
  EXPORT_SYMBOL_GPL(part_round_stats);
  
 -#ifdef CONFIG_PM_RUNTIME
 +#ifdef CONFIG_PM
  static void blk_pm_put_request(struct request *rq)
  {
        if (rq->q->dev && !(rq->cmd_flags & REQ_PM) && !--rq->q->nr_pending)
@@@ -2134,7 -2137,7 +2137,7 @@@ void blk_account_io_done(struct reques
        }
  }
  
 -#ifdef CONFIG_PM_RUNTIME
 +#ifdef CONFIG_PM
  /*
   * Don't process normal requests when queue is suspended
   * or in the process of suspending/resuming
@@@ -2554,7 -2557,7 +2557,7 @@@ EXPORT_SYMBOL_GPL(blk_unprep_request)
   */
  void blk_finish_request(struct request *req, int error)
  {
 -      if (blk_rq_tagged(req))
 +      if (req->cmd_flags & REQ_QUEUED)
                blk_queue_end_tag(req->q, req);
  
        BUG_ON(blk_queued_rq(req));
@@@ -3159,7 -3162,7 +3162,7 @@@ void blk_finish_plug(struct blk_plug *p
  }
  EXPORT_SYMBOL(blk_finish_plug);
  
 -#ifdef CONFIG_PM_RUNTIME
 +#ifdef CONFIG_PM
  /**
   * blk_pm_runtime_init - Block layer runtime PM initialization routine
   * @q: the queue of the device
diff --combined block/blk-mq-tag.c
index 728b9a4d5f561a45d037e0b0ee71f5648b2c7942,1b7229f9354a4a1043f0e2e375af7a3c98b40ac6..e3d4e4043b496199d9f1eda8d83318cd1f9b0fe9
@@@ -137,6 -137,7 +137,7 @@@ static inline bool hctx_may_queue(struc
  static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
  {
        int tag, org_last_tag, end;
+       bool wrap = last_tag != 0;
  
        org_last_tag = last_tag;
        end = bm->depth;
@@@ -148,15 -149,16 +149,16 @@@ restart
                         * We started with an offset, start from 0 to
                         * exhaust the map.
                         */
-                       if (org_last_tag && last_tag) {
-                               end = last_tag;
+                       if (wrap) {
+                               wrap = false;
+                               end = org_last_tag;
                                last_tag = 0;
                                goto restart;
                        }
                        return -1;
                }
                last_tag = tag + 1;
-       } while (test_and_set_bit_lock(tag, &bm->word));
+       } while (test_and_set_bit(tag, &bm->word));
  
        return tag;
  }
@@@ -246,14 -248,29 +248,29 @@@ static int bt_get(struct blk_mq_alloc_d
        if (!(data->gfp & __GFP_WAIT))
                return -1;
  
-       bs = bt_wait_ptr(bt, hctx);
        do {
+               bs = bt_wait_ptr(bt, hctx);
                prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
  
                tag = __bt_get(hctx, bt, last_tag);
                if (tag != -1)
                        break;
  
+               /*
+                * We're out of tags on this hardware queue, kick any
+                * pending IO submits before going to sleep waiting for
+                * some to complete.
+                */
+               blk_mq_run_hw_queue(hctx, false);
+               /*
+                * Retry tag allocation after running the hardware queue,
+                * as running the queue may also have found completions.
+                */
+               tag = __bt_get(hctx, bt, last_tag);
+               if (tag != -1)
+                       break;
                blk_mq_put_ctx(data->ctx);
  
                io_schedule();
                        hctx = data->hctx;
                        bt = &hctx->tags->bitmap_tags;
                }
-               finish_wait(&bs->wait, &wait);
-               bs = bt_wait_ptr(bt, hctx);
        } while (1);
  
        finish_wait(&bs->wait, &wait);
@@@ -340,11 -355,10 +355,10 @@@ static void bt_clear_tag(struct blk_mq_
        struct bt_wait_state *bs;
        int wait_cnt;
  
-       /*
-        * The unlock memory barrier need to order access to req in free
-        * path and clearing tag bit
-        */
-       clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word);
+       clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word);
+       /* Ensure that the wait list checks occur after clear_bit(). */
+       smp_mb();
  
        bs = bt_wake_ptr(bt);
        if (!bs)
        }
  }
  
- static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
- {
-       BUG_ON(tag >= tags->nr_tags);
-       bt_clear_tag(&tags->bitmap_tags, tag);
- }
- static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
-                                     unsigned int tag)
- {
-       BUG_ON(tag >= tags->nr_reserved_tags);
-       bt_clear_tag(&tags->breserved_tags, tag);
- }
  void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
                    unsigned int *last_tag)
  {
        if (tag >= tags->nr_reserved_tags) {
                const int real_tag = tag - tags->nr_reserved_tags;
  
-               __blk_mq_put_tag(tags, real_tag);
+               BUG_ON(real_tag >= tags->nr_tags);
+               bt_clear_tag(&tags->bitmap_tags, real_tag);
                *last_tag = real_tag;
-       } else
-               __blk_mq_put_reserved_tag(tags, tag);
+       } else {
+               BUG_ON(tag >= tags->nr_reserved_tags);
+               bt_clear_tag(&tags->breserved_tags, tag);
+       }
  }
  
  static void bt_for_each(struct blk_mq_hw_ctx *hctx,
@@@ -584,34 -586,6 +586,34 @@@ int blk_mq_tag_update_depth(struct blk_
        return 0;
  }
  
 +/**
 + * blk_mq_unique_tag() - return a tag that is unique queue-wide
 + * @rq: request for which to compute a unique tag
 + *
 + * The tag field in struct request is unique per hardware queue but not over
 + * all hardware queues. Hence this function that returns a tag with the
 + * hardware context index in the upper bits and the per hardware queue tag in
 + * the lower bits.
 + *
 + * Note: When called for a request that is queued on a non-multiqueue request
 + * queue, the hardware context index is set to zero.
 + */
 +u32 blk_mq_unique_tag(struct request *rq)
 +{
 +      struct request_queue *q = rq->q;
 +      struct blk_mq_hw_ctx *hctx;
 +      int hwq = 0;
 +
 +      if (q->mq_ops) {
 +              hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
 +              hwq = hctx->queue_num;
 +      }
 +
 +      return (hwq << BLK_MQ_UNIQUE_TAG_BITS) |
 +              (rq->tag & BLK_MQ_UNIQUE_TAG_MASK);
 +}
 +EXPORT_SYMBOL(blk_mq_unique_tag);
 +
  ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
  {
        char *orig_page = page;
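
The blk_mq_unique_tag() helper added above pairs with the
blk_mq_unique_tag_to_hwq()/blk_mq_unique_tag_to_tag() decode helpers that
appear later in this diff in include/linux/blk-mq.h.  A minimal sketch of
the round trip -- the my_* functions are illustrative, not part of the
series:

/* Sketch: encode a queue-wide tag at submission, split it back into
 * hardware-queue index and per-queue tag at completion. */
static void my_submit(struct request *rq)
{
	u32 unique = blk_mq_unique_tag(rq);

	/* stash 'unique' in the hardware command descriptor ... */
}

static void my_complete(u32 unique)
{
	u16 hwq = blk_mq_unique_tag_to_hwq(unique);
	u16 tag = blk_mq_unique_tag_to_tag(unique);

	/* use 'hwq' to pick the right blk_mq_tags, then 'tag' with
	 * blk_mq_tag_to_rq() to recover the request ... */
}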
diff --combined block/blk-mq.c
index 92ceef0d2ab932a58526f721dec08811a310613e,b21a3b6f7b659b06ce9644746823ae66b807dcd3..da1ab5641227b670faac42a84fde7e223668a4d8
@@@ -107,7 -107,11 +107,7 @@@ static void blk_mq_usage_counter_releas
        wake_up_all(&q->mq_freeze_wq);
  }
  
 -/*
 - * Guarantee no request is in use, so we can change any data structure of
 - * the queue afterward.
 - */
 -void blk_mq_freeze_queue(struct request_queue *q)
 +static void blk_mq_freeze_queue_start(struct request_queue *q)
  {
        bool freeze;
  
                percpu_ref_kill(&q->mq_usage_counter);
                blk_mq_run_queues(q, false);
        }
 +}
 +
 +static void blk_mq_freeze_queue_wait(struct request_queue *q)
 +{
        wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
  }
  
 +/*
 + * Guarantee no request is in use, so we can change any data structure of
 + * the queue afterward.
 + */
 +void blk_mq_freeze_queue(struct request_queue *q)
 +{
 +      blk_mq_freeze_queue_start(q);
 +      blk_mq_freeze_queue_wait(q);
 +}
 +
  static void blk_mq_unfreeze_queue(struct request_queue *q)
  {
        bool wake;
@@@ -279,17 -269,25 +279,25 @@@ static void __blk_mq_free_request(struc
        blk_mq_queue_exit(q);
  }
  
- void blk_mq_free_request(struct request *rq)
+ void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
  {
        struct blk_mq_ctx *ctx = rq->mq_ctx;
-       struct blk_mq_hw_ctx *hctx;
-       struct request_queue *q = rq->q;
  
        ctx->rq_completed[rq_is_sync(rq)]++;
-       hctx = q->mq_ops->map_queue(q, ctx->cpu);
        __blk_mq_free_request(hctx, ctx, rq);
+ }
+ EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
+ void blk_mq_free_request(struct request *rq)
+ {
+       struct blk_mq_hw_ctx *hctx;
+       struct request_queue *q = rq->q;
+       hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
+       blk_mq_free_hctx_request(hctx, rq);
  }
+ EXPORT_SYMBOL_GPL(blk_mq_free_request);
  
  inline void __blk_mq_end_request(struct request *rq, int error)
  {
@@@ -591,7 -589,7 +599,7 @@@ static void blk_mq_rq_timer(unsigned lo
                 * If not software queues are currently mapped to this
                 * hardware queue, there's nothing to check
                 */
-               if (!hctx->nr_ctx || !hctx->tags)
+               if (!blk_mq_hw_queue_mapped(hctx))
                        continue;
  
                blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
@@@ -690,6 -688,8 +698,8 @@@ static void __blk_mq_run_hw_queue(struc
        struct request_queue *q = hctx->queue;
        struct request *rq;
        LIST_HEAD(rq_list);
+       LIST_HEAD(driver_list);
+       struct list_head *dptr;
        int queued;
  
        WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
                spin_unlock(&hctx->lock);
        }
  
+       /*
+        * Start off with dptr being NULL, so we start the first request
+        * immediately, even if we have more pending.
+        */
+       dptr = NULL;
        /*
         * Now process all the entries, sending them to the driver.
         */
        queued = 0;
        while (!list_empty(&rq_list)) {
+               struct blk_mq_queue_data bd;
                int ret;
  
                rq = list_first_entry(&rq_list, struct request, queuelist);
                list_del_init(&rq->queuelist);
  
-               ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list));
+               bd.rq = rq;
+               bd.list = dptr;
+               bd.last = list_empty(&rq_list);
+               ret = q->mq_ops->queue_rq(hctx, &bd);
                switch (ret) {
                case BLK_MQ_RQ_QUEUE_OK:
                        queued++;
  
                if (ret == BLK_MQ_RQ_QUEUE_BUSY)
                        break;
+               /*
+                * We've done the first request. If we have more than 1
+                * left in the list, set dptr to defer issue.
+                */
+               if (!dptr && rq_list.next != rq_list.prev)
+                       dptr = &driver_list;
        }
  
        if (!queued)
   */
  static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
  {
-       int cpu = hctx->next_cpu;
+       if (hctx->queue->nr_hw_queues == 1)
+               return WORK_CPU_UNBOUND;
  
        if (--hctx->next_cpu_batch <= 0) {
-               int next_cpu;
+               int cpu = hctx->next_cpu, next_cpu;
  
                next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
                if (next_cpu >= nr_cpu_ids)
  
                hctx->next_cpu = next_cpu;
                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+               return cpu;
        }
  
-       return cpu;
+       return hctx->next_cpu;
  }
  
  void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
  {
-       if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
+       if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
+           !blk_mq_hw_queue_mapped(hctx)))
                return;
  
-       if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
-               __blk_mq_run_hw_queue(hctx);
-       else if (hctx->queue->nr_hw_queues == 1)
-               kblockd_schedule_delayed_work(&hctx->run_work, 0);
-       else {
-               unsigned int cpu;
+       if (!async) {
+               int cpu = get_cpu();
+               if (cpumask_test_cpu(cpu, hctx->cpumask)) {
+                       __blk_mq_run_hw_queue(hctx);
+                       put_cpu();
+                       return;
+               }
  
-               cpu = blk_mq_hctx_next_cpu(hctx);
-               kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
+               put_cpu();
        }
+       kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+                       &hctx->run_work, 0);
  }
  
  void blk_mq_run_queues(struct request_queue *q, bool async)
                    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
                        continue;
  
-               preempt_disable();
                blk_mq_run_hw_queue(hctx, async);
-               preempt_enable();
        }
  }
  EXPORT_SYMBOL(blk_mq_run_queues);
@@@ -843,9 -866,7 +876,7 @@@ void blk_mq_start_hw_queue(struct blk_m
  {
        clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
  
-       preempt_disable();
        blk_mq_run_hw_queue(hctx, false);
-       preempt_enable();
  }
  EXPORT_SYMBOL(blk_mq_start_hw_queue);
  
@@@ -870,9 -891,7 +901,7 @@@ void blk_mq_start_stopped_hw_queues(str
                        continue;
  
                clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
-               preempt_disable();
                blk_mq_run_hw_queue(hctx, async);
-               preempt_enable();
        }
  }
  EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
@@@ -898,16 -917,11 +927,11 @@@ static void blk_mq_delay_work_fn(struc
  
  void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
  {
-       unsigned long tmo = msecs_to_jiffies(msecs);
-       if (hctx->queue->nr_hw_queues == 1)
-               kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
-       else {
-               unsigned int cpu;
+       if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
+               return;
  
-               cpu = blk_mq_hctx_next_cpu(hctx);
-               kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
-       }
+       kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+                       &hctx->delay_work, msecs_to_jiffies(msecs));
  }
  EXPORT_SYMBOL(blk_mq_delay_queue);
  
@@@ -1162,7 -1176,17 +1186,17 @@@ static void blk_mq_make_request(struct 
                goto run_queue;
        }
  
-       if (is_sync) {
+       /*
+        * If the driver supports defer issued based on 'last', then
+        * queue it up like normal since we can potentially save some
+        * CPU this way.
+        */
+       if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
+               struct blk_mq_queue_data bd = {
+                       .rq = rq,
+                       .list = NULL,
+                       .last = 1
+               };
                int ret;
  
                blk_mq_bio_to_request(rq, bio);
                 * error (busy), just add it to our list as we previously
                 * would have done
                 */
-               ret = q->mq_ops->queue_rq(data.hctx, rq, true);
+               ret = q->mq_ops->queue_rq(data.hctx, &bd);
                if (ret == BLK_MQ_RQ_QUEUE_OK)
                        goto done;
                else {
@@@ -1784,16 -1808,6 +1818,6 @@@ struct request_queue *blk_mq_init_queue
        if (!ctx)
                return ERR_PTR(-ENOMEM);
  
-       /*
-        * If a crashdump is active, then we are potentially in a very
-        * memory constrained environment. Limit us to 1 queue and
-        * 64 tags to prevent using too much memory.
-        */
-       if (is_kdump_kernel()) {
-               set->nr_hw_queues = 1;
-               set->queue_depth = min(64U, set->queue_depth);
-       }
        hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
                        set->numa_node);
  
@@@ -1931,7 -1945,7 +1955,7 @@@ void blk_mq_free_queue(struct request_q
  /* Basically redo blk_mq_init_queue with queue frozen */
  static void blk_mq_queue_reinit(struct request_queue *q)
  {
 -      blk_mq_freeze_queue(q);
 +      WARN_ON_ONCE(!q->mq_freeze_depth);
  
        blk_mq_sysfs_unregister(q);
  
        blk_mq_map_swqueue(q);
  
        blk_mq_sysfs_register(q);
 -
 -      blk_mq_unfreeze_queue(q);
  }
  
  static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
                return NOTIFY_OK;
  
        mutex_lock(&all_q_mutex);
 +
 +      /*
 +       * We need to freeze and reinit all existing queues.  Freezing
 +       * involves synchronous wait for an RCU grace period and doing it
 +       * one by one may take a long time.  Start freezing all queues in
 +       * one swoop and then wait for the completions so that freezing can
 +       * take place in parallel.
 +       */
 +      list_for_each_entry(q, &all_q_list, all_q_node)
 +              blk_mq_freeze_queue_start(q);
 +      list_for_each_entry(q, &all_q_list, all_q_node)
 +              blk_mq_freeze_queue_wait(q);
 +
        list_for_each_entry(q, &all_q_list, all_q_node)
                blk_mq_queue_reinit(q);
 +
 +      list_for_each_entry(q, &all_q_list, all_q_node)
 +              blk_mq_unfreeze_queue(q);
 +
        mutex_unlock(&all_q_mutex);
        return NOTIFY_OK;
  }
@@@ -2049,8 -2048,6 +2073,8 @@@ static int blk_mq_alloc_rq_maps(struct 
   */
  int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
  {
 +      BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
 +
        if (!set->nr_hw_queues)
                return -EINVAL;
        if (!set->queue_depth)
                set->queue_depth = BLK_MQ_MAX_DEPTH;
        }
  
+       /*
+        * If a crashdump is active, then we are potentially in a very
+        * memory constrained environment. Limit us to 1 queue and
+        * 64 tags to prevent using too much memory.
+        */
+       if (is_kdump_kernel()) {
+               set->nr_hw_queues = 1;
+               set->queue_depth = min(64U, set->queue_depth);
+       }
        set->tags = kmalloc_node(set->nr_hw_queues *
                                 sizeof(struct blk_mq_tags *),
                                 GFP_KERNEL, set->numa_node);
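
The driver_list/dptr handling above, together with the new
BLK_MQ_F_DEFER_ISSUE flag, is what the "batching issues of IO requests"
item in the pull message refers to: bd->last is false while the core still
has requests queued behind the current one, and bd->list becomes non-NULL
once more than one request is left to dispatch.  No driver in this merge
sets the flag yet (NVMe is the stated target), so the following is only a
hypothetical sketch of how a driver might use bd->last to defer its
doorbell write to the end of a burst; my_dev_enqueue() and
my_dev_ring_doorbell() are invented names, and the same bd->last pattern
can be seen in the virtio_blk conversion further down.

/* Hypothetical sketch, not an in-tree driver.  struct my_dev and the
 * my_dev_* calls are invented stand-ins for real hardware access. */
struct my_dev;
int my_dev_enqueue(struct my_dev *dev, struct request *rq);
void my_dev_ring_doorbell(struct my_dev *dev);

static int my_queue_rq(struct blk_mq_hw_ctx *hctx,
		       const struct blk_mq_queue_data *bd)
{
	struct my_dev *dev = hctx->driver_data;

	blk_mq_start_request(bd->rq);

	if (my_dev_enqueue(dev, bd->rq))	/* e.g. submission queue full */
		return BLK_MQ_RQ_QUEUE_BUSY;

	/*
	 * bd->last is false while __blk_mq_run_hw_queue() still has more
	 * requests pending for this hctx, so the (comparatively expensive)
	 * doorbell write can wait for the final request of the burst.
	 */
	if (bd->last)
		my_dev_ring_doorbell(dev);

	return BLK_MQ_RQ_QUEUE_OK;
}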
diff --combined drivers/block/null_blk.c
index 8001e812018bbbc0c4360dacd6dc71e8029ab068,8433bc8ead3d232fa73debcc6cd9e03693589e30..caa61212fdb57851272a379287fa6f471a0d42d9
@@@ -313,15 -313,15 +313,15 @@@ static void null_request_fn(struct requ
        }
  }
  
- static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq,
-               bool last)
+ static int null_queue_rq(struct blk_mq_hw_ctx *hctx,
+                        const struct blk_mq_queue_data *bd)
  {
-       struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
+       struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
  
-       cmd->rq = rq;
+       cmd->rq = bd->rq;
        cmd->nq = hctx->driver_data;
  
-       blk_mq_start_request(rq);
+       blk_mq_start_request(bd->rq);
  
        null_handle_cmd(cmd);
        return BLK_MQ_RQ_QUEUE_OK;
@@@ -450,10 -450,14 +450,10 @@@ static int init_driver_queues(struct nu
  
                ret = setup_commands(nq);
                if (ret)
 -                      goto err_queue;
 +                      return ret;
                nullb->nr_queues++;
        }
 -
        return 0;
 -err_queue:
 -      cleanup_queues(nullb);
 -      return ret;
  }
  
  static int null_add_dev(void)
                        goto out_cleanup_queues;
                }
                blk_queue_make_request(nullb->q, null_queue_bio);
 -              init_driver_queues(nullb);
 +              rv = init_driver_queues(nullb);
 +              if (rv)
 +                      goto out_cleanup_blk_queue;
        } else {
                nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node);
                if (!nullb->q) {
                }
                blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
                blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
 -              init_driver_queues(nullb);
 +              rv = init_driver_queues(nullb);
 +              if (rv)
 +                      goto out_cleanup_blk_queue;
        }
  
        nullb->q->queuedata = nullb;
diff --combined drivers/block/virtio_blk.c
index 1fb9e09fbbc5d5f15dc3cf612c5685ecb62c0505,cecd3f983e493d7582a53f85b02d5ffef8955a4b..7ef7c098708fc4e482181724d574555bbb9db6d7
@@@ -80,7 -80,7 +80,7 @@@ static int __virtblk_add_req(struct vir
  {
        struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6];
        unsigned int num_out = 0, num_in = 0;
 -      int type = vbr->out_hdr.type & ~VIRTIO_BLK_T_OUT;
 +      __virtio32 type = vbr->out_hdr.type & ~cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT);
  
        sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
        sgs[num_out++] = &hdr;
         * block, and before the normal inhdr we put the sense data and the
         * inhdr with additional status information.
         */
 -      if (type == VIRTIO_BLK_T_SCSI_CMD) {
 +      if (type == cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_SCSI_CMD)) {
                sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len);
                sgs[num_out++] = &cmd;
        }
  
        if (have_data) {
 -              if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT)
 +              if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT))
                        sgs[num_out++] = data_sg;
                else
                        sgs[num_out + num_in++] = data_sg;
        }
  
 -      if (type == VIRTIO_BLK_T_SCSI_CMD) {
 +      if (type == cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_SCSI_CMD)) {
                sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
                sgs[num_out + num_in++] = &sense;
                sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr));
  static inline void virtblk_request_done(struct request *req)
  {
        struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
 +      struct virtio_blk *vblk = req->q->queuedata;
        int error = virtblk_result(vbr);
  
        if (req->cmd_type == REQ_TYPE_BLOCK_PC) {
 -              req->resid_len = vbr->in_hdr.residual;
 -              req->sense_len = vbr->in_hdr.sense_len;
 -              req->errors = vbr->in_hdr.errors;
 +              req->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual);
 +              req->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len);
 +              req->errors = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors);
        } else if (req->cmd_type == REQ_TYPE_SPECIAL) {
                req->errors = (error != 0);
        }
@@@ -159,10 -158,11 +159,11 @@@ static void virtblk_done(struct virtque
        spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
  }
  
- static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
-               bool last)
+ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
+                          const struct blk_mq_queue_data *bd)
  {
        struct virtio_blk *vblk = hctx->queue->queuedata;
+       struct request *req = bd->rq;
        struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
        unsigned long flags;
        unsigned int num;
  
        vbr->req = req;
        if (req->cmd_flags & REQ_FLUSH) {
 -              vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
 +              vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_FLUSH);
                vbr->out_hdr.sector = 0;
 -              vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
 +              vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req));
        } else {
                switch (req->cmd_type) {
                case REQ_TYPE_FS:
                        vbr->out_hdr.type = 0;
 -                      vbr->out_hdr.sector = blk_rq_pos(vbr->req);
 -                      vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
 +                      vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, blk_rq_pos(vbr->req));
 +                      vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req));
                        break;
                case REQ_TYPE_BLOCK_PC:
 -                      vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD;
 +                      vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_SCSI_CMD);
                        vbr->out_hdr.sector = 0;
 -                      vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
 +                      vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req));
                        break;
                case REQ_TYPE_SPECIAL:
 -                      vbr->out_hdr.type = VIRTIO_BLK_T_GET_ID;
 +                      vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID);
                        vbr->out_hdr.sector = 0;
 -                      vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
 +                      vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req));
                        break;
                default:
                        /* We don't put anything else in the queue. */
        num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg);
        if (num) {
                if (rq_data_dir(vbr->req) == WRITE)
 -                      vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
 +                      vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT);
                else
 -                      vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
 +                      vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN);
        }
  
        spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
                return BLK_MQ_RQ_QUEUE_ERROR;
        }
  
-       if (last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
+       if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
                notify = true;
        spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
  
@@@ -332,8 -332,7 +333,8 @@@ static ssize_t virtblk_serial_show(stru
  
        return err;
  }
 -DEVICE_ATTR(serial, S_IRUGO, virtblk_serial_show, NULL);
 +
 +static DEVICE_ATTR(serial, S_IRUGO, virtblk_serial_show, NULL);
  
  static void virtblk_config_changed_work(struct work_struct *work)
  {
@@@ -478,8 -477,7 +479,8 @@@ static int virtblk_get_cache_mode(struc
                                   struct virtio_blk_config, wce,
                                   &writeback);
        if (err)
 -              writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE);
 +              writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE) ||
 +                          virtio_has_feature(vdev, VIRTIO_F_VERSION_1);
  
        return writeback;
  }
@@@ -824,34 -822,25 +825,34 @@@ static const struct virtio_device_id id
        { 0 },
  };
  
 -static unsigned int features[] = {
 +static unsigned int features_legacy[] = {
        VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
        VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
        VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
        VIRTIO_BLK_F_MQ,
 +}
 +;
 +static unsigned int features[] = {
 +      VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
 +      VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
 +      VIRTIO_BLK_F_TOPOLOGY,
 +      VIRTIO_BLK_F_MQ,
  };
  
  static struct virtio_driver virtio_blk = {
 -      .feature_table          = features,
 -      .feature_table_size     = ARRAY_SIZE(features),
 -      .driver.name            = KBUILD_MODNAME,
 -      .driver.owner           = THIS_MODULE,
 -      .id_table               = id_table,
 -      .probe                  = virtblk_probe,
 -      .remove                 = virtblk_remove,
 -      .config_changed         = virtblk_config_changed,
 +      .feature_table                  = features,
 +      .feature_table_size             = ARRAY_SIZE(features),
 +      .feature_table_legacy           = features_legacy,
 +      .feature_table_size_legacy      = ARRAY_SIZE(features_legacy),
 +      .driver.name                    = KBUILD_MODNAME,
 +      .driver.owner                   = THIS_MODULE,
 +      .id_table                       = id_table,
 +      .probe                          = virtblk_probe,
 +      .remove                         = virtblk_remove,
 +      .config_changed                 = virtblk_config_changed,
  #ifdef CONFIG_PM_SLEEP
 -      .freeze                 = virtblk_freeze,
 -      .restore                = virtblk_restore,
 +      .freeze                         = virtblk_freeze,
 +      .restore                        = virtblk_restore,
  #endif
  };
  
@@@ -883,8 -872,8 +884,8 @@@ out_destroy_workqueue
  
  static void __exit fini(void)
  {
 -      unregister_blkdev(major, "virtblk");
        unregister_virtio_driver(&virtio_blk);
 +      unregister_blkdev(major, "virtblk");
        destroy_workqueue(virtblk_wq);
  }
  module_init(init);
diff --combined drivers/scsi/scsi_lib.c
index 7e3d954c9cacc1bdad0881be35a6cb1a2b0d7254,161dcc93ac7554b9a286d852727b1ef46a460fda..43318d556cbcf28209f317e0c3646923324e6127
@@@ -22,7 -22,6 +22,7 @@@
  #include <linux/hardirq.h>
  #include <linux/scatterlist.h>
  #include <linux/blk-mq.h>
 +#include <linux/ratelimit.h>
  
  #include <scsi/scsi.h>
  #include <scsi/scsi_cmnd.h>
@@@ -48,7 -47,7 +48,7 @@@ struct scsi_host_sg_pool 
        mempool_t       *pool;
  };
  
 -#define SP(x) { x, "sgpool-" __stringify(x) }
 +#define SP(x) { .size = x, "sgpool-" __stringify(x) }
  #if (SCSI_MAX_SG_SEGMENTS < 32)
  #error SCSI_MAX_SG_SEGMENTS is too small (must be 32 or greater)
  #endif
@@@ -543,6 -542,17 +543,6 @@@ static void scsi_requeue_command(struc
        put_device(&sdev->sdev_gendev);
  }
  
 -void scsi_next_command(struct scsi_cmnd *cmd)
 -{
 -      struct scsi_device *sdev = cmd->device;
 -      struct request_queue *q = sdev->request_queue;
 -
 -      scsi_put_command(cmd);
 -      scsi_run_queue(q);
 -
 -      put_device(&sdev->sdev_gendev);
 -}
 -
  void scsi_run_host_queues(struct Scsi_Host *shost)
  {
        struct scsi_device *sdev;
@@@ -588,10 -598,10 +588,10 @@@ static void scsi_free_sgtable(struct sc
        __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS, mq, scsi_sg_free);
  }
  
 -static int scsi_alloc_sgtable(struct scsi_data_buffer *sdb, int nents,
 -                            gfp_t gfp_mask, bool mq)
 +static int scsi_alloc_sgtable(struct scsi_data_buffer *sdb, int nents, bool mq)
  {
        struct scatterlist *first_chunk = NULL;
 +      gfp_t gfp_mask = mq ? GFP_NOIO : GFP_ATOMIC;
        int ret;
  
        BUG_ON(!nents);
@@@ -720,6 -730,8 +720,6 @@@ static bool scsi_end_request(struct req
                        kblockd_schedule_work(&sdev->requeue_work);
                else
                        blk_mq_start_stopped_hw_queues(q, true);
 -
 -              put_device(&sdev->sdev_gendev);
        } else {
                unsigned long flags;
  
                spin_unlock_irqrestore(q->queue_lock, flags);
  
                scsi_release_buffers(cmd);
 -              scsi_next_command(cmd);
 +
 +              scsi_put_command(cmd);
 +              scsi_run_queue(q);
        }
  
 +      put_device(&sdev->sdev_gendev);
        return false;
  }
  
@@@ -822,8 -831,8 +822,8 @@@ void scsi_io_completion(struct scsi_cmn
        struct request *req = cmd->request;
        int error = 0;
        struct scsi_sense_hdr sshdr;
 -      int sense_valid = 0;
 -      int sense_deferred = 0;
 +      bool sense_valid = false;
 +      int sense_deferred = 0, level = 0;
        enum {ACTION_FAIL, ACTION_REPREP, ACTION_RETRY,
              ACTION_DELAYED_RETRY} action;
        unsigned long wait_for = (cmd->allowed + 1) * req->timeout;
                if ((sshdr.asc == 0x0) && (sshdr.ascq == 0x1d))
                        ;
                else if (!(req->cmd_flags & REQ_QUIET))
 -                      scsi_print_sense("", cmd);
 +                      scsi_print_sense(cmd);
                result = 0;
                /* BLOCK_PC may have set error */
                error = 0;
        case ACTION_FAIL:
                /* Give up and fail the remainder of the request */
                if (!(req->cmd_flags & REQ_QUIET)) {
 -                      scsi_print_result(cmd);
 -                      if (driver_byte(result) & DRIVER_SENSE)
 -                              scsi_print_sense("", cmd);
 -                      scsi_print_command(cmd);
 +                      static DEFINE_RATELIMIT_STATE(_rs,
 +                                      DEFAULT_RATELIMIT_INTERVAL,
 +                                      DEFAULT_RATELIMIT_BURST);
 +
 +                      if (unlikely(scsi_logging_level))
 +                              level = SCSI_LOG_LEVEL(SCSI_LOG_MLCOMPLETE_SHIFT,
 +                                                     SCSI_LOG_MLCOMPLETE_BITS);
 +
 +                      /*
 +                       * if logging is enabled the failure will be printed
 +                       * in scsi_log_completion(), so avoid duplicate messages
 +                       */
 +                      if (!level && __ratelimit(&_rs)) {
 +                              scsi_print_result(cmd, NULL, FAILED);
 +                              if (driver_byte(result) & DRIVER_SENSE)
 +                                      scsi_print_sense(cmd);
 +                              scsi_print_command(cmd);
 +                      }
                }
                if (!scsi_end_request(req, error, blk_rq_err_bytes(req), 0))
                        return;
        }
  }
  
 -static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
 -                           gfp_t gfp_mask)
 +static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb)
  {
        int count;
  
         * If sg table allocation fails, requeue request later.
         */
        if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments,
 -                                      gfp_mask, req->mq_ctx != NULL)))
 +                                      req->mq_ctx != NULL)))
                return BLKPREP_DEFER;
  
        /* 
   *            BLKPREP_DEFER if the failure is retryable
   *            BLKPREP_KILL if the failure is fatal
   */
 -int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask)
 +int scsi_init_io(struct scsi_cmnd *cmd)
  {
        struct scsi_device *sdev = cmd->device;
        struct request *rq = cmd->request;
  
        BUG_ON(!rq->nr_phys_segments);
  
 -      error = scsi_init_sgtable(rq, &cmd->sdb, gfp_mask);
 +      error = scsi_init_sgtable(rq, &cmd->sdb);
        if (error)
                goto err_exit;
  
                        rq->next_rq->special = bidi_sdb;
                }
  
 -              error = scsi_init_sgtable(rq->next_rq, rq->next_rq->special,
 -                                        GFP_ATOMIC);
 +              error = scsi_init_sgtable(rq->next_rq, rq->next_rq->special);
                if (error)
                        goto err_exit;
        }
                BUG_ON(prot_sdb == NULL);
                ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio);
  
 -              if (scsi_alloc_sgtable(prot_sdb, ivecs, gfp_mask, is_mq)) {
 +              if (scsi_alloc_sgtable(prot_sdb, ivecs, is_mq)) {
                        error = BLKPREP_DEFER;
                        goto err_exit;
                }
@@@ -1216,7 -1213,7 +1216,7 @@@ static int scsi_setup_blk_pc_cmnd(struc
         * submit a request without an attached bio.
         */
        if (req->bio) {
 -              int ret = scsi_init_io(cmd, GFP_ATOMIC);
 +              int ret = scsi_init_io(cmd);
                if (unlikely(ret))
                        return ret;
        } else {
@@@ -1640,87 -1637,6 +1640,87 @@@ static void scsi_softirq_done(struct re
        }
  }
  
 +/**
 + * scsi_dispatch_command - Dispatch a command to the low-level driver.
 + * @cmd: command block we are dispatching.
 + *
 + * Return: nonzero return request was rejected and device's queue needs to be
 + * plugged.
 + */
 +static int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
 +{
 +      struct Scsi_Host *host = cmd->device->host;
 +      int rtn = 0;
 +
 +      atomic_inc(&cmd->device->iorequest_cnt);
 +
 +      /* check if the device is still usable */
 +      if (unlikely(cmd->device->sdev_state == SDEV_DEL)) {
 +              /* in SDEV_DEL we error all commands. DID_NO_CONNECT
 +               * returns an immediate error upwards, and signals
 +               * that the device is no longer present */
 +              cmd->result = DID_NO_CONNECT << 16;
 +              goto done;
 +      }
 +
 +      /* Check to see if the scsi lld made this device blocked. */
 +      if (unlikely(scsi_device_blocked(cmd->device))) {
 +              /*
 +               * in blocked state, the command is just put back on
 +               * the device queue.  The suspend state has already
 +               * blocked the queue so future requests should not
 +               * occur until the device transitions out of the
 +               * suspend state.
 +               */
 +              SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
 +                      "queuecommand : device blocked\n"));
 +              return SCSI_MLQUEUE_DEVICE_BUSY;
 +      }
 +
 +      /* Store the LUN value in cmnd, if needed. */
 +      if (cmd->device->lun_in_cdb)
 +              cmd->cmnd[1] = (cmd->cmnd[1] & 0x1f) |
 +                             (cmd->device->lun << 5 & 0xe0);
 +
 +      scsi_log_send(cmd);
 +
 +      /*
 +       * Before we queue this command, check if the command
 +       * length exceeds what the host adapter can handle.
 +       */
 +      if (cmd->cmd_len > cmd->device->host->max_cmd_len) {
 +              SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
 +                             "queuecommand : command too long. "
 +                             "cdb_size=%d host->max_cmd_len=%d\n",
 +                             cmd->cmd_len, cmd->device->host->max_cmd_len));
 +              cmd->result = (DID_ABORT << 16);
 +              goto done;
 +      }
 +
 +      if (unlikely(host->shost_state == SHOST_DEL)) {
 +              cmd->result = (DID_NO_CONNECT << 16);
 +              goto done;
 +
 +      }
 +
 +      trace_scsi_dispatch_cmd_start(cmd);
 +      rtn = host->hostt->queuecommand(host, cmd);
 +      if (rtn) {
 +              trace_scsi_dispatch_cmd_error(cmd, rtn);
 +              if (rtn != SCSI_MLQUEUE_DEVICE_BUSY &&
 +                  rtn != SCSI_MLQUEUE_TARGET_BUSY)
 +                      rtn = SCSI_MLQUEUE_HOST_BUSY;
 +
 +              SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
 +                      "queuecommand : request rejected\n"));
 +      }
 +
 +      return rtn;
 + done:
 +      cmd->scsi_done(cmd);
 +      return 0;
 +}
 +
  /**
   * scsi_done - Invoke completion on finished SCSI command.
   * @cmd: The SCSI Command for which a low-level device driver (LLDD) gives
@@@ -1809,7 -1725,7 +1809,7 @@@ static void scsi_request_fn(struct requ
                 * we add the dev to the starved list so it eventually gets
                 * a run when a tag is freed.
                 */
 -              if (blk_queue_tagged(q) && !blk_rq_tagged(req)) {
 +              if (blk_queue_tagged(q) && !(req->cmd_flags & REQ_QUEUED)) {
                        spin_lock_irq(shost->host_lock);
                        if (list_empty(&sdev->starved_entry))
                                list_add_tail(&sdev->starved_entry,
  
                if (!scsi_host_queue_ready(q, shost, sdev))
                        goto host_not_ready;
 +      
 +              if (sdev->simple_tags)
 +                      cmd->flags |= SCMD_TAGGED;
 +              else
 +                      cmd->flags &= ~SCMD_TAGGED;
  
                /*
                 * Finally, initialize any error handling parameters, and set up
@@@ -1947,9 -1858,10 +1947,10 @@@ static void scsi_mq_done(struct scsi_cm
        blk_mq_complete_request(cmd->request);
  }
  
- static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
-               bool last)
+ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
+                        const struct blk_mq_queue_data *bd)
  {
+       struct request *req = bd->rq;
        struct request_queue *q = req->q;
        struct scsi_device *sdev = q->queuedata;
        struct Scsi_Host *shost = sdev->host;
                blk_mq_start_request(req);
        }
  
 +      if (sdev->simple_tags)
 +              cmd->flags |= SCMD_TAGGED;
 +      else
 +              cmd->flags &= ~SCMD_TAGGED;
 +
        scsi_init_cmd_errh(cmd);
        cmd->scsi_done = scsi_mq_done;
  
@@@ -2180,7 -2087,7 +2181,7 @@@ int scsi_mq_setup_tags(struct Scsi_Hos
  
        memset(&shost->tag_set, 0, sizeof(shost->tag_set));
        shost->tag_set.ops = &scsi_mq_ops;
 -      shost->tag_set.nr_hw_queues = 1;
 +      shost->tag_set.nr_hw_queues = shost->nr_hw_queues ? : 1;
        shost->tag_set.queue_depth = shost->can_queue;
        shost->tag_set.cmd_size = cmd_size;
        shost->tag_set.numa_node = NUMA_NO_NODE;
diff --combined include/linux/blk-mq.h
index 15f7034aa377ecd00367cb46d69003077ebadb06,fb0a4fb3dc2b10c49567ce1fd3a109b34ad177f3..8aded9ab2e4e89ddb66e5920c0819bade6fb0760
@@@ -79,7 -79,13 +79,13 @@@ struct blk_mq_tag_set 
        struct list_head        tag_list;
  };
  
- typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *, bool);
+ struct blk_mq_queue_data {
+       struct request *rq;
+       struct list_head *list;
+       bool last;
+ };
+ typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *);
  typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
  typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
  typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
@@@ -140,6 -146,7 +146,7 @@@ enum 
        BLK_MQ_F_TAG_SHARED     = 1 << 1,
        BLK_MQ_F_SG_MERGE       = 1 << 2,
        BLK_MQ_F_SYSFS_UP       = 1 << 3,
+       BLK_MQ_F_DEFER_ISSUE    = 1 << 4,
  
        BLK_MQ_S_STOPPED        = 0,
        BLK_MQ_S_TAG_ACTIVE     = 1,
@@@ -162,28 -169,12 +169,29 @@@ void blk_mq_flush_plug_list(struct blk_
  void blk_mq_insert_request(struct request *, bool, bool, bool);
  void blk_mq_run_queues(struct request_queue *q, bool async);
  void blk_mq_free_request(struct request *rq);
+ void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq);
  bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
  struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
                gfp_t gfp, bool reserved);
  struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
  
 +enum {
 +      BLK_MQ_UNIQUE_TAG_BITS = 16,
 +      BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1,
 +};
 +
 +u32 blk_mq_unique_tag(struct request *rq);
 +
 +static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
 +{
 +      return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;
 +}
 +
 +static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
 +{
 +      return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
 +}
 +
  struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
  struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
  
diff --combined include/linux/blkdev.h
index 0495e38542479b9abe4bf770cf8bb31f9e94dc38,74d14dba6fb753ad286a45c7116a9812400010cd..92f4b4b288dd57005b30c2b76d7ce5aed070c0d4
@@@ -398,7 -398,7 +398,7 @@@ struct request_queue 
         */
        struct kobject mq_kobj;
  
 -#ifdef CONFIG_PM_RUNTIME
 +#ifdef CONFIG_PM
        struct device           *dev;
        int                     rpm_status;
        unsigned int            nr_pending;
@@@ -1057,7 -1057,7 +1057,7 @@@ extern void blk_put_queue(struct reques
  /*
   * block layer runtime pm functions
   */
 -#ifdef CONFIG_PM_RUNTIME
 +#ifdef CONFIG_PM
  extern void blk_pm_runtime_init(struct request_queue *q, struct device *dev);
  extern int blk_pre_runtime_suspend(struct request_queue *q);
  extern void blk_post_runtime_suspend(struct request_queue *q, int err);
@@@ -1136,6 -1136,8 +1136,6 @@@ static inline bool blk_needs_flush_plug
  /*
   * tag stuff
   */
 -#define blk_rq_tagged(rq) \
 -      ((rq)->mq_ctx || ((rq)->cmd_flags & REQ_QUEUED))
  extern int blk_queue_start_tag(struct request_queue *, struct request *);
  extern struct request *blk_queue_find_tag(struct request_queue *, int);
  extern void blk_queue_end_tag(struct request_queue *, struct request *);
@@@ -1184,7 -1186,6 +1184,6 @@@ extern int blk_verify_command(unsigned 
  enum blk_default_limits {
        BLK_MAX_SEGMENTS        = 128,
        BLK_SAFE_MAX_SECTORS    = 255,
-       BLK_DEF_MAX_SECTORS     = 1024,
        BLK_MAX_SEGMENT_SIZE    = 65536,
        BLK_SEG_BOUNDARY_MASK   = 0xFFFFFFFFUL,
  };
@@@ -1581,13 -1582,13 +1580,13 @@@ static inline bool blk_integrity_merge_
                                          struct request *r1,
                                          struct request *r2)
  {
 -      return 0;
 +      return true;
  }
  static inline bool blk_integrity_merge_bio(struct request_queue *rq,
                                           struct request *r,
                                           struct bio *b)
  {
 -      return 0;
 +      return true;
  }
  static inline bool blk_integrity_is_initialized(struct gendisk *g)
  {
diff --combined kernel/trace/blktrace.c
index 11b9cb36092b2a28be83cabf6848111efb174e36,bd05fd2d5d2b5d061f233da246722b4b37cfc329..483cecfa5c174d6a74c54d5663b8b00578759d11
@@@ -1142,9 -1142,9 +1142,9 @@@ static void get_pdu_remap(const struct 
        r->sector_from = be64_to_cpu(sector_from);
  }
  
 -typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
 +typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act);
  
 -static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
 +static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
  {
        char rwbs[RWBS_LEN];
        unsigned long long ts  = iter->ts;
  
        fill_rwbs(rwbs, t);
  
 -      return trace_seq_printf(&iter->seq,
 -                              "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
 -                              MAJOR(t->device), MINOR(t->device), iter->cpu,
 -                              secs, nsec_rem, iter->ent->pid, act, rwbs);
 +      trace_seq_printf(&iter->seq,
 +                       "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
 +                       MAJOR(t->device), MINOR(t->device), iter->cpu,
 +                       secs, nsec_rem, iter->ent->pid, act, rwbs);
  }
  
 -static int blk_log_action(struct trace_iterator *iter, const char *act)
 +static void blk_log_action(struct trace_iterator *iter, const char *act)
  {
        char rwbs[RWBS_LEN];
        const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
  
        fill_rwbs(rwbs, t);
 -      return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
 -                              MAJOR(t->device), MINOR(t->device), act, rwbs);
 +      trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
 +                       MAJOR(t->device), MINOR(t->device), act, rwbs);
  }
  
 -static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
 +static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
  {
        const unsigned char *pdu_buf;
        int pdu_len;
 -      int i, end, ret;
 +      int i, end;
  
        pdu_buf = pdu_start(ent);
        pdu_len = te_blk_io_trace(ent)->pdu_len;
  
        if (!pdu_len)
 -              return 1;
 +              return;
  
        /* find the last zero that needs to be printed */
        for (end = pdu_len - 1; end >= 0; end--)
                        break;
        end++;
  
 -      if (!trace_seq_putc(s, '('))
 -              return 0;
 +      trace_seq_putc(s, '(');
  
        for (i = 0; i < pdu_len; i++) {
  
 -              ret = trace_seq_printf(s, "%s%02x",
 -                                     i == 0 ? "" : " ", pdu_buf[i]);
 -              if (!ret)
 -                      return ret;
 +              trace_seq_printf(s, "%s%02x",
 +                               i == 0 ? "" : " ", pdu_buf[i]);
  
                /*
                 * stop when the rest is just zeroes and indicate so
                 * with a ".." appended
                 */
 -              if (i == end && end != pdu_len - 1)
 -                      return trace_seq_puts(s, " ..) ");
 +              if (i == end && end != pdu_len - 1) {
 +                      trace_seq_puts(s, " ..) ");
 +                      return;
 +              }
        }
  
 -      return trace_seq_puts(s, ") ");
 +      trace_seq_puts(s, ") ");
  }
  
 -static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
 +static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
  {
        char cmd[TASK_COMM_LEN];
  
        trace_find_cmdline(ent->pid, cmd);
  
        if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
 -              int ret;
 -
 -              ret = trace_seq_printf(s, "%u ", t_bytes(ent));
 -              if (!ret)
 -                      return 0;
 -              ret = blk_log_dump_pdu(s, ent);
 -              if (!ret)
 -                      return 0;
 -              return trace_seq_printf(s, "[%s]\n", cmd);
 +              trace_seq_printf(s, "%u ", t_bytes(ent));
 +              blk_log_dump_pdu(s, ent);
 +              trace_seq_printf(s, "[%s]\n", cmd);
        } else {
                if (t_sec(ent))
 -                      return trace_seq_printf(s, "%llu + %u [%s]\n",
 +                      trace_seq_printf(s, "%llu + %u [%s]\n",
                                                t_sector(ent), t_sec(ent), cmd);
 -              return trace_seq_printf(s, "[%s]\n", cmd);
 +              else
 +                      trace_seq_printf(s, "[%s]\n", cmd);
        }
  }
  
 -static int blk_log_with_error(struct trace_seq *s,
 +static void blk_log_with_error(struct trace_seq *s,
                              const struct trace_entry *ent)
  {
        if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
 -              int ret;
 -
 -              ret = blk_log_dump_pdu(s, ent);
 -              if (ret)
 -                      return trace_seq_printf(s, "[%d]\n", t_error(ent));
 -              return 0;
 +              blk_log_dump_pdu(s, ent);
 +              trace_seq_printf(s, "[%d]\n", t_error(ent));
        } else {
                if (t_sec(ent))
 -                      return trace_seq_printf(s, "%llu + %u [%d]\n",
 -                                              t_sector(ent),
 -                                              t_sec(ent), t_error(ent));
 -              return trace_seq_printf(s, "%llu [%d]\n",
 -                                      t_sector(ent), t_error(ent));
 +                      trace_seq_printf(s, "%llu + %u [%d]\n",
 +                                       t_sector(ent),
 +                                       t_sec(ent), t_error(ent));
 +              else
 +                      trace_seq_printf(s, "%llu [%d]\n",
 +                                       t_sector(ent), t_error(ent));
        }
  }
  
 -static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
 +static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
  {
        struct blk_io_trace_remap r = { .device_from = 0, };
  
        get_pdu_remap(ent, &r);
 -      return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
 -                              t_sector(ent), t_sec(ent),
 -                              MAJOR(r.device_from), MINOR(r.device_from),
 -                              (unsigned long long)r.sector_from);
 +      trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
 +                       t_sector(ent), t_sec(ent),
 +                       MAJOR(r.device_from), MINOR(r.device_from),
 +                       (unsigned long long)r.sector_from);
  }
  
 -static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
 +static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
  {
        char cmd[TASK_COMM_LEN];
  
        trace_find_cmdline(ent->pid, cmd);
  
 -      return trace_seq_printf(s, "[%s]\n", cmd);
 +      trace_seq_printf(s, "[%s]\n", cmd);
  }
  
 -static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
 +static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
  {
        char cmd[TASK_COMM_LEN];
  
        trace_find_cmdline(ent->pid, cmd);
  
 -      return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
 +      trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
  }
  
 -static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
 +static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
  {
        char cmd[TASK_COMM_LEN];
  
        trace_find_cmdline(ent->pid, cmd);
  
 -      return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
 -                              get_pdu_int(ent), cmd);
 +      trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
 +                       get_pdu_int(ent), cmd);
  }
  
 -static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
 +static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
  {
 -      int ret;
        const struct blk_io_trace *t = te_blk_io_trace(ent);
  
 -      ret = trace_seq_putmem(s, t + 1, t->pdu_len);
 -      if (ret)
 -              return trace_seq_putc(s, '\n');
 -      return ret;
 +      trace_seq_putmem(s, t + 1, t->pdu_len);
 +      trace_seq_putc(s, '\n');
  }
  
  /*
@@@ -1327,7 -1339,7 +1327,7 @@@ static void blk_tracer_reset(struct tra
  
  static const struct {
        const char *act[2];
 -      int        (*print)(struct trace_seq *s, const struct trace_entry *ent);
 +      void       (*print)(struct trace_seq *s, const struct trace_entry *ent);
  } what2act[] = {
        [__BLK_TA_QUEUE]        = {{  "Q", "queue" },      blk_log_generic },
        [__BLK_TA_BACKMERGE]    = {{  "M", "backmerge" },  blk_log_generic },
@@@ -1352,6 -1364,7 +1352,6 @@@ static enum print_line_t print_one_line
        struct trace_seq *s = &iter->seq;
        const struct blk_io_trace *t;
        u16 what;
 -      int ret;
        bool long_act;
        blk_log_action_t *log_action;
  
        log_action = classic ? &blk_log_action_classic : &blk_log_action;
  
        if (t->action == BLK_TN_MESSAGE) {
 -              ret = log_action(iter, long_act ? "message" : "m");
 -              if (ret)
 -                      ret = blk_log_msg(s, iter->ent);
 -              goto out;
 +              log_action(iter, long_act ? "message" : "m");
 +              blk_log_msg(s, iter->ent);
        }
  
        if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
 -              ret = trace_seq_printf(s, "Unknown action %x\n", what);
 +              trace_seq_printf(s, "Unknown action %x\n", what);
        else {
 -              ret = log_action(iter, what2act[what].act[long_act]);
 -              if (ret)
 -                      ret = what2act[what].print(s, iter->ent);
 +              log_action(iter, what2act[what].act[long_act]);
 +              what2act[what].print(s, iter->ent);
        }
 -out:
 -      return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
 +
 +      return trace_handle_return(s);
  }
  
  static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
        return print_one_line(iter, false);
  }
  
 -static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
 +static void blk_trace_synthesize_old_trace(struct trace_iterator *iter)
  {
        struct trace_seq *s = &iter->seq;
        struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
                .time     = iter->ts,
        };
  
 -      if (!trace_seq_putmem(s, &old, offset))
 -              return 0;
 -      return trace_seq_putmem(s, &t->sector,
 -                              sizeof(old) - offset + t->pdu_len);
 +      trace_seq_putmem(s, &old, offset);
 +      trace_seq_putmem(s, &t->sector,
 +                       sizeof(old) - offset + t->pdu_len);
  }
  
  static enum print_line_t
  blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
                             struct trace_event *event)
  {
 -      return blk_trace_synthesize_old_trace(iter) ?
 -                      TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
 +      blk_trace_synthesize_old_trace(iter);
 +
 +      return trace_handle_return(&iter->seq);
  }
  
  static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
@@@ -1477,9 -1493,6 +1477,6 @@@ static int blk_trace_remove_queue(struc
        if (atomic_dec_and_test(&blk_probes_ref))
                blk_unregister_tracepoints();
  
-       spin_lock_irq(&running_trace_lock);
-       list_del(&bt->running_list);
-       spin_unlock_irq(&running_trace_lock);
        blk_trace_free(bt);
        return 0;
  }