Merge branch 'for-linus' of git://git.kernel.dk/linux-block
author     Linus Torvalds <torvalds@linux-foundation.org>
           Wed, 1 Jul 2015 02:46:34 +0000 (19:46 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Wed, 1 Jul 2015 02:46:34 +0000 (19:46 -0700)
Pull more block layer patches from Jens Axboe:
 "A few later arrivers that I didn't fold into the first pull request,
  so we had a chance to run some testing.  This contains:

   - NVMe:
        - Set of fixes from Keith
        - 4.4 and earlier gcc build fix from Andrew

   - small set of xen-blk{back,front} fixes from Bob Liu.

   - warnings fix for bogus inline statement in I_BDEV() from Geert.

   - error code fixup for SG_IO ioctl from Paolo Bonzini"

* 'for-linus' of git://git.kernel.dk/linux-block:
  drivers/block/nvme-core.c: fix build with gcc-4.4.4
  bdi: Remove "inline" keyword from exported I_BDEV() implementation
  block: fix bogus EFAULT error from SG_IO ioctl
  NVMe: Fix filesystem deadlock on removal
  NVMe: Failed controller initialization fixes
  NVMe: Unify controller probe and resume
  NVMe: Don't use fake status on cancelled command
  NVMe: Fix device cleanup on initialization failure
  drivers: xen-blkfront: only talk_to_blkback() when in XenbusStateInitialising
  xen/block: add multi-page ring support
  driver: xen-blkfront: move talk_to_blkback to a more suitable place
  drivers: xen-blkback: delay pending_req allocation to connect_ring

block/scsi_ioctl.c
drivers/block/nvme-core.c
drivers/block/xen-blkback/blkback.c
drivers/block/xen-blkback/common.h
drivers/block/xen-blkback/xenbus.c
drivers/block/xen-blkfront.c
fs/block_dev.c

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 55b6f15dac900af77a5ad7038cd98f3133d816a8..dda653ce7b24cfb959f668bdb4a676900ed7637d 100644
@@ -326,8 +326,8 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
                        goto out_put_request;
        }
 
-       ret = -EFAULT;
-       if (blk_fill_sghdr_rq(q, rq, hdr, mode))
+       ret = blk_fill_sghdr_rq(q, rq, hdr, mode);
+       if (ret < 0)
                goto out_free_cdb;
 
        ret = 0;
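
This is Paolo Bonzini's SG_IO errno fixup from the summary: the old code pre-loaded ret with -EFAULT and returned it for any blk_fill_sghdr_rq() failure, so, for instance, a command rejected by the SCSI command filter (-EPERM) surfaced to userspace as EFAULT. A minimal sketch of the pattern outside the kernel, where fill_request() is a hypothetical stand-in for blk_fill_sghdr_rq():

    #include <errno.h>

    /* Hypothetical stand-in for blk_fill_sghdr_rq(): fails with a
     * meaningful -errno, e.g. -EPERM from the command filter. */
    static int fill_request(void)
    {
            return -EPERM;
    }

    /* Old shape: every failure is reported as EFAULT. */
    static int submit_old(void)
    {
            int ret = -EFAULT;
            if (fill_request())
                    return ret;        /* caller sees -EFAULT, not -EPERM */
            return 0;
    }

    /* Fixed shape: propagate whatever the callee returned. */
    static int submit_fixed(void)
    {
            int ret = fill_request();
            if (ret < 0)
                    return ret;        /* caller sees the real -EPERM */
            return 0;
    }

    int main(void)
    {
            return submit_old() == submit_fixed() ? 0 : 1;  /* 1: codes differ */
    }
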
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index e5112714188fb46bcbf9760ee2c394fdeb733890..34338d7438f56895f76ac8b110d8e92a120b3202 100644
@@ -193,6 +193,13 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
        return 0;
 }
 
+static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+{
+       struct nvme_queue *nvmeq = hctx->driver_data;
+
+       nvmeq->tags = NULL;
+}
+
 static int nvme_admin_init_request(void *data, struct request *req,
                                unsigned int hctx_idx, unsigned int rq_idx,
                                unsigned int numa_node)
@@ -606,7 +613,10 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
                        return;
                }
                if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
-                       req->errors = status;
+                       if (cmd_rq->ctx == CMD_CTX_CANCELLED)
+                               req->errors = -EINTR;
+                       else
+                               req->errors = status;
                } else {
                        req->errors = nvme_error_status(status);
                }
@@ -1161,12 +1171,13 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
 
 int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id)
 {
-       struct nvme_command c = {
-               .identify.opcode = nvme_admin_identify,
-               .identify.cns = cpu_to_le32(1),
-       };
+       struct nvme_command c = { };
        int error;
 
+       /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
+       c.identify.opcode = nvme_admin_identify;
+       c.identify.cns = cpu_to_le32(1);
+
        *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
        if (!*id)
                return -ENOMEM;
@@ -1181,12 +1192,13 @@ int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id)
 int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid,
                struct nvme_id_ns **id)
 {
-       struct nvme_command c = {
-               .identify.opcode = nvme_admin_identify,
-               .identify.nsid = cpu_to_le32(nsid),
-       };
+       struct nvme_command c = { };
        int error;
 
+       /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
+       c.identify.opcode = nvme_admin_identify,
+       c.identify.nsid = cpu_to_le32(nsid),
+
        *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
        if (!*id)
                return -ENOMEM;
@@ -1230,14 +1242,14 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
 
 int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log)
 {
-       struct nvme_command c = {
-               .common.opcode = nvme_admin_get_log_page,
-               .common.nsid = cpu_to_le32(0xFFFFFFFF),
-               .common.cdw10[0] = cpu_to_le32(
+       struct nvme_command c = { };
+       int error;
+
+       c.common.opcode = nvme_admin_get_log_page,
+       c.common.nsid = cpu_to_le32(0xFFFFFFFF),
+       c.common.cdw10[0] = cpu_to_le32(
                        (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
                         NVME_LOG_SMART),
-       };
-       int error;
 
        *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
        if (!*log)
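
The three hunks above all make the same transformation for Andrew Morton's gcc-4.4.4 build fix: a designated initializer that reaches into an anonymous union is replaced by a zeroing initializer plus plain member assignments, because old gcc could not handle the former (as the comment in the patch notes). A standalone sketch, using a deliberately simplified stand-in for struct nvme_command:

    /* Simplified stand-in for struct nvme_command (assumed shape only). */
    struct cmd {
            union {
                    struct { unsigned char opcode; unsigned int nsid; } identify;
                    struct { unsigned char opcode; } features;
            };                              /* anonymous union */
    };

    void build_identify(struct cmd *out, unsigned int nsid)
    {
            /* gcc-4.4.4 rejects the designated form:
             *   struct cmd c = { .identify.opcode = 0x06,
             *                    .identify.nsid   = nsid };
             */
            struct cmd c = { };             /* zero everything first...    */

            c.identify.opcode = 0x06;       /* 0x06 is the Identify opcode */
            c.identify.nsid = nsid;
            *out = c;
    }

Note that the second and third hunks terminate their assignments with commas rather than semicolons; thanks to the comma operator this is still valid C: the assignments and the following kmalloc() fold into one expression statement, which is also why the NVME_LOG_SMART lines can stay as shared context between the old initializer and the new code.
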
@@ -1606,6 +1618,7 @@ static struct blk_mq_ops nvme_mq_admin_ops = {
        .queue_rq       = nvme_queue_rq,
        .map_queue      = blk_mq_map_queue,
        .init_hctx      = nvme_admin_init_hctx,
+       .exit_hctx      = nvme_admin_exit_hctx,
        .init_request   = nvme_admin_init_request,
        .timeout        = nvme_timeout,
 };
@@ -1648,6 +1661,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
                }
                if (!blk_get_queue(dev->admin_q)) {
                        nvme_dev_remove_admin(dev);
+                       dev->admin_q = NULL;
                        return -ENODEV;
                }
        } else
@@ -2349,19 +2363,20 @@ static int nvme_dev_add(struct nvme_dev *dev)
        }
        kfree(ctrl);
 
-       dev->tagset.ops = &nvme_mq_ops;
-       dev->tagset.nr_hw_queues = dev->online_queues - 1;
-       dev->tagset.timeout = NVME_IO_TIMEOUT;
-       dev->tagset.numa_node = dev_to_node(dev->dev);
-       dev->tagset.queue_depth =
+       if (!dev->tagset.tags) {
+               dev->tagset.ops = &nvme_mq_ops;
+               dev->tagset.nr_hw_queues = dev->online_queues - 1;
+               dev->tagset.timeout = NVME_IO_TIMEOUT;
+               dev->tagset.numa_node = dev_to_node(dev->dev);
+               dev->tagset.queue_depth =
                                min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
-       dev->tagset.cmd_size = nvme_cmd_size(dev);
-       dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
-       dev->tagset.driver_data = dev;
-
-       if (blk_mq_alloc_tag_set(&dev->tagset))
-               return 0;
+               dev->tagset.cmd_size = nvme_cmd_size(dev);
+               dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
+               dev->tagset.driver_data = dev;
 
+               if (blk_mq_alloc_tag_set(&dev->tagset))
+                       return 0;
+       }
        schedule_work(&dev->scan_work);
        return 0;
 }
@@ -2734,8 +2749,10 @@ static void nvme_free_dev(struct kref *kref)
        put_device(dev->device);
        nvme_free_namespaces(dev);
        nvme_release_instance(dev);
-       blk_mq_free_tag_set(&dev->tagset);
-       blk_put_queue(dev->admin_q);
+       if (dev->tagset.tags)
+               blk_mq_free_tag_set(&dev->tagset);
+       if (dev->admin_q)
+               blk_put_queue(dev->admin_q);
        kfree(dev->queues);
        kfree(dev->entry);
        kfree(dev);
@@ -2866,6 +2883,9 @@ static int nvme_dev_start(struct nvme_dev *dev)
 
  free_tags:
        nvme_dev_remove_admin(dev);
+       blk_put_queue(dev->admin_q);
+       dev->admin_q = NULL;
+       dev->queues[0]->tags = NULL;
  disable:
        nvme_disable_queue(dev, 0);
        nvme_dev_list_remove(dev);
@@ -2907,25 +2927,43 @@ static int nvme_dev_resume(struct nvme_dev *dev)
                spin_unlock(&dev_list_lock);
        } else {
                nvme_unfreeze_queues(dev);
-               schedule_work(&dev->scan_work);
+               nvme_dev_add(dev);
                nvme_set_irq_hints(dev);
        }
        return 0;
 }
 
+static void nvme_dead_ctrl(struct nvme_dev *dev)
+{
+       dev_warn(dev->dev, "Device failed to resume\n");
+       kref_get(&dev->kref);
+       if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
+                                               dev->instance))) {
+               dev_err(dev->dev,
+                       "Failed to start controller remove task\n");
+               kref_put(&dev->kref, nvme_free_dev);
+       }
+}
+
 static void nvme_dev_reset(struct nvme_dev *dev)
 {
+       bool in_probe = work_busy(&dev->probe_work);
+
        nvme_dev_shutdown(dev);
-       if (nvme_dev_resume(dev)) {
-               dev_warn(dev->dev, "Device failed to resume\n");
-               kref_get(&dev->kref);
-               if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
-                                                       dev->instance))) {
-                       dev_err(dev->dev,
-                               "Failed to start controller remove task\n");
-                       kref_put(&dev->kref, nvme_free_dev);
-               }
+
+       /* Synchronize with device probe so that work will see failure status
+        * and exit gracefully without trying to schedule another reset */
+       flush_work(&dev->probe_work);
+
+       /* Fail this device if reset occurred during probe to avoid
+        * infinite initialization loops. */
+       if (in_probe) {
+               nvme_dead_ctrl(dev);
+               return;
        }
+       /* Schedule device resume asynchronously so the reset work is available
+        * to cleanup errors that may occur during reinitialization */
+       schedule_work(&dev->probe_work);
 }
 
 static void nvme_reset_failed_dev(struct work_struct *ws)
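
This hunk is the heart of the probe/reset unification: the reset path records whether the asynchronous probe is still running, shuts the device down so the probe will observe the failure, waits for it with flush_work(), and then either declares the controller dead (a reset that fired during probe) or reschedules probe_work to redo initialization. A condensed sketch of the workqueue pattern, with hypothetical hw_* helpers standing in for the NVMe specifics:

    #include <linux/workqueue.h>

    struct my_dev {
            struct work_struct probe_work;  /* runs asynchronous init */
    };

    void hw_shutdown(struct my_dev *d);     /* hypothetical: fail pending I/O */
    void hw_mark_dead(struct my_dev *d);    /* hypothetical: schedule removal */

    void reset_controller(struct my_dev *d)
    {
            bool in_probe = work_busy(&d->probe_work);  /* probing right now? */

            hw_shutdown(d);                 /* make the probe see the failure */
            flush_work(&d->probe_work);     /* wait until it has finished     */

            if (in_probe) {                 /* reset during probe: give up to */
                    hw_mark_dead(d);        /* avoid an infinite init loop    */
                    return;
            }
            schedule_work(&d->probe_work);  /* otherwise retry init async     */
    }
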
@@ -2957,6 +2995,7 @@ static int nvme_reset(struct nvme_dev *dev)
 
        if (!ret) {
                flush_work(&dev->reset_work);
+               flush_work(&dev->probe_work);
                return 0;
        }
 
@@ -3053,26 +3092,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 static void nvme_async_probe(struct work_struct *work)
 {
        struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work);
-       int result;
 
-       result = nvme_dev_start(dev);
-       if (result)
-               goto reset;
-
-       if (dev->online_queues > 1)
-               result = nvme_dev_add(dev);
-       if (result)
-               goto reset;
-
-       nvme_set_irq_hints(dev);
-       return;
- reset:
-       spin_lock(&dev_list_lock);
-       if (!work_busy(&dev->reset_work)) {
-               dev->reset_workfn = nvme_reset_failed_dev;
-               queue_work(nvme_workq, &dev->reset_work);
-       }
-       spin_unlock(&dev_list_lock);
+       if (nvme_dev_resume(dev) && !work_busy(&dev->reset_work))
+               nvme_dead_ctrl(dev);
 }
 
 static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
@@ -3104,8 +3126,8 @@ static void nvme_remove(struct pci_dev *pdev)
        flush_work(&dev->reset_work);
        flush_work(&dev->scan_work);
        device_remove_file(dev->device, &dev_attr_reset_controller);
-       nvme_dev_shutdown(dev);
        nvme_dev_remove(dev);
+       nvme_dev_shutdown(dev);
        nvme_dev_remove_admin(dev);
        device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance));
        nvme_free_queues(dev, 0);
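
Swapping nvme_dev_remove() ahead of nvme_dev_shutdown() is the "filesystem deadlock on removal" fix from the summary. The likely constraint, sketched as a comment (an inference from the commit title, not a quote from the patch):

    /*
     * nvme_dev_remove(dev);    disk/namespace removal may sync dirty data,
     *                          so it must run while I/O can still complete
     * nvme_dev_shutdown(dev);  only then quiesce and disable the hardware
     *
     * In the old order, the writeback issued during disk removal could wait
     * forever on queues the shutdown had already killed.
     */
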
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 713fc9ff11492766efcb7a4795b4a1750ceb9707..2126842fb6e8a862a36b733b8eb709cf785b0591 100644
@@ -83,6 +83,13 @@ module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
 MODULE_PARM_DESC(max_persistent_grants,
                  "Maximum number of grants to map persistently");
 
+/*
+ * Maximum order of pages to be used for the shared ring between front and
+ * backend, 4KB page granularity is used.
+ */
+unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_PAGE_ORDER;
+module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO);
+MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
 /*
  * The LRU mechanism to clean the lists of persistent grants needs to
  * be executed periodically. The time interval between consecutive executions
@@ -1438,6 +1445,12 @@ static int __init xen_blkif_init(void)
        if (!xen_domain())
                return -ENODEV;
 
+       if (xen_blkif_max_ring_order > XENBUS_MAX_RING_PAGE_ORDER) {
+               pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
+                       xen_blkif_max_ring_order, XENBUS_MAX_RING_PAGE_ORDER);
+               xen_blkif_max_ring_order = XENBUS_MAX_RING_PAGE_ORDER;
+       }
+
        rc = xen_blkif_interface_init();
        if (rc)
                goto failed_init;
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index f620b5d3f77c0270585f5cc4d86ba71bb4320092..8ccc49d01c8eb7c5fd821df8909bf8fbeeaa35a7 100644
@@ -44,6 +44,7 @@
 #include <xen/interface/io/blkif.h>
 #include <xen/interface/io/protocols.h>
 
+extern unsigned int xen_blkif_max_ring_order;
 /*
  * This is the maximum number of segments that would be allowed in indirect
  * requests. This value will also be passed to the frontend.
@@ -248,7 +249,7 @@ struct backend_info;
 #define PERSISTENT_GNT_WAS_ACTIVE      1
 
 /* Number of requests that we can fit in a ring */
-#define XEN_BLKIF_REQS                 32
+#define XEN_BLKIF_REQS_PER_PAGE                32
 
 struct persistent_gnt {
        struct page *page;
@@ -320,6 +321,7 @@ struct xen_blkif {
        struct work_struct      free_work;
        /* Thread shutdown wait queue. */
        wait_queue_head_t       shutdown_wq;
+       unsigned int nr_ring_pages;
 };
 
 struct seg_buf {
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 6ab69ad61ee126c6f62a63f77674b13489de7f73..deb3f001791f159c5c7ebce19814de31e3106a5e 100644
@@ -25,6 +25,7 @@
 
 /* Enlarge the array size in order to fully show blkback name. */
 #define BLKBACK_NAME_LEN (20)
+#define RINGREF_NAME_LEN (20)
 
 struct backend_info {
        struct xenbus_device    *dev;
@@ -124,8 +125,6 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
 static struct xen_blkif *xen_blkif_alloc(domid_t domid)
 {
        struct xen_blkif *blkif;
-       struct pending_req *req, *n;
-       int i, j;
 
        BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
 
@@ -151,55 +150,15 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
 
        INIT_LIST_HEAD(&blkif->pending_free);
        INIT_WORK(&blkif->free_work, xen_blkif_deferred_free);
-
-       for (i = 0; i < XEN_BLKIF_REQS; i++) {
-               req = kzalloc(sizeof(*req), GFP_KERNEL);
-               if (!req)
-                       goto fail;
-               list_add_tail(&req->free_list,
-                             &blkif->pending_free);
-               for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
-                       req->segments[j] = kzalloc(sizeof(*req->segments[0]),
-                                                  GFP_KERNEL);
-                       if (!req->segments[j])
-                               goto fail;
-               }
-               for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
-                       req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]),
-                                                        GFP_KERNEL);
-                       if (!req->indirect_pages[j])
-                               goto fail;
-               }
-       }
        spin_lock_init(&blkif->pending_free_lock);
        init_waitqueue_head(&blkif->pending_free_wq);
        init_waitqueue_head(&blkif->shutdown_wq);
 
        return blkif;
-
-fail:
-       list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
-               list_del(&req->free_list);
-               for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
-                       if (!req->segments[j])
-                               break;
-                       kfree(req->segments[j]);
-               }
-               for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
-                       if (!req->indirect_pages[j])
-                               break;
-                       kfree(req->indirect_pages[j]);
-               }
-               kfree(req);
-       }
-
-       kmem_cache_free(xen_blkif_cachep, blkif);
-
-       return ERR_PTR(-ENOMEM);
 }
 
-static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
-                        unsigned int evtchn)
+static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
+                        unsigned int nr_grefs, unsigned int evtchn)
 {
        int err;
 
@@ -207,7 +166,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
        if (blkif->irq)
                return 0;
 
-       err = xenbus_map_ring_valloc(blkif->be->dev, &gref, 1,
+       err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs,
                                     &blkif->blk_ring);
        if (err < 0)
                return err;
@@ -217,21 +176,21 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
        {
                struct blkif_sring *sring;
                sring = (struct blkif_sring *)blkif->blk_ring;
-               BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
+               BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE * nr_grefs);
                break;
        }
        case BLKIF_PROTOCOL_X86_32:
        {
                struct blkif_x86_32_sring *sring_x86_32;
                sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
-               BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
+               BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE * nr_grefs);
                break;
        }
        case BLKIF_PROTOCOL_X86_64:
        {
                struct blkif_x86_64_sring *sring_x86_64;
                sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
-               BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
+               BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE * nr_grefs);
                break;
        }
        default:
@@ -312,7 +271,7 @@ static void xen_blkif_free(struct xen_blkif *blkif)
                i++;
        }
 
-       WARN_ON(i != XEN_BLKIF_REQS);
+       WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
 
        kmem_cache_free(xen_blkif_cachep, blkif);
 }
@@ -597,6 +556,11 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
        if (err)
                goto fail;
 
+       err = xenbus_printf(XBT_NIL, dev->nodename, "max-ring-page-order", "%u",
+                           xen_blkif_max_ring_order);
+       if (err)
+               pr_warn("%s write out 'max-ring-page-order' failed\n", __func__);
+
        err = xenbus_switch_state(dev, XenbusStateInitWait);
        if (err)
                goto fail;
@@ -860,22 +824,66 @@ again:
 static int connect_ring(struct backend_info *be)
 {
        struct xenbus_device *dev = be->dev;
-       unsigned long ring_ref;
-       unsigned int evtchn;
+       unsigned int ring_ref[XENBUS_MAX_RING_PAGES];
+       unsigned int evtchn, nr_grefs, ring_page_order;
        unsigned int pers_grants;
        char protocol[64] = "";
-       int err;
+       struct pending_req *req, *n;
+       int err, i, j;
 
        pr_debug("%s %s\n", __func__, dev->otherend);
 
-       err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
-                           &ring_ref, "event-channel", "%u", &evtchn, NULL);
-       if (err) {
-               xenbus_dev_fatal(dev, err,
-                                "reading %s/ring-ref and event-channel",
+       err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
+                         &evtchn);
+       if (err != 1) {
+               err = -EINVAL;
+               xenbus_dev_fatal(dev, err, "reading %s/event-channel",
                                 dev->otherend);
                return err;
        }
+       pr_info("event-channel %u\n", evtchn);
+
+       err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u",
+                         &ring_page_order);
+       if (err != 1) {
+               err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref",
+                                 "%u", &ring_ref[0]);
+               if (err != 1) {
+                       err = -EINVAL;
+                       xenbus_dev_fatal(dev, err, "reading %s/ring-ref",
+                                        dev->otherend);
+                       return err;
+               }
+               nr_grefs = 1;
+               pr_info("%s:using single page: ring-ref %d\n", dev->otherend,
+                       ring_ref[0]);
+       } else {
+               unsigned int i;
+
+               if (ring_page_order > xen_blkif_max_ring_order) {
+                       err = -EINVAL;
+                       xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d",
+                                        dev->otherend, ring_page_order,
+                                        xen_blkif_max_ring_order);
+                       return err;
+               }
+
+               nr_grefs = 1 << ring_page_order;
+               for (i = 0; i < nr_grefs; i++) {
+                       char ring_ref_name[RINGREF_NAME_LEN];
+
+                       snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
+                       err = xenbus_scanf(XBT_NIL, dev->otherend, ring_ref_name,
+                                          "%u", &ring_ref[i]);
+                       if (err != 1) {
+                               err = -EINVAL;
+                               xenbus_dev_fatal(dev, err, "reading %s/%s",
+                                                dev->otherend, ring_ref_name);
+                               return err;
+                       }
+                       pr_info("ring-ref%u: %u\n", i, ring_ref[i]);
+               }
+       }
 
        be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
        err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
@@ -900,20 +908,55 @@ static int connect_ring(struct backend_info *be)
 
        be->blkif->vbd.feature_gnt_persistent = pers_grants;
        be->blkif->vbd.overflow_max_grants = 0;
+       be->blkif->nr_ring_pages = nr_grefs;
 
-       pr_info("ring-ref %ld, event-channel %d, protocol %d (%s) %s\n",
-               ring_ref, evtchn, be->blkif->blk_protocol, protocol,
+       pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n",
+               nr_grefs, evtchn, be->blkif->blk_protocol, protocol,
                pers_grants ? "persistent grants" : "");
 
+       for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) {
+               req = kzalloc(sizeof(*req), GFP_KERNEL);
+               if (!req)
+                       goto fail;
+               list_add_tail(&req->free_list, &be->blkif->pending_free);
+               for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
+                       req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL);
+                       if (!req->segments[j])
+                               goto fail;
+               }
+               for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
+                       req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]),
+                                                        GFP_KERNEL);
+                       if (!req->indirect_pages[j])
+                               goto fail;
+               }
+       }
+
        /* Map the shared frame, irq etc. */
-       err = xen_blkif_map(be->blkif, ring_ref, evtchn);
+       err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn);
        if (err) {
-               xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
-                                ring_ref, evtchn);
+               xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn);
                return err;
        }
 
        return 0;
+
+fail:
+       list_for_each_entry_safe(req, n, &be->blkif->pending_free, free_list) {
+               list_del(&req->free_list);
+               for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
+                       if (!req->segments[j])
+                               break;
+                       kfree(req->segments[j]);
+               }
+               for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
+                       if (!req->indirect_pages[j])
+                               break;
+                       kfree(req->indirect_pages[j]);
+               }
+               kfree(req);
+       }
+       return -ENOMEM;
 }
 
 static const struct xenbus_device_id xen_blkbk_ids[] = {
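
With the ring order known only at connect time, the backend now also sizes its pending_req pool there: XEN_BLKIF_REQS_PER_PAGE (32) requests per ring page, times the number of ring pages, matching the WARN_ON in xen_blkif_free(). A userspace back-of-the-envelope for the resulting pool sizes, assuming the 4 KiB page granularity the patch documents:

    #include <stdio.h>

    #define XEN_BLKIF_REQS_PER_PAGE 32      /* from the common.h hunk above */

    int main(void)
    {
            for (unsigned int order = 0; order <= 4; order++) {
                    unsigned int pages = 1u << order;  /* nr_grefs = 1 << ring_page_order */
                    printf("order %u: %2u ring page(s) -> %3u pending reqs\n",
                           order, pages, pages * XEN_BLKIF_REQS_PER_PAGE);
            }
            return 0;       /* order 0 -> 32 reqs ... order 4 -> 512 reqs */
    }
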
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2c61cf8c6f61d1086ea60155e3b1c12b9bb706bf..fc770b7d3beb1951e80c2f16956f3dd0897efe48 100644
@@ -98,7 +98,21 @@ static unsigned int xen_blkif_max_segments = 32;
 module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
 MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
 
-#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
+/*
+ * Maximum order of pages to be used for the shared ring between front and
+ * backend, 4KB page granularity is used.
+ */
+static unsigned int xen_blkif_max_ring_order;
+module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO);
+MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
+
+#define BLK_RING_SIZE(info) __CONST_RING_SIZE(blkif, PAGE_SIZE * (info)->nr_ring_pages)
+#define BLK_MAX_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * XENBUS_MAX_RING_PAGES)
+/*
+ * "ring-ref%i" needs at most 11 characters for the number (INT_MIN) plus
+ * 8 for "ring-ref", i.e. 19; define it as 20 to match the backend.
+ */
+#define RINGREF_NAME_LEN (20)
 
 /*
  * We have one of these per vbd, whether ide, scsi or 'other'.  They
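
The RINGREF_NAME_LEN arithmetic is easy to check: "%i" prints at most 11 characters ("-2147483648" for INT_MIN), "ring-ref" adds 8, so 19 characters plus a NUL fit in 20 bytes. A quick plain-C verification:

    #include <limits.h>
    #include <stdio.h>

    #define RINGREF_NAME_LEN (20)   /* as defined by the patch */

    int main(void)
    {
            char name[RINGREF_NAME_LEN];
            int n = snprintf(name, sizeof(name), "ring-ref%i", INT_MIN);

            printf("\"%s\" needs %d chars + NUL\n", name, n);  /* 19 + 1 */
            return 0;
    }

The keys are actually formatted with "%u", which needs at most 18 characters, so the bound holds with room to spare.
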
@@ -114,13 +128,14 @@ struct blkfront_info
        int vdevice;
        blkif_vdev_t handle;
        enum blkif_state connected;
-       int ring_ref;
+       int ring_ref[XENBUS_MAX_RING_PAGES];
+       unsigned int nr_ring_pages;
        struct blkif_front_ring ring;
        unsigned int evtchn, irq;
        struct request_queue *rq;
        struct work_struct work;
        struct gnttab_free_callback callback;
-       struct blk_shadow shadow[BLK_RING_SIZE];
+       struct blk_shadow shadow[BLK_MAX_RING_SIZE];
        struct list_head grants;
        struct list_head indirect_pages;
        unsigned int persistent_gnts_c;
@@ -139,8 +154,6 @@ static unsigned int nr_minors;
 static unsigned long *minors;
 static DEFINE_SPINLOCK(minor_lock);
 
-#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
-       (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
 #define GRANT_INVALID_REF      0
 
 #define PARTS_PER_DISK         16
@@ -170,7 +183,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info);
 static int get_id_from_freelist(struct blkfront_info *info)
 {
        unsigned long free = info->shadow_free;
-       BUG_ON(free >= BLK_RING_SIZE);
+       BUG_ON(free >= BLK_RING_SIZE(info));
        info->shadow_free = info->shadow[free].req.u.rw.id;
        info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
        return free;
@@ -983,7 +996,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
                }
        }
 
-       for (i = 0; i < BLK_RING_SIZE; i++) {
+       for (i = 0; i < BLK_RING_SIZE(info); i++) {
                /*
                 * Clear persistent grants present in requests already
                 * on the shared ring
@@ -1033,12 +1046,15 @@ free_shadow:
        flush_work(&info->work);
 
        /* Free resources associated with old device channel. */
-       if (info->ring_ref != GRANT_INVALID_REF) {
-               gnttab_end_foreign_access(info->ring_ref, 0,
-                                         (unsigned long)info->ring.sring);
-               info->ring_ref = GRANT_INVALID_REF;
-               info->ring.sring = NULL;
+       for (i = 0; i < info->nr_ring_pages; i++) {
+               if (info->ring_ref[i] != GRANT_INVALID_REF) {
+                       gnttab_end_foreign_access(info->ring_ref[i], 0, 0);
+                       info->ring_ref[i] = GRANT_INVALID_REF;
+               }
        }
+       free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
+       info->ring.sring = NULL;
+
        if (info->irq)
                unbind_from_irqhandler(info->irq, info);
        info->evtchn = info->irq = 0;
@@ -1157,7 +1173,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
                 * never have given to it (we stamp it up to BLK_RING_SIZE -
                 * look in get_id_from_freelist.
                 */
-               if (id >= BLK_RING_SIZE) {
+               if (id >= BLK_RING_SIZE(info)) {
                        WARN(1, "%s: response to %s has incorrect id (%ld)\n",
                             info->gd->disk_name, op_name(bret->operation), id);
                        /* We can't safely get the 'struct request' as
@@ -1245,26 +1261,30 @@ static int setup_blkring(struct xenbus_device *dev,
                         struct blkfront_info *info)
 {
        struct blkif_sring *sring;
-       grant_ref_t gref;
-       int err;
+       int err, i;
+       unsigned long ring_size = info->nr_ring_pages * PAGE_SIZE;
+       grant_ref_t gref[XENBUS_MAX_RING_PAGES];
 
-       info->ring_ref = GRANT_INVALID_REF;
+       for (i = 0; i < info->nr_ring_pages; i++)
+               info->ring_ref[i] = GRANT_INVALID_REF;
 
-       sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
+       sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
+                                                      get_order(ring_size));
        if (!sring) {
                xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
                return -ENOMEM;
        }
        SHARED_RING_INIT(sring);
-       FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
+       FRONT_RING_INIT(&info->ring, sring, ring_size);
 
-       err = xenbus_grant_ring(dev, info->ring.sring, 1, &gref);
+       err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref);
        if (err < 0) {
-               free_page((unsigned long)sring);
+               free_pages((unsigned long)sring, get_order(ring_size));
                info->ring.sring = NULL;
                goto fail;
        }
-       info->ring_ref = gref;
+       for (i = 0; i < info->nr_ring_pages; i++)
+               info->ring_ref[i] = gref[i];
 
        err = xenbus_alloc_evtchn(dev, &info->evtchn);
        if (err)
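
setup_blkring() now allocates and frees the shared ring with __get_free_pages()/free_pages(), whose order argument is log2 of the page count; get_order() converts a byte size to that order, rounding up to a power of two. A userspace re-implementation of the arithmetic (assuming 4 KiB pages) to show the values the driver passes:

    #include <stdio.h>

    #define DEMO_PAGE_SIZE 4096UL   /* assumption: 4 KiB pages, as in the patch */

    /* Minimal stand-in for the kernel's get_order(). */
    static int demo_get_order(unsigned long size)
    {
            unsigned long pages = (size + DEMO_PAGE_SIZE - 1) / DEMO_PAGE_SIZE;
            int order = 0;

            while ((1UL << order) < pages)
                    order++;
            return order;
    }

    int main(void)
    {
            unsigned int nr_ring_pages[] = { 1, 2, 4, 8 };

            for (unsigned int i = 0; i < 4; i++) {
                    unsigned long ring_size = nr_ring_pages[i] * DEMO_PAGE_SIZE;
                    printf("%u page(s): ring_size %5lu -> order %d\n",
                           nr_ring_pages[i], ring_size,
                           demo_get_order(ring_size));
            }
            return 0;       /* orders 0, 1, 2, 3 */
    }
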
@@ -1292,7 +1312,18 @@ static int talk_to_blkback(struct xenbus_device *dev,
 {
        const char *message = NULL;
        struct xenbus_transaction xbt;
-       int err;
+       int err, i;
+       unsigned int max_page_order = 0;
+       unsigned int ring_page_order = 0;
+
+       err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+                          "max-ring-page-order", "%u", &max_page_order);
+       if (err != 1)
+               info->nr_ring_pages = 1;
+       else {
+               ring_page_order = min(xen_blkif_max_ring_order, max_page_order);
+               info->nr_ring_pages = 1 << ring_page_order;
+       }
 
        /* Create shared ring, alloc event channel. */
        err = setup_blkring(dev, info);
@@ -1306,11 +1337,32 @@ again:
                goto destroy_blkring;
        }
 
-       err = xenbus_printf(xbt, dev->nodename,
-                           "ring-ref", "%u", info->ring_ref);
-       if (err) {
-               message = "writing ring-ref";
-               goto abort_transaction;
+       if (info->nr_ring_pages == 1) {
+               err = xenbus_printf(xbt, dev->nodename,
+                                   "ring-ref", "%u", info->ring_ref[0]);
+               if (err) {
+                       message = "writing ring-ref";
+                       goto abort_transaction;
+               }
+       } else {
+               err = xenbus_printf(xbt, dev->nodename,
+                                   "ring-page-order", "%u", ring_page_order);
+               if (err) {
+                       message = "writing ring-page-order";
+                       goto abort_transaction;
+               }
+
+               for (i = 0; i < info->nr_ring_pages; i++) {
+                       char ring_ref_name[RINGREF_NAME_LEN];
+
+                       snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
+                       err = xenbus_printf(xbt, dev->nodename, ring_ref_name,
+                                           "%u", info->ring_ref[i]);
+                       if (err) {
+                               message = "writing ring-ref";
+                               goto abort_transaction;
+                       }
+               }
        }
        err = xenbus_printf(xbt, dev->nodename,
                            "event-channel", "%u", info->evtchn);
@@ -1338,6 +1390,9 @@ again:
                goto destroy_blkring;
        }
 
+       for (i = 0; i < BLK_RING_SIZE(info); i++)
+               info->shadow[i].req.u.rw.id = i+1;
+       info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
        xenbus_switch_state(dev, XenbusStateInitialised);
 
        return 0;
@@ -1361,7 +1416,7 @@ again:
 static int blkfront_probe(struct xenbus_device *dev,
                          const struct xenbus_device_id *id)
 {
-       int err, vdevice, i;
+       int err, vdevice;
        struct blkfront_info *info;
 
        /* FIXME: Use dynamic device id if this is not set. */
@@ -1422,21 +1477,10 @@ static int blkfront_probe(struct xenbus_device *dev,
        info->connected = BLKIF_STATE_DISCONNECTED;
        INIT_WORK(&info->work, blkif_restart_queue);
 
-       for (i = 0; i < BLK_RING_SIZE; i++)
-               info->shadow[i].req.u.rw.id = i+1;
-       info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
-
        /* Front end dir is a number, which is used as the id. */
        info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
        dev_set_drvdata(&dev->dev, info);
 
-       err = talk_to_blkback(dev, info);
-       if (err) {
-               kfree(info);
-               dev_set_drvdata(&dev->dev, NULL);
-               return err;
-       }
-
        return 0;
 }
 
@@ -1476,10 +1520,10 @@ static int blkif_recover(struct blkfront_info *info)
 
        /* Stage 2: Set up free list. */
        memset(&info->shadow, 0, sizeof(info->shadow));
-       for (i = 0; i < BLK_RING_SIZE; i++)
+       for (i = 0; i < BLK_RING_SIZE(info); i++)
                info->shadow[i].req.u.rw.id = i+1;
        info->shadow_free = info->ring.req_prod_pvt;
-       info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
+       info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
 
        rc = blkfront_setup_indirect(info);
        if (rc) {
@@ -1491,7 +1535,7 @@ static int blkif_recover(struct blkfront_info *info)
        blk_queue_max_segments(info->rq, segs);
        bio_list_init(&bio_list);
        INIT_LIST_HEAD(&requests);
-       for (i = 0; i < BLK_RING_SIZE; i++) {
+       for (i = 0; i < BLK_RING_SIZE(info); i++) {
                /* Not in use? */
                if (!copy[i].request)
                        continue;
@@ -1697,7 +1741,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
                segs = info->max_indirect_segments;
        }
 
-       err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE);
+       err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE(info));
        if (err)
                goto out_of_memory;
 
@@ -1707,7 +1751,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
                 * grants, we need to allocate a set of pages that can be
                 * used for mapping indirect grefs
                 */
-               int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE;
+               int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE(info);
 
                BUG_ON(!list_empty(&info->indirect_pages));
                for (i = 0; i < num; i++) {
@@ -1718,7 +1762,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
                }
        }
 
-       for (i = 0; i < BLK_RING_SIZE; i++) {
+       for (i = 0; i < BLK_RING_SIZE(info); i++) {
                info->shadow[i].grants_used = kzalloc(
                        sizeof(info->shadow[i].grants_used[0]) * segs,
                        GFP_NOIO);
@@ -1740,7 +1784,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
        return 0;
 
 out_of_memory:
-       for (i = 0; i < BLK_RING_SIZE; i++) {
+       for (i = 0; i < BLK_RING_SIZE(info); i++) {
                kfree(info->shadow[i].grants_used);
                info->shadow[i].grants_used = NULL;
                kfree(info->shadow[i].sg);
@@ -1906,8 +1950,15 @@ static void blkback_changed(struct xenbus_device *dev,
        dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
 
        switch (backend_state) {
-       case XenbusStateInitialising:
        case XenbusStateInitWait:
+               if (dev->state != XenbusStateInitialising)
+                       break;
+               if (talk_to_blkback(dev, info)) {
+                       kfree(info);
+                       dev_set_drvdata(&dev->dev, NULL);
+                       break;
+               }
+       case XenbusStateInitialising:
        case XenbusStateInitialised:
        case XenbusStateReconfiguring:
        case XenbusStateReconfigured:
@@ -2091,6 +2142,12 @@ static int __init xlblk_init(void)
        if (!xen_domain())
                return -ENODEV;
 
+       if (xen_blkif_max_ring_order > XENBUS_MAX_RING_PAGE_ORDER) {
+               pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
+                       xen_blkif_max_ring_order, XENBUS_MAX_RING_PAGE_ORDER);
+               xen_blkif_max_ring_order = 0;
+       }
+
        if (!xen_has_pv_disk_devices())
                return -ENODEV;
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b155d32db76633bc88d1904546f803d2272db179..4fe10f93db8a3e52ebbb5330e94b80ee92455e1d 100644
@@ -43,7 +43,7 @@ static inline struct bdev_inode *BDEV_I(struct inode *inode)
        return container_of(inode, struct bdev_inode, vfs_inode);
 }
 
-inline struct block_device *I_BDEV(struct inode *inode)
+struct block_device *I_BDEV(struct inode *inode)
 {
        return &BDEV_I(inode)->bdev;
 }
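
The final fix is Geert's I_BDEV() cleanup from the summary: the function is declared as a plain external function in include/linux/fs.h and exported to modules, so the stray inline on its out-of-line definition was inconsistent; it draws warnings from some gcc versions and cannot help callers in other translation units anyway. The shape of the fix, reduced to a sketch with a hypothetical foo() in place of I_BDEV():

    /* header: callers in other translation units see an ordinary function */
    int foo(int x);

    /* implementation file. Before, the definition contradicted the
     * declaration and was useless to external callers, who must link
     * against a real out-of-line symbol:
     *
     *      inline int foo(int x) { return x + 1; }
     *
     * After: a plain definition; the compiler may still inline local calls.
     */
    int foo(int x)
    {
            return x + 1;
    }
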