drivers/block/xen-blkback/blkback.c

   1 /******************************************************************************
   2  *
   3  * Back-end of the driver for virtual block devices. This portion of the
   4  * driver exports a 'unified' block-device interface that can be accessed
   5  * by any operating system that implements a compatible front end. A
   6  * reference front-end implementation can be found in:
   7  *  drivers/block/xen-blkfront.c
   8  *
   9  * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
  10  * Copyright (c) 2005, Christopher Clark
  11  *
  12  * This program is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU General Public License version 2
  14  * as published by the Free Software Foundation; or, when distributed
  15  * separately from the Linux kernel or incorporated into other
  16  * software packages, subject to the following license:
  17  *
  18  * Permission is hereby granted, free of charge, to any person obtaining a copy
  19  * of this source file (the "Software"), to deal in the Software without
  20  * restriction, including without limitation the rights to use, copy, modify,
  21  * merge, publish, distribute, sublicense, and/or sell copies of the Software,
  22  * and to permit persons to whom the Software is furnished to do so, subject to
  23  * the following conditions:
  24  *
  25  * The above copyright notice and this permission notice shall be included in
  26  * all copies or substantial portions of the Software.
  27  *
  28  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  29  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  30  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  31  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  32  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  33  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  34  * IN THE SOFTWARE.
  35  */
  36
  37 #include <linux/spinlock.h>
  38 #include <linux/kthread.h>
  39 #include <linux/list.h>
  40 #include <linux/delay.h>
  41 #include <linux/freezer.h>
  42 #include <linux/bitmap.h>
  43
  44 #include <xen/events.h>
  45 #include <xen/page.h>
  46 #include <xen/xen.h>
  47 #include <asm/xen/hypervisor.h>
  48 #include <asm/xen/hypercall.h>
  49 #include <xen/balloon.h>
  50 #include "common.h"
  51
  52 /*
  53  * These are rather arbitrary. They are fairly large because adjacent requests
  54  * pulled from a communication ring are quite likely to end up being part of
  55  * the same scatter/gather request at the disc.
  56  *
  57  * ** TRY INCREASING 'xen_blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
  58  *
  59  * This will increase the chances of being able to write whole tracks.
  60  * 64 should be enough to keep us competitive with Linux.
  61  */
  62 static int xen_blkif_reqs = 64;
  63 module_param_named(reqs, xen_blkif_reqs, int, 0);
  64 MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
  65
  66 /* Run-time switchable: /sys/module/blkback/parameters/ */
  67 static unsigned int log_stats;
  68 module_param(log_stats, int, 0644);
  69
  70 /*
  71  * Each outstanding request that we've passed to the lower device layers has a
  72  * 'pending_req' allocated to it. Each buffer_head that completes decrements
  73  * the pendcnt towards zero. When it hits zero, the specified domain has a
  74  * response queued for it, with the saved 'id' passed back.
  75  */
  76 struct pending_req {
  77         struct xen_blkif        *blkif;
  78         u64                     id;
  79         int                     nr_pages;
  80         atomic_t                pendcnt;
  81         unsigned short          operation;
  82         int                     status;
  83         struct list_head        free_list;
  84         DECLARE_BITMAP(unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
  85 };
  86
  87 #define BLKBACK_INVALID_HANDLE (~0)
  88
  89 struct xen_blkbk {
  90         struct pending_req      *pending_reqs;
  91         /* List of all 'pending_req' available */
  92         struct list_head        pending_free;
  93         /* And its spinlock. */
  94         spinlock_t              pending_free_lock;
  95         wait_queue_head_t       pending_free_wq;
  96         /* The list of all pages that are available. */
  97         struct page             **pending_pages;
  98         /* And the grant handles that are available. */
  99         grant_handle_t          *pending_grant_handles;
 100 };
 101
 102 static struct xen_blkbk *blkbk;
 103
 104 /*
 105  * Maximum number of grant pages that can be mapped in blkback.
 106  * BLKIF_MAX_SEGMENTS_PER_REQUEST * RING_SIZE is the maximum number of
 107  * pages that blkback will persistently map.
 108  * Currently, this is:
 109  * RING_SIZE = 32 (for all known ring types)
 110  * BLKIF_MAX_SEGMENTS_PER_REQUEST = 11
 111  * sizeof(struct persistent_gnt) = 48
 112  * So the maximum memory used to store the grants is:
 113  * 32 * 11 * 48 = 16896 bytes
 114  */
 115 static inline unsigned int max_mapped_grant_pages(enum blkif_protocol protocol)
 116 {
 117         switch (protocol) {
 118         case BLKIF_PROTOCOL_NATIVE:
 119                 return __CONST_RING_SIZE(blkif, PAGE_SIZE) *
 120                            BLKIF_MAX_SEGMENTS_PER_REQUEST;
 121         case BLKIF_PROTOCOL_X86_32:
 122                 return __CONST_RING_SIZE(blkif_x86_32, PAGE_SIZE) *
 123                            BLKIF_MAX_SEGMENTS_PER_REQUEST;
 124         case BLKIF_PROTOCOL_X86_64:
 125                 return __CONST_RING_SIZE(blkif_x86_64, PAGE_SIZE) *
 126                            BLKIF_MAX_SEGMENTS_PER_REQUEST;
 127         default:
 128                 BUG();
 129         }
 130         return 0;
 131 }
 132
 133
 134 /*
 135  * Little helpful macro to figure out the index and virtual address of the
 136  * pending_pages[..]. For each 'pending_req' we have have up to
 137  * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through
 138  * 10 and would index in the pending_pages[..].
 139  */
 140 static inline int vaddr_pagenr(struct pending_req *req, int seg)
 141 {
 142         return (req - blkbk->pending_reqs) *
 143                 BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
 144 }
 145
 146 #define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]
 147
 148 static inline unsigned long vaddr(struct pending_req *req, int seg)
 149 {
 150         unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg));
 151         return (unsigned long)pfn_to_kaddr(pfn);
 152 }
 153
 154 #define pending_handle(_req, _seg) \
 155         (blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)])
 156
 157
 158 static int do_block_io_op(struct xen_blkif *blkif);
 159 static int dispatch_rw_block_io(struct xen_blkif *blkif,
 160                                 struct blkif_request *req,
 161                                 struct pending_req *pending_req);
 162 static void make_response(struct xen_blkif *blkif, u64 id,
 163                           unsigned short op, int st);
 164
 165 #define foreach_grant_safe(pos, n, rbtree, node) \
 166         for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node), \
 167              (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL; \
 168              &(pos)->node != NULL; \
 169              (pos) = container_of(n, typeof(*(pos)), node), \
 170              (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)
 171
 172
 173 static void add_persistent_gnt(struct rb_root *root,
 174                                struct persistent_gnt *persistent_gnt)
 175 {
 176         struct rb_node **new = &(root->rb_node), *parent = NULL;
 177         struct persistent_gnt *this;
 178
 179         /* Figure out where to put new node */
 180         while (*new) {
 181                 this = container_of(*new, struct persistent_gnt, node);
 182
 183                 parent = *new;
 184                 if (persistent_gnt->gnt < this->gnt)
 185                         new = &((*new)->rb_left);
 186                 else if (persistent_gnt->gnt > this->gnt)
 187                         new = &((*new)->rb_right);
 188                 else {
 189                         pr_alert(DRV_PFX " trying to add a gref that's already in the tree\n");
 190                         BUG();
 191                 }
 192         }
 193
 194         /* Add new node and rebalance tree. */
 195         rb_link_node(&(persistent_gnt->node), parent, new);
 196         rb_insert_color(&(persistent_gnt->node), root);
 197 }
 198
 199 static struct persistent_gnt *get_persistent_gnt(struct rb_root *root,
 200                                                  grant_ref_t gref)
 201 {
 202         struct persistent_gnt *data;
 203         struct rb_node *node = root->rb_node;
 204
 205         while (node) {
 206                 data = container_of(node, struct persistent_gnt, node);
 207
 208                 if (gref < data->gnt)
 209                         node = node->rb_left;
 210                 else if (gref > data->gnt)
 211                         node = node->rb_right;
 212                 else
 213                         return data;
 214         }
 215         return NULL;
 216 }
 217
 218 static void free_persistent_gnts(struct rb_root *root, unsigned int num)
 219 {
 220         struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 221         struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 222         struct persistent_gnt *persistent_gnt;
 223         struct rb_node *n;
 224         int ret = 0;
 225         int segs_to_unmap = 0;
 226
 227         foreach_grant_safe(persistent_gnt, n, root, node) {
 228                 BUG_ON(persistent_gnt->handle ==
 229                         BLKBACK_INVALID_HANDLE);
 230                 gnttab_set_unmap_op(&unmap[segs_to_unmap],
 231                         (unsigned long) pfn_to_kaddr(page_to_pfn(
 232                                 persistent_gnt->page)),
 233                         GNTMAP_host_map,
 234                         persistent_gnt->handle);
 235
 236                 pages[segs_to_unmap] = persistent_gnt->page;
 237
 238                 if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
 239                         !rb_next(&persistent_gnt->node)) {
 240                         ret = gnttab_unmap_refs(unmap, NULL, pages,
 241                                 segs_to_unmap);
 242                         BUG_ON(ret);
 243                         free_xenballooned_pages(segs_to_unmap, pages);
 244                         segs_to_unmap = 0;
 245                 }
 246
 247                 rb_erase(&persistent_gnt->node, root);
 248                 kfree(persistent_gnt);
 249                 num--;
 250         }
 251         BUG_ON(num != 0);
 252 }
 253
 254 /*
 255  * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
 256  */
 257 static struct pending_req *alloc_req(void)
 258 {
 259         struct pending_req *req = NULL;
 260         unsigned long flags;
 261
 262         spin_lock_irqsave(&blkbk->pending_free_lock, flags);
 263         if (!list_empty(&blkbk->pending_free)) {
 264                 req = list_entry(blkbk->pending_free.next, struct pending_req,
 265                                  free_list);
 266                 list_del(&req->free_list);
 267         }
 268         spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
 269         return req;
 270 }
 271
 272 /*
 273  * Return the 'pending_req' structure back to the freepool. We also
 274  * wake up the thread if it was waiting for a free page.
 275  */
 276 static void free_req(struct pending_req *req)
 277 {
 278         unsigned long flags;
 279         int was_empty;
 280
 281         spin_lock_irqsave(&blkbk->pending_free_lock, flags);
 282         was_empty = list_empty(&blkbk->pending_free);
 283         list_add(&req->free_list, &blkbk->pending_free);
 284         spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
 285         if (was_empty)
 286                 wake_up(&blkbk->pending_free_wq);
 287 }
 288
 289 /*
 290  * Routines for managing virtual block devices (vbds).
 291  */
 292 static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
 293                              int operation)
 294 {
 295         struct xen_vbd *vbd = &blkif->vbd;
 296         int rc = -EACCES;
 297
 298         if ((operation != READ) && vbd->readonly)
 299                 goto out;
 300
 301         if (likely(req->nr_sects)) {
 302                 blkif_sector_t end = req->sector_number + req->nr_sects;
 303
 304                 if (unlikely(end < req->sector_number))
 305                         goto out;
 306                 if (unlikely(end > vbd_sz(vbd)))
 307                         goto out;
 308         }
 309
 310         req->dev  = vbd->pdevice;
 311         req->bdev = vbd->bdev;
 312         rc = 0;
 313
 314  out:
 315         return rc;
 316 }
 317
 318 static void xen_vbd_resize(struct xen_blkif *blkif)
 319 {
 320         struct xen_vbd *vbd = &blkif->vbd;
 321         struct xenbus_transaction xbt;
 322         int err;
 323         struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
 324         unsigned long long new_size = vbd_sz(vbd);
 325
 326         pr_info(DRV_PFX "VBD Resize: Domid: %d, Device: (%d, %d)\n",
 327                 blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
 328         pr_info(DRV_PFX "VBD Resize: new size %llu\n", new_size);
 329         vbd->size = new_size;
 330 again:
 331         err = xenbus_transaction_start(&xbt);
 332         if (err) {
 333                 pr_warn(DRV_PFX "Error starting transaction");
 334                 return;
 335         }
 336         err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
 337                             (unsigned long long)vbd_sz(vbd));
 338         if (err) {
 339                 pr_warn(DRV_PFX "Error writing new size");
 340                 goto abort;
 341         }
 342         /*
 343          * Write the current state; we will use this to synchronize
 344          * the front-end. If the current state is "connected" the
 345          * front-end will get the new size information online.
 346          */
 347         err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
 348         if (err) {
 349                 pr_warn(DRV_PFX "Error writing the state");
 350                 goto abort;
 351         }
 352
 353         err = xenbus_transaction_end(xbt, 0);
 354         if (err == -EAGAIN)
 355                 goto again;
 356         if (err)
 357                 pr_warn(DRV_PFX "Error ending transaction");
 358         return;
 359 abort:
 360         xenbus_transaction_end(xbt, 1);
 361 }
 362
 363 /*
 364  * Notification from the guest OS.
 365  */
 366 static void blkif_notify_work(struct xen_blkif *blkif)
 367 {
 368         blkif->waiting_reqs = 1;
 369         wake_up(&blkif->wq);
 370 }
 371
 372 irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
 373 {
 374         blkif_notify_work(dev_id);
 375         return IRQ_HANDLED;
 376 }
 377
 378 /*
 379  * SCHEDULER FUNCTIONS
 380  */
 381
 382 static void print_stats(struct xen_blkif *blkif)
 383 {
 384         pr_info("xen-blkback (%s): oo %3llu  |  rd %4llu  |  wr %4llu  |  f %4llu"
 385                  "  |  ds %4llu\n",
 386                  current->comm, blkif->st_oo_req,
 387                  blkif->st_rd_req, blkif->st_wr_req,
 388                  blkif->st_f_req, blkif->st_ds_req);
 389         blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
 390         blkif->st_rd_req = 0;
 391         blkif->st_wr_req = 0;
 392         blkif->st_oo_req = 0;
 393         blkif->st_ds_req = 0;
 394 }
 395
 396 int xen_blkif_schedule(void *arg)
 397 {
 398         struct xen_blkif *blkif = arg;
 399         struct xen_vbd *vbd = &blkif->vbd;
 400
 401         xen_blkif_get(blkif);
 402
 403         while (!kthread_should_stop()) {
 404                 if (try_to_freeze())
 405                         continue;
 406                 if (unlikely(vbd->size != vbd_sz(vbd)))
 407                         xen_vbd_resize(blkif);
 408
 409                 wait_event_interruptible(
 410                         blkif->wq,
 411                         blkif->waiting_reqs || kthread_should_stop());
 412                 wait_event_interruptible(
 413                         blkbk->pending_free_wq,
 414                         !list_empty(&blkbk->pending_free) ||
 415                         kthread_should_stop());
 416
 417                 blkif->waiting_reqs = 0;
 418                 smp_mb(); /* clear flag *before* checking for work */
 419
 420                 if (do_block_io_op(blkif))
 421                         blkif->waiting_reqs = 1;
 422
 423                 if (log_stats && time_after(jiffies, blkif->st_print))
 424                         print_stats(blkif);
 425         }
 426
 427         /* Free all persistent grant pages */
 428         if (!RB_EMPTY_ROOT(&blkif->persistent_gnts))
 429                 free_persistent_gnts(&blkif->persistent_gnts,
 430                         blkif->persistent_gnt_c);
 431
 432         BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
 433         blkif->persistent_gnt_c = 0;
 434
 435         if (log_stats)
 436                 print_stats(blkif);
 437
 438         blkif->xenblkd = NULL;
 439         xen_blkif_put(blkif);
 440
 441         return 0;
 442 }
 443
 444 struct seg_buf {
 445         unsigned long buf;
 446         unsigned int nsec;
 447 };
 448 /*
 449  * Unmap the grant references, and also remove the M2P over-rides
 450  * used in the 'pending_req'.
 451  */
 452 static void xen_blkbk_unmap(struct pending_req *req)
 453 {
 454         struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 455         struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 456         unsigned int i, invcount = 0;
 457         grant_handle_t handle;
 458         int ret;
 459
 460         for (i = 0; i < req->nr_pages; i++) {
 461                 if (!test_bit(i, req->unmap_seg))
 462                         continue;
 463                 handle = pending_handle(req, i);
 464                 if (handle == BLKBACK_INVALID_HANDLE)
 465                         continue;
 466                 gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
 467                                     GNTMAP_host_map, handle);
 468                 pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
 469                 pages[invcount] = virt_to_page(vaddr(req, i));
 470                 invcount++;
 471         }
 472
 473         ret = gnttab_unmap_refs(unmap, NULL, pages, invcount);
 474         BUG_ON(ret);
 475 }
 476
 477 static int xen_blkbk_map(struct blkif_request *req,
 478                          struct pending_req *pending_req,
 479                          struct seg_buf seg[],
 480                          struct page *pages[])
 481 {
 482         struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 483         struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 484         struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 485         struct persistent_gnt *persistent_gnt = NULL;
 486         struct xen_blkif *blkif = pending_req->blkif;
 487         phys_addr_t addr = 0;
 488         int i, j;
 489         bool new_map;
 490         int nseg = req->u.rw.nr_segments;
 491         int segs_to_map = 0;
 492         int ret = 0;
 493         int use_persistent_gnts;
 494
 495         use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
 496
 497         BUG_ON(blkif->persistent_gnt_c >
 498                    max_mapped_grant_pages(pending_req->blkif->blk_protocol));
 499
 500         /*
 501          * Fill out preq.nr_sects with proper amount of sectors, and setup
 502          * assign map[..] with the PFN of the page in our domain with the
 503          * corresponding grant reference for each page.
 504          */
 505         for (i = 0; i < nseg; i++) {
 506                 uint32_t flags;
 507
 508                 if (use_persistent_gnts)
 509                         persistent_gnt = get_persistent_gnt(
 510                                 &blkif->persistent_gnts,
 511                                 req->u.rw.seg[i].gref);
 512
 513                 if (persistent_gnt) {
 514                         /*
 515                          * We are using persistent grants and
 516                          * the grant is already mapped
 517                          */
 518                         new_map = false;
 519                 } else if (use_persistent_gnts &&
 520                            blkif->persistent_gnt_c <
 521                            max_mapped_grant_pages(blkif->blk_protocol)) {
 522                         /*
 523                          * We are using persistent grants, the grant is
 524                          * not mapped but we have room for it
 525                          */
 526                         new_map = true;
 527                         persistent_gnt = kmalloc(
 528                                 sizeof(struct persistent_gnt),
 529                                 GFP_KERNEL);
 530                         if (!persistent_gnt)
 531                                 return -ENOMEM;
 532                         if (alloc_xenballooned_pages(1, &persistent_gnt->page,
 533                             false)) {
 534                                 kfree(persistent_gnt);
 535                                 return -ENOMEM;
 536                         }
 537                         persistent_gnt->gnt = req->u.rw.seg[i].gref;
 538                         persistent_gnt->handle = BLKBACK_INVALID_HANDLE;
 539
 540                         pages_to_gnt[segs_to_map] =
 541                                 persistent_gnt->page;
 542                         addr = (unsigned long) pfn_to_kaddr(
 543                                 page_to_pfn(persistent_gnt->page));
 544
 545                         add_persistent_gnt(&blkif->persistent_gnts,
 546                                 persistent_gnt);
 547                         blkif->persistent_gnt_c++;
 548                         pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n",
 549                                  persistent_gnt->gnt, blkif->persistent_gnt_c,
 550                                  max_mapped_grant_pages(blkif->blk_protocol));
 551                 } else {
 552                         /*
 553                          * We are either using persistent grants and
 554                          * hit the maximum limit of grants mapped,
 555                          * or we are not using persistent grants.
 556                          */
 557                         if (use_persistent_gnts &&
 558                                 !blkif->vbd.overflow_max_grants) {
 559                                 blkif->vbd.overflow_max_grants = 1;
 560                                 pr_alert(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
 561                                          blkif->domid, blkif->vbd.handle);
 562                         }
 563                         new_map = true;
 564                         pages[i] = blkbk->pending_page(pending_req, i);
 565                         addr = vaddr(pending_req, i);
 566                         pages_to_gnt[segs_to_map] =
 567                                 blkbk->pending_page(pending_req, i);
 568                 }
 569
 570                 if (persistent_gnt) {
 571                         pages[i] = persistent_gnt->page;
 572                         persistent_gnts[i] = persistent_gnt;
 573                 } else {
 574                         persistent_gnts[i] = NULL;
 575                 }
 576
 577                 if (new_map) {
 578                         flags = GNTMAP_host_map;
 579                         if (!persistent_gnt &&
 580                             (pending_req->operation != BLKIF_OP_READ))
 581                                 flags |= GNTMAP_readonly;
 582                         gnttab_set_map_op(&map[segs_to_map++], addr,
 583                                           flags, req->u.rw.seg[i].gref,
 584                                           blkif->domid);
 585                 }
 586         }
 587
 588         if (segs_to_map) {
 589                 ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map);
 590                 BUG_ON(ret);
 591         }
 592
 593         /*
 594          * Now swizzle the MFN in our domain with the MFN from the other domain
 595          * so that when we access vaddr(pending_req,i) it has the contents of
 596          * the page from the other domain.
 597          */
 598         bitmap_zero(pending_req->unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
 599         for (i = 0, j = 0; i < nseg; i++) {
 600                 if (!persistent_gnts[i] ||
 601                     persistent_gnts[i]->handle == BLKBACK_INVALID_HANDLE) {
 602                         /* This is a newly mapped grant */
 603                         BUG_ON(j >= segs_to_map);
 604                         if (unlikely(map[j].status != 0)) {
 605                                 pr_debug(DRV_PFX "invalid buffer -- could not remap it\n");
 606                                 map[j].handle = BLKBACK_INVALID_HANDLE;
 607                                 ret |= 1;
 608                                 if (persistent_gnts[i]) {
 609                                         rb_erase(&persistent_gnts[i]->node,
 610                                                  &blkif->persistent_gnts);
 611                                         blkif->persistent_gnt_c--;
 612                                         kfree(persistent_gnts[i]);
 613                                         persistent_gnts[i] = NULL;
 614                                 }
 615                         }
 616                 }
 617                 if (persistent_gnts[i]) {
 618                         if (persistent_gnts[i]->handle ==
 619                             BLKBACK_INVALID_HANDLE) {
 620                                 /*
 621                                  * If this is a new persistent grant
 622                                  * save the handler
 623                                  */
 624                                 persistent_gnts[i]->handle = map[j].handle;
 625                                 persistent_gnts[i]->dev_bus_addr =
 626                                         map[j++].dev_bus_addr;
 627                         }
 628                         pending_handle(pending_req, i) =
 629                                 persistent_gnts[i]->handle;
 630
 631                         if (ret)
 632                                 continue;
 633
 634                         seg[i].buf = persistent_gnts[i]->dev_bus_addr |
 635                                 (req->u.rw.seg[i].first_sect << 9);
 636                 } else {
 637                         pending_handle(pending_req, i) = map[j].handle;
 638                         bitmap_set(pending_req->unmap_seg, i, 1);
 639
 640                         if (ret) {
 641                                 j++;
 642                                 continue;
 643                         }
 644
 645                         seg[i].buf = map[j++].dev_bus_addr |
 646                                 (req->u.rw.seg[i].first_sect << 9);
 647                 }
 648         }
 649         return ret;
 650 }
 651
 652 static int dispatch_discard_io(struct xen_blkif *blkif,
 653                                 struct blkif_request *req)
 654 {
 655         int err = 0;
 656         int status = BLKIF_RSP_OKAY;
 657         struct block_device *bdev = blkif->vbd.bdev;
 658         unsigned long secure;
 659
 660         blkif->st_ds_req++;
 661
 662         xen_blkif_get(blkif);
 663         secure = (blkif->vbd.discard_secure &&
 664                  (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
 665                  BLKDEV_DISCARD_SECURE : 0;
 666
 667         err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
 668                                    req->u.discard.nr_sectors,
 669                                    GFP_KERNEL, secure);
 670
 671         if (err == -EOPNOTSUPP) {
 672                 pr_debug(DRV_PFX "discard op failed, not supported\n");
 673                 status = BLKIF_RSP_EOPNOTSUPP;
 674         } else if (err)
 675                 status = BLKIF_RSP_ERROR;
 676
 677         make_response(blkif, req->u.discard.id, req->operation, status);
 678         xen_blkif_put(blkif);
 679         return err;
 680 }
 681
 682 static int dispatch_other_io(struct xen_blkif *blkif,
 683                              struct blkif_request *req,
 684                              struct pending_req *pending_req)
 685 {
 686         free_req(pending_req);
 687         make_response(blkif, req->u.other.id, req->operation,
 688                       BLKIF_RSP_EOPNOTSUPP);
 689         return -EIO;
 690 }
 691
 692 static void xen_blk_drain_io(struct xen_blkif *blkif)
 693 {
 694         atomic_set(&blkif->drain, 1);
 695         do {
 696                 /* The initial value is one, and one refcnt taken at the
 697                  * start of the xen_blkif_schedule thread. */
 698                 if (atomic_read(&blkif->refcnt) <= 2)
 699                         break;
 700                 wait_for_completion_interruptible_timeout(
 701                                 &blkif->drain_complete, HZ);
 702
 703                 if (!atomic_read(&blkif->drain))
 704                         break;
 705         } while (!kthread_should_stop());
 706         atomic_set(&blkif->drain, 0);
 707 }
 708
 709 /*
 710  * Completion callback on the bio's. Called as bh->b_end_io()
 711  */
 712
 713 static void __end_block_io_op(struct pending_req *pending_req, int error)
 714 {
 715         /* An error fails the entire request. */
 716         if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
 717             (error == -EOPNOTSUPP)) {
 718                 pr_debug(DRV_PFX "flush diskcache op failed, not supported\n");
 719                 xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0);
 720                 pending_req->status = BLKIF_RSP_EOPNOTSUPP;
 721         } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
 722                     (error == -EOPNOTSUPP)) {
 723                 pr_debug(DRV_PFX "write barrier op failed, not supported\n");
 724                 xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0);
 725                 pending_req->status = BLKIF_RSP_EOPNOTSUPP;
 726         } else if (error) {
 727                 pr_debug(DRV_PFX "Buffer not up-to-date at end of operation,"
 728                          " error=%d\n", error);
 729                 pending_req->status = BLKIF_RSP_ERROR;
 730         }
 731
 732         /*
 733          * If all of the bio's have completed it is time to unmap
 734          * the grant references associated with 'request' and provide
 735          * the proper response on the ring.
 736          */
 737         if (atomic_dec_and_test(&pending_req->pendcnt)) {
 738                 xen_blkbk_unmap(pending_req);
 739                 make_response(pending_req->blkif, pending_req->id,
 740                               pending_req->operation, pending_req->status);
 741                 xen_blkif_put(pending_req->blkif);
 742                 if (atomic_read(&pending_req->blkif->refcnt) <= 2) {
 743                         if (atomic_read(&pending_req->blkif->drain))
 744                                 complete(&pending_req->blkif->drain_complete);
 745                 }
 746                 free_req(pending_req);
 747         }
 748 }
 749
 750 /*
 751  * bio callback.
 752  */
 753 static void end_block_io_op(struct bio *bio, int error)
 754 {
 755         __end_block_io_op(bio->bi_private, error);
 756         bio_put(bio);
 757 }
 758
 759
 760
 761 /*
 762  * Function to copy the from the ring buffer the 'struct blkif_request'
 763  * (which has the sectors we want, number of them, grant references, etc),
 764  * and transmute  it to the block API to hand it over to the proper block disk.
 765  */
 766 static int
 767 __do_block_io_op(struct xen_blkif *blkif)
 768 {
 769         union blkif_back_rings *blk_rings = &blkif->blk_rings;
 770         struct blkif_request req;
 771         struct pending_req *pending_req;
 772         RING_IDX rc, rp;
 773         int more_to_do = 0;
 774
 775         rc = blk_rings->common.req_cons;
 776         rp = blk_rings->common.sring->req_prod;
 777         rmb(); /* Ensure we see queued requests up to 'rp'. */
 778
 779         while (rc != rp) {
 780
 781                 if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
 782                         break;
 783
 784                 if (kthread_should_stop()) {
 785                         more_to_do = 1;
 786                         break;
 787                 }
 788
 789                 pending_req = alloc_req();
 790                 if (NULL == pending_req) {
 791                         blkif->st_oo_req++;
 792                         more_to_do = 1;
 793                         break;
 794                 }
 795
 796                 switch (blkif->blk_protocol) {
 797                 case BLKIF_PROTOCOL_NATIVE:
 798                         memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
 799                         break;
 800                 case BLKIF_PROTOCOL_X86_32:
 801                         blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
 802                         break;
 803                 case BLKIF_PROTOCOL_X86_64:
 804                         blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
 805                         break;
 806                 default:
 807                         BUG();
 808                 }
 809                 blk_rings->common.req_cons = ++rc; /* before make_response() */
 810
 811                 /* Apply all sanity checks to /private copy/ of request. */
 812                 barrier();
 813
 814                 switch (req.operation) {
 815                 case BLKIF_OP_READ:
 816                 case BLKIF_OP_WRITE:
 817                 case BLKIF_OP_WRITE_BARRIER:
 818                 case BLKIF_OP_FLUSH_DISKCACHE:
 819                         if (dispatch_rw_block_io(blkif, &req, pending_req))
 820                                 goto done;
 821                         break;
 822                 case BLKIF_OP_DISCARD:
 823                         free_req(pending_req);
 824                         if (dispatch_discard_io(blkif, &req))
 825                                 goto done;
 826                         break;
 827                 default:
 828                         if (dispatch_other_io(blkif, &req, pending_req))
 829                                 goto done;
 830                         break;
 831                 }
 832
 833                 /* Yield point for this unbounded loop. */
 834                 cond_resched();
 835         }
 836 done:
 837         return more_to_do;
 838 }
 839
 840 static int
 841 do_block_io_op(struct xen_blkif *blkif)
 842 {
 843         union blkif_back_rings *blk_rings = &blkif->blk_rings;
 844         int more_to_do;
 845
 846         do {
 847                 more_to_do = __do_block_io_op(blkif);
 848                 if (more_to_do)
 849                         break;
 850
 851                 RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
 852         } while (more_to_do);
 853
 854         return more_to_do;
 855 }
 856 /*
 857  * Transmutation of the 'struct blkif_request' to a proper 'struct bio'
 858  * and call the 'submit_bio' to pass it to the underlying storage.
 859  */
 860 static int dispatch_rw_block_io(struct xen_blkif *blkif,
 861                                 struct blkif_request *req,
 862                                 struct pending_req *pending_req)
 863 {
 864         struct phys_req preq;
 865         struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 866         unsigned int nseg;
 867         struct bio *bio = NULL;
 868         struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 869         int i, nbio = 0;
 870         int operation;
 871         struct blk_plug plug;
 872         bool drain = false;
 873         struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 874
 875         switch (req->operation) {
 876         case BLKIF_OP_READ:
 877                 blkif->st_rd_req++;
 878                 operation = READ;
 879                 break;
 880         case BLKIF_OP_WRITE:
 881                 blkif->st_wr_req++;
 882                 operation = WRITE_ODIRECT;
 883                 break;
 884         case BLKIF_OP_WRITE_BARRIER:
 885                 drain = true;
 886         case BLKIF_OP_FLUSH_DISKCACHE:
 887                 blkif->st_f_req++;
 888                 operation = WRITE_FLUSH;
 889                 break;
 890         default:
 891                 operation = 0; /* make gcc happy */
 892                 goto fail_response;
 893                 break;
 894         }
 895
 896         /* Check that the number of segments is sane. */
 897         nseg = req->u.rw.nr_segments;
 898
 899         if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
 900             unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
 901                 pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
 902                          nseg);
 903                 /* Haven't submitted any bio's yet. */
 904                 goto fail_response;
 905         }
 906
 907         preq.sector_number = req->u.rw.sector_number;
 908         preq.nr_sects      = 0;
 909
 910         pending_req->blkif     = blkif;
 911         pending_req->id        = req->u.rw.id;
 912         pending_req->operation = req->operation;
 913         pending_req->status    = BLKIF_RSP_OKAY;
 914         pending_req->nr_pages  = nseg;
 915
 916         for (i = 0; i < nseg; i++) {
 917                 seg[i].nsec = req->u.rw.seg[i].last_sect -
 918                         req->u.rw.seg[i].first_sect + 1;
 919                 if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
 920                     (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect))
 921                         goto fail_response;
 922                 preq.nr_sects += seg[i].nsec;
 923
 924         }
 925
 926         if (xen_vbd_translate(&preq, blkif, operation) != 0) {
 927                 pr_debug(DRV_PFX "access denied: %s of [%llu,%llu] on dev=%04x\n",
 928                          operation == READ ? "read" : "write",
 929                          preq.sector_number,
 930                          preq.sector_number + preq.nr_sects,
 931                          blkif->vbd.pdevice);
 932                 goto fail_response;
 933         }
 934
 935         /*
 936          * This check _MUST_ be done after xen_vbd_translate as the preq.bdev
 937          * is set there.
 938          */
 939         for (i = 0; i < nseg; i++) {
 940                 if (((int)preq.sector_number|(int)seg[i].nsec) &
 941                     ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
 942                         pr_debug(DRV_PFX "Misaligned I/O request from domain %d",
 943                                  blkif->domid);
 944                         goto fail_response;
 945                 }
 946         }
 947
 948         /* Wait on all outstanding I/O's and once that has been completed
 949          * issue the WRITE_FLUSH.
 950          */
 951         if (drain)
 952                 xen_blk_drain_io(pending_req->blkif);
 953
 954         /*
 955          * If we have failed at this point, we need to undo the M2P override,
 956          * set gnttab_set_unmap_op on all of the grant references and perform
 957          * the hypercall to unmap the grants - that is all done in
 958          * xen_blkbk_unmap.
 959          */
 960         if (xen_blkbk_map(req, pending_req, seg, pages))
 961                 goto fail_flush;
 962
 963         /*
 964          * This corresponding xen_blkif_put is done in __end_block_io_op, or
 965          * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
 966          */
 967         xen_blkif_get(blkif);
 968
 969         for (i = 0; i < nseg; i++) {
 970                 while ((bio == NULL) ||
 971                        (bio_add_page(bio,
 972                                      pages[i],
 973                                      seg[i].nsec << 9,
 974                                      seg[i].buf & ~PAGE_MASK) == 0)) {
 975
 976                         bio = bio_alloc(GFP_KERNEL, nseg-i);
 977                         if (unlikely(bio == NULL))
 978                                 goto fail_put_bio;
 979
 980                         biolist[nbio++] = bio;
 981                         bio->bi_bdev    = preq.bdev;
 982                         bio->bi_private = pending_req;
 983                         bio->bi_end_io  = end_block_io_op;
 984                         bio->bi_sector  = preq.sector_number;
 985                 }
 986
 987                 preq.sector_number += seg[i].nsec;
 988         }
 989
 990         /* This will be hit if the operation was a flush or discard. */
 991         if (!bio) {
 992                 BUG_ON(operation != WRITE_FLUSH);
 993
 994                 bio = bio_alloc(GFP_KERNEL, 0);
 995                 if (unlikely(bio == NULL))
 996                         goto fail_put_bio;
 997
 998                 biolist[nbio++] = bio;
 999                 bio->bi_bdev    = preq.bdev;
1000                 bio->bi_private = pending_req;
1001                 bio->bi_end_io  = end_block_io_op;
1002         }
1003
1004         atomic_set(&pending_req->pendcnt, nbio);
1005         blk_start_plug(&plug);
1006
1007         for (i = 0; i < nbio; i++)
1008                 submit_bio(operation, biolist[i]);
1009
1010         /* Let the I/Os go.. */
1011         blk_finish_plug(&plug);
1012
1013         if (operation == READ)
1014                 blkif->st_rd_sect += preq.nr_sects;
1015         else if (operation & WRITE)
1016                 blkif->st_wr_sect += preq.nr_sects;
1017
1018         return 0;
1019
1020  fail_flush:
1021         xen_blkbk_unmap(pending_req);
1022  fail_response:
1023         /* Haven't submitted any bio's yet. */
1024         make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR);
1025         free_req(pending_req);
1026         msleep(1); /* back off a bit */
1027         return -EIO;
1028
1029  fail_put_bio:
1030         for (i = 0; i < nbio; i++)
1031                 bio_put(biolist[i]);
1032         atomic_set(&pending_req->pendcnt, 1);
1033         __end_block_io_op(pending_req, -EINVAL);
1034         msleep(1); /* back off a bit */
1035         return -EIO;
1036 }
1037
1038
1039
1040 /*
1041  * Put a response on the ring on how the operation fared.
1042  */
1043 static void make_response(struct xen_blkif *blkif, u64 id,
1044                           unsigned short op, int st)
1045 {
1046         struct blkif_response  resp;
1047         unsigned long     flags;
1048         union blkif_back_rings *blk_rings = &blkif->blk_rings;
1049         int notify;
1050
1051         resp.id        = id;
1052         resp.operation = op;
1053         resp.status    = st;
1054
1055         spin_lock_irqsave(&blkif->blk_ring_lock, flags);
1056         /* Place on the response ring for the relevant domain. */
1057         switch (blkif->blk_protocol) {
1058         case BLKIF_PROTOCOL_NATIVE:
1059                 memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
1060                        &resp, sizeof(resp));
1061                 break;
1062         case BLKIF_PROTOCOL_X86_32:
1063                 memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
1064                        &resp, sizeof(resp));
1065                 break;
1066         case BLKIF_PROTOCOL_X86_64:
1067                 memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
1068                        &resp, sizeof(resp));
1069                 break;
1070         default:
1071                 BUG();
1072         }
1073         blk_rings->common.rsp_prod_pvt++;
1074         RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
1075         spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
1076         if (notify)
1077                 notify_remote_via_irq(blkif->irq);
1078 }
1079
1080 static int __init xen_blkif_init(void)
1081 {
1082         int i, mmap_pages;
1083         int rc = 0;
1084
1085         if (!xen_domain())
1086                 return -ENODEV;
1087
1088         blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL);
1089         if (!blkbk) {
1090                 pr_alert(DRV_PFX "%s: out of memory!\n", __func__);
1091                 return -ENOMEM;
1092         }
1093
1094         mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
1095
1096         blkbk->pending_reqs          = kzalloc(sizeof(blkbk->pending_reqs[0]) *
1097                                         xen_blkif_reqs, GFP_KERNEL);
1098         blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) *
1099                                         mmap_pages, GFP_KERNEL);
1100         blkbk->pending_pages         = kzalloc(sizeof(blkbk->pending_pages[0]) *
1101                                         mmap_pages, GFP_KERNEL);
1102
1103         if (!blkbk->pending_reqs || !blkbk->pending_grant_handles ||
1104             !blkbk->pending_pages) {
1105                 rc = -ENOMEM;
1106                 goto out_of_memory;
1107         }
1108
1109         for (i = 0; i < mmap_pages; i++) {
1110                 blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
1111                 blkbk->pending_pages[i] = alloc_page(GFP_KERNEL);
1112                 if (blkbk->pending_pages[i] == NULL) {
1113                         rc = -ENOMEM;
1114                         goto out_of_memory;
1115                 }
1116         }
1117         rc = xen_blkif_interface_init();
1118         if (rc)
1119                 goto failed_init;
1120
1121         INIT_LIST_HEAD(&blkbk->pending_free);
1122         spin_lock_init(&blkbk->pending_free_lock);
1123         init_waitqueue_head(&blkbk->pending_free_wq);
1124
1125         for (i = 0; i < xen_blkif_reqs; i++)
1126                 list_add_tail(&blkbk->pending_reqs[i].free_list,
1127                               &blkbk->pending_free);
1128
1129         rc = xen_blkif_xenbus_init();
1130         if (rc)
1131                 goto failed_init;
1132
1133         return 0;
1134
1135  out_of_memory:
1136         pr_alert(DRV_PFX "%s: out of memory\n", __func__);
1137  failed_init:
1138         kfree(blkbk->pending_reqs);
1139         kfree(blkbk->pending_grant_handles);
1140         if (blkbk->pending_pages) {
1141                 for (i = 0; i < mmap_pages; i++) {
1142                         if (blkbk->pending_pages[i])
1143                                 __free_page(blkbk->pending_pages[i]);
1144                 }
1145                 kfree(blkbk->pending_pages);
1146         }
1147         kfree(blkbk);
1148         blkbk = NULL;
1149         return rc;
1150 }
1151
1152 module_init(xen_blkif_init); /* Register xen_blkif_init() as the module's initialisation function. */
1153
1154 MODULE_LICENSE("Dual BSD/GPL"); /* License string declared for this module. */
1155 MODULE_ALIAS("xen-backend:vbd"); /* Alias string declared for this module. */