drivers/md/dm-cache-target.c
1 /*
2  * Copyright (C) 2012 Red Hat. All rights reserved.
3  *
4  * This file is released under the GPL.
5  */
6
7 #include "dm.h"
8 #include "dm-bio-prison.h"
9 #include "dm-cache-metadata.h"
10
11 #include <linux/dm-io.h>
12 #include <linux/dm-kcopyd.h>
13 #include <linux/init.h>
14 #include <linux/mempool.h>
15 #include <linux/module.h>
16 #include <linux/slab.h>
17 #include <linux/vmalloc.h>
18
19 #define DM_MSG_PREFIX "cache"
20
21 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
22         "A percentage of time allocated for copying to and/or from cache");
23
24 /*----------------------------------------------------------------*/
25
26 /*
27  * Glossary:
28  *
29  * oblock: index of an origin block
30  * cblock: index of a cache block
31  * promotion: movement of a block from origin to cache
32  * demotion: movement of a block from cache to origin
33  * migration: movement of a block between the origin and cache device,
34  *            either direction
35  */
36
37 /*----------------------------------------------------------------*/
38
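/*
 * Round up to a whole number of unsigned longs.  E.g. 100 entries with
 * 64-bit longs needs dm_div_up(100, 64) = 2 longs, i.e. 16 bytes.
 */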
39 static size_t bitset_size_in_bytes(unsigned nr_entries)
40 {
41         return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
42 }
43
44 static unsigned long *alloc_bitset(unsigned nr_entries)
45 {
46         size_t s = bitset_size_in_bytes(nr_entries);
47         return vzalloc(s);
48 }
49
50 static void clear_bitset(void *bitset, unsigned nr_entries)
51 {
52         size_t s = bitset_size_in_bytes(nr_entries);
53         memset(bitset, 0, s);
54 }
55
56 static void free_bitset(unsigned long *bits)
57 {
58         vfree(bits);
59 }
60
61 /*----------------------------------------------------------------*/
62
63 #define PRISON_CELLS 1024
64 #define MIGRATION_POOL_SIZE 128
65 #define COMMIT_PERIOD HZ
66 #define MIGRATION_COUNT_WINDOW 10
67
68 /*
69  * The block size of the device holding cache data must be >= 32KB
70  */
71 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
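/* With 512-byte sectors (SECTOR_SHIFT == 9) this works out to 64 sectors. */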
72
73 /*
74  * FIXME: the cache is read/write for the time being.
75  */
76 enum cache_mode {
77         CM_WRITE,               /* metadata may be changed */
78         CM_READ_ONLY,           /* metadata may not be changed */
79 };
80
81 struct cache_features {
82         enum cache_mode mode;
83         bool write_through:1;
84 };
85
86 struct cache_stats {
87         atomic_t read_hit;
88         atomic_t read_miss;
89         atomic_t write_hit;
90         atomic_t write_miss;
91         atomic_t demotion;
92         atomic_t promotion;
93         atomic_t copies_avoided;
94         atomic_t cache_cell_clash;
95         atomic_t commit_count;
96         atomic_t discard_count;
97 };
98
99 struct cache {
100         struct dm_target *ti;
101         struct dm_target_callbacks callbacks;
102
103         /*
104          * Metadata is written to this device.
105          */
106         struct dm_dev *metadata_dev;
107
108         /*
109          * The slower of the two data devices.  Typically a spindle.
110          */
111         struct dm_dev *origin_dev;
112
113         /*
114          * The faster of the two data devices.  Typically an SSD.
115          */
116         struct dm_dev *cache_dev;
117
118         /*
119          * Cache features such as write-through.
120          */
121         struct cache_features features;
122
123         /*
124          * Size of the origin device in _complete_ blocks and native sectors.
125          */
126         dm_oblock_t origin_blocks;
127         sector_t origin_sectors;
128
129         /*
130          * Size of the cache device in blocks.
131          */
132         dm_cblock_t cache_size;
133
134         /*
135          * Fields for converting from sectors to blocks.
136          */
137         uint32_t sectors_per_block;
138         int sectors_per_block_shift;
139
140         struct dm_cache_metadata *cmd;
141
142         spinlock_t lock;
143         struct bio_list deferred_bios;
144         struct bio_list deferred_flush_bios;
145         struct list_head quiesced_migrations;
146         struct list_head completed_migrations;
147         struct list_head need_commit_migrations;
148         sector_t migration_threshold;
149         atomic_t nr_migrations;
150         wait_queue_head_t migration_wait;
151
152         /*
153          * cache_size entries, dirty if set
154          */
155         dm_cblock_t nr_dirty;
156         unsigned long *dirty_bitset;
157
158         /*
159          * origin_blocks entries, discarded if set.
160          */
161         sector_t discard_block_size; /* a power of 2 times sectors per block */
162         dm_dblock_t discard_nr_blocks;
163         unsigned long *discard_bitset;
164
165         struct dm_kcopyd_client *copier;
166         struct workqueue_struct *wq;
167         struct work_struct worker;
168
169         struct delayed_work waker;
170         unsigned long last_commit_jiffies;
171
172         struct dm_bio_prison *prison;
173         struct dm_deferred_set *all_io_ds;
174
175         mempool_t *migration_pool;
176         struct dm_cache_migration *next_migration;
177
178         struct dm_cache_policy *policy;
179         unsigned policy_nr_args;
180
181         bool need_tick_bio:1;
182         bool sized:1;
183         bool quiescing:1;
184         bool commit_requested:1;
185         bool loaded_mappings:1;
186         bool loaded_discards:1;
187
188         struct cache_stats stats;
189
190         /*
191          * Rather than reconstructing the table line for the status, we just
192          * save it and regurgitate it.
193          */
194         unsigned nr_ctr_args;
195         const char **ctr_args;
196 };
197
198 struct per_bio_data {
199         bool tick:1;
200         unsigned req_nr:2;
201         struct dm_deferred_entry *all_io_entry;
202 };
203
204 struct dm_cache_migration {
205         struct list_head list;
206         struct cache *cache;
207
208         unsigned long start_jiffies;
209         dm_oblock_t old_oblock;
210         dm_oblock_t new_oblock;
211         dm_cblock_t cblock;
212
213         bool err:1;
214         bool writeback:1;
215         bool demote:1;
216         bool promote:1;
217
218         struct dm_bio_prison_cell *old_ocell;
219         struct dm_bio_prison_cell *new_ocell;
220 };
221
222 /*
223  * Processing a bio in the worker thread may require these memory
224  * allocations.  We prealloc to avoid deadlocks (the same worker thread
225  * frees them back to the mempool).
226  */
227 struct prealloc {
228         struct dm_cache_migration *mg;
229         struct dm_bio_prison_cell *cell1;
230         struct dm_bio_prison_cell *cell2;
231 };
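/*
 * Typical usage, as in process_deferred_bios() and
 * writeback_some_dirty_blocks() below:
 *
 *	struct prealloc structs;
 *
 *	memset(&structs, 0, sizeof(structs));
 *	if (prealloc_data_structs(cache, &structs))
 *		... back off and retry later ...
 *	... take what's needed with prealloc_get_cell()/prealloc_get_migration() ...
 *	prealloc_free_structs(cache, &structs);
 */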
232
233 static void wake_worker(struct cache *cache)
234 {
235         queue_work(cache->wq, &cache->worker);
236 }
237
238 /*----------------------------------------------------------------*/
239
240 static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
241 {
242         /* FIXME: change to use a local slab. */
243         return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
244 }
245
246 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
247 {
248         dm_bio_prison_free_cell(cache->prison, cell);
249 }
250
251 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
252 {
253         if (!p->mg) {
254                 p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
255                 if (!p->mg)
256                         return -ENOMEM;
257         }
258
259         if (!p->cell1) {
260                 p->cell1 = alloc_prison_cell(cache);
261                 if (!p->cell1)
262                         return -ENOMEM;
263         }
264
265         if (!p->cell2) {
266                 p->cell2 = alloc_prison_cell(cache);
267                 if (!p->cell2)
268                         return -ENOMEM;
269         }
270
271         return 0;
272 }
273
274 static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
275 {
276         if (p->cell2)
277                 free_prison_cell(cache, p->cell2);
278
279         if (p->cell1)
280                 free_prison_cell(cache, p->cell1);
281
282         if (p->mg)
283                 mempool_free(p->mg, cache->migration_pool);
284 }
285
286 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
287 {
288         struct dm_cache_migration *mg = p->mg;
289
290         BUG_ON(!mg);
291         p->mg = NULL;
292
293         return mg;
294 }
295
296 /*
297  * You must have a cell within the prealloc struct to return.  If not, this
298  * function will BUG() rather than returning NULL.
299  */
300 static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
301 {
302         struct dm_bio_prison_cell *r = NULL;
303
304         if (p->cell1) {
305                 r = p->cell1;
306                 p->cell1 = NULL;
307
308         } else if (p->cell2) {
309                 r = p->cell2;
310                 p->cell2 = NULL;
311         } else
312                 BUG();
313
314         return r;
315 }
316
317 /*
318  * You can't have more than two cells in a prealloc struct.  BUG() will be
319  * called if you try to overfill.
320  */
321 static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
322 {
323         if (!p->cell2)
324                 p->cell2 = cell;
325
326         else if (!p->cell1)
327                 p->cell1 = cell;
328
329         else
330                 BUG();
331 }
332
333 /*----------------------------------------------------------------*/
334
335 static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
336 {
337         key->virtual = 0;
338         key->dev = 0;
339         key->block = from_oblock(oblock);
340 }
341
342 /*
343  * The caller hands in a preallocated cell, and a free function for it.
344  * The cell will be freed if there's an error, or if it wasn't used because
345  * a cell with that key already exists.
346  */
347 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
348
349 static int bio_detain(struct cache *cache, dm_oblock_t oblock,
350                       struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
351                       cell_free_fn free_fn, void *free_context,
352                       struct dm_bio_prison_cell **cell_result)
353 {
354         int r;
355         struct dm_cell_key key;
356
357         build_key(oblock, &key);
358         r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
359         if (r)
360                 free_fn(free_context, cell_prealloc);
361
362         return r;
363 }
364
365 static int get_cell(struct cache *cache,
366                     dm_oblock_t oblock,
367                     struct prealloc *structs,
368                     struct dm_bio_prison_cell **cell_result)
369 {
370         int r;
371         struct dm_cell_key key;
372         struct dm_bio_prison_cell *cell_prealloc;
373
374         cell_prealloc = prealloc_get_cell(structs);
375
376         build_key(oblock, &key);
377         r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
378         if (r)
379                 prealloc_put_cell(structs, cell_prealloc);
380
381         return r;
382 }
383
384 /*----------------------------------------------------------------*/
385
386 static bool is_dirty(struct cache *cache, dm_cblock_t b)
387 {
388         return test_bit(from_cblock(b), cache->dirty_bitset);
389 }
390
391 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
392 {
393         if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
394                 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1);
395                 policy_set_dirty(cache->policy, oblock);
396         }
397 }
398
399 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
400 {
401         if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
402                 policy_clear_dirty(cache->policy, oblock);
403                 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1);
404                 if (!from_cblock(cache->nr_dirty))
405                         dm_table_event(cache->ti->table);
406         }
407 }
408
409 /*----------------------------------------------------------------*/
410 static bool block_size_is_power_of_two(struct cache *cache)
411 {
412         return cache->sectors_per_block_shift >= 0;
413 }
414
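/*
 * E.g. with a 1024-sector discard block and 128 sectors per cache block
 * there are 8 cache blocks per discard block, so oblock 20 maps to
 * dblock 20 / 8 == 2.  (These sizes are only illustrative.)
 */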
415 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
416 {
417         sector_t discard_blocks = cache->discard_block_size;
418         dm_block_t b = from_oblock(oblock);
419
420         if (!block_size_is_power_of_two(cache))
421                 (void) sector_div(discard_blocks, cache->sectors_per_block);
422         else
423                 discard_blocks >>= cache->sectors_per_block_shift;
424
425         (void) sector_div(b, discard_blocks);
426
427         return to_dblock(b);
428 }
429
430 static void set_discard(struct cache *cache, dm_dblock_t b)
431 {
432         unsigned long flags;
433
434         atomic_inc(&cache->stats.discard_count);
435
436         spin_lock_irqsave(&cache->lock, flags);
437         set_bit(from_dblock(b), cache->discard_bitset);
438         spin_unlock_irqrestore(&cache->lock, flags);
439 }
440
441 static void clear_discard(struct cache *cache, dm_dblock_t b)
442 {
443         unsigned long flags;
444
445         spin_lock_irqsave(&cache->lock, flags);
446         clear_bit(from_dblock(b), cache->discard_bitset);
447         spin_unlock_irqrestore(&cache->lock, flags);
448 }
449
450 static bool is_discarded(struct cache *cache, dm_dblock_t b)
451 {
452         int r;
453         unsigned long flags;
454
455         spin_lock_irqsave(&cache->lock, flags);
456         r = test_bit(from_dblock(b), cache->discard_bitset);
457         spin_unlock_irqrestore(&cache->lock, flags);
458
459         return r;
460 }
461
462 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
463 {
464         int r;
465         unsigned long flags;
466
467         spin_lock_irqsave(&cache->lock, flags);
468         r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
469                      cache->discard_bitset);
470         spin_unlock_irqrestore(&cache->lock, flags);
471
472         return r;
473 }
474
475 /*----------------------------------------------------------------*/
476
477 static void load_stats(struct cache *cache)
478 {
479         struct dm_cache_statistics stats;
480
481         dm_cache_metadata_get_stats(cache->cmd, &stats);
482         atomic_set(&cache->stats.read_hit, stats.read_hits);
483         atomic_set(&cache->stats.read_miss, stats.read_misses);
484         atomic_set(&cache->stats.write_hit, stats.write_hits);
485         atomic_set(&cache->stats.write_miss, stats.write_misses);
486 }
487
488 static void save_stats(struct cache *cache)
489 {
490         struct dm_cache_statistics stats;
491
492         stats.read_hits = atomic_read(&cache->stats.read_hit);
493         stats.read_misses = atomic_read(&cache->stats.read_miss);
494         stats.write_hits = atomic_read(&cache->stats.write_hit);
495         stats.write_misses = atomic_read(&cache->stats.write_miss);
496
497         dm_cache_metadata_set_stats(cache->cmd, &stats);
498 }
499
500 /*----------------------------------------------------------------
501  * Per bio data
502  *--------------------------------------------------------------*/
503 static struct per_bio_data *get_per_bio_data(struct bio *bio)
504 {
505         struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
506         BUG_ON(!pb);
507         return pb;
508 }
509
510 static struct per_bio_data *init_per_bio_data(struct bio *bio)
511 {
512         struct per_bio_data *pb = get_per_bio_data(bio);
513
514         pb->tick = false;
515         pb->req_nr = dm_bio_get_target_bio_nr(bio);
516         pb->all_io_entry = NULL;
517
518         return pb;
519 }
520
521 /*----------------------------------------------------------------
522  * Remapping
523  *--------------------------------------------------------------*/
524 static void remap_to_origin(struct cache *cache, struct bio *bio)
525 {
526         bio->bi_bdev = cache->origin_dev->bdev;
527 }
528
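/*
 * Remap a bio to the cache device, keeping its offset within the block.
 * E.g. with 64 sectors per block (shift == 6), a bio at origin sector
 * 200 (offset 200 & 63 == 8) remapped to cblock 5 lands at cache sector
 * (5 << 6) | 8 == 328.
 */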
529 static void remap_to_cache(struct cache *cache, struct bio *bio,
530                            dm_cblock_t cblock)
531 {
532         sector_t bi_sector = bio->bi_sector;
533
534         bio->bi_bdev = cache->cache_dev->bdev;
535         if (!block_size_is_power_of_two(cache))
536                 bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
537                                 sector_div(bi_sector, cache->sectors_per_block);
538         else
539                 bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) |
540                                 (bi_sector & (cache->sectors_per_block - 1));
541 }
542
543 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
544 {
545         unsigned long flags;
546         struct per_bio_data *pb = get_per_bio_data(bio);
547
548         spin_lock_irqsave(&cache->lock, flags);
549         if (cache->need_tick_bio &&
550             !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
551                 pb->tick = true;
552                 cache->need_tick_bio = false;
553         }
554         spin_unlock_irqrestore(&cache->lock, flags);
555 }
556
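/*
 * A write remapped to the origin means the block holds live data again,
 * so its discard flag is dropped.
 */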
557 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
558                                   dm_oblock_t oblock)
559 {
560         check_if_tick_bio_needed(cache, bio);
561         remap_to_origin(cache, bio);
562         if (bio_data_dir(bio) == WRITE)
563                 clear_discard(cache, oblock_to_dblock(cache, oblock));
564 }
565
566 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
567                                  dm_oblock_t oblock, dm_cblock_t cblock)
568 {
569         remap_to_cache(cache, bio, cblock);
570         if (bio_data_dir(bio) == WRITE) {
571                 set_dirty(cache, oblock, cblock);
572                 clear_discard(cache, oblock_to_dblock(cache, oblock));
573         }
574 }
575
576 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
577 {
578         sector_t block_nr = bio->bi_sector;
579
580         if (!block_size_is_power_of_two(cache))
581                 (void) sector_div(block_nr, cache->sectors_per_block);
582         else
583                 block_nr >>= cache->sectors_per_block_shift;
584
585         return to_oblock(block_nr);
586 }
587
588 static int bio_triggers_commit(struct cache *cache, struct bio *bio)
589 {
590         return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
591 }
592
593 static void issue(struct cache *cache, struct bio *bio)
594 {
595         unsigned long flags;
596
597         if (!bio_triggers_commit(cache, bio)) {
598                 generic_make_request(bio);
599                 return;
600         }
601
602         /*
603          * Batch together any bios that trigger commits and then issue a
604          * single commit for them in do_worker().
605          */
606         spin_lock_irqsave(&cache->lock, flags);
607         cache->commit_requested = true;
608         bio_list_add(&cache->deferred_flush_bios, bio);
609         spin_unlock_irqrestore(&cache->lock, flags);
610 }
611
612 /*----------------------------------------------------------------
613  * Migration processing
614  *
615  * Migration covers moving data from the origin device to the cache, or
616  * vice versa.
617  *--------------------------------------------------------------*/
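/*
 * A migration passes through three lists, all drained by do_worker():
 *
 *   quiesced_migrations    -> issue_copy()
 *   completed_migrations   -> complete_migration()
 *   need_commit_migrations -> migration_success_post_commit()
 */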
618 static void free_migration(struct dm_cache_migration *mg)
619 {
620         mempool_free(mg, mg->cache->migration_pool);
621 }
622
623 static void inc_nr_migrations(struct cache *cache)
624 {
625         atomic_inc(&cache->nr_migrations);
626 }
627
628 static void dec_nr_migrations(struct cache *cache)
629 {
630         atomic_dec(&cache->nr_migrations);
631
632         /*
633          * Wake the worker in case we're suspending the target.
634          */
635         wake_up(&cache->migration_wait);
636 }
637
638 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
639                          bool holder)
640 {
641         (holder ? dm_cell_release : dm_cell_release_no_holder)
642                 (cache->prison, cell, &cache->deferred_bios);
643         free_prison_cell(cache, cell);
644 }
645
646 static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
647                        bool holder)
648 {
649         unsigned long flags;
650
651         spin_lock_irqsave(&cache->lock, flags);
652         __cell_defer(cache, cell, holder);
653         spin_unlock_irqrestore(&cache->lock, flags);
654
655         wake_worker(cache);
656 }
657
658 static void cleanup_migration(struct dm_cache_migration *mg)
659 {
660         dec_nr_migrations(mg->cache);
661         free_migration(mg);
662 }
663
664 static void migration_failure(struct dm_cache_migration *mg)
665 {
666         struct cache *cache = mg->cache;
667
668         if (mg->writeback) {
669                 DMWARN_LIMIT("writeback failed; couldn't copy block");
670                 set_dirty(cache, mg->old_oblock, mg->cblock);
671                 cell_defer(cache, mg->old_ocell, false);
672
673         } else if (mg->demote) {
674                 DMWARN_LIMIT("demotion failed; couldn't copy block");
675                 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
676
677                 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
678                 if (mg->promote)
679                         cell_defer(cache, mg->new_ocell, 1);
680         } else {
681                 DMWARN_LIMIT("promotion failed; couldn't copy block");
682                 policy_remove_mapping(cache->policy, mg->new_oblock);
683                 cell_defer(cache, mg->new_ocell, 1);
684         }
685
686         cleanup_migration(mg);
687 }
688
689 static void migration_success_pre_commit(struct dm_cache_migration *mg)
690 {
691         unsigned long flags;
692         struct cache *cache = mg->cache;
693
694         if (mg->writeback) {
695                 cell_defer(cache, mg->old_ocell, false);
696                 clear_dirty(cache, mg->old_oblock, mg->cblock);
697                 cleanup_migration(mg);
698                 return;
699
700         } else if (mg->demote) {
701                 if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
702                         DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
703                         policy_force_mapping(cache->policy, mg->new_oblock,
704                                              mg->old_oblock);
705                         if (mg->promote)
706                                 cell_defer(cache, mg->new_ocell, true);
707                         cleanup_migration(mg);
708                         return;
709                 }
710         } else {
711                 if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
712                         DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
713                         policy_remove_mapping(cache->policy, mg->new_oblock);
714                         cleanup_migration(mg);
715                         return;
716                 }
717         }
718
719         spin_lock_irqsave(&cache->lock, flags);
720         list_add_tail(&mg->list, &cache->need_commit_migrations);
721         cache->commit_requested = true;
722         spin_unlock_irqrestore(&cache->lock, flags);
723 }
724
725 static void migration_success_post_commit(struct dm_cache_migration *mg)
726 {
727         unsigned long flags;
728         struct cache *cache = mg->cache;
729
730         if (mg->writeback) {
731                 DMWARN("writeback unexpectedly triggered commit");
732                 return;
733
734         } else if (mg->demote) {
735                 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
736
737                 if (mg->promote) {
738                         mg->demote = false;
739
740                         spin_lock_irqsave(&cache->lock, flags);
741                         list_add_tail(&mg->list, &cache->quiesced_migrations);
742                         spin_unlock_irqrestore(&cache->lock, flags);
743
744                 } else
745                         cleanup_migration(mg);
746
747         } else {
748                 cell_defer(cache, mg->new_ocell, true);
749                 clear_dirty(cache, mg->new_oblock, mg->cblock);
750                 cleanup_migration(mg);
751         }
752 }
753
754 static void copy_complete(int read_err, unsigned long write_err, void *context)
755 {
756         unsigned long flags;
757         struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
758         struct cache *cache = mg->cache;
759
760         if (read_err || write_err)
761                 mg->err = true;
762
763         spin_lock_irqsave(&cache->lock, flags);
764         list_add_tail(&mg->list, &cache->completed_migrations);
765         spin_unlock_irqrestore(&cache->lock, flags);
766
767         wake_worker(cache);
768 }
769
770 static void issue_copy_real(struct dm_cache_migration *mg)
771 {
772         int r;
773         struct dm_io_region o_region, c_region;
774         struct cache *cache = mg->cache;
775
776         o_region.bdev = cache->origin_dev->bdev;
777         o_region.count = cache->sectors_per_block;
778
779         c_region.bdev = cache->cache_dev->bdev;
780         c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
781         c_region.count = cache->sectors_per_block;
782
783         if (mg->writeback || mg->demote) {
784                 /* demote */
785                 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
786                 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
787         } else {
788                 /* promote */
789                 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
790                 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
791         }
792
793         if (r < 0)
794                 migration_failure(mg);
795 }
796
797 static void avoid_copy(struct dm_cache_migration *mg)
798 {
799         atomic_inc(&mg->cache->stats.copies_avoided);
800         migration_success_pre_commit(mg);
801 }
802
803 static void issue_copy(struct dm_cache_migration *mg)
804 {
805         bool avoid;
806         struct cache *cache = mg->cache;
807
808         if (mg->writeback || mg->demote)
809                 avoid = !is_dirty(cache, mg->cblock) ||
810                         is_discarded_oblock(cache, mg->old_oblock);
811         else
812                 avoid = is_discarded_oblock(cache, mg->new_oblock);
813
814         avoid ? avoid_copy(mg) : issue_copy_real(mg);
815 }
816
817 static void complete_migration(struct dm_cache_migration *mg)
818 {
819         if (mg->err)
820                 migration_failure(mg);
821         else
822                 migration_success_pre_commit(mg);
823 }
824
825 static void process_migrations(struct cache *cache, struct list_head *head,
826                                void (*fn)(struct dm_cache_migration *))
827 {
828         unsigned long flags;
829         struct list_head list;
830         struct dm_cache_migration *mg, *tmp;
831
832         INIT_LIST_HEAD(&list);
833         spin_lock_irqsave(&cache->lock, flags);
834         list_splice_init(head, &list);
835         spin_unlock_irqrestore(&cache->lock, flags);
836
837         list_for_each_entry_safe(mg, tmp, &list, list)
838                 fn(mg);
839 }
840
841 static void __queue_quiesced_migration(struct dm_cache_migration *mg)
842 {
843         list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
844 }
845
846 static void queue_quiesced_migration(struct dm_cache_migration *mg)
847 {
848         unsigned long flags;
849         struct cache *cache = mg->cache;
850
851         spin_lock_irqsave(&cache->lock, flags);
852         __queue_quiesced_migration(mg);
853         spin_unlock_irqrestore(&cache->lock, flags);
854
855         wake_worker(cache);
856 }
857
858 static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
859 {
860         unsigned long flags;
861         struct dm_cache_migration *mg, *tmp;
862
863         spin_lock_irqsave(&cache->lock, flags);
864         list_for_each_entry_safe(mg, tmp, work, list)
865                 __queue_quiesced_migration(mg);
866         spin_unlock_irqrestore(&cache->lock, flags);
867
868         wake_worker(cache);
869 }
870
871 static void check_for_quiesced_migrations(struct cache *cache,
872                                           struct per_bio_data *pb)
873 {
874         struct list_head work;
875
876         if (!pb->all_io_entry)
877                 return;
878
879         INIT_LIST_HEAD(&work);
880         if (pb->all_io_entry)
881                 dm_deferred_entry_dec(pb->all_io_entry, &work);
882
883         if (!list_empty(&work))
884                 queue_quiesced_migrations(cache, &work);
885 }
886
887 static void quiesce_migration(struct dm_cache_migration *mg)
888 {
889         if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
890                 queue_quiesced_migration(mg);
891 }
892
893 static void promote(struct cache *cache, struct prealloc *structs,
894                     dm_oblock_t oblock, dm_cblock_t cblock,
895                     struct dm_bio_prison_cell *cell)
896 {
897         struct dm_cache_migration *mg = prealloc_get_migration(structs);
898
899         mg->err = false;
900         mg->writeback = false;
901         mg->demote = false;
902         mg->promote = true;
903         mg->cache = cache;
904         mg->new_oblock = oblock;
905         mg->cblock = cblock;
906         mg->old_ocell = NULL;
907         mg->new_ocell = cell;
908         mg->start_jiffies = jiffies;
909
910         inc_nr_migrations(cache);
911         quiesce_migration(mg);
912 }
913
914 static void writeback(struct cache *cache, struct prealloc *structs,
915                       dm_oblock_t oblock, dm_cblock_t cblock,
916                       struct dm_bio_prison_cell *cell)
917 {
918         struct dm_cache_migration *mg = prealloc_get_migration(structs);
919
920         mg->err = false;
921         mg->writeback = true;
922         mg->demote = false;
923         mg->promote = false;
924         mg->cache = cache;
925         mg->old_oblock = oblock;
926         mg->cblock = cblock;
927         mg->old_ocell = cell;
928         mg->new_ocell = NULL;
929         mg->start_jiffies = jiffies;
930
931         inc_nr_migrations(cache);
932         quiesce_migration(mg);
933 }
934
935 static void demote_then_promote(struct cache *cache, struct prealloc *structs,
936                                 dm_oblock_t old_oblock, dm_oblock_t new_oblock,
937                                 dm_cblock_t cblock,
938                                 struct dm_bio_prison_cell *old_ocell,
939                                 struct dm_bio_prison_cell *new_ocell)
940 {
941         struct dm_cache_migration *mg = prealloc_get_migration(structs);
942
943         mg->err = false;
944         mg->writeback = false;
945         mg->demote = true;
946         mg->promote = true;
947         mg->cache = cache;
948         mg->old_oblock = old_oblock;
949         mg->new_oblock = new_oblock;
950         mg->cblock = cblock;
951         mg->old_ocell = old_ocell;
952         mg->new_ocell = new_ocell;
953         mg->start_jiffies = jiffies;
954
955         inc_nr_migrations(cache);
956         quiesce_migration(mg);
957 }
958
959 /*----------------------------------------------------------------
960  * bio processing
961  *--------------------------------------------------------------*/
962 static void defer_bio(struct cache *cache, struct bio *bio)
963 {
964         unsigned long flags;
965
966         spin_lock_irqsave(&cache->lock, flags);
967         bio_list_add(&cache->deferred_bios, bio);
968         spin_unlock_irqrestore(&cache->lock, flags);
969
970         wake_worker(cache);
971 }
972
973 static void process_flush_bio(struct cache *cache, struct bio *bio)
974 {
975         struct per_bio_data *pb = get_per_bio_data(bio);
976
977         BUG_ON(bio->bi_size);
978         if (!pb->req_nr)
979                 remap_to_origin(cache, bio);
980         else
981                 remap_to_cache(cache, bio, 0);
982
983         issue(cache, bio);
984 }
985
986 /*
987  * People generally discard large parts of a device, e.g. the whole device
988  * when formatting.  Splitting these large discards up into cache block
989  * sized I/Os and then quiescing (always necessary for discard) takes too
990  * long.
991  *
992  * We keep it simple, and allow any size of discard to come in, and just
993  * mark off blocks on the discard bitset.  No passdown occurs!
994  *
995  * To implement passdown we need to change the bio_prison such that a cell
996  * can have a key that spans many blocks.
997  */
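/*
 * E.g. with a 1024-sector discard block, a discard of sectors 3000-8191
 * marks dblocks 3 to 7; sectors 3000-3071 fall in dblock 2, which is
 * only partially covered and so is left unmarked.
 */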
998 static void process_discard_bio(struct cache *cache, struct bio *bio)
999 {
1000         dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
1001                                                   cache->discard_block_size);
1002         dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
1003         dm_block_t b;
1004
1005         (void) sector_div(end_block, cache->discard_block_size);
1006
1007         for (b = start_block; b < end_block; b++)
1008                 set_discard(cache, to_dblock(b));
1009
1010         bio_endio(bio, 0);
1011 }
1012
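/*
 * E.g. if migration_threshold were 2048 sectors with 128 sectors per
 * block, a 16th concurrent migration would be refused because
 * (15 + 1) * 128 == 2048 is not below the threshold.
 */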
1013 static bool spare_migration_bandwidth(struct cache *cache)
1014 {
1015         sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
1016                 cache->sectors_per_block;
1017         return current_volume < cache->migration_threshold;
1018 }
1019
1020 static bool is_writethrough_io(struct cache *cache, struct bio *bio,
1021                                dm_cblock_t cblock)
1022 {
1023         return bio_data_dir(bio) == WRITE &&
1024                 cache->features.write_through && !is_dirty(cache, cblock);
1025 }
1026
1027 static void inc_hit_counter(struct cache *cache, struct bio *bio)
1028 {
1029         atomic_inc(bio_data_dir(bio) == READ ?
1030                    &cache->stats.read_hit : &cache->stats.write_hit);
1031 }
1032
1033 static void inc_miss_counter(struct cache *cache, struct bio *bio)
1034 {
1035         atomic_inc(bio_data_dir(bio) == READ ?
1036                    &cache->stats.read_miss : &cache->stats.write_miss);
1037 }
1038
1039 static void process_bio(struct cache *cache, struct prealloc *structs,
1040                         struct bio *bio)
1041 {
1042         int r;
1043         bool release_cell = true;
1044         dm_oblock_t block = get_bio_block(cache, bio);
1045         struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
1046         struct policy_result lookup_result;
1047         struct per_bio_data *pb = get_per_bio_data(bio);
1048         bool discarded_block = is_discarded_oblock(cache, block);
1049         bool can_migrate = discarded_block || spare_migration_bandwidth(cache);
1050
1051         /*
1052          * Check to see if that block is currently migrating.
1053          */
1054         cell_prealloc = prealloc_get_cell(structs);
1055         r = bio_detain(cache, block, bio, cell_prealloc,
1056                        (cell_free_fn) prealloc_put_cell,
1057                        structs, &new_ocell);
1058         if (r > 0)
1059                 return;
1060
1061         r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
1062                        bio, &lookup_result);
1063
1064         if (r == -EWOULDBLOCK)
1065                 /* migration has been denied */
1066                 lookup_result.op = POLICY_MISS;
1067
1068         switch (lookup_result.op) {
1069         case POLICY_HIT:
1070                 inc_hit_counter(cache, bio);
1071                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1072
1073                 if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
1074                         /*
1075                          * No need to mark anything dirty in write through mode.
1076                          */
1077                         pb->req_nr == 0 ?
1078                                 remap_to_cache(cache, bio, lookup_result.cblock) :
1079                                 remap_to_origin_clear_discard(cache, bio, block);
1080                 } else
1081                         remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
1082
1083                 issue(cache, bio);
1084                 break;
1085
1086         case POLICY_MISS:
1087                 inc_miss_counter(cache, bio);
1088                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1089
1090                 if (pb->req_nr != 0) {
1091                         /*
1092                          * This is a duplicate writethrough io that is no
1093                          * longer needed because the block has been demoted.
1094                          */
1095                         bio_endio(bio, 0);
1096                 } else {
1097                         remap_to_origin_clear_discard(cache, bio, block);
1098                         issue(cache, bio);
1099                 }
1100                 break;
1101
1102         case POLICY_NEW:
1103                 atomic_inc(&cache->stats.promotion);
1104                 promote(cache, structs, block, lookup_result.cblock, new_ocell);
1105                 release_cell = false;
1106                 break;
1107
1108         case POLICY_REPLACE:
1109                 cell_prealloc = prealloc_get_cell(structs);
1110                 r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
1111                                (cell_free_fn) prealloc_put_cell,
1112                                structs, &old_ocell);
1113                 if (r > 0) {
1114                         /*
1115                          * We have to be careful to avoid lock inversion of
1116                          * the cells.  So we back off, and wait for the
1117                          * old_ocell to become free.
1118                          */
1119                         policy_force_mapping(cache->policy, block,
1120                                              lookup_result.old_oblock);
1121                         atomic_inc(&cache->stats.cache_cell_clash);
1122                         break;
1123                 }
1124                 atomic_inc(&cache->stats.demotion);
1125                 atomic_inc(&cache->stats.promotion);
1126
1127                 demote_then_promote(cache, structs, lookup_result.old_oblock,
1128                                     block, lookup_result.cblock,
1129                                     old_ocell, new_ocell);
1130                 release_cell = false;
1131                 break;
1132
1133         default:
1134                 DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
1135                             (unsigned) lookup_result.op);
1136                 bio_io_error(bio);
1137         }
1138
1139         if (release_cell)
1140                 cell_defer(cache, new_ocell, false);
1141 }
1142
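/*
 * The first comparison catches jiffies wrapping back below
 * last_commit_jiffies.
 */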
1143 static int need_commit_due_to_time(struct cache *cache)
1144 {
1145         return jiffies < cache->last_commit_jiffies ||
1146                jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
1147 }
1148
1149 static int commit_if_needed(struct cache *cache)
1150 {
1151         if (dm_cache_changed_this_transaction(cache->cmd) &&
1152             (cache->commit_requested || need_commit_due_to_time(cache))) {
1153                 atomic_inc(&cache->stats.commit_count);
1154                 cache->last_commit_jiffies = jiffies;
1155                 cache->commit_requested = false;
1156                 return dm_cache_commit(cache->cmd, false);
1157         }
1158
1159         return 0;
1160 }
1161
1162 static void process_deferred_bios(struct cache *cache)
1163 {
1164         unsigned long flags;
1165         struct bio_list bios;
1166         struct bio *bio;
1167         struct prealloc structs;
1168
1169         memset(&structs, 0, sizeof(structs));
1170         bio_list_init(&bios);
1171
1172         spin_lock_irqsave(&cache->lock, flags);
1173         bio_list_merge(&bios, &cache->deferred_bios);
1174         bio_list_init(&cache->deferred_bios);
1175         spin_unlock_irqrestore(&cache->lock, flags);
1176
1177         while (!bio_list_empty(&bios)) {
1178                 /*
1179                  * If we've got no free migration structs, and processing
1180                  * this bio might require one, we pause until there are some
1181                  * prepared mappings to process.
1182                  */
1183                 if (prealloc_data_structs(cache, &structs)) {
1184                         spin_lock_irqsave(&cache->lock, flags);
1185                         bio_list_merge(&cache->deferred_bios, &bios);
1186                         spin_unlock_irqrestore(&cache->lock, flags);
1187                         break;
1188                 }
1189
1190                 bio = bio_list_pop(&bios);
1191
1192                 if (bio->bi_rw & REQ_FLUSH)
1193                         process_flush_bio(cache, bio);
1194                 else if (bio->bi_rw & REQ_DISCARD)
1195                         process_discard_bio(cache, bio);
1196                 else
1197                         process_bio(cache, &structs, bio);
1198         }
1199
1200         prealloc_free_structs(cache, &structs);
1201 }
1202
1203 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
1204 {
1205         unsigned long flags;
1206         struct bio_list bios;
1207         struct bio *bio;
1208
1209         bio_list_init(&bios);
1210
1211         spin_lock_irqsave(&cache->lock, flags);
1212         bio_list_merge(&bios, &cache->deferred_flush_bios);
1213         bio_list_init(&cache->deferred_flush_bios);
1214         spin_unlock_irqrestore(&cache->lock, flags);
1215
1216         while ((bio = bio_list_pop(&bios)))
1217                 submit_bios ? generic_make_request(bio) : bio_io_error(bio);
1218 }
1219
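/*
 * Write dirty cache blocks back to the origin while there is spare
 * migration bandwidth.  If a block's cell can't be obtained it is
 * re-marked dirty in the policy so it will be offered again later.
 */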
1220 static void writeback_some_dirty_blocks(struct cache *cache)
1221 {
1222         int r = 0;
1223         dm_oblock_t oblock;
1224         dm_cblock_t cblock;
1225         struct prealloc structs;
1226         struct dm_bio_prison_cell *old_ocell;
1227
1228         memset(&structs, 0, sizeof(structs));
1229
1230         while (spare_migration_bandwidth(cache)) {
1231                 if (prealloc_data_structs(cache, &structs))
1232                         break;
1233
1234                 r = policy_writeback_work(cache->policy, &oblock, &cblock);
1235                 if (r)
1236                         break;
1237
1238                 r = get_cell(cache, oblock, &structs, &old_ocell);
1239                 if (r) {
1240                         policy_set_dirty(cache->policy, oblock);
1241                         break;
1242                 }
1243
1244                 writeback(cache, &structs, oblock, cblock, old_ocell);
1245         }
1246
1247         prealloc_free_structs(cache, &structs);
1248 }
1249
1250 /*----------------------------------------------------------------
1251  * Main worker loop
1252  *--------------------------------------------------------------*/
1253 static void start_quiescing(struct cache *cache)
1254 {
1255         unsigned long flags;
1256
1257         spin_lock_irqsave(&cache->lock, flags);
1258         cache->quiescing = 1;
1259         spin_unlock_irqrestore(&cache->lock, flags);
1260 }
1261
1262 static void stop_quiescing(struct cache *cache)
1263 {
1264         unsigned long flags;
1265
1266         spin_lock_irqsave(&cache->lock, flags);
1267         cache->quiescing = 0;
1268         spin_unlock_irqrestore(&cache->lock, flags);
1269 }
1270
1271 static bool is_quiescing(struct cache *cache)
1272 {
1273         int r;
1274         unsigned long flags;
1275
1276         spin_lock_irqsave(&cache->lock, flags);
1277         r = cache->quiescing;
1278         spin_unlock_irqrestore(&cache->lock, flags);
1279
1280         return r;
1281 }
1282
1283 static void wait_for_migrations(struct cache *cache)
1284 {
1285         wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
1286 }
1287
1288 static void stop_worker(struct cache *cache)
1289 {
1290         cancel_delayed_work(&cache->waker);
1291         flush_workqueue(cache->wq);
1292 }
1293
1294 static void requeue_deferred_io(struct cache *cache)
1295 {
1296         struct bio *bio;
1297         struct bio_list bios;
1298
1299         bio_list_init(&bios);
1300         bio_list_merge(&bios, &cache->deferred_bios);
1301         bio_list_init(&cache->deferred_bios);
1302
1303         while ((bio = bio_list_pop(&bios)))
1304                 bio_endio(bio, DM_ENDIO_REQUEUE);
1305 }
1306
1307 static int more_work(struct cache *cache)
1308 {
1309         if (is_quiescing(cache))
1310                 return !list_empty(&cache->quiesced_migrations) ||
1311                         !list_empty(&cache->completed_migrations) ||
1312                         !list_empty(&cache->need_commit_migrations);
1313         else
1314                 return !bio_list_empty(&cache->deferred_bios) ||
1315                         !bio_list_empty(&cache->deferred_flush_bios) ||
1316                         !list_empty(&cache->quiesced_migrations) ||
1317                         !list_empty(&cache->completed_migrations) ||
1318                         !list_empty(&cache->need_commit_migrations);
1319 }
1320
1321 static void do_worker(struct work_struct *ws)
1322 {
1323         struct cache *cache = container_of(ws, struct cache, worker);
1324
1325         do {
1326                 if (!is_quiescing(cache))
1327                         process_deferred_bios(cache);
1328
1329                 process_migrations(cache, &cache->quiesced_migrations, issue_copy);
1330                 process_migrations(cache, &cache->completed_migrations, complete_migration);
1331
1332                 writeback_some_dirty_blocks(cache);
1333
1334                 if (commit_if_needed(cache)) {
1335                         process_deferred_flush_bios(cache, false);
1336
1337                         /*
1338                          * FIXME: rollback metadata or just go into a
1339                          * failure mode and error everything
1340                          */
1341                 } else {
1342                         process_deferred_flush_bios(cache, true);
1343                         process_migrations(cache, &cache->need_commit_migrations,
1344                                            migration_success_post_commit);
1345                 }
1346         } while (more_work(cache));
1347 }
1348
1349 /*
1350  * We want to commit periodically so that not too much
1351  * unwritten metadata builds up.
1352  */
1353 static void do_waker(struct work_struct *ws)
1354 {
1355         struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1356         wake_worker(cache);
1357         queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1358 }
1359
1360 /*----------------------------------------------------------------*/
1361
1362 static int is_congested(struct dm_dev *dev, int bdi_bits)
1363 {
1364         struct request_queue *q = bdev_get_queue(dev->bdev);
1365         return bdi_congested(&q->backing_dev_info, bdi_bits);
1366 }
1367
1368 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1369 {
1370         struct cache *cache = container_of(cb, struct cache, callbacks);
1371
1372         return is_congested(cache->origin_dev, bdi_bits) ||
1373                 is_congested(cache->cache_dev, bdi_bits);
1374 }
1375
1376 /*----------------------------------------------------------------
1377  * Target methods
1378  *--------------------------------------------------------------*/
1379
1380 /*
1381  * This function gets called on the error paths of the constructor, so we
1382  * have to cope with a partially initialised struct.
1383  */
1384 static void destroy(struct cache *cache)
1385 {
1386         unsigned i;
1387
1388         if (cache->next_migration)
1389                 mempool_free(cache->next_migration, cache->migration_pool);
1390
1391         if (cache->migration_pool)
1392                 mempool_destroy(cache->migration_pool);
1393
1394         if (cache->all_io_ds)
1395                 dm_deferred_set_destroy(cache->all_io_ds);
1396
1397         if (cache->prison)
1398                 dm_bio_prison_destroy(cache->prison);
1399
1400         if (cache->wq)
1401                 destroy_workqueue(cache->wq);
1402
1403         if (cache->dirty_bitset)
1404                 free_bitset(cache->dirty_bitset);
1405
1406         if (cache->discard_bitset)
1407                 free_bitset(cache->discard_bitset);
1408
1409         if (cache->copier)
1410                 dm_kcopyd_client_destroy(cache->copier);
1411
1412         if (cache->cmd)
1413                 dm_cache_metadata_close(cache->cmd);
1414
1415         if (cache->metadata_dev)
1416                 dm_put_device(cache->ti, cache->metadata_dev);
1417
1418         if (cache->origin_dev)
1419                 dm_put_device(cache->ti, cache->origin_dev);
1420
1421         if (cache->cache_dev)
1422                 dm_put_device(cache->ti, cache->cache_dev);
1423
1424         if (cache->policy)
1425                 dm_cache_policy_destroy(cache->policy);
1426
1427         for (i = 0; i < cache->nr_ctr_args; i++)
1428                 kfree(cache->ctr_args[i]);
1429         kfree(cache->ctr_args);
1430
1431         kfree(cache);
1432 }
1433
1434 static void cache_dtr(struct dm_target *ti)
1435 {
1436         struct cache *cache = ti->private;
1437
1438         destroy(cache);
1439 }
1440
1441 static sector_t get_dev_size(struct dm_dev *dev)
1442 {
1443         return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1444 }
1445
1446 /*----------------------------------------------------------------*/
1447
1448 /*
1449  * Construct a cache device mapping.
1450  *
1451  * cache <metadata dev> <cache dev> <origin dev> <block size>
1452  *       <#feature args> [<feature arg>]*
1453  *       <policy> <#policy args> [<policy arg>]*
1454  *
1455  * metadata dev    : fast device holding the persistent metadata
1456  * cache dev       : fast device holding cached data blocks
1457  * origin dev      : slow device holding original data blocks
1458  * block size      : cache unit size in sectors
1459  *
1460  * #feature args   : number of feature arguments passed
1461  * feature args    : writethrough.  (The default is writeback.)
1462  *
1463  * policy          : the replacement policy to use
1464  * #policy args    : an even number of policy arguments corresponding
1465  *                   to key/value pairs passed to the policy
1466  * policy args     : key/value pairs passed to the policy
1467  *                   E.g. 'sequential_threshold 1024'
1468  *                   See cache-policies.txt for details.
1469  *
1470  * Optional feature arguments are:
1471  *   writethrough  : write through caching that prohibits cache block
1472  *                   content from being different from origin block content.
1473  *                   Without this argument, the default behaviour is to write
1474  *                   back cache block contents later for performance reasons,
1475  *                   so they may differ from the corresponding origin blocks.
1476  */
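/*
 * An illustrative table line (device names are made up; 'default'
 * selects the default replacement policy):
 *
 *   cache /dev/mapper/fast-meta /dev/mapper/fast-data /dev/mapper/slow \
 *         512 1 writethrough default 0
 */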
1477 struct cache_args {
1478         struct dm_target *ti;
1479
1480         struct dm_dev *metadata_dev;
1481
1482         struct dm_dev *cache_dev;
1483         sector_t cache_sectors;
1484
1485         struct dm_dev *origin_dev;
1486         sector_t origin_sectors;
1487
1488         uint32_t block_size;
1489
1490         const char *policy_name;
1491         int policy_argc;
1492         const char **policy_argv;
1493
1494         struct cache_features features;
1495 };
1496
1497 static void destroy_cache_args(struct cache_args *ca)
1498 {
1499         if (ca->metadata_dev)
1500                 dm_put_device(ca->ti, ca->metadata_dev);
1501
1502         if (ca->cache_dev)
1503                 dm_put_device(ca->ti, ca->cache_dev);
1504
1505         if (ca->origin_dev)
1506                 dm_put_device(ca->ti, ca->origin_dev);
1507
1508         kfree(ca);
1509 }
1510
1511 static bool at_least_one_arg(struct dm_arg_set *as, char **error)
1512 {
1513         if (!as->argc) {
1514                 *error = "Insufficient args";
1515                 return false;
1516         }
1517
1518         return true;
1519 }
1520
1521 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
1522                               char **error)
1523 {
1524         int r;
1525         sector_t metadata_dev_size;
1526         char b[BDEVNAME_SIZE];
1527
1528         if (!at_least_one_arg(as, error))
1529                 return -EINVAL;
1530
1531         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1532                           &ca->metadata_dev);
1533         if (r) {
1534                 *error = "Error opening metadata device";
1535                 return r;
1536         }
1537
1538         metadata_dev_size = get_dev_size(ca->metadata_dev);
1539         if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
1540                 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1541                        bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS_WARNING);
1542
1543         return 0;
1544 }
1545
1546 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
1547                            char **error)
1548 {
1549         int r;
1550
1551         if (!at_least_one_arg(as, error))
1552                 return -EINVAL;
1553
1554         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1555                           &ca->cache_dev);
1556         if (r) {
1557                 *error = "Error opening cache device";
1558                 return r;
1559         }
1560         ca->cache_sectors = get_dev_size(ca->cache_dev);
1561
1562         return 0;
1563 }
1564
1565 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
1566                             char **error)
1567 {
1568         int r;
1569
1570         if (!at_least_one_arg(as, error))
1571                 return -EINVAL;
1572
1573         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1574                           &ca->origin_dev);
1575         if (r) {
1576                 *error = "Error opening origin device";
1577                 return r;
1578         }
1579
1580         ca->origin_sectors = get_dev_size(ca->origin_dev);
1581         if (ca->ti->len > ca->origin_sectors) {
1582                 *error = "Device size larger than cached device";
1583                 return -EINVAL;
1584         }
1585
1586         return 0;
1587 }
1588
1589 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
1590                             char **error)
1591 {
1592         unsigned long tmp;
1593
1594         if (!at_least_one_arg(as, error))
1595                 return -EINVAL;
1596
1597         if (kstrtoul(dm_shift_arg(as), 10, &tmp) || !tmp ||
1598             tmp < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1599             tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1600                 *error = "Invalid data block size";
1601                 return -EINVAL;
1602         }
1603
1604         if (tmp > ca->cache_sectors) {
1605                 *error = "Data block size is larger than the cache device";
1606                 return -EINVAL;
1607         }
1608
1609         ca->block_size = tmp;
1610
1611         return 0;
1612 }
1613
1614 static void init_features(struct cache_features *cf)
1615 {
1616         cf->mode = CM_WRITE;
1617         cf->write_through = false;
1618 }
1619
1620 static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
1621                           char **error)
1622 {
1623         static struct dm_arg _args[] = {
1624                 {0, 1, "Invalid number of cache feature arguments"},
1625         };
1626
1627         int r;
1628         unsigned argc;
1629         const char *arg;
1630         struct cache_features *cf = &ca->features;
1631
1632         init_features(cf);
1633
1634         r = dm_read_arg_group(_args, as, &argc, error);
1635         if (r)
1636                 return -EINVAL;
1637
1638         while (argc--) {
1639                 arg = dm_shift_arg(as);
1640
1641                 if (!strcasecmp(arg, "writeback"))
1642                         cf->write_through = false;
1643
1644                 else if (!strcasecmp(arg, "writethrough"))
1645                         cf->write_through = true;
1646
1647                 else {
1648                         *error = "Unrecognised cache feature requested";
1649                         return -EINVAL;
1650                 }
1651         }
1652
1653         return 0;
1654 }
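     /*
      * For illustration: feature arguments of "1 writethrough" select
      * write-through caching, while "1 writeback" or "0" (no feature
      * arguments) keep the write-back default set up by init_features().
      */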
1655
1656 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
1657                         char **error)
1658 {
1659         static struct dm_arg _args[] = {
1660                 {0, 1024, "Invalid number of policy arguments"},
1661         };
1662
1663         int r;
1664
1665         if (!at_least_one_arg(as, error))
1666                 return -EINVAL;
1667
1668         ca->policy_name = dm_shift_arg(as);
1669
1670         r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
1671         if (r)
1672                 return -EINVAL;
1673
1674         ca->policy_argv = (const char **)as->argv;
1675         dm_consume_args(as, ca->policy_argc);
1676
1677         return 0;
1678 }
1679
1680 static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
1681                             char **error)
1682 {
1683         int r;
1684         struct dm_arg_set as;
1685
1686         as.argc = argc;
1687         as.argv = argv;
1688
1689         r = parse_metadata_dev(ca, &as, error);
1690         if (r)
1691                 return r;
1692
1693         r = parse_cache_dev(ca, &as, error);
1694         if (r)
1695                 return r;
1696
1697         r = parse_origin_dev(ca, &as, error);
1698         if (r)
1699                 return r;
1700
1701         r = parse_block_size(ca, &as, error);
1702         if (r)
1703                 return r;
1704
1705         r = parse_features(ca, &as, error);
1706         if (r)
1707                 return r;
1708
1709         r = parse_policy(ca, &as, error);
1710         if (r)
1711                 return r;
1712
1713         return 0;
1714 }
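     /*
      * For illustration only: parse_cache_args() consumes the constructor
      * arguments in the order
      *
      *   <metadata dev> <cache dev> <origin dev> <block size>
      *   <#feature args> [<feature arg>]* <policy> <#policy args> [<key> <value>]*
      *
      * so a hypothetical table line (device names, sizes and policy name are
      * made up) might look like:
      *
      *   0 1048576000 cache /dev/sdb1 /dev/sdb2 /dev/sdc 512 1 writeback default 0
      */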
1715
1716 /*----------------------------------------------------------------*/
1717
1718 static struct kmem_cache *migration_cache;
1719
1720 static int set_config_values(struct dm_cache_policy *p, int argc, const char **argv)
1721 {
1722         int r = 0;
1723
1724         if (argc & 1) {
1725                 DMWARN("Odd number of policy arguments given; they should be <key> <value> pairs.");
1726                 return -EINVAL;
1727         }
1728
1729         while (argc) {
1730                 r = policy_set_config_value(p, argv[0], argv[1]);
1731                 if (r) {
1732                         DMWARN("policy_set_config_value failed: key = '%s', value = '%s'",
1733                                argv[0], argv[1]);
1734                         return r;
1735                 }
1736
1737                 argc -= 2;
1738                 argv += 2;
1739         }
1740
1741         return r;
1742 }
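     /*
      * For illustration: the <key> <value> pairs handed to
      * policy_set_config_value() are interpreted by the selected policy, so
      * the valid keys depend on the policy module; something like
      * "sequential_threshold 1024" is only an assumed example.
      */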
1743
1744 static int create_cache_policy(struct cache *cache, struct cache_args *ca,
1745                                char **error)
1746 {
1747         int r;
1748
1749         cache->policy = dm_cache_policy_create(ca->policy_name,
1750                                                cache->cache_size,
1751                                                cache->origin_sectors,
1752                                                cache->sectors_per_block);
1753         if (!cache->policy) {
1754                 *error = "Error creating cache's policy";
1755                 return -ENOMEM;
1756         }
1757
1758         r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv);
1759         if (r)
1760                 dm_cache_policy_destroy(cache->policy);
1761
1762         return r;
1763 }
1764
1765 /*
1766  * We want the discard block size to be a power of two, at least as large
1767  * as the cache block size, and to give no more than 2^14 discard blocks
1768  * across the origin.
1769  */
1770 #define MAX_DISCARD_BLOCKS (1 << 14)
1771
1772 static bool too_many_discard_blocks(sector_t discard_block_size,
1773                                     sector_t origin_size)
1774 {
1775         (void) sector_div(origin_size, discard_block_size);
1776
1777         return origin_size > MAX_DISCARD_BLOCKS;
1778 }
1779
1780 static sector_t calculate_discard_block_size(sector_t cache_block_size,
1781                                              sector_t origin_size)
1782 {
1783         sector_t discard_block_size;
1784
1785         discard_block_size = roundup_pow_of_two(cache_block_size);
1786
1787         if (origin_size)
1788                 while (too_many_discard_blocks(discard_block_size, origin_size))
1789                         discard_block_size *= 2;
1790
1791         return discard_block_size;
1792 }
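     /*
      * Worked example (illustrative values): with a 512 sector cache block
      * and a 2^30 sector (512 GiB) origin, we start at 512 sectors; 2^30 /
      * 512 = 2^21 discard blocks is too many, so the size is doubled until
      * 2^30 / 2^16 = 2^14 blocks, i.e. a discard block size of 65536 sectors
      * (32 MiB).
      */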
1793
1794 #define DEFAULT_MIGRATION_THRESHOLD (2048 * 100)
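     /*
      * Assuming the migration threshold is counted in 512-byte sectors, as
      * other sizes in this target are, the default of 2048 * 100 sectors
      * corresponds to roughly 100 MiB of migration I/O.
      */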
1795
1796 static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio);
1797
1798 static int cache_create(struct cache_args *ca, struct cache **result)
1799 {
1800         int r = 0;
1801         char **error = &ca->ti->error;
1802         struct cache *cache;
1803         struct dm_target *ti = ca->ti;
1804         dm_block_t origin_blocks;
1805         struct dm_cache_metadata *cmd;
1806         bool may_format = ca->features.mode == CM_WRITE;
1807
1808         cache = kzalloc(sizeof(*cache), GFP_KERNEL);
1809         if (!cache)
1810                 return -ENOMEM;
1811
1812         cache->ti = ca->ti;
1813         ti->private = cache;
1814         ti->per_bio_data_size = sizeof(struct per_bio_data);
1815         ti->num_flush_bios = 2;
1816         ti->flush_supported = true;
1817
1818         ti->num_discard_bios = 1;
1819         ti->discards_supported = true;
1820         ti->discard_zeroes_data_unsupported = true;
1821
1822         memcpy(&cache->features, &ca->features, sizeof(cache->features));
1823
1824         if (cache->features.write_through)
1825                 ti->num_write_bios = cache_num_write_bios;
1826
1827         cache->callbacks.congested_fn = cache_is_congested;
1828         dm_table_add_target_callbacks(ti->table, &cache->callbacks);
1829
1830         cache->metadata_dev = ca->metadata_dev;
1831         cache->origin_dev = ca->origin_dev;
1832         cache->cache_dev = ca->cache_dev;
1833
1834         ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
1835
1836         /* FIXME: factor out this whole section */
1837         origin_blocks = cache->origin_sectors = ca->origin_sectors;
1838         (void) sector_div(origin_blocks, ca->block_size);
1839         cache->origin_blocks = to_oblock(origin_blocks);
1840
1841         cache->sectors_per_block = ca->block_size;
1842         if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
1843                 r = -EINVAL;
1844                 goto bad;
1845         }
1846
1847         if (ca->block_size & (ca->block_size - 1)) {
1848                 dm_block_t cache_size = ca->cache_sectors;
1849
1850                 cache->sectors_per_block_shift = -1;
1851                 (void) sector_div(cache_size, ca->block_size);
1852                 cache->cache_size = to_cblock(cache_size);
1853         } else {
1854                 cache->sectors_per_block_shift = __ffs(ca->block_size);
1855                 cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
1856         }
1857
1858         r = create_cache_policy(cache, ca, error);
1859         if (r)
1860                 goto bad;
1861         cache->policy_nr_args = ca->policy_argc;
1862
1863         cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
1864                                      ca->block_size, may_format,
1865                                      dm_cache_policy_get_hint_size(cache->policy));
1866         if (IS_ERR(cmd)) {
1867                 *error = "Error creating metadata object";
1868                 r = PTR_ERR(cmd);
1869                 goto bad;
1870         }
1871         cache->cmd = cmd;
1872
1873         spin_lock_init(&cache->lock);
1874         bio_list_init(&cache->deferred_bios);
1875         bio_list_init(&cache->deferred_flush_bios);
1876         INIT_LIST_HEAD(&cache->quiesced_migrations);
1877         INIT_LIST_HEAD(&cache->completed_migrations);
1878         INIT_LIST_HEAD(&cache->need_commit_migrations);
1879         cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
1880         atomic_set(&cache->nr_migrations, 0);
1881         init_waitqueue_head(&cache->migration_wait);
1882
1883         cache->nr_dirty = 0;
1884         cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
1885         if (!cache->dirty_bitset) {
1886                 *error = "could not allocate dirty bitset";
                     r = -ENOMEM;
1887                 goto bad;
1888         }
1889         clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
1890
1891         cache->discard_block_size =
1892                 calculate_discard_block_size(cache->sectors_per_block,
1893                                              cache->origin_sectors);
1894         cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
1895         cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
1896         if (!cache->discard_bitset) {
1897                 *error = "could not allocate discard bitset";
                     r = -ENOMEM;
1898                 goto bad;
1899         }
1900         clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
1901
1902         cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
1903         if (IS_ERR(cache->copier)) {
1904                 *error = "could not create kcopyd client";
1905                 r = PTR_ERR(cache->copier);
1906                 goto bad;
1907         }
1908
1909         cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1910         if (!cache->wq) {
1911                 *error = "could not create workqueue for metadata object";
                     r = -ENOMEM;
1912                 goto bad;
1913         }
1914         INIT_WORK(&cache->worker, do_worker);
1915         INIT_DELAYED_WORK(&cache->waker, do_waker);
1916         cache->last_commit_jiffies = jiffies;
1917
1918         cache->prison = dm_bio_prison_create(PRISON_CELLS);
1919         if (!cache->prison) {
1920                 *error = "could not create bio prison";
                     r = -ENOMEM;
1921                 goto bad;
1922         }
1923
1924         cache->all_io_ds = dm_deferred_set_create();
1925         if (!cache->all_io_ds) {
1926                 *error = "could not create all_io deferred set";
                     r = -ENOMEM;
1927                 goto bad;
1928         }
1929
1930         cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
1931                                                          migration_cache);
1932         if (!cache->migration_pool) {
1933                 *error = "Error creating cache's migration mempool";
                     r = -ENOMEM;
1934                 goto bad;
1935         }
1936
1937         cache->next_migration = NULL;
1938
1939         cache->need_tick_bio = true;
1940         cache->sized = false;
1941         cache->quiescing = false;
1942         cache->commit_requested = false;
1943         cache->loaded_mappings = false;
1944         cache->loaded_discards = false;
1945
1946         load_stats(cache);
1947
1948         atomic_set(&cache->stats.demotion, 0);
1949         atomic_set(&cache->stats.promotion, 0);
1950         atomic_set(&cache->stats.copies_avoided, 0);
1951         atomic_set(&cache->stats.cache_cell_clash, 0);
1952         atomic_set(&cache->stats.commit_count, 0);
1953         atomic_set(&cache->stats.discard_count, 0);
1954
1955         *result = cache;
1956         return 0;
1957
1958 bad:
1959         destroy(cache);
1960         return r;
1961 }
1962
1963 static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
1964 {
1965         unsigned i;
1966         const char **copy;
1967
1968         copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
1969         if (!copy)
1970                 return -ENOMEM;
1971         for (i = 0; i < argc; i++) {
1972                 copy[i] = kstrdup(argv[i], GFP_KERNEL);
1973                 if (!copy[i]) {
1974                         while (i--)
1975                                 kfree(copy[i]);
1976                         kfree(copy);
1977                         return -ENOMEM;
1978                 }
1979         }
1980
1981         cache->nr_ctr_args = argc;
1982         cache->ctr_args = copy;
1983
1984         return 0;
1985 }
1986
1987 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
1988 {
1989         int r = -EINVAL;
1990         struct cache_args *ca;
1991         struct cache *cache = NULL;
1992
1993         ca = kzalloc(sizeof(*ca), GFP_KERNEL);
1994         if (!ca) {
1995                 ti->error = "Error allocating memory for cache";
1996                 return -ENOMEM;
1997         }
1998         ca->ti = ti;
1999
2000         r = parse_cache_args(ca, argc, argv, &ti->error);
2001         if (r)
2002                 goto out;
2003
2004         r = cache_create(ca, &cache);
             if (r)
                     goto out;
2005
2006         r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2007         if (r) {
2008                 destroy(cache);
2009                 goto out;
2010         }
2011
2012         ti->private = cache;
2013
2014 out:
2015         destroy_cache_args(ca);
2016         return r;
2017 }
2018
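     /*
      * Note (inferred from the logic below, not part of the original
      * comments): this hook is only installed for write-through caches (see
      * cache_create()).  A write to a clean, resident block must hit both the
      * cache and the origin device, so two bios are requested; a block that
      * is already dirty is written to the cache only, so one suffices.  If
      * the policy lookup fails we assume the worst and also ask for two.
      */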
2019 static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio)
2020 {
2021         int r;
2022         struct cache *cache = ti->private;
2023         dm_oblock_t block = get_bio_block(cache, bio);
2024         dm_cblock_t cblock;
2025
2026         r = policy_lookup(cache->policy, block, &cblock);
2027         if (r < 0)
2028                 return 2;       /* assume the worst */
2029
2030         return (!r && !is_dirty(cache, cblock)) ? 2 : 1;
2031 }
2032
2033 static int cache_map(struct dm_target *ti, struct bio *bio)
2034 {
2035         struct cache *cache = ti->private;
2036
2037         int r;
2038         dm_oblock_t block = get_bio_block(cache, bio);
2039         bool can_migrate = false;
2040         bool discarded_block;
2041         struct dm_bio_prison_cell *cell;
2042         struct policy_result lookup_result;
2043         struct per_bio_data *pb;
2044
2045         if (from_oblock(block) >= from_oblock(cache->origin_blocks)) {
2046                 /*
2047                  * This can only occur if the io goes to a partial block at
2048                  * the end of the origin device.  We don't cache these.
2049                  * Just remap to the origin and carry on.
2050                  */
2051                 remap_to_origin_clear_discard(cache, bio, block);
2052                 return DM_MAPIO_REMAPPED;
2053         }
2054
2055         pb = init_per_bio_data(bio);
2056
2057         if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
2058                 defer_bio(cache, bio);
2059                 return DM_MAPIO_SUBMITTED;
2060         }
2061
2062         /*
2063          * Check to see if that block is currently migrating.
2064          */
2065         cell = alloc_prison_cell(cache);
2066         if (!cell) {
2067                 defer_bio(cache, bio);
2068                 return DM_MAPIO_SUBMITTED;
2069         }
2070
2071         r = bio_detain(cache, block, bio, cell,
2072                        (cell_free_fn) free_prison_cell,
2073                        cache, &cell);
2074         if (r) {
2075                 if (r < 0)
2076                         defer_bio(cache, bio);
2077
2078                 return DM_MAPIO_SUBMITTED;
2079         }
2080
2081         discarded_block = is_discarded_oblock(cache, block);
2082
2083         r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
2084                        bio, &lookup_result);
2085         if (r == -EWOULDBLOCK) {
2086                 cell_defer(cache, cell, true);
2087                 return DM_MAPIO_SUBMITTED;
2088
2089         } else if (r) {
2090                 DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
2091                 bio_io_error(bio);
2092                 return DM_MAPIO_SUBMITTED;
2093         }
2094
2095         switch (lookup_result.op) {
2096         case POLICY_HIT:
2097                 inc_hit_counter(cache, bio);
2098                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2099
2100                 if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
2101                         /*
2102                          * No need to mark anything dirty in write through mode.
2103                          */
2104                         if (pb->req_nr == 0)
2105                                 remap_to_cache(cache, bio, lookup_result.cblock);
2106                         else
                                     remap_to_origin_clear_discard(cache, bio, block);
2107                         cell_defer(cache, cell, false);
2108                 } else {
2109                         remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2110                         cell_defer(cache, cell, false);
2111                 }
2112                 break;
2113
2114         case POLICY_MISS:
2115                 inc_miss_counter(cache, bio);
2116                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2117
2118                 if (pb->req_nr != 0) {
2119                         /*
2120                          * This is a duplicate writethrough io that is no
2121                          * longer needed because the block has been demoted.
2122                          */
2123                         bio_endio(bio, 0);
2124                         cell_defer(cache, cell, false);
2125                         return DM_MAPIO_SUBMITTED;
2126                 } else {
2127                         remap_to_origin_clear_discard(cache, bio, block);
2128                         cell_defer(cache, cell, false);
2129                 }
2130                 break;
2131
2132         default:
2133                 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
2134                             (unsigned) lookup_result.op);
2135                 bio_io_error(bio);
2136                 return DM_MAPIO_SUBMITTED;
2137         }
2138
2139         return DM_MAPIO_REMAPPED;
2140 }
2141
2142 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
2143 {
2144         struct cache *cache = ti->private;
2145         unsigned long flags;
2146         struct per_bio_data *pb = get_per_bio_data(bio);
2147
2148         if (pb->tick) {
2149                 policy_tick(cache->policy);
2150
2151                 spin_lock_irqsave(&cache->lock, flags);
2152                 cache->need_tick_bio = true;
2153                 spin_unlock_irqrestore(&cache->lock, flags);
2154         }
2155
2156         check_for_quiesced_migrations(cache, pb);
2157
2158         return 0;
2159 }
2160
2161 static int write_dirty_bitset(struct cache *cache)
2162 {
2163         unsigned i, r;
2164
2165         for (i = 0; i < from_cblock(cache->cache_size); i++) {
2166                 r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
2167                                        is_dirty(cache, to_cblock(i)));
2168                 if (r)
2169                         return r;
2170         }
2171
2172         return 0;
2173 }
2174
2175 static int write_discard_bitset(struct cache *cache)
2176 {
2177         unsigned i, r;
2178
2179         r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2180                                            cache->discard_nr_blocks);
2181         if (r) {
2182                 DMERR("could not resize on-disk discard bitset");
2183                 return r;
2184         }
2185
2186         for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2187                 r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2188                                          is_discarded(cache, to_dblock(i)));
2189                 if (r)
2190                         return r;
2191         }
2192
2193         return 0;
2194 }
2195
2196 static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
2197                      uint32_t hint)
2198 {
2199         struct cache *cache = context;
2200         return dm_cache_save_hint(cache->cmd, cblock, hint);
2201 }
2202
2203 static int write_hints(struct cache *cache)
2204 {
2205         int r;
2206
2207         r = dm_cache_begin_hints(cache->cmd, cache->policy);
2208         if (r) {
2209                 DMERR("dm_cache_begin_hints failed");
2210                 return r;
2211         }
2212
2213         r = policy_walk_mappings(cache->policy, save_hint, cache);
2214         if (r)
2215                 DMERR("policy_walk_mappings failed");
2216
2217         return r;
2218 }
2219
2220 /*
2221  * returns true on success
2222  */
2223 static bool sync_metadata(struct cache *cache)
2224 {
2225         int r1, r2, r3, r4;
2226
2227         r1 = write_dirty_bitset(cache);
2228         if (r1)
2229                 DMERR("could not write dirty bitset");
2230
2231         r2 = write_discard_bitset(cache);
2232         if (r2)
2233                 DMERR("could not write discard bitset");
2234
2235         save_stats(cache);
2236
2237         r3 = write_hints(cache);
2238         if (r3)
2239                 DMERR("could not write hints");
2240
2241         /*
2242          * If writing the above metadata failed, we still commit, but don't
2243          * set the clean shutdown flag.  This will effectively force every
2244          * dirty bit to be set on reload.
2245          */
2246         r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
2247         if (r4)
2248                 DMERR("could not write cache metadata.  Data loss may occur.");
2249
2250         return !r1 && !r2 && !r3 && !r4;
2251 }
2252
2253 static void cache_postsuspend(struct dm_target *ti)
2254 {
2255         struct cache *cache = ti->private;
2256
2257         start_quiescing(cache);
2258         wait_for_migrations(cache);
2259         stop_worker(cache);
2260         requeue_deferred_io(cache);
2261         stop_quiescing(cache);
2262
2263         (void) sync_metadata(cache);
2264 }
2265
2266 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2267                         bool dirty, uint32_t hint, bool hint_valid)
2268 {
2269         int r;
2270         struct cache *cache = context;
2271
2272         r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
2273         if (r)
2274                 return r;
2275
2276         if (dirty)
2277                 set_dirty(cache, oblock, cblock);
2278         else
2279                 clear_dirty(cache, oblock, cblock);
2280
2281         return 0;
2282 }
2283
2284 static int load_discard(void *context, sector_t discard_block_size,
2285                         dm_dblock_t dblock, bool discard)
2286 {
2287         struct cache *cache = context;
2288
2289         /* FIXME: handle mis-matched block size */
2290
2291         if (discard)
2292                 set_discard(cache, dblock);
2293         else
2294                 clear_discard(cache, dblock);
2295
2296         return 0;
2297 }
2298
2299 static int cache_preresume(struct dm_target *ti)
2300 {
2301         int r = 0;
2302         struct cache *cache = ti->private;
2303         sector_t actual_cache_size = get_dev_size(cache->cache_dev);
2304         (void) sector_div(actual_cache_size, cache->sectors_per_block);
2305
2306         /*
2307          * Check to see if the cache has resized.
2308          */
2309         if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) {
2310                 cache->cache_size = to_cblock(actual_cache_size);
2311
2312                 r = dm_cache_resize(cache->cmd, cache->cache_size);
2313                 if (r) {
2314                         DMERR("could not resize cache metadata");
2315                         return r;
2316                 }
2317
2318                 cache->sized = true;
2319         }
2320
2321         if (!cache->loaded_mappings) {
2322                 r = dm_cache_load_mappings(cache->cmd,
2323                                            dm_cache_policy_get_name(cache->policy),
2324                                            load_mapping, cache);
2325                 if (r) {
2326                         DMERR("could not load cache mappings");
2327                         return r;
2328                 }
2329
2330                 cache->loaded_mappings = true;
2331         }
2332
2333         if (!cache->loaded_discards) {
2334                 r = dm_cache_load_discards(cache->cmd, load_discard, cache);
2335                 if (r) {
2336                         DMERR("could not load origin discards");
2337                         return r;
2338                 }
2339
2340                 cache->loaded_discards = true;
2341         }
2342
2343         return r;
2344 }
2345
2346 static void cache_resume(struct dm_target *ti)
2347 {
2348         struct cache *cache = ti->private;
2349
2350         cache->need_tick_bio = true;
2351         do_waker(&cache->waker.work);
2352 }
2353
2354 /*
2355  * Status format:
2356  *
2357  * <#used metadata blocks>/<#total metadata blocks>
2358  * <#read hits> <#read misses> <#write hits> <#write misses>
2359  * <#demotions> <#promotions> <#blocks in cache> <#dirty>
2360  * <#features> <features>*
2361  * <#core args> <core args>
2362  * <#policy args> <policy args>*
2363  */
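     /*
      * An illustrative (made up) STATUSTYPE_INFO line following the format
      * above might look like:
      *
      *   23/4096 1769 62 3640 172 14 178 1024 0 1 writethrough 2 migration_threshold 204800
      *
      * with any policy config values appended by policy_emit_config_values().
      */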
2364 static void cache_status(struct dm_target *ti, status_type_t type,
2365                          unsigned status_flags, char *result, unsigned maxlen)
2366 {
2367         int r = 0;
2368         unsigned i;
2369         ssize_t sz = 0;
2370         dm_block_t nr_free_blocks_metadata = 0;
2371         dm_block_t nr_blocks_metadata = 0;
2372         char buf[BDEVNAME_SIZE];
2373         struct cache *cache = ti->private;
2374         dm_cblock_t residency;
2375
2376         switch (type) {
2377         case STATUSTYPE_INFO:
2378                 /* Commit to ensure statistics aren't out-of-date */
2379                 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
2380                         r = dm_cache_commit(cache->cmd, false);
2381                         if (r)
2382                                 DMERR("could not commit metadata for accurate status");
2383                 }
2384
2385                 r = dm_cache_get_free_metadata_block_count(cache->cmd,
2386                                                            &nr_free_blocks_metadata);
2387                 if (r) {
2388                         DMERR("could not get metadata free block count");
2389                         goto err;
2390                 }
2391
2392                 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
2393                 if (r) {
2394                         DMERR("could not get metadata device size");
2395                         goto err;
2396                 }
2397
2398                 residency = policy_residency(cache->policy);
2399
2400                 DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
2401                        (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2402                        (unsigned long long)nr_blocks_metadata,
2403                        (unsigned) atomic_read(&cache->stats.read_hit),
2404                        (unsigned) atomic_read(&cache->stats.read_miss),
2405                        (unsigned) atomic_read(&cache->stats.write_hit),
2406                        (unsigned) atomic_read(&cache->stats.write_miss),
2407                        (unsigned) atomic_read(&cache->stats.demotion),
2408                        (unsigned) atomic_read(&cache->stats.promotion),
2409                        (unsigned long long) from_cblock(residency),
2410                        cache->nr_dirty);
2411
2412                 if (cache->features.write_through)
2413                         DMEMIT("1 writethrough ");
2414                 else
2415                         DMEMIT("0 ");
2416
2417                 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
2418                 if (sz < maxlen) {
2419                         r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
2420                         if (r)
2421                                 DMERR("policy_emit_config_values returned %d", r);
2422                 }
2423
2424                 break;
2425
2426         case STATUSTYPE_TABLE:
2427                 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
2428                 DMEMIT("%s ", buf);
2429                 format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
2430                 DMEMIT("%s ", buf);
2431                 format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
2432                 DMEMIT("%s", buf);
2433
2434                 for (i = 0; i < cache->nr_ctr_args - 1; i++)
2435                         DMEMIT(" %s", cache->ctr_args[i]);
2436                 if (cache->nr_ctr_args)
2437                         DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
2438         }
2439
2440         return;
2441
2442 err:
2443         DMEMIT("Error");
2444 }
2445
2446 #define NOT_CORE_OPTION 1
2447
2448 static int process_config_option(struct cache *cache, char **argv)
2449 {
2450         unsigned long tmp;
2451
2452         if (!strcasecmp(argv[0], "migration_threshold")) {
2453                 if (kstrtoul(argv[1], 10, &tmp))
2454                         return -EINVAL;
2455
2456                 cache->migration_threshold = tmp;
2457                 return 0;
2458         }
2459
2460         return NOT_CORE_OPTION;
2461 }
2462
2463 /*
2464  * Supports <key> <value>.
2465  *
2466  * The key migration_threshold is supported by the cache target core.
2467  */
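     /*
      * For example (illustrative device name), the migration threshold could
      * be changed at runtime with:
      *
      *   dmsetup message my_cache 0 migration_threshold 204800
      */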
2468 static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
2469 {
2470         int r;
2471         struct cache *cache = ti->private;
2472
2473         if (argc != 2)
2474                 return -EINVAL;
2475
2476         r = process_config_option(cache, argv);
2477         if (r == NOT_CORE_OPTION)
2478                 return policy_set_config_value(cache->policy, argv[0], argv[1]);
2479
2480         return r;
2481 }
2482
2483 static int cache_iterate_devices(struct dm_target *ti,
2484                                  iterate_devices_callout_fn fn, void *data)
2485 {
2486         int r = 0;
2487         struct cache *cache = ti->private;
2488
2489         r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
2490         if (!r)
2491                 r = fn(ti, cache->origin_dev, 0, ti->len, data);
2492
2493         return r;
2494 }
2495
2496 /*
2497  * We assume I/O is going to the origin (which is the volume
2498  * more likely to have restrictions e.g. by being striped).
2499  * (Looking up the exact location of the data would be expensive
2500  * and could always be out of date by the time the bio is submitted.)
2501  */
2502 static int cache_bvec_merge(struct dm_target *ti,
2503                             struct bvec_merge_data *bvm,
2504                             struct bio_vec *biovec, int max_size)
2505 {
2506         struct cache *cache = ti->private;
2507         struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
2508
2509         if (!q->merge_bvec_fn)
2510                 return max_size;
2511
2512         bvm->bi_bdev = cache->origin_dev->bdev;
2513         return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2514 }
2515
2516 static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
2517 {
2518         /*
2519          * FIXME: these limits may be incompatible with the cache device
2520          */
2521         limits->max_discard_sectors = cache->discard_block_size * 1024;
2522         limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
2523 }
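     /*
      * Worked example (illustrative values): with a discard_block_size of
      * 1024 sectors this advertises a discard granularity of 1024 << 9 =
      * 512 KiB and a maximum discard of 1024 * 1024 sectors (512 MiB) per bio.
      */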
2524
2525 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
2526 {
2527         struct cache *cache = ti->private;
2528
2529         blk_limits_io_min(limits, 0);
2530         blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
2531         set_discard_limits(cache, limits);
2532 }
2533
2534 /*----------------------------------------------------------------*/
2535
2536 static struct target_type cache_target = {
2537         .name = "cache",
2538         .version = {1, 0, 0},
2539         .module = THIS_MODULE,
2540         .ctr = cache_ctr,
2541         .dtr = cache_dtr,
2542         .map = cache_map,
2543         .end_io = cache_end_io,
2544         .postsuspend = cache_postsuspend,
2545         .preresume = cache_preresume,
2546         .resume = cache_resume,
2547         .status = cache_status,
2548         .message = cache_message,
2549         .iterate_devices = cache_iterate_devices,
2550         .merge = cache_bvec_merge,
2551         .io_hints = cache_io_hints,
2552 };
2553
2554 static int __init dm_cache_init(void)
2555 {
2556         int r;
2557
2558         r = dm_register_target(&cache_target);
2559         if (r) {
2560                 DMERR("cache target registration failed: %d", r);
2561                 return r;
2562         }
2563
2564         migration_cache = KMEM_CACHE(dm_cache_migration, 0);
2565         if (!migration_cache) {
2566                 dm_unregister_target(&cache_target);
2567                 return -ENOMEM;
2568         }
2569
2570         return 0;
2571 }
2572
2573 static void __exit dm_cache_exit(void)
2574 {
2575         dm_unregister_target(&cache_target);
2576         kmem_cache_destroy(migration_cache);
2577 }
2578
2579 module_init(dm_cache_init);
2580 module_exit(dm_cache_exit);
2581
2582 MODULE_DESCRIPTION(DM_NAME " cache target");
2583 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
2584 MODULE_LICENSE("GPL");