drivers/md/dm-cache-target.c
1 /*
2  * Copyright (C) 2012 Red Hat. All rights reserved.
3  *
4  * This file is released under the GPL.
5  */
6
7 #include "dm.h"
8 #include "dm-bio-prison.h"
9 #include "dm-cache-metadata.h"
10
11 #include <linux/dm-io.h>
12 #include <linux/dm-kcopyd.h>
13 #include <linux/init.h>
14 #include <linux/mempool.h>
15 #include <linux/module.h>
16 #include <linux/slab.h>
17 #include <linux/vmalloc.h>
18
19 #define DM_MSG_PREFIX "cache"
20
21 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
22         "A percentage of time allocated for copying to and/or from cache");
23
24 /*----------------------------------------------------------------*/
25
26 /*
27  * Glossary:
28  *
29  * oblock: index of an origin block
30  * cblock: index of a cache block
31  * promotion: movement of a block from origin to cache
32  * demotion: movement of a block from cache to origin
33  * migration: movement of a block between the origin and cache device,
34  *            either direction
35  */
36
37 /*----------------------------------------------------------------*/
38
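/*
 * Round up to a whole number of unsigned longs.  E.g. 100 entries with
 * 64-bit longs needs dm_div_up(100, 64) = 2 longs, i.e. 16 bytes.
 */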
39 static size_t bitset_size_in_bytes(unsigned nr_entries)
40 {
41         return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
42 }
43
44 static unsigned long *alloc_bitset(unsigned nr_entries)
45 {
46         size_t s = bitset_size_in_bytes(nr_entries);
47         return vzalloc(s);
48 }
49
50 static void clear_bitset(void *bitset, unsigned nr_entries)
51 {
52         size_t s = bitset_size_in_bytes(nr_entries);
53         memset(bitset, 0, s);
54 }
55
56 static void free_bitset(unsigned long *bits)
57 {
58         vfree(bits);
59 }
60
61 /*----------------------------------------------------------------*/
62
63 #define PRISON_CELLS 1024
64 #define MIGRATION_POOL_SIZE 128
65 #define COMMIT_PERIOD HZ
66 #define MIGRATION_COUNT_WINDOW 10
67
68 /*
69  * The block size of the device holding cache data must be >= 32KB
70  */
71 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
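/* With 512-byte sectors (SECTOR_SHIFT == 9) this works out to 64 sectors. */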
72
73 /*
74  * FIXME: the cache is read/write for the time being.
75  */
76 enum cache_mode {
77         CM_WRITE,               /* metadata may be changed */
78         CM_READ_ONLY,           /* metadata may not be changed */
79 };
80
81 struct cache_features {
82         enum cache_mode mode;
83         bool write_through:1;
84 };
85
86 struct cache_stats {
87         atomic_t read_hit;
88         atomic_t read_miss;
89         atomic_t write_hit;
90         atomic_t write_miss;
91         atomic_t demotion;
92         atomic_t promotion;
93         atomic_t copies_avoided;
94         atomic_t cache_cell_clash;
95         atomic_t commit_count;
96         atomic_t discard_count;
97 };
98
99 struct cache {
100         struct dm_target *ti;
101         struct dm_target_callbacks callbacks;
102
103         /*
104          * Metadata is written to this device.
105          */
106         struct dm_dev *metadata_dev;
107
108         /*
109          * The slower of the two data devices.  Typically a spindle.
110          */
111         struct dm_dev *origin_dev;
112
113         /*
114          * The faster of the two data devices.  Typically an SSD.
115          */
116         struct dm_dev *cache_dev;
117
118         /*
119          * Cache features such as write-through.
120          */
121         struct cache_features features;
122
123         /*
124          * Size of the origin device in _complete_ blocks and native sectors.
125          */
126         dm_oblock_t origin_blocks;
127         sector_t origin_sectors;
128
129         /*
130          * Size of the cache device in blocks.
131          */
132         dm_cblock_t cache_size;
133
134         /*
135          * Fields for converting from sectors to blocks.
136          */
137         uint32_t sectors_per_block;
138         int sectors_per_block_shift;
139
140         struct dm_cache_metadata *cmd;
141
142         spinlock_t lock;
143         struct bio_list deferred_bios;
144         struct bio_list deferred_flush_bios;
145         struct list_head quiesced_migrations;
146         struct list_head completed_migrations;
147         struct list_head need_commit_migrations;
148         sector_t migration_threshold;
149         atomic_t nr_migrations;
150         wait_queue_head_t migration_wait;
151
152         /*
153          * cache_size entries, dirty if set
154          */
155         dm_cblock_t nr_dirty;
156         unsigned long *dirty_bitset;
157
158         /*
159          * origin_blocks entries, discarded if set.
160          */
161         sector_t discard_block_size; /* a power of 2 times sectors per block */
162         dm_dblock_t discard_nr_blocks;
163         unsigned long *discard_bitset;
164
165         struct dm_kcopyd_client *copier;
166         struct workqueue_struct *wq;
167         struct work_struct worker;
168
169         struct delayed_work waker;
170         unsigned long last_commit_jiffies;
171
172         struct dm_bio_prison *prison;
173         struct dm_deferred_set *all_io_ds;
174
175         mempool_t *migration_pool;
176         struct dm_cache_migration *next_migration;
177
178         struct dm_cache_policy *policy;
179         unsigned policy_nr_args;
180
181         bool need_tick_bio:1;
182         bool sized:1;
183         bool quiescing:1;
184         bool commit_requested:1;
185         bool loaded_mappings:1;
186         bool loaded_discards:1;
187
188         struct cache_stats stats;
189
190         /*
191          * Rather than reconstructing the table line for the status, we just
192          * save it and regurgitate it.
193          */
194         unsigned nr_ctr_args;
195         const char **ctr_args;
196 };
197
198 struct per_bio_data {
199         bool tick:1;
200         unsigned req_nr:2;
201         struct dm_deferred_entry *all_io_entry;
202 };
203
204 struct dm_cache_migration {
205         struct list_head list;
206         struct cache *cache;
207
208         unsigned long start_jiffies;
209         dm_oblock_t old_oblock;
210         dm_oblock_t new_oblock;
211         dm_cblock_t cblock;
212
213         bool err:1;
214         bool writeback:1;
215         bool demote:1;
216         bool promote:1;
217
218         struct dm_bio_prison_cell *old_ocell;
219         struct dm_bio_prison_cell *new_ocell;
220 };
221
222 /*
223  * Processing a bio in the worker thread may require these memory
224  * allocations.  We prealloc to avoid deadlocks (the same worker thread
225  * frees them back to the mempool).
226  */
227 struct prealloc {
228         struct dm_cache_migration *mg;
229         struct dm_bio_prison_cell *cell1;
230         struct dm_bio_prison_cell *cell2;
231 };
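/*
 * Typical usage, as in process_deferred_bios() and
 * writeback_some_dirty_blocks() below:
 *
 *	struct prealloc structs;
 *
 *	memset(&structs, 0, sizeof(structs));
 *	if (prealloc_data_structs(cache, &structs))
 *		... back off and retry later ...
 *	... take what's needed with prealloc_get_cell()/prealloc_get_migration() ...
 *	prealloc_free_structs(cache, &structs);
 */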
232
233 static void wake_worker(struct cache *cache)
234 {
235         queue_work(cache->wq, &cache->worker);
236 }
237
238 /*----------------------------------------------------------------*/
239
240 static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
241 {
242         /* FIXME: change to use a local slab. */
243         return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
244 }
245
246 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
247 {
248         dm_bio_prison_free_cell(cache->prison, cell);
249 }
250
251 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
252 {
253         if (!p->mg) {
254                 p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
255                 if (!p->mg)
256                         return -ENOMEM;
257         }
258
259         if (!p->cell1) {
260                 p->cell1 = alloc_prison_cell(cache);
261                 if (!p->cell1)
262                         return -ENOMEM;
263         }
264
265         if (!p->cell2) {
266                 p->cell2 = alloc_prison_cell(cache);
267                 if (!p->cell2)
268                         return -ENOMEM;
269         }
270
271         return 0;
272 }
273
274 static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
275 {
276         if (p->cell2)
277                 free_prison_cell(cache, p->cell2);
278
279         if (p->cell1)
280                 free_prison_cell(cache, p->cell1);
281
282         if (p->mg)
283                 mempool_free(p->mg, cache->migration_pool);
284 }
285
286 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
287 {
288         struct dm_cache_migration *mg = p->mg;
289
290         BUG_ON(!mg);
291         p->mg = NULL;
292
293         return mg;
294 }
295
296 /*
297  * You must have a cell within the prealloc struct to return.  If not, this
298  * function will BUG() rather than returning NULL.
299  */
300 static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
301 {
302         struct dm_bio_prison_cell *r = NULL;
303
304         if (p->cell1) {
305                 r = p->cell1;
306                 p->cell1 = NULL;
307
308         } else if (p->cell2) {
309                 r = p->cell2;
310                 p->cell2 = NULL;
311         } else
312                 BUG();
313
314         return r;
315 }
316
317 /*
318  * You can't have more than two cells in a prealloc struct.  BUG() will be
319  * called if you try to overfill.
320  */
321 static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
322 {
323         if (!p->cell2)
324                 p->cell2 = cell;
325
326         else if (!p->cell1)
327                 p->cell1 = cell;
328
329         else
330                 BUG();
331 }
332
333 /*----------------------------------------------------------------*/
334
335 static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
336 {
337         key->virtual = 0;
338         key->dev = 0;
339         key->block = from_oblock(oblock);
340 }
341
342 /*
343  * The caller hands in a preallocated cell, and a free function for it.
344  * The cell will be freed if there's an error, or if it wasn't used because
345  * a cell with that key already exists.
346  */
347 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
348
349 static int bio_detain(struct cache *cache, dm_oblock_t oblock,
350                       struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
351                       cell_free_fn free_fn, void *free_context,
352                       struct dm_bio_prison_cell **cell_result)
353 {
354         int r;
355         struct dm_cell_key key;
356
357         build_key(oblock, &key);
358         r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
359         if (r)
360                 free_fn(free_context, cell_prealloc);
361
362         return r;
363 }
364
365 static int get_cell(struct cache *cache,
366                     dm_oblock_t oblock,
367                     struct prealloc *structs,
368                     struct dm_bio_prison_cell **cell_result)
369 {
370         int r;
371         struct dm_cell_key key;
372         struct dm_bio_prison_cell *cell_prealloc;
373
374         cell_prealloc = prealloc_get_cell(structs);
375
376         build_key(oblock, &key);
377         r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
378         if (r)
379                 prealloc_put_cell(structs, cell_prealloc);
380
381         return r;
382 }
383
384 /*----------------------------------------------------------------*/
385
386 static bool is_dirty(struct cache *cache, dm_cblock_t b)
387 {
388         return test_bit(from_cblock(b), cache->dirty_bitset);
389 }
390
391 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
392 {
393         if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
394                 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1);
395                 policy_set_dirty(cache->policy, oblock);
396         }
397 }
398
399 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
400 {
401         if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
402                 policy_clear_dirty(cache->policy, oblock);
403                 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1);
404                 if (!from_cblock(cache->nr_dirty))
405                         dm_table_event(cache->ti->table);
406         }
407 }
408
409 /*----------------------------------------------------------------*/
410 static bool block_size_is_power_of_two(struct cache *cache)
411 {
412         return cache->sectors_per_block_shift >= 0;
413 }
414
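/*
 * E.g. with a 1024-sector discard block and 128 sectors per cache block
 * there are 8 cache blocks per discard block, so oblock 20 maps to
 * dblock 20 / 8 == 2.  (These sizes are only illustrative.)
 */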
415 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
416 {
417         sector_t discard_blocks = cache->discard_block_size;
418         dm_block_t b = from_oblock(oblock);
419
420         if (!block_size_is_power_of_two(cache))
421                 (void) sector_div(discard_blocks, cache->sectors_per_block);
422         else
423                 discard_blocks >>= cache->sectors_per_block_shift;
424
425         (void) sector_div(b, discard_blocks);
426
427         return to_dblock(b);
428 }
429
430 static void set_discard(struct cache *cache, dm_dblock_t b)
431 {
432         unsigned long flags;
433
434         atomic_inc(&cache->stats.discard_count);
435
436         spin_lock_irqsave(&cache->lock, flags);
437         set_bit(from_dblock(b), cache->discard_bitset);
438         spin_unlock_irqrestore(&cache->lock, flags);
439 }
440
441 static void clear_discard(struct cache *cache, dm_dblock_t b)
442 {
443         unsigned long flags;
444
445         spin_lock_irqsave(&cache->lock, flags);
446         clear_bit(from_dblock(b), cache->discard_bitset);
447         spin_unlock_irqrestore(&cache->lock, flags);
448 }
449
450 static bool is_discarded(struct cache *cache, dm_dblock_t b)
451 {
452         int r;
453         unsigned long flags;
454
455         spin_lock_irqsave(&cache->lock, flags);
456         r = test_bit(from_dblock(b), cache->discard_bitset);
457         spin_unlock_irqrestore(&cache->lock, flags);
458
459         return r;
460 }
461
462 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
463 {
464         int r;
465         unsigned long flags;
466
467         spin_lock_irqsave(&cache->lock, flags);
468         r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
469                      cache->discard_bitset);
470         spin_unlock_irqrestore(&cache->lock, flags);
471
472         return r;
473 }
474
475 /*----------------------------------------------------------------*/
476
477 static void load_stats(struct cache *cache)
478 {
479         struct dm_cache_statistics stats;
480
481         dm_cache_metadata_get_stats(cache->cmd, &stats);
482         atomic_set(&cache->stats.read_hit, stats.read_hits);
483         atomic_set(&cache->stats.read_miss, stats.read_misses);
484         atomic_set(&cache->stats.write_hit, stats.write_hits);
485         atomic_set(&cache->stats.write_miss, stats.write_misses);
486 }
487
488 static void save_stats(struct cache *cache)
489 {
490         struct dm_cache_statistics stats;
491
492         stats.read_hits = atomic_read(&cache->stats.read_hit);
493         stats.read_misses = atomic_read(&cache->stats.read_miss);
494         stats.write_hits = atomic_read(&cache->stats.write_hit);
495         stats.write_misses = atomic_read(&cache->stats.write_miss);
496
497         dm_cache_metadata_set_stats(cache->cmd, &stats);
498 }
499
500 /*----------------------------------------------------------------
501  * Per bio data
502  *--------------------------------------------------------------*/
503 static struct per_bio_data *get_per_bio_data(struct bio *bio)
504 {
505         struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
506         BUG_ON(!pb);
507         return pb;
508 }
509
510 static struct per_bio_data *init_per_bio_data(struct bio *bio)
511 {
512         struct per_bio_data *pb = get_per_bio_data(bio);
513
514         pb->tick = false;
515         pb->req_nr = dm_bio_get_target_bio_nr(bio);
516         pb->all_io_entry = NULL;
517
518         return pb;
519 }
520
521 /*----------------------------------------------------------------
522  * Remapping
523  *--------------------------------------------------------------*/
524 static void remap_to_origin(struct cache *cache, struct bio *bio)
525 {
526         bio->bi_bdev = cache->origin_dev->bdev;
527 }
528
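/*
 * Remap a bio to the cache device, keeping its offset within the block.
 * E.g. with 64 sectors per block (shift == 6), a bio at origin sector
 * 200 (offset 200 & 63 == 8) remapped to cblock 5 lands at cache sector
 * (5 << 6) | 8 == 328.
 */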
529 static void remap_to_cache(struct cache *cache, struct bio *bio,
530                            dm_cblock_t cblock)
531 {
532         sector_t bi_sector = bio->bi_sector;
533
534         bio->bi_bdev = cache->cache_dev->bdev;
535         if (!block_size_is_power_of_two(cache))
536                 bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
537                                 sector_div(bi_sector, cache->sectors_per_block);
538         else
539                 bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) |
540                                 (bi_sector & (cache->sectors_per_block - 1));
541 }
542
543 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
544 {
545         unsigned long flags;
546         struct per_bio_data *pb = get_per_bio_data(bio);
547
548         spin_lock_irqsave(&cache->lock, flags);
549         if (cache->need_tick_bio &&
550             !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
551                 pb->tick = true;
552                 cache->need_tick_bio = false;
553         }
554         spin_unlock_irqrestore(&cache->lock, flags);
555 }
556
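/*
 * A write remapped to the origin means the block holds live data again,
 * so its discard flag is dropped.
 */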
557 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
558                                   dm_oblock_t oblock)
559 {
560         check_if_tick_bio_needed(cache, bio);
561         remap_to_origin(cache, bio);
562         if (bio_data_dir(bio) == WRITE)
563                 clear_discard(cache, oblock_to_dblock(cache, oblock));
564 }
565
566 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
567                                  dm_oblock_t oblock, dm_cblock_t cblock)
568 {
569         remap_to_cache(cache, bio, cblock);
570         if (bio_data_dir(bio) == WRITE) {
571                 set_dirty(cache, oblock, cblock);
572                 clear_discard(cache, oblock_to_dblock(cache, oblock));
573         }
574 }
575
576 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
577 {
578         sector_t block_nr = bio->bi_sector;
579
580         if (!block_size_is_power_of_two(cache))
581                 (void) sector_div(block_nr, cache->sectors_per_block);
582         else
583                 block_nr >>= cache->sectors_per_block_shift;
584
585         return to_oblock(block_nr);
586 }
587
588 static int bio_triggers_commit(struct cache *cache, struct bio *bio)
589 {
590         return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
591 }
592
593 static void issue(struct cache *cache, struct bio *bio)
594 {
595         unsigned long flags;
596
597         if (!bio_triggers_commit(cache, bio)) {
598                 generic_make_request(bio);
599                 return;
600         }
601
602         /*
603          * Batch together any bios that trigger commits and then issue a
604          * single commit for them in do_worker().
605          */
606         spin_lock_irqsave(&cache->lock, flags);
607         cache->commit_requested = true;
608         bio_list_add(&cache->deferred_flush_bios, bio);
609         spin_unlock_irqrestore(&cache->lock, flags);
610 }
611
612 /*----------------------------------------------------------------
613  * Migration processing
614  *
615  * Migration covers moving data from the origin device to the cache, or
616  * vice versa.
617  *--------------------------------------------------------------*/
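/*
 * A migration passes through three lists, all drained by do_worker():
 *
 *   quiesced_migrations    -> issue_copy()
 *   completed_migrations   -> complete_migration()
 *   need_commit_migrations -> migration_success_post_commit()
 */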
618 static void free_migration(struct dm_cache_migration *mg)
619 {
620         mempool_free(mg, mg->cache->migration_pool);
621 }
622
623 static void inc_nr_migrations(struct cache *cache)
624 {
625         atomic_inc(&cache->nr_migrations);
626 }
627
628 static void dec_nr_migrations(struct cache *cache)
629 {
630         atomic_dec(&cache->nr_migrations);
631
632         /*
633          * Wake the worker in case we're suspending the target.
634          */
635         wake_up(&cache->migration_wait);
636 }
637
638 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
639                          bool holder)
640 {
641         (holder ? dm_cell_release : dm_cell_release_no_holder)
642                 (cache->prison, cell, &cache->deferred_bios);
643         free_prison_cell(cache, cell);
644 }
645
646 static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
647                        bool holder)
648 {
649         unsigned long flags;
650
651         spin_lock_irqsave(&cache->lock, flags);
652         __cell_defer(cache, cell, holder);
653         spin_unlock_irqrestore(&cache->lock, flags);
654
655         wake_worker(cache);
656 }
657
658 static void cleanup_migration(struct dm_cache_migration *mg)
659 {
660         dec_nr_migrations(mg->cache);
661         free_migration(mg);
662 }
663
664 static void migration_failure(struct dm_cache_migration *mg)
665 {
666         struct cache *cache = mg->cache;
667
668         if (mg->writeback) {
669                 DMWARN_LIMIT("writeback failed; couldn't copy block");
670                 set_dirty(cache, mg->old_oblock, mg->cblock);
671                 cell_defer(cache, mg->old_ocell, false);
672
673         } else if (mg->demote) {
674                 DMWARN_LIMIT("demotion failed; couldn't copy block");
675                 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
676
677                 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
678                 if (mg->promote)
679                         cell_defer(cache, mg->new_ocell, 1);
680         } else {
681                 DMWARN_LIMIT("promotion failed; couldn't copy block");
682                 policy_remove_mapping(cache->policy, mg->new_oblock);
683                 cell_defer(cache, mg->new_ocell, 1);
684         }
685
686         cleanup_migration(mg);
687 }
688
689 static void migration_success_pre_commit(struct dm_cache_migration *mg)
690 {
691         unsigned long flags;
692         struct cache *cache = mg->cache;
693
694         if (mg->writeback) {
695                 cell_defer(cache, mg->old_ocell, false);
696                 clear_dirty(cache, mg->old_oblock, mg->cblock);
697                 cleanup_migration(mg);
698                 return;
699
700         } else if (mg->demote) {
701                 if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
702                         DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
703                         policy_force_mapping(cache->policy, mg->new_oblock,
704                                              mg->old_oblock);
705                         if (mg->promote)
706                                 cell_defer(cache, mg->new_ocell, true);
707                         cleanup_migration(mg);
708                         return;
709                 }
710         } else {
711                 if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
712                         DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
713                         policy_remove_mapping(cache->policy, mg->new_oblock);
714                         cleanup_migration(mg);
715                         return;
716                 }
717         }
718
719         spin_lock_irqsave(&cache->lock, flags);
720         list_add_tail(&mg->list, &cache->need_commit_migrations);
721         cache->commit_requested = true;
722         spin_unlock_irqrestore(&cache->lock, flags);
723 }
724
725 static void migration_success_post_commit(struct dm_cache_migration *mg)
726 {
727         unsigned long flags;
728         struct cache *cache = mg->cache;
729
730         if (mg->writeback) {
731                 DMWARN("writeback unexpectedly triggered commit");
732                 return;
733
734         } else if (mg->demote) {
735                 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
736
737                 if (mg->promote) {
738                         mg->demote = false;
739
740                         spin_lock_irqsave(&cache->lock, flags);
741                         list_add_tail(&mg->list, &cache->quiesced_migrations);
742                         spin_unlock_irqrestore(&cache->lock, flags);
743
744                 } else
745                         cleanup_migration(mg);
746
747         } else {
748                 cell_defer(cache, mg->new_ocell, true);
749                 clear_dirty(cache, mg->new_oblock, mg->cblock);
750                 cleanup_migration(mg);
751         }
752 }
753
754 static void copy_complete(int read_err, unsigned long write_err, void *context)
755 {
756         unsigned long flags;
757         struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
758         struct cache *cache = mg->cache;
759
760         if (read_err || write_err)
761                 mg->err = true;
762
763         spin_lock_irqsave(&cache->lock, flags);
764         list_add_tail(&mg->list, &cache->completed_migrations);
765         spin_unlock_irqrestore(&cache->lock, flags);
766
767         wake_worker(cache);
768 }
769
770 static void issue_copy_real(struct dm_cache_migration *mg)
771 {
772         int r;
773         struct dm_io_region o_region, c_region;
774         struct cache *cache = mg->cache;
775
776         o_region.bdev = cache->origin_dev->bdev;
777         o_region.count = cache->sectors_per_block;
778
779         c_region.bdev = cache->cache_dev->bdev;
780         c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
781         c_region.count = cache->sectors_per_block;
782
783         if (mg->writeback || mg->demote) {
784                 /* demote */
785                 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
786                 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
787         } else {
788                 /* promote */
789                 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
790                 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
791         }
792
793         if (r < 0)
794                 migration_failure(mg);
795 }
796
797 static void avoid_copy(struct dm_cache_migration *mg)
798 {
799         atomic_inc(&mg->cache->stats.copies_avoided);
800         migration_success_pre_commit(mg);
801 }
802
803 static void issue_copy(struct dm_cache_migration *mg)
804 {
805         bool avoid;
806         struct cache *cache = mg->cache;
807
808         if (mg->writeback || mg->demote)
809                 avoid = !is_dirty(cache, mg->cblock) ||
810                         is_discarded_oblock(cache, mg->old_oblock);
811         else
812                 avoid = is_discarded_oblock(cache, mg->new_oblock);
813
814         avoid ? avoid_copy(mg) : issue_copy_real(mg);
815 }
816
817 static void complete_migration(struct dm_cache_migration *mg)
818 {
819         if (mg->err)
820                 migration_failure(mg);
821         else
822                 migration_success_pre_commit(mg);
823 }
824
825 static void process_migrations(struct cache *cache, struct list_head *head,
826                                void (*fn)(struct dm_cache_migration *))
827 {
828         unsigned long flags;
829         struct list_head list;
830         struct dm_cache_migration *mg, *tmp;
831
832         INIT_LIST_HEAD(&list);
833         spin_lock_irqsave(&cache->lock, flags);
834         list_splice_init(head, &list);
835         spin_unlock_irqrestore(&cache->lock, flags);
836
837         list_for_each_entry_safe(mg, tmp, &list, list)
838                 fn(mg);
839 }
840
841 static void __queue_quiesced_migration(struct dm_cache_migration *mg)
842 {
843         list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
844 }
845
846 static void queue_quiesced_migration(struct dm_cache_migration *mg)
847 {
848         unsigned long flags;
849         struct cache *cache = mg->cache;
850
851         spin_lock_irqsave(&cache->lock, flags);
852         __queue_quiesced_migration(mg);
853         spin_unlock_irqrestore(&cache->lock, flags);
854
855         wake_worker(cache);
856 }
857
858 static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
859 {
860         unsigned long flags;
861         struct dm_cache_migration *mg, *tmp;
862
863         spin_lock_irqsave(&cache->lock, flags);
864         list_for_each_entry_safe(mg, tmp, work, list)
865                 __queue_quiesced_migration(mg);
866         spin_unlock_irqrestore(&cache->lock, flags);
867
868         wake_worker(cache);
869 }
870
871 static void check_for_quiesced_migrations(struct cache *cache,
872                                           struct per_bio_data *pb)
873 {
874         struct list_head work;
875
876         if (!pb->all_io_entry)
877                 return;
878
879         INIT_LIST_HEAD(&work);
880         if (pb->all_io_entry)
881                 dm_deferred_entry_dec(pb->all_io_entry, &work);
882
883         if (!list_empty(&work))
884                 queue_quiesced_migrations(cache, &work);
885 }
886
887 static void quiesce_migration(struct dm_cache_migration *mg)
888 {
889         if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
890                 queue_quiesced_migration(mg);
891 }
892
893 static void promote(struct cache *cache, struct prealloc *structs,
894                     dm_oblock_t oblock, dm_cblock_t cblock,
895                     struct dm_bio_prison_cell *cell)
896 {
897         struct dm_cache_migration *mg = prealloc_get_migration(structs);
898
899         mg->err = false;
900         mg->writeback = false;
901         mg->demote = false;
902         mg->promote = true;
903         mg->cache = cache;
904         mg->new_oblock = oblock;
905         mg->cblock = cblock;
906         mg->old_ocell = NULL;
907         mg->new_ocell = cell;
908         mg->start_jiffies = jiffies;
909
910         inc_nr_migrations(cache);
911         quiesce_migration(mg);
912 }
913
914 static void writeback(struct cache *cache, struct prealloc *structs,
915                       dm_oblock_t oblock, dm_cblock_t cblock,
916                       struct dm_bio_prison_cell *cell)
917 {
918         struct dm_cache_migration *mg = prealloc_get_migration(structs);
919
920         mg->err = false;
921         mg->writeback = true;
922         mg->demote = false;
923         mg->promote = false;
924         mg->cache = cache;
925         mg->old_oblock = oblock;
926         mg->cblock = cblock;
927         mg->old_ocell = cell;
928         mg->new_ocell = NULL;
929         mg->start_jiffies = jiffies;
930
931         inc_nr_migrations(cache);
932         quiesce_migration(mg);
933 }
934
935 static void demote_then_promote(struct cache *cache, struct prealloc *structs,
936                                 dm_oblock_t old_oblock, dm_oblock_t new_oblock,
937                                 dm_cblock_t cblock,
938                                 struct dm_bio_prison_cell *old_ocell,
939                                 struct dm_bio_prison_cell *new_ocell)
940 {
941         struct dm_cache_migration *mg = prealloc_get_migration(structs);
942
943         mg->err = false;
944         mg->writeback = false;
945         mg->demote = true;
946         mg->promote = true;
947         mg->cache = cache;
948         mg->old_oblock = old_oblock;
949         mg->new_oblock = new_oblock;
950         mg->cblock = cblock;
951         mg->old_ocell = old_ocell;
952         mg->new_ocell = new_ocell;
953         mg->start_jiffies = jiffies;
954
955         inc_nr_migrations(cache);
956         quiesce_migration(mg);
957 }
958
959 /*----------------------------------------------------------------
960  * bio processing
961  *--------------------------------------------------------------*/
962 static void defer_bio(struct cache *cache, struct bio *bio)
963 {
964         unsigned long flags;
965
966         spin_lock_irqsave(&cache->lock, flags);
967         bio_list_add(&cache->deferred_bios, bio);
968         spin_unlock_irqrestore(&cache->lock, flags);
969
970         wake_worker(cache);
971 }
972
973 static void process_flush_bio(struct cache *cache, struct bio *bio)
974 {
975         struct per_bio_data *pb = get_per_bio_data(bio);
976
977         BUG_ON(bio->bi_size);
978         if (!pb->req_nr)
979                 remap_to_origin(cache, bio);
980         else
981                 remap_to_cache(cache, bio, 0);
982
983         issue(cache, bio);
984 }
985
986 /*
987  * People generally discard large parts of a device, e.g. the whole device
988  * when formatting.  Splitting these large discards up into cache block
989  * sized I/Os and then quiescing (always necessary for discard) takes too
990  * long.
991  *
992  * We keep it simple, and allow any size of discard to come in, and just
993  * mark off blocks on the discard bitset.  No passdown occurs!
994  *
995  * To implement passdown we need to change the bio_prison such that a cell
996  * can have a key that spans many blocks.
997  */
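/*
 * E.g. with a 1024-sector discard block, a discard of sectors 3000-8191
 * marks dblocks 3 to 7; sectors 3000-3071 fall in dblock 2, which is
 * only partially covered and so is left unmarked.
 */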
998 static void process_discard_bio(struct cache *cache, struct bio *bio)
999 {
1000         dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
1001                                                   cache->discard_block_size);
1002         dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
1003         dm_block_t b;
1004
1005         (void) sector_div(end_block, cache->discard_block_size);
1006
1007         for (b = start_block; b < end_block; b++)
1008                 set_discard(cache, to_dblock(b));
1009
1010         bio_endio(bio, 0);
1011 }
1012
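/*
 * E.g. if migration_threshold were 2048 sectors with 128 sectors per
 * block, a 16th concurrent migration would be refused because
 * (15 + 1) * 128 == 2048 is not below the threshold.
 */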
1013 static bool spare_migration_bandwidth(struct cache *cache)
1014 {
1015         sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
1016                 cache->sectors_per_block;
1017         return current_volume < cache->migration_threshold;
1018 }
1019
1020 static bool is_writethrough_io(struct cache *cache, struct bio *bio,
1021                                dm_cblock_t cblock)
1022 {
1023         return bio_data_dir(bio) == WRITE &&
1024                 cache->features.write_through && !is_dirty(cache, cblock);
1025 }
1026
1027 static void inc_hit_counter(struct cache *cache, struct bio *bio)
1028 {
1029         atomic_inc(bio_data_dir(bio) == READ ?
1030                    &cache->stats.read_hit : &cache->stats.write_hit);
1031 }
1032
1033 static void inc_miss_counter(struct cache *cache, struct bio *bio)
1034 {
1035         atomic_inc(bio_data_dir(bio) == READ ?
1036                    &cache->stats.read_miss : &cache->stats.write_miss);
1037 }
1038
1039 static void process_bio(struct cache *cache, struct prealloc *structs,
1040                         struct bio *bio)
1041 {
1042         int r;
1043         bool release_cell = true;
1044         dm_oblock_t block = get_bio_block(cache, bio);
1045         struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
1046         struct policy_result lookup_result;
1047         struct per_bio_data *pb = get_per_bio_data(bio);
1048         bool discarded_block = is_discarded_oblock(cache, block);
1049         bool can_migrate = discarded_block || spare_migration_bandwidth(cache);
1050
1051         /*
1052          * Check to see if that block is currently migrating.
1053          */
1054         cell_prealloc = prealloc_get_cell(structs);
1055         r = bio_detain(cache, block, bio, cell_prealloc,
1056                        (cell_free_fn) prealloc_put_cell,
1057                        structs, &new_ocell);
1058         if (r > 0)
1059                 return;
1060
1061         r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
1062                        bio, &lookup_result);
1063
1064         if (r == -EWOULDBLOCK)
1065                 /* migration has been denied */
1066                 lookup_result.op = POLICY_MISS;
1067
1068         switch (lookup_result.op) {
1069         case POLICY_HIT:
1070                 inc_hit_counter(cache, bio);
1071                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1072
1073                 if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
1074                         /*
1075                          * No need to mark anything dirty in write through mode.
1076                          */
1077                         pb->req_nr == 0 ?
1078                                 remap_to_cache(cache, bio, lookup_result.cblock) :
1079                                 remap_to_origin_clear_discard(cache, bio, block);
1080                 } else
1081                         remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
1082
1083                 issue(cache, bio);
1084                 break;
1085
1086         case POLICY_MISS:
1087                 inc_miss_counter(cache, bio);
1088                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1089
1090                 if (pb->req_nr != 0) {
1091                         /*
1092                          * This is a duplicate writethrough io that is no
1093                          * longer needed because the block has been demoted.
1094                          */
1095                         bio_endio(bio, 0);
1096                 } else {
1097                         remap_to_origin_clear_discard(cache, bio, block);
1098                         issue(cache, bio);
1099                 }
1100                 break;
1101
1102         case POLICY_NEW:
1103                 atomic_inc(&cache->stats.promotion);
1104                 promote(cache, structs, block, lookup_result.cblock, new_ocell);
1105                 release_cell = false;
1106                 break;
1107
1108         case POLICY_REPLACE:
1109                 cell_prealloc = prealloc_get_cell(structs);
1110                 r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
1111                                (cell_free_fn) prealloc_put_cell,
1112                                structs, &old_ocell);
1113                 if (r > 0) {
1114                         /*
1115                          * We have to be careful to avoid lock inversion of
1116                          * the cells.  So we back off, and wait for the
1117                          * old_ocell to become free.
1118                          */
1119                         policy_force_mapping(cache->policy, block,
1120                                              lookup_result.old_oblock);
1121                         atomic_inc(&cache->stats.cache_cell_clash);
1122                         break;
1123                 }
1124                 atomic_inc(&cache->stats.demotion);
1125                 atomic_inc(&cache->stats.promotion);
1126
1127                 demote_then_promote(cache, structs, lookup_result.old_oblock,
1128                                     block, lookup_result.cblock,
1129                                     old_ocell, new_ocell);
1130                 release_cell = false;
1131                 break;
1132
1133         default:
1134                 DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
1135                             (unsigned) lookup_result.op);
1136                 bio_io_error(bio);
1137         }
1138
1139         if (release_cell)
1140                 cell_defer(cache, new_ocell, false);
1141 }
1142
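/*
 * The first comparison catches jiffies wrapping back below
 * last_commit_jiffies.
 */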
1143 static int need_commit_due_to_time(struct cache *cache)
1144 {
1145         return jiffies < cache->last_commit_jiffies ||
1146                jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
1147 }
1148
1149 static int commit_if_needed(struct cache *cache)
1150 {
1151         if (dm_cache_changed_this_transaction(cache->cmd) &&
1152             (cache->commit_requested || need_commit_due_to_time(cache))) {
1153                 atomic_inc(&cache->stats.commit_count);
1154                 cache->last_commit_jiffies = jiffies;
1155                 cache->commit_requested = false;
1156                 return dm_cache_commit(cache->cmd, false);
1157         }
1158
1159         return 0;
1160 }
1161
1162 static void process_deferred_bios(struct cache *cache)
1163 {
1164         unsigned long flags;
1165         struct bio_list bios;
1166         struct bio *bio;
1167         struct prealloc structs;
1168
1169         memset(&structs, 0, sizeof(structs));
1170         bio_list_init(&bios);
1171
1172         spin_lock_irqsave(&cache->lock, flags);
1173         bio_list_merge(&bios, &cache->deferred_bios);
1174         bio_list_init(&cache->deferred_bios);
1175         spin_unlock_irqrestore(&cache->lock, flags);
1176
1177         while (!bio_list_empty(&bios)) {
1178                 /*
1179                  * If we've got no free migration structs, and processing
1180                  * this bio might require one, we pause until there are some
1181                  * prepared mappings to process.
1182                  */
1183                 if (prealloc_data_structs(cache, &structs)) {
1184                         spin_lock_irqsave(&cache->lock, flags);
1185                         bio_list_merge(&cache->deferred_bios, &bios);
1186                         spin_unlock_irqrestore(&cache->lock, flags);
1187                         break;
1188                 }
1189
1190                 bio = bio_list_pop(&bios);
1191
1192                 if (bio->bi_rw & REQ_FLUSH)
1193                         process_flush_bio(cache, bio);
1194                 else if (bio->bi_rw & REQ_DISCARD)
1195                         process_discard_bio(cache, bio);
1196                 else
1197                         process_bio(cache, &structs, bio);
1198         }
1199
1200         prealloc_free_structs(cache, &structs);
1201 }
1202
1203 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
1204 {
1205         unsigned long flags;
1206         struct bio_list bios;
1207         struct bio *bio;
1208
1209         bio_list_init(&bios);
1210
1211         spin_lock_irqsave(&cache->lock, flags);
1212         bio_list_merge(&bios, &cache->deferred_flush_bios);
1213         bio_list_init(&cache->deferred_flush_bios);
1214         spin_unlock_irqrestore(&cache->lock, flags);
1215
1216         while ((bio = bio_list_pop(&bios)))
1217                 submit_bios ? generic_make_request(bio) : bio_io_error(bio);
1218 }
1219
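/*
 * Write dirty cache blocks back to the origin while there is spare
 * migration bandwidth.  If a block's cell can't be obtained it is
 * re-marked dirty in the policy so it will be offered again later.
 */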
1220 static void writeback_some_dirty_blocks(struct cache *cache)
1221 {
1222         int r = 0;
1223         dm_oblock_t oblock;
1224         dm_cblock_t cblock;
1225         struct prealloc structs;
1226         struct dm_bio_prison_cell *old_ocell;
1227
1228         memset(&structs, 0, sizeof(structs));
1229
1230         while (spare_migration_bandwidth(cache)) {
1231                 if (prealloc_data_structs(cache, &structs))
1232                         break;
1233
1234                 r = policy_writeback_work(cache->policy, &oblock, &cblock);
1235                 if (r)
1236                         break;
1237
1238                 r = get_cell(cache, oblock, &structs, &old_ocell);
1239                 if (r) {
1240                         policy_set_dirty(cache->policy, oblock);
1241                         break;
1242                 }
1243
1244                 writeback(cache, &structs, oblock, cblock, old_ocell);
1245         }
1246
1247         prealloc_free_structs(cache, &structs);
1248 }
1249
1250 /*----------------------------------------------------------------
1251  * Main worker loop
1252  *--------------------------------------------------------------*/
1253 static void start_quiescing(struct cache *cache)
1254 {
1255         unsigned long flags;
1256
1257         spin_lock_irqsave(&cache->lock, flags);
1258         cache->quiescing = 1;
1259         spin_unlock_irqrestore(&cache->lock, flags);
1260 }
1261
1262 static void stop_quiescing(struct cache *cache)
1263 {
1264         unsigned long flags;
1265
1266         spin_lock_irqsave(&cache->lock, flags);
1267         cache->quiescing = 0;
1268         spin_unlock_irqrestore(&cache->lock, flags);
1269 }
1270
1271 static bool is_quiescing(struct cache *cache)
1272 {
1273         int r;
1274         unsigned long flags;
1275
1276         spin_lock_irqsave(&cache->lock, flags);
1277         r = cache->quiescing;
1278         spin_unlock_irqrestore(&cache->lock, flags);
1279
1280         return r;
1281 }
1282
1283 static void wait_for_migrations(struct cache *cache)
1284 {
1285         wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
1286 }
1287
1288 static void stop_worker(struct cache *cache)
1289 {
1290         cancel_delayed_work(&cache->waker);
1291         flush_workqueue(cache->wq);
1292 }
1293
1294 static void requeue_deferred_io(struct cache *cache)
1295 {
1296         struct bio *bio;
1297         struct bio_list bios;
1298
1299         bio_list_init(&bios);
1300         bio_list_merge(&bios, &cache->deferred_bios);
1301         bio_list_init(&cache->deferred_bios);
1302
1303         while ((bio = bio_list_pop(&bios)))
1304                 bio_endio(bio, DM_ENDIO_REQUEUE);
1305 }
1306
1307 static int more_work(struct cache *cache)
1308 {
1309         if (is_quiescing(cache))
1310                 return !list_empty(&cache->quiesced_migrations) ||
1311                         !list_empty(&cache->completed_migrations) ||
1312                         !list_empty(&cache->need_commit_migrations);
1313         else
1314                 return !bio_list_empty(&cache->deferred_bios) ||
1315                         !bio_list_empty(&cache->deferred_flush_bios) ||
1316                         !list_empty(&cache->quiesced_migrations) ||
1317                         !list_empty(&cache->completed_migrations) ||
1318                         !list_empty(&cache->need_commit_migrations);
1319 }
1320
1321 static void do_worker(struct work_struct *ws)
1322 {
1323         struct cache *cache = container_of(ws, struct cache, worker);
1324
1325         do {
1326                 if (!is_quiescing(cache))
1327                         process_deferred_bios(cache);
1328
1329                 process_migrations(cache, &cache->quiesced_migrations, issue_copy);
1330                 process_migrations(cache, &cache->completed_migrations, complete_migration);
1331
1332                 writeback_some_dirty_blocks(cache);
1333
1334                 if (commit_if_needed(cache)) {
1335                         process_deferred_flush_bios(cache, false);
1336
1337                         /*
1338                          * FIXME: rollback metadata or just go into a
1339                          * failure mode and error everything
1340                          */
1341                 } else {
1342                         process_deferred_flush_bios(cache, true);
1343                         process_migrations(cache, &cache->need_commit_migrations,
1344                                            migration_success_post_commit);
1345                 }
1346         } while (more_work(cache));
1347 }
1348
1349 /*
1350  * We want to commit periodically so that not too much
1351  * unwritten metadata builds up.
1352  */
1353 static void do_waker(struct work_struct *ws)
1354 {
1355         struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1356         wake_worker(cache);
1357         queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1358 }
1359
1360 /*----------------------------------------------------------------*/
1361
1362 static int is_congested(struct dm_dev *dev, int bdi_bits)
1363 {
1364         struct request_queue *q = bdev_get_queue(dev->bdev);
1365         return bdi_congested(&q->backing_dev_info, bdi_bits);
1366 }
1367
1368 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1369 {
1370         struct cache *cache = container_of(cb, struct cache, callbacks);
1371
1372         return is_congested(cache->origin_dev, bdi_bits) ||
1373                 is_congested(cache->cache_dev, bdi_bits);
1374 }
1375
1376 /*----------------------------------------------------------------
1377  * Target methods
1378  *--------------------------------------------------------------*/
1379
1380 /*
1381  * This function gets called on the error paths of the constructor, so we
1382  * have to cope with a partially initialised struct.
1383  */
1384 static void destroy(struct cache *cache)
1385 {
1386         unsigned i;
1387
1388         if (cache->next_migration)
1389                 mempool_free(cache->next_migration, cache->migration_pool);
1390
1391         if (cache->migration_pool)
1392                 mempool_destroy(cache->migration_pool);
1393
1394         if (cache->all_io_ds)
1395                 dm_deferred_set_destroy(cache->all_io_ds);
1396
1397         if (cache->prison)
1398                 dm_bio_prison_destroy(cache->prison);
1399
1400         if (cache->wq)
1401                 destroy_workqueue(cache->wq);
1402
1403         if (cache->dirty_bitset)
1404                 free_bitset(cache->dirty_bitset);
1405
1406         if (cache->discard_bitset)
1407                 free_bitset(cache->discard_bitset);
1408
1409         if (cache->copier)
1410                 dm_kcopyd_client_destroy(cache->copier);
1411
1412         if (cache->cmd)
1413                 dm_cache_metadata_close(cache->cmd);
1414
1415         if (cache->metadata_dev)
1416                 dm_put_device(cache->ti, cache->metadata_dev);
1417
1418         if (cache->origin_dev)
1419                 dm_put_device(cache->ti, cache->origin_dev);
1420
1421         if (cache->cache_dev)
1422                 dm_put_device(cache->ti, cache->cache_dev);
1423
1424         if (cache->policy)
1425                 dm_cache_policy_destroy(cache->policy);
1426
1427         for (i = 0; i < cache->nr_ctr_args; i++)
1428                 kfree(cache->ctr_args[i]);
1429         kfree(cache->ctr_args);
1430
1431         kfree(cache);
1432 }
1433
1434 static void cache_dtr(struct dm_target *ti)
1435 {
1436         struct cache *cache = ti->private;
1437
1438         destroy(cache);
1439 }
1440
1441 static sector_t get_dev_size(struct dm_dev *dev)
1442 {
1443         return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1444 }
1445
1446 /*----------------------------------------------------------------*/
1447
1448 /*
1449  * Construct a cache device mapping.
1450  *
1451  * cache <metadata dev> <cache dev> <origin dev> <block size>
1452  *       <#feature args> [<feature arg>]*
1453  *       <policy> <#policy args> [<policy arg>]*
1454  *
1455  * metadata dev    : fast device holding the persistent metadata
1456  * cache dev       : fast device holding cached data blocks
1457  * origin dev      : slow device holding original data blocks
1458  * block size      : cache unit size in sectors
1459  *
1460  * #feature args   : number of feature arguments passed
1461  * feature args    : writethrough.  (The default is writeback.)
1462  *
1463  * policy          : the replacement policy to use
1464  * #policy args    : an even number of policy arguments corresponding
1465  *                   to key/value pairs passed to the policy
1466  * policy args     : key/value pairs passed to the policy
1467  *                   E.g. 'sequential_threshold 1024'
1468  *                   See cache-policies.txt for details.
1469  *
1470  * Optional feature arguments are:
1471  *   writethrough  : write through caching that prohibits cache block
1472  *                   content from being different from origin block content.
1473  *                   Without this argument, the default behaviour is to write
1474  *                   back cache block contents later for performance reasons,
1475  *                   so they may differ from the corresponding origin blocks.
1476  */
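/*
 * An illustrative table line (device names are made up; 'default'
 * selects the default replacement policy):
 *
 *   cache /dev/mapper/fast-meta /dev/mapper/fast-data /dev/mapper/slow \
 *         512 1 writethrough default 0
 */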
1477 struct cache_args {
1478         struct dm_target *ti;
1479
1480         struct dm_dev *metadata_dev;
1481
1482         struct dm_dev *cache_dev;
1483         sector_t cache_sectors;
1484
1485         struct dm_dev *origin_dev;
1486         sector_t origin_sectors;
1487
1488         uint32_t block_size;
1489
1490         const char *policy_name;
1491         int policy_argc;
1492         const char **policy_argv;
1493
1494         struct cache_features features;
1495 };
1496
1497 static void destroy_cache_args(struct cache_args *ca)
1498 {
1499         if (ca->metadata_dev)
1500                 dm_put_device(ca->ti, ca->metadata_dev);
1501
1502         if (ca->cache_dev)
1503                 dm_put_device(ca->ti, ca->cache_dev);
1504
1505         if (ca->origin_dev)
1506                 dm_put_device(ca->ti, ca->origin_dev);
1507
1508         kfree(ca);
1509 }
1510
1511 static bool at_least_one_arg(struct dm_arg_set *as, char **error)
1512 {
1513         if (!as->argc) {
1514                 *error = "Insufficient args";
1515                 return false;
1516         }
1517
1518         return true;
1519 }
1520
1521 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
1522                               char **error)
1523 {
1524         int r;
1525         sector_t metadata_dev_size;
1526         char b[BDEVNAME_SIZE];
1527
1528         if (!at_least_one_arg(as, error))
1529                 return -EINVAL;
1530
1531         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1532                           &ca->metadata_dev);
1533         if (r) {
1534                 *error = "Error opening metadata device";
1535                 return r;
1536         }
1537
1538         metadata_dev_size = get_dev_size(ca->metadata_dev);
1539         if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
1540                 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1541                        bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS_WARNING);
1542
1543         return 0;
1544 }
1545
1546 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
1547                            char **error)
1548 {
1549         int r;
1550
1551         if (!at_least_one_arg(as, error))
1552                 return -EINVAL;
1553
1554         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1555                           &ca->cache_dev);
1556         if (r) {
1557                 *error = "Error opening cache device";
1558                 return r;
1559         }
1560         ca->cache_sectors = get_dev_size(ca->cache_dev);
1561
1562         return 0;
1563 }
1564
1565 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
1566                             char **error)
1567 {
1568         int r;
1569
1570         if (!at_least_one_arg(as, error))
1571                 return -EINVAL;
1572
1573         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1574                           &ca->origin_dev);
1575         if (r) {
1576                 *error = "Error opening origin device";
1577                 return r;
1578         }
1579
1580         ca->origin_sectors = get_dev_size(ca->origin_dev);
1581         if (ca->ti->len > ca->origin_sectors) {
1582                 *error = "Device size larger than cached device";
1583                 return -EINVAL;
1584         }
1585
1586         return 0;
1587 }
1588
1589 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
1590                             char **error)
1591 {
1592         unsigned long tmp;
1593
1594         if (!at_least_one_arg(as, error))
1595                 return -EINVAL;
1596
1597         if (kstrtoul(dm_shift_arg(as), 10, &tmp) || !tmp ||
1598             tmp < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1599             tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1600                 *error = "Invalid data block size";
1601                 return -EINVAL;
1602         }
1603
1604         if (tmp > ca->cache_sectors) {
1605                 *error = "Data block size is larger than the cache device";
1606                 return -EINVAL;
1607         }
1608
1609         ca->block_size = tmp;
1610
1611         return 0;
1612 }
1613
1614 static void init_features(struct cache_features *cf)
1615 {
1616         cf->mode = CM_WRITE;
1617         cf->write_through = false;
1618 }
1619
1620 static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
1621                           char **error)
1622 {
1623         static struct dm_arg _args[] = {
1624                 {0, 1, "Invalid number of cache feature arguments"},
1625         };
1626
1627         int r;
1628         unsigned argc;
1629         const char *arg;
1630         struct cache_features *cf = &ca->features;
1631
1632         init_features(cf);
1633
1634         r = dm_read_arg_group(_args, as, &argc, error);
1635         if (r)
1636                 return -EINVAL;
1637
1638         while (argc--) {
1639                 arg = dm_shift_arg(as);
1640
1641                 if (!strcasecmp(arg, "writeback"))
1642                         cf->write_through = false;
1643
1644                 else if (!strcasecmp(arg, "writethrough"))
1645                         cf->write_through = true;
1646
1647                 else {
1648                         *error = "Unrecognised cache feature requested";
1649                         return -EINVAL;
1650                 }
1651         }
1652
1653         return 0;
1654 }
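     /*
      * For illustration: feature arguments of "1 writethrough" select
      * write-through caching, while "1 writeback" or "0" (no feature
      * arguments) keep the write-back default set up by init_features().
      */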
1655
1656 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
1657                         char **error)
1658 {
1659         static struct dm_arg _args[] = {
1660                 {0, 1024, "Invalid number of policy arguments"},
1661         };
1662
1663         int r;
1664
1665         if (!at_least_one_arg(as, error))
1666                 return -EINVAL;
1667
1668         ca->policy_name = dm_shift_arg(as);
1669
1670         r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
1671         if (r)
1672                 return -EINVAL;
1673
1674         ca->policy_argv = (const char **)as->argv;
1675         dm_consume_args(as, ca->policy_argc);
1676
1677         return 0;
1678 }
1679
1680 static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
1681                             char **error)
1682 {
1683         int r;
1684         struct dm_arg_set as;
1685
1686         as.argc = argc;
1687         as.argv = argv;
1688
1689         r = parse_metadata_dev(ca, &as, error);
1690         if (r)
1691                 return r;
1692
1693         r = parse_cache_dev(ca, &as, error);
1694         if (r)
1695                 return r;
1696
1697         r = parse_origin_dev(ca, &as, error);
1698         if (r)
1699                 return r;
1700
1701         r = parse_block_size(ca, &as, error);
1702         if (r)
1703                 return r;
1704
1705         r = parse_features(ca, &as, error);
1706         if (r)
1707                 return r;
1708
1709         r = parse_policy(ca, &as, error);
1710         if (r)
1711                 return r;
1712
1713         return 0;
1714 }
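     /*
      * For illustration only: parse_cache_args() consumes the constructor
      * arguments in the order
      *
      *   <metadata dev> <cache dev> <origin dev> <block size>
      *   <#feature args> [<feature arg>]* <policy> <#policy args> [<key> <value>]*
      *
      * so a hypothetical table line (device names, sizes and policy name are
      * made up) might look like:
      *
      *   0 1048576000 cache /dev/sdb1 /dev/sdb2 /dev/sdc 512 1 writeback default 0
      */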
1715
1716 /*----------------------------------------------------------------*/
1717
1718 static struct kmem_cache *migration_cache;
1719
1720 static int set_config_values(struct dm_cache_policy *p, int argc, const char **argv)
1721 {
1722         int r = 0;
1723
1724         if (argc & 1) {
1725                 DMWARN("Odd number of policy arguments given; they should be <key> <value> pairs.");
1726                 return -EINVAL;
1727         }
1728
1729         while (argc) {
1730                 r = policy_set_config_value(p, argv[0], argv[1]);
1731                 if (r) {
1732                         DMWARN("policy_set_config_value failed: key = '%s', value = '%s'",
1733                                argv[0], argv[1]);
1734                         return r;
1735                 }
1736
1737                 argc -= 2;
1738                 argv += 2;
1739         }
1740
1741         return r;
1742 }
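     /*
      * For illustration: the <key> <value> pairs handed to
      * policy_set_config_value() are interpreted by the selected policy, so
      * the valid keys depend on the policy module; something like
      * "sequential_threshold 1024" is only an assumed example.
      */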
1743
1744 static int create_cache_policy(struct cache *cache, struct cache_args *ca,
1745                                char **error)
1746 {
1747         int r;
1748
1749         cache->policy = dm_cache_policy_create(ca->policy_name,
1750                                                cache->cache_size,
1751                                                cache->origin_sectors,
1752                                                cache->sectors_per_block);
1753         if (!cache->policy) {
1754                 *error = "Error creating cache's policy";
1755                 return -ENOMEM;
1756         }
1757
1758         r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv);
1759         if (r)
1760                 dm_cache_policy_destroy(cache->policy);
1761
1762         return r;
1763 }
1764
1765 /*
1766  * We want the discard block size to be a power of two, at least as large
1767  * as the cache block size, and to give no more than 2^14 discard blocks
1768  * across the origin.
1769  */
1770 #define MAX_DISCARD_BLOCKS (1 << 14)
1771
1772 static bool too_many_discard_blocks(sector_t discard_block_size,
1773                                     sector_t origin_size)
1774 {
1775         (void) sector_div(origin_size, discard_block_size);
1776
1777         return origin_size > MAX_DISCARD_BLOCKS;
1778 }
1779
1780 static sector_t calculate_discard_block_size(sector_t cache_block_size,
1781                                              sector_t origin_size)
1782 {
1783         sector_t discard_block_size;
1784
1785         discard_block_size = roundup_pow_of_two(cache_block_size);
1786
1787         if (origin_size)
1788                 while (too_many_discard_blocks(discard_block_size, origin_size))
1789                         discard_block_size *= 2;
1790
1791         return discard_block_size;
1792 }
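     /*
      * Worked example (illustrative values): with a 512 sector cache block
      * and a 2^30 sector (512 GiB) origin, we start at 512 sectors; 2^30 /
      * 512 = 2^21 discard blocks is too many, so the size is doubled until
      * 2^30 / 2^16 = 2^14 blocks, i.e. a discard block size of 65536 sectors
      * (32 MiB).
      */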
1793
1794 #define DEFAULT_MIGRATION_THRESHOLD (2048 * 100)
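     /*
      * Assuming the migration threshold is counted in 512-byte sectors, as
      * other sizes in this target are, the default of 2048 * 100 sectors
      * corresponds to roughly 100 MiB of migration I/O.
      */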
1795
1796 static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio);
1797
1798 static int cache_create(struct cache_args *ca, struct cache **result)
1799 {
1800         int r = 0;
1801         char **error = &ca->ti->error;
1802         struct cache *cache;
1803         struct dm_target *ti = ca->ti;
1804         dm_block_t origin_blocks;
1805         struct dm_cache_metadata *cmd;
1806         bool may_format = ca->features.mode == CM_WRITE;
1807
1808         cache = kzalloc(sizeof(*cache), GFP_KERNEL);
1809         if (!cache)
1810                 return -ENOMEM;
1811
1812         cache->ti = ca->ti;
1813         ti->private = cache;
1814         ti->per_bio_data_size = sizeof(struct per_bio_data);
1815         ti->num_flush_bios = 2;
1816         ti->flush_supported = true;
1817
1818         ti->num_discard_bios = 1;
1819         ti->discards_supported = true;
1820         ti->discard_zeroes_data_unsupported = true;
1821
1822         memcpy(&cache->features, &ca->features, sizeof(cache->features));
1823
1824         if (cache->features.write_through)
1825                 ti->num_write_bios = cache_num_write_bios;
1826
1827         cache->callbacks.congested_fn = cache_is_congested;
1828         dm_table_add_target_callbacks(ti->table, &cache->callbacks);
1829
1830         cache->metadata_dev = ca->metadata_dev;
1831         cache->origin_dev = ca->origin_dev;
1832         cache->cache_dev = ca->cache_dev;
1833
1834         ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
1835
1836         /* FIXME: factor out this whole section */
1837         origin_blocks = cache->origin_sectors = ca->origin_sectors;
1838         (void) sector_div(origin_blocks, ca->block_size);
1839         cache->origin_blocks = to_oblock(origin_blocks);
1840
1841         cache->sectors_per_block = ca->block_size;
1842         if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
1843                 r = -EINVAL;
1844                 goto bad;
1845         }
1846
1847         if (ca->block_size & (ca->block_size - 1)) {
1848                 dm_block_t cache_size = ca->cache_sectors;
1849
1850                 cache->sectors_per_block_shift = -1;
1851                 (void) sector_div(cache_size, ca->block_size);
1852                 cache->cache_size = to_cblock(cache_size);
1853         } else {
1854                 cache->sectors_per_block_shift = __ffs(ca->block_size);
1855                 cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
1856         }
1857
1858         r = create_cache_policy(cache, ca, error);
1859         if (r)
1860                 goto bad;
1861         cache->policy_nr_args = ca->policy_argc;
1862
1863         cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
1864                                      ca->block_size, may_format,
1865                                      dm_cache_policy_get_hint_size(cache->policy));
1866         if (IS_ERR(cmd)) {
1867                 *error = "Error creating metadata object";
1868                 r = PTR_ERR(cmd);
1869                 goto bad;
1870         }
1871         cache->cmd = cmd;
1872
1873         spin_lock_init(&cache->lock);
1874         bio_list_init(&cache->deferred_bios);
1875         bio_list_init(&cache->deferred_flush_bios);
1876         INIT_LIST_HEAD(&cache->quiesced_migrations);
1877         INIT_LIST_HEAD(&cache->completed_migrations);
1878         INIT_LIST_HEAD(&cache->need_commit_migrations);
1879         cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
1880         atomic_set(&cache->nr_migrations, 0);
1881         init_waitqueue_head(&cache->migration_wait);
1882
1883         cache->nr_dirty = 0;
1884         cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
1885         if (!cache->dirty_bitset) {
1886                 *error = "could not allocate dirty bitset";
                     r = -ENOMEM;
1887                 goto bad;
1888         }
1889         clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
1890
1891         cache->discard_block_size =
1892                 calculate_discard_block_size(cache->sectors_per_block,
1893                                              cache->origin_sectors);
1894         cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
1895         cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
1896         if (!cache->discard_bitset) {
1897                 *error = "could not allocate discard bitset";
                     r = -ENOMEM;
1898                 goto bad;
1899         }
1900         clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
1901
1902         cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
1903         if (IS_ERR(cache->copier)) {
1904                 *error = "could not create kcopyd client";
1905                 r = PTR_ERR(cache->copier);
1906                 goto bad;
1907         }
1908
1909         cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1910         if (!cache->wq) {
1911                 *error = "could not create workqueue for metadata object";
                     r = -ENOMEM;
1912                 goto bad;
1913         }
1914         INIT_WORK(&cache->worker, do_worker);
1915         INIT_DELAYED_WORK(&cache->waker, do_waker);
1916         cache->last_commit_jiffies = jiffies;
1917
1918         cache->prison = dm_bio_prison_create(PRISON_CELLS);
1919         if (!cache->prison) {
1920                 *error = "could not create bio prison";
                     r = -ENOMEM;
1921                 goto bad;
1922         }
1923
1924         cache->all_io_ds = dm_deferred_set_create();
1925         if (!cache->all_io_ds) {
1926                 *error = "could not create all_io deferred set";
                     r = -ENOMEM;
1927                 goto bad;
1928         }
1929
1930         cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
1931                                                          migration_cache);
1932         if (!cache->migration_pool) {
1933                 *error = "Error creating cache's migration mempool";
                     r = -ENOMEM;
1934                 goto bad;
1935         }
1936
1937         cache->next_migration = NULL;
1938
1939         cache->need_tick_bio = true;
1940         cache->sized = false;
1941         cache->quiescing = false;
1942         cache->commit_requested = false;
1943         cache->loaded_mappings = false;
1944         cache->loaded_discards = false;
1945
1946         load_stats(cache);
1947
1948         atomic_set(&cache->stats.demotion, 0);
1949         atomic_set(&cache->stats.promotion, 0);
1950         atomic_set(&cache->stats.copies_avoided, 0);
1951         atomic_set(&cache->stats.cache_cell_clash, 0);
1952         atomic_set(&cache->stats.commit_count, 0);
1953         atomic_set(&cache->stats.discard_count, 0);
1954
1955         *result = cache;
1956         return 0;
1957
1958 bad:
1959         destroy(cache);
1960         return r;
1961 }
1962
1963 static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
1964 {
1965         unsigned i;
1966         const char **copy;
1967
1968         copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
1969         if (!copy)
1970                 return -ENOMEM;
1971         for (i = 0; i < argc; i++) {
1972                 copy[i] = kstrdup(argv[i], GFP_KERNEL);
1973                 if (!copy[i]) {
1974                         while (i--)
1975                                 kfree(copy[i]);
1976                         kfree(copy);
1977                         return -ENOMEM;
1978                 }
1979         }
1980
1981         cache->nr_ctr_args = argc;
1982         cache->ctr_args = copy;
1983
1984         return 0;
1985 }
1986
1987 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
1988 {
1989         int r = -EINVAL;
1990         struct cache_args *ca;
1991         struct cache *cache = NULL;
1992
1993         ca = kzalloc(sizeof(*ca), GFP_KERNEL);
1994         if (!ca) {
1995                 ti->error = "Error allocating memory for cache";
1996                 return -ENOMEM;
1997         }
1998         ca->ti = ti;
1999
2000         r = parse_cache_args(ca, argc, argv, &ti->error);
2001         if (r)
2002                 goto out;
2003
2004         r = cache_create(ca, &cache);
             if (r)
                     goto out;
2005
2006         r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2007         if (r) {
2008                 destroy(cache);
2009                 goto out;
2010         }
2011
2012         ti->private = cache;
2013
2014 out:
2015         destroy_cache_args(ca);
2016         return r;
2017 }
2018
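     /*
      * Note (inferred from the logic below, not part of the original
      * comments): this hook is only installed for write-through caches (see
      * cache_create()).  A write to a clean, resident block must hit both the
      * cache and the origin device, so two bios are requested; a block that
      * is already dirty is written to the cache only, so one suffices.  If
      * the policy lookup fails we assume the worst and also ask for two.
      */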
2019 static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio)
2020 {
2021         int r;
2022         struct cache *cache = ti->private;
2023         dm_oblock_t block = get_bio_block(cache, bio);
2024         dm_cblock_t cblock;
2025
2026         r = policy_lookup(cache->policy, block, &cblock);
2027         if (r < 0)
2028                 return 2;       /* assume the worst */
2029
2030         return (!r && !is_dirty(cache, cblock)) ? 2 : 1;
2031 }
2032
2033 static int cache_map(struct dm_target *ti, struct bio *bio)
2034 {
2035         struct cache *cache = ti->private;
2036
2037         int r;
2038         dm_oblock_t block = get_bio_block(cache, bio);
2039         bool can_migrate = false;
2040         bool discarded_block;
2041         struct dm_bio_prison_cell *cell;
2042         struct policy_result lookup_result;
2043         struct per_bio_data *pb;
2044
2045         if (from_oblock(block) >= from_oblock(cache->origin_blocks)) {
2046                 /*
2047                  * This can only occur if the io goes to a partial block at
2048                  * the end of the origin device.  We don't cache these.
2049                  * Just remap to the origin and carry on.
2050                  */
2051                 remap_to_origin_clear_discard(cache, bio, block);
2052                 return DM_MAPIO_REMAPPED;
2053         }
2054
2055         pb = init_per_bio_data(bio);
2056
2057         if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
2058                 defer_bio(cache, bio);
2059                 return DM_MAPIO_SUBMITTED;
2060         }
2061
2062         /*
2063          * Check to see if that block is currently migrating.
2064          */
2065         cell = alloc_prison_cell(cache);
2066         if (!cell) {
2067                 defer_bio(cache, bio);
2068                 return DM_MAPIO_SUBMITTED;
2069         }
2070
2071         r = bio_detain(cache, block, bio, cell,
2072                        (cell_free_fn) free_prison_cell,
2073                        cache, &cell);
2074         if (r) {
2075                 if (r < 0)
2076                         defer_bio(cache, bio);
2077
2078                 return DM_MAPIO_SUBMITTED;
2079         }
2080
2081         discarded_block = is_discarded_oblock(cache, block);
2082
2083         r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
2084                        bio, &lookup_result);
2085         if (r == -EWOULDBLOCK) {
2086                 cell_defer(cache, cell, true);
2087                 return DM_MAPIO_SUBMITTED;
2088
2089         } else if (r) {
2090                 DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
2091                 bio_io_error(bio);
2092                 return DM_MAPIO_SUBMITTED;
2093         }
2094
2095         switch (lookup_result.op) {
2096         case POLICY_HIT:
2097                 inc_hit_counter(cache, bio);
2098                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2099
2100                 if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
2101                         /*
2102                          * No need to mark anything dirty in write through mode.
2103                          */
2104                         if (pb->req_nr == 0)
2105                                 remap_to_cache(cache, bio, lookup_result.cblock);
2106                         else
                                     remap_to_origin_clear_discard(cache, bio, block);
2107                         cell_defer(cache, cell, false);
2108                 } else {
2109                         remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2110                         cell_defer(cache, cell, false);
2111                 }
2112                 break;
2113
2114         case POLICY_MISS:
2115                 inc_miss_counter(cache, bio);
2116                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2117
2118                 if (pb->req_nr != 0) {
2119                         /*
2120                          * This is a duplicate writethrough io that is no
2121                          * longer needed because the block has been demoted.
2122                          */
2123                         bio_endio(bio, 0);
2124                         cell_defer(cache, cell, false);
2125                         return DM_MAPIO_SUBMITTED;
2126                 } else {
2127                         remap_to_origin_clear_discard(cache, bio, block);
2128                         cell_defer(cache, cell, false);
2129                 }
2130                 break;
2131
2132         default:
2133                 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
2134                             (unsigned) lookup_result.op);
2135                 bio_io_error(bio);
2136                 return DM_MAPIO_SUBMITTED;
2137         }
2138
2139         return DM_MAPIO_REMAPPED;
2140 }
2141
2142 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
2143 {
2144         struct cache *cache = ti->private;
2145         unsigned long flags;
2146         struct per_bio_data *pb = get_per_bio_data(bio);
2147
2148         if (pb->tick) {
2149                 policy_tick(cache->policy);
2150
2151                 spin_lock_irqsave(&cache->lock, flags);
2152                 cache->need_tick_bio = true;
2153                 spin_unlock_irqrestore(&cache->lock, flags);
2154         }
2155
2156         check_for_quiesced_migrations(cache, pb);
2157
2158         return 0;
2159 }
2160
2161 static int write_dirty_bitset(struct cache *cache)
2162 {
2163         unsigned i, r;
2164
2165         for (i = 0; i < from_cblock(cache->cache_size); i++) {
2166                 r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
2167                                        is_dirty(cache, to_cblock(i)));
2168                 if (r)
2169                         return r;
2170         }
2171
2172         return 0;
2173 }
2174
2175 static int write_discard_bitset(struct cache *cache)
2176 {
2177         unsigned i, r;
2178
2179         r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2180                                            cache->discard_nr_blocks);
2181         if (r) {
2182                 DMERR("could not resize on-disk discard bitset");
2183                 return r;
2184         }
2185
2186         for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2187                 r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2188                                          is_discarded(cache, to_dblock(i)));
2189                 if (r)
2190                         return r;
2191         }
2192
2193         return 0;
2194 }
2195
2196 static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
2197                      uint32_t hint)
2198 {
2199         struct cache *cache = context;
2200         return dm_cache_save_hint(cache->cmd, cblock, hint);
2201 }
2202
2203 static int write_hints(struct cache *cache)
2204 {
2205         int r;
2206
2207         r = dm_cache_begin_hints(cache->cmd, cache->policy);
2208         if (r) {
2209                 DMERR("dm_cache_begin_hints failed");
2210                 return r;
2211         }
2212
2213         r = policy_walk_mappings(cache->policy, save_hint, cache);
2214         if (r)
2215                 DMERR("policy_walk_mappings failed");
2216
2217         return r;
2218 }
2219
2220 /*
2221  * returns true on success
2222  */
2223 static bool sync_metadata(struct cache *cache)
2224 {
2225         int r1, r2, r3, r4;
2226
2227         r1 = write_dirty_bitset(cache);
2228         if (r1)
2229                 DMERR("could not write dirty bitset");
2230
2231         r2 = write_discard_bitset(cache);
2232         if (r2)
2233                 DMERR("could not write discard bitset");
2234
2235         save_stats(cache);
2236
2237         r3 = write_hints(cache);
2238         if (r3)
2239                 DMERR("could not write hints");
2240
2241         /*
2242          * If writing the above metadata failed, we still commit, but don't
2243          * set the clean shutdown flag.  This will effectively force every
2244          * dirty bit to be set on reload.
2245          */
2246         r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
2247         if (r4)
2248                 DMERR("could not write cache metadata.  Data loss may occur.");
2249
2250         return !r1 && !r2 && !r3 && !r4;
2251 }
2252
2253 static void cache_postsuspend(struct dm_target *ti)
2254 {
2255         struct cache *cache = ti->private;
2256
2257         start_quiescing(cache);
2258         wait_for_migrations(cache);
2259         stop_worker(cache);
2260         requeue_deferred_io(cache);
2261         stop_quiescing(cache);
2262
2263         (void) sync_metadata(cache);
2264 }
2265
2266 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2267                         bool dirty, uint32_t hint, bool hint_valid)
2268 {
2269         int r;
2270         struct cache *cache = context;
2271
2272         r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
2273         if (r)
2274                 return r;
2275
2276         if (dirty)
2277                 set_dirty(cache, oblock, cblock);
2278         else
2279                 clear_dirty(cache, oblock, cblock);
2280
2281         return 0;
2282 }
2283
2284 static int load_discard(void *context, sector_t discard_block_size,
2285                         dm_dblock_t dblock, bool discard)
2286 {
2287         struct cache *cache = context;
2288
2289         /* FIXME: handle mis-matched block size */
2290
2291         if (discard)
2292                 set_discard(cache, dblock);
2293         else
2294                 clear_discard(cache, dblock);
2295
2296         return 0;
2297 }
2298
2299 static int cache_preresume(struct dm_target *ti)
2300 {
2301         int r = 0;
2302         struct cache *cache = ti->private;
2303         sector_t actual_cache_size = get_dev_size(cache->cache_dev);
2304         (void) sector_div(actual_cache_size, cache->sectors_per_block);
2305
2306         /*
2307          * Check to see if the cache has resized.
2308          */
2309         if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) {
2310                 cache->cache_size = to_cblock(actual_cache_size);
2311
2312                 r = dm_cache_resize(cache->cmd, cache->cache_size);
2313                 if (r) {
2314                         DMERR("could not resize cache metadata");
2315                         return r;
2316                 }
2317
2318                 cache->sized = true;
2319         }
2320
2321         if (!cache->loaded_mappings) {
2322                 r = dm_cache_load_mappings(cache->cmd,
2323                                            dm_cache_policy_get_name(cache->policy),
2324                                            load_mapping, cache);
2325                 if (r) {
2326                         DMERR("could not load cache mappings");
2327                         return r;
2328                 }
2329
2330                 cache->loaded_mappings = true;
2331         }
2332
2333         if (!cache->loaded_discards) {
2334                 r = dm_cache_load_discards(cache->cmd, load_discard, cache);
2335                 if (r) {
2336                         DMERR("could not load origin discards");
2337                         return r;
2338                 }
2339
2340                 cache->loaded_discards = true;
2341         }
2342
2343         return r;
2344 }
2345
2346 static void cache_resume(struct dm_target *ti)
2347 {
2348         struct cache *cache = ti->private;
2349
2350         cache->need_tick_bio = true;
2351         do_waker(&cache->waker.work);
2352 }
2353
2354 /*
2355  * Status format:
2356  *
2357  * <#used metadata blocks>/<#total metadata blocks>
2358  * <#read hits> <#read misses> <#write hits> <#write misses>
2359  * <#demotions> <#promotions> <#blocks in cache> <#dirty>
2360  * <#features> <features>*
2361  * <#core args> <core args>
2362  * <#policy args> <policy args>*
2363  */
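     /*
      * An illustrative (made up) STATUSTYPE_INFO line following the format
      * above might look like:
      *
      *   23/4096 1769 62 3640 172 14 178 1024 0 1 writethrough 2 migration_threshold 204800
      *
      * with any policy config values appended by policy_emit_config_values().
      */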
2364 static void cache_status(struct dm_target *ti, status_type_t type,
2365                          unsigned status_flags, char *result, unsigned maxlen)
2366 {
2367         int r = 0;
2368         unsigned i;
2369         ssize_t sz = 0;
2370         dm_block_t nr_free_blocks_metadata = 0;
2371         dm_block_t nr_blocks_metadata = 0;
2372         char buf[BDEVNAME_SIZE];
2373         struct cache *cache = ti->private;
2374         dm_cblock_t residency;
2375
2376         switch (type) {
2377         case STATUSTYPE_INFO:
2378                 /* Commit to ensure statistics aren't out-of-date */
2379                 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
2380                         r = dm_cache_commit(cache->cmd, false);
2381                         if (r)
2382                                 DMERR("could not commit metadata for accurate status");
2383                 }
2384
2385                 r = dm_cache_get_free_metadata_block_count(cache->cmd,
2386                                                            &nr_free_blocks_metadata);
2387                 if (r) {
2388                         DMERR("could not get metadata free block count");
2389                         goto err;
2390                 }
2391
2392                 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
2393                 if (r) {
2394                         DMERR("could not get metadata device size");
2395                         goto err;
2396                 }
2397
2398                 residency = policy_residency(cache->policy);
2399
2400                 DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
2401                        (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2402                        (unsigned long long)nr_blocks_metadata,
2403                        (unsigned) atomic_read(&cache->stats.read_hit),
2404                        (unsigned) atomic_read(&cache->stats.read_miss),
2405                        (unsigned) atomic_read(&cache->stats.write_hit),
2406                        (unsigned) atomic_read(&cache->stats.write_miss),
2407                        (unsigned) atomic_read(&cache->stats.demotion),
2408                        (unsigned) atomic_read(&cache->stats.promotion),
2409                        (unsigned long long) from_cblock(residency),
2410                        cache->nr_dirty);
2411
2412                 if (cache->features.write_through)
2413                         DMEMIT("1 writethrough ");
2414                 else
2415                         DMEMIT("0 ");
2416
2417                 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
2418                 if (sz < maxlen) {
2419                         r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
2420                         if (r)
2421                                 DMERR("policy_emit_config_values returned %d", r);
2422                 }
2423
2424                 break;
2425
2426         case STATUSTYPE_TABLE:
2427                 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
2428                 DMEMIT("%s ", buf);
2429                 format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
2430                 DMEMIT("%s ", buf);
2431                 format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
2432                 DMEMIT("%s", buf);
2433
2434                 for (i = 0; i < cache->nr_ctr_args - 1; i++)
2435                         DMEMIT(" %s", cache->ctr_args[i]);
2436                 if (cache->nr_ctr_args)
2437                         DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
2438         }
2439
2440         return;
2441
2442 err:
2443         DMEMIT("Error");
2444 }
2445
2446 #define NOT_CORE_OPTION 1
2447
2448 static int process_config_option(struct cache *cache, char **argv)
2449 {
2450         unsigned long tmp;
2451
2452         if (!strcasecmp(argv[0], "migration_threshold")) {
2453                 if (kstrtoul(argv[1], 10, &tmp))
2454                         return -EINVAL;
2455
2456                 cache->migration_threshold = tmp;
2457                 return 0;
2458         }
2459
2460         return NOT_CORE_OPTION;
2461 }
2462
2463 /*
2464  * Supports <key> <value>.
2465  *
2466  * The key migration_threshold is supported by the cache target core.
2467  */
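     /*
      * For example (illustrative device name), the migration threshold could
      * be changed at runtime with:
      *
      *   dmsetup message my_cache 0 migration_threshold 204800
      */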
2468 static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
2469 {
2470         int r;
2471         struct cache *cache = ti->private;
2472
2473         if (argc != 2)
2474                 return -EINVAL;
2475
2476         r = process_config_option(cache, argv);
2477         if (r == NOT_CORE_OPTION)
2478                 return policy_set_config_value(cache->policy, argv[0], argv[1]);
2479
2480         return r;
2481 }
2482
2483 static int cache_iterate_devices(struct dm_target *ti,
2484                                  iterate_devices_callout_fn fn, void *data)
2485 {
2486         int r = 0;
2487         struct cache *cache = ti->private;
2488
2489         r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
2490         if (!r)
2491                 r = fn(ti, cache->origin_dev, 0, ti->len, data);
2492
2493         return r;
2494 }
2495
2496 /*
2497  * We assume I/O is going to the origin (which is the volume
2498  * more likely to have restrictions e.g. by being striped).
2499  * (Looking up the exact location of the data would be expensive
2500  * and could always be out of date by the time the bio is submitted.)
2501  */
2502 static int cache_bvec_merge(struct dm_target *ti,
2503                             struct bvec_merge_data *bvm,
2504                             struct bio_vec *biovec, int max_size)
2505 {
2506         struct cache *cache = ti->private;
2507         struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
2508
2509         if (!q->merge_bvec_fn)
2510                 return max_size;
2511
2512         bvm->bi_bdev = cache->origin_dev->bdev;
2513         return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2514 }
2515
2516 static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
2517 {
2518         /*
2519          * FIXME: these limits may be incompatible with the cache device
2520          */
2521         limits->max_discard_sectors = cache->discard_block_size * 1024;
2522         limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
2523 }
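     /*
      * Worked example (illustrative values): with a discard_block_size of
      * 1024 sectors this advertises a discard granularity of 1024 << 9 =
      * 512 KiB and a maximum discard of 1024 * 1024 sectors (512 MiB) per bio.
      */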
2524
2525 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
2526 {
2527         struct cache *cache = ti->private;
2528
2529         blk_limits_io_min(limits, 0);
2530         blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
2531         set_discard_limits(cache, limits);
2532 }
2533
2534 /*----------------------------------------------------------------*/
2535
2536 static struct target_type cache_target = {
2537         .name = "cache",
2538         .version = {1, 0, 0},
2539         .module = THIS_MODULE,
2540         .ctr = cache_ctr,
2541         .dtr = cache_dtr,
2542         .map = cache_map,
2543         .end_io = cache_end_io,
2544         .postsuspend = cache_postsuspend,
2545         .preresume = cache_preresume,
2546         .resume = cache_resume,
2547         .status = cache_status,
2548         .message = cache_message,
2549         .iterate_devices = cache_iterate_devices,
2550         .merge = cache_bvec_merge,
2551         .io_hints = cache_io_hints,
2552 };
2553
2554 static int __init dm_cache_init(void)
2555 {
2556         int r;
2557
2558         r = dm_register_target(&cache_target);
2559         if (r) {
2560                 DMERR("cache target registration failed: %d", r);
2561                 return r;
2562         }
2563
2564         migration_cache = KMEM_CACHE(dm_cache_migration, 0);
2565         if (!migration_cache) {
2566                 dm_unregister_target(&cache_target);
2567                 return -ENOMEM;
2568         }
2569
2570         return 0;
2571 }
2572
2573 static void __exit dm_cache_exit(void)
2574 {
2575         dm_unregister_target(&cache_target);
2576         kmem_cache_destroy(migration_cache);
2577 }
2578
2579 module_init(dm_cache_init);
2580 module_exit(dm_cache_exit);
2581
2582 MODULE_DESCRIPTION(DM_NAME " cache target");
2583 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
2584 MODULE_LICENSE("GPL");