[karo-tx-linux.git] drivers/md/dm-cache-target.c
1 /*
2  * Copyright (C) 2012 Red Hat. All rights reserved.
3  *
4  * This file is released under the GPL.
5  */
6
7 #include "dm.h"
8 #include "dm-bio-prison.h"
9 #include "dm-bio-record.h"
10 #include "dm-cache-metadata.h"
11
12 #include <linux/dm-io.h>
13 #include <linux/dm-kcopyd.h>
14 #include <linux/jiffies.h>
15 #include <linux/init.h>
16 #include <linux/mempool.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/vmalloc.h>
20
21 #define DM_MSG_PREFIX "cache"
22
23 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
24         "A percentage of time allocated for copying to and/or from cache");
25
26 /*----------------------------------------------------------------*/
27
28 #define IOT_RESOLUTION 4
29
30 struct io_tracker {
31         spinlock_t lock;
32
33         /*
34          * Sectors of in-flight IO.
35          */
36         sector_t in_flight;
37
38         /*
39          * The time, in jiffies, when this device became idle (if it is
40          * indeed idle).
41          */
42         unsigned long idle_time;
43         unsigned long last_update_time;
44 };
45
46 static void iot_init(struct io_tracker *iot)
47 {
48         spin_lock_init(&iot->lock);
49         iot->in_flight = 0ul;
50         iot->idle_time = 0ul;
51         iot->last_update_time = jiffies;
52 }
53
54 static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
55 {
56         if (iot->in_flight)
57                 return false;
58
59         return time_after(jiffies, iot->idle_time + jifs);
60 }
61
62 static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
63 {
64         bool r;
65         unsigned long flags;
66
67         spin_lock_irqsave(&iot->lock, flags);
68         r = __iot_idle_for(iot, jifs);
69         spin_unlock_irqrestore(&iot->lock, flags);
70
71         return r;
72 }
73
74 static void iot_io_begin(struct io_tracker *iot, sector_t len)
75 {
76         unsigned long flags;
77
78         spin_lock_irqsave(&iot->lock, flags);
79         iot->in_flight += len;
80         spin_unlock_irqrestore(&iot->lock, flags);
81 }
82
83 static void __iot_io_end(struct io_tracker *iot, sector_t len)
84 {
85         iot->in_flight -= len;
86         if (!iot->in_flight)
87                 iot->idle_time = jiffies;
88 }
89
90 static void iot_io_end(struct io_tracker *iot, sector_t len)
91 {
92         unsigned long flags;
93
94         spin_lock_irqsave(&iot->lock, flags);
95         __iot_io_end(iot, len);
96         spin_unlock_irqrestore(&iot->lock, flags);
97 }
98
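/*
 * Illustrative sketch (not part of the driver): callers bracket each
 * accounted I/O with iot_io_begin()/iot_io_end() and can then ask whether
 * the device has gone quiet.  The HZ threshold and do_background_work()
 * below are hypothetical, for the example only.
 *
 *	iot_io_begin(&cache->origin_tracker, bio_sectors(bio));
 *	... submit the bio and wait for it to complete ...
 *	iot_io_end(&cache->origin_tracker, bio_sectors(bio));
 *
 *	if (iot_idle_for(&cache->origin_tracker, HZ))
 *		do_background_work(cache);	/* idle for at least a second */
 */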
99 /*----------------------------------------------------------------*/
100
101 /*
102  * Glossary:
103  *
104  * oblock: index of an origin block
105  * cblock: index of a cache block
106  * promotion: movement of a block from origin to cache
107  * demotion: movement of a block from cache to origin
108  * migration: movement of a block between the origin and cache device,
109  *            either direction
110  */
111
112 /*----------------------------------------------------------------*/
113
114 /*
115  * There are a couple of places where we let a bio run, but want to do some
116  * work before calling its endio function.  We do this by temporarily
117  * changing the endio fn.
118  */
119 struct dm_hook_info {
120         bio_end_io_t *bi_end_io;
121 };
122
123 static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
124                         bio_end_io_t *bi_end_io, void *bi_private)
125 {
126         h->bi_end_io = bio->bi_end_io;
127
128         bio->bi_end_io = bi_end_io;
129         bio->bi_private = bi_private;
130 }
131
132 static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
133 {
134         bio->bi_end_io = h->bi_end_io;
135 }
136
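/*
 * Usage sketch (illustrative only; my_endio() and my_context are
 * hypothetical): hook the bio before issuing it so our endio runs first,
 * then unhook inside that endio before completing or re-issuing the bio.
 *
 *	dm_hook_bio(&pb->hook_info, bio, my_endio, my_context);
 *	generic_make_request(bio);
 *
 *	static void my_endio(struct bio *bio)
 *	{
 *		struct per_bio_data *pb = get_per_bio_data(bio, ...);
 *
 *		dm_unhook_bio(&pb->hook_info, bio);
 *		... extra work ...
 *		bio_endio(bio);
 *	}
 *
 * writethrough_endio() below follows exactly this pattern.
 */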
137 /*----------------------------------------------------------------*/
138
139 #define MIGRATION_POOL_SIZE 128
140 #define COMMIT_PERIOD HZ
141 #define MIGRATION_COUNT_WINDOW 10
142
143 /*
144  * The block size of the device holding cache data must be
145  * between 32KB and 1GB.
146  */
147 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
148 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
149
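/*
 * Worked example of the limits above: with SECTOR_SHIFT == 9 (512 byte
 * sectors), 32KB is 32 * 1024 >> 9 = 64 sectors and 1GB is
 * 1024 * 1024 * 1024 >> 9 = 2097152 sectors.
 */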
150 enum cache_metadata_mode {
151         CM_WRITE,               /* metadata may be changed */
152         CM_READ_ONLY,           /* metadata may not be changed */
153         CM_FAIL
154 };
155
156 enum cache_io_mode {
157         /*
158          * Data is written to cached blocks only.  These blocks are marked
159          * dirty.  If you lose the cache device you will lose data.
160          * Potential performance increase for both reads and writes.
161          */
162         CM_IO_WRITEBACK,
163
164         /*
165          * Data is written to both cache and origin.  Blocks are never
164  * dirty.  Potential performance benefit for reads only.
167          */
168         CM_IO_WRITETHROUGH,
169
170         /*
171          * A degraded mode useful for various cache coherency situations
172          * (eg, rolling back snapshots).  Reads and writes always go to the
173          * origin.  If a write goes to a cached oblock, then the cache
174          * block is invalidated.
175          */
176         CM_IO_PASSTHROUGH
177 };
178
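/*
 * The io mode is chosen with a feature argument on the target line (the
 * table format is described in Documentation/device-mapper/cache.txt;
 * the device names and sizes below are made up for illustration):
 *
 *	dmsetup create cached --table \
 *	  "0 41943040 cache /dev/mapper/meta /dev/mapper/fast /dev/mapper/slow \
 *	   512 1 writethrough default 0"
 *
 * With no feature arguments ("0") the mode defaults to writeback.
 */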
179 struct cache_features {
180         enum cache_metadata_mode mode;
181         enum cache_io_mode io_mode;
182 };
183
184 struct cache_stats {
185         atomic_t read_hit;
186         atomic_t read_miss;
187         atomic_t write_hit;
188         atomic_t write_miss;
189         atomic_t demotion;
190         atomic_t promotion;
191         atomic_t copies_avoided;
192         atomic_t cache_cell_clash;
193         atomic_t commit_count;
194         atomic_t discard_count;
195 };
196
197 /*
198  * Defines a range of cblocks: begin to (end - 1) are in the range.  end is
199  * the one-past-the-end value.
200  */
201 struct cblock_range {
202         dm_cblock_t begin;
203         dm_cblock_t end;
204 };
205
206 struct invalidation_request {
207         struct list_head list;
208         struct cblock_range *cblocks;
209
210         atomic_t complete;
211         int err;
212
213         wait_queue_head_t result_wait;
214 };
215
216 struct cache {
217         struct dm_target *ti;
218         struct dm_target_callbacks callbacks;
219
220         struct dm_cache_metadata *cmd;
221
222         /*
223          * Metadata is written to this device.
224          */
225         struct dm_dev *metadata_dev;
226
227         /*
228          * The slower of the two data devices.  Typically a spindle.
229          */
230         struct dm_dev *origin_dev;
231
232         /*
233          * The faster of the two data devices.  Typically an SSD.
234          */
235         struct dm_dev *cache_dev;
236
237         /*
238          * Size of the origin device in _complete_ blocks and native sectors.
239          */
240         dm_oblock_t origin_blocks;
241         sector_t origin_sectors;
242
243         /*
244          * Size of the cache device in blocks.
245          */
246         dm_cblock_t cache_size;
247
248         /*
249          * Fields for converting from sectors to blocks.
250          */
251         uint32_t sectors_per_block;
252         int sectors_per_block_shift;
253
254         spinlock_t lock;
255         struct list_head deferred_cells;
256         struct bio_list deferred_bios;
257         struct bio_list deferred_flush_bios;
258         struct bio_list deferred_writethrough_bios;
259         struct list_head quiesced_migrations;
260         struct list_head completed_migrations;
261         struct list_head need_commit_migrations;
262         sector_t migration_threshold;
263         wait_queue_head_t migration_wait;
264         atomic_t nr_allocated_migrations;
265
266         /*
267          * The number of in flight migrations that are performing
268          * background io. eg, promotion, writeback.
269          */
270         atomic_t nr_io_migrations;
271
272         wait_queue_head_t quiescing_wait;
273         atomic_t quiescing;
274         atomic_t quiescing_ack;
275
276         /*
277          * cache_size entries, dirty if set
278          */
279         atomic_t nr_dirty;
280         unsigned long *dirty_bitset;
281
282         /*
283          * origin_blocks entries, discarded if set.
284          */
285         dm_dblock_t discard_nr_blocks;
286         unsigned long *discard_bitset;
287         uint32_t discard_block_size; /* a power of 2 times sectors per block */
288
289         /*
290          * Rather than reconstructing the table line for the status we just
291          * save it and regurgitate.
292          */
293         unsigned nr_ctr_args;
294         const char **ctr_args;
295
296         struct dm_kcopyd_client *copier;
297         struct workqueue_struct *wq;
298         struct work_struct worker;
299
300         struct delayed_work waker;
301         unsigned long last_commit_jiffies;
302
303         struct dm_bio_prison *prison;
304         struct dm_deferred_set *all_io_ds;
305
306         mempool_t *migration_pool;
307
308         struct dm_cache_policy *policy;
309         unsigned policy_nr_args;
310
311         bool need_tick_bio:1;
312         bool sized:1;
313         bool invalidate:1;
314         bool commit_requested:1;
315         bool loaded_mappings:1;
316         bool loaded_discards:1;
317
318         /*
319          * Cache features such as write-through.
320          */
321         struct cache_features features;
322
323         struct cache_stats stats;
324
325         /*
326          * Invalidation fields.
327          */
328         spinlock_t invalidation_lock;
329         struct list_head invalidation_requests;
330
331         struct io_tracker origin_tracker;
332 };
333
334 struct per_bio_data {
335         bool tick:1;
336         unsigned req_nr:2;
337         struct dm_deferred_entry *all_io_entry;
338         struct dm_hook_info hook_info;
339         sector_t len;
340
341         /*
342          * writethrough fields.  These MUST remain at the end of this
343          * structure and the 'cache' member must be the first as it
344          * is used to determine the offset of the writethrough fields.
345          */
346         struct cache *cache;
347         dm_cblock_t cblock;
348         struct dm_bio_details bio_details;
349 };
350
351 struct dm_cache_migration {
352         struct list_head list;
353         struct cache *cache;
354
355         unsigned long start_jiffies;
356         dm_oblock_t old_oblock;
357         dm_oblock_t new_oblock;
358         dm_cblock_t cblock;
359
360         bool err:1;
361         bool discard:1;
362         bool writeback:1;
363         bool demote:1;
364         bool promote:1;
365         bool requeue_holder:1;
366         bool invalidate:1;
367
368         struct dm_bio_prison_cell *old_ocell;
369         struct dm_bio_prison_cell *new_ocell;
370 };
371
372 /*
373  * Processing a bio in the worker thread may require these memory
374  * allocations.  We prealloc to avoid deadlocks (the same worker thread
375  * frees them back to the mempool).
376  */
377 struct prealloc {
378         struct dm_cache_migration *mg;
379         struct dm_bio_prison_cell *cell1;
380         struct dm_bio_prison_cell *cell2;
381 };
382
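/*
 * Illustrative usage (a hedged sketch; the worker further down does the
 * real thing): fill the prealloc struct before it is needed, pull
 * structures out as required, then hand back whatever is left over.
 *
 *	struct prealloc structs;
 *
 *	memset(&structs, 0, sizeof(structs));
 *	if (prealloc_data_structs(cache, &structs))
 *		return;		/* out of memory, retry later */
 *
 *	cell = prealloc_get_cell(&structs);
 *	... detain the bio, maybe start a migration ...
 *	prealloc_free_structs(cache, &structs);
 */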
383 static enum cache_metadata_mode get_cache_mode(struct cache *cache);
384
385 static void wake_worker(struct cache *cache)
386 {
387         queue_work(cache->wq, &cache->worker);
388 }
389
390 /*----------------------------------------------------------------*/
391
392 static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
393 {
394         /* FIXME: change to use a local slab. */
395         return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
396 }
397
398 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
399 {
400         dm_bio_prison_free_cell(cache->prison, cell);
401 }
402
403 static struct dm_cache_migration *alloc_migration(struct cache *cache)
404 {
405         struct dm_cache_migration *mg;
406
407         mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
408         if (mg) {
409                 mg->cache = cache;
410                 atomic_inc(&mg->cache->nr_allocated_migrations);
411         }
412
413         return mg;
414 }
415
416 static void free_migration(struct dm_cache_migration *mg)
417 {
418         struct cache *cache = mg->cache;
419
420         if (atomic_dec_and_test(&cache->nr_allocated_migrations))
421                 wake_up(&cache->migration_wait);
422
423         mempool_free(mg, cache->migration_pool);
424 }
425
426 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
427 {
428         if (!p->mg) {
429                 p->mg = alloc_migration(cache);
430                 if (!p->mg)
431                         return -ENOMEM;
432         }
433
434         if (!p->cell1) {
435                 p->cell1 = alloc_prison_cell(cache);
436                 if (!p->cell1)
437                         return -ENOMEM;
438         }
439
440         if (!p->cell2) {
441                 p->cell2 = alloc_prison_cell(cache);
442                 if (!p->cell2)
443                         return -ENOMEM;
444         }
445
446         return 0;
447 }
448
449 static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
450 {
451         if (p->cell2)
452                 free_prison_cell(cache, p->cell2);
453
454         if (p->cell1)
455                 free_prison_cell(cache, p->cell1);
456
457         if (p->mg)
458                 free_migration(p->mg);
459 }
460
461 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
462 {
463         struct dm_cache_migration *mg = p->mg;
464
465         BUG_ON(!mg);
466         p->mg = NULL;
467
468         return mg;
469 }
470
471 /*
472  * You must have a cell within the prealloc struct to return.  If not, this
473  * function will BUG() rather than returning NULL.
474  */
475 static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
476 {
477         struct dm_bio_prison_cell *r = NULL;
478
479         if (p->cell1) {
480                 r = p->cell1;
481                 p->cell1 = NULL;
482
483         } else if (p->cell2) {
484                 r = p->cell2;
485                 p->cell2 = NULL;
486         } else
487                 BUG();
488
489         return r;
490 }
491
492 /*
493  * You can't have more than two cells in a prealloc struct.  BUG() will be
494  * called if you try to overfill.
495  */
496 static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
497 {
498         if (!p->cell2)
499                 p->cell2 = cell;
500
501         else if (!p->cell1)
502                 p->cell1 = cell;
503
504         else
505                 BUG();
506 }
507
508 /*----------------------------------------------------------------*/
509
510 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
511 {
512         key->virtual = 0;
513         key->dev = 0;
514         key->block_begin = from_oblock(begin);
515         key->block_end = from_oblock(end);
516 }
517
518 /*
519  * The caller hands in a preallocated cell, and a free function for it.
520  * The cell will be freed if there's an error, or if it wasn't used because
521  * a cell with that key already exists.
522  */
523 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
524
525 static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
526                             struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
527                             cell_free_fn free_fn, void *free_context,
528                             struct dm_bio_prison_cell **cell_result)
529 {
530         int r;
531         struct dm_cell_key key;
532
533         build_key(oblock_begin, oblock_end, &key);
534         r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
535         if (r)
536                 free_fn(free_context, cell_prealloc);
537
538         return r;
539 }
540
541 static int bio_detain(struct cache *cache, dm_oblock_t oblock,
542                       struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
543                       cell_free_fn free_fn, void *free_context,
544                       struct dm_bio_prison_cell **cell_result)
545 {
546         dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
547         return bio_detain_range(cache, oblock, end, bio,
548                                 cell_prealloc, free_fn, free_context, cell_result);
549 }
550
551 static int get_cell(struct cache *cache,
552                     dm_oblock_t oblock,
553                     struct prealloc *structs,
554                     struct dm_bio_prison_cell **cell_result)
555 {
556         int r;
557         struct dm_cell_key key;
558         struct dm_bio_prison_cell *cell_prealloc;
559
560         cell_prealloc = prealloc_get_cell(structs);
561
562         build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
563         r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
564         if (r)
565                 prealloc_put_cell(structs, cell_prealloc);
566
567         return r;
568 }
569
570 /*----------------------------------------------------------------*/
571
572 static bool is_dirty(struct cache *cache, dm_cblock_t b)
573 {
574         return test_bit(from_cblock(b), cache->dirty_bitset);
575 }
576
577 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
578 {
579         if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
580                 atomic_inc(&cache->nr_dirty);
581                 policy_set_dirty(cache->policy, oblock);
582         }
583 }
584
585 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
586 {
587         if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
588                 policy_clear_dirty(cache->policy, oblock);
589                 if (atomic_dec_return(&cache->nr_dirty) == 0)
590                         dm_table_event(cache->ti->table);
591         }
592 }
593
594 /*----------------------------------------------------------------*/
595
596 static bool block_size_is_power_of_two(struct cache *cache)
597 {
598         return cache->sectors_per_block_shift >= 0;
599 }
600
601 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
602 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
603 __always_inline
604 #endif
605 static dm_block_t block_div(dm_block_t b, uint32_t n)
606 {
607         do_div(b, n);
608
609         return b;
610 }
611
612 static dm_block_t oblocks_per_dblock(struct cache *cache)
613 {
614         dm_block_t oblocks = cache->discard_block_size;
615
616         if (block_size_is_power_of_two(cache))
617                 oblocks >>= cache->sectors_per_block_shift;
618         else
619                 oblocks = block_div(oblocks, cache->sectors_per_block);
620
621         return oblocks;
622 }
623
624 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
625 {
626         return to_dblock(block_div(from_oblock(oblock),
627                                    oblocks_per_dblock(cache)));
628 }
629
630 static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
631 {
632         return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
633 }
634
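/*
 * Worked example (illustrative numbers): with a discard_block_size of
 * 2048 sectors and 128 sectors per cache block, oblocks_per_dblock() is
 * 2048 / 128 = 16, so oblock 100 maps to dblock 100 / 16 = 6, and dblock
 * 6 maps back to oblock 6 * 16 = 96 (the first oblock it covers).
 */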
635 static void set_discard(struct cache *cache, dm_dblock_t b)
636 {
637         unsigned long flags;
638
639         BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
640         atomic_inc(&cache->stats.discard_count);
641
642         spin_lock_irqsave(&cache->lock, flags);
643         set_bit(from_dblock(b), cache->discard_bitset);
644         spin_unlock_irqrestore(&cache->lock, flags);
645 }
646
647 static void clear_discard(struct cache *cache, dm_dblock_t b)
648 {
649         unsigned long flags;
650
651         spin_lock_irqsave(&cache->lock, flags);
652         clear_bit(from_dblock(b), cache->discard_bitset);
653         spin_unlock_irqrestore(&cache->lock, flags);
654 }
655
656 static bool is_discarded(struct cache *cache, dm_dblock_t b)
657 {
658         int r;
659         unsigned long flags;
660
661         spin_lock_irqsave(&cache->lock, flags);
662         r = test_bit(from_dblock(b), cache->discard_bitset);
663         spin_unlock_irqrestore(&cache->lock, flags);
664
665         return r;
666 }
667
668 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
669 {
670         int r;
671         unsigned long flags;
672
673         spin_lock_irqsave(&cache->lock, flags);
674         r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
675                      cache->discard_bitset);
676         spin_unlock_irqrestore(&cache->lock, flags);
677
678         return r;
679 }
680
681 /*----------------------------------------------------------------*/
682
683 static void load_stats(struct cache *cache)
684 {
685         struct dm_cache_statistics stats;
686
687         dm_cache_metadata_get_stats(cache->cmd, &stats);
688         atomic_set(&cache->stats.read_hit, stats.read_hits);
689         atomic_set(&cache->stats.read_miss, stats.read_misses);
690         atomic_set(&cache->stats.write_hit, stats.write_hits);
691         atomic_set(&cache->stats.write_miss, stats.write_misses);
692 }
693
694 static void save_stats(struct cache *cache)
695 {
696         struct dm_cache_statistics stats;
697
698         if (get_cache_mode(cache) >= CM_READ_ONLY)
699                 return;
700
701         stats.read_hits = atomic_read(&cache->stats.read_hit);
702         stats.read_misses = atomic_read(&cache->stats.read_miss);
703         stats.write_hits = atomic_read(&cache->stats.write_hit);
704         stats.write_misses = atomic_read(&cache->stats.write_miss);
705
706         dm_cache_metadata_set_stats(cache->cmd, &stats);
707 }
708
709 /*----------------------------------------------------------------
710  * Per bio data
711  *--------------------------------------------------------------*/
712
713 /*
714  * If using writeback, leave out struct per_bio_data's writethrough fields.
715  */
716 #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
717 #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
718
719 static bool writethrough_mode(struct cache_features *f)
720 {
721         return f->io_mode == CM_IO_WRITETHROUGH;
722 }
723
724 static bool writeback_mode(struct cache_features *f)
725 {
726         return f->io_mode == CM_IO_WRITEBACK;
727 }
728
729 static bool passthrough_mode(struct cache_features *f)
730 {
731         return f->io_mode == CM_IO_PASSTHROUGH;
732 }
733
734 static size_t get_per_bio_data_size(struct cache *cache)
735 {
736         return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
737 }
738
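/*
 * Example of the size selection above: in writeback or passthrough mode
 * only the fields before 'cache' are allocated per bio
 * (PB_DATA_SIZE_WB == offsetof(struct per_bio_data, cache)), whereas
 * writethrough mode allocates the whole struct so writethrough_endio()
 * can find the cache pointer, cblock and saved bio details.
 */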
739 static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
740 {
741         struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
742         BUG_ON(!pb);
743         return pb;
744 }
745
746 static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
747 {
748         struct per_bio_data *pb = get_per_bio_data(bio, data_size);
749
750         pb->tick = false;
751         pb->req_nr = dm_bio_get_target_bio_nr(bio);
752         pb->all_io_entry = NULL;
753         pb->len = 0;
754
755         return pb;
756 }
757
758 /*----------------------------------------------------------------
759  * Remapping
760  *--------------------------------------------------------------*/
761 static void remap_to_origin(struct cache *cache, struct bio *bio)
762 {
763         bio->bi_bdev = cache->origin_dev->bdev;
764 }
765
766 static void remap_to_cache(struct cache *cache, struct bio *bio,
767                            dm_cblock_t cblock)
768 {
769         sector_t bi_sector = bio->bi_iter.bi_sector;
770         sector_t block = from_cblock(cblock);
771
772         bio->bi_bdev = cache->cache_dev->bdev;
773         if (!block_size_is_power_of_two(cache))
774                 bio->bi_iter.bi_sector =
775                         (block * cache->sectors_per_block) +
776                         sector_div(bi_sector, cache->sectors_per_block);
777         else
778                 bio->bi_iter.bi_sector =
779                         (block << cache->sectors_per_block_shift) |
780                         (bi_sector & (cache->sectors_per_block - 1));
781 }
782
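/*
 * Worked example for remap_to_cache() (illustrative numbers): with 128
 * sectors per cache block, cblock 10 and bi_sector 1234, the offset
 * within the block is 1234 % 128 = 82, so the bio is remapped to sector
 * 10 * 128 + 82 = 1362 on the cache device.  The power-of-two branch
 * computes the same value with a shift and a mask.
 */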
783 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
784 {
785         unsigned long flags;
786         size_t pb_data_size = get_per_bio_data_size(cache);
787         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
788
789         spin_lock_irqsave(&cache->lock, flags);
790         if (cache->need_tick_bio &&
791             !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
792                 pb->tick = true;
793                 cache->need_tick_bio = false;
794         }
795         spin_unlock_irqrestore(&cache->lock, flags);
796 }
797
798 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
799                                   dm_oblock_t oblock)
800 {
801         check_if_tick_bio_needed(cache, bio);
802         remap_to_origin(cache, bio);
803         if (bio_data_dir(bio) == WRITE)
804                 clear_discard(cache, oblock_to_dblock(cache, oblock));
805 }
806
807 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
808                                  dm_oblock_t oblock, dm_cblock_t cblock)
809 {
810         check_if_tick_bio_needed(cache, bio);
811         remap_to_cache(cache, bio, cblock);
812         if (bio_data_dir(bio) == WRITE) {
813                 set_dirty(cache, oblock, cblock);
814                 clear_discard(cache, oblock_to_dblock(cache, oblock));
815         }
816 }
817
818 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
819 {
820         sector_t block_nr = bio->bi_iter.bi_sector;
821
822         if (!block_size_is_power_of_two(cache))
823                 (void) sector_div(block_nr, cache->sectors_per_block);
824         else
825                 block_nr >>= cache->sectors_per_block_shift;
826
827         return to_oblock(block_nr);
828 }
829
830 static int bio_triggers_commit(struct cache *cache, struct bio *bio)
831 {
832         return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
833 }
834
835 /*
836  * You must increment the deferred set whilst the prison cell is held.  To
837  * encourage this, we ask for 'cell' to be passed in.
838  */
839 static void inc_ds(struct cache *cache, struct bio *bio,
840                    struct dm_bio_prison_cell *cell)
841 {
842         size_t pb_data_size = get_per_bio_data_size(cache);
843         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
844
845         BUG_ON(!cell);
846         BUG_ON(pb->all_io_entry);
847
848         pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
849 }
850
851 static bool accountable_bio(struct cache *cache, struct bio *bio)
852 {
853         return ((bio->bi_bdev == cache->origin_dev->bdev) &&
854                 !(bio->bi_rw & REQ_DISCARD));
855 }
856
857 static void accounted_begin(struct cache *cache, struct bio *bio)
858 {
859         size_t pb_data_size = get_per_bio_data_size(cache);
860         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
861
862         if (accountable_bio(cache, bio)) {
863                 pb->len = bio_sectors(bio);
864                 iot_io_begin(&cache->origin_tracker, pb->len);
865         }
866 }
867
868 static void accounted_complete(struct cache *cache, struct bio *bio)
869 {
870         size_t pb_data_size = get_per_bio_data_size(cache);
871         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
872
873         iot_io_end(&cache->origin_tracker, pb->len);
874 }
875
876 static void accounted_request(struct cache *cache, struct bio *bio)
877 {
878         accounted_begin(cache, bio);
879         generic_make_request(bio);
880 }
881
882 static void issue(struct cache *cache, struct bio *bio)
883 {
884         unsigned long flags;
885
886         if (!bio_triggers_commit(cache, bio)) {
887                 accounted_request(cache, bio);
888                 return;
889         }
890
891         /*
892          * Batch together any bios that trigger commits and then issue a
893          * single commit for them in do_worker().
894          */
895         spin_lock_irqsave(&cache->lock, flags);
896         cache->commit_requested = true;
897         bio_list_add(&cache->deferred_flush_bios, bio);
898         spin_unlock_irqrestore(&cache->lock, flags);
899 }
900
901 static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
902 {
903         inc_ds(cache, bio, cell);
904         issue(cache, bio);
905 }
906
907 static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
908 {
909         unsigned long flags;
910
911         spin_lock_irqsave(&cache->lock, flags);
912         bio_list_add(&cache->deferred_writethrough_bios, bio);
913         spin_unlock_irqrestore(&cache->lock, flags);
914
915         wake_worker(cache);
916 }
917
918 static void writethrough_endio(struct bio *bio)
919 {
920         struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
921
922         dm_unhook_bio(&pb->hook_info, bio);
923
924         if (bio->bi_error) {
925                 bio_endio(bio);
926                 return;
927         }
928
929         dm_bio_restore(&pb->bio_details, bio);
930         remap_to_cache(pb->cache, bio, pb->cblock);
931
932         /*
933          * We can't issue this bio directly, since we're in interrupt
934          * context.  So it gets put on a bio list for processing by the
935          * worker thread.
936          */
937         defer_writethrough_bio(pb->cache, bio);
938 }
939
940 /*
941  * When running in writethrough mode we need to send writes to clean blocks
942  * to both the cache and origin devices.  In future we'd like to clone the
943  * bio and send the copies in parallel, but for now we issue them in
944  * series as this is easier.
945  */
946 static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
947                                        dm_oblock_t oblock, dm_cblock_t cblock)
948 {
949         struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
950
951         pb->cache = cache;
952         pb->cblock = cblock;
953         dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
954         dm_bio_record(&pb->bio_details, bio);
955
956         remap_to_origin_clear_discard(pb->cache, bio, oblock);
957 }
958
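/*
 * Sketch of the resulting writethrough flow (not new code, just the
 * sequence implemented above):
 *
 *	remap_to_origin_then_cache()
 *	  -> hook writethrough_endio, remap to origin, issue
 *	writethrough_endio()		(interrupt context)
 *	  -> unhook, restore the bio, remap to cache,
 *	     defer_writethrough_bio()
 *	worker thread
 *	  -> re-issues the bio, now aimed at the cache device
 */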
959 /*----------------------------------------------------------------
960  * Failure modes
961  *--------------------------------------------------------------*/
962 static enum cache_metadata_mode get_cache_mode(struct cache *cache)
963 {
964         return cache->features.mode;
965 }
966
967 static const char *cache_device_name(struct cache *cache)
968 {
969         return dm_device_name(dm_table_get_md(cache->ti->table));
970 }
971
972 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
973 {
974         const char *descs[] = {
975                 "write",
976                 "read-only",
977                 "fail"
978         };
979
980         dm_table_event(cache->ti->table);
981         DMINFO("%s: switching cache to %s mode",
982                cache_device_name(cache), descs[(int)mode]);
983 }
984
985 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
986 {
987         bool needs_check;
988         enum cache_metadata_mode old_mode = get_cache_mode(cache);
989
990         if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) {
991                 DMERR("unable to read needs_check flag, setting failure mode");
992                 new_mode = CM_FAIL;
993         }
994
995         if (new_mode == CM_WRITE && needs_check) {
996                 DMERR("%s: unable to switch cache to write mode until repaired.",
997                       cache_device_name(cache));
998                 if (old_mode != new_mode)
999                         new_mode = old_mode;
1000                 else
1001                         new_mode = CM_READ_ONLY;
1002         }
1003
1004         /* Never move out of fail mode */
1005         if (old_mode == CM_FAIL)
1006                 new_mode = CM_FAIL;
1007
1008         switch (new_mode) {
1009         case CM_FAIL:
1010         case CM_READ_ONLY:
1011                 dm_cache_metadata_set_read_only(cache->cmd);
1012                 break;
1013
1014         case CM_WRITE:
1015                 dm_cache_metadata_set_read_write(cache->cmd);
1016                 break;
1017         }
1018
1019         cache->features.mode = new_mode;
1020
1021         if (new_mode != old_mode)
1022                 notify_mode_switch(cache, new_mode);
1023 }
1024
1025 static void abort_transaction(struct cache *cache)
1026 {
1027         const char *dev_name = cache_device_name(cache);
1028
1029         if (get_cache_mode(cache) >= CM_READ_ONLY)
1030                 return;
1031
1032         if (dm_cache_metadata_set_needs_check(cache->cmd)) {
1033                 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
1034                 set_cache_mode(cache, CM_FAIL);
1035         }
1036
1037         DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
1038         if (dm_cache_metadata_abort(cache->cmd)) {
1039                 DMERR("%s: failed to abort metadata transaction", dev_name);
1040                 set_cache_mode(cache, CM_FAIL);
1041         }
1042 }
1043
1044 static void metadata_operation_failed(struct cache *cache, const char *op, int r)
1045 {
1046         DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
1047                     cache_device_name(cache), op, r);
1048         abort_transaction(cache);
1049         set_cache_mode(cache, CM_READ_ONLY);
1050 }
1051
1052 /*----------------------------------------------------------------
1053  * Migration processing
1054  *
1055  * Migration covers moving data from the origin device to the cache, or
1056  * vice versa.
1057  *--------------------------------------------------------------*/
1058 static void inc_io_migrations(struct cache *cache)
1059 {
1060         atomic_inc(&cache->nr_io_migrations);
1061 }
1062
1063 static void dec_io_migrations(struct cache *cache)
1064 {
1065         atomic_dec(&cache->nr_io_migrations);
1066 }
1067
1068 static bool discard_or_flush(struct bio *bio)
1069 {
1070         return bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD);
1071 }
1072
1073 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
1074 {
1075         if (discard_or_flush(cell->holder)) {
1076                 /*
1077                  * We have to handle these bios individually.
1078                  */
1079                 dm_cell_release(cache->prison, cell, &cache->deferred_bios);
1080                 free_prison_cell(cache, cell);
1081         } else
1082                 list_add_tail(&cell->user_list, &cache->deferred_cells);
1083 }
1084
1085 static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
1086 {
1087         unsigned long flags;
1088
1089         if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
1090                 /*
1091                  * There was no prisoner to promote to holder, the
1092                  * cell has been released.
1093                  */
1094                 free_prison_cell(cache, cell);
1095                 return;
1096         }
1097
1098         spin_lock_irqsave(&cache->lock, flags);
1099         __cell_defer(cache, cell);
1100         spin_unlock_irqrestore(&cache->lock, flags);
1101
1102         wake_worker(cache);
1103 }
1104
1105 static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err)
1106 {
1107         dm_cell_error(cache->prison, cell, err);
1108         free_prison_cell(cache, cell);
1109 }
1110
1111 static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell)
1112 {
1113         cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE);
1114 }
1115
1116 static void free_io_migration(struct dm_cache_migration *mg)
1117 {
1118         struct cache *cache = mg->cache;
1119
1120         dec_io_migrations(cache);
1121         free_migration(mg);
1122         wake_worker(cache);
1123 }
1124
1125 static void migration_failure(struct dm_cache_migration *mg)
1126 {
1127         struct cache *cache = mg->cache;
1128         const char *dev_name = cache_device_name(cache);
1129
1130         if (mg->writeback) {
1131                 DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name);
1132                 set_dirty(cache, mg->old_oblock, mg->cblock);
1133                 cell_defer(cache, mg->old_ocell, false);
1134
1135         } else if (mg->demote) {
1136                 DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name);
1137                 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
1138
1139                 cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
1140                 if (mg->promote)
1141                         cell_defer(cache, mg->new_ocell, true);
1142         } else {
1143                 DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name);
1144                 policy_remove_mapping(cache->policy, mg->new_oblock);
1145                 cell_defer(cache, mg->new_ocell, true);
1146         }
1147
1148         free_io_migration(mg);
1149 }
1150
1151 static void migration_success_pre_commit(struct dm_cache_migration *mg)
1152 {
1153         int r;
1154         unsigned long flags;
1155         struct cache *cache = mg->cache;
1156
1157         if (mg->writeback) {
1158                 clear_dirty(cache, mg->old_oblock, mg->cblock);
1159                 cell_defer(cache, mg->old_ocell, false);
1160                 free_io_migration(mg);
1161                 return;
1162
1163         } else if (mg->demote) {
1164                 r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
1165                 if (r) {
1166                         DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata",
1167                                     cache_device_name(cache));
1168                         metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1169                         policy_force_mapping(cache->policy, mg->new_oblock,
1170                                              mg->old_oblock);
1171                         if (mg->promote)
1172                                 cell_defer(cache, mg->new_ocell, true);
1173                         free_io_migration(mg);
1174                         return;
1175                 }
1176         } else {
1177                 r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
1178                 if (r) {
1179                         DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata",
1180                                     cache_device_name(cache));
1181                         metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
1182                         policy_remove_mapping(cache->policy, mg->new_oblock);
1183                         free_io_migration(mg);
1184                         return;
1185                 }
1186         }
1187
1188         spin_lock_irqsave(&cache->lock, flags);
1189         list_add_tail(&mg->list, &cache->need_commit_migrations);
1190         cache->commit_requested = true;
1191         spin_unlock_irqrestore(&cache->lock, flags);
1192 }
1193
1194 static void migration_success_post_commit(struct dm_cache_migration *mg)
1195 {
1196         unsigned long flags;
1197         struct cache *cache = mg->cache;
1198
1199         if (mg->writeback) {
1200                 DMWARN_LIMIT("%s: writeback unexpectedly triggered commit",
1201                              cache_device_name(cache));
1202                 return;
1203
1204         } else if (mg->demote) {
1205                 cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
1206
1207                 if (mg->promote) {
1208                         mg->demote = false;
1209
1210                         spin_lock_irqsave(&cache->lock, flags);
1211                         list_add_tail(&mg->list, &cache->quiesced_migrations);
1212                         spin_unlock_irqrestore(&cache->lock, flags);
1213
1214                 } else {
1215                         if (mg->invalidate)
1216                                 policy_remove_mapping(cache->policy, mg->old_oblock);
1217                         free_io_migration(mg);
1218                 }
1219
1220         } else {
1221                 if (mg->requeue_holder) {
1222                         clear_dirty(cache, mg->new_oblock, mg->cblock);
1223                         cell_defer(cache, mg->new_ocell, true);
1224                 } else {
1225                         /*
1226                          * The block was promoted via an overwrite, so it's dirty.
1227                          */
1228                         set_dirty(cache, mg->new_oblock, mg->cblock);
1229                         bio_endio(mg->new_ocell->holder);
1230                         cell_defer(cache, mg->new_ocell, false);
1231                 }
1232                 free_io_migration(mg);
1233         }
1234 }
1235
1236 static void copy_complete(int read_err, unsigned long write_err, void *context)
1237 {
1238         unsigned long flags;
1239         struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
1240         struct cache *cache = mg->cache;
1241
1242         if (read_err || write_err)
1243                 mg->err = true;
1244
1245         spin_lock_irqsave(&cache->lock, flags);
1246         list_add_tail(&mg->list, &cache->completed_migrations);
1247         spin_unlock_irqrestore(&cache->lock, flags);
1248
1249         wake_worker(cache);
1250 }
1251
1252 static void issue_copy(struct dm_cache_migration *mg)
1253 {
1254         int r;
1255         struct dm_io_region o_region, c_region;
1256         struct cache *cache = mg->cache;
1257         sector_t cblock = from_cblock(mg->cblock);
1258
1259         o_region.bdev = cache->origin_dev->bdev;
1260         o_region.count = cache->sectors_per_block;
1261
1262         c_region.bdev = cache->cache_dev->bdev;
1263         c_region.sector = cblock * cache->sectors_per_block;
1264         c_region.count = cache->sectors_per_block;
1265
1266         if (mg->writeback || mg->demote) {
1267                 /* demote */
1268                 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
1269                 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
1270         } else {
1271                 /* promote */
1272                 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
1273                 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
1274         }
1275
1276         if (r < 0) {
1277                 DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache));
1278                 migration_failure(mg);
1279         }
1280 }
1281
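/*
 * Worked example for issue_copy() (illustrative numbers): with 128
 * sectors per block, a promotion of oblock 42 into cblock 7 copies
 * o_region starting at sector 42 * 128 = 5376 on the origin into
 * c_region starting at sector 7 * 128 = 896 on the cache device, each
 * region being 128 sectors long.  A writeback or demotion swaps source
 * and destination.
 */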
1282 static void overwrite_endio(struct bio *bio)
1283 {
1284         struct dm_cache_migration *mg = bio->bi_private;
1285         struct cache *cache = mg->cache;
1286         size_t pb_data_size = get_per_bio_data_size(cache);
1287         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1288         unsigned long flags;
1289
1290         dm_unhook_bio(&pb->hook_info, bio);
1291
1292         if (bio->bi_error)
1293                 mg->err = true;
1294
1295         mg->requeue_holder = false;
1296
1297         spin_lock_irqsave(&cache->lock, flags);
1298         list_add_tail(&mg->list, &cache->completed_migrations);
1299         spin_unlock_irqrestore(&cache->lock, flags);
1300
1301         wake_worker(cache);
1302 }
1303
1304 static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
1305 {
1306         size_t pb_data_size = get_per_bio_data_size(mg->cache);
1307         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1308
1309         dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1310         remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
1311
1312         /*
1313          * No need to inc_ds() here, since the cell will be held for the
1314          * duration of the io.
1315          */
1316         accounted_request(mg->cache, bio);
1317 }
1318
1319 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1320 {
1321         return (bio_data_dir(bio) == WRITE) &&
1322                 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1323 }
1324
1325 static void avoid_copy(struct dm_cache_migration *mg)
1326 {
1327         atomic_inc(&mg->cache->stats.copies_avoided);
1328         migration_success_pre_commit(mg);
1329 }
1330
1331 static void calc_discard_block_range(struct cache *cache, struct bio *bio,
1332                                      dm_dblock_t *b, dm_dblock_t *e)
1333 {
1334         sector_t sb = bio->bi_iter.bi_sector;
1335         sector_t se = bio_end_sector(bio);
1336
1337         *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
1338
1339         if (se - sb < cache->discard_block_size)
1340                 *e = *b;
1341         else
1342                 *e = to_dblock(block_div(se, cache->discard_block_size));
1343 }
1344
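/*
 * Worked example (illustrative numbers): with discard_block_size = 1024
 * sectors, a discard covering sectors [1000, 5000) gives
 * b = ceil(1000 / 1024) = 1 and e = floor(5000 / 1024) = 4, so only the
 * fully covered discard blocks 1, 2 and 3 are marked; partially covered
 * blocks at either end are left alone.
 */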
1345 static void issue_discard(struct dm_cache_migration *mg)
1346 {
1347         dm_dblock_t b, e;
1348         struct bio *bio = mg->new_ocell->holder;
1349         struct cache *cache = mg->cache;
1350
1351         calc_discard_block_range(cache, bio, &b, &e);
1352         while (b != e) {
1353                 set_discard(cache, b);
1354                 b = to_dblock(from_dblock(b) + 1);
1355         }
1356
1357         bio_endio(bio);
1358         cell_defer(cache, mg->new_ocell, false);
1359         free_migration(mg);
1360         wake_worker(cache);
1361 }
1362
1363 static void issue_copy_or_discard(struct dm_cache_migration *mg)
1364 {
1365         bool avoid;
1366         struct cache *cache = mg->cache;
1367
1368         if (mg->discard) {
1369                 issue_discard(mg);
1370                 return;
1371         }
1372
1373         if (mg->writeback || mg->demote)
1374                 avoid = !is_dirty(cache, mg->cblock) ||
1375                         is_discarded_oblock(cache, mg->old_oblock);
1376         else {
1377                 struct bio *bio = mg->new_ocell->holder;
1378
1379                 avoid = is_discarded_oblock(cache, mg->new_oblock);
1380
1381                 if (writeback_mode(&cache->features) &&
1382                     !avoid && bio_writes_complete_block(cache, bio)) {
1383                         issue_overwrite(mg, bio);
1384                         return;
1385                 }
1386         }
1387
1388         avoid ? avoid_copy(mg) : issue_copy(mg);
1389 }
1390
1391 static void complete_migration(struct dm_cache_migration *mg)
1392 {
1393         if (mg->err)
1394                 migration_failure(mg);
1395         else
1396                 migration_success_pre_commit(mg);
1397 }
1398
1399 static void process_migrations(struct cache *cache, struct list_head *head,
1400                                void (*fn)(struct dm_cache_migration *))
1401 {
1402         unsigned long flags;
1403         struct list_head list;
1404         struct dm_cache_migration *mg, *tmp;
1405
1406         INIT_LIST_HEAD(&list);
1407         spin_lock_irqsave(&cache->lock, flags);
1408         list_splice_init(head, &list);
1409         spin_unlock_irqrestore(&cache->lock, flags);
1410
1411         list_for_each_entry_safe(mg, tmp, &list, list)
1412                 fn(mg);
1413 }
1414
1415 static void __queue_quiesced_migration(struct dm_cache_migration *mg)
1416 {
1417         list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
1418 }
1419
1420 static void queue_quiesced_migration(struct dm_cache_migration *mg)
1421 {
1422         unsigned long flags;
1423         struct cache *cache = mg->cache;
1424
1425         spin_lock_irqsave(&cache->lock, flags);
1426         __queue_quiesced_migration(mg);
1427         spin_unlock_irqrestore(&cache->lock, flags);
1428
1429         wake_worker(cache);
1430 }
1431
1432 static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
1433 {
1434         unsigned long flags;
1435         struct dm_cache_migration *mg, *tmp;
1436
1437         spin_lock_irqsave(&cache->lock, flags);
1438         list_for_each_entry_safe(mg, tmp, work, list)
1439                 __queue_quiesced_migration(mg);
1440         spin_unlock_irqrestore(&cache->lock, flags);
1441
1442         wake_worker(cache);
1443 }
1444
1445 static void check_for_quiesced_migrations(struct cache *cache,
1446                                           struct per_bio_data *pb)
1447 {
1448         struct list_head work;
1449
1450         if (!pb->all_io_entry)
1451                 return;
1452
1453         INIT_LIST_HEAD(&work);
1454         dm_deferred_entry_dec(pb->all_io_entry, &work);
1455
1456         if (!list_empty(&work))
1457                 queue_quiesced_migrations(cache, &work);
1458 }
1459
1460 static void quiesce_migration(struct dm_cache_migration *mg)
1461 {
1462         if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
1463                 queue_quiesced_migration(mg);
1464 }
1465
1466 static void promote(struct cache *cache, struct prealloc *structs,
1467                     dm_oblock_t oblock, dm_cblock_t cblock,
1468                     struct dm_bio_prison_cell *cell)
1469 {
1470         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1471
1472         mg->err = false;
1473         mg->discard = false;
1474         mg->writeback = false;
1475         mg->demote = false;
1476         mg->promote = true;
1477         mg->requeue_holder = true;
1478         mg->invalidate = false;
1479         mg->cache = cache;
1480         mg->new_oblock = oblock;
1481         mg->cblock = cblock;
1482         mg->old_ocell = NULL;
1483         mg->new_ocell = cell;
1484         mg->start_jiffies = jiffies;
1485
1486         inc_io_migrations(cache);
1487         quiesce_migration(mg);
1488 }
1489
1490 static void writeback(struct cache *cache, struct prealloc *structs,
1491                       dm_oblock_t oblock, dm_cblock_t cblock,
1492                       struct dm_bio_prison_cell *cell)
1493 {
1494         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1495
1496         mg->err = false;
1497         mg->discard = false;
1498         mg->writeback = true;
1499         mg->demote = false;
1500         mg->promote = false;
1501         mg->requeue_holder = true;
1502         mg->invalidate = false;
1503         mg->cache = cache;
1504         mg->old_oblock = oblock;
1505         mg->cblock = cblock;
1506         mg->old_ocell = cell;
1507         mg->new_ocell = NULL;
1508         mg->start_jiffies = jiffies;
1509
1510         inc_io_migrations(cache);
1511         quiesce_migration(mg);
1512 }
1513
1514 static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1515                                 dm_oblock_t old_oblock, dm_oblock_t new_oblock,
1516                                 dm_cblock_t cblock,
1517                                 struct dm_bio_prison_cell *old_ocell,
1518                                 struct dm_bio_prison_cell *new_ocell)
1519 {
1520         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1521
1522         mg->err = false;
1523         mg->discard = false;
1524         mg->writeback = false;
1525         mg->demote = true;
1526         mg->promote = true;
1527         mg->requeue_holder = true;
1528         mg->invalidate = false;
1529         mg->cache = cache;
1530         mg->old_oblock = old_oblock;
1531         mg->new_oblock = new_oblock;
1532         mg->cblock = cblock;
1533         mg->old_ocell = old_ocell;
1534         mg->new_ocell = new_ocell;
1535         mg->start_jiffies = jiffies;
1536
1537         inc_io_migrations(cache);
1538         quiesce_migration(mg);
1539 }
1540
1541 /*
1542  * Invalidate a cache entry.  No writeback occurs; any changes in the cache
1543  * block are thrown away.
1544  */
1545 static void invalidate(struct cache *cache, struct prealloc *structs,
1546                        dm_oblock_t oblock, dm_cblock_t cblock,
1547                        struct dm_bio_prison_cell *cell)
1548 {
1549         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1550
1551         mg->err = false;
1552         mg->discard = false;
1553         mg->writeback = false;
1554         mg->demote = true;
1555         mg->promote = false;
1556         mg->requeue_holder = true;
1557         mg->invalidate = true;
1558         mg->cache = cache;
1559         mg->old_oblock = oblock;
1560         mg->cblock = cblock;
1561         mg->old_ocell = cell;
1562         mg->new_ocell = NULL;
1563         mg->start_jiffies = jiffies;
1564
1565         inc_io_migrations(cache);
1566         quiesce_migration(mg);
1567 }
1568
1569 static void discard(struct cache *cache, struct prealloc *structs,
1570                     struct dm_bio_prison_cell *cell)
1571 {
1572         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1573
1574         mg->err = false;
1575         mg->discard = true;
1576         mg->writeback = false;
1577         mg->demote = false;
1578         mg->promote = false;
1579         mg->requeue_holder = false;
1580         mg->invalidate = false;
1581         mg->cache = cache;
1582         mg->old_ocell = NULL;
1583         mg->new_ocell = cell;
1584         mg->start_jiffies = jiffies;
1585
1586         quiesce_migration(mg);
1587 }
1588
1589 /*----------------------------------------------------------------
1590  * bio processing
1591  *--------------------------------------------------------------*/
1592 static void defer_bio(struct cache *cache, struct bio *bio)
1593 {
1594         unsigned long flags;
1595
1596         spin_lock_irqsave(&cache->lock, flags);
1597         bio_list_add(&cache->deferred_bios, bio);
1598         spin_unlock_irqrestore(&cache->lock, flags);
1599
1600         wake_worker(cache);
1601 }
1602
1603 static void process_flush_bio(struct cache *cache, struct bio *bio)
1604 {
1605         size_t pb_data_size = get_per_bio_data_size(cache);
1606         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1607
1608         BUG_ON(bio->bi_iter.bi_size);
1609         if (!pb->req_nr)
1610                 remap_to_origin(cache, bio);
1611         else
1612                 remap_to_cache(cache, bio, 0);
1613
1614         /*
1615          * REQ_FLUSH is not directed at any particular block so we don't
1616          * need to inc_ds().  REQ_FUA's are split into a write + REQ_FLUSH
1617          * by dm-core.
1618          */
1619         issue(cache, bio);
1620 }
1621
1622 static void process_discard_bio(struct cache *cache, struct prealloc *structs,
1623                                 struct bio *bio)
1624 {
1625         int r;
1626         dm_dblock_t b, e;
1627         struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
1628
1629         calc_discard_block_range(cache, bio, &b, &e);
1630         if (b == e) {
1631                 bio_endio(bio);
1632                 return;
1633         }
1634
1635         cell_prealloc = prealloc_get_cell(structs);
1636         r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
1637                              (cell_free_fn) prealloc_put_cell,
1638                              structs, &new_ocell);
1639         if (r > 0)
1640                 return;
1641
1642         discard(cache, structs, new_ocell);
1643 }
1644
1645 static bool spare_migration_bandwidth(struct cache *cache)
1646 {
1647         sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
1648                 cache->sectors_per_block;
1649         return current_volume < cache->migration_threshold;
1650 }
1651
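/*
 * Worked example (illustrative numbers): with 128 sectors per block, 3
 * migrations already in flight and migration_threshold = 2048 sectors,
 * current_volume = (3 + 1) * 128 = 512 < 2048, so another migration may
 * be started.
 */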
1652 static void inc_hit_counter(struct cache *cache, struct bio *bio)
1653 {
1654         atomic_inc(bio_data_dir(bio) == READ ?
1655                    &cache->stats.read_hit : &cache->stats.write_hit);
1656 }
1657
1658 static void inc_miss_counter(struct cache *cache, struct bio *bio)
1659 {
1660         atomic_inc(bio_data_dir(bio) == READ ?
1661                    &cache->stats.read_miss : &cache->stats.write_miss);
1662 }
1663
1664 /*----------------------------------------------------------------*/
1665
1666 struct inc_detail {
1667         struct cache *cache;
1668         struct bio_list bios_for_issue;
1669         struct bio_list unhandled_bios;
1670         bool any_writes;
1671 };
1672
1673 static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
1674 {
1675         struct bio *bio;
1676         struct inc_detail *detail = context;
1677         struct cache *cache = detail->cache;
1678
1679         inc_ds(cache, cell->holder, cell);
1680         if (bio_data_dir(cell->holder) == WRITE)
1681                 detail->any_writes = true;
1682
1683         while ((bio = bio_list_pop(&cell->bios))) {
1684                 if (discard_or_flush(bio)) {
1685                         bio_list_add(&detail->unhandled_bios, bio);
1686                         continue;
1687                 }
1688
1689                 if (bio_data_dir(bio) == WRITE)
1690                         detail->any_writes = true;
1691
1692                 bio_list_add(&detail->bios_for_issue, bio);
1693                 inc_ds(cache, bio, cell);
1694         }
1695 }
1696
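/*
 * The two cell remappers below release a cell and handle everything
 * queued on it in one pass: the holder and any plain reads/writes are
 * remapped (to the origin or the cache respectively) and issued, while
 * discards and flushes are pushed back onto the deferred list for the
 * worker to pick up separately.
 */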
1697 // FIXME: refactor these two
1698 static void remap_cell_to_origin_clear_discard(struct cache *cache,
1699                                                struct dm_bio_prison_cell *cell,
1700                                                dm_oblock_t oblock, bool issue_holder)
1701 {
1702         struct bio *bio;
1703         unsigned long flags;
1704         struct inc_detail detail;
1705
1706         detail.cache = cache;
1707         bio_list_init(&detail.bios_for_issue);
1708         bio_list_init(&detail.unhandled_bios);
1709         detail.any_writes = false;
1710
1711         spin_lock_irqsave(&cache->lock, flags);
1712         dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
1713         bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
1714         spin_unlock_irqrestore(&cache->lock, flags);
1715
1716         remap_to_origin(cache, cell->holder);
1717         if (issue_holder)
1718                 issue(cache, cell->holder);
1719         else
1720                 accounted_begin(cache, cell->holder);
1721
1722         if (detail.any_writes)
1723                 clear_discard(cache, oblock_to_dblock(cache, oblock));
1724
1725         while ((bio = bio_list_pop(&detail.bios_for_issue))) {
1726                 remap_to_origin(cache, bio);
1727                 issue(cache, bio);
1728         }
1729
1730         free_prison_cell(cache, cell);
1731 }
1732
1733 static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell,
1734                                       dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder)
1735 {
1736         struct bio *bio;
1737         unsigned long flags;
1738         struct inc_detail detail;
1739
1740         detail.cache = cache;
1741         bio_list_init(&detail.bios_for_issue);
1742         bio_list_init(&detail.unhandled_bios);
1743         detail.any_writes = false;
1744
1745         spin_lock_irqsave(&cache->lock, flags);
1746         dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
1747         bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
1748         spin_unlock_irqrestore(&cache->lock, flags);
1749
1750         remap_to_cache(cache, cell->holder, cblock);
1751         if (issue_holder)
1752                 issue(cache, cell->holder);
1753         else
1754                 accounted_begin(cache, cell->holder);
1755
1756         if (detail.any_writes) {
1757                 set_dirty(cache, oblock, cblock);
1758                 clear_discard(cache, oblock_to_dblock(cache, oblock));
1759         }
1760
1761         while ((bio = bio_list_pop(&detail.bios_for_issue))) {
1762                 remap_to_cache(cache, bio, cblock);
1763                 issue(cache, bio);
1764         }
1765
1766         free_prison_cell(cache, cell);
1767 }
1768
1769 /*----------------------------------------------------------------*/
1770
1771 struct old_oblock_lock {
1772         struct policy_locker locker;
1773         struct cache *cache;
1774         struct prealloc *structs;
1775         struct dm_bio_prison_cell *cell;
1776 };
1777
1778 static int null_locker(struct policy_locker *locker, dm_oblock_t b)
1779 {
1780         /* This should never be called */
1781         BUG();
1782         return 0;
1783 }
1784
1785 static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
1786 {
1787         struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker);
1788         struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs);
1789
1790         return bio_detain(l->cache, b, NULL, cell_prealloc,
1791                           (cell_free_fn) prealloc_put_cell,
1792                           l->structs, &l->cell);
1793 }
1794
1795 static void process_cell(struct cache *cache, struct prealloc *structs,
1796                          struct dm_bio_prison_cell *new_ocell)
1797 {
1798         int r;
1799         bool release_cell = true;
1800         struct bio *bio = new_ocell->holder;
1801         dm_oblock_t block = get_bio_block(cache, bio);
1802         struct policy_result lookup_result;
1803         bool passthrough = passthrough_mode(&cache->features);
1804         bool fast_promotion, can_migrate;
1805         struct old_oblock_lock ool;
1806
1807         fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
1808         can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache));
1809
1810         ool.locker.fn = cell_locker;
1811         ool.cache = cache;
1812         ool.structs = structs;
1813         ool.cell = NULL;
1814         r = policy_map(cache->policy, block, true, can_migrate, fast_promotion,
1815                        bio, &ool.locker, &lookup_result);
1816
1817         if (r == -EWOULDBLOCK)
1818                 /* migration has been denied */
1819                 lookup_result.op = POLICY_MISS;
1820
1821         switch (lookup_result.op) {
1822         case POLICY_HIT:
1823                 if (passthrough) {
1824                         inc_miss_counter(cache, bio);
1825
1826                         /*
1827                          * Passthrough always maps to the origin,
1828                          * invalidating any cache blocks that are written
1829                          * to.
1830                          */
1831
1832                         if (bio_data_dir(bio) == WRITE) {
1833                                 atomic_inc(&cache->stats.demotion);
1834                                 invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
1835                                 release_cell = false;
1836
1837                         } else {
1838                                 /* FIXME: factor out issue_origin() */
1839                                 remap_to_origin_clear_discard(cache, bio, block);
1840                                 inc_and_issue(cache, bio, new_ocell);
1841                         }
1842                 } else {
1843                         inc_hit_counter(cache, bio);
1844
1845                         if (bio_data_dir(bio) == WRITE &&
1846                             writethrough_mode(&cache->features) &&
1847                             !is_dirty(cache, lookup_result.cblock)) {
1848                                 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
1849                                 inc_and_issue(cache, bio, new_ocell);
1850
1851                         } else {
1852                                 remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true);
1853                                 release_cell = false;
1854                         }
1855                 }
1856
1857                 break;
1858
1859         case POLICY_MISS:
1860                 inc_miss_counter(cache, bio);
1861                 remap_cell_to_origin_clear_discard(cache, new_ocell, block, true);
1862                 release_cell = false;
1863                 break;
1864
1865         case POLICY_NEW:
1866                 atomic_inc(&cache->stats.promotion);
1867                 promote(cache, structs, block, lookup_result.cblock, new_ocell);
1868                 release_cell = false;
1869                 break;
1870
1871         case POLICY_REPLACE:
1872                 atomic_inc(&cache->stats.demotion);
1873                 atomic_inc(&cache->stats.promotion);
1874                 demote_then_promote(cache, structs, lookup_result.old_oblock,
1875                                     block, lookup_result.cblock,
1876                                     ool.cell, new_ocell);
1877                 release_cell = false;
1878                 break;
1879
1880         default:
1881                 DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u",
1882                             cache_device_name(cache), __func__,
1883                             (unsigned) lookup_result.op);
1884                 bio_io_error(bio);
1885         }
1886
1887         if (release_cell)
1888                 cell_defer(cache, new_ocell, false);
1889 }
1890
1891 static void process_bio(struct cache *cache, struct prealloc *structs,
1892                         struct bio *bio)
1893 {
1894         int r;
1895         dm_oblock_t block = get_bio_block(cache, bio);
1896         struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
1897
1898         /*
1899          * Check to see if that block is currently migrating.
1900          */
1901         cell_prealloc = prealloc_get_cell(structs);
1902         r = bio_detain(cache, block, bio, cell_prealloc,
1903                        (cell_free_fn) prealloc_put_cell,
1904                        structs, &new_ocell);
1905         if (r > 0)
1906                 return;
1907
1908         process_cell(cache, structs, new_ocell);
1909 }
1910
1911 static int need_commit_due_to_time(struct cache *cache)
1912 {
1913         return jiffies < cache->last_commit_jiffies ||
1914                jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
1915 }
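/*
 * The open coded test above also forces a commit when jiffies wraps.  A
 * rough alternative using the helpers from <linux/jiffies.h>, which cope
 * with wrap-around themselves, would be (a sketch only; behaviour differs
 * slightly right at the wrap point):
 *
 *	return time_after(jiffies,
 *			  cache->last_commit_jiffies + COMMIT_PERIOD);
 */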
1916
1917 /*
1918  * A non-zero return indicates read_only/fail_io mode or a failed commit.
1919  */
1920 static int commit(struct cache *cache, bool clean_shutdown)
1921 {
1922         int r;
1923
1924         if (get_cache_mode(cache) >= CM_READ_ONLY)
1925                 return -EINVAL;
1926
1927         atomic_inc(&cache->stats.commit_count);
1928         r = dm_cache_commit(cache->cmd, clean_shutdown);
1929         if (r)
1930                 metadata_operation_failed(cache, "dm_cache_commit", r);
1931
1932         return r;
1933 }
1934
1935 static int commit_if_needed(struct cache *cache)
1936 {
1937         int r = 0;
1938
1939         if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
1940             dm_cache_changed_this_transaction(cache->cmd)) {
1941                 r = commit(cache, false);
1942                 cache->commit_requested = false;
1943                 cache->last_commit_jiffies = jiffies;
1944         }
1945
1946         return r;
1947 }
1948
1949 static void process_deferred_bios(struct cache *cache)
1950 {
1951         bool prealloc_used = false;
1952         unsigned long flags;
1953         struct bio_list bios;
1954         struct bio *bio;
1955         struct prealloc structs;
1956
1957         memset(&structs, 0, sizeof(structs));
1958         bio_list_init(&bios);
1959
1960         spin_lock_irqsave(&cache->lock, flags);
1961         bio_list_merge(&bios, &cache->deferred_bios);
1962         bio_list_init(&cache->deferred_bios);
1963         spin_unlock_irqrestore(&cache->lock, flags);
1964
1965         while (!bio_list_empty(&bios)) {
1966                 /*
1967                  * If we've got no free migration structs, and processing
1968                  * this bio might require one, we pause until there are some
1969                  * prepared mappings to process.
1970                  */
1971                 prealloc_used = true;
1972                 if (prealloc_data_structs(cache, &structs)) {
1973                         spin_lock_irqsave(&cache->lock, flags);
1974                         bio_list_merge(&cache->deferred_bios, &bios);
1975                         spin_unlock_irqrestore(&cache->lock, flags);
1976                         break;
1977                 }
1978
1979                 bio = bio_list_pop(&bios);
1980
1981                 if (bio->bi_rw & REQ_FLUSH)
1982                         process_flush_bio(cache, bio);
1983                 else if (bio->bi_rw & REQ_DISCARD)
1984                         process_discard_bio(cache, &structs, bio);
1985                 else
1986                         process_bio(cache, &structs, bio);
1987         }
1988
1989         if (prealloc_used)
1990                 prealloc_free_structs(cache, &structs);
1991 }
1992
1993 static void process_deferred_cells(struct cache *cache)
1994 {
1995         bool prealloc_used = false;
1996         unsigned long flags;
1997         struct dm_bio_prison_cell *cell, *tmp;
1998         struct list_head cells;
1999         struct prealloc structs;
2000
2001         memset(&structs, 0, sizeof(structs));
2002
2003         INIT_LIST_HEAD(&cells);
2004
2005         spin_lock_irqsave(&cache->lock, flags);
2006         list_splice_init(&cache->deferred_cells, &cells);
2007         spin_unlock_irqrestore(&cache->lock, flags);
2008
2009         list_for_each_entry_safe(cell, tmp, &cells, user_list) {
2010                 /*
2011                  * If we've got no free migration structs, and processing
2012                  * this cell might require one, we pause until there are some
2013                  * prepared mappings to process.
2014                  */
2015                 prealloc_used = true;
2016                 if (prealloc_data_structs(cache, &structs)) {
2017                         spin_lock_irqsave(&cache->lock, flags);
2018                         list_splice(&cells, &cache->deferred_cells);
2019                         spin_unlock_irqrestore(&cache->lock, flags);
2020                         break;
2021                 }
2022
2023                 process_cell(cache, &structs, cell);
2024         }
2025
2026         if (prealloc_used)
2027                 prealloc_free_structs(cache, &structs);
2028 }
2029
2030 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
2031 {
2032         unsigned long flags;
2033         struct bio_list bios;
2034         struct bio *bio;
2035
2036         bio_list_init(&bios);
2037
2038         spin_lock_irqsave(&cache->lock, flags);
2039         bio_list_merge(&bios, &cache->deferred_flush_bios);
2040         bio_list_init(&cache->deferred_flush_bios);
2041         spin_unlock_irqrestore(&cache->lock, flags);
2042
2043         /*
2044          * These bios have already been through inc_ds()
2045          */
2046         while ((bio = bio_list_pop(&bios)))
2047                 submit_bios ? accounted_request(cache, bio) : bio_io_error(bio);
2048 }
2049
2050 static void process_deferred_writethrough_bios(struct cache *cache)
2051 {
2052         unsigned long flags;
2053         struct bio_list bios;
2054         struct bio *bio;
2055
2056         bio_list_init(&bios);
2057
2058         spin_lock_irqsave(&cache->lock, flags);
2059         bio_list_merge(&bios, &cache->deferred_writethrough_bios);
2060         bio_list_init(&cache->deferred_writethrough_bios);
2061         spin_unlock_irqrestore(&cache->lock, flags);
2062
2063         /*
2064          * These bios have already been through inc_ds()
2065          */
2066         while ((bio = bio_list_pop(&bios)))
2067                 accounted_request(cache, bio);
2068 }
2069
2070 static void writeback_some_dirty_blocks(struct cache *cache)
2071 {
2072         bool prealloc_used = false;
2073         dm_oblock_t oblock;
2074         dm_cblock_t cblock;
2075         struct prealloc structs;
2076         struct dm_bio_prison_cell *old_ocell;
2077         bool busy = !iot_idle_for(&cache->origin_tracker, HZ);
2078
2079         memset(&structs, 0, sizeof(structs));
2080
2081         while (spare_migration_bandwidth(cache)) {
2082                 if (policy_writeback_work(cache->policy, &oblock, &cblock, busy))
2083                         break; /* no work to do */
2084
2085                 prealloc_used = true;
2086                 if (prealloc_data_structs(cache, &structs) ||
2087                     get_cell(cache, oblock, &structs, &old_ocell)) {
2088                         policy_set_dirty(cache->policy, oblock);
2089                         break;
2090                 }
2091
2092                 writeback(cache, &structs, oblock, cblock, old_ocell);
2093         }
2094
2095         if (prealloc_used)
2096                 prealloc_free_structs(cache, &structs);
2097 }
2098
2099 /*----------------------------------------------------------------
2100  * Invalidations.
2101  * Dropping a block from the cache *without* writing it back.
2102  *--------------------------------------------------------------*/
2103
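/*
 * Invalidation requests are queued by the target's message handler,
 * which appears later in this file.  As a usage sketch (device name and
 * block numbers are made up), a range of cache blocks plus a single
 * block could be dropped from userspace with something like:
 *
 *	dmsetup message cached 0 invalidate_cblocks 2345-3456 7890
 */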
2104 static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
2105 {
2106         int r = 0;
2107         uint64_t begin = from_cblock(req->cblocks->begin);
2108         uint64_t end = from_cblock(req->cblocks->end);
2109
2110         while (begin != end) {
2111                 r = policy_remove_cblock(cache->policy, to_cblock(begin));
2112                 if (!r) {
2113                         r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
2114                         if (r) {
2115                                 metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
2116                                 break;
2117                         }
2118
2119                 } else if (r == -ENODATA) {
2120                         /* harmless, already unmapped */
2121                         r = 0;
2122
2123                 } else {
2124                         DMERR("%s: policy_remove_cblock failed", cache_device_name(cache));
2125                         break;
2126                 }
2127
2128                 begin++;
2129         }
2130
2131         cache->commit_requested = true;
2132
2133         req->err = r;
2134         atomic_set(&req->complete, 1);
2135
2136         wake_up(&req->result_wait);
2137 }
2138
2139 static void process_invalidation_requests(struct cache *cache)
2140 {
2141         struct list_head list;
2142         struct invalidation_request *req, *tmp;
2143
2144         INIT_LIST_HEAD(&list);
2145         spin_lock(&cache->invalidation_lock);
2146         list_splice_init(&cache->invalidation_requests, &list);
2147         spin_unlock(&cache->invalidation_lock);
2148
2149         list_for_each_entry_safe (req, tmp, &list, list)
2150                 process_invalidation_request(cache, req);
2151 }
2152
2153 /*----------------------------------------------------------------
2154  * Main worker loop
2155  *--------------------------------------------------------------*/
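/*
 * Quiescing handshake: cache_postsuspend() bumps 'quiescing' and then
 * waits; the worker notices this on its next pass, stops taking on new
 * bios and cells, and acknowledges through 'quiescing_ack', after which
 * the suspend path can drain outstanding migrations.
 */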
2156 static bool is_quiescing(struct cache *cache)
2157 {
2158         return atomic_read(&cache->quiescing);
2159 }
2160
2161 static void ack_quiescing(struct cache *cache)
2162 {
2163         if (is_quiescing(cache)) {
2164                 atomic_inc(&cache->quiescing_ack);
2165                 wake_up(&cache->quiescing_wait);
2166         }
2167 }
2168
2169 static void wait_for_quiescing_ack(struct cache *cache)
2170 {
2171         wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
2172 }
2173
2174 static void start_quiescing(struct cache *cache)
2175 {
2176         atomic_inc(&cache->quiescing);
2177         wait_for_quiescing_ack(cache);
2178 }
2179
2180 static void stop_quiescing(struct cache *cache)
2181 {
2182         atomic_set(&cache->quiescing, 0);
2183         atomic_set(&cache->quiescing_ack, 0);
2184 }
2185
2186 static void wait_for_migrations(struct cache *cache)
2187 {
2188         wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
2189 }
2190
2191 static void stop_worker(struct cache *cache)
2192 {
2193         cancel_delayed_work(&cache->waker);
2194         flush_workqueue(cache->wq);
2195 }
2196
2197 static void requeue_deferred_cells(struct cache *cache)
2198 {
2199         unsigned long flags;
2200         struct list_head cells;
2201         struct dm_bio_prison_cell *cell, *tmp;
2202
2203         INIT_LIST_HEAD(&cells);
2204         spin_lock_irqsave(&cache->lock, flags);
2205         list_splice_init(&cache->deferred_cells, &cells);
2206         spin_unlock_irqrestore(&cache->lock, flags);
2207
2208         list_for_each_entry_safe(cell, tmp, &cells, user_list)
2209                 cell_requeue(cache, cell);
2210 }
2211
2212 static void requeue_deferred_bios(struct cache *cache)
2213 {
2214         struct bio *bio;
2215         struct bio_list bios;
2216
2217         bio_list_init(&bios);
2218         bio_list_merge(&bios, &cache->deferred_bios);
2219         bio_list_init(&cache->deferred_bios);
2220
2221         while ((bio = bio_list_pop(&bios))) {
2222                 bio->bi_error = DM_ENDIO_REQUEUE;
2223                 bio_endio(bio);
2224         }
2225 }
2226
2227 static int more_work(struct cache *cache)
2228 {
2229         if (is_quiescing(cache))
2230                 return !list_empty(&cache->quiesced_migrations) ||
2231                         !list_empty(&cache->completed_migrations) ||
2232                         !list_empty(&cache->need_commit_migrations);
2233         else
2234                 return !bio_list_empty(&cache->deferred_bios) ||
2235                         !list_empty(&cache->deferred_cells) ||
2236                         !bio_list_empty(&cache->deferred_flush_bios) ||
2237                         !bio_list_empty(&cache->deferred_writethrough_bios) ||
2238                         !list_empty(&cache->quiesced_migrations) ||
2239                         !list_empty(&cache->completed_migrations) ||
2240                         !list_empty(&cache->need_commit_migrations) ||
2241                         cache->invalidate;
2242 }
2243
2244 static void do_worker(struct work_struct *ws)
2245 {
2246         struct cache *cache = container_of(ws, struct cache, worker);
2247
2248         do {
2249                 if (!is_quiescing(cache)) {
2250                         writeback_some_dirty_blocks(cache);
2251                         process_deferred_writethrough_bios(cache);
2252                         process_deferred_bios(cache);
2253                         process_deferred_cells(cache);
2254                         process_invalidation_requests(cache);
2255                 }
2256
2257                 process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
2258                 process_migrations(cache, &cache->completed_migrations, complete_migration);
2259
2260                 if (commit_if_needed(cache)) {
2261                         process_deferred_flush_bios(cache, false);
2262                         process_migrations(cache, &cache->need_commit_migrations, migration_failure);
2263                 } else {
2264                         process_deferred_flush_bios(cache, true);
2265                         process_migrations(cache, &cache->need_commit_migrations,
2266                                            migration_success_post_commit);
2267                 }
2268
2269                 ack_quiescing(cache);
2270
2271         } while (more_work(cache));
2272 }
2273
2274 /*
2275  * We want to commit periodically so that unwritten metadata
2276  * doesn't build up too much.
2277  */
2278 static void do_waker(struct work_struct *ws)
2279 {
2280         struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
2281         policy_tick(cache->policy, true);
2282         wake_worker(cache);
2283         queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
2284 }
2285
2286 /*----------------------------------------------------------------*/
2287
2288 static int is_congested(struct dm_dev *dev, int bdi_bits)
2289 {
2290         struct request_queue *q = bdev_get_queue(dev->bdev);
2291         return bdi_congested(&q->backing_dev_info, bdi_bits);
2292 }
2293
2294 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
2295 {
2296         struct cache *cache = container_of(cb, struct cache, callbacks);
2297
2298         return is_congested(cache->origin_dev, bdi_bits) ||
2299                 is_congested(cache->cache_dev, bdi_bits);
2300 }
2301
2302 /*----------------------------------------------------------------
2303  * Target methods
2304  *--------------------------------------------------------------*/
2305
2306 /*
2307  * This function gets called on the error paths of the constructor, so we
2308  * have to cope with a partially initialised struct.
2309  */
2310 static void destroy(struct cache *cache)
2311 {
2312         unsigned i;
2313
2314         mempool_destroy(cache->migration_pool);
2315
2316         if (cache->all_io_ds)
2317                 dm_deferred_set_destroy(cache->all_io_ds);
2318
2319         if (cache->prison)
2320                 dm_bio_prison_destroy(cache->prison);
2321
2322         if (cache->wq)
2323                 destroy_workqueue(cache->wq);
2324
2325         if (cache->dirty_bitset)
2326                 free_bitset(cache->dirty_bitset);
2327
2328         if (cache->discard_bitset)
2329                 free_bitset(cache->discard_bitset);
2330
2331         if (cache->copier)
2332                 dm_kcopyd_client_destroy(cache->copier);
2333
2334         if (cache->cmd)
2335                 dm_cache_metadata_close(cache->cmd);
2336
2337         if (cache->metadata_dev)
2338                 dm_put_device(cache->ti, cache->metadata_dev);
2339
2340         if (cache->origin_dev)
2341                 dm_put_device(cache->ti, cache->origin_dev);
2342
2343         if (cache->cache_dev)
2344                 dm_put_device(cache->ti, cache->cache_dev);
2345
2346         if (cache->policy)
2347                 dm_cache_policy_destroy(cache->policy);
2348
2349         for (i = 0; i < cache->nr_ctr_args ; i++)
2350                 kfree(cache->ctr_args[i]);
2351         kfree(cache->ctr_args);
2352
2353         kfree(cache);
2354 }
2355
2356 static void cache_dtr(struct dm_target *ti)
2357 {
2358         struct cache *cache = ti->private;
2359
2360         destroy(cache);
2361 }
2362
2363 static sector_t get_dev_size(struct dm_dev *dev)
2364 {
2365         return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
2366 }
2367
2368 /*----------------------------------------------------------------*/
2369
2370 /*
2371  * Construct a cache device mapping.
2372  *
2373  * cache <metadata dev> <cache dev> <origin dev> <block size>
2374  *       <#feature args> [<feature arg>]*
2375  *       <policy> <#policy args> [<policy arg>]*
2376  *
2377  * metadata dev    : fast device holding the persistent metadata
2378  * cache dev       : fast device holding cached data blocks
2379  * origin dev      : slow device holding original data blocks
2380  * block size      : cache unit size in sectors
2381  *
2382  * #feature args   : number of feature arguments passed
2383  * feature args    : writeback (default), writethrough or passthrough.
2384  *
2385  * policy          : the replacement policy to use
2386  * #policy args    : an even number of policy arguments corresponding
2387  *                   to key/value pairs passed to the policy
2388  * policy args     : key/value pairs passed to the policy
2389  *                   E.g. 'sequential_threshold 1024'
2390  *                   See cache-policies.txt for details.
2391  *
2392  * Optional feature arguments are:
2393  *   writeback     : the default.  Cache block contents are written back
2394  *                   later, so they may differ from the origin for a while.
2395  *   writethrough  : write through caching that prohibits cache block
2396  *                   content from being different from origin block content.
2397  *   passthrough   : remap all io to the origin; writes invalidate the block.
2398  */
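/*
 * For illustration only (device names and sizes below are invented): a
 * 1TiB origin cached with 512 sector (256KiB) blocks, one feature
 * argument and the default policy with no policy arguments could be
 * loaded with something like:
 *
 *	dmsetup create cached --table \
 *	  "0 2147483648 cache /dev/mapper/fast-meta /dev/mapper/fast-data \
 *	   /dev/mapper/slow 512 1 writeback default 0"
 */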
2399 struct cache_args {
2400         struct dm_target *ti;
2401
2402         struct dm_dev *metadata_dev;
2403
2404         struct dm_dev *cache_dev;
2405         sector_t cache_sectors;
2406
2407         struct dm_dev *origin_dev;
2408         sector_t origin_sectors;
2409
2410         uint32_t block_size;
2411
2412         const char *policy_name;
2413         int policy_argc;
2414         const char **policy_argv;
2415
2416         struct cache_features features;
2417 };
2418
2419 static void destroy_cache_args(struct cache_args *ca)
2420 {
2421         if (ca->metadata_dev)
2422                 dm_put_device(ca->ti, ca->metadata_dev);
2423
2424         if (ca->cache_dev)
2425                 dm_put_device(ca->ti, ca->cache_dev);
2426
2427         if (ca->origin_dev)
2428                 dm_put_device(ca->ti, ca->origin_dev);
2429
2430         kfree(ca);
2431 }
2432
2433 static bool at_least_one_arg(struct dm_arg_set *as, char **error)
2434 {
2435         if (!as->argc) {
2436                 *error = "Insufficient args";
2437                 return false;
2438         }
2439
2440         return true;
2441 }
2442
2443 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
2444                               char **error)
2445 {
2446         int r;
2447         sector_t metadata_dev_size;
2448         char b[BDEVNAME_SIZE];
2449
2450         if (!at_least_one_arg(as, error))
2451                 return -EINVAL;
2452
2453         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2454                           &ca->metadata_dev);
2455         if (r) {
2456                 *error = "Error opening metadata device";
2457                 return r;
2458         }
2459
2460         metadata_dev_size = get_dev_size(ca->metadata_dev);
2461         if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
2462                 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2463                        bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
2464
2465         return 0;
2466 }
2467
2468 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
2469                            char **error)
2470 {
2471         int r;
2472
2473         if (!at_least_one_arg(as, error))
2474                 return -EINVAL;
2475
2476         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2477                           &ca->cache_dev);
2478         if (r) {
2479                 *error = "Error opening cache device";
2480                 return r;
2481         }
2482         ca->cache_sectors = get_dev_size(ca->cache_dev);
2483
2484         return 0;
2485 }
2486
2487 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
2488                             char **error)
2489 {
2490         int r;
2491
2492         if (!at_least_one_arg(as, error))
2493                 return -EINVAL;
2494
2495         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2496                           &ca->origin_dev);
2497         if (r) {
2498                 *error = "Error opening origin device";
2499                 return r;
2500         }
2501
2502         ca->origin_sectors = get_dev_size(ca->origin_dev);
2503         if (ca->ti->len > ca->origin_sectors) {
2504                 *error = "Device size larger than cached device";
2505                 return -EINVAL;
2506         }
2507
2508         return 0;
2509 }
2510
2511 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
2512                             char **error)
2513 {
2514         unsigned long block_size;
2515
2516         if (!at_least_one_arg(as, error))
2517                 return -EINVAL;
2518
2519         if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
2520             block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2521             block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
2522             block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
2523                 *error = "Invalid data block size";
2524                 return -EINVAL;
2525         }
2526
2527         if (block_size > ca->cache_sectors) {
2528                 *error = "Data block size is larger than the cache device";
2529                 return -EINVAL;
2530         }
2531
2532         ca->block_size = block_size;
2533
2534         return 0;
2535 }
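/*
 * Example, assuming the usual limits of 64 sectors (32KiB) minimum and
 * 2097152 sectors (1GiB) maximum for DATA_DEV_BLOCK_SIZE_*_SECTORS: a
 * block size of 512 sectors passes the checks above, while 100 sectors
 * is rejected because it is not a multiple of 64.  The alignment check
 * relies on the minimum block size being a power of two.
 */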
2536
2537 static void init_features(struct cache_features *cf)
2538 {
2539         cf->mode = CM_WRITE;
2540         cf->io_mode = CM_IO_WRITEBACK;
2541 }
2542
2543 static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
2544                           char **error)
2545 {
2546         static struct dm_arg _args[] = {
2547                 {0, 1, "Invalid number of cache feature arguments"},
2548         };
2549
2550         int r;
2551         unsigned argc;
2552         const char *arg;
2553         struct cache_features *cf = &ca->features;
2554
2555         init_features(cf);
2556
2557         r = dm_read_arg_group(_args, as, &argc, error);
2558         if (r)
2559                 return -EINVAL;
2560
2561         while (argc--) {
2562                 arg = dm_shift_arg(as);
2563
2564                 if (!strcasecmp(arg, "writeback"))
2565                         cf->io_mode = CM_IO_WRITEBACK;
2566
2567                 else if (!strcasecmp(arg, "writethrough"))
2568                         cf->io_mode = CM_IO_WRITETHROUGH;
2569
2570                 else if (!strcasecmp(arg, "passthrough"))
2571                         cf->io_mode = CM_IO_PASSTHROUGH;
2572
2573                 else {
2574                         *error = "Unrecognised cache feature requested";
2575                         return -EINVAL;
2576                 }
2577         }
2578
2579         return 0;
2580 }
2581
2582 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
2583                         char **error)
2584 {
2585         static struct dm_arg _args[] = {
2586                 {0, 1024, "Invalid number of policy arguments"},
2587         };
2588
2589         int r;
2590
2591         if (!at_least_one_arg(as, error))
2592                 return -EINVAL;
2593
2594         ca->policy_name = dm_shift_arg(as);
2595
2596         r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
2597         if (r)
2598                 return -EINVAL;
2599
2600         ca->policy_argv = (const char **)as->argv;
2601         dm_consume_args(as, ca->policy_argc);
2602
2603         return 0;
2604 }
2605
2606 static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
2607                             char **error)
2608 {
2609         int r;
2610         struct dm_arg_set as;
2611
2612         as.argc = argc;
2613         as.argv = argv;
2614
2615         r = parse_metadata_dev(ca, &as, error);
2616         if (r)
2617                 return r;
2618
2619         r = parse_cache_dev(ca, &as, error);
2620         if (r)
2621                 return r;
2622
2623         r = parse_origin_dev(ca, &as, error);
2624         if (r)
2625                 return r;
2626
2627         r = parse_block_size(ca, &as, error);
2628         if (r)
2629                 return r;
2630
2631         r = parse_features(ca, &as, error);
2632         if (r)
2633                 return r;
2634
2635         r = parse_policy(ca, &as, error);
2636         if (r)
2637                 return r;
2638
2639         return 0;
2640 }
2641
2642 /*----------------------------------------------------------------*/
2643
2644 static struct kmem_cache *migration_cache;
2645
2646 #define NOT_CORE_OPTION 1
2647
2648 static int process_config_option(struct cache *cache, const char *key, const char *value)
2649 {
2650         unsigned long tmp;
2651
2652         if (!strcasecmp(key, "migration_threshold")) {
2653                 if (kstrtoul(value, 10, &tmp))
2654                         return -EINVAL;
2655
2656                 cache->migration_threshold = tmp;
2657                 return 0;
2658         }
2659
2660         return NOT_CORE_OPTION;
2661 }
2662
2663 static int set_config_value(struct cache *cache, const char *key, const char *value)
2664 {
2665         int r = process_config_option(cache, key, value);
2666
2667         if (r == NOT_CORE_OPTION)
2668                 r = policy_set_config_value(cache->policy, key, value);
2669
2670         if (r)
2671                 DMWARN("bad config value for %s: %s", key, value);
2672
2673         return r;
2674 }
2675
2676 static int set_config_values(struct cache *cache, int argc, const char **argv)
2677 {
2678         int r = 0;
2679
2680         if (argc & 1) {
2681                 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
2682                 return -EINVAL;
2683         }
2684
2685         while (argc) {
2686                 r = set_config_value(cache, argv[0], argv[1]);
2687                 if (r)
2688                         break;
2689
2690                 argc -= 2;
2691                 argv += 2;
2692         }
2693
2694         return r;
2695 }
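/*
 * set_config_values() consumes the <key> <value> pairs from the policy
 * section of the table, e.g. the trailing "2 sequential_threshold 1024"
 * of a table ending in "... default 2 sequential_threshold 1024".  The
 * same keys can also be changed at runtime through the target's message
 * interface, e.g. (device name invented):
 *
 *	dmsetup message cached 0 migration_threshold 4096
 *
 * migration_threshold is the one key handled by the core target itself
 * rather than by the policy.
 */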
2696
2697 static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2698                                char **error)
2699 {
2700         struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
2701                                                            cache->cache_size,
2702                                                            cache->origin_sectors,
2703                                                            cache->sectors_per_block);
2704         if (IS_ERR(p)) {
2705                 *error = "Error creating cache's policy";
2706                 return PTR_ERR(p);
2707         }
2708         cache->policy = p;
2709
2710         return 0;
2711 }
2712
2713 /*
2714  * We want the discard block size to be at least the cache block size
2715  * and to have no more than 2^14 discard blocks across the origin.
2716  */
2717 #define MAX_DISCARD_BLOCKS (1 << 14)
2718
2719 static bool too_many_discard_blocks(sector_t discard_block_size,
2720                                     sector_t origin_size)
2721 {
2722         (void) sector_div(origin_size, discard_block_size);
2723
2724         return origin_size > MAX_DISCARD_BLOCKS;
2725 }
2726
2727 static sector_t calculate_discard_block_size(sector_t cache_block_size,
2728                                              sector_t origin_size)
2729 {
2730         sector_t discard_block_size = cache_block_size;
2731
2732         if (origin_size)
2733                 while (too_many_discard_blocks(discard_block_size, origin_size))
2734                         discard_block_size *= 2;
2735
2736         return discard_block_size;
2737 }
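/*
 * Worked example (figures are illustrative): with 512 sector cache
 * blocks and a 2TiB (4294967296 sector) origin, 512 sector discard
 * blocks would give 8388608 of them, so the size is doubled repeatedly
 * until it reaches 262144 sectors, at which point the origin holds
 * exactly 2^14 (16384) discard blocks.
 */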
2738
2739 static void set_cache_size(struct cache *cache, dm_cblock_t size)
2740 {
2741         dm_block_t nr_blocks = from_cblock(size);
2742
2743         if (nr_blocks > (1 << 20) && cache->cache_size != size)
2744                 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
2745                              "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
2746                              "Please consider increasing the cache block size to reduce the overall cache block count.",
2747                              (unsigned long long) nr_blocks);
2748
2749         cache->cache_size = size;
2750 }
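/*
 * For example (sizes invented): a 1TiB cache device carved into 512
 * sector (256KiB) blocks gives 4194304 cache blocks, comfortably over
 * the 1 << 20 threshold above; raising the block size to 4MiB brings
 * the count down to 262144 and avoids the warning.
 */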
2751
2752 #define DEFAULT_MIGRATION_THRESHOLD 2048
2753
2754 static int cache_create(struct cache_args *ca, struct cache **result)
2755 {
2756         int r = 0;
2757         char **error = &ca->ti->error;
2758         struct cache *cache;
2759         struct dm_target *ti = ca->ti;
2760         dm_block_t origin_blocks;
2761         struct dm_cache_metadata *cmd;
2762         bool may_format = ca->features.mode == CM_WRITE;
2763
2764         cache = kzalloc(sizeof(*cache), GFP_KERNEL);
2765         if (!cache)
2766                 return -ENOMEM;
2767
2768         cache->ti = ca->ti;
2769         ti->private = cache;
2770         ti->num_flush_bios = 2;
2771         ti->flush_supported = true;
2772
2773         ti->num_discard_bios = 1;
2774         ti->discards_supported = true;
2775         ti->discard_zeroes_data_unsupported = true;
2776         ti->split_discard_bios = false;
2777
2778         cache->features = ca->features;
2779         ti->per_io_data_size = get_per_bio_data_size(cache);
2780
2781         cache->callbacks.congested_fn = cache_is_congested;
2782         dm_table_add_target_callbacks(ti->table, &cache->callbacks);
2783
2784         cache->metadata_dev = ca->metadata_dev;
2785         cache->origin_dev = ca->origin_dev;
2786         cache->cache_dev = ca->cache_dev;
2787
2788         ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
2789
2790         /* FIXME: factor out this whole section */
2791         origin_blocks = cache->origin_sectors = ca->origin_sectors;
2792         origin_blocks = block_div(origin_blocks, ca->block_size);
2793         cache->origin_blocks = to_oblock(origin_blocks);
2794
2795         cache->sectors_per_block = ca->block_size;
2796         if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
2797                 r = -EINVAL;
2798                 goto bad;
2799         }
2800
2801         if (ca->block_size & (ca->block_size - 1)) {
2802                 dm_block_t cache_size = ca->cache_sectors;
2803
2804                 cache->sectors_per_block_shift = -1;
2805                 cache_size = block_div(cache_size, ca->block_size);
2806                 set_cache_size(cache, to_cblock(cache_size));
2807         } else {
2808                 cache->sectors_per_block_shift = __ffs(ca->block_size);
2809                 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
2810         }
2811
2812         r = create_cache_policy(cache, ca, error);
2813         if (r)
2814                 goto bad;
2815
2816         cache->policy_nr_args = ca->policy_argc;
2817         cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
2818
2819         r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
2820         if (r) {
2821                 *error = "Error setting cache policy's config values";
2822                 goto bad;
2823         }
2824
2825         cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
2826                                      ca->block_size, may_format,
2827                                      dm_cache_policy_get_hint_size(cache->policy));
2828         if (IS_ERR(cmd)) {
2829                 *error = "Error creating metadata object";
2830                 r = PTR_ERR(cmd);
2831                 goto bad;
2832         }
2833         cache->cmd = cmd;
2834         set_cache_mode(cache, CM_WRITE);
2835         if (get_cache_mode(cache) != CM_WRITE) {
2836                 *error = "Unable to get write access to metadata, please check/repair metadata.";
2837                 r = -EINVAL;
2838                 goto bad;
2839         }
2840
2841         if (passthrough_mode(&cache->features)) {
2842                 bool all_clean;
2843
2844                 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
2845                 if (r) {
2846                         *error = "dm_cache_metadata_all_clean() failed";
2847                         goto bad;
2848                 }
2849
2850                 if (!all_clean) {
2851                         *error = "Cannot enter passthrough mode unless all blocks are clean";
2852                         r = -EINVAL;
2853                         goto bad;
2854                 }
2855         }
2856
2857         spin_lock_init(&cache->lock);
2858         INIT_LIST_HEAD(&cache->deferred_cells);
2859         bio_list_init(&cache->deferred_bios);
2860         bio_list_init(&cache->deferred_flush_bios);
2861         bio_list_init(&cache->deferred_writethrough_bios);
2862         INIT_LIST_HEAD(&cache->quiesced_migrations);
2863         INIT_LIST_HEAD(&cache->completed_migrations);
2864         INIT_LIST_HEAD(&cache->need_commit_migrations);
2865         atomic_set(&cache->nr_allocated_migrations, 0);
2866         atomic_set(&cache->nr_io_migrations, 0);
2867         init_waitqueue_head(&cache->migration_wait);
2868
2869         init_waitqueue_head(&cache->quiescing_wait);
2870         atomic_set(&cache->quiescing, 0);
2871         atomic_set(&cache->quiescing_ack, 0);
2872
2873         r = -ENOMEM;
2874         atomic_set(&cache->nr_dirty, 0);
2875         cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
2876         if (!cache->dirty_bitset) {
2877                 *error = "could not allocate dirty bitset";
2878                 goto bad;
2879         }
2880         clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2881
2882         cache->discard_block_size =
2883                 calculate_discard_block_size(cache->sectors_per_block,
2884                                              cache->origin_sectors);
2885         cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
2886                                                               cache->discard_block_size));
2887         cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
2888         if (!cache->discard_bitset) {
2889                 *error = "could not allocate discard bitset";
2890                 goto bad;
2891         }
2892         clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2893
2894         cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2895         if (IS_ERR(cache->copier)) {
2896                 *error = "could not create kcopyd client";
2897                 r = PTR_ERR(cache->copier);
2898                 goto bad;
2899         }
2900
2901         cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2902         if (!cache->wq) {
2903                 *error = "could not create workqueue for metadata object";
2904                 goto bad;
2905         }
2906         INIT_WORK(&cache->worker, do_worker);
2907         INIT_DELAYED_WORK(&cache->waker, do_waker);
2908         cache->last_commit_jiffies = jiffies;
2909
2910         cache->prison = dm_bio_prison_create();
2911         if (!cache->prison) {
2912                 *error = "could not create bio prison";
2913                 goto bad;
2914         }
2915
2916         cache->all_io_ds = dm_deferred_set_create();
2917         if (!cache->all_io_ds) {
2918                 *error = "could not create all_io deferred set";
2919                 goto bad;
2920         }
2921
2922         cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
2923                                                          migration_cache);
2924         if (!cache->migration_pool) {
2925                 *error = "Error creating cache's migration mempool";
2926                 goto bad;
2927         }
2928
2929         cache->need_tick_bio = true;
2930         cache->sized = false;
2931         cache->invalidate = false;
2932         cache->commit_requested = false;
2933         cache->loaded_mappings = false;
2934         cache->loaded_discards = false;
2935
2936         load_stats(cache);
2937
2938         atomic_set(&cache->stats.demotion, 0);
2939         atomic_set(&cache->stats.promotion, 0);
2940         atomic_set(&cache->stats.copies_avoided, 0);
2941         atomic_set(&cache->stats.cache_cell_clash, 0);
2942         atomic_set(&cache->stats.commit_count, 0);
2943         atomic_set(&cache->stats.discard_count, 0);
2944
2945         spin_lock_init(&cache->invalidation_lock);
2946         INIT_LIST_HEAD(&cache->invalidation_requests);
2947
2948         iot_init(&cache->origin_tracker);
2949
2950         *result = cache;
2951         return 0;
2952
2953 bad:
2954         destroy(cache);
2955         return r;
2956 }
2957
2958 static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2959 {
2960         unsigned i;
2961         const char **copy;
2962
2963         copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2964         if (!copy)
2965                 return -ENOMEM;
2966         for (i = 0; i < argc; i++) {
2967                 copy[i] = kstrdup(argv[i], GFP_KERNEL);
2968                 if (!copy[i]) {
2969                         while (i--)
2970                                 kfree(copy[i]);
2971                         kfree(copy);
2972                         return -ENOMEM;
2973                 }
2974         }
2975
2976         cache->nr_ctr_args = argc;
2977         cache->ctr_args = copy;
2978
2979         return 0;
2980 }
2981
2982 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2983 {
2984         int r = -EINVAL;
2985         struct cache_args *ca;
2986         struct cache *cache = NULL;
2987
2988         ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2989         if (!ca) {
2990                 ti->error = "Error allocating memory for cache";
2991                 return -ENOMEM;
2992         }
2993         ca->ti = ti;
2994
2995         r = parse_cache_args(ca, argc, argv, &ti->error);
2996         if (r)
2997                 goto out;
2998
2999         r = cache_create(ca, &cache);
3000         if (r)
3001                 goto out;
3002
3003         r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
3004         if (r) {
3005                 destroy(cache);
3006                 goto out;
3007         }
3008
3009         ti->private = cache;
3010
3011 out:
3012         destroy_cache_args(ca);
3013         return r;
3014 }
3015
3016 /*----------------------------------------------------------------*/
3017
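/*
 * DM_MAPIO_REMAPPED tells dm core to submit the now remapped bio
 * itself; DM_MAPIO_SUBMITTED means the target has taken ownership of
 * the bio, either by deferring it to the worker thread or by
 * completing/erroring it here.
 */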
3018 static int cache_map(struct dm_target *ti, struct bio *bio)
3019 {
3020         struct cache *cache = ti->private;
3021
3022         int r;
3023         struct dm_bio_prison_cell *cell = NULL;
3024         dm_oblock_t block = get_bio_block(cache, bio);
3025         size_t pb_data_size = get_per_bio_data_size(cache);
3026         bool can_migrate = false;
3027         bool fast_promotion;
3028         struct policy_result lookup_result;
3029         struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
3030         struct old_oblock_lock ool;
3031
3032         ool.locker.fn = null_locker;
3033
3034         if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
3035                 /*
3036                  * This can only occur if the io goes to a partial block at
3037                  * the end of the origin device.  We don't cache these.
3038                  * Just remap to the origin and carry on.
3039                  */
3040                 remap_to_origin(cache, bio);
3041                 accounted_begin(cache, bio);
3042                 return DM_MAPIO_REMAPPED;
3043         }
3044
3045         if (discard_or_flush(bio)) {
3046                 defer_bio(cache, bio);
3047                 return DM_MAPIO_SUBMITTED;
3048         }
3049
3050         /*
3051          * Check to see if that block is currently migrating.
3052          */
3053         cell = alloc_prison_cell(cache);
3054         if (!cell) {
3055                 defer_bio(cache, bio);
3056                 return DM_MAPIO_SUBMITTED;
3057         }
3058
3059         r = bio_detain(cache, block, bio, cell,
3060                        (cell_free_fn) free_prison_cell,
3061                        cache, &cell);
3062         if (r) {
3063                 if (r < 0)
3064                         defer_bio(cache, bio);
3065
3066                 return DM_MAPIO_SUBMITTED;
3067         }
3068
3069         fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
3070
3071         r = policy_map(cache->policy, block, false, can_migrate, fast_promotion,
3072                        bio, &ool.locker, &lookup_result);
3073         if (r == -EWOULDBLOCK) {
3074                 cell_defer(cache, cell, true);
3075                 return DM_MAPIO_SUBMITTED;
3076
3077         } else if (r) {
3078                 DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d",
3079                             cache_device_name(cache), r);
3080                 cell_defer(cache, cell, false);
3081                 bio_io_error(bio);
3082                 return DM_MAPIO_SUBMITTED;
3083         }
3084
3085         r = DM_MAPIO_REMAPPED;
3086         switch (lookup_result.op) {
3087         case POLICY_HIT:
3088                 if (passthrough_mode(&cache->features)) {
3089                         if (bio_data_dir(bio) == WRITE) {
3090                                 /*
3091                                  * We need to invalidate this block, so
3092                                  * defer for the worker thread.
3093                                  */
3094                                 cell_defer(cache, cell, true);
3095                                 r = DM_MAPIO_SUBMITTED;
3096
3097                         } else {
3098                                 inc_miss_counter(cache, bio);
3099                                 remap_to_origin_clear_discard(cache, bio, block);
3100                                 accounted_begin(cache, bio);
3101                                 inc_ds(cache, bio, cell);
3102                                 // FIXME: we want to remap hits or misses straight
3103                                 // away rather than passing over to the worker.
3104                                 cell_defer(cache, cell, false);
3105                         }
3106
3107                 } else {
3108                         inc_hit_counter(cache, bio);
3109                         if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
3110                             !is_dirty(cache, lookup_result.cblock)) {
3111                                 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
3112                                 accounted_begin(cache, bio);
3113                                 inc_ds(cache, bio, cell);
3114                                 cell_defer(cache, cell, false);
3115
3116                         } else
3117                                 remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false);
3118                 }
3119                 break;
3120
3121         case POLICY_MISS:
3122                 inc_miss_counter(cache, bio);
3123                 if (pb->req_nr != 0) {
3124                         /*
3125                          * This is a duplicate writethrough io that is no
3126                          * longer needed because the block has been demoted.
3127                          */
3128                         bio_endio(bio);
3129                         // FIXME: remap everything as a miss
3130                         cell_defer(cache, cell, false);
3131                         r = DM_MAPIO_SUBMITTED;
3132
3133                 } else
3134                         remap_cell_to_origin_clear_discard(cache, cell, block, false);
3135                 break;
3136
3137         default:
3138                 DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u",
3139                             cache_device_name(cache), __func__,
3140                             (unsigned) lookup_result.op);
3141                 cell_defer(cache, cell, false);
3142                 bio_io_error(bio);
3143                 r = DM_MAPIO_SUBMITTED;
3144         }
3145
3146         return r;
3147 }
3148
3149 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
3150 {
3151         struct cache *cache = ti->private;
3152         unsigned long flags;
3153         size_t pb_data_size = get_per_bio_data_size(cache);
3154         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
3155
3156         if (pb->tick) {
3157                 policy_tick(cache->policy, false);
3158
3159                 spin_lock_irqsave(&cache->lock, flags);
3160                 cache->need_tick_bio = true;
3161                 spin_unlock_irqrestore(&cache->lock, flags);
3162         }
3163
3164         check_for_quiesced_migrations(cache, pb);
3165         accounted_complete(cache, bio);
3166
3167         return 0;
3168 }
3169
3170 static int write_dirty_bitset(struct cache *cache)
3171 {
3172         unsigned i, r;
3173
3174         if (get_cache_mode(cache) >= CM_READ_ONLY)
3175                 return -EINVAL;
3176
3177         for (i = 0; i < from_cblock(cache->cache_size); i++) {
3178                 r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
3179                                        is_dirty(cache, to_cblock(i)));
3180                 if (r) {
3181                         metadata_operation_failed(cache, "dm_cache_set_dirty", r);
3182                         return r;
3183                 }
3184         }
3185
3186         return 0;
3187 }
3188
3189 static int write_discard_bitset(struct cache *cache)
3190 {
3191         unsigned i, r;
3192
3193         if (get_cache_mode(cache) >= CM_READ_ONLY)
3194                 return -EINVAL;
3195
3196         r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
3197                                            cache->discard_nr_blocks);
3198         if (r) {
3199                 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
3200                 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
3201                 return r;
3202         }
3203
3204         for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
3205                 r = dm_cache_set_discard(cache->cmd, to_dblock(i),
3206                                          is_discarded(cache, to_dblock(i)));
3207                 if (r) {
3208                         metadata_operation_failed(cache, "dm_cache_set_discard", r);
3209                         return r;
3210                 }
3211         }
3212
3213         return 0;
3214 }
3215
3216 static int write_hints(struct cache *cache)
3217 {
3218         int r;
3219
3220         if (get_cache_mode(cache) >= CM_READ_ONLY)
3221                 return -EINVAL;
3222
3223         r = dm_cache_write_hints(cache->cmd, cache->policy);
3224         if (r) {
3225                 metadata_operation_failed(cache, "dm_cache_write_hints", r);
3226                 return r;
3227         }
3228
3229         return 0;
3230 }
3231
3232 /*
3233  * returns true on success
3234  */
3235 static bool sync_metadata(struct cache *cache)
3236 {
3237         int r1, r2, r3, r4;
3238
3239         r1 = write_dirty_bitset(cache);
3240         if (r1)
3241                 DMERR("%s: could not write dirty bitset", cache_device_name(cache));
3242
3243         r2 = write_discard_bitset(cache);
3244         if (r2)
3245                 DMERR("%s: could not write discard bitset", cache_device_name(cache));
3246
3247         save_stats(cache);
3248
3249         r3 = write_hints(cache);
3250         if (r3)
3251                 DMERR("%s: could not write hints", cache_device_name(cache));
3252
3253         /*
3254          * If writing the above metadata failed, we still commit, but don't
3255          * set the clean shutdown flag.  This will effectively force every
3256          * dirty bit to be set on reload.
3257          */
3258         r4 = commit(cache, !r1 && !r2 && !r3);
3259         if (r4)
3260                 DMERR("%s: could not write cache metadata", cache_device_name(cache));
3261
3262         return !r1 && !r2 && !r3 && !r4;
3263 }
3264
3265 static void cache_postsuspend(struct dm_target *ti)
3266 {
3267         struct cache *cache = ti->private;
3268
3269         start_quiescing(cache);
3270         wait_for_migrations(cache);
3271         stop_worker(cache);
3272         requeue_deferred_bios(cache);
3273         requeue_deferred_cells(cache);
3274         stop_quiescing(cache);
3275
3276         if (get_cache_mode(cache) == CM_WRITE)
3277                 (void) sync_metadata(cache);
3278 }
3279
3280 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
3281                         bool dirty, uint32_t hint, bool hint_valid)
3282 {
3283         int r;
3284         struct cache *cache = context;
3285
3286         r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
3287         if (r)
3288                 return r;
3289
3290         if (dirty)
3291                 set_dirty(cache, oblock, cblock);
3292         else
3293                 clear_dirty(cache, oblock, cblock);
3294
3295         return 0;
3296 }
3297
3298 /*
3299  * The discard block size in the on disk metadata is not
3300  * necessarily the same as the one we're currently using.  So we have to
3301  * be careful to only set the discarded attribute if we know it
3302  * covers a complete block of the new size.
3303  */
3304 struct discard_load_info {
3305         struct cache *cache;
3306
3307         /*
3308          * These blocks are sized using the on disk dblock size, rather
3309          * than the current one.
3310          */
3311         dm_block_t block_size;
3312         dm_block_t discard_begin, discard_end;
3313 };
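/*
 * A worked example with illustrative numbers: if the metadata was written
 * with a dblock size of 256 sectors and records dblocks [4, 12) as
 * discarded, that span covers sectors [1024, 3072).  With a current
 * discard_block_size of 512 sectors, set_discard_range() below rounds the
 * start up and the end down, yielding current dblocks [2, 6), so only
 * complete blocks of the new size are marked discarded.
 */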
3314
3315 static void discard_load_info_init(struct cache *cache,
3316                                    struct discard_load_info *li)
3317 {
3318         li->cache = cache;
3319         li->discard_begin = li->discard_end = 0;
3320 }
3321
3322 static void set_discard_range(struct discard_load_info *li)
3323 {
3324         sector_t b, e;
3325
3326         if (li->discard_begin == li->discard_end)
3327                 return;
3328
3329         /*
3330          * Convert to sectors.
3331          */
3332         b = li->discard_begin * li->block_size;
3333         e = li->discard_end * li->block_size;
3334
3335         /*
3336          * Then convert back to the current dblock size.
3337          */
3338         b = dm_sector_div_up(b, li->cache->discard_block_size);
3339         sector_div(e, li->cache->discard_block_size);
3340
3341         /*
3342          * The origin may have shrunk, so we need to check we're still in
3343          * bounds.
3344          */
3345         if (e > from_dblock(li->cache->discard_nr_blocks))
3346                 e = from_dblock(li->cache->discard_nr_blocks);
3347
3348         for (; b < e; b++)
3349                 set_discard(li->cache, to_dblock(b));
3350 }
3351
3352 static int load_discard(void *context, sector_t discard_block_size,
3353                         dm_dblock_t dblock, bool discard)
3354 {
3355         struct discard_load_info *li = context;
3356
3357         li->block_size = discard_block_size;
3358
3359         if (discard) {
3360                 if (from_dblock(dblock) == li->discard_end)
3361                         /*
3362                          * We're already in a discard range, just extend it.
3363                          */
3364                         li->discard_end = li->discard_end + 1ULL;
3365
3366                 else {
3367                         /*
3368                          * Emit the old range and start a new one.
3369                          */
3370                         set_discard_range(li);
3371                         li->discard_begin = from_dblock(dblock);
3372                         li->discard_end = li->discard_begin + 1ULL;
3373                 }
3374         } else {
3375                 set_discard_range(li);
3376                 li->discard_begin = li->discard_end = 0;
3377         }
3378
3379         return 0;
3380 }
3381
3382 static dm_cblock_t get_cache_dev_size(struct cache *cache)
3383 {
3384         sector_t size = get_dev_size(cache->cache_dev);
3385         (void) sector_div(size, cache->sectors_per_block);
3386         return to_cblock(size);
3387 }
3388
3389 static bool can_resize(struct cache *cache, dm_cblock_t new_size)
3390 {
3391         if (from_cblock(new_size) > from_cblock(cache->cache_size))
3392                 return true;
3393
3394         /*
3395          * We can't drop a dirty block when shrinking the cache.
3396          */
3397         while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
3398                 if (is_dirty(cache, new_size)) {
3399                         DMERR("%s: unable to shrink cache; cache block %llu is dirty",
3400                               cache_device_name(cache),
3401                               (unsigned long long) from_cblock(new_size));
3402                         return false;
3403                 }
3404                 new_size = to_cblock(from_cblock(new_size) + 1);
3405         }
3406
3407         return true;
3408 }
3409
3410 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
3411 {
3412         int r;
3413
3414         r = dm_cache_resize(cache->cmd, new_size);
3415         if (r) {
3416                 DMERR("%s: could not resize cache metadata", cache_device_name(cache));
3417                 metadata_operation_failed(cache, "dm_cache_resize", r);
3418                 return r;
3419         }
3420
3421         set_cache_size(cache, new_size);
3422
3423         return 0;
3424 }
3425
3426 static int cache_preresume(struct dm_target *ti)
3427 {
3428         int r = 0;
3429         struct cache *cache = ti->private;
3430         dm_cblock_t csize = get_cache_dev_size(cache);
3431
3432         /*
3433          * Check to see if the cache device has been resized.
3434          */
3435         if (!cache->sized) {
3436                 r = resize_cache_dev(cache, csize);
3437                 if (r)
3438                         return r;
3439
3440                 cache->sized = true;
3441
3442         } else if (csize != cache->cache_size) {
3443                 if (!can_resize(cache, csize))
3444                         return -EINVAL;
3445
3446                 r = resize_cache_dev(cache, csize);
3447                 if (r)
3448                         return r;
3449         }
3450
3451         if (!cache->loaded_mappings) {
3452                 r = dm_cache_load_mappings(cache->cmd, cache->policy,
3453                                            load_mapping, cache);
3454                 if (r) {
3455                         DMERR("%s: could not load cache mappings", cache_device_name(cache));
3456                         metadata_operation_failed(cache, "dm_cache_load_mappings", r);
3457                         return r;
3458                 }
3459
3460                 cache->loaded_mappings = true;
3461         }
3462
3463         if (!cache->loaded_discards) {
3464                 struct discard_load_info li;
3465
3466                 /*
3467                  * The discard bitset could have been resized, or the
3468                  * discard block size changed.  To be safe we start by
3469                  * setting every dblock to not discarded.
3470                  */
3471                 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
3472
3473                 discard_load_info_init(cache, &li);
3474                 r = dm_cache_load_discards(cache->cmd, load_discard, &li);
3475                 if (r) {
3476                         DMERR("%s: could not load origin discards", cache_device_name(cache));
3477                         metadata_operation_failed(cache, "dm_cache_load_discards", r);
3478                         return r;
3479                 }
3480                 set_discard_range(&li);
3481
3482                 cache->loaded_discards = true;
3483         }
3484
3485         return r;
3486 }
3487
3488 static void cache_resume(struct dm_target *ti)
3489 {
3490         struct cache *cache = ti->private;
3491
3492         cache->need_tick_bio = true;
3493         do_waker(&cache->waker.work);
3494 }
3495
3496 /*
3497  * Status format:
3498  *
3499  * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
3500  * <cache block size> <#used cache blocks>/<#total cache blocks>
3501  * <#read hits> <#read misses> <#write hits> <#write misses>
3502  * <#demotions> <#promotions> <#dirty>
3503  * <#features> <features>*
3504  * <#core args> <core args>
3505  * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check>
3506  */
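/*
 * A hypothetical STATUSTYPE_INFO line in the above format (every value
 * below is made up purely for illustration):
 *
 *   8 43/128 128 250/4096 420 61 97 12 9 15 6 1 writethrough 2 migration_threshold 2048 smq 0 rw -
 *
 * i.e. 8-sector metadata blocks with 43 of 128 in use, 128-sector cache
 * blocks with 250 of 4096 resident, the read/write hit and miss counts,
 * demotions, promotions and dirty blocks, one feature arg, the two core
 * args, the policy name and its arg count, writable metadata ("rw") and
 * no needs_check flag ("-").
 */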
3507 static void cache_status(struct dm_target *ti, status_type_t type,
3508                          unsigned status_flags, char *result, unsigned maxlen)
3509 {
3510         int r = 0;
3511         unsigned i;
3512         ssize_t sz = 0;
3513         dm_block_t nr_free_blocks_metadata = 0;
3514         dm_block_t nr_blocks_metadata = 0;
3515         char buf[BDEVNAME_SIZE];
3516         struct cache *cache = ti->private;
3517         dm_cblock_t residency;
3518         bool needs_check;
3519
3520         switch (type) {
3521         case STATUSTYPE_INFO:
3522                 if (get_cache_mode(cache) == CM_FAIL) {
3523                         DMEMIT("Fail");
3524                         break;
3525                 }
3526
3527                 /* Commit to ensure statistics aren't out-of-date */
3528                 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
3529                         (void) commit(cache, false);
3530
3531                 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata);
3532                 if (r) {
3533                         DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
3534                               cache_device_name(cache), r);
3535                         goto err;
3536                 }
3537
3538                 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
3539                 if (r) {
3540                         DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
3541                               cache_device_name(cache), r);
3542                         goto err;
3543                 }
3544
3545                 residency = policy_residency(cache->policy);
3546
3547                 DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
3548                        (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
3549                        (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3550                        (unsigned long long)nr_blocks_metadata,
3551                        cache->sectors_per_block,
3552                        (unsigned long long) from_cblock(residency),
3553                        (unsigned long long) from_cblock(cache->cache_size),
3554                        (unsigned) atomic_read(&cache->stats.read_hit),
3555                        (unsigned) atomic_read(&cache->stats.read_miss),
3556                        (unsigned) atomic_read(&cache->stats.write_hit),
3557                        (unsigned) atomic_read(&cache->stats.write_miss),
3558                        (unsigned) atomic_read(&cache->stats.demotion),
3559                        (unsigned) atomic_read(&cache->stats.promotion),
3560                        (unsigned long) atomic_read(&cache->nr_dirty));
3561
3562                 if (writethrough_mode(&cache->features))
3563                         DMEMIT("1 writethrough ");
3564
3565                 else if (passthrough_mode(&cache->features))
3566                         DMEMIT("1 passthrough ");
3567
3568                 else if (writeback_mode(&cache->features))
3569                         DMEMIT("1 writeback ");
3570
3571                 else {
3572                         DMERR("%s: internal error: unknown io mode: %d",
3573                               cache_device_name(cache), (int) cache->features.io_mode);
3574                         goto err;
3575                 }
3576
3577                 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
3578
3579                 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
3580                 if (sz < maxlen) {
3581                         r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
3582                         if (r)
3583                                 DMERR("%s: policy_emit_config_values returned %d",
3584                                       cache_device_name(cache), r);
3585                 }
3586
3587                 if (get_cache_mode(cache) == CM_READ_ONLY)
3588                         DMEMIT("ro ");
3589                 else
3590                         DMEMIT("rw ");
3591
3592                 r = dm_cache_metadata_needs_check(cache->cmd, &needs_check);
3593
3594                 if (r || needs_check)
3595                         DMEMIT("needs_check ");
3596                 else
3597                         DMEMIT("- ");
3598
3599                 break;
3600
3601         case STATUSTYPE_TABLE:
3602                 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
3603                 DMEMIT("%s ", buf);
3604                 format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
3605                 DMEMIT("%s ", buf);
3606                 format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
3607                 DMEMIT("%s", buf);
3608
3609                 for (i = 0; i < cache->nr_ctr_args - 1; i++)
3610                         DMEMIT(" %s", cache->ctr_args[i]);
3611                 if (cache->nr_ctr_args)
3612                         DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
3613         }
3614
3615         return;
3616
3617 err:
3618         DMEMIT("Error");
3619 }
3620
3621 /*
3622  * A cache block range can take two forms:
3623  *
3624  * i) A single cblock, e.g. '3456'
3625  * ii) A begin and end cblock with a hyphen between, e.g. '123-234'
3626  */
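/*
 * For example (values are illustrative): "3456" parses to the half-open
 * range [3456, 3457), and "123-234" parses to [123, 234); in both cases
 * result->end is exclusive.
 */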
3627 static int parse_cblock_range(struct cache *cache, const char *str,
3628                               struct cblock_range *result)
3629 {
3630         char dummy;
3631         uint64_t b, e;
3632         int r;
3633
3634         /*
3635          * Try and parse form (ii) first.
3636          */
3637         r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
3638         if (r < 0)
3639                 return r;
3640
3641         if (r == 2) {
3642                 result->begin = to_cblock(b);
3643                 result->end = to_cblock(e);
3644                 return 0;
3645         }
3646
3647         /*
3648          * That didn't work, try form (i).
3649          */
3650         r = sscanf(str, "%llu%c", &b, &dummy);
3651         if (r < 0)
3652                 return r;
3653
3654         if (r == 1) {
3655                 result->begin = to_cblock(b);
3656                 result->end = to_cblock(from_cblock(result->begin) + 1u);
3657                 return 0;
3658         }
3659
3660         DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
3661         return -EINVAL;
3662 }
3663
3664 static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
3665 {
3666         uint64_t b = from_cblock(range->begin);
3667         uint64_t e = from_cblock(range->end);
3668         uint64_t n = from_cblock(cache->cache_size);
3669
3670         if (b >= n) {
3671                 DMERR("%s: begin cblock out of range: %llu >= %llu",
3672                       cache_device_name(cache), b, n);
3673                 return -EINVAL;
3674         }
3675
3676         if (e > n) {
3677                 DMERR("%s: end cblock out of range: %llu > %llu",
3678                       cache_device_name(cache), e, n);
3679                 return -EINVAL;
3680         }
3681
3682         if (b >= e) {
3683                 DMERR("%s: invalid cblock range: %llu >= %llu",
3684                       cache_device_name(cache), b, e);
3685                 return -EINVAL;
3686         }
3687
3688         return 0;
3689 }
3690
3691 static int request_invalidation(struct cache *cache, struct cblock_range *range)
3692 {
3693         struct invalidation_request req;
3694
3695         INIT_LIST_HEAD(&req.list);
3696         req.cblocks = range;
3697         atomic_set(&req.complete, 0);
3698         req.err = 0;
3699         init_waitqueue_head(&req.result_wait);
3700
3701         spin_lock(&cache->invalidation_lock);
3702         list_add(&req.list, &cache->invalidation_requests);
3703         spin_unlock(&cache->invalidation_lock);
3704         wake_worker(cache);
3705
3706         wait_event(req.result_wait, atomic_read(&req.complete));
3707         return req.err;
3708 }
3709
3710 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
3711                                               const char **cblock_ranges)
3712 {
3713         int r = 0;
3714         unsigned i;
3715         struct cblock_range range;
3716
3717         if (!passthrough_mode(&cache->features)) {
3718                 DMERR("%s: cache has to be in passthrough mode for invalidation",
3719                       cache_device_name(cache));
3720                 return -EPERM;
3721         }
3722
3723         for (i = 0; i < count; i++) {
3724                 r = parse_cblock_range(cache, cblock_ranges[i], &range);
3725                 if (r)
3726                         break;
3727
3728                 r = validate_cblock_range(cache, &range);
3729                 if (r)
3730                         break;
3731
3732                 /*
3733                  * Pass the begin and end cblocks to the worker and wake it.
3734                  */
3735                 r = request_invalidation(cache, &range);
3736                 if (r)
3737                         break;
3738         }
3739
3740         return r;
3741 }
3742
3743 /*
3744  * Supports
3745  *      "<key> <value>"
3746  * and
3747  *      "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*"
3748  *
3749  * The key migration_threshold is supported by the cache target core.
3750  */
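/*
 * Illustrative usage from userspace via dmsetup (device name and values
 * are examples only):
 *
 *   dmsetup message <mapped-device> 0 migration_threshold 2048
 *   dmsetup message <mapped-device> 0 invalidate_cblocks 3456 7890-8899
 *
 * The invalidate_cblocks form is only honoured while the cache is in
 * passthrough mode (see process_invalidate_cblocks_message() above).
 */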
3751 static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
3752 {
3753         struct cache *cache = ti->private;
3754
3755         if (!argc)
3756                 return -EINVAL;
3757
3758         if (get_cache_mode(cache) >= CM_READ_ONLY) {
3759                 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
3760                       cache_device_name(cache));
3761                 return -EOPNOTSUPP;
3762         }
3763
3764         if (!strcasecmp(argv[0], "invalidate_cblocks"))
3765                 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
3766
3767         if (argc != 2)
3768                 return -EINVAL;
3769
3770         return set_config_value(cache, argv[0], argv[1]);
3771 }
3772
3773 static int cache_iterate_devices(struct dm_target *ti,
3774                                  iterate_devices_callout_fn fn, void *data)
3775 {
3776         int r = 0;
3777         struct cache *cache = ti->private;
3778
3779         r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
3780         if (!r)
3781                 r = fn(ti, cache->origin_dev, 0, ti->len, data);
3782
3783         return r;
3784 }
3785
3786 static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
3787 {
3788         /*
3789          * FIXME: these limits may be incompatible with the cache device
3790          */
3791         limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
3792                                             cache->origin_sectors);
3793         limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
3794 }
3795
3796 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3797 {
3798         struct cache *cache = ti->private;
3799         uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3800
3801         /*
3802          * If the system-determined stacked limits are compatible with the
3803          * cache's blocksize (io_opt is a factor) do not override them.
3804          */
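        /*
         * Illustrative numbers (not from a real device): with 64KiB cache
         * blocks (128 sectors), a stacked io_opt of 512KiB (1024 sectors)
         * is a whole multiple and is left untouched, whereas an io_opt of
         * 96KiB (192 sectors) is not, so io_min and io_opt are both
         * overridden to the cache block size.
         */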
3805         if (io_opt_sectors < cache->sectors_per_block ||
3806             do_div(io_opt_sectors, cache->sectors_per_block)) {
3807                 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
3808                 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
3809         }
3810         set_discard_limits(cache, limits);
3811 }
3812
3813 /*----------------------------------------------------------------*/
3814
3815 static struct target_type cache_target = {
3816         .name = "cache",
3817         .version = {1, 9, 0},
3818         .module = THIS_MODULE,
3819         .ctr = cache_ctr,
3820         .dtr = cache_dtr,
3821         .map = cache_map,
3822         .end_io = cache_end_io,
3823         .postsuspend = cache_postsuspend,
3824         .preresume = cache_preresume,
3825         .resume = cache_resume,
3826         .status = cache_status,
3827         .message = cache_message,
3828         .iterate_devices = cache_iterate_devices,
3829         .io_hints = cache_io_hints,
3830 };
3831
3832 static int __init dm_cache_init(void)
3833 {
3834         int r;
3835
3836         r = dm_register_target(&cache_target);
3837         if (r) {
3838                 DMERR("cache target registration failed: %d", r);
3839                 return r;
3840         }
3841
3842         migration_cache = KMEM_CACHE(dm_cache_migration, 0);
3843         if (!migration_cache) {
3844                 dm_unregister_target(&cache_target);
3845                 return -ENOMEM;
3846         }
3847
3848         return 0;
3849 }
3850
3851 static void __exit dm_cache_exit(void)
3852 {
3853         dm_unregister_target(&cache_target);
3854         kmem_cache_destroy(migration_cache);
3855 }
3856
3857 module_init(dm_cache_init);
3858 module_exit(dm_cache_exit);
3859
3860 MODULE_DESCRIPTION(DM_NAME " cache target");
3861 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
3862 MODULE_LICENSE("GPL");