drivers/md/raid5-cache.c
1 /*
2  * Copyright (C) 2015 Shaohua Li <shli@fb.com>
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  */
14 #include <linux/kernel.h>
15 #include <linux/wait.h>
16 #include <linux/blkdev.h>
17 #include <linux/slab.h>
18 #include <linux/raid/md_p.h>
19 #include <linux/crc32c.h>
20 #include <linux/random.h>
21 #include "md.h"
22 #include "raid5.h"
23
24 /*
25  * metadata/data stored on disk in 4k units (blocks) regardless of the
26  * underlying hardware sector size. Only works with PAGE_SIZE == 4096
27  */
28 #define BLOCK_SECTORS (8)
29
30 /*
31  * reclaim runs once reclaimable space reaches 1/4 of the disk size or 10G,
32  * whichever is smaller. This prevents recovery from scanning a very long log
33  */
34 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
35 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
36
37 struct r5l_log {
38         struct md_rdev *rdev;
39
40         u32 uuid_checksum;
41
42         sector_t device_size;           /* log device size, rounded down to
43                                          * BLOCK_SECTORS */
44         sector_t max_free_space;        /* reclaim runs once reclaimable
45                                          * space reaches this size */
46
47         sector_t last_checkpoint;       /* log tail. where recovery scan
48                                          * starts from */
49         u64 last_cp_seq;                /* log tail sequence */
50
51         sector_t log_start;             /* log head. where new data appends */
52         u64 seq;                        /* log head sequence */
53
54         struct mutex io_mutex;
55         struct r5l_io_unit *current_io; /* current io_unit accepting new data */
56
57         spinlock_t io_list_lock;
58         struct list_head running_ios;   /* io_units which are still running,
59                                          * and have not yet been completely
60                                          * written to the log */
61         struct list_head io_end_ios;    /* io_units which have been completely
62                                          * written to the log but not yet written
63                                          * to the RAID */
64         struct list_head flushing_ios;  /* io_units which are waiting for log
65                                          * cache flush */
66         struct list_head flushed_ios;   /* io_units which have settled down in the log disk */
67         struct bio flush_bio;
68         struct list_head stripe_end_ios;/* io_units which have been completely
69                                          * written to the RAID but have not yet
70                                          * been considered for updating super */
71
72         struct kmem_cache *io_kc;
73
74         struct md_thread *reclaim_thread;
75         unsigned long reclaim_target;   /* amount of space that needs to be
76                                          * reclaimed. If it's 0, reclaim spaces
77                                          * used by io_units which are in
78                                          * IO_UNIT_STRIPE_END state (i.e. reclaim
79                                          * doesn't wait for a specific io_unit
80                                          * to switch to IO_UNIT_STRIPE_END
81                                          * state) */
82         wait_queue_head_t iounit_wait;
83
84         struct list_head no_space_stripes; /* pending stripes, log has no space */
85         spinlock_t no_space_stripes_lock;
86 };
87
88 /*
89  * an IO range starts at a meta data block and ends at the next meta data
90  * block. The io unit's meta data block tracks the data/parity that follows
91  * it. An io unit is written to the log disk with normal writes; since we
92  * always flush the log disk first and only then start moving data to the
93  * raid disks, there is no need to write the io unit with FLUSH/FUA
94  */
95 struct r5l_io_unit {
96         struct r5l_log *log;
97
98         struct page *meta_page; /* store meta block */
99         int meta_offset;        /* current offset in meta_page */
100
101         struct bio_list bios;
102         atomic_t pending_io;    /* pending bios not written to log yet */
103         struct bio *current_bio;/* current_bio accepting new data */
104
105         atomic_t pending_stripe;/* how many stripes not flushed to raid */
106         u64 seq;                /* seq number of the metablock */
107         sector_t log_start;     /* where the io_unit starts */
108         sector_t log_end;       /* where the io_unit ends */
109         struct list_head log_sibling; /* log->running_ios */
110         struct list_head stripe_list; /* stripes added to the io_unit */
111
112         int state;
113 };
114
115 /* r5l_io_unit state */
116 enum r5l_io_unit_state {
117         IO_UNIT_RUNNING = 0,    /* accepting new IO */
118         IO_UNIT_IO_START = 1,   /* io_unit bios have started writing to the
119                                  * log, not accepting new bios any more */
120         IO_UNIT_IO_END = 2,     /* io_unit bios have finished writing to the log */
121         IO_UNIT_STRIPE_END = 3, /* stripe data has finished writing to the raid */
122 };
123
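/*
 * The log device is used as a ring of BLOCK_SECTORS-sized blocks.
 * r5l_ring_add() advances a position by @inc sectors and wraps around at
 * log->device_size: e.g. with device_size == 1024, start == 1020 and
 * inc == 8 the result is 4.
 */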
124 static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
125 {
126         start += inc;
127         if (start >= log->device_size)
128                 start = start - log->device_size;
129         return start;
130 }
131
132 static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
133                                   sector_t end)
134 {
135         if (end >= start)
136                 return end - start;
137         else
138                 return end + log->device_size - start;
139 }
140
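/*
 * The used part of the ring runs from the log tail (last_checkpoint) to the
 * log head (log_start). There is room for @size sectors only if strictly
 * more than @size sectors remain, so the head can never catch up with the
 * tail.
 */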
141 static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
142 {
143         sector_t used_size;
144
145         used_size = r5l_ring_distance(log, log->last_checkpoint,
146                                         log->log_start);
147
148         return log->device_size > used_size + size;
149 }
150
151 static struct r5l_io_unit *r5l_alloc_io_unit(struct r5l_log *log)
152 {
153         struct r5l_io_unit *io;
154         /* We can't handle memory allocation failure so far */
155         gfp_t gfp = GFP_NOIO | __GFP_NOFAIL;
156
157         io = kmem_cache_zalloc(log->io_kc, gfp);
158         io->log = log;
159         io->meta_page = alloc_page(gfp | __GFP_ZERO);
160
161         bio_list_init(&io->bios);
162         INIT_LIST_HEAD(&io->log_sibling);
163         INIT_LIST_HEAD(&io->stripe_list);
164         io->state = IO_UNIT_RUNNING;
165         return io;
166 }
167
168 static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
169 {
170         __free_page(io->meta_page);
171         kmem_cache_free(log->io_kc, io);
172 }
173
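/*
 * Move io_units that have reached @state (or beyond) from the head of @from
 * to the tail of @to. io_units progress through their states in list order,
 * so we can stop at the first one that hasn't reached @state yet.
 */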
174 static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
175                                   enum r5l_io_unit_state state)
176 {
177         struct r5l_io_unit *io;
178
179         while (!list_empty(from)) {
180                 io = list_first_entry(from, struct r5l_io_unit, log_sibling);
181                 /* don't change list order */
182                 if (io->state >= state)
183                         list_move_tail(&io->log_sibling, to);
184                 else
185                         break;
186         }
187 }
188
189 /*
190  * We don't want too many io_units residing in the stripe_end_ios list, which
191  * would waste a lot of memory. So we try to remove some, but we must keep at
192  * least 2 io_units: the superblock must point to a valid meta, and if that is
193  * the last meta, recovery has less to scan
194  */
195 static void r5l_compress_stripe_end_list(struct r5l_log *log)
196 {
197         struct r5l_io_unit *first, *last, *io;
198
199         first = list_first_entry(&log->stripe_end_ios,
200                                  struct r5l_io_unit, log_sibling);
201         last = list_last_entry(&log->stripe_end_ios,
202                                struct r5l_io_unit, log_sibling);
203         if (first == last)
204                 return;
205         list_del(&first->log_sibling);
206         list_del(&last->log_sibling);
207         while (!list_empty(&log->stripe_end_ios)) {
208                 io = list_first_entry(&log->stripe_end_ios,
209                                       struct r5l_io_unit, log_sibling);
210                 list_del(&io->log_sibling);
211                 first->log_end = io->log_end;
212                 r5l_free_io_unit(log, io);
213         }
214         list_add_tail(&first->log_sibling, &log->stripe_end_ios);
215         list_add_tail(&last->log_sibling, &log->stripe_end_ios);
216 }
217
218 static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
219                                     enum r5l_io_unit_state state)
220 {
221         if (WARN_ON(io->state >= state))
222                 return;
223         io->state = state;
224 }
225
226 /* XXX: totally ignores I/O errors */
227 static void r5l_log_endio(struct bio *bio)
228 {
229         struct r5l_io_unit *io = bio->bi_private;
230         struct r5l_log *log = io->log;
231         unsigned long flags;
232
233         bio_put(bio);
234
235         if (!atomic_dec_and_test(&io->pending_io))
236                 return;
237
238         spin_lock_irqsave(&log->io_list_lock, flags);
239         __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
240         r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
241                         IO_UNIT_IO_END);
242         spin_unlock_irqrestore(&log->io_list_lock, flags);
243
244         md_wakeup_thread(log->rdev->mddev->thread);
245 }
246
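/*
 * Close the current io_unit: record the final meta_size, checksum the meta
 * block, switch the io_unit to IO_UNIT_IO_START and submit all of its bios.
 * Called with log->io_mutex held.
 */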
247 static void r5l_submit_current_io(struct r5l_log *log)
248 {
249         struct r5l_io_unit *io = log->current_io;
250         struct r5l_meta_block *block;
251         struct bio *bio;
252         unsigned long flags;
253         u32 crc;
254
255         if (!io)
256                 return;
257
258         block = page_address(io->meta_page);
259         block->meta_size = cpu_to_le32(io->meta_offset);
260         crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
261         block->checksum = cpu_to_le32(crc);
262
263         log->current_io = NULL;
264         spin_lock_irqsave(&log->io_list_lock, flags);
265         __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
266         spin_unlock_irqrestore(&log->io_list_lock, flags);
267
268         while ((bio = bio_list_pop(&io->bios))) {
269                 /* all IO must start from rdev->data_offset */
270                 bio->bi_iter.bi_sector += log->rdev->data_offset;
271                 submit_bio(WRITE, bio);
272         }
273 }
274
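/*
 * Start a new io_unit: initialize its meta block (magic, version, seq,
 * position) and queue the meta page as the first page of a fresh log bio.
 * Advances log->seq and moves log->log_start forward by one block.
 */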
275 static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
276 {
277         struct r5l_io_unit *io;
278         struct r5l_meta_block *block;
279         struct bio *bio;
280
281         io = r5l_alloc_io_unit(log);
282
283         block = page_address(io->meta_page);
284         block->magic = cpu_to_le32(R5LOG_MAGIC);
285         block->version = R5LOG_VERSION;
286         block->seq = cpu_to_le64(log->seq);
287         block->position = cpu_to_le64(log->log_start);
288
289         io->log_start = log->log_start;
290         io->meta_offset = sizeof(struct r5l_meta_block);
291         io->seq = log->seq;
292
293         bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
294         io->current_bio = bio;
295         bio->bi_rw = WRITE;
296         bio->bi_bdev = log->rdev->bdev;
297         bio->bi_iter.bi_sector = log->log_start;
298         bio_add_page(bio, io->meta_page, PAGE_SIZE, 0);
299         bio->bi_end_io = r5l_log_endio;
300         bio->bi_private = io;
301
302         bio_list_add(&io->bios, bio);
303         atomic_inc(&io->pending_io);
304
305         log->seq++;
306         log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
307         io->log_end = log->log_start;
308         /* current bio hit disk end */
309         if (log->log_start == 0)
310                 io->current_bio = NULL;
311
312         spin_lock_irq(&log->io_list_lock);
313         list_add_tail(&io->log_sibling, &log->running_ios);
314         spin_unlock_irq(&log->io_list_lock);
315
316         return io;
317 }
318
319 static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
320 {
321         struct r5l_io_unit *io;
322
323         io = log->current_io;
324         if (io && io->meta_offset + payload_size > PAGE_SIZE)
325                 r5l_submit_current_io(log);
326         io = log->current_io;
327         if (io)
328                 return 0;
329
330         log->current_io = r5l_new_meta(log);
331         return 0;
332 }
333
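/*
 * Append a payload descriptor to the current meta block: one checksum for a
 * data page, or two for the P/Q parity pages. payload->size is expressed in
 * sectors, so a single 4k page is recorded as 8 sectors.
 */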
334 static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
335                                     sector_t location,
336                                     u32 checksum1, u32 checksum2,
337                                     bool checksum2_valid)
338 {
339         struct r5l_io_unit *io = log->current_io;
340         struct r5l_payload_data_parity *payload;
341
342         payload = page_address(io->meta_page) + io->meta_offset;
343         payload->header.type = cpu_to_le16(type);
344         payload->header.flags = cpu_to_le16(0);
345         payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
346                                     (PAGE_SHIFT - 9));
347         payload->location = cpu_to_le64(location);
348         payload->checksum[0] = cpu_to_le32(checksum1);
349         if (checksum2_valid)
350                 payload->checksum[1] = cpu_to_le32(checksum2);
351
352         io->meta_offset += sizeof(struct r5l_payload_data_parity) +
353                 sizeof(__le32) * (1 + !!checksum2_valid);
354 }
355
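/*
 * Add one data/parity page to the current log bio. A new bio is started
 * whenever the current one is full or the log position wraps around the end
 * of the device; log->log_start advances by one block per page.
 */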
356 static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
357 {
358         struct r5l_io_unit *io = log->current_io;
359
360 alloc_bio:
361         if (!io->current_bio) {
362                 struct bio *bio;
363
364                 bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
365                 bio->bi_rw = WRITE;
366                 bio->bi_bdev = log->rdev->bdev;
367                 bio->bi_iter.bi_sector = log->log_start;
368                 bio->bi_end_io = r5l_log_endio;
369                 bio->bi_private = io;
370                 bio_list_add(&io->bios, bio);
371                 atomic_inc(&io->pending_io);
372                 io->current_bio = bio;
373         }
374         if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) {
375                 io->current_bio = NULL;
376                 goto alloc_bio;
377         }
378         log->log_start = r5l_ring_add(log, log->log_start,
379                                       BLOCK_SECTORS);
380         /* current bio hit disk end */
381         if (log->log_start == 0)
382                 io->current_bio = NULL;
383
384         io->log_end = log->log_start;
385 }
386
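/*
 * Append a whole stripe to the current io_unit: a descriptor plus the page
 * itself for every data disk with R5_Wantwrite set, then a single parity
 * descriptor covering P (and Q for raid6) followed by the parity page(s).
 * The stripe stays on io->stripe_list until its raid write completes.
 */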
387 static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
388                            int data_pages, int parity_pages)
389 {
390         int i;
391         int meta_size;
392         struct r5l_io_unit *io;
393
394         meta_size =
395                 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
396                  * data_pages) +
397                 sizeof(struct r5l_payload_data_parity) +
398                 sizeof(__le32) * parity_pages;
399
400         r5l_get_meta(log, meta_size);
401         io = log->current_io;
402
403         for (i = 0; i < sh->disks; i++) {
404                 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
405                         continue;
406                 if (i == sh->pd_idx || i == sh->qd_idx)
407                         continue;
408                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
409                                         raid5_compute_blocknr(sh, i, 0),
410                                         sh->dev[i].log_checksum, 0, false);
411                 r5l_append_payload_page(log, sh->dev[i].page);
412         }
413
414         if (sh->qd_idx >= 0) {
415                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
416                                         sh->sector, sh->dev[sh->pd_idx].log_checksum,
417                                         sh->dev[sh->qd_idx].log_checksum, true);
418                 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
419                 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
420         } else {
421                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
422                                         sh->sector, sh->dev[sh->pd_idx].log_checksum,
423                                         0, false);
424                 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
425         }
426
427         list_add_tail(&sh->log_list, &io->stripe_list);
428         atomic_inc(&io->pending_stripe);
429         sh->log_io = io;
430 }
431
432 static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
433 /*
434  * running in raid5d, which reclaim could itself wait for (when it flushes
435  * data from the log to the raid disks), so we must not wait for reclaim here
436  */
437 int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
438 {
439         int write_disks = 0;
440         int data_pages, parity_pages;
441         int meta_size;
442         int reserve;
443         int i;
444
445         if (!log)
446                 return -EAGAIN;
447         /* Don't support stripe batch */
448         if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
449             test_bit(STRIPE_SYNCING, &sh->state)) {
450                 /* the stripe is written to log, we start writing it to raid */
451                 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
452                 return -EAGAIN;
453         }
454
455         for (i = 0; i < sh->disks; i++) {
456                 void *addr;
457
458                 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
459                         continue;
460                 write_disks++;
461                 /* checksum is already calculated in last run */
462                 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
463                         continue;
464                 addr = kmap_atomic(sh->dev[i].page);
465                 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
466                                                     addr, PAGE_SIZE);
467                 kunmap_atomic(addr);
468         }
469         parity_pages = 1 + !!(sh->qd_idx >= 0);
470         data_pages = write_disks - parity_pages;
471
472         meta_size =
473                 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
474                  * data_pages) +
475                 sizeof(struct r5l_payload_data_parity) +
476                 sizeof(__le32) * parity_pages;
477         /* Doesn't work with a very big raid array */
478         if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
479                 return -EINVAL;
480
481         set_bit(STRIPE_LOG_TRAPPED, &sh->state);
482         atomic_inc(&sh->count);
483
484         mutex_lock(&log->io_mutex);
485         /* meta + data */
486         reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
487         if (r5l_has_free_space(log, reserve))
488                 r5l_log_stripe(log, sh, data_pages, parity_pages);
489         else {
490                 spin_lock(&log->no_space_stripes_lock);
491                 list_add_tail(&sh->log_list, &log->no_space_stripes);
492                 spin_unlock(&log->no_space_stripes_lock);
493
494                 r5l_wake_reclaim(log, reserve);
495         }
496         mutex_unlock(&log->io_mutex);
497
498         return 0;
499 }
500
501 void r5l_write_stripe_run(struct r5l_log *log)
502 {
503         if (!log)
504                 return;
505         mutex_lock(&log->io_mutex);
506         r5l_submit_current_io(log);
507         mutex_unlock(&log->io_mutex);
508 }
509
510 int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
511 {
512         if (!log)
513                 return -ENODEV;
514         /*
515          * we flush the log disk cache first, then write stripe data to the raid
516          * disks. So if this bio has finished, the log disk cache is already
517          * flushed. Recovery guarantees we can recover the bio's data from the
518          * log disk, so we don't need to flush again
519          */
520         if (bio->bi_iter.bi_size == 0) {
521                 bio_endio(bio);
522                 return 0;
523         }
524         bio->bi_rw &= ~REQ_FLUSH;
525         return -EAGAIN;
526 }
527
528 /* This will run after log space is reclaimed */
529 static void r5l_run_no_space_stripes(struct r5l_log *log)
530 {
531         struct stripe_head *sh;
532
533         spin_lock(&log->no_space_stripes_lock);
534         while (!list_empty(&log->no_space_stripes)) {
535                 sh = list_first_entry(&log->no_space_stripes,
536                                       struct stripe_head, log_list);
537                 list_del_init(&sh->log_list);
538                 set_bit(STRIPE_HANDLE, &sh->state);
539                 raid5_release_stripe(sh);
540         }
541         spin_unlock(&log->no_space_stripes_lock);
542 }
543
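/*
 * Called when the last stripe of an io_unit has been written to the raid
 * disks: move finished io_units to stripe_end_ios and, if enough space has
 * become reclaimable, wake the reclaim thread.
 */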
544 static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
545 {
546         struct r5l_log *log = io->log;
547         struct r5l_io_unit *last;
548         sector_t reclaimable_space;
549         unsigned long flags;
550
551         spin_lock_irqsave(&log->io_list_lock, flags);
552         __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
553         /* might move 0 entries */
554         r5l_move_io_unit_list(&log->flushed_ios, &log->stripe_end_ios,
555                               IO_UNIT_STRIPE_END);
556         if (list_empty(&log->stripe_end_ios)) {
557                 spin_unlock_irqrestore(&log->io_list_lock, flags);
558                 return;
559         }
560
561         last = list_last_entry(&log->stripe_end_ios,
562                                struct r5l_io_unit, log_sibling);
563         reclaimable_space = r5l_ring_distance(log, log->last_checkpoint,
564                                               last->log_end);
565         if (reclaimable_space >= log->max_free_space)
566                 r5l_wake_reclaim(log, 0);
567
568         r5l_compress_stripe_end_list(log);
569         spin_unlock_irqrestore(&log->io_list_lock, flags);
570         wake_up(&log->iounit_wait);
571 }
572
573 void r5l_stripe_write_finished(struct stripe_head *sh)
574 {
575         struct r5l_io_unit *io;
576
577         io = sh->log_io;
578         sh->log_io = NULL;
579
580         if (io && atomic_dec_and_test(&io->pending_stripe))
581                 __r5l_stripe_write_finished(io);
582 }
583
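/*
 * The log device cache flush has completed: every stripe in the flushing
 * io_units is now safely on the log disk, so hand the stripes back to raid5d
 * for the actual writes to the raid disks.
 */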
584 static void r5l_log_flush_endio(struct bio *bio)
585 {
586         struct r5l_log *log = container_of(bio, struct r5l_log,
587                 flush_bio);
588         unsigned long flags;
589         struct r5l_io_unit *io;
590         struct stripe_head *sh;
591
592         spin_lock_irqsave(&log->io_list_lock, flags);
593         list_for_each_entry(io, &log->flushing_ios, log_sibling) {
594                 while (!list_empty(&io->stripe_list)) {
595                         sh = list_first_entry(&io->stripe_list,
596                                 struct stripe_head, log_list);
597                         list_del_init(&sh->log_list);
598                         set_bit(STRIPE_HANDLE, &sh->state);
599                         raid5_release_stripe(sh);
600                 }
601         }
602         list_splice_tail_init(&log->flushing_ios, &log->flushed_ios);
603         spin_unlock_irqrestore(&log->io_list_lock, flags);
604 }
605
606 /*
607  * Start dispatching IO to the raid disks.
608  * The log is a sequence of io_units (metas). One situation we want to avoid:
609  * a broken meta in the middle of the log prevents recovery from finding the
610  * metas at the head of the log. So if an operation requires the meta at the
611  * head to be persistent in the log, we must make sure the metas before it
612  * are persistent in the log too. A case is: stripe data/parity is in the log
613  * and we start writing the stripe to the raid disks; the stripe data/parity
614  * must be persistent in the log before we do the write to the raid disks.
615  *
616  * The solution is to strictly maintain io_unit list order. In this case, we
617  * only write the stripes of an io_unit to the raid disks once every earlier
618  * io_unit's data/parity is already in the log.
619  */
620 void r5l_flush_stripe_to_raid(struct r5l_log *log)
621 {
622         bool do_flush;
623         if (!log)
624                 return;
625
626         spin_lock_irq(&log->io_list_lock);
627         /* flush bio is running */
628         if (!list_empty(&log->flushing_ios)) {
629                 spin_unlock_irq(&log->io_list_lock);
630                 return;
631         }
632         list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
633         do_flush = !list_empty(&log->flushing_ios);
634         spin_unlock_irq(&log->io_list_lock);
635
636         if (!do_flush)
637                 return;
638         bio_reset(&log->flush_bio);
639         log->flush_bio.bi_bdev = log->rdev->bdev;
640         log->flush_bio.bi_end_io = r5l_log_flush_endio;
641         submit_bio(WRITE_FLUSH, &log->flush_bio);
642 }
643
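/*
 * Wait (temporarily dropping io_list_lock) until at least one io_unit shows
 * up on stripe_end_ios; raid5d is woken first since it is what moves
 * io_units forward.
 */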
644 static void r5l_kick_io_unit(struct r5l_log *log)
645 {
646         md_wakeup_thread(log->rdev->mddev->thread);
647         wait_event_lock_irq(log->iounit_wait, !list_empty(&log->stripe_end_ios),
648                             log->io_list_lock);
649 }
650
651 static void r5l_write_super(struct r5l_log *log, sector_t cp);
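/*
 * Free log space: collect io_units whose stripes have fully reached the raid
 * disks, advance the checkpoint (log tail) past them, update the super and
 * re-run any stripes that stalled for lack of log space.
 */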
652 static void r5l_do_reclaim(struct r5l_log *log)
653 {
654         struct r5l_io_unit *io, *last;
655         LIST_HEAD(list);
656         sector_t free = 0;
657         sector_t reclaim_target = xchg(&log->reclaim_target, 0);
658
659         spin_lock_irq(&log->io_list_lock);
660         /*
661          * move proper io_units to the reclaim list. We must not change the order:
662          * reclaimable and unreclaimable io_units can be mixed in the list, and we
663          * must not reuse the space of an unreclaimable io_unit
664          */
665         while (1) {
666                 struct list_head *target_list = NULL;
667
668                 while (!list_empty(&log->stripe_end_ios)) {
669                         io = list_first_entry(&log->stripe_end_ios,
670                                               struct r5l_io_unit, log_sibling);
671                         list_move_tail(&io->log_sibling, &list);
672                         free += r5l_ring_distance(log, io->log_start,
673                                                   io->log_end);
674                 }
675
676                 if (free >= reclaim_target ||
677                     (list_empty(&log->running_ios) &&
678                      list_empty(&log->io_end_ios) &&
679                      list_empty(&log->flushing_ios) &&
680                      list_empty(&log->flushed_ios)))
681                         break;
682
683                 /* The waiting below mostly happens when we shut down the raid */
684                 if (!list_empty(&log->flushed_ios))
685                         target_list = &log->flushed_ios;
686                 else if (!list_empty(&log->flushing_ios))
687                         target_list = &log->flushing_ios;
688                 else if (!list_empty(&log->io_end_ios))
689                         target_list = &log->io_end_ios;
690                 else if (!list_empty(&log->running_ios))
691                         target_list = &log->running_ios;
692
693                 r5l_kick_io_unit(log);
694         }
695         spin_unlock_irq(&log->io_list_lock);
696
697         if (list_empty(&list))
698                 return;
699
700         /* super always points to the last valid meta */
701         last = list_last_entry(&list, struct r5l_io_unit, log_sibling);
702         /*
703          * write_super will flush the cache of each raid disk. We must write super
704          * here, because the log area might be reused soon and we don't want to
705          * confuse recovery
706          */
707         r5l_write_super(log, last->log_start);
708
709         mutex_lock(&log->io_mutex);
710         log->last_checkpoint = last->log_start;
711         log->last_cp_seq = last->seq;
712         mutex_unlock(&log->io_mutex);
713         r5l_run_no_space_stripes(log);
714
715         while (!list_empty(&list)) {
716                 io = list_first_entry(&list, struct r5l_io_unit, log_sibling);
717                 list_del(&io->log_sibling);
718                 r5l_free_io_unit(log, io);
719         }
720 }
721
722 static void r5l_reclaim_thread(struct md_thread *thread)
723 {
724         struct mddev *mddev = thread->mddev;
725         struct r5conf *conf = mddev->private;
726         struct r5l_log *log = conf->log;
727
728         if (!log)
729                 return;
730         r5l_do_reclaim(log);
731 }
732
733 static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
734 {
735         unsigned long target;
736         unsigned long new = (unsigned long)space; /* overflow in theory */
737
738         do {
739                 target = log->reclaim_target;
740                 if (new < target)
741                         return;
742         } while (cmpxchg(&log->reclaim_target, target, new) != target);
743         md_wakeup_thread(log->reclaim_thread);
744 }
745
746 struct r5l_recovery_ctx {
747         struct page *meta_page;         /* current meta */
748         sector_t meta_total_blocks;     /* total size of current meta and data */
749         sector_t pos;                   /* recovery position */
750         u64 seq;                        /* recovery position seq */
751 };
752
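/*
 * Read and validate the meta block at ctx->pos: magic, version, expected
 * sequence number, position and checksum must all match, otherwise the log
 * is considered to end here.
 */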
753 static int r5l_read_meta_block(struct r5l_log *log,
754                                struct r5l_recovery_ctx *ctx)
755 {
756         struct page *page = ctx->meta_page;
757         struct r5l_meta_block *mb;
758         u32 crc, stored_crc;
759
760         if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false))
761                 return -EIO;
762
763         mb = page_address(page);
764         stored_crc = le32_to_cpu(mb->checksum);
765         mb->checksum = 0;
766
767         if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
768             le64_to_cpu(mb->seq) != ctx->seq ||
769             mb->version != R5LOG_VERSION ||
770             le64_to_cpu(mb->position) != ctx->pos)
771                 return -EINVAL;
772
773         crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
774         if (stored_crc != crc)
775                 return -EINVAL;
776
777         if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
778                 return -EINVAL;
779
780         ctx->meta_total_blocks = BLOCK_SECTORS;
781
782         return 0;
783 }
784
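/*
 * Replay one stripe: read its data and parity pages back from the log,
 * verify their checksums and write them straight to the raid disks (and to
 * replacement devices, if any).
 */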
785 static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
786                                          struct r5l_recovery_ctx *ctx,
787                                          sector_t stripe_sect,
788                                          int *offset, sector_t *log_offset)
789 {
790         struct r5conf *conf = log->rdev->mddev->private;
791         struct stripe_head *sh;
792         struct r5l_payload_data_parity *payload;
793         int disk_index;
794
795         sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
796         while (1) {
797                 payload = page_address(ctx->meta_page) + *offset;
798
799                 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
800                         raid5_compute_sector(conf,
801                                              le64_to_cpu(payload->location), 0,
802                                              &disk_index, sh);
803
804                         sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
805                                      sh->dev[disk_index].page, READ, false);
806                         sh->dev[disk_index].log_checksum =
807                                 le32_to_cpu(payload->checksum[0]);
808                         set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
809                         ctx->meta_total_blocks += BLOCK_SECTORS;
810                 } else {
811                         disk_index = sh->pd_idx;
812                         sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
813                                      sh->dev[disk_index].page, READ, false);
814                         sh->dev[disk_index].log_checksum =
815                                 le32_to_cpu(payload->checksum[0]);
816                         set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
817
818                         if (sh->qd_idx >= 0) {
819                                 disk_index = sh->qd_idx;
820                                 sync_page_io(log->rdev,
821                                              r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
822                                              PAGE_SIZE, sh->dev[disk_index].page,
823                                              READ, false);
824                                 sh->dev[disk_index].log_checksum =
825                                         le32_to_cpu(payload->checksum[1]);
826                                 set_bit(R5_Wantwrite,
827                                         &sh->dev[disk_index].flags);
828                         }
829                         ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
830                 }
831
832                 *log_offset = r5l_ring_add(log, *log_offset,
833                                            le32_to_cpu(payload->size));
834                 *offset += sizeof(struct r5l_payload_data_parity) +
835                         sizeof(__le32) *
836                         (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
837                 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
838                         break;
839         }
840
841         for (disk_index = 0; disk_index < sh->disks; disk_index++) {
842                 void *addr;
843                 u32 checksum;
844
845                 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
846                         continue;
847                 addr = kmap_atomic(sh->dev[disk_index].page);
848                 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
849                 kunmap_atomic(addr);
850                 if (checksum != sh->dev[disk_index].log_checksum)
851                         goto error;
852         }
853
854         for (disk_index = 0; disk_index < sh->disks; disk_index++) {
855                 struct md_rdev *rdev, *rrdev;
856
857                 if (!test_and_clear_bit(R5_Wantwrite,
858                                         &sh->dev[disk_index].flags))
859                         continue;
860
861                 /* in case device is broken */
862                 rdev = rcu_dereference(conf->disks[disk_index].rdev);
863                 if (rdev)
864                         sync_page_io(rdev, stripe_sect, PAGE_SIZE,
865                                      sh->dev[disk_index].page, WRITE, false);
866                 rrdev = rcu_dereference(conf->disks[disk_index].replacement);
867                 if (rrdev)
868                         sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
869                                      sh->dev[disk_index].page, WRITE, false);
870         }
871         raid5_release_stripe(sh);
872         return 0;
873
874 error:
875         for (disk_index = 0; disk_index < sh->disks; disk_index++)
876                 sh->dev[disk_index].flags = 0;
877         raid5_release_stripe(sh);
878         return -EINVAL;
879 }
880
881 static int r5l_recovery_flush_one_meta(struct r5l_log *log,
882                                        struct r5l_recovery_ctx *ctx)
883 {
884         struct r5conf *conf = log->rdev->mddev->private;
885         struct r5l_payload_data_parity *payload;
886         struct r5l_meta_block *mb;
887         int offset;
888         sector_t log_offset;
889         sector_t stripe_sector;
890
891         mb = page_address(ctx->meta_page);
892         offset = sizeof(struct r5l_meta_block);
893         log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
894
895         while (offset < le32_to_cpu(mb->meta_size)) {
896                 int dd;
897
898                 payload = (void *)mb + offset;
899                 stripe_sector = raid5_compute_sector(conf,
900                                                      le64_to_cpu(payload->location), 0, &dd, NULL);
901                 if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
902                                                   &offset, &log_offset))
903                         return -EINVAL;
904         }
905         return 0;
906 }
907
908 /* copy data/parity from log to raid disks */
909 static void r5l_recovery_flush_log(struct r5l_log *log,
910                                    struct r5l_recovery_ctx *ctx)
911 {
912         while (1) {
913                 if (r5l_read_meta_block(log, ctx))
914                         return;
915                 if (r5l_recovery_flush_one_meta(log, ctx))
916                         return;
917                 ctx->seq++;
918                 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
919         }
920 }
921
922 static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
923                                           u64 seq)
924 {
925         struct page *page;
926         struct r5l_meta_block *mb;
927         u32 crc;
928
929         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
930         if (!page)
931                 return -ENOMEM;
932         mb = page_address(page);
933         mb->magic = cpu_to_le32(R5LOG_MAGIC);
934         mb->version = R5LOG_VERSION;
935         mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
936         mb->seq = cpu_to_le64(seq);
937         mb->position = cpu_to_le64(pos);
938         crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
939         mb->checksum = cpu_to_le32(crc);
940
941         if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) {
942                 __free_page(page);
943                 return -EIO;
944         }
945         __free_page(page);
946         return 0;
947 }
948
949 static int r5l_recovery_log(struct r5l_log *log)
950 {
951         struct r5l_recovery_ctx ctx;
952
953         ctx.pos = log->last_checkpoint;
954         ctx.seq = log->last_cp_seq;
955         ctx.meta_page = alloc_page(GFP_KERNEL);
956         if (!ctx.meta_page)
957                 return -ENOMEM;
958
959         r5l_recovery_flush_log(log, &ctx);
960         __free_page(ctx.meta_page);
961
962         /*
963          * we did a recovery. Now ctx.pos points to an invalid meta block. The new
964          * log will start here, but we can't let the superblock point to the last
965          * valid meta block. The log might look like:
966          * | meta 1 | meta 2 | meta 3 |
967          * meta 1 is valid, meta 2 is invalid and meta 3 could still look valid.
968          * If the superblock pointed to meta 1 and we later wrote a new valid meta
969          * 2n there, then after another crash recovery would start from meta 1
970          * again; since meta 2n is now valid, it would think meta 3 is valid too,
971          * which is wrong. The solution is to write a new meta at meta 2's position
972          * with its seq == meta 1's seq + 10 and let the superblock point to it.
973          * Recovery will then not treat meta 3 as valid: its seq doesn't match
974          */
975         if (ctx.seq > log->last_cp_seq + 1) {
976                 int ret;
977
978                 ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
979                 if (ret)
980                         return ret;
981                 log->seq = ctx.seq + 11;
982                 log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
983                 r5l_write_super(log, ctx.pos);
984         } else {
985                 log->log_start = ctx.pos;
986                 log->seq = ctx.seq;
987         }
988         return 0;
989 }
990
991 static void r5l_write_super(struct r5l_log *log, sector_t cp)
992 {
993         struct mddev *mddev = log->rdev->mddev;
994
995         log->rdev->journal_tail = cp;
996         set_bit(MD_CHANGE_DEVS, &mddev->flags);
997 }
998
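/*
 * Read the meta block that the superblock's journal_tail points to. If it
 * is missing or invalid, start a fresh log; otherwise pick up its sequence
 * number and run recovery from there.
 */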
999 static int r5l_load_log(struct r5l_log *log)
1000 {
1001         struct md_rdev *rdev = log->rdev;
1002         struct page *page;
1003         struct r5l_meta_block *mb;
1004         sector_t cp = log->rdev->journal_tail;
1005         u32 stored_crc, expected_crc;
1006         bool create_super = false;
1007         int ret;
1008
1009         /* Make sure it's valid */
1010         if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
1011                 cp = 0;
1012         page = alloc_page(GFP_KERNEL);
1013         if (!page)
1014                 return -ENOMEM;
1015
1016         if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
1017                 ret = -EIO;
1018                 goto ioerr;
1019         }
1020         mb = page_address(page);
1021
1022         if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1023             mb->version != R5LOG_VERSION) {
1024                 create_super = true;
1025                 goto create;
1026         }
1027         stored_crc = le32_to_cpu(mb->checksum);
1028         mb->checksum = 0;
1029         expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1030         if (stored_crc != expected_crc) {
1031                 create_super = true;
1032                 goto create;
1033         }
1034         if (le64_to_cpu(mb->position) != cp) {
1035                 create_super = true;
1036                 goto create;
1037         }
1038 create:
1039         if (create_super) {
1040                 log->last_cp_seq = prandom_u32();
1041                 cp = 0;
1042                 /*
1043                  * Make sure the super points to the correct address. The log might
1044                  * get data very soon. If the super doesn't have the correct log
1045                  * tail address, recovery can't find the log
1046                  */
1047                 r5l_write_super(log, cp);
1048         } else
1049                 log->last_cp_seq = le64_to_cpu(mb->seq);
1050
1051         log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
1052         log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
1053         if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
1054                 log->max_free_space = RECLAIM_MAX_FREE_SPACE;
1055         log->last_checkpoint = cp;
1056
1057         __free_page(page);
1058
1059         return r5l_recovery_log(log);
1060 ioerr:
1061         __free_page(page);
1062         return ret;
1063 }
1064
1065 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
1066 {
1067         struct r5l_log *log;
1068
1069         if (PAGE_SIZE != 4096)
1070                 return -EINVAL;
1071         log = kzalloc(sizeof(*log), GFP_KERNEL);
1072         if (!log)
1073                 return -ENOMEM;
1074         log->rdev = rdev;
1075
1076         log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
1077                                        sizeof(rdev->mddev->uuid));
1078
1079         mutex_init(&log->io_mutex);
1080
1081         spin_lock_init(&log->io_list_lock);
1082         INIT_LIST_HEAD(&log->running_ios);
1083         INIT_LIST_HEAD(&log->io_end_ios);
1084         INIT_LIST_HEAD(&log->stripe_end_ios);
1085         INIT_LIST_HEAD(&log->flushing_ios);
1086         INIT_LIST_HEAD(&log->flushed_ios);
1087         bio_init(&log->flush_bio);
1088
1089         log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
1090         if (!log->io_kc)
1091                 goto io_kc;
1092
1093         log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
1094                                                  log->rdev->mddev, "reclaim");
1095         if (!log->reclaim_thread)
1096                 goto reclaim_thread;
1097         init_waitqueue_head(&log->iounit_wait);
1098
1099         INIT_LIST_HEAD(&log->no_space_stripes);
1100         spin_lock_init(&log->no_space_stripes_lock);
1101
1102         if (r5l_load_log(log))
1103                 goto error;
1104
1105         conf->log = log;
1106         return 0;
1107 error:
1108         md_unregister_thread(&log->reclaim_thread);
1109 reclaim_thread:
1110         kmem_cache_destroy(log->io_kc);
1111 io_kc:
1112         kfree(log);
1113         return -EINVAL;
1114 }
1115
1116 void r5l_exit_log(struct r5l_log *log)
1117 {
1118         /*
1119          * at this point all stripes are finished, so every io_unit is at least
1120          * in STRIPE_END state
1121          */
1122         r5l_wake_reclaim(log, -1L);
1123         md_unregister_thread(&log->reclaim_thread);
1124         r5l_do_reclaim(log);
1125         /*
1126          * force a super update; r5l_do_reclaim might have updated the super.
1127          * mddev->thread is already stopped
1128          */
1129         md_update_sb(log->rdev->mddev, 1);
1130
1131         kmem_cache_destroy(log->io_kc);
1132         kfree(log);
1133 }