/*
 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 */
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include "md.h"
#include "raid5.h"

/*
 * metadata/data are stored on disk in 4k units (blocks) regardless of the
 * underlying hardware sector size. This only works with PAGE_SIZE == 4096.
 */
#define BLOCK_SECTORS (8)

/*
 * reclaim runs every 1/4 of the disk size or every 10G of reclaimable space,
 * whichever is smaller. This prevents recovery from having to scan a very
 * long log.
 */
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)

struct r5l_log {
        struct md_rdev *rdev;

        u32 uuid_checksum;

        sector_t device_size;           /* log device size, rounded to
                                         * BLOCK_SECTORS */
        sector_t max_free_space;        /* reclaim runs once free space
                                         * reaches this size */

        sector_t last_checkpoint;       /* log tail. where recovery scan
                                         * starts from */
        u64 last_cp_seq;                /* log tail sequence */

        sector_t log_start;             /* log head. where new data appends */
        u64 seq;                        /* log head sequence */

        struct mutex io_mutex;
        struct r5l_io_unit *current_io; /* current io_unit accepting new data */

        spinlock_t io_list_lock;
        struct list_head running_ios;   /* io_units which are still running,
                                         * and have not yet been completely
                                         * written to the log */
        struct list_head io_end_ios;    /* io_units which have been completely
                                         * written to the log but not yet written
                                         * to the RAID */
        struct list_head flushing_ios;  /* io_units which are waiting for log
                                         * cache flush */
        struct list_head flushed_ios;   /* io_units which have settled down in
                                         * the log disk */
        struct bio flush_bio;
        struct list_head stripe_end_ios;/* io_units which have been completely
                                         * written to the RAID but have not yet
                                         * been considered for updating super */

        struct kmem_cache *io_kc;

        struct md_thread *reclaim_thread;
        unsigned long reclaim_target;   /* amount of space that needs to be
                                         * reclaimed. If it's 0, reclaim the
                                         * space used by io_units which are
                                         * already in IO_UNIT_STRIPE_END state
                                         * (i.e. reclaim doesn't wait for a
                                         * specific io_unit to switch to
                                         * IO_UNIT_STRIPE_END state) */
        wait_queue_head_t iounit_wait;

        struct list_head no_space_stripes; /* pending stripes, log has no space */
        spinlock_t no_space_stripes_lock;
};

/*
 * An IO range starts at a meta data block and ends at the next meta data
 * block. The io_unit's meta data block tracks the data/parity that follows
 * it. The io_unit is written to the log disk with a normal write; as we
 * always flush the log disk first and only then start moving data to the
 * raid disks, there is no requirement to write the io_unit with FLUSH/FUA.
 */
struct r5l_io_unit {
        struct r5l_log *log;

        struct page *meta_page; /* store meta block */
        int meta_offset;        /* current offset in meta_page */

        struct bio_list bios;
        atomic_t pending_io;    /* pending bios not written to log yet */
        struct bio *current_bio;/* current_bio accepting new data */

        atomic_t pending_stripe;/* how many stripes not flushed to raid */
        u64 seq;                /* seq number of the metablock */
        sector_t log_start;     /* where the io_unit starts */
        sector_t log_end;       /* where the io_unit ends */
        struct list_head log_sibling; /* log->running_ios */
        struct list_head stripe_list; /* stripes added to the io_unit */

        int state;
};

/* r5l_io_unit state */
enum r5l_io_unit_state {
        IO_UNIT_RUNNING = 0,    /* accepting new IO */
        IO_UNIT_IO_START = 1,   /* io_unit bios have started writing to the
                                 * log; no new bios are accepted */
        IO_UNIT_IO_END = 2,     /* io_unit bios have finished writing to the log */
        IO_UNIT_STRIPE_END = 3, /* stripe data has finished writing to raid */
};

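/*
 * The log device is used as a ring buffer: advance 'start' by 'inc' sectors
 * and wrap around at the end of the device.
 */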
static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{
        start += inc;
        if (start >= log->device_size)
                start = start - log->device_size;
        return start;
}

static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
                                  sector_t end)
{
        if (end >= start)
                return end - start;
        else
                return end + log->device_size - start;
}

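/*
 * Check whether the ring still has at least 'size' free sectors between the
 * log head (log_start) and the log tail (last_checkpoint).
 */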
static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
{
        sector_t used_size;

        used_size = r5l_ring_distance(log, log->last_checkpoint,
                                        log->log_start);

        return log->device_size > used_size + size;
}

static struct r5l_io_unit *r5l_alloc_io_unit(struct r5l_log *log)
{
        struct r5l_io_unit *io;
        /* We can't handle memory allocation failure so far */
        gfp_t gfp = GFP_NOIO | __GFP_NOFAIL;

        io = kmem_cache_zalloc(log->io_kc, gfp);
        io->log = log;
        io->meta_page = alloc_page(gfp | __GFP_ZERO);

        bio_list_init(&io->bios);
        INIT_LIST_HEAD(&io->log_sibling);
        INIT_LIST_HEAD(&io->stripe_list);
        io->state = IO_UNIT_RUNNING;
        return io;
}

static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
{
        __free_page(io->meta_page);
        kmem_cache_free(log->io_kc, io);
}

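/*
 * Move io_units from the head of 'from' to the tail of 'to' as long as they
 * have reached at least 'state'. The list order is preserved; we stop at the
 * first io_unit that has not progressed far enough.
 */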
static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
                                  enum r5l_io_unit_state state)
{
        struct r5l_io_unit *io;

        while (!list_empty(from)) {
                io = list_first_entry(from, struct r5l_io_unit, log_sibling);
                /* don't change list order */
                if (io->state >= state)
                        list_move_tail(&io->log_sibling, to);
                else
                        break;
        }
}

/*
 * We don't want too many io_units to reside in the stripe_end_ios list, as
 * that would waste a lot of memory. So we try to remove some. But we must
 * keep at least 2 io_units: the superblock must point to a valid meta block,
 * and if it is the last meta block, recovery has less to scan.
 */
static void r5l_compress_stripe_end_list(struct r5l_log *log)
{
        struct r5l_io_unit *first, *last, *io;

        first = list_first_entry(&log->stripe_end_ios,
                                 struct r5l_io_unit, log_sibling);
        last = list_last_entry(&log->stripe_end_ios,
                               struct r5l_io_unit, log_sibling);
        if (first == last)
                return;
        list_del(&first->log_sibling);
        list_del(&last->log_sibling);
        while (!list_empty(&log->stripe_end_ios)) {
                io = list_first_entry(&log->stripe_end_ios,
                                      struct r5l_io_unit, log_sibling);
                list_del(&io->log_sibling);
                first->log_end = io->log_end;
                r5l_free_io_unit(log, io);
        }
        list_add_tail(&first->log_sibling, &log->stripe_end_ios);
        list_add_tail(&last->log_sibling, &log->stripe_end_ios);
}

static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
                                    enum r5l_io_unit_state state)
{
        if (WARN_ON(io->state >= state))
                return;
        io->state = state;
}

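/*
 * Completion handler for log writes: once the last bio of an io_unit
 * finishes, mark the io_unit IO_UNIT_IO_END, move finished io_units from
 * running_ios to io_end_ios (preserving order) and wake up raid5d.
 */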
/* XXX: totally ignores I/O errors */
static void r5l_log_endio(struct bio *bio)
{
        struct r5l_io_unit *io = bio->bi_private;
        struct r5l_log *log = io->log;
        unsigned long flags;

        bio_put(bio);

        if (!atomic_dec_and_test(&io->pending_io))
                return;

        spin_lock_irqsave(&log->io_list_lock, flags);
        __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
        r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
                        IO_UNIT_IO_END);
        spin_unlock_irqrestore(&log->io_list_lock, flags);

        md_wakeup_thread(log->rdev->mddev->thread);
}

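/*
 * Close the current io_unit: fill in the meta block's size and checksum,
 * mark the io_unit IO_UNIT_IO_START and submit all of its bios to the log
 * device.
 */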
static void r5l_submit_current_io(struct r5l_log *log)
{
        struct r5l_io_unit *io = log->current_io;
        struct r5l_meta_block *block;
        struct bio *bio;
        unsigned long flags;
        u32 crc;

        if (!io)
                return;

        block = page_address(io->meta_page);
        block->meta_size = cpu_to_le32(io->meta_offset);
        crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
        block->checksum = cpu_to_le32(crc);

        log->current_io = NULL;
        spin_lock_irqsave(&log->io_list_lock, flags);
        __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
        spin_unlock_irqrestore(&log->io_list_lock, flags);

        while ((bio = bio_list_pop(&io->bios))) {
                /* all IO must start from rdev->data_offset */
                bio->bi_iter.bi_sector += log->rdev->data_offset;
                submit_bio(WRITE, bio);
        }
}

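/*
 * Start a new io_unit: initialize its meta block, allocate a bio that begins
 * with the meta page and advance the log head past the meta block.
 */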
static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
{
        struct r5l_io_unit *io;
        struct r5l_meta_block *block;
        struct bio *bio;

        io = r5l_alloc_io_unit(log);

        block = page_address(io->meta_page);
        block->magic = cpu_to_le32(R5LOG_MAGIC);
        block->version = R5LOG_VERSION;
        block->seq = cpu_to_le64(log->seq);
        block->position = cpu_to_le64(log->log_start);

        io->log_start = log->log_start;
        io->meta_offset = sizeof(struct r5l_meta_block);
        io->seq = log->seq;

        bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
        io->current_bio = bio;
        bio->bi_rw = WRITE;
        bio->bi_bdev = log->rdev->bdev;
        bio->bi_iter.bi_sector = log->log_start;
        bio_add_page(bio, io->meta_page, PAGE_SIZE, 0);
        bio->bi_end_io = r5l_log_endio;
        bio->bi_private = io;

        bio_list_add(&io->bios, bio);
        atomic_inc(&io->pending_io);

        log->seq++;
        log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
        io->log_end = log->log_start;
        /* current bio hit disk end */
        if (log->log_start == 0)
                io->current_bio = NULL;

        spin_lock_irq(&log->io_list_lock);
        list_add_tail(&io->log_sibling, &log->running_ios);
        spin_unlock_irq(&log->io_list_lock);

        return io;
}

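/*
 * Make sure the current io_unit's meta page has room for another
 * 'payload_size' bytes; if not, submit it and start a new io_unit.
 */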
static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
{
        struct r5l_io_unit *io;

        io = log->current_io;
        if (io && io->meta_offset + payload_size > PAGE_SIZE)
                r5l_submit_current_io(log);
        io = log->current_io;
        if (io)
                return 0;

        log->current_io = r5l_new_meta(log);
        return 0;
}

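/*
 * Append a data or parity payload descriptor (type, location and checksums)
 * to the current io_unit's meta block.
 */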
static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
                                    sector_t location,
                                    u32 checksum1, u32 checksum2,
                                    bool checksum2_valid)
{
        struct r5l_io_unit *io = log->current_io;
        struct r5l_payload_data_parity *payload;

        payload = page_address(io->meta_page) + io->meta_offset;
        payload->header.type = cpu_to_le16(type);
        payload->header.flags = cpu_to_le16(0);
        payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
                                    (PAGE_SHIFT - 9));
        payload->location = cpu_to_le64(location);
        payload->checksum[0] = cpu_to_le32(checksum1);
        if (checksum2_valid)
                payload->checksum[1] = cpu_to_le32(checksum2);

        io->meta_offset += sizeof(struct r5l_payload_data_parity) +
                sizeof(__le32) * (1 + !!checksum2_valid);
}

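/*
 * Add one data/parity page to the current io_unit's bio, allocating a fresh
 * bio when the current one is full or the log head has wrapped, and advance
 * the log head by one block.
 */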
static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
{
        struct r5l_io_unit *io = log->current_io;

alloc_bio:
        if (!io->current_bio) {
                struct bio *bio;

                bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
                bio->bi_rw = WRITE;
                bio->bi_bdev = log->rdev->bdev;
                bio->bi_iter.bi_sector = log->log_start;
                bio->bi_end_io = r5l_log_endio;
                bio->bi_private = io;
                bio_list_add(&io->bios, bio);
                atomic_inc(&io->pending_io);
                io->current_bio = bio;
        }
        if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) {
                io->current_bio = NULL;
                goto alloc_bio;
        }
        log->log_start = r5l_ring_add(log, log->log_start,
                                      BLOCK_SECTORS);
        /* current bio hit disk end */
        if (log->log_start == 0)
                io->current_bio = NULL;

        io->log_end = log->log_start;
}

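/*
 * Append all dirty data pages and the parity page(s) of a stripe to the log,
 * then attach the stripe to the current io_unit.
 */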
static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
                           int data_pages, int parity_pages)
{
        int i;
        int meta_size;
        struct r5l_io_unit *io;

        meta_size =
                ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
                 * data_pages) +
                sizeof(struct r5l_payload_data_parity) +
                sizeof(__le32) * parity_pages;

        r5l_get_meta(log, meta_size);
        io = log->current_io;

        for (i = 0; i < sh->disks; i++) {
                if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
                        continue;
                if (i == sh->pd_idx || i == sh->qd_idx)
                        continue;
                r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
                                        raid5_compute_blocknr(sh, i, 0),
                                        sh->dev[i].log_checksum, 0, false);
                r5l_append_payload_page(log, sh->dev[i].page);
        }

        if (sh->qd_idx >= 0) {
                r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
                                        sh->sector, sh->dev[sh->pd_idx].log_checksum,
                                        sh->dev[sh->qd_idx].log_checksum, true);
                r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
                r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
        } else {
                r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
                                        sh->sector, sh->dev[sh->pd_idx].log_checksum,
                                        0, false);
                r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
        }

        list_add_tail(&sh->log_list, &io->stripe_list);
        atomic_inc(&io->pending_stripe);
        sh->log_io = io;
}

static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
/*
 * This runs in raid5d, and reclaim could wait for raid5d too (when it flushes
 * data from the log to the raid disks), so we shouldn't wait for reclaim here.
 */
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
        int write_disks = 0;
        int data_pages, parity_pages;
        int meta_size;
        int reserve;
        int i;

        if (!log)
                return -EAGAIN;
        /* Don't support stripe batch */
        if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
            test_bit(STRIPE_SYNCING, &sh->state)) {
                /* the stripe is written to the log; we start writing it to raid */
                clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
                return -EAGAIN;
        }

        for (i = 0; i < sh->disks; i++) {
                void *addr;

                if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
                        continue;
                write_disks++;
                /* checksum was already calculated in the last run */
                if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
                        continue;
                addr = kmap_atomic(sh->dev[i].page);
                sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
                                                    addr, PAGE_SIZE);
                kunmap_atomic(addr);
        }
        parity_pages = 1 + !!(sh->qd_idx >= 0);
        data_pages = write_disks - parity_pages;

        meta_size =
                ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
                 * data_pages) +
                sizeof(struct r5l_payload_data_parity) +
                sizeof(__le32) * parity_pages;
        /* Doesn't work with very big raid arrays */
        if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
                return -EINVAL;

        set_bit(STRIPE_LOG_TRAPPED, &sh->state);
        /*
         * The stripe must enter the state machine again to finish the write,
         * so don't delay.
         */
        clear_bit(STRIPE_DELAYED, &sh->state);
        atomic_inc(&sh->count);

        mutex_lock(&log->io_mutex);
        /* meta + data */
        reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
        if (r5l_has_free_space(log, reserve))
                r5l_log_stripe(log, sh, data_pages, parity_pages);
        else {
                spin_lock(&log->no_space_stripes_lock);
                list_add_tail(&sh->log_list, &log->no_space_stripes);
                spin_unlock(&log->no_space_stripes_lock);

                r5l_wake_reclaim(log, reserve);
        }
        mutex_unlock(&log->io_mutex);

        return 0;
}

void r5l_write_stripe_run(struct r5l_log *log)
{
        if (!log)
                return;
        mutex_lock(&log->io_mutex);
        r5l_submit_current_io(log);
        mutex_unlock(&log->io_mutex);
}

int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
{
        if (!log)
                return -ENODEV;
        /*
         * we flush the log disk cache first, then write stripe data to the
         * raid disks. So if this bio is finished, the log disk cache is
         * already flushed. Recovery guarantees we can recover the bio from
         * the log disk, so we don't need to flush again
         */
        if (bio->bi_iter.bi_size == 0) {
                bio_endio(bio);
                return 0;
        }
        bio->bi_rw &= ~REQ_FLUSH;
        return -EAGAIN;
}

/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log)
{
        struct stripe_head *sh;

        spin_lock(&log->no_space_stripes_lock);
        while (!list_empty(&log->no_space_stripes)) {
                sh = list_first_entry(&log->no_space_stripes,
                                      struct stripe_head, log_list);
                list_del_init(&sh->log_list);
                set_bit(STRIPE_HANDLE, &sh->state);
                raid5_release_stripe(sh);
        }
        spin_unlock(&log->no_space_stripes_lock);
}

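/*
 * Called when the last stripe of an io_unit has been written to the raid
 * disks: move finished io_units to stripe_end_ios and kick reclaim if enough
 * space has become reclaimable.
 */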
static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
{
        struct r5l_log *log = io->log;
        struct r5l_io_unit *last;
        sector_t reclaimable_space;
        unsigned long flags;

        spin_lock_irqsave(&log->io_list_lock, flags);
        __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
        /* might move 0 entries */
        r5l_move_io_unit_list(&log->flushed_ios, &log->stripe_end_ios,
                              IO_UNIT_STRIPE_END);
        if (list_empty(&log->stripe_end_ios)) {
                spin_unlock_irqrestore(&log->io_list_lock, flags);
                return;
        }

        last = list_last_entry(&log->stripe_end_ios,
                               struct r5l_io_unit, log_sibling);
        reclaimable_space = r5l_ring_distance(log, log->last_checkpoint,
                                              last->log_end);
        if (reclaimable_space >= log->max_free_space)
                r5l_wake_reclaim(log, 0);

        r5l_compress_stripe_end_list(log);
        spin_unlock_irqrestore(&log->io_list_lock, flags);
        wake_up(&log->iounit_wait);
}

void r5l_stripe_write_finished(struct stripe_head *sh)
{
        struct r5l_io_unit *io;

        io = sh->log_io;
        sh->log_io = NULL;

        if (io && atomic_dec_and_test(&io->pending_stripe))
                __r5l_stripe_write_finished(io);
}

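/*
 * The log-device cache flush has finished: the data/parity of the flushing
 * io_units is now safely on the log media, so their stripes can be handed
 * back to the state machine for the raid-disk writes.
 */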
static void r5l_log_flush_endio(struct bio *bio)
{
        struct r5l_log *log = container_of(bio, struct r5l_log,
                flush_bio);
        unsigned long flags;
        struct r5l_io_unit *io;
        struct stripe_head *sh;

        spin_lock_irqsave(&log->io_list_lock, flags);
        list_for_each_entry(io, &log->flushing_ios, log_sibling) {
                while (!list_empty(&io->stripe_list)) {
                        sh = list_first_entry(&io->stripe_list,
                                struct stripe_head, log_list);
                        list_del_init(&sh->log_list);
                        set_bit(STRIPE_HANDLE, &sh->state);
                        raid5_release_stripe(sh);
                }
        }
        list_splice_tail_init(&log->flushing_ios, &log->flushed_ios);
        spin_unlock_irqrestore(&log->io_list_lock, flags);
}

/*
 * Starting to dispatch IO to raid.
 * The log consists of io_units, each headed by a meta block. There is one
 * situation we want to avoid: a broken meta block in the middle of the log
 * would keep recovery from finding the meta blocks at the head of the log.
 * If an operation requires the meta block at the head to be persistent in
 * the log, we must make sure the meta blocks before it are persistent in the
 * log too. A case is:
 *
 * stripe data/parity is in the log and we start writing the stripe to the
 * raid disks. The stripe data/parity must be persistent in the log before we
 * do the write to the raid disks.
 *
 * The solution is that we strictly maintain io_unit list order. In this case,
 * we only write the stripes of an io_unit to the raid disks once it and all
 * io_units before it have their data/parity safely in the log.
 */
void r5l_flush_stripe_to_raid(struct r5l_log *log)
{
        bool do_flush;
        if (!log)
                return;

        spin_lock_irq(&log->io_list_lock);
        /* flush bio is running */
        if (!list_empty(&log->flushing_ios)) {
                spin_unlock_irq(&log->io_list_lock);
                return;
        }
        list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
        do_flush = !list_empty(&log->flushing_ios);
        spin_unlock_irq(&log->io_list_lock);

        if (!do_flush)
                return;
        bio_reset(&log->flush_bio);
        log->flush_bio.bi_bdev = log->rdev->bdev;
        log->flush_bio.bi_end_io = r5l_log_flush_endio;
        submit_bio(WRITE_FLUSH, &log->flush_bio);
}

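/*
 * Wake up raid5d and wait (dropping io_list_lock while sleeping) until at
 * least one io_unit has reached the stripe_end_ios list.
 */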
static void r5l_kick_io_unit(struct r5l_log *log)
{
        md_wakeup_thread(log->rdev->mddev->thread);
        wait_event_lock_irq(log->iounit_wait, !list_empty(&log->stripe_end_ios),
                            log->io_list_lock);
}

static void r5l_write_super(struct r5l_log *log, sector_t cp);
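/*
 * Free the space of io_units that have reached IO_UNIT_STRIPE_END, waiting
 * for in-flight io_units when necessary, then update the superblock and the
 * log tail and retry stripes that were blocked on log space.
 */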
static void r5l_do_reclaim(struct r5l_log *log)
{
        struct r5l_io_unit *io, *last;
        LIST_HEAD(list);
        sector_t free = 0;
        sector_t reclaim_target = xchg(&log->reclaim_target, 0);

        spin_lock_irq(&log->io_list_lock);
        /*
         * move the proper io_units to the reclaim list. We must not change
         * the order: reclaimable and unreclaimable io_units can be mixed in
         * the list, and we shouldn't reuse the space of an unreclaimable
         * io_unit
         */
        while (1) {
                struct list_head *target_list = NULL;

                while (!list_empty(&log->stripe_end_ios)) {
                        io = list_first_entry(&log->stripe_end_ios,
                                              struct r5l_io_unit, log_sibling);
                        list_move_tail(&io->log_sibling, &list);
                        free += r5l_ring_distance(log, io->log_start,
                                                  io->log_end);
                }

                if (free >= reclaim_target ||
                    (list_empty(&log->running_ios) &&
                     list_empty(&log->io_end_ios) &&
                     list_empty(&log->flushing_ios) &&
                     list_empty(&log->flushed_ios)))
                        break;

                /* The waiting below mostly happens when we shut down the raid */
                if (!list_empty(&log->flushed_ios))
                        target_list = &log->flushed_ios;
                else if (!list_empty(&log->flushing_ios))
                        target_list = &log->flushing_ios;
                else if (!list_empty(&log->io_end_ios))
                        target_list = &log->io_end_ios;
                else if (!list_empty(&log->running_ios))
                        target_list = &log->running_ios;

                r5l_kick_io_unit(log);
        }
        spin_unlock_irq(&log->io_list_lock);

        if (list_empty(&list))
                return;

        /* super always points to the last valid meta */
        last = list_last_entry(&list, struct r5l_io_unit, log_sibling);
        /*
         * write_super will flush the cache of each raid disk. We must write
         * the super here, because the log area might be reused soon and we
         * don't want to confuse recovery
         */
        r5l_write_super(log, last->log_start);

        mutex_lock(&log->io_mutex);
        log->last_checkpoint = last->log_start;
        log->last_cp_seq = last->seq;
        mutex_unlock(&log->io_mutex);
        r5l_run_no_space_stripes(log);

        while (!list_empty(&list)) {
                io = list_first_entry(&list, struct r5l_io_unit, log_sibling);
                list_del(&io->log_sibling);
                r5l_free_io_unit(log, io);
        }
}

static void r5l_reclaim_thread(struct md_thread *thread)
{
        struct mddev *mddev = thread->mddev;
        struct r5conf *conf = mddev->private;
        struct r5l_log *log = conf->log;

        if (!log)
                return;
        r5l_do_reclaim(log);
}

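/*
 * Raise the reclaim target to 'space' sectors (unless it is already higher)
 * and wake the reclaim thread.
 */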
static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
{
        unsigned long target;
        unsigned long new = (unsigned long)space; /* overflow in theory */

        do {
                target = log->reclaim_target;
                if (new < target)
                        return;
        } while (cmpxchg(&log->reclaim_target, target, new) != target);
        md_wakeup_thread(log->reclaim_thread);
}

struct r5l_recovery_ctx {
        struct page *meta_page;         /* current meta */
        sector_t meta_total_blocks;     /* total size of current meta and data */
        sector_t pos;                   /* recovery position */
        u64 seq;                        /* recovery position seq */
};

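/*
 * Read the meta block at ctx->pos and validate its magic, version, sequence
 * number, position and checksum.
 */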
static int r5l_read_meta_block(struct r5l_log *log,
                               struct r5l_recovery_ctx *ctx)
{
        struct page *page = ctx->meta_page;
        struct r5l_meta_block *mb;
        u32 crc, stored_crc;

        if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false))
                return -EIO;

        mb = page_address(page);
        stored_crc = le32_to_cpu(mb->checksum);
        mb->checksum = 0;

        if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
            le64_to_cpu(mb->seq) != ctx->seq ||
            mb->version != R5LOG_VERSION ||
            le64_to_cpu(mb->position) != ctx->pos)
                return -EINVAL;

        crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
        if (stored_crc != crc)
                return -EINVAL;

        if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
                return -EINVAL;

        ctx->meta_total_blocks = BLOCK_SECTORS;

        return 0;
}

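/*
 * Replay one stripe from the log: read its data and parity pages back from
 * the log, verify their checksums and write them out to the raid disks (and
 * to any replacement devices).
 */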
static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
                                         struct r5l_recovery_ctx *ctx,
                                         sector_t stripe_sect,
                                         int *offset, sector_t *log_offset)
{
        struct r5conf *conf = log->rdev->mddev->private;
        struct stripe_head *sh;
        struct r5l_payload_data_parity *payload;
        int disk_index;

        sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
        while (1) {
                payload = page_address(ctx->meta_page) + *offset;

                if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
                        raid5_compute_sector(conf,
                                             le64_to_cpu(payload->location), 0,
                                             &disk_index, sh);

                        sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
                                     sh->dev[disk_index].page, READ, false);
                        sh->dev[disk_index].log_checksum =
                                le32_to_cpu(payload->checksum[0]);
                        set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
                        ctx->meta_total_blocks += BLOCK_SECTORS;
                } else {
                        disk_index = sh->pd_idx;
                        sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
                                     sh->dev[disk_index].page, READ, false);
                        sh->dev[disk_index].log_checksum =
                                le32_to_cpu(payload->checksum[0]);
                        set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);

                        if (sh->qd_idx >= 0) {
                                disk_index = sh->qd_idx;
                                sync_page_io(log->rdev,
                                             r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
                                             PAGE_SIZE, sh->dev[disk_index].page,
                                             READ, false);
                                sh->dev[disk_index].log_checksum =
                                        le32_to_cpu(payload->checksum[1]);
                                set_bit(R5_Wantwrite,
                                        &sh->dev[disk_index].flags);
                        }
                        ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
                }

                *log_offset = r5l_ring_add(log, *log_offset,
                                           le32_to_cpu(payload->size));
                *offset += sizeof(struct r5l_payload_data_parity) +
                        sizeof(__le32) *
                        (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
                if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
                        break;
        }

        for (disk_index = 0; disk_index < sh->disks; disk_index++) {
                void *addr;
                u32 checksum;

                if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
                        continue;
                addr = kmap_atomic(sh->dev[disk_index].page);
                checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
                kunmap_atomic(addr);
                if (checksum != sh->dev[disk_index].log_checksum)
                        goto error;
        }

        for (disk_index = 0; disk_index < sh->disks; disk_index++) {
                struct md_rdev *rdev, *rrdev;

                if (!test_and_clear_bit(R5_Wantwrite,
                                        &sh->dev[disk_index].flags))
                        continue;

                /* in case device is broken */
                rdev = rcu_dereference(conf->disks[disk_index].rdev);
                if (rdev)
                        sync_page_io(rdev, stripe_sect, PAGE_SIZE,
                                     sh->dev[disk_index].page, WRITE, false);
                rrdev = rcu_dereference(conf->disks[disk_index].replacement);
                if (rrdev)
                        sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
                                     sh->dev[disk_index].page, WRITE, false);
        }
        raid5_release_stripe(sh);
        return 0;

error:
        for (disk_index = 0; disk_index < sh->disks; disk_index++)
                sh->dev[disk_index].flags = 0;
        raid5_release_stripe(sh);
        return -EINVAL;
}

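/*
 * Walk the payloads of one meta block and replay each stripe it describes.
 */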
static int r5l_recovery_flush_one_meta(struct r5l_log *log,
                                       struct r5l_recovery_ctx *ctx)
{
        struct r5conf *conf = log->rdev->mddev->private;
        struct r5l_payload_data_parity *payload;
        struct r5l_meta_block *mb;
        int offset;
        sector_t log_offset;
        sector_t stripe_sector;

        mb = page_address(ctx->meta_page);
        offset = sizeof(struct r5l_meta_block);
        log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);

        while (offset < le32_to_cpu(mb->meta_size)) {
                int dd;

                payload = (void *)mb + offset;
                stripe_sector = raid5_compute_sector(conf,
                                                     le64_to_cpu(payload->location), 0, &dd, NULL);
                if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
                                                  &offset, &log_offset))
                        return -EINVAL;
        }
        return 0;
}

/* copy data/parity from log to raid disks */
static void r5l_recovery_flush_log(struct r5l_log *log,
                                   struct r5l_recovery_ctx *ctx)
{
        while (1) {
                if (r5l_read_meta_block(log, ctx))
                        return;
                if (r5l_recovery_flush_one_meta(log, ctx))
                        return;
                ctx->seq++;
                ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
        }
}

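/*
 * Write a valid, payload-free meta block at 'pos' with the given sequence
 * number, so that after recovery the superblock can point at a valid meta
 * block.
 */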
static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
                                          u64 seq)
{
        struct page *page;
        struct r5l_meta_block *mb;
        u32 crc;

        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
        if (!page)
                return -ENOMEM;
        mb = page_address(page);
        mb->magic = cpu_to_le32(R5LOG_MAGIC);
        mb->version = R5LOG_VERSION;
        mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
        mb->seq = cpu_to_le64(seq);
        mb->position = cpu_to_le64(pos);
        crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
        mb->checksum = cpu_to_le32(crc);

        if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) {
                __free_page(page);
                return -EIO;
        }
        __free_page(page);
        return 0;
}

static int r5l_recovery_log(struct r5l_log *log)
{
        struct r5l_recovery_ctx ctx;

        ctx.pos = log->last_checkpoint;
        ctx.seq = log->last_cp_seq;
        ctx.meta_page = alloc_page(GFP_KERNEL);
        if (!ctx.meta_page)
                return -ENOMEM;

        r5l_recovery_flush_log(log, &ctx);
        __free_page(ctx.meta_page);

        /*
         * We did a recovery. Now ctx.pos points to an invalid meta block. The
         * new log will start here, but we can't simply let the superblock
         * point to the last valid meta block. The log might look like:
         * | meta 1| meta 2| meta 3|
         * meta 1 is valid and meta 2 is invalid, but meta 3 could still be
         * valid. If the superblock points to meta 1 and we write a new valid
         * meta 2n there, then if a crash happens again the new recovery will
         * start from meta 1. Since meta 2n is valid now, recovery will think
         * meta 3 is valid too, which is wrong.
         * The solution is to create a new meta block at meta 2's position
         * with its seq set to meta 1's seq + 10 and let the superblock point
         * to it. The same recovery will then not consider meta 3 a valid meta
         * block, because its seq doesn't match.
         */
        if (ctx.seq > log->last_cp_seq + 1) {
                int ret;

                ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
                if (ret)
                        return ret;
                log->seq = ctx.seq + 11;
                log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
                r5l_write_super(log, ctx.pos);
        } else {
                log->log_start = ctx.pos;
                log->seq = ctx.seq;
        }
        return 0;
}

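/*
 * Record the new log tail (checkpoint) in the rdev; the next superblock
 * write makes it persistent.
 */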
static void r5l_write_super(struct r5l_log *log, sector_t cp)
{
        struct mddev *mddev = log->rdev->mddev;

        log->rdev->journal_tail = cp;
        set_bit(MD_CHANGE_DEVS, &mddev->flags);
}

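/*
 * Read the meta block that the superblock's journal_tail points to. If it is
 * missing or invalid, set up a fresh log; then run log recovery from the
 * resulting checkpoint.
 */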
static int r5l_load_log(struct r5l_log *log)
{
        struct md_rdev *rdev = log->rdev;
        struct page *page;
        struct r5l_meta_block *mb;
        sector_t cp = log->rdev->journal_tail;
        u32 stored_crc, expected_crc;
        bool create_super = false;
        int ret;

        /* Make sure it's valid */
        if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
                cp = 0;
        page = alloc_page(GFP_KERNEL);
        if (!page)
                return -ENOMEM;

        if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
                ret = -EIO;
                goto ioerr;
        }
        mb = page_address(page);

        if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
            mb->version != R5LOG_VERSION) {
                create_super = true;
                goto create;
        }
        stored_crc = le32_to_cpu(mb->checksum);
        mb->checksum = 0;
        expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
        if (stored_crc != expected_crc) {
                create_super = true;
                goto create;
        }
        if (le64_to_cpu(mb->position) != cp) {
                create_super = true;
                goto create;
        }
create:
        if (create_super) {
                log->last_cp_seq = prandom_u32();
                cp = 0;
                /*
                 * Make sure the super points to the correct address. The log
                 * might get data very soon. If the super doesn't record the
                 * correct log tail address, recovery can't find the log
                 */
                r5l_write_super(log, cp);
        } else
                log->last_cp_seq = le64_to_cpu(mb->seq);

        log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
        log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
        if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
                log->max_free_space = RECLAIM_MAX_FREE_SPACE;
        log->last_checkpoint = cp;

        __free_page(page);

        return r5l_recovery_log(log);
ioerr:
        __free_page(page);
        return ret;
}

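/*
 * Set up the r5l_log for a journal rdev: allocate the io_unit cache, start
 * the reclaim thread and load/recover any existing log.
 */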
int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
        struct r5l_log *log;

        if (PAGE_SIZE != 4096)
                return -EINVAL;
        log = kzalloc(sizeof(*log), GFP_KERNEL);
        if (!log)
                return -ENOMEM;
        log->rdev = rdev;

        log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
                                       sizeof(rdev->mddev->uuid));

        mutex_init(&log->io_mutex);

        spin_lock_init(&log->io_list_lock);
        INIT_LIST_HEAD(&log->running_ios);
        INIT_LIST_HEAD(&log->io_end_ios);
        INIT_LIST_HEAD(&log->stripe_end_ios);
        INIT_LIST_HEAD(&log->flushing_ios);
        INIT_LIST_HEAD(&log->flushed_ios);
        bio_init(&log->flush_bio);

        log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
        if (!log->io_kc)
                goto io_kc;

        log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
                                                 log->rdev->mddev, "reclaim");
        if (!log->reclaim_thread)
                goto reclaim_thread;
        init_waitqueue_head(&log->iounit_wait);

        INIT_LIST_HEAD(&log->no_space_stripes);
        spin_lock_init(&log->no_space_stripes_lock);

        if (r5l_load_log(log))
                goto error;

        conf->log = log;
        return 0;
error:
        md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
        kmem_cache_destroy(log->io_kc);
io_kc:
        kfree(log);
        return -EINVAL;
}

void r5l_exit_log(struct r5l_log *log)
{
        /*
         * at this point all stripes are finished, so every io_unit is at
         * least in STRIPE_END state
         */
        r5l_wake_reclaim(log, -1L);
        md_unregister_thread(&log->reclaim_thread);
        r5l_do_reclaim(log);
        /*
         * force a super update; r5l_do_reclaim might have updated the super.
         * mddev->thread is already stopped
         */
        md_update_sb(log->rdev->mddev, 1);

        kmem_cache_destroy(log->io_kc);
        kfree(log);
}