2 * Copyright (C) 2012 Fusion-io All rights reserved.
3 * Copyright (C) 2012 Intel Corp. All rights reserved.
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 02111-1307, USA.
19 #include <linux/sched.h>
20 #include <linux/wait.h>
21 #include <linux/bio.h>
22 #include <linux/slab.h>
23 #include <linux/buffer_head.h>
24 #include <linux/blkdev.h>
25 #include <linux/random.h>
26 #include <linux/iocontext.h>
27 #include <linux/capability.h>
28 #include <linux/ratelimit.h>
29 #include <linux/kthread.h>
30 #include <linux/raid/pq.h>
31 #include <linux/hash.h>
32 #include <linux/list_sort.h>
33 #include <linux/raid/xor.h>
34 #include <linux/vmalloc.h>
35 #include <asm/div64.h>
37 #include "extent_map.h"
39 #include "transaction.h"
40 #include "print-tree.h"
43 #include "async-thread.h"
44 #include "check-integrity.h"
45 #include "rcu-string.h"
47 /* set when additional merges to this rbio are not allowed */
48 #define RBIO_RMW_LOCKED_BIT 1
51 * set when this rbio is sitting in the hash, but it is just a cache of past read/write operations on this rbio
54 #define RBIO_CACHE_BIT 2
57 * set when it is safe to trust the stripe_pages for caching
59 #define RBIO_CACHE_READY_BIT 3
62 * bbio and raid_map are managed by the caller, so we shouldn't free
63 * them here. And besides that, all rbios with this flag should not
64 * be cached, because we need raid_map to check whether the rbios'
65 * stripes are the same, but it is very likely that the caller has
66 * freed the raid_map, so don't cache those rbios.
68 #define RBIO_HOLD_BBIO_MAP_BIT 4
70 #define RBIO_CACHE_SIZE 1024
72 struct btrfs_raid_bio {
73 struct btrfs_fs_info *fs_info;
74 struct btrfs_bio *bbio;
77 * logical block numbers for the start of each stripe
78 * The last one or two are p/q. These are sorted,
79 * so raid_map[0] is the start of our full stripe
83 /* while we're doing rmw on a stripe
84 * we put it into a hash table so we can
85 * lock the stripe and merge more rbios
88 struct list_head hash_list;
91 * LRU list for the stripe cache
93 struct list_head stripe_cache;
96 * for scheduling work in the helper threads
98 struct btrfs_work work;
101 * bio list and bio_list_lock are used
102 * to add more bios into the stripe
103 * in hopes of avoiding the full rmw
105 struct bio_list bio_list;
106 spinlock_t bio_list_lock;
108 /* also protected by the bio_list_lock, the
109 * plug list is used by the plugging code
110 * to collect partial bios while plugged. The
111 * stripe locking code also uses it to hand off
112 * the stripe lock to the next pending IO
114 struct list_head plug_list;
117 * flags that tell us if it is safe to
118 * merge with this bio
122 /* size of each individual stripe on disk */
125 /* number of data stripes (no p/q) */
129 * set if we're doing a parity rebuild
130 * for a read from higher up, which is handled
131 * differently from a parity rebuild as part of rmw
136 /* first bad stripe */
139 /* second bad stripe (for raid6 use) */
143 * number of pages needed to represent the full stripe
149 * size of all the bios in the bio_list. This
150 * helps us decide if the rbio maps to a full stripe or not
158 atomic_t stripes_pending;
162 * these are two arrays of pointers. We allocate the
163 * rbio big enough to hold them both and set up their
164 * locations when the rbio is allocated
167 /* pointers to pages that we allocated for
168 * reading/writing stripes directly from the disk (including P/Q)
170 struct page **stripe_pages;
173 * pointers to the pages in the bio_list. Stored
174 * here for faster lookup
176 struct page **bio_pages;
179 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
180 static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
181 static void rmw_work(struct btrfs_work *work);
182 static void read_rebuild_work(struct btrfs_work *work);
183 static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
184 static void async_read_rebuild(struct btrfs_raid_bio *rbio);
185 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
186 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
187 static void __free_raid_bio(struct btrfs_raid_bio *rbio);
188 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
189 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
192 * the stripe hash table is used for locking, and to collect
193 * bios in hopes of making a full stripe
195 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
197 struct btrfs_stripe_hash_table *table;
198 struct btrfs_stripe_hash_table *x;
199 struct btrfs_stripe_hash *cur;
200 struct btrfs_stripe_hash *h;
201 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
205 if (info->stripe_hash_table)
209 * The table is large, starting with order 4 and can go as high as
210 * order 7 in case lock debugging is turned on.
212 * Try harder to allocate and fall back to vmalloc to lower the chance
213 * of a failing mount.
215 table_size = sizeof(*table) + sizeof(*h) * num_entries;
216 table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
218 table = vzalloc(table_size);
223 spin_lock_init(&table->cache_lock);
224 INIT_LIST_HEAD(&table->stripe_cache);
228 for (i = 0; i < num_entries; i++) {
230 INIT_LIST_HEAD(&cur->hash_list);
231 spin_lock_init(&cur->lock);
232 init_waitqueue_head(&cur->wait);
235 x = cmpxchg(&info->stripe_hash_table, NULL, table);
237 if (is_vmalloc_addr(x))
246 * caching an rbio means copying anything from the
247 * bio_pages array into the stripe_pages array. We
248 * use the page uptodate bit in the stripe cache array
249 * to indicate if it has valid data
251 * once the caching is done, we set the cache ready bit.
254 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
261 ret = alloc_rbio_pages(rbio);
265 for (i = 0; i < rbio->nr_pages; i++) {
266 if (!rbio->bio_pages[i])
269 s = kmap(rbio->bio_pages[i]);
270 d = kmap(rbio->stripe_pages[i]);
272 memcpy(d, s, PAGE_CACHE_SIZE);
274 kunmap(rbio->bio_pages[i]);
275 kunmap(rbio->stripe_pages[i]);
276 SetPageUptodate(rbio->stripe_pages[i]);
278 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
282 * we hash on the first logical address of the stripe
284 static int rbio_bucket(struct btrfs_raid_bio *rbio)
286 u64 num = rbio->raid_map[0];
289 * we shift down quite a bit. We're using byte
290 * addressing, and most of the lower bits are zeros.
291 * This tends to upset hash_64, and it consistently
292 * returns just one or two different values.
294 * shifting off the lower bits fixes things.
296 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
300 * stealing an rbio means taking all the uptodate pages from the stripe
301 * array in the source rbio and putting them into the destination rbio
303 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
309 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
312 for (i = 0; i < dest->nr_pages; i++) {
313 s = src->stripe_pages[i];
314 if (!s || !PageUptodate(s)) {
318 d = dest->stripe_pages[i];
322 dest->stripe_pages[i] = s;
323 src->stripe_pages[i] = NULL;
328 * merging means we take the bio_list from the victim and
329 * splice it into the destination. The victim should
330 * be discarded afterwards.
332 * must be called with dest->bio_list_lock held
334 static void merge_rbio(struct btrfs_raid_bio *dest,
335 struct btrfs_raid_bio *victim)
337 bio_list_merge(&dest->bio_list, &victim->bio_list);
338 dest->bio_list_bytes += victim->bio_list_bytes;
339 bio_list_init(&victim->bio_list);
343 * used to prune items that are in the cache. The caller
344 * must hold the hash table lock.
346 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
348 int bucket = rbio_bucket(rbio);
349 struct btrfs_stripe_hash_table *table;
350 struct btrfs_stripe_hash *h;
354 * check the bit again under the hash table lock.
356 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
359 table = rbio->fs_info->stripe_hash_table;
360 h = table->table + bucket;
362 /* hold the lock for the bucket because we may be
363 * removing it from the hash table
368 * hold the lock for the bio list because we need
369 * to make sure the bio list is empty
371 spin_lock(&rbio->bio_list_lock);
373 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
374 list_del_init(&rbio->stripe_cache);
375 table->cache_size -= 1;
378 /* if the bio list isn't empty, this rbio is
379 * still involved in an IO. We take it out
380 * of the cache list, and drop the ref that
381 * was held for the list.
383 * If the bio_list was empty, we also remove
384 * the rbio from the hash_table, and drop
385 * the corresponding ref
387 if (bio_list_empty(&rbio->bio_list)) {
388 if (!list_empty(&rbio->hash_list)) {
389 list_del_init(&rbio->hash_list);
390 atomic_dec(&rbio->refs);
391 BUG_ON(!list_empty(&rbio->plug_list));
396 spin_unlock(&rbio->bio_list_lock);
397 spin_unlock(&h->lock);
400 __free_raid_bio(rbio);
404 * prune a given rbio from the cache
406 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
408 struct btrfs_stripe_hash_table *table;
411 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
414 table = rbio->fs_info->stripe_hash_table;
416 spin_lock_irqsave(&table->cache_lock, flags);
417 __remove_rbio_from_cache(rbio);
418 spin_unlock_irqrestore(&table->cache_lock, flags);
422 * remove everything in the cache
424 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
426 struct btrfs_stripe_hash_table *table;
428 struct btrfs_raid_bio *rbio;
430 table = info->stripe_hash_table;
432 spin_lock_irqsave(&table->cache_lock, flags);
433 while (!list_empty(&table->stripe_cache)) {
434 rbio = list_entry(table->stripe_cache.next,
435 struct btrfs_raid_bio,
437 __remove_rbio_from_cache(rbio);
439 spin_unlock_irqrestore(&table->cache_lock, flags);
443 * remove all cached entries and free the hash table
446 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
448 if (!info->stripe_hash_table)
450 btrfs_clear_rbio_cache(info);
451 if (is_vmalloc_addr(info->stripe_hash_table))
452 vfree(info->stripe_hash_table);
454 kfree(info->stripe_hash_table);
455 info->stripe_hash_table = NULL;
459 * insert an rbio into the stripe cache. It
460 * must have already been prepared by calling cache_rbio_pages
463 * If this rbio was already cached, it gets
464 * moved to the front of the lru.
466 * If the size of the rbio cache is too big, we prune an item.
469 static void cache_rbio(struct btrfs_raid_bio *rbio)
471 struct btrfs_stripe_hash_table *table;
474 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
477 table = rbio->fs_info->stripe_hash_table;
479 spin_lock_irqsave(&table->cache_lock, flags);
480 spin_lock(&rbio->bio_list_lock);
482 /* bump our ref if we were not in the list before */
483 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
484 atomic_inc(&rbio->refs);
486 if (!list_empty(&rbio->stripe_cache)) {
487 list_move(&rbio->stripe_cache, &table->stripe_cache);
489 list_add(&rbio->stripe_cache, &table->stripe_cache);
490 table->cache_size += 1;
493 spin_unlock(&rbio->bio_list_lock);
495 if (table->cache_size > RBIO_CACHE_SIZE) {
496 struct btrfs_raid_bio *found;
498 found = list_entry(table->stripe_cache.prev,
499 struct btrfs_raid_bio,
503 __remove_rbio_from_cache(found);
506 spin_unlock_irqrestore(&table->cache_lock, flags);
511 * helper function to run the xor_blocks api. It is only
512 * able to do MAX_XOR_BLOCKS at a time, so we need to loop through.
515 static void run_xor(void **pages, int src_cnt, ssize_t len)
519 void *dest = pages[src_cnt];
522 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
523 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
525 src_cnt -= xor_src_cnt;
526 src_off += xor_src_cnt;
531 * returns true if the bio list inside this rbio
532 * covers an entire stripe (no rmw required).
533 * Must be called with the bio list lock held, or
534 * at a time when you know it is impossible to add
535 * new bios into the list
537 static int __rbio_is_full(struct btrfs_raid_bio *rbio)
539 unsigned long size = rbio->bio_list_bytes;
542 if (size != rbio->nr_data * rbio->stripe_len)
545 BUG_ON(size > rbio->nr_data * rbio->stripe_len);
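/*
 * same check as __rbio_is_full, but this one takes the bio_list_lock
 * so it is safe to call without any other locks held
 */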
549 static int rbio_is_full(struct btrfs_raid_bio *rbio)
554 spin_lock_irqsave(&rbio->bio_list_lock, flags);
555 ret = __rbio_is_full(rbio);
556 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
561 * returns 1 if it is safe to merge two rbios together.
562 * The merging is safe if the two rbios correspond to
563 * the same stripe and if they are both going in the same
564 * direction (read vs write), and if neither one is
565 * locked for final IO
567 * The caller is responsible for locking such that
568 * rmw_locked is safe to test
570 static int rbio_can_merge(struct btrfs_raid_bio *last,
571 struct btrfs_raid_bio *cur)
573 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
574 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
578 * we can't merge with cached rbios, since the
579 * idea is that when we merge the destination
580 * rbio is going to run our IO for us. We can
581 * steal from cached rbios though, other functions handle that.
584 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
585 test_bit(RBIO_CACHE_BIT, &cur->flags))
588 if (last->raid_map[0] !=
592 /* reads can't merge with writes */
593 if (last->read_rebuild !=
602 * helper to index into the pstripe
604 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
606 index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
607 return rbio->stripe_pages[index];
611 * helper to index into the qstripe, returns null
612 * if there is no qstripe
614 static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
616 if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
619 index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
621 return rbio->stripe_pages[index];
625 * The first stripe in the table for a logical address
626 * has the lock. rbios are added in one of three ways:
628 * 1) Nobody has the stripe locked yet. The rbio is given
629 * the lock and 0 is returned. The caller must start the IO
632 * 2) Someone has the stripe locked, but we're able to merge
633 * with the lock owner. The rbio is freed and the IO will
634 * start automatically along with the existing rbio. 1 is returned.
636 * 3) Someone has the stripe locked, but we're not able to merge.
637 * The rbio is added to the lock owner's plug list, or merged into
638 * an rbio already on the plug list. When the lock owner unlocks,
639 * the next rbio on the list is run and the IO is started automatically.
642 * If we return 0, the caller still owns the rbio and must continue with
643 * IO submission. If we return 1, the caller must assume the rbio has
644 * already been freed.
646 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
648 int bucket = rbio_bucket(rbio);
649 struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
650 struct btrfs_raid_bio *cur;
651 struct btrfs_raid_bio *pending;
654 struct btrfs_raid_bio *freeit = NULL;
655 struct btrfs_raid_bio *cache_drop = NULL;
659 spin_lock_irqsave(&h->lock, flags);
660 list_for_each_entry(cur, &h->hash_list, hash_list) {
662 if (cur->raid_map[0] == rbio->raid_map[0]) {
663 spin_lock(&cur->bio_list_lock);
665 /* can we steal this cached rbio's pages? */
666 if (bio_list_empty(&cur->bio_list) &&
667 list_empty(&cur->plug_list) &&
668 test_bit(RBIO_CACHE_BIT, &cur->flags) &&
669 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
670 list_del_init(&cur->hash_list);
671 atomic_dec(&cur->refs);
673 steal_rbio(cur, rbio);
675 spin_unlock(&cur->bio_list_lock);
680 /* can we merge into the lock owner? */
681 if (rbio_can_merge(cur, rbio)) {
682 merge_rbio(cur, rbio);
683 spin_unlock(&cur->bio_list_lock);
691 * we couldn't merge with the running
692 * rbio, see if we can merge with the
693 * pending ones. We don't have to
694 * check for rmw_locked because there
695 * is no way they are inside finish_rmw
698 list_for_each_entry(pending, &cur->plug_list,
700 if (rbio_can_merge(pending, rbio)) {
701 merge_rbio(pending, rbio);
702 spin_unlock(&cur->bio_list_lock);
709 /* no merging, put us on the tail of the plug list,
710 * our rbio will be started when the currently
711 * running rbio unlocks
713 list_add_tail(&rbio->plug_list, &cur->plug_list);
714 spin_unlock(&cur->bio_list_lock);
720 atomic_inc(&rbio->refs);
721 list_add(&rbio->hash_list, &h->hash_list);
723 spin_unlock_irqrestore(&h->lock, flags);
725 remove_rbio_from_cache(cache_drop);
727 __free_raid_bio(freeit);
732 * called as rmw or parity rebuild is completed. If the plug list has more
733 * rbios waiting for this stripe, the next one on the list will be started
735 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
738 struct btrfs_stripe_hash *h;
742 bucket = rbio_bucket(rbio);
743 h = rbio->fs_info->stripe_hash_table->table + bucket;
745 if (list_empty(&rbio->plug_list))
748 spin_lock_irqsave(&h->lock, flags);
749 spin_lock(&rbio->bio_list_lock);
751 if (!list_empty(&rbio->hash_list)) {
753 * if we're still cached and there is no other IO
754 * to perform, just leave this rbio here for others
755 * to steal from later
757 if (list_empty(&rbio->plug_list) &&
758 test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
760 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
761 BUG_ON(!bio_list_empty(&rbio->bio_list));
765 list_del_init(&rbio->hash_list);
766 atomic_dec(&rbio->refs);
769 * we use the plug list to hold all the rbios
770 * waiting for the chance to lock this stripe.
771 * hand the lock over to one of them.
773 if (!list_empty(&rbio->plug_list)) {
774 struct btrfs_raid_bio *next;
775 struct list_head *head = rbio->plug_list.next;
777 next = list_entry(head, struct btrfs_raid_bio,
780 list_del_init(&rbio->plug_list);
782 list_add(&next->hash_list, &h->hash_list);
783 atomic_inc(&next->refs);
784 spin_unlock(&rbio->bio_list_lock);
785 spin_unlock_irqrestore(&h->lock, flags);
787 if (next->read_rebuild)
788 async_read_rebuild(next);
790 steal_rbio(rbio, next);
791 async_rmw_stripe(next);
795 } else if (waitqueue_active(&h->wait)) {
796 spin_unlock(&rbio->bio_list_lock);
797 spin_unlock_irqrestore(&h->lock, flags);
803 spin_unlock(&rbio->bio_list_lock);
804 spin_unlock_irqrestore(&h->lock, flags);
808 remove_rbio_from_cache(rbio);
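/*
 * free the bbio and raid_map, but only when 'need' indicates that we
 * own them; callers pass 0 when the original submitter still holds
 * them (RBIO_HOLD_BBIO_MAP_BIT)
 */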
812 __free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
820 static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
822 __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
823 !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
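/*
 * drop a reference on the rbio. Once the last reference is gone the
 * stripe pages are released and the bbio/raid_map are freed if we own them
 */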
826 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
830 WARN_ON(atomic_read(&rbio->refs) < 0);
831 if (!atomic_dec_and_test(&rbio->refs))
834 WARN_ON(!list_empty(&rbio->stripe_cache));
835 WARN_ON(!list_empty(&rbio->hash_list));
836 WARN_ON(!bio_list_empty(&rbio->bio_list));
838 for (i = 0; i < rbio->nr_pages; i++) {
839 if (rbio->stripe_pages[i]) {
840 __free_page(rbio->stripe_pages[i]);
841 rbio->stripe_pages[i] = NULL;
845 free_bbio_and_raid_map(rbio);
850 static void free_raid_bio(struct btrfs_raid_bio *rbio)
853 __free_raid_bio(rbio);
857 * this frees the rbio and runs through all the bios in the
858 * bio_list and calls end_io on them
860 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
862 struct bio *cur = bio_list_get(&rbio->bio_list);
870 set_bit(BIO_UPTODATE, &cur->bi_flags);
877 * end io function used by finish_rmw. When we finally
878 * get here, we've written a full stripe
880 static void raid_write_end_io(struct bio *bio, int err)
882 struct btrfs_raid_bio *rbio = bio->bi_private;
885 fail_bio_stripe(rbio, bio);
889 if (!atomic_dec_and_test(&rbio->stripes_pending))
894 /* OK, we have written all the stripes we need to. */
895 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
898 rbio_orig_end_io(rbio, err, 0);
903 * the read/modify/write code wants to use the original bio for
904 * any pages it included, and then use the rbio for everything
905 * else. This function decides if a given index (stripe number)
906 * and page number in that stripe fall inside the original bio
909 * if you set bio_list_only, you'll get a NULL back for any ranges
910 * that are outside the bio_list
912 * This doesn't take any refs on anything, you get a bare page pointer
913 * and the caller must bump refs as required.
915 * You must call index_rbio_pages once before you can trust
916 * the answers from this function.
918 static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
919 int index, int pagenr, int bio_list_only)
922 struct page *p = NULL;
924 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
926 spin_lock_irq(&rbio->bio_list_lock);
927 p = rbio->bio_pages[chunk_page];
928 spin_unlock_irq(&rbio->bio_list_lock);
930 if (p || bio_list_only)
933 return rbio->stripe_pages[chunk_page];
937 * number of pages we need for the entire stripe across all the drives
940 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
942 unsigned long nr = stripe_len * nr_stripes;
943 return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
947 * allocation and initial setup for the btrfs_raid_bio. Note that
948 * this does not allocate any pages for rbio->pages.
950 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
951 struct btrfs_bio *bbio, u64 *raid_map,
954 struct btrfs_raid_bio *rbio;
956 int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
959 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
962 return ERR_PTR(-ENOMEM);
964 bio_list_init(&rbio->bio_list);
965 INIT_LIST_HEAD(&rbio->plug_list);
966 spin_lock_init(&rbio->bio_list_lock);
967 INIT_LIST_HEAD(&rbio->stripe_cache);
968 INIT_LIST_HEAD(&rbio->hash_list);
970 rbio->raid_map = raid_map;
971 rbio->fs_info = root->fs_info;
972 rbio->stripe_len = stripe_len;
973 rbio->nr_pages = num_pages;
976 atomic_set(&rbio->refs, 1);
977 atomic_set(&rbio->error, 0);
978 atomic_set(&rbio->stripes_pending, 0);
981 * the stripe_pages and bio_pages arrays point to the extra
982 * memory we allocated past the end of the rbio
985 rbio->stripe_pages = p;
986 rbio->bio_pages = p + sizeof(struct page *) * num_pages;
988 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
989 nr_data = bbio->num_stripes - 2;
991 nr_data = bbio->num_stripes - 1;
993 rbio->nr_data = nr_data;
997 /* allocate pages for all the stripes in the bio, including parity */
998 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1003 for (i = 0; i < rbio->nr_pages; i++) {
1004 if (rbio->stripe_pages[i])
1006 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1009 rbio->stripe_pages[i] = page;
1010 ClearPageUptodate(page);
1015 /* allocate pages for just the p/q stripes */
1016 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1021 i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
1023 for (; i < rbio->nr_pages; i++) {
1024 if (rbio->stripe_pages[i])
1026 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1029 rbio->stripe_pages[i] = page;
1035 * add a single page from a specific stripe into our list of bios for IO.
1036 * This will try to merge into existing bios if possible, and returns
1037 * zero if all went well.
1039 static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1040 struct bio_list *bio_list,
1043 unsigned long page_index,
1044 unsigned long bio_max_len)
1046 struct bio *last = bio_list->tail;
1050 struct btrfs_bio_stripe *stripe;
1053 stripe = &rbio->bbio->stripes[stripe_nr];
1054 disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
1056 /* if the device is missing, just fail this stripe */
1057 if (!stripe->dev->bdev)
1058 return fail_rbio_index(rbio, stripe_nr);
1060 /* see if we can add this page onto our existing bio */
1062 last_end = (u64)last->bi_iter.bi_sector << 9;
1063 last_end += last->bi_iter.bi_size;
1066 * we can't merge these if they are from different
1067 * devices or if they are not contiguous
1069 if (last_end == disk_start && stripe->dev->bdev &&
1070 test_bit(BIO_UPTODATE, &last->bi_flags) &&
1071 last->bi_bdev == stripe->dev->bdev) {
1072 ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
1073 if (ret == PAGE_CACHE_SIZE)
1078 /* put a new bio on the list */
1079 bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT ?: 1);
1083 bio->bi_iter.bi_size = 0;
1084 bio->bi_bdev = stripe->dev->bdev;
1085 bio->bi_iter.bi_sector = disk_start >> 9;
1086 set_bit(BIO_UPTODATE, &bio->bi_flags);
1088 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
1089 bio_list_add(bio_list, bio);
1094 * while we're doing the read/modify/write cycle, we could
1095 * have errors in reading pages off the disk. This checks
1096 * for errors and if we're not able to read the page it'll
1097 * trigger parity reconstruction. The rmw will be finished
1098 * after we've reconstructed the failed stripes
1100 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1102 if (rbio->faila >= 0 || rbio->failb >= 0) {
1103 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
1104 __raid56_parity_recover(rbio);
1111 * these are just the pages from the rbio array, not from anything
1112 * the FS sent down to us
1114 static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
1117 index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
1119 return rbio->stripe_pages[index];
1123 * helper function to walk our bio list and populate the bio_pages array with
1124 * the result. This seems expensive, but it is faster than constantly
1125 * searching through the bio list as we set up the IO in finish_rmw or stripe reconstruction.
1128 * This must be called before you trust the answers from page_in_rbio
1130 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1134 unsigned long stripe_offset;
1135 unsigned long page_index;
1139 spin_lock_irq(&rbio->bio_list_lock);
1140 bio_list_for_each(bio, &rbio->bio_list) {
1141 start = (u64)bio->bi_iter.bi_sector << 9;
1142 stripe_offset = start - rbio->raid_map[0];
1143 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
1145 for (i = 0; i < bio->bi_vcnt; i++) {
1146 p = bio->bi_io_vec[i].bv_page;
1147 rbio->bio_pages[page_index + i] = p;
1150 spin_unlock_irq(&rbio->bio_list_lock);
1154 * this is called from one of two situations. We either
1155 * have a full stripe from the higher layers, or we've read all
1156 * the missing bits off disk.
1158 * This will calculate the parity and then send down any writes required.
1161 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1163 struct btrfs_bio *bbio = rbio->bbio;
1164 void *pointers[bbio->num_stripes];
1165 int stripe_len = rbio->stripe_len;
1166 int nr_data = rbio->nr_data;
1171 struct bio_list bio_list;
1173 int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
1176 bio_list_init(&bio_list);
1178 if (bbio->num_stripes - rbio->nr_data == 1) {
1179 p_stripe = bbio->num_stripes - 1;
1180 } else if (bbio->num_stripes - rbio->nr_data == 2) {
1181 p_stripe = bbio->num_stripes - 2;
1182 q_stripe = bbio->num_stripes - 1;
1187 /* at this point we either have a full stripe,
1188 * or we've read the full stripe from the drive.
1189 * recalculate the parity and write the new results.
1191 * We're not allowed to add any new bios to the
1192 * bio list here, anyone else that wants to
1193 * change this stripe needs to do their own rmw.
1195 spin_lock_irq(&rbio->bio_list_lock);
1196 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1197 spin_unlock_irq(&rbio->bio_list_lock);
1199 atomic_set(&rbio->error, 0);
1202 * now that we've set rmw_locked, run through the
1203 * bio list one last time and map the page pointers
1205 * We don't cache full rbios because we're assuming
1206 * the higher layers are unlikely to use this area of
1207 * the disk again soon. If they do use it again,
1208 * hopefully they will send another full bio.
1210 index_rbio_pages(rbio);
1211 if (!rbio_is_full(rbio))
1212 cache_rbio_pages(rbio);
1214 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1216 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1218 /* first collect one page from each data stripe */
1219 for (stripe = 0; stripe < nr_data; stripe++) {
1220 p = page_in_rbio(rbio, stripe, pagenr, 0);
1221 pointers[stripe] = kmap(p);
1224 /* then add the parity stripe */
1225 p = rbio_pstripe_page(rbio, pagenr);
1227 pointers[stripe++] = kmap(p);
1229 if (q_stripe != -1) {
1232 * raid6, add the qstripe and call the
1233 * library function to fill in our p/q
1235 p = rbio_qstripe_page(rbio, pagenr);
1237 pointers[stripe++] = kmap(p);
1239 raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
1243 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
1244 run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
1248 for (stripe = 0; stripe < bbio->num_stripes; stripe++)
1249 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1253 * time to start writing. Make bios for everything from the
1254 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1257 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1258 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1260 if (stripe < rbio->nr_data) {
1261 page = page_in_rbio(rbio, stripe, pagenr, 1);
1265 page = rbio_stripe_page(rbio, stripe, pagenr);
1268 ret = rbio_add_io_page(rbio, &bio_list,
1269 page, stripe, pagenr, rbio->stripe_len);
1275 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1276 BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1279 bio = bio_list_pop(&bio_list);
1283 bio->bi_private = rbio;
1284 bio->bi_end_io = raid_write_end_io;
1285 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1286 submit_bio(WRITE, bio);
1291 rbio_orig_end_io(rbio, -EIO, 0);
1295 * helper to find the stripe number for a given bio. Used to figure out which
1296 * stripe has failed. This expects the bio to correspond to a physical disk,
1297 * so it looks up based on physical sector numbers.
1299 static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1302 u64 physical = bio->bi_iter.bi_sector;
1305 struct btrfs_bio_stripe *stripe;
1309 for (i = 0; i < rbio->bbio->num_stripes; i++) {
1310 stripe = &rbio->bbio->stripes[i];
1311 stripe_start = stripe->physical;
1312 if (physical >= stripe_start &&
1313 physical < stripe_start + rbio->stripe_len) {
1321 * helper to find the stripe number for a given
1322 * bio (before mapping). Used to figure out which stripe has
1323 * failed. This looks up based on logical block numbers.
1325 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1328 u64 logical = bio->bi_iter.bi_sector;
1334 for (i = 0; i < rbio->nr_data; i++) {
1335 stripe_start = rbio->raid_map[i];
1336 if (logical >= stripe_start &&
1337 logical < stripe_start + rbio->stripe_len) {
1345 * returns -EIO if we had too many failures
1347 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1349 unsigned long flags;
1352 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1354 /* we already know this stripe is bad, move on */
1355 if (rbio->faila == failed || rbio->failb == failed)
1358 if (rbio->faila == -1) {
1359 /* first failure on this rbio */
1360 rbio->faila = failed;
1361 atomic_inc(&rbio->error);
1362 } else if (rbio->failb == -1) {
1363 /* second failure on this rbio */
1364 rbio->failb = failed;
1365 atomic_inc(&rbio->error);
1370 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1376 * helper to fail a stripe based on a physical disk
1379 static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1382 int failed = find_bio_stripe(rbio, bio);
1387 return fail_rbio_index(rbio, failed);
1391 * this sets each page in the bio uptodate. It should only be used on private
1392 * rbio pages, nothing that comes in from the higher layers
1394 static void set_bio_pages_uptodate(struct bio *bio)
1399 for (i = 0; i < bio->bi_vcnt; i++) {
1400 p = bio->bi_io_vec[i].bv_page;
1406 * end io for the read phase of the rmw cycle. All the bios here are physical
1407 * stripe bios we've read from the disk so we can recalculate the parity of the stripe.
1410 * This will usually kick off finish_rmw once all the bios are read in, but it
1411 * may trigger parity reconstruction if we had any errors along the way
1413 static void raid_rmw_end_io(struct bio *bio, int err)
1415 struct btrfs_raid_bio *rbio = bio->bi_private;
1418 fail_bio_stripe(rbio, bio);
1420 set_bio_pages_uptodate(bio);
1424 if (!atomic_dec_and_test(&rbio->stripes_pending))
1428 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1432 * this will normally call finish_rmw to start our write
1433 * but if there are any failed stripes we'll reconstruct
1436 validate_rbio_for_rmw(rbio);
1441 rbio_orig_end_io(rbio, -EIO, 0);
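/* punt the rmw for this rbio off to the rmw_workers workqueue */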
1444 static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1446 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1447 rmw_work, NULL, NULL);
1449 btrfs_queue_work(rbio->fs_info->rmw_workers,
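/* queue the read rebuild for this rbio on the rmw_workers workqueue */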
1453 static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1455 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1456 read_rebuild_work, NULL, NULL);
1458 btrfs_queue_work(rbio->fs_info->rmw_workers,
1463 * the stripe must be locked by the caller. It will
1464 * unlock after all the writes are done
1466 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1468 int bios_to_read = 0;
1469 struct bio_list bio_list;
1471 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1476 bio_list_init(&bio_list);
1478 ret = alloc_rbio_pages(rbio);
1482 index_rbio_pages(rbio);
1484 atomic_set(&rbio->error, 0);
1486 * build a list of bios to read all the missing parts of this stripe
1489 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1490 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1493 * we want to find all the pages missing from
1494 * the rbio and read them from the disk. If
1495 * page_in_rbio finds a page in the bio list
1496 * we don't need to read it off the stripe.
1498 page = page_in_rbio(rbio, stripe, pagenr, 1);
1502 page = rbio_stripe_page(rbio, stripe, pagenr);
1504 * the bio cache may have handed us an uptodate
1505 * page. If so, be happy and use it
1507 if (PageUptodate(page))
1510 ret = rbio_add_io_page(rbio, &bio_list, page,
1511 stripe, pagenr, rbio->stripe_len);
1517 bios_to_read = bio_list_size(&bio_list);
1518 if (!bios_to_read) {
1520 * this can happen if others have merged with
1521 * us, it means there is nothing left to read.
1522 * But if there are missing devices it may not be
1523 * safe to do the full stripe write yet.
1529 * the bbio may be freed once we submit the last bio. Make sure
1530 * not to touch it after that
1532 atomic_set(&rbio->stripes_pending, bios_to_read);
1534 bio = bio_list_pop(&bio_list);
1538 bio->bi_private = rbio;
1539 bio->bi_end_io = raid_rmw_end_io;
1541 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1542 BTRFS_WQ_ENDIO_RAID56);
1544 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1545 submit_bio(READ, bio);
1547 /* the actual write will happen once the reads are done */
1551 rbio_orig_end_io(rbio, -EIO, 0);
1555 validate_rbio_for_rmw(rbio);
1560 * if the upper layers pass in a full stripe, we thank them by only allocating
1561 * enough pages to hold the parity, and sending it all down quickly.
1563 static int full_stripe_write(struct btrfs_raid_bio *rbio)
1567 ret = alloc_rbio_parity_pages(rbio);
1569 __free_raid_bio(rbio);
1573 ret = lock_stripe_add(rbio);
1580 * partial stripe writes get handed over to async helpers.
1581 * We're really hoping to merge a few more writes into this
1582 * rbio before calculating new parity
1584 static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1588 ret = lock_stripe_add(rbio);
1590 async_rmw_stripe(rbio);
1595 * sometimes while we were reading from the drive to
1596 * recalculate parity, enough new bios come in to create
1597 * a full stripe. So we do a check here to see if we can
1598 * go directly to finish_rmw
1600 static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1602 /* head off into rmw land if we don't have a full stripe */
1603 if (!rbio_is_full(rbio))
1604 return partial_stripe_write(rbio);
1605 return full_stripe_write(rbio);
1609 * We use plugging call backs to collect full stripes.
1610 * Any time we get a partial stripe write while plugged
1611 * we collect it into a list. When the unplug comes down,
1612 * we sort the list by logical block number and merge
1613 * everything we can into the same rbios
1615 struct btrfs_plug_cb {
1616 struct blk_plug_cb cb;
1617 struct btrfs_fs_info *info;
1618 struct list_head rbio_list;
1619 struct btrfs_work work;
1623 * rbios on the plug list are sorted for easier merging.
1625 static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1627 struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1629 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1631 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1632 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1634 if (a_sector < b_sector)
1636 if (a_sector > b_sector)
1641 static void run_plug(struct btrfs_plug_cb *plug)
1643 struct btrfs_raid_bio *cur;
1644 struct btrfs_raid_bio *last = NULL;
1647 * sort our plug list then try to merge
1648 * everything we can in hopes of creating full stripes
1651 list_sort(NULL, &plug->rbio_list, plug_cmp);
1652 while (!list_empty(&plug->rbio_list)) {
1653 cur = list_entry(plug->rbio_list.next,
1654 struct btrfs_raid_bio, plug_list);
1655 list_del_init(&cur->plug_list);
1657 if (rbio_is_full(cur)) {
1658 /* we have a full stripe, send it down */
1659 full_stripe_write(cur);
1663 if (rbio_can_merge(last, cur)) {
1664 merge_rbio(last, cur);
1665 __free_raid_bio(cur);
1669 __raid56_parity_write(last);
1674 __raid56_parity_write(last);
1680 * if the unplug comes from schedule, we have to push the
1681 * work off to a helper thread
1683 static void unplug_work(struct btrfs_work *work)
1685 struct btrfs_plug_cb *plug;
1686 plug = container_of(work, struct btrfs_plug_cb, work);
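/*
 * blk plug callback. If we're being unplugged from schedule we can't
 * run the plug list inline, so it is punted to the rmw_workers helpers
 */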
1690 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1692 struct btrfs_plug_cb *plug;
1693 plug = container_of(cb, struct btrfs_plug_cb, cb);
1695 if (from_schedule) {
1696 btrfs_init_work(&plug->work, btrfs_rmw_helper,
1697 unplug_work, NULL, NULL);
1698 btrfs_queue_work(plug->info->rmw_workers,
1706 * our main entry point for writes from the rest of the FS.
1708 int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1709 struct btrfs_bio *bbio, u64 *raid_map,
1712 struct btrfs_raid_bio *rbio;
1713 struct btrfs_plug_cb *plug = NULL;
1714 struct blk_plug_cb *cb;
1716 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1718 __free_bbio_and_raid_map(bbio, raid_map, 1);
1719 return PTR_ERR(rbio);
1721 bio_list_add(&rbio->bio_list, bio);
1722 rbio->bio_list_bytes = bio->bi_iter.bi_size;
1725 * don't plug on full rbios, just get them out the door
1726 * as quickly as we can
1728 if (rbio_is_full(rbio))
1729 return full_stripe_write(rbio);
1731 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
1734 plug = container_of(cb, struct btrfs_plug_cb, cb);
1736 plug->info = root->fs_info;
1737 INIT_LIST_HEAD(&plug->rbio_list);
1739 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1741 return __raid56_parity_write(rbio);
1747 * all parity reconstruction happens here. We've read in everything
1748 * we can find from the drives and this does the heavy lifting of
1749 * sorting the good from the bad.
1751 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1755 int faila = -1, failb = -1;
1756 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1761 pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
1768 faila = rbio->faila;
1769 failb = rbio->failb;
1771 if (rbio->read_rebuild) {
1772 spin_lock_irq(&rbio->bio_list_lock);
1773 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1774 spin_unlock_irq(&rbio->bio_list_lock);
1777 index_rbio_pages(rbio);
1779 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1780 /* set up our array of pointers with pages from each stripe
1783 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1785 * if we're rebuilding a read, we have to use
1786 * pages from the bio list
1788 if (rbio->read_rebuild &&
1789 (stripe == faila || stripe == failb)) {
1790 page = page_in_rbio(rbio, stripe, pagenr, 0);
1792 page = rbio_stripe_page(rbio, stripe, pagenr);
1794 pointers[stripe] = kmap(page);
1797 /* all raid6 handling here */
1798 if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
1802 * single failure, rebuild from parity raid5 style
1806 if (faila == rbio->nr_data) {
1808 * Just the P stripe has failed, without
1809 * a bad data or Q stripe.
1810 * TODO, we should redo the xor here.
1816 * a single failure in raid6 is rebuilt
1817 * in the pstripe code below
1822 /* make sure our ps and qs are in order */
1823 if (faila > failb) {
1829 /* if the q stripe is failed, do a pstripe reconstruction from the xors.
1831 * If both the q stripe and the P stripe are failed, we're
1832 * here due to a crc mismatch and we can't give them the data they want
1835 if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
1836 if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
1841 * otherwise we have one bad data stripe and
1842 * a good P stripe. raid5!
1847 if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1848 raid6_datap_recov(rbio->bbio->num_stripes,
1849 PAGE_SIZE, faila, pointers);
1851 raid6_2data_recov(rbio->bbio->num_stripes,
1852 PAGE_SIZE, faila, failb,
1858 /* rebuild from P stripe here (raid5 or raid6) */
1859 BUG_ON(failb != -1);
1861 /* Copy parity block into failed block to start with */
1862 memcpy(pointers[faila],
1863 pointers[rbio->nr_data],
1866 /* rearrange the pointer array */
1867 p = pointers[faila];
1868 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1869 pointers[stripe] = pointers[stripe + 1];
1870 pointers[rbio->nr_data - 1] = p;
1872 /* xor in the rest */
1873 run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
1875 /* if we're doing this rebuild as part of an rmw, go through
1876 * and set all of our private rbio pages in the
1877 * failed stripes as uptodate. This way finish_rmw will
1878 * know they can be trusted. If this was a read reconstruction,
1879 * other endio functions will fiddle the uptodate bits
1881 if (!rbio->read_rebuild) {
1882 for (i = 0; i < nr_pages; i++) {
1884 page = rbio_stripe_page(rbio, faila, i);
1885 SetPageUptodate(page);
1888 page = rbio_stripe_page(rbio, failb, i);
1889 SetPageUptodate(page);
1893 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1895 * if we're rebuilding a read, we have to use
1896 * pages from the bio list
1898 if (rbio->read_rebuild &&
1899 (stripe == faila || stripe == failb)) {
1900 page = page_in_rbio(rbio, stripe, pagenr, 0);
1902 page = rbio_stripe_page(rbio, stripe, pagenr);
1914 if (rbio->read_rebuild) {
1916 !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
1917 cache_rbio_pages(rbio);
1919 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1921 rbio_orig_end_io(rbio, err, err == 0);
1922 } else if (err == 0) {
1927 rbio_orig_end_io(rbio, err, 0);
1932 * This is called only for stripes we've read from disk to
1933 * reconstruct the parity.
1935 static void raid_recover_end_io(struct bio *bio, int err)
1937 struct btrfs_raid_bio *rbio = bio->bi_private;
1940 * we only read stripe pages off the disk, set them
1941 * up to date if there were no errors
1944 fail_bio_stripe(rbio, bio);
1946 set_bio_pages_uptodate(bio);
1949 if (!atomic_dec_and_test(&rbio->stripes_pending))
1952 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1953 rbio_orig_end_io(rbio, -EIO, 0);
1955 __raid_recover_end_io(rbio);
1959 * reads everything we need off the disk to reconstruct
1960 * the parity. endio handlers trigger final reconstruction
1961 * when the IO is done.
1963 * This is used both for reads from the higher layers and for
1964 * parity construction required to finish an rmw cycle.
1966 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1968 int bios_to_read = 0;
1969 struct btrfs_bio *bbio = rbio->bbio;
1970 struct bio_list bio_list;
1972 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1977 bio_list_init(&bio_list);
1979 ret = alloc_rbio_pages(rbio);
1983 atomic_set(&rbio->error, 0);
1986 * read everything that hasn't failed. Thanks to the
1987 * stripe cache, it is possible that some or all of these
1988 * pages are going to be uptodate.
1990 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1991 if (rbio->faila == stripe || rbio->failb == stripe) {
1992 atomic_inc(&rbio->error);
1996 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
2000 * the rmw code may have already read this
2003 p = rbio_stripe_page(rbio, stripe, pagenr);
2004 if (PageUptodate(p))
2007 ret = rbio_add_io_page(rbio, &bio_list,
2008 rbio_stripe_page(rbio, stripe, pagenr),
2009 stripe, pagenr, rbio->stripe_len);
2015 bios_to_read = bio_list_size(&bio_list);
2016 if (!bios_to_read) {
2018 * we might have no bios to read just because the pages
2019 * were up to date, or we might have no bios to read because
2020 * the devices were gone.
2022 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
2023 __raid_recover_end_io(rbio);
2031 * the bbio may be freed once we submit the last bio. Make sure
2032 * not to touch it after that
2034 atomic_set(&rbio->stripes_pending, bios_to_read);
2036 bio = bio_list_pop(&bio_list);
2040 bio->bi_private = rbio;
2041 bio->bi_end_io = raid_recover_end_io;
2043 btrfs_bio_wq_end_io(rbio->fs_info, bio,
2044 BTRFS_WQ_ENDIO_RAID56);
2046 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2047 submit_bio(READ, bio);
2053 if (rbio->read_rebuild)
2054 rbio_orig_end_io(rbio, -EIO, 0);
2059 * the main entry point for reads from the higher layers. This
2060 * is really only called when the normal read path had a failure,
2061 * so we assume the bio they send down corresponds to a failed part of the stripe
2064 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2065 struct btrfs_bio *bbio, u64 *raid_map,
2066 u64 stripe_len, int mirror_num, int hold_bbio)
2068 struct btrfs_raid_bio *rbio;
2071 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
2073 __free_bbio_and_raid_map(bbio, raid_map, !hold_bbio);
2074 return PTR_ERR(rbio);
2078 set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
2079 rbio->read_rebuild = 1;
2080 bio_list_add(&rbio->bio_list, bio);
2081 rbio->bio_list_bytes = bio->bi_iter.bi_size;
2083 rbio->faila = find_logical_bio_stripe(rbio, bio);
2084 if (rbio->faila == -1) {
2086 __free_bbio_and_raid_map(bbio, raid_map, !hold_bbio);
2092 * reconstruct from the q stripe if they are
2093 * asking for mirror 3
2095 if (mirror_num == 3)
2096 rbio->failb = bbio->num_stripes - 2;
2098 ret = lock_stripe_add(rbio);
2101 * __raid56_parity_recover will end the bio with
2102 * any errors it hits. We don't want to return
2103 * its error value up the stack because our caller
2104 * will end up calling bio_endio with any nonzero return
2108 __raid56_parity_recover(rbio);
2110 * our rbio has been added to the list of
2111 * rbios that will be handled after the
2112 * current lock owner is done
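/* work callback queued by async_rmw_stripe; runs the rmw cycle for the rbio */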
2118 static void rmw_work(struct btrfs_work *work)
2120 struct btrfs_raid_bio *rbio;
2122 rbio = container_of(work, struct btrfs_raid_bio, work);
2123 raid56_rmw_stripe(rbio);
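/* work callback queued by async_read_rebuild; runs the recovery path for the rbio */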
2126 static void read_rebuild_work(struct btrfs_work *work)
2128 struct btrfs_raid_bio *rbio;
2130 rbio = container_of(work, struct btrfs_raid_bio, work);
2131 __raid56_parity_recover(rbio);