704ef7fcfbf83239ff7ac119d37354576dea5221
[karo-tx-linux.git] / drivers / md / raid5.c
1 /*
2  * raid5.c : Multiple Devices driver for Linux
3  *         Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4  *         Copyright (C) 1999, 2000 Ingo Molnar
5  *         Copyright (C) 2002, 2003 H. Peter Anvin
6  *
7  * RAID-4/5/6 management functions.
8  * Thanks to Penguin Computing for making the RAID-6 development possible
9  * by donating a test server!
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2, or (at your option)
14  * any later version.
15  *
16  * You should have received a copy of the GNU General Public License
17  * (for example /usr/src/linux/COPYING); if not, write to the Free
18  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19  */
20
21 /*
22  * BITMAP UNPLUGGING:
23  *
24  * The sequencing for updating the bitmap reliably is a little
25  * subtle (and I got it wrong the first time) so it deserves some
26  * explanation.
27  *
28  * We group bitmap updates into batches.  Each batch has a number.
29  * We may write out several batches at once, but that isn't very important.
30  * conf->seq_write is the number of the last batch successfully written.
31  * conf->seq_flush is the number of the last batch that was closed to
32  *    new additions.
33  * When we discover that we will need to write to any block in a stripe
34  * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
35  * the number of the batch it will be in. This is seq_flush+1.
36  * When we are ready to do a write, if that batch hasn't been written yet,
37  *   we plug the array and queue the stripe for later.
38  * When an unplug happens, we increment seq_flush, thus closing the current
39  *   batch.
40  * When we notice that seq_flush > seq_write, we write out all pending updates
41  * to the bitmap, and advance seq_write to where seq_flush was.
42  * This may occasionally write a bit out twice, but is sure never to
43  * miss any bits.
44  */
45
46 #include <linux/blkdev.h>
47 #include <linux/kthread.h>
48 #include <linux/raid/pq.h>
49 #include <linux/async_tx.h>
50 #include <linux/module.h>
51 #include <linux/async.h>
52 #include <linux/seq_file.h>
53 #include <linux/cpu.h>
54 #include <linux/slab.h>
55 #include <linux/ratelimit.h>
56 #include <linux/nodemask.h>
57 #include <linux/flex_array.h>
58 #include <trace/events/block.h>
59
60 #include "md.h"
61 #include "raid5.h"
62 #include "raid0.h"
63 #include "bitmap.h"
64
65 #define cpu_to_group(cpu) cpu_to_node(cpu)
66 #define ANY_GROUP NUMA_NO_NODE
67
68 static bool devices_handle_discard_safely = false;
69 module_param(devices_handle_discard_safely, bool, 0644);
70 MODULE_PARM_DESC(devices_handle_discard_safely,
71                  "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
72 static struct workqueue_struct *raid5_wq;
73 /*
74  * Stripe cache
75  */
76
77 #define NR_STRIPES              256
78 #define STRIPE_SIZE             PAGE_SIZE
79 #define STRIPE_SHIFT            (PAGE_SHIFT - 9)
80 #define STRIPE_SECTORS          (STRIPE_SIZE>>9)
81 #define IO_THRESHOLD            1
82 #define BYPASS_THRESHOLD        1
83 #define NR_HASH                 (PAGE_SIZE / sizeof(struct hlist_head))
84 #define HASH_MASK               (NR_HASH - 1)
85 #define MAX_STRIPE_BATCH        8
86
87 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
88 {
89         int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
90         return &conf->stripe_hashtbl[hash];
91 }
92
93 static inline int stripe_hash_locks_hash(sector_t sect)
94 {
95         return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
96 }
97
98 static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
99 {
100         spin_lock_irq(conf->hash_locks + hash);
101         spin_lock(&conf->device_lock);
102 }
103
104 static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
105 {
106         spin_unlock(&conf->device_lock);
107         spin_unlock_irq(conf->hash_locks + hash);
108 }
109
110 static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
111 {
112         int i;
113         local_irq_disable();
114         spin_lock(conf->hash_locks);
115         for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
116                 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
117         spin_lock(&conf->device_lock);
118 }
119
120 static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
121 {
122         int i;
123         spin_unlock(&conf->device_lock);
124         for (i = NR_STRIPE_HASH_LOCKS; i; i--)
125                 spin_unlock(conf->hash_locks + i - 1);
126         local_irq_enable();
127 }
128
129 /* bio's attached to a stripe+device for I/O are linked together in bi_sector
130  * order without overlap.  There may be several bio's per stripe+device, and
131  * a bio could span several devices.
132  * When walking this list for a particular stripe+device, we must never proceed
133  * beyond a bio that extends past this device, as the next bio might no longer
134  * be valid.
135  * This function is used to determine the 'next' bio in the list, given the sector
136  * of the current stripe+device
137  */
138 static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
139 {
140         int sectors = bio_sectors(bio);
141         if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
142                 return bio->bi_next;
143         else
144                 return NULL;
145 }
146
147 /*
148  * We maintain a biased count of active stripes in the bottom 16 bits of
149  * bi_phys_segments, and a count of processed stripes in the upper 16 bits
150  */
151 static inline int raid5_bi_processed_stripes(struct bio *bio)
152 {
153         atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
154         return (atomic_read(segments) >> 16) & 0xffff;
155 }
156
157 static inline int raid5_dec_bi_active_stripes(struct bio *bio)
158 {
159         atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
160         return atomic_sub_return(1, segments) & 0xffff;
161 }
162
163 static inline void raid5_inc_bi_active_stripes(struct bio *bio)
164 {
165         atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
166         atomic_inc(segments);
167 }
168
169 static inline void raid5_set_bi_processed_stripes(struct bio *bio,
170         unsigned int cnt)
171 {
172         atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
173         int old, new;
174
175         do {
176                 old = atomic_read(segments);
177                 new = (old & 0xffff) | (cnt << 16);
178         } while (atomic_cmpxchg(segments, old, new) != old);
179 }
180
181 static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
182 {
183         atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
184         atomic_set(segments, cnt);
185 }
186
187 /* Find first data disk in a raid6 stripe */
188 static inline int raid6_d0(struct stripe_head *sh)
189 {
190         if (sh->ddf_layout)
191                 /* ddf always start from first device */
192                 return 0;
193         /* md starts just after Q block */
194         if (sh->qd_idx == sh->disks - 1)
195                 return 0;
196         else
197                 return sh->qd_idx + 1;
198 }
/*
 * Return the index of the disk after @disk, wrapping to 0 once the index
 * would reach @raid_disks.
 */
static inline int raid6_next_disk(int disk, int raid_disks)
{
	int next = disk + 1;

	if (next < raid_disks)
		return next;
	return 0;
}
204
205 /* When walking through the disks in a raid5, starting at raid6_d0,
206  * We need to map each disk to a 'slot', where the data disks are slot
207  * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
208  * is raid_disks-1.  This help does that mapping.
209  */
210 static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
211                              int *count, int syndrome_disks)
212 {
213         int slot = *count;
214
215         if (sh->ddf_layout)
216                 (*count)++;
217         if (idx == sh->pd_idx)
218                 return syndrome_disks;
219         if (idx == sh->qd_idx)
220                 return syndrome_disks + 1;
221         if (!sh->ddf_layout)
222                 (*count)++;
223         return slot;
224 }
225
226 static void return_io(struct bio_list *return_bi)
227 {
228         struct bio *bi;
229         while ((bi = bio_list_pop(return_bi)) != NULL) {
230                 bi->bi_iter.bi_size = 0;
231                 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
232                                          bi, 0);
233                 bio_endio(bi);
234         }
235 }
236
237 static void print_raid5_conf (struct r5conf *conf);
238
239 static int stripe_operations_active(struct stripe_head *sh)
240 {
241         return sh->check_state || sh->reconstruct_state ||
242                test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
243                test_bit(STRIPE_COMPUTE_RUN, &sh->state);
244 }
245
246 static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
247 {
248         struct r5conf *conf = sh->raid_conf;
249         struct r5worker_group *group;
250         int thread_cnt;
251         int i, cpu = sh->cpu;
252
253         if (!cpu_online(cpu)) {
254                 cpu = cpumask_any(cpu_online_mask);
255                 sh->cpu = cpu;
256         }
257
258         if (list_empty(&sh->lru)) {
259                 struct r5worker_group *group;
260                 group = conf->worker_groups + cpu_to_group(cpu);
261                 list_add_tail(&sh->lru, &group->handle_list);
262                 group->stripes_cnt++;
263                 sh->group = group;
264         }
265
266         if (conf->worker_cnt_per_group == 0) {
267                 md_wakeup_thread(conf->mddev->thread);
268                 return;
269         }
270
271         group = conf->worker_groups + cpu_to_group(sh->cpu);
272
273         group->workers[0].working = true;
274         /* at least one worker should run to avoid race */
275         queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);
276
277         thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
278         /* wakeup more workers */
279         for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
280                 if (group->workers[i].working == false) {
281                         group->workers[i].working = true;
282                         queue_work_on(sh->cpu, raid5_wq,
283                                       &group->workers[i].work);
284                         thread_cnt--;
285                 }
286         }
287 }
288
289 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
290                               struct list_head *temp_inactive_list)
291 {
292         BUG_ON(!list_empty(&sh->lru));
293         BUG_ON(atomic_read(&conf->active_stripes)==0);
294         if (test_bit(STRIPE_HANDLE, &sh->state)) {
295                 if (test_bit(STRIPE_DELAYED, &sh->state) &&
296                     !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
297                         list_add_tail(&sh->lru, &conf->delayed_list);
298                 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
299                            sh->bm_seq - conf->seq_write > 0)
300                         list_add_tail(&sh->lru, &conf->bitmap_list);
301                 else {
302                         clear_bit(STRIPE_DELAYED, &sh->state);
303                         clear_bit(STRIPE_BIT_DELAY, &sh->state);
304                         if (conf->worker_cnt_per_group == 0) {
305                                 list_add_tail(&sh->lru, &conf->handle_list);
306                         } else {
307                                 raid5_wakeup_stripe_thread(sh);
308                                 return;
309                         }
310                 }
311                 md_wakeup_thread(conf->mddev->thread);
312         } else {
313                 BUG_ON(stripe_operations_active(sh));
314                 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
315                         if (atomic_dec_return(&conf->preread_active_stripes)
316                             < IO_THRESHOLD)
317                                 md_wakeup_thread(conf->mddev->thread);
318                 atomic_dec(&conf->active_stripes);
319                 if (!test_bit(STRIPE_EXPANDING, &sh->state))
320                         list_add_tail(&sh->lru, temp_inactive_list);
321         }
322 }
323
324 static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
325                              struct list_head *temp_inactive_list)
326 {
327         if (atomic_dec_and_test(&sh->count))
328                 do_release_stripe(conf, sh, temp_inactive_list);
329 }
330
331 /*
332  * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
333  *
334  * Be careful: Only one task can add/delete stripes from temp_inactive_list at
335  * given time. Adding stripes only takes device lock, while deleting stripes
336  * only takes hash lock.
337  */
338 static void release_inactive_stripe_list(struct r5conf *conf,
339                                          struct list_head *temp_inactive_list,
340                                          int hash)
341 {
342         int size;
343         unsigned long do_wakeup = 0;
344         int i = 0;
345         unsigned long flags;
346
347         if (hash == NR_STRIPE_HASH_LOCKS) {
348                 size = NR_STRIPE_HASH_LOCKS;
349                 hash = NR_STRIPE_HASH_LOCKS - 1;
350         } else
351                 size = 1;
352         while (size) {
353                 struct list_head *list = &temp_inactive_list[size - 1];
354
355                 /*
356                  * We don't hold any lock here yet, raid5_get_active_stripe() might
357                  * remove stripes from the list
358                  */
359                 if (!list_empty_careful(list)) {
360                         spin_lock_irqsave(conf->hash_locks + hash, flags);
361                         if (list_empty(conf->inactive_list + hash) &&
362                             !list_empty(list))
363                                 atomic_dec(&conf->empty_inactive_list_nr);
364                         list_splice_tail_init(list, conf->inactive_list + hash);
365                         do_wakeup |= 1 << hash;
366                         spin_unlock_irqrestore(conf->hash_locks + hash, flags);
367                 }
368                 size--;
369                 hash--;
370         }
371
372         for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
373                 if (do_wakeup & (1 << i))
374                         wake_up(&conf->wait_for_stripe[i]);
375         }
376
377         if (do_wakeup) {
378                 if (atomic_read(&conf->active_stripes) == 0)
379                         wake_up(&conf->wait_for_quiescent);
380                 if (conf->retry_read_aligned)
381                         md_wakeup_thread(conf->mddev->thread);
382         }
383 }
384
385 /* should hold conf->device_lock already */
386 static int release_stripe_list(struct r5conf *conf,
387                                struct list_head *temp_inactive_list)
388 {
389         struct stripe_head *sh;
390         int count = 0;
391         struct llist_node *head;
392
393         head = llist_del_all(&conf->released_stripes);
394         head = llist_reverse_order(head);
395         while (head) {
396                 int hash;
397
398                 sh = llist_entry(head, struct stripe_head, release_list);
399                 head = llist_next(head);
400                 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */
401                 smp_mb();
402                 clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
403                 /*
404                  * Don't worry the bit is set here, because if the bit is set
405                  * again, the count is always > 1. This is true for
406                  * STRIPE_ON_UNPLUG_LIST bit too.
407                  */
408                 hash = sh->hash_lock_index;
409                 __release_stripe(conf, sh, &temp_inactive_list[hash]);
410                 count++;
411         }
412
413         return count;
414 }
415
416 void raid5_release_stripe(struct stripe_head *sh)
417 {
418         struct r5conf *conf = sh->raid_conf;
419         unsigned long flags;
420         struct list_head list;
421         int hash;
422         bool wakeup;
423
424         /* Avoid release_list until the last reference.
425          */
426         if (atomic_add_unless(&sh->count, -1, 1))
427                 return;
428
429         if (unlikely(!conf->mddev->thread) ||
430                 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
431                 goto slow_path;
432         wakeup = llist_add(&sh->release_list, &conf->released_stripes);
433         if (wakeup)
434                 md_wakeup_thread(conf->mddev->thread);
435         return;
436 slow_path:
437         local_irq_save(flags);
438         /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
439         if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
440                 INIT_LIST_HEAD(&list);
441                 hash = sh->hash_lock_index;
442                 do_release_stripe(conf, sh, &list);
443                 spin_unlock(&conf->device_lock);
444                 release_inactive_stripe_list(conf, &list, hash);
445         }
446         local_irq_restore(flags);
447 }
448
449 static inline void remove_hash(struct stripe_head *sh)
450 {
451         pr_debug("remove_hash(), stripe %llu\n",
452                 (unsigned long long)sh->sector);
453
454         hlist_del_init(&sh->hash);
455 }
456
457 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
458 {
459         struct hlist_head *hp = stripe_hash(conf, sh->sector);
460
461         pr_debug("insert_hash(), stripe %llu\n",
462                 (unsigned long long)sh->sector);
463
464         hlist_add_head(&sh->hash, hp);
465 }
466
467 /* find an idle stripe, make sure it is unhashed, and return it. */
468 static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
469 {
470         struct stripe_head *sh = NULL;
471         struct list_head *first;
472
473         if (list_empty(conf->inactive_list + hash))
474                 goto out;
475         first = (conf->inactive_list + hash)->next;
476         sh = list_entry(first, struct stripe_head, lru);
477         list_del_init(first);
478         remove_hash(sh);
479         atomic_inc(&conf->active_stripes);
480         BUG_ON(hash != sh->hash_lock_index);
481         if (list_empty(conf->inactive_list + hash))
482                 atomic_inc(&conf->empty_inactive_list_nr);
483 out:
484         return sh;
485 }
486
487 static void shrink_buffers(struct stripe_head *sh)
488 {
489         struct page *p;
490         int i;
491         int num = sh->raid_conf->pool_size;
492
493         for (i = 0; i < num ; i++) {
494                 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
495                 p = sh->dev[i].page;
496                 if (!p)
497                         continue;
498                 sh->dev[i].page = NULL;
499                 put_page(p);
500         }
501 }
502
503 static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
504 {
505         int i;
506         int num = sh->raid_conf->pool_size;
507
508         for (i = 0; i < num; i++) {
509                 struct page *page;
510
511                 if (!(page = alloc_page(gfp))) {
512                         return 1;
513                 }
514                 sh->dev[i].page = page;
515                 sh->dev[i].orig_page = page;
516         }
517         return 0;
518 }
519
520 static void raid5_build_block(struct stripe_head *sh, int i, int previous);
521 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
522                             struct stripe_head *sh);
523
524 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
525 {
526         struct r5conf *conf = sh->raid_conf;
527         int i, seq;
528
529         BUG_ON(atomic_read(&sh->count) != 0);
530         BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
531         BUG_ON(stripe_operations_active(sh));
532         BUG_ON(sh->batch_head);
533
534         pr_debug("init_stripe called, stripe %llu\n",
535                 (unsigned long long)sector);
536 retry:
537         seq = read_seqcount_begin(&conf->gen_lock);
538         sh->generation = conf->generation - previous;
539         sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
540         sh->sector = sector;
541         stripe_set_idx(sector, conf, previous, sh);
542         sh->state = 0;
543
544         for (i = sh->disks; i--; ) {
545                 struct r5dev *dev = &sh->dev[i];
546
547                 if (dev->toread || dev->read || dev->towrite || dev->written ||
548                     test_bit(R5_LOCKED, &dev->flags)) {
549                         printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
550                                (unsigned long long)sh->sector, i, dev->toread,
551                                dev->read, dev->towrite, dev->written,
552                                test_bit(R5_LOCKED, &dev->flags));
553                         WARN_ON(1);
554                 }
555                 dev->flags = 0;
556                 raid5_build_block(sh, i, previous);
557         }
558         if (read_seqcount_retry(&conf->gen_lock, seq))
559                 goto retry;
560         sh->overwrite_disks = 0;
561         insert_hash(conf, sh);
562         sh->cpu = smp_processor_id();
563         set_bit(STRIPE_BATCH_READY, &sh->state);
564 }
565
566 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
567                                          short generation)
568 {
569         struct stripe_head *sh;
570
571         pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
572         hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
573                 if (sh->sector == sector && sh->generation == generation)
574                         return sh;
575         pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
576         return NULL;
577 }
578
579 /*
580  * Need to check if array has failed when deciding whether to:
581  *  - start an array
582  *  - remove non-faulty devices
583  *  - add a spare
584  *  - allow a reshape
585  * This determination is simple when no reshape is happening.
586  * However if there is a reshape, we need to carefully check
587  * both the before and after sections.
588  * This is because some failed devices may only affect one
589  * of the two sections, and some non-in_sync devices may
590  * be insync in the section most affected by failed devices.
591  */
592 static int calc_degraded(struct r5conf *conf)
593 {
594         int degraded, degraded2;
595         int i;
596
597         rcu_read_lock();
598         degraded = 0;
599         for (i = 0; i < conf->previous_raid_disks; i++) {
600                 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
601                 if (rdev && test_bit(Faulty, &rdev->flags))
602                         rdev = rcu_dereference(conf->disks[i].replacement);
603                 if (!rdev || test_bit(Faulty, &rdev->flags))
604                         degraded++;
605                 else if (test_bit(In_sync, &rdev->flags))
606                         ;
607                 else
608                         /* not in-sync or faulty.
609                          * If the reshape increases the number of devices,
610                          * this is being recovered by the reshape, so
611                          * this 'previous' section is not in_sync.
612                          * If the number of devices is being reduced however,
613                          * the device can only be part of the array if
614                          * we are reverting a reshape, so this section will
615                          * be in-sync.
616                          */
617                         if (conf->raid_disks >= conf->previous_raid_disks)
618                                 degraded++;
619         }
620         rcu_read_unlock();
621         if (conf->raid_disks == conf->previous_raid_disks)
622                 return degraded;
623         rcu_read_lock();
624         degraded2 = 0;
625         for (i = 0; i < conf->raid_disks; i++) {
626                 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
627                 if (rdev && test_bit(Faulty, &rdev->flags))
628                         rdev = rcu_dereference(conf->disks[i].replacement);
629                 if (!rdev || test_bit(Faulty, &rdev->flags))
630                         degraded2++;
631                 else if (test_bit(In_sync, &rdev->flags))
632                         ;
633                 else
634                         /* not in-sync or faulty.
635                          * If reshape increases the number of devices, this
636                          * section has already been recovered, else it
637                          * almost certainly hasn't.
638                          */
639                         if (conf->raid_disks <= conf->previous_raid_disks)
640                                 degraded2++;
641         }
642         rcu_read_unlock();
643         if (degraded2 > degraded)
644                 return degraded2;
645         return degraded;
646 }
647
648 static int has_failed(struct r5conf *conf)
649 {
650         int degraded;
651
652         if (conf->mddev->reshape_position == MaxSector)
653                 return conf->mddev->degraded > conf->max_degraded;
654
655         degraded = calc_degraded(conf);
656         if (degraded > conf->max_degraded)
657                 return 1;
658         return 0;
659 }
660
661 struct stripe_head *
662 raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
663                         int previous, int noblock, int noquiesce)
664 {
665         struct stripe_head *sh;
666         int hash = stripe_hash_locks_hash(sector);
667
668         pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
669
670         spin_lock_irq(conf->hash_locks + hash);
671
672         do {
673                 wait_event_lock_irq(conf->wait_for_quiescent,
674                                     conf->quiesce == 0 || noquiesce,
675                                     *(conf->hash_locks + hash));
676                 sh = __find_stripe(conf, sector, conf->generation - previous);
677                 if (!sh) {
678                         if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
679                                 sh = get_free_stripe(conf, hash);
680                                 if (!sh && !test_bit(R5_DID_ALLOC,
681                                                      &conf->cache_state))
682                                         set_bit(R5_ALLOC_MORE,
683                                                 &conf->cache_state);
684                         }
685                         if (noblock && sh == NULL)
686                                 break;
687                         if (!sh) {
688                                 set_bit(R5_INACTIVE_BLOCKED,
689                                         &conf->cache_state);
690                                 wait_event_exclusive_cmd(
691                                         conf->wait_for_stripe[hash],
692                                         !list_empty(conf->inactive_list + hash) &&
693                                         (atomic_read(&conf->active_stripes)
694                                          < (conf->max_nr_stripes * 3 / 4)
695                                          || !test_bit(R5_INACTIVE_BLOCKED,
696                                                       &conf->cache_state)),
697                                         spin_unlock_irq(conf->hash_locks + hash),
698                                         spin_lock_irq(conf->hash_locks + hash));
699                                 clear_bit(R5_INACTIVE_BLOCKED,
700                                           &conf->cache_state);
701                         } else {
702                                 init_stripe(sh, sector, previous);
703                                 atomic_inc(&sh->count);
704                         }
705                 } else if (!atomic_inc_not_zero(&sh->count)) {
706                         spin_lock(&conf->device_lock);
707                         if (!atomic_read(&sh->count)) {
708                                 if (!test_bit(STRIPE_HANDLE, &sh->state))
709                                         atomic_inc(&conf->active_stripes);
710                                 BUG_ON(list_empty(&sh->lru) &&
711                                        !test_bit(STRIPE_EXPANDING, &sh->state));
712                                 list_del_init(&sh->lru);
713                                 if (sh->group) {
714                                         sh->group->stripes_cnt--;
715                                         sh->group = NULL;
716                                 }
717                         }
718                         atomic_inc(&sh->count);
719                         spin_unlock(&conf->device_lock);
720                 }
721         } while (sh == NULL);
722
723         if (!list_empty(conf->inactive_list + hash))
724                 wake_up(&conf->wait_for_stripe[hash]);
725
726         spin_unlock_irq(conf->hash_locks + hash);
727         return sh;
728 }
729
730 static bool is_full_stripe_write(struct stripe_head *sh)
731 {
732         BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
733         return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
734 }
735
736 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
737 {
738         local_irq_disable();
739         if (sh1 > sh2) {
740                 spin_lock(&sh2->stripe_lock);
741                 spin_lock_nested(&sh1->stripe_lock, 1);
742         } else {
743                 spin_lock(&sh1->stripe_lock);
744                 spin_lock_nested(&sh2->stripe_lock, 1);
745         }
746 }
747
748 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
749 {
750         spin_unlock(&sh1->stripe_lock);
751         spin_unlock(&sh2->stripe_lock);
752         local_irq_enable();
753 }
754
755 /* Only freshly new full stripe normal write stripe can be added to a batch list */
756 static bool stripe_can_batch(struct stripe_head *sh)
757 {
758         struct r5conf *conf = sh->raid_conf;
759
760         if (conf->log)
761                 return false;
762         return test_bit(STRIPE_BATCH_READY, &sh->state) &&
763                 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
764                 is_full_stripe_write(sh);
765 }
766
767 /* we only do back search */
768 static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
769 {
770         struct stripe_head *head;
771         sector_t head_sector, tmp_sec;
772         int hash;
773         int dd_idx;
774
775         if (!stripe_can_batch(sh))
776                 return;
777         /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
778         tmp_sec = sh->sector;
779         if (!sector_div(tmp_sec, conf->chunk_sectors))
780                 return;
781         head_sector = sh->sector - STRIPE_SECTORS;
782
783         hash = stripe_hash_locks_hash(head_sector);
784         spin_lock_irq(conf->hash_locks + hash);
785         head = __find_stripe(conf, head_sector, conf->generation);
786         if (head && !atomic_inc_not_zero(&head->count)) {
787                 spin_lock(&conf->device_lock);
788                 if (!atomic_read(&head->count)) {
789                         if (!test_bit(STRIPE_HANDLE, &head->state))
790                                 atomic_inc(&conf->active_stripes);
791                         BUG_ON(list_empty(&head->lru) &&
792                                !test_bit(STRIPE_EXPANDING, &head->state));
793                         list_del_init(&head->lru);
794                         if (head->group) {
795                                 head->group->stripes_cnt--;
796                                 head->group = NULL;
797                         }
798                 }
799                 atomic_inc(&head->count);
800                 spin_unlock(&conf->device_lock);
801         }
802         spin_unlock_irq(conf->hash_locks + hash);
803
804         if (!head)
805                 return;
806         if (!stripe_can_batch(head))
807                 goto out;
808
809         lock_two_stripes(head, sh);
810         /* clear_batch_ready clear the flag */
811         if (!stripe_can_batch(head) || !stripe_can_batch(sh))
812                 goto unlock_out;
813
814         if (sh->batch_head)
815                 goto unlock_out;
816
817         dd_idx = 0;
818         while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
819                 dd_idx++;
820         if (head->dev[dd_idx].towrite->bi_rw != sh->dev[dd_idx].towrite->bi_rw)
821                 goto unlock_out;
822
823         if (head->batch_head) {
824                 spin_lock(&head->batch_head->batch_lock);
825                 /* This batch list is already running */
826                 if (!stripe_can_batch(head)) {
827                         spin_unlock(&head->batch_head->batch_lock);
828                         goto unlock_out;
829                 }
830
831                 /*
832                  * at this point, head's BATCH_READY could be cleared, but we
833                  * can still add the stripe to batch list
834                  */
835                 list_add(&sh->batch_list, &head->batch_list);
836                 spin_unlock(&head->batch_head->batch_lock);
837
838                 sh->batch_head = head->batch_head;
839         } else {
840                 head->batch_head = head;
841                 sh->batch_head = head->batch_head;
842                 spin_lock(&head->batch_lock);
843                 list_add_tail(&sh->batch_list, &head->batch_list);
844                 spin_unlock(&head->batch_lock);
845         }
846
847         if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
848                 if (atomic_dec_return(&conf->preread_active_stripes)
849                     < IO_THRESHOLD)
850                         md_wakeup_thread(conf->mddev->thread);
851
852         if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
853                 int seq = sh->bm_seq;
854                 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
855                     sh->batch_head->bm_seq > seq)
856                         seq = sh->batch_head->bm_seq;
857                 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
858                 sh->batch_head->bm_seq = seq;
859         }
860
861         atomic_inc(&sh->count);
862 unlock_out:
863         unlock_two_stripes(head, sh);
864 out:
865         raid5_release_stripe(head);
866 }
867
868 /* Determine if 'data_offset' or 'new_data_offset' should be used
869  * in this stripe_head.
870  */
871 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
872 {
873         sector_t progress = conf->reshape_progress;
874         /* Need a memory barrier to make sure we see the value
875          * of conf->generation, or ->data_offset that was set before
876          * reshape_progress was updated.
877          */
878         smp_rmb();
879         if (progress == MaxSector)
880                 return 0;
881         if (sh->generation == conf->generation - 1)
882                 return 0;
883         /* We are in a reshape, and this is a new-generation stripe,
884          * so use new_data_offset.
885          */
886         return 1;
887 }
888
/* Forward declarations: bio completion callbacks installed by ops_run_io(). */
static void raid5_end_read_request(struct bio *bi);
static void raid5_end_write_request(struct bio *bi);
893
894 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
895 {
896         struct r5conf *conf = sh->raid_conf;
897         int i, disks = sh->disks;
898         struct stripe_head *head_sh = sh;
899
900         might_sleep();
901
902         if (r5l_write_stripe(conf->log, sh) == 0)
903                 return;
904         for (i = disks; i--; ) {
905                 int rw;
906                 int replace_only = 0;
907                 struct bio *bi, *rbi;
908                 struct md_rdev *rdev, *rrdev = NULL;
909
910                 sh = head_sh;
911                 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
912                         if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
913                                 rw = WRITE_FUA;
914                         else
915                                 rw = WRITE;
916                         if (test_bit(R5_Discard, &sh->dev[i].flags))
917                                 rw |= REQ_DISCARD;
918                 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
919                         rw = READ;
920                 else if (test_and_clear_bit(R5_WantReplace,
921                                             &sh->dev[i].flags)) {
922                         rw = WRITE;
923                         replace_only = 1;
924                 } else
925                         continue;
926                 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
927                         rw |= REQ_SYNC;
928
929 again:
930                 bi = &sh->dev[i].req;
931                 rbi = &sh->dev[i].rreq; /* For writing to replacement */
932
933                 rcu_read_lock();
934                 rrdev = rcu_dereference(conf->disks[i].replacement);
935                 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
936                 rdev = rcu_dereference(conf->disks[i].rdev);
937                 if (!rdev) {
938                         rdev = rrdev;
939                         rrdev = NULL;
940                 }
941                 if (rw & WRITE) {
942                         if (replace_only)
943                                 rdev = NULL;
944                         if (rdev == rrdev)
945                                 /* We raced and saw duplicates */
946                                 rrdev = NULL;
947                 } else {
948                         if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
949                                 rdev = rrdev;
950                         rrdev = NULL;
951                 }
952
953                 if (rdev && test_bit(Faulty, &rdev->flags))
954                         rdev = NULL;
955                 if (rdev)
956                         atomic_inc(&rdev->nr_pending);
957                 if (rrdev && test_bit(Faulty, &rrdev->flags))
958                         rrdev = NULL;
959                 if (rrdev)
960                         atomic_inc(&rrdev->nr_pending);
961                 rcu_read_unlock();
962
963                 /* We have already checked bad blocks for reads.  Now
964                  * need to check for writes.  We never accept write errors
965                  * on the replacement, so we don't to check rrdev.
966                  */
967                 while ((rw & WRITE) && rdev &&
968                        test_bit(WriteErrorSeen, &rdev->flags)) {
969                         sector_t first_bad;
970                         int bad_sectors;
971                         int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
972                                               &first_bad, &bad_sectors);
973                         if (!bad)
974                                 break;
975
976                         if (bad < 0) {
977                                 set_bit(BlockedBadBlocks, &rdev->flags);
978                                 if (!conf->mddev->external &&
979                                     conf->mddev->flags) {
980                                         /* It is very unlikely, but we might
981                                          * still need to write out the
982                                          * bad block log - better give it
983                                          * a chance*/
984                                         md_check_recovery(conf->mddev);
985                                 }
986                                 /*
987                                  * Because md_wait_for_blocked_rdev
988                                  * will dec nr_pending, we must
989                                  * increment it first.
990                                  */
991                                 atomic_inc(&rdev->nr_pending);
992                                 md_wait_for_blocked_rdev(rdev, conf->mddev);
993                         } else {
994                                 /* Acknowledged bad block - skip the write */
995                                 rdev_dec_pending(rdev, conf->mddev);
996                                 rdev = NULL;
997                         }
998                 }
999
1000                 if (rdev) {
1001                         if (s->syncing || s->expanding || s->expanded
1002                             || s->replacing)
1003                                 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1004
1005                         set_bit(STRIPE_IO_STARTED, &sh->state);
1006
1007                         bio_reset(bi);
1008                         bi->bi_bdev = rdev->bdev;
1009                         bi->bi_rw = rw;
1010                         bi->bi_end_io = (rw & WRITE)
1011                                 ? raid5_end_write_request
1012                                 : raid5_end_read_request;
1013                         bi->bi_private = sh;
1014
1015                         pr_debug("%s: for %llu schedule op %ld on disc %d\n",
1016                                 __func__, (unsigned long long)sh->sector,
1017                                 bi->bi_rw, i);
1018                         atomic_inc(&sh->count);
1019                         if (sh != head_sh)
1020                                 atomic_inc(&head_sh->count);
1021                         if (use_new_offset(conf, sh))
1022                                 bi->bi_iter.bi_sector = (sh->sector
1023                                                  + rdev->new_data_offset);
1024                         else
1025                                 bi->bi_iter.bi_sector = (sh->sector
1026                                                  + rdev->data_offset);
1027                         if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
1028                                 bi->bi_rw |= REQ_NOMERGE;
1029
1030                         if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1031                                 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1032                         sh->dev[i].vec.bv_page = sh->dev[i].page;
1033                         bi->bi_vcnt = 1;
1034                         bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1035                         bi->bi_io_vec[0].bv_offset = 0;
1036                         bi->bi_iter.bi_size = STRIPE_SIZE;
1037                         /*
1038                          * If this is discard request, set bi_vcnt 0. We don't
1039                          * want to confuse SCSI because SCSI will replace payload
1040                          */
1041                         if (rw & REQ_DISCARD)
1042                                 bi->bi_vcnt = 0;
1043                         if (rrdev)
1044                                 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
1045
1046                         if (conf->mddev->gendisk)
1047                                 trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
1048                                                       bi, disk_devt(conf->mddev->gendisk),
1049                                                       sh->dev[i].sector);
1050                         generic_make_request(bi);
1051                 }
1052                 if (rrdev) {
1053                         if (s->syncing || s->expanding || s->expanded
1054                             || s->replacing)
1055                                 md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
1056
1057                         set_bit(STRIPE_IO_STARTED, &sh->state);
1058
1059                         bio_reset(rbi);
1060                         rbi->bi_bdev = rrdev->bdev;
1061                         rbi->bi_rw = rw;
1062                         BUG_ON(!(rw & WRITE));
1063                         rbi->bi_end_io = raid5_end_write_request;
1064                         rbi->bi_private = sh;
1065
1066                         pr_debug("%s: for %llu schedule op %ld on "
1067                                  "replacement disc %d\n",
1068                                 __func__, (unsigned long long)sh->sector,
1069                                 rbi->bi_rw, i);
1070                         atomic_inc(&sh->count);
1071                         if (sh != head_sh)
1072                                 atomic_inc(&head_sh->count);
1073                         if (use_new_offset(conf, sh))
1074                                 rbi->bi_iter.bi_sector = (sh->sector
1075                                                   + rrdev->new_data_offset);
1076                         else
1077                                 rbi->bi_iter.bi_sector = (sh->sector
1078                                                   + rrdev->data_offset);
1079                         if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1080                                 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1081                         sh->dev[i].rvec.bv_page = sh->dev[i].page;
1082                         rbi->bi_vcnt = 1;
1083                         rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1084                         rbi->bi_io_vec[0].bv_offset = 0;
1085                         rbi->bi_iter.bi_size = STRIPE_SIZE;
1086                         /*
1087                          * If this is discard request, set bi_vcnt 0. We don't
1088                          * want to confuse SCSI because SCSI will replace payload
1089                          */
1090                         if (rw & REQ_DISCARD)
1091                                 rbi->bi_vcnt = 0;
1092                         if (conf->mddev->gendisk)
1093                                 trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
1094                                                       rbi, disk_devt(conf->mddev->gendisk),
1095                                                       sh->dev[i].sector);
1096                         generic_make_request(rbi);
1097                 }
1098                 if (!rdev && !rrdev) {
1099                         if (rw & WRITE)
1100                                 set_bit(STRIPE_DEGRADED, &sh->state);
1101                         pr_debug("skip op %ld on disc %d for sector %llu\n",
1102                                 bi->bi_rw, i, (unsigned long long)sh->sector);
1103                         clear_bit(R5_LOCKED, &sh->dev[i].flags);
1104                         set_bit(STRIPE_HANDLE, &sh->state);
1105                 }
1106
1107                 if (!head_sh->batch_head)
1108                         continue;
1109                 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1110                                       batch_list);
1111                 if (sh != head_sh)
1112                         goto again;
1113         }
1114 }
1115
1116 static struct dma_async_tx_descriptor *
1117 async_copy_data(int frombio, struct bio *bio, struct page **page,
1118         sector_t sector, struct dma_async_tx_descriptor *tx,
1119         struct stripe_head *sh)
1120 {
1121         struct bio_vec bvl;
1122         struct bvec_iter iter;
1123         struct page *bio_page;
1124         int page_offset;
1125         struct async_submit_ctl submit;
1126         enum async_tx_flags flags = 0;
1127
1128         if (bio->bi_iter.bi_sector >= sector)
1129                 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
1130         else
1131                 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
1132
1133         if (frombio)
1134                 flags |= ASYNC_TX_FENCE;
1135         init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
1136
1137         bio_for_each_segment(bvl, bio, iter) {
1138                 int len = bvl.bv_len;
1139                 int clen;
1140                 int b_offset = 0;
1141
1142                 if (page_offset < 0) {
1143                         b_offset = -page_offset;
1144                         page_offset += b_offset;
1145                         len -= b_offset;
1146                 }
1147
1148                 if (len > 0 && page_offset + len > STRIPE_SIZE)
1149                         clen = STRIPE_SIZE - page_offset;
1150                 else
1151                         clen = len;
1152
1153                 if (clen > 0) {
1154                         b_offset += bvl.bv_offset;
1155                         bio_page = bvl.bv_page;
1156                         if (frombio) {
1157                                 if (sh->raid_conf->skip_copy &&
1158                                     b_offset == 0 && page_offset == 0 &&
1159                                     clen == STRIPE_SIZE)
1160                                         *page = bio_page;
1161                                 else
1162                                         tx = async_memcpy(*page, bio_page, page_offset,
1163                                                   b_offset, clen, &submit);
1164                         } else
1165                                 tx = async_memcpy(bio_page, *page, b_offset,
1166                                                   page_offset, clen, &submit);
1167                 }
1168                 /* chain the operations */
1169                 submit.depend_tx = tx;
1170
1171                 if (clen < len) /* hit end of page */
1172                         break;
1173                 page_offset +=  len;
1174         }
1175
1176         return tx;
1177 }
1178
1179 static void ops_complete_biofill(void *stripe_head_ref)
1180 {
1181         struct stripe_head *sh = stripe_head_ref;
1182         struct bio_list return_bi = BIO_EMPTY_LIST;
1183         int i;
1184
1185         pr_debug("%s: stripe %llu\n", __func__,
1186                 (unsigned long long)sh->sector);
1187
1188         /* clear completed biofills */
1189         for (i = sh->disks; i--; ) {
1190                 struct r5dev *dev = &sh->dev[i];
1191
1192                 /* acknowledge completion of a biofill operation */
1193                 /* and check if we need to reply to a read request,
1194                  * new R5_Wantfill requests are held off until
1195                  * !STRIPE_BIOFILL_RUN
1196                  */
1197                 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
1198                         struct bio *rbi, *rbi2;
1199
1200                         BUG_ON(!dev->read);
1201                         rbi = dev->read;
1202                         dev->read = NULL;
1203                         while (rbi && rbi->bi_iter.bi_sector <
1204                                 dev->sector + STRIPE_SECTORS) {
1205                                 rbi2 = r5_next_bio(rbi, dev->sector);
1206                                 if (!raid5_dec_bi_active_stripes(rbi))
1207                                         bio_list_add(&return_bi, rbi);
1208                                 rbi = rbi2;
1209                         }
1210                 }
1211         }
1212         clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
1213
1214         return_io(&return_bi);
1215
1216         set_bit(STRIPE_HANDLE, &sh->state);
1217         raid5_release_stripe(sh);
1218 }
1219
1220 static void ops_run_biofill(struct stripe_head *sh)
1221 {
1222         struct dma_async_tx_descriptor *tx = NULL;
1223         struct async_submit_ctl submit;
1224         int i;
1225
1226         BUG_ON(sh->batch_head);
1227         pr_debug("%s: stripe %llu\n", __func__,
1228                 (unsigned long long)sh->sector);
1229
1230         for (i = sh->disks; i--; ) {
1231                 struct r5dev *dev = &sh->dev[i];
1232                 if (test_bit(R5_Wantfill, &dev->flags)) {
1233                         struct bio *rbi;
1234                         spin_lock_irq(&sh->stripe_lock);
1235                         dev->read = rbi = dev->toread;
1236                         dev->toread = NULL;
1237                         spin_unlock_irq(&sh->stripe_lock);
1238                         while (rbi && rbi->bi_iter.bi_sector <
1239                                 dev->sector + STRIPE_SECTORS) {
1240                                 tx = async_copy_data(0, rbi, &dev->page,
1241                                         dev->sector, tx, sh);
1242                                 rbi = r5_next_bio(rbi, dev->sector);
1243                         }
1244                 }
1245         }
1246
1247         atomic_inc(&sh->count);
1248         init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
1249         async_trigger_callback(&submit);
1250 }
1251
1252 static void mark_target_uptodate(struct stripe_head *sh, int target)
1253 {
1254         struct r5dev *tgt;
1255
1256         if (target < 0)
1257                 return;
1258
1259         tgt = &sh->dev[target];
1260         set_bit(R5_UPTODATE, &tgt->flags);
1261         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1262         clear_bit(R5_Wantcompute, &tgt->flags);
1263 }
1264
1265 static void ops_complete_compute(void *stripe_head_ref)
1266 {
1267         struct stripe_head *sh = stripe_head_ref;
1268
1269         pr_debug("%s: stripe %llu\n", __func__,
1270                 (unsigned long long)sh->sector);
1271
1272         /* mark the computed target(s) as uptodate */
1273         mark_target_uptodate(sh, sh->ops.target);
1274         mark_target_uptodate(sh, sh->ops.target2);
1275
1276         clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
1277         if (sh->check_state == check_state_compute_run)
1278                 sh->check_state = check_state_compute_result;
1279         set_bit(STRIPE_HANDLE, &sh->state);
1280         raid5_release_stripe(sh);
1281 }
1282
1283 /* return a pointer to the address conversion region of the scribble buffer */
1284 static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1285                                  struct raid5_percpu *percpu, int i)
1286 {
1287         void *addr;
1288
1289         addr = flex_array_get(percpu->scribble, i);
1290         return addr + sizeof(struct page *) * (sh->disks + 2);
1291 }
1292
1293 /* return a pointer to the address conversion region of the scribble buffer */
1294 static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
1295 {
1296         void *addr;
1297
1298         addr = flex_array_get(percpu->scribble, i);
1299         return addr;
1300 }
1301
1302 static struct dma_async_tx_descriptor *
1303 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
1304 {
1305         int disks = sh->disks;
1306         struct page **xor_srcs = to_addr_page(percpu, 0);
1307         int target = sh->ops.target;
1308         struct r5dev *tgt = &sh->dev[target];
1309         struct page *xor_dest = tgt->page;
1310         int count = 0;
1311         struct dma_async_tx_descriptor *tx;
1312         struct async_submit_ctl submit;
1313         int i;
1314
1315         BUG_ON(sh->batch_head);
1316
1317         pr_debug("%s: stripe %llu block: %d\n",
1318                 __func__, (unsigned long long)sh->sector, target);
1319         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1320
1321         for (i = disks; i--; )
1322                 if (i != target)
1323                         xor_srcs[count++] = sh->dev[i].page;
1324
1325         atomic_inc(&sh->count);
1326
1327         init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
1328                           ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
1329         if (unlikely(count == 1))
1330                 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1331         else
1332                 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1333
1334         return tx;
1335 }
1336
1337 /* set_syndrome_sources - populate source buffers for gen_syndrome
1338  * @srcs - (struct page *) array of size sh->disks
1339  * @sh - stripe_head to parse
1340  *
1341  * Populates srcs in proper layout order for the stripe and returns the
1342  * 'count' of sources to be used in a call to async_gen_syndrome.  The P
1343  * destination buffer is recorded in srcs[count] and the Q destination
1344  * is recorded in srcs[count+1]].
1345  */
1346 static int set_syndrome_sources(struct page **srcs,
1347                                 struct stripe_head *sh,
1348                                 int srctype)
1349 {
1350         int disks = sh->disks;
1351         int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1352         int d0_idx = raid6_d0(sh);
1353         int count;
1354         int i;
1355
1356         for (i = 0; i < disks; i++)
1357                 srcs[i] = NULL;
1358
1359         count = 0;
1360         i = d0_idx;
1361         do {
1362                 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1363                 struct r5dev *dev = &sh->dev[i];
1364
1365                 if (i == sh->qd_idx || i == sh->pd_idx ||
1366                     (srctype == SYNDROME_SRC_ALL) ||
1367                     (srctype == SYNDROME_SRC_WANT_DRAIN &&
1368                      test_bit(R5_Wantdrain, &dev->flags)) ||
1369                     (srctype == SYNDROME_SRC_WRITTEN &&
1370                      dev->written))
1371                         srcs[slot] = sh->dev[i].page;
1372                 i = raid6_next_disk(i, disks);
1373         } while (i != d0_idx);
1374
1375         return syndrome_disks;
1376 }
1377
1378 static struct dma_async_tx_descriptor *
1379 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
1380 {
1381         int disks = sh->disks;
1382         struct page **blocks = to_addr_page(percpu, 0);
1383         int target;
1384         int qd_idx = sh->qd_idx;
1385         struct dma_async_tx_descriptor *tx;
1386         struct async_submit_ctl submit;
1387         struct r5dev *tgt;
1388         struct page *dest;
1389         int i;
1390         int count;
1391
1392         BUG_ON(sh->batch_head);
1393         if (sh->ops.target < 0)
1394                 target = sh->ops.target2;
1395         else if (sh->ops.target2 < 0)
1396                 target = sh->ops.target;
1397         else
1398                 /* we should only have one valid target */
1399                 BUG();
1400         BUG_ON(target < 0);
1401         pr_debug("%s: stripe %llu block: %d\n",
1402                 __func__, (unsigned long long)sh->sector, target);
1403
1404         tgt = &sh->dev[target];
1405         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1406         dest = tgt->page;
1407
1408         atomic_inc(&sh->count);
1409
1410         if (target == qd_idx) {
1411                 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1412                 blocks[count] = NULL; /* regenerating p is not necessary */
1413                 BUG_ON(blocks[count+1] != dest); /* q should already be set */
1414                 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1415                                   ops_complete_compute, sh,
1416                                   to_addr_conv(sh, percpu, 0));
1417                 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1418         } else {
1419                 /* Compute any data- or p-drive using XOR */
1420                 count = 0;
1421                 for (i = disks; i-- ; ) {
1422                         if (i == target || i == qd_idx)
1423                                 continue;
1424                         blocks[count++] = sh->dev[i].page;
1425                 }
1426
1427                 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1428                                   NULL, ops_complete_compute, sh,
1429                                   to_addr_conv(sh, percpu, 0));
1430                 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
1431         }
1432
1433         return tx;
1434 }
1435
1436 static struct dma_async_tx_descriptor *
1437 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
1438 {
1439         int i, count, disks = sh->disks;
1440         int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1441         int d0_idx = raid6_d0(sh);
1442         int faila = -1, failb = -1;
1443         int target = sh->ops.target;
1444         int target2 = sh->ops.target2;
1445         struct r5dev *tgt = &sh->dev[target];
1446         struct r5dev *tgt2 = &sh->dev[target2];
1447         struct dma_async_tx_descriptor *tx;
1448         struct page **blocks = to_addr_page(percpu, 0);
1449         struct async_submit_ctl submit;
1450
1451         BUG_ON(sh->batch_head);
1452         pr_debug("%s: stripe %llu block1: %d block2: %d\n",
1453                  __func__, (unsigned long long)sh->sector, target, target2);
1454         BUG_ON(target < 0 || target2 < 0);
1455         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1456         BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
1457
1458         /* we need to open-code set_syndrome_sources to handle the
1459          * slot number conversion for 'faila' and 'failb'
1460          */
1461         for (i = 0; i < disks ; i++)
1462                 blocks[i] = NULL;
1463         count = 0;
1464         i = d0_idx;
1465         do {
1466                 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1467
1468                 blocks[slot] = sh->dev[i].page;
1469
1470                 if (i == target)
1471                         faila = slot;
1472                 if (i == target2)
1473                         failb = slot;
1474                 i = raid6_next_disk(i, disks);
1475         } while (i != d0_idx);
1476
1477         BUG_ON(faila == failb);
1478         if (failb < faila)
1479                 swap(faila, failb);
1480         pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
1481                  __func__, (unsigned long long)sh->sector, faila, failb);
1482
1483         atomic_inc(&sh->count);
1484
1485         if (failb == syndrome_disks+1) {
1486                 /* Q disk is one of the missing disks */
1487                 if (faila == syndrome_disks) {
1488                         /* Missing P+Q, just recompute */
1489                         init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1490                                           ops_complete_compute, sh,
1491                                           to_addr_conv(sh, percpu, 0));
1492                         return async_gen_syndrome(blocks, 0, syndrome_disks+2,
1493                                                   STRIPE_SIZE, &submit);
1494                 } else {
1495                         struct page *dest;
1496                         int data_target;
1497                         int qd_idx = sh->qd_idx;
1498
1499                         /* Missing D+Q: recompute D from P, then recompute Q */
1500                         if (target == qd_idx)
1501                                 data_target = target2;
1502                         else
1503                                 data_target = target;
1504
1505                         count = 0;
1506                         for (i = disks; i-- ; ) {
1507                                 if (i == data_target || i == qd_idx)
1508                                         continue;
1509                                 blocks[count++] = sh->dev[i].page;
1510                         }
1511                         dest = sh->dev[data_target].page;
1512                         init_async_submit(&submit,
1513                                           ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1514                                           NULL, NULL, NULL,
1515                                           to_addr_conv(sh, percpu, 0));
1516                         tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
1517                                        &submit);
1518
1519                         count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1520                         init_async_submit(&submit, ASYNC_TX_FENCE, tx,
1521                                           ops_complete_compute, sh,
1522                                           to_addr_conv(sh, percpu, 0));
1523                         return async_gen_syndrome(blocks, 0, count+2,
1524                                                   STRIPE_SIZE, &submit);
1525                 }
1526         } else {
1527                 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1528                                   ops_complete_compute, sh,
1529                                   to_addr_conv(sh, percpu, 0));
1530                 if (failb == syndrome_disks) {
1531                         /* We're missing D+P. */
1532                         return async_raid6_datap_recov(syndrome_disks+2,
1533                                                        STRIPE_SIZE, faila,
1534                                                        blocks, &submit);
1535                 } else {
1536                         /* We're missing D+D. */
1537                         return async_raid6_2data_recov(syndrome_disks+2,
1538                                                        STRIPE_SIZE, faila, failb,
1539                                                        blocks, &submit);
1540                 }
1541         }
1542 }
1543
1544 static void ops_complete_prexor(void *stripe_head_ref)
1545 {
1546         struct stripe_head *sh = stripe_head_ref;
1547
1548         pr_debug("%s: stripe %llu\n", __func__,
1549                 (unsigned long long)sh->sector);
1550 }
1551
1552 static struct dma_async_tx_descriptor *
1553 ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
1554                 struct dma_async_tx_descriptor *tx)
1555 {
1556         int disks = sh->disks;
1557         struct page **xor_srcs = to_addr_page(percpu, 0);
1558         int count = 0, pd_idx = sh->pd_idx, i;
1559         struct async_submit_ctl submit;
1560
1561         /* existing parity data subtracted */
1562         struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1563
1564         BUG_ON(sh->batch_head);
1565         pr_debug("%s: stripe %llu\n", __func__,
1566                 (unsigned long long)sh->sector);
1567
1568         for (i = disks; i--; ) {
1569                 struct r5dev *dev = &sh->dev[i];
1570                 /* Only process blocks that are known to be uptodate */
1571                 if (test_bit(R5_Wantdrain, &dev->flags))
1572                         xor_srcs[count++] = dev->page;
1573         }
1574
1575         init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1576                           ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1577         tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1578
1579         return tx;
1580 }
1581
1582 static struct dma_async_tx_descriptor *
1583 ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
1584                 struct dma_async_tx_descriptor *tx)
1585 {
1586         struct page **blocks = to_addr_page(percpu, 0);
1587         int count;
1588         struct async_submit_ctl submit;
1589
1590         pr_debug("%s: stripe %llu\n", __func__,
1591                 (unsigned long long)sh->sector);
1592
1593         count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
1594
1595         init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
1596                           ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1597         tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
1598
1599         return tx;
1600 }
1601
1602 static struct dma_async_tx_descriptor *
1603 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1604 {
1605         int disks = sh->disks;
1606         int i;
1607         struct stripe_head *head_sh = sh;
1608
1609         pr_debug("%s: stripe %llu\n", __func__,
1610                 (unsigned long long)sh->sector);
1611
1612         for (i = disks; i--; ) {
1613                 struct r5dev *dev;
1614                 struct bio *chosen;
1615
1616                 sh = head_sh;
1617                 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
1618                         struct bio *wbi;
1619
1620 again:
1621                         dev = &sh->dev[i];
1622                         spin_lock_irq(&sh->stripe_lock);
1623                         chosen = dev->towrite;
1624                         dev->towrite = NULL;
1625                         sh->overwrite_disks = 0;
1626                         BUG_ON(dev->written);
1627                         wbi = dev->written = chosen;
1628                         spin_unlock_irq(&sh->stripe_lock);
1629                         WARN_ON(dev->page != dev->orig_page);
1630
1631                         while (wbi && wbi->bi_iter.bi_sector <
1632                                 dev->sector + STRIPE_SECTORS) {
1633                                 if (wbi->bi_rw & REQ_FUA)
1634                                         set_bit(R5_WantFUA, &dev->flags);
1635                                 if (wbi->bi_rw & REQ_SYNC)
1636                                         set_bit(R5_SyncIO, &dev->flags);
1637                                 if (wbi->bi_rw & REQ_DISCARD)
1638                                         set_bit(R5_Discard, &dev->flags);
1639                                 else {
1640                                         tx = async_copy_data(1, wbi, &dev->page,
1641                                                 dev->sector, tx, sh);
1642                                         if (dev->page != dev->orig_page) {
1643                                                 set_bit(R5_SkipCopy, &dev->flags);
1644                                                 clear_bit(R5_UPTODATE, &dev->flags);
1645                                                 clear_bit(R5_OVERWRITE, &dev->flags);
1646                                         }
1647                                 }
1648                                 wbi = r5_next_bio(wbi, dev->sector);
1649                         }
1650
1651                         if (head_sh->batch_head) {
1652                                 sh = list_first_entry(&sh->batch_list,
1653                                                       struct stripe_head,
1654                                                       batch_list);
1655                                 if (sh == head_sh)
1656                                         continue;
1657                                 goto again;
1658                         }
1659                 }
1660         }
1661
1662         return tx;
1663 }
1664
1665 static void ops_complete_reconstruct(void *stripe_head_ref)
1666 {
1667         struct stripe_head *sh = stripe_head_ref;
1668         int disks = sh->disks;
1669         int pd_idx = sh->pd_idx;
1670         int qd_idx = sh->qd_idx;
1671         int i;
1672         bool fua = false, sync = false, discard = false;
1673
1674         pr_debug("%s: stripe %llu\n", __func__,
1675                 (unsigned long long)sh->sector);
1676
1677         for (i = disks; i--; ) {
1678                 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1679                 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
1680                 discard |= test_bit(R5_Discard, &sh->dev[i].flags);
1681         }
1682
1683         for (i = disks; i--; ) {
1684                 struct r5dev *dev = &sh->dev[i];
1685
1686                 if (dev->written || i == pd_idx || i == qd_idx) {
1687                         if (!discard && !test_bit(R5_SkipCopy, &dev->flags))
1688                                 set_bit(R5_UPTODATE, &dev->flags);
1689                         if (fua)
1690                                 set_bit(R5_WantFUA, &dev->flags);
1691                         if (sync)
1692                                 set_bit(R5_SyncIO, &dev->flags);
1693                 }
1694         }
1695
1696         if (sh->reconstruct_state == reconstruct_state_drain_run)
1697                 sh->reconstruct_state = reconstruct_state_drain_result;
1698         else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
1699                 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
1700         else {
1701                 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
1702                 sh->reconstruct_state = reconstruct_state_result;
1703         }
1704
1705         set_bit(STRIPE_HANDLE, &sh->state);
1706         raid5_release_stripe(sh);
1707 }
1708
1709 static void
1710 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1711                      struct dma_async_tx_descriptor *tx)
1712 {
1713         int disks = sh->disks;
1714         struct page **xor_srcs;
1715         struct async_submit_ctl submit;
1716         int count, pd_idx = sh->pd_idx, i;
1717         struct page *xor_dest;
1718         int prexor = 0;
1719         unsigned long flags;
1720         int j = 0;
1721         struct stripe_head *head_sh = sh;
1722         int last_stripe;
1723
1724         pr_debug("%s: stripe %llu\n", __func__,
1725                 (unsigned long long)sh->sector);
1726
1727         for (i = 0; i < sh->disks; i++) {
1728                 if (pd_idx == i)
1729                         continue;
1730                 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1731                         break;
1732         }
1733         if (i >= sh->disks) {
1734                 atomic_inc(&sh->count);
1735                 set_bit(R5_Discard, &sh->dev[pd_idx].flags);
1736                 ops_complete_reconstruct(sh);
1737                 return;
1738         }
1739 again:
1740         count = 0;
1741         xor_srcs = to_addr_page(percpu, j);
1742         /* check if prexor is active which means only process blocks
1743          * that are part of a read-modify-write (written)
1744          */
1745         if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1746                 prexor = 1;
1747                 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1748                 for (i = disks; i--; ) {
1749                         struct r5dev *dev = &sh->dev[i];
1750                         if (head_sh->dev[i].written)
1751                                 xor_srcs[count++] = dev->page;
1752                 }
1753         } else {
1754                 xor_dest = sh->dev[pd_idx].page;
1755                 for (i = disks; i--; ) {
1756                         struct r5dev *dev = &sh->dev[i];
1757                         if (i != pd_idx)
1758                                 xor_srcs[count++] = dev->page;
1759                 }
1760         }
1761
1762         /* 1/ if we prexor'd then the dest is reused as a source
1763          * 2/ if we did not prexor then we are redoing the parity
1764          * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
1765          * for the synchronous xor case
1766          */
1767         last_stripe = !head_sh->batch_head ||
1768                 list_first_entry(&sh->batch_list,
1769                                  struct stripe_head, batch_list) == head_sh;
1770         if (last_stripe) {
1771                 flags = ASYNC_TX_ACK |
1772                         (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
1773
1774                 atomic_inc(&head_sh->count);
1775                 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
1776                                   to_addr_conv(sh, percpu, j));
1777         } else {
1778                 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
1779                 init_async_submit(&submit, flags, tx, NULL, NULL,
1780                                   to_addr_conv(sh, percpu, j));
1781         }
1782
1783         if (unlikely(count == 1))
1784                 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1785         else
1786                 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1787         if (!last_stripe) {
1788                 j++;
1789                 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1790                                       batch_list);
1791                 goto again;
1792         }
1793 }
1794
1795 static void
1796 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1797                      struct dma_async_tx_descriptor *tx)
1798 {
1799         struct async_submit_ctl submit;
1800         struct page **blocks;
1801         int count, i, j = 0;
1802         struct stripe_head *head_sh = sh;
1803         int last_stripe;
1804         int synflags;
1805         unsigned long txflags;
1806
1807         pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1808
1809         for (i = 0; i < sh->disks; i++) {
1810                 if (sh->pd_idx == i || sh->qd_idx == i)
1811                         continue;
1812                 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1813                         break;
1814         }
1815         if (i >= sh->disks) {
1816                 atomic_inc(&sh->count);
1817                 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
1818                 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
1819                 ops_complete_reconstruct(sh);
1820                 return;
1821         }
1822
1823 again:
1824         blocks = to_addr_page(percpu, j);
1825
1826         if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1827                 synflags = SYNDROME_SRC_WRITTEN;
1828                 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
1829         } else {
1830                 synflags = SYNDROME_SRC_ALL;
1831                 txflags = ASYNC_TX_ACK;
1832         }
1833
1834         count = set_syndrome_sources(blocks, sh, synflags);
1835         last_stripe = !head_sh->batch_head ||
1836                 list_first_entry(&sh->batch_list,
1837                                  struct stripe_head, batch_list) == head_sh;
1838
1839         if (last_stripe) {
1840                 atomic_inc(&head_sh->count);
1841                 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
1842                                   head_sh, to_addr_conv(sh, percpu, j));
1843         } else
1844                 init_async_submit(&submit, 0, tx, NULL, NULL,
1845                                   to_addr_conv(sh, percpu, j));
1846         tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
1847         if (!last_stripe) {
1848                 j++;
1849                 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1850                                       batch_list);
1851                 goto again;
1852         }
1853 }
1854
1855 static void ops_complete_check(void *stripe_head_ref)
1856 {
1857         struct stripe_head *sh = stripe_head_ref;
1858
1859         pr_debug("%s: stripe %llu\n", __func__,
1860                 (unsigned long long)sh->sector);
1861
1862         sh->check_state = check_state_check_result;
1863         set_bit(STRIPE_HANDLE, &sh->state);
1864         raid5_release_stripe(sh);
1865 }
1866
1867 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
1868 {
1869         int disks = sh->disks;
1870         int pd_idx = sh->pd_idx;
1871         int qd_idx = sh->qd_idx;
1872         struct page *xor_dest;
1873         struct page **xor_srcs = to_addr_page(percpu, 0);
1874         struct dma_async_tx_descriptor *tx;
1875         struct async_submit_ctl submit;
1876         int count;
1877         int i;
1878
1879         pr_debug("%s: stripe %llu\n", __func__,
1880                 (unsigned long long)sh->sector);
1881
1882         BUG_ON(sh->batch_head);
1883         count = 0;
1884         xor_dest = sh->dev[pd_idx].page;
1885         xor_srcs[count++] = xor_dest;
1886         for (i = disks; i--; ) {
1887                 if (i == pd_idx || i == qd_idx)
1888                         continue;
1889                 xor_srcs[count++] = sh->dev[i].page;
1890         }
1891
1892         init_async_submit(&submit, 0, NULL, NULL, NULL,
1893                           to_addr_conv(sh, percpu, 0));
1894         tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
1895                            &sh->ops.zero_sum_result, &submit);
1896
1897         atomic_inc(&sh->count);
1898         init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
1899         tx = async_trigger_callback(&submit);
1900 }
1901
1902 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
1903 {
1904         struct page **srcs = to_addr_page(percpu, 0);
1905         struct async_submit_ctl submit;
1906         int count;
1907
1908         pr_debug("%s: stripe %llu checkp: %d\n", __func__,
1909                 (unsigned long long)sh->sector, checkp);
1910
1911         BUG_ON(sh->batch_head);
1912         count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
1913         if (!checkp)
1914                 srcs[count] = NULL;
1915
1916         atomic_inc(&sh->count);
1917         init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
1918                           sh, to_addr_conv(sh, percpu, 0));
1919         async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
1920                            &sh->ops.zero_sum_result, percpu->spare_page, &submit);
1921 }
1922
1923 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1924 {
1925         int overlap_clear = 0, i, disks = sh->disks;
1926         struct dma_async_tx_descriptor *tx = NULL;
1927         struct r5conf *conf = sh->raid_conf;
1928         int level = conf->level;
1929         struct raid5_percpu *percpu;
1930         unsigned long cpu;
1931
1932         cpu = get_cpu();
1933         percpu = per_cpu_ptr(conf->percpu, cpu);
1934         if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
1935                 ops_run_biofill(sh);
1936                 overlap_clear++;
1937         }
1938
1939         if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
1940                 if (level < 6)
1941                         tx = ops_run_compute5(sh, percpu);
1942                 else {
1943                         if (sh->ops.target2 < 0 || sh->ops.target < 0)
1944                                 tx = ops_run_compute6_1(sh, percpu);
1945                         else
1946                                 tx = ops_run_compute6_2(sh, percpu);
1947                 }
1948                 /* terminate the chain if reconstruct is not set to be run */
1949                 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
1950                         async_tx_ack(tx);
1951         }
1952
1953         if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
1954                 if (level < 6)
1955                         tx = ops_run_prexor5(sh, percpu, tx);
1956                 else
1957                         tx = ops_run_prexor6(sh, percpu, tx);
1958         }
1959
1960         if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
1961                 tx = ops_run_biodrain(sh, tx);
1962                 overlap_clear++;
1963         }
1964
1965         if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
1966                 if (level < 6)
1967                         ops_run_reconstruct5(sh, percpu, tx);
1968                 else
1969                         ops_run_reconstruct6(sh, percpu, tx);
1970         }
1971
1972         if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
1973                 if (sh->check_state == check_state_run)
1974                         ops_run_check_p(sh, percpu);
1975                 else if (sh->check_state == check_state_run_q)
1976                         ops_run_check_pq(sh, percpu, 0);
1977                 else if (sh->check_state == check_state_run_pq)
1978                         ops_run_check_pq(sh, percpu, 1);
1979                 else
1980                         BUG();
1981         }
1982
1983         if (overlap_clear && !sh->batch_head)
1984                 for (i = disks; i--; ) {
1985                         struct r5dev *dev = &sh->dev[i];
1986                         if (test_and_clear_bit(R5_Overlap, &dev->flags))
1987                                 wake_up(&sh->raid_conf->wait_for_overlap);
1988                 }
1989         put_cpu();
1990 }
1991
1992 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp)
1993 {
1994         struct stripe_head *sh;
1995
1996         sh = kmem_cache_zalloc(sc, gfp);
1997         if (sh) {
1998                 spin_lock_init(&sh->stripe_lock);
1999                 spin_lock_init(&sh->batch_lock);
2000                 INIT_LIST_HEAD(&sh->batch_list);
2001                 INIT_LIST_HEAD(&sh->lru);
2002                 atomic_set(&sh->count, 1);
2003         }
2004         return sh;
2005 }
2006 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
2007 {
2008         struct stripe_head *sh;
2009
2010         sh = alloc_stripe(conf->slab_cache, gfp);
2011         if (!sh)
2012                 return 0;
2013
2014         sh->raid_conf = conf;
2015
2016         if (grow_buffers(sh, gfp)) {
2017                 shrink_buffers(sh);
2018                 kmem_cache_free(conf->slab_cache, sh);
2019                 return 0;
2020         }
2021         sh->hash_lock_index =
2022                 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
2023         /* we just created an active stripe so... */
2024         atomic_inc(&conf->active_stripes);
2025
2026         raid5_release_stripe(sh);
2027         conf->max_nr_stripes++;
2028         return 1;
2029 }
2030
2031 static int grow_stripes(struct r5conf *conf, int num)
2032 {
2033         struct kmem_cache *sc;
2034         int devs = max(conf->raid_disks, conf->previous_raid_disks);
2035
2036         if (conf->mddev->gendisk)
2037                 sprintf(conf->cache_name[0],
2038                         "raid%d-%s", conf->level, mdname(conf->mddev));
2039         else
2040                 sprintf(conf->cache_name[0],
2041                         "raid%d-%p", conf->level, conf->mddev);
2042         sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);
2043
2044         conf->active_name = 0;
2045         sc = kmem_cache_create(conf->cache_name[conf->active_name],
2046                                sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
2047                                0, 0, NULL);
2048         if (!sc)
2049                 return 1;
2050         conf->slab_cache = sc;
2051         conf->pool_size = devs;
2052         while (num--)
2053                 if (!grow_one_stripe(conf, GFP_KERNEL))
2054                         return 1;
2055
2056         return 0;
2057 }
2058
2059 /**
2060  * scribble_len - return the required size of the scribble region
2061  * @num - total number of disks in the array
2062  *
2063  * The size must be enough to contain:
2064  * 1/ a struct page pointer for each device in the array +2
2065  * 2/ room to convert each entry in (1) to its corresponding dma
2066  *    (dma_map_page()) or page (page_address()) address.
2067  *
2068  * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
2069  * calculate over all devices (not just the data blocks), using zeros in place
2070  * of the P and Q blocks.
2071  */
2072 static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
2073 {
2074         struct flex_array *ret;
2075         size_t len;
2076
2077         len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
2078         ret = flex_array_alloc(len, cnt, flags);
2079         if (!ret)
2080                 return NULL;
2081         /* always prealloc all elements, so no locking is required */
2082         if (flex_array_prealloc(ret, 0, cnt, flags)) {
2083                 flex_array_free(ret);
2084                 return NULL;
2085         }
2086         return ret;
2087 }
2088
2089 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
2090 {
2091         unsigned long cpu;
2092         int err = 0;
2093
2094         mddev_suspend(conf->mddev);
2095         get_online_cpus();
2096         for_each_present_cpu(cpu) {
2097                 struct raid5_percpu *percpu;
2098                 struct flex_array *scribble;
2099
2100                 percpu = per_cpu_ptr(conf->percpu, cpu);
2101                 scribble = scribble_alloc(new_disks,
2102                                           new_sectors / STRIPE_SECTORS,
2103                                           GFP_NOIO);
2104
2105                 if (scribble) {
2106                         flex_array_free(percpu->scribble);
2107                         percpu->scribble = scribble;
2108                 } else {
2109                         err = -ENOMEM;
2110                         break;
2111                 }
2112         }
2113         put_online_cpus();
2114         mddev_resume(conf->mddev);
2115         return err;
2116 }
2117
2118 static int resize_stripes(struct r5conf *conf, int newsize)
2119 {
2120         /* Make all the stripes able to hold 'newsize' devices.
2121          * New slots in each stripe get 'page' set to a new page.
2122          *
2123          * This happens in stages:
2124          * 1/ create a new kmem_cache and allocate the required number of
2125          *    stripe_heads.
2126          * 2/ gather all the old stripe_heads and transfer the pages across
2127          *    to the new stripe_heads.  This will have the side effect of
2128          *    freezing the array as once all stripe_heads have been collected,
2129          *    no IO will be possible.  Old stripe heads are freed once their
2130          *    pages have been transferred over, and the old kmem_cache is
2131          *    freed when all stripes are done.
2132          * 3/ reallocate conf->disks to be suitable bigger.  If this fails,
2133          *    we simple return a failre status - no need to clean anything up.
2134          * 4/ allocate new pages for the new slots in the new stripe_heads.
2135          *    If this fails, we don't bother trying the shrink the
2136          *    stripe_heads down again, we just leave them as they are.
2137          *    As each stripe_head is processed the new one is released into
2138          *    active service.
2139          *
2140          * Once step2 is started, we cannot afford to wait for a write,
2141          * so we use GFP_NOIO allocations.
2142          */
2143         struct stripe_head *osh, *nsh;
2144         LIST_HEAD(newstripes);
2145         struct disk_info *ndisks;
2146         int err;
2147         struct kmem_cache *sc;
2148         int i;
2149         int hash, cnt;
2150
2151         if (newsize <= conf->pool_size)
2152                 return 0; /* never bother to shrink */
2153
2154         err = md_allow_write(conf->mddev);
2155         if (err)
2156                 return err;
2157
2158         /* Step 1 */
2159         sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
2160                                sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
2161                                0, 0, NULL);
2162         if (!sc)
2163                 return -ENOMEM;
2164
2165         /* Need to ensure auto-resizing doesn't interfere */
2166         mutex_lock(&conf->cache_size_mutex);
2167
2168         for (i = conf->max_nr_stripes; i; i--) {
2169                 nsh = alloc_stripe(sc, GFP_KERNEL);
2170                 if (!nsh)
2171                         break;
2172
2173                 nsh->raid_conf = conf;
2174                 list_add(&nsh->lru, &newstripes);
2175         }
2176         if (i) {
2177                 /* didn't get enough, give up */
2178                 while (!list_empty(&newstripes)) {
2179                         nsh = list_entry(newstripes.next, struct stripe_head, lru);
2180                         list_del(&nsh->lru);
2181                         kmem_cache_free(sc, nsh);
2182                 }
2183                 kmem_cache_destroy(sc);
2184                 mutex_unlock(&conf->cache_size_mutex);
2185                 return -ENOMEM;
2186         }
2187         /* Step 2 - Must use GFP_NOIO now.
2188          * OK, we have enough stripes, start collecting inactive
2189          * stripes and copying them over
2190          */
2191         hash = 0;
2192         cnt = 0;
2193         list_for_each_entry(nsh, &newstripes, lru) {
2194                 lock_device_hash_lock(conf, hash);
2195                 wait_event_exclusive_cmd(conf->wait_for_stripe[hash],
2196                                     !list_empty(conf->inactive_list + hash),
2197                                     unlock_device_hash_lock(conf, hash),
2198                                     lock_device_hash_lock(conf, hash));
2199                 osh = get_free_stripe(conf, hash);
2200                 unlock_device_hash_lock(conf, hash);
2201
2202                 for(i=0; i<conf->pool_size; i++) {
2203                         nsh->dev[i].page = osh->dev[i].page;
2204                         nsh->dev[i].orig_page = osh->dev[i].page;
2205                 }
2206                 nsh->hash_lock_index = hash;
2207                 kmem_cache_free(conf->slab_cache, osh);
2208                 cnt++;
2209                 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
2210                     !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
2211                         hash++;
2212                         cnt = 0;
2213                 }
2214         }
2215         kmem_cache_destroy(conf->slab_cache);
2216
2217         /* Step 3.
2218          * At this point, we are holding all the stripes so the array
2219          * is completely stalled, so now is a good time to resize
2220          * conf->disks and the scribble region
2221          */
2222         ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
2223         if (ndisks) {
2224                 for (i=0; i<conf->raid_disks; i++)
2225                         ndisks[i] = conf->disks[i];
2226                 kfree(conf->disks);
2227                 conf->disks = ndisks;
2228         } else
2229                 err = -ENOMEM;
2230
2231         mutex_unlock(&conf->cache_size_mutex);
2232         /* Step 4, return new stripes to service */
2233         while(!list_empty(&newstripes)) {
2234                 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2235                 list_del_init(&nsh->lru);
2236
2237                 for (i=conf->raid_disks; i < newsize; i++)
2238                         if (nsh->dev[i].page == NULL) {
2239                                 struct page *p = alloc_page(GFP_NOIO);
2240                                 nsh->dev[i].page = p;
2241                                 nsh->dev[i].orig_page = p;
2242                                 if (!p)
2243                                         err = -ENOMEM;
2244                         }
2245                 raid5_release_stripe(nsh);
2246         }
2247         /* critical section pass, GFP_NOIO no longer needed */
2248
2249         conf->slab_cache = sc;
2250         conf->active_name = 1-conf->active_name;
2251         if (!err)
2252                 conf->pool_size = newsize;
2253         return err;
2254 }
2255
2256 static int drop_one_stripe(struct r5conf *conf)
2257 {
2258         struct stripe_head *sh;
2259         int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
2260
2261         spin_lock_irq(conf->hash_locks + hash);
2262         sh = get_free_stripe(conf, hash);
2263         spin_unlock_irq(conf->hash_locks + hash);
2264         if (!sh)
2265                 return 0;
2266         BUG_ON(atomic_read(&sh->count));
2267         shrink_buffers(sh);
2268         kmem_cache_free(conf->slab_cache, sh);
2269         atomic_dec(&conf->active_stripes);
2270         conf->max_nr_stripes--;
2271         return 1;
2272 }
2273
2274 static void shrink_stripes(struct r5conf *conf)
2275 {
2276         while (conf->max_nr_stripes &&
2277                drop_one_stripe(conf))
2278                 ;
2279
2280         kmem_cache_destroy(conf->slab_cache);
2281         conf->slab_cache = NULL;
2282 }
2283
2284 static void raid5_end_read_request(struct bio * bi)
2285 {
2286         struct stripe_head *sh = bi->bi_private;
2287         struct r5conf *conf = sh->raid_conf;
2288         int disks = sh->disks, i;
2289         char b[BDEVNAME_SIZE];
2290         struct md_rdev *rdev = NULL;
2291         sector_t s;
2292
2293         for (i=0 ; i<disks; i++)
2294                 if (bi == &sh->dev[i].req)
2295                         break;
2296
2297         pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
2298                 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2299                 bi->bi_error);
2300         if (i == disks) {
2301                 BUG();
2302                 return;
2303         }
2304         if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2305                 /* If replacement finished while this request was outstanding,
2306                  * 'replacement' might be NULL already.
2307                  * In that case it moved down to 'rdev'.
2308                  * rdev is not removed until all requests are finished.
2309                  */
2310                 rdev = conf->disks[i].replacement;
2311         if (!rdev)
2312                 rdev = conf->disks[i].rdev;
2313
2314         if (use_new_offset(conf, sh))
2315                 s = sh->sector + rdev->new_data_offset;
2316         else
2317                 s = sh->sector + rdev->data_offset;
2318         if (!bi->bi_error) {
2319                 set_bit(R5_UPTODATE, &sh->dev[i].flags);
2320                 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2321                         /* Note that this cannot happen on a
2322                          * replacement device.  We just fail those on
2323                          * any error
2324                          */
2325                         printk_ratelimited(
2326                                 KERN_INFO
2327                                 "md/raid:%s: read error corrected"
2328                                 " (%lu sectors at %llu on %s)\n",
2329                                 mdname(conf->mddev), STRIPE_SECTORS,
2330                                 (unsigned long long)s,
2331                                 bdevname(rdev->bdev, b));
2332                         atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
2333                         clear_bit(R5_ReadError, &sh->dev[i].flags);
2334                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
2335                 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2336                         clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2337
2338                 if (atomic_read(&rdev->read_errors))
2339                         atomic_set(&rdev->read_errors, 0);
2340         } else {
2341                 const char *bdn = bdevname(rdev->bdev, b);
2342                 int retry = 0;
2343                 int set_bad = 0;
2344
2345                 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
2346                 atomic_inc(&rdev->read_errors);
2347                 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2348                         printk_ratelimited(
2349                                 KERN_WARNING
2350                                 "md/raid:%s: read error on replacement device "
2351                                 "(sector %llu on %s).\n",
2352                                 mdname(conf->mddev),
2353                                 (unsigned long long)s,
2354                                 bdn);
2355                 else if (conf->mddev->degraded >= conf->max_degraded) {
2356                         set_bad = 1;
2357                         printk_ratelimited(
2358                                 KERN_WARNING
2359                                 "md/raid:%s: read error not correctable "
2360                                 "(sector %llu on %s).\n",
2361                                 mdname(conf->mddev),
2362                                 (unsigned long long)s,
2363                                 bdn);
2364                 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
2365                         /* Oh, no!!! */
2366                         set_bad = 1;
2367                         printk_ratelimited(
2368                                 KERN_WARNING
2369                                 "md/raid:%s: read error NOT corrected!! "
2370                                 "(sector %llu on %s).\n",
2371                                 mdname(conf->mddev),
2372                                 (unsigned long long)s,
2373                                 bdn);
2374                 } else if (atomic_read(&rdev->read_errors)
2375                          > conf->max_nr_stripes)
2376                         printk(KERN_WARNING
2377                                "md/raid:%s: Too many read errors, failing device %s.\n",
2378                                mdname(conf->mddev), bdn);
2379                 else
2380                         retry = 1;
2381                 if (set_bad && test_bit(In_sync, &rdev->flags)
2382                     && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2383                         retry = 1;
2384                 if (retry)
2385                         if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
2386                                 set_bit(R5_ReadError, &sh->dev[i].flags);
2387                                 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2388                         } else
2389                                 set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2390                 else {
2391                         clear_bit(R5_ReadError, &sh->dev[i].flags);
2392                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
2393                         if (!(set_bad
2394                               && test_bit(In_sync, &rdev->flags)
2395                               && rdev_set_badblocks(
2396                                       rdev, sh->sector, STRIPE_SECTORS, 0)))
2397                                 md_error(conf->mddev, rdev);
2398                 }
2399         }
2400         rdev_dec_pending(rdev, conf->mddev);
2401         clear_bit(R5_LOCKED, &sh->dev[i].flags);
2402         set_bit(STRIPE_HANDLE, &sh->state);
2403         raid5_release_stripe(sh);
2404 }
2405
2406 static void raid5_end_write_request(struct bio *bi)
2407 {
2408         struct stripe_head *sh = bi->bi_private;
2409         struct r5conf *conf = sh->raid_conf;
2410         int disks = sh->disks, i;
2411         struct md_rdev *uninitialized_var(rdev);
2412         sector_t first_bad;
2413         int bad_sectors;
2414         int replacement = 0;
2415
2416         for (i = 0 ; i < disks; i++) {
2417                 if (bi == &sh->dev[i].req) {
2418                         rdev = conf->disks[i].rdev;
2419                         break;
2420                 }
2421                 if (bi == &sh->dev[i].rreq) {
2422                         rdev = conf->disks[i].replacement;
2423                         if (rdev)
2424                                 replacement = 1;
2425                         else
2426                                 /* rdev was removed and 'replacement'
2427                                  * replaced it.  rdev is not removed
2428                                  * until all requests are finished.
2429                                  */
2430                                 rdev = conf->disks[i].rdev;
2431                         break;
2432                 }
2433         }
2434         pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
2435                 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2436                 bi->bi_error);
2437         if (i == disks) {
2438                 BUG();
2439                 return;
2440         }
2441
2442         if (replacement) {
2443                 if (bi->bi_error)
2444                         md_error(conf->mddev, rdev);
2445                 else if (is_badblock(rdev, sh->sector,
2446                                      STRIPE_SECTORS,
2447                                      &first_bad, &bad_sectors))
2448                         set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
2449         } else {
2450                 if (bi->bi_error) {
2451                         set_bit(STRIPE_DEGRADED, &sh->state);
2452                         set_bit(WriteErrorSeen, &rdev->flags);
2453                         set_bit(R5_WriteError, &sh->dev[i].flags);
2454                         if (!test_and_set_bit(WantReplacement, &rdev->flags))
2455                                 set_bit(MD_RECOVERY_NEEDED,
2456                                         &rdev->mddev->recovery);
2457                 } else if (is_badblock(rdev, sh->sector,
2458                                        STRIPE_SECTORS,
2459                                        &first_bad, &bad_sectors)) {
2460                         set_bit(R5_MadeGood, &sh->dev[i].flags);
2461                         if (test_bit(R5_ReadError, &sh->dev[i].flags))
2462                                 /* That was a successful write so make
2463                                  * sure it looks like we already did
2464                                  * a re-write.
2465                                  */
2466                                 set_bit(R5_ReWrite, &sh->dev[i].flags);
2467                 }
2468         }
2469         rdev_dec_pending(rdev, conf->mddev);
2470
2471         if (sh->batch_head && bi->bi_error && !replacement)
2472                 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
2473
2474         if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
2475                 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2476         set_bit(STRIPE_HANDLE, &sh->state);
2477         raid5_release_stripe(sh);
2478
2479         if (sh->batch_head && sh != sh->batch_head)
2480                 raid5_release_stripe(sh->batch_head);
2481 }
2482
2483 static void raid5_build_block(struct stripe_head *sh, int i, int previous)
2484 {
2485         struct r5dev *dev = &sh->dev[i];
2486
2487         bio_init(&dev->req);
2488         dev->req.bi_io_vec = &dev->vec;
2489         dev->req.bi_max_vecs = 1;
2490         dev->req.bi_private = sh;
2491
2492         bio_init(&dev->rreq);
2493         dev->rreq.bi_io_vec = &dev->rvec;
2494         dev->rreq.bi_max_vecs = 1;
2495         dev->rreq.bi_private = sh;
2496
2497         dev->flags = 0;
2498         dev->sector = raid5_compute_blocknr(sh, i, previous);
2499 }
2500
2501 static void error(struct mddev *mddev, struct md_rdev *rdev)
2502 {
2503         char b[BDEVNAME_SIZE];
2504         struct r5conf *conf = mddev->private;
2505         unsigned long flags;
2506         pr_debug("raid456: error called\n");
2507
2508         spin_lock_irqsave(&conf->device_lock, flags);
2509         clear_bit(In_sync, &rdev->flags);
2510         mddev->degraded = calc_degraded(conf);
2511         spin_unlock_irqrestore(&conf->device_lock, flags);
2512         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2513
2514         set_bit(Blocked, &rdev->flags);
2515         set_bit(Faulty, &rdev->flags);
2516         set_bit(MD_CHANGE_DEVS, &mddev->flags);
2517         set_bit(MD_CHANGE_PENDING, &mddev->flags);
2518         printk(KERN_ALERT
2519                "md/raid:%s: Disk failure on %s, disabling device.\n"
2520                "md/raid:%s: Operation continuing on %d devices.\n",
2521                mdname(mddev),
2522                bdevname(rdev->bdev, b),
2523                mdname(mddev),
2524                conf->raid_disks - mddev->degraded);
2525 }
2526
/*
 * Map an array ('big') sector number onto the stripe layout.
 *
 * Input:  r_sector - a 'big' sector number,
 *         previous - non-zero selects conf->prev_algo,
 *                    conf->prev_chunk_sectors and conf->previous_raid_disks
 *                    instead of the current geometry.
 * Output: *dd_idx  - index of the data disk holding the sector,
 *         sh       - if non-NULL, receives pd_idx, qd_idx and ddf_layout
 *                    for the stripe.
 * Returns the sector number within the member devices.
 *
 * qd_idx stays -1 for levels 4 and 5; both pd_idx and qd_idx stay -1 if
 * conf->level is none of 4, 5 or 6.
 */
sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
                              int previous, int *dd_idx,
                              struct stripe_head *sh)
{
        sector_t stripe, stripe2;
        sector_t chunk_number;
        unsigned int chunk_offset;
        int pd_idx, qd_idx;
        int ddf_layout = 0;
        sector_t new_sector;
        int algorithm = previous ? conf->prev_algo
                                 : conf->algorithm;
        int sectors_per_chunk = previous ? conf->prev_chunk_sectors
                                         : conf->chunk_sectors;
        int raid_disks = previous ? conf->previous_raid_disks
                                  : conf->raid_disks;
        int data_disks = raid_disks - conf->max_degraded;

        /* First compute the information on this sector */

        /*
         * Compute the chunk number and the sector offset inside the chunk.
         * sector_div() is used here as an in-place divide: its result is
         * taken as the offset (remainder) and r_sector is afterwards used
         * as the chunk number (quotient).
         */
        chunk_offset = sector_div(r_sector, sectors_per_chunk);
        chunk_number = r_sector;

        /*
         * Compute the stripe number; the remainder is the (layout
         * independent) index of the data chunk within the stripe.
         */
        stripe = chunk_number;
        *dd_idx = sector_div(stripe, data_disks);
        /* stripe2 is a scratch copy: the sector_div() calls below clobber it */
        stripe2 = stripe;
        /*
         * Select the parity disk based on the user selected algorithm.
         */
        pd_idx = qd_idx = -1;
        switch(conf->level) {
        case 4:
                pd_idx = data_disks;
                break;
        case 5:
                switch (algorithm) {
                case ALGORITHM_LEFT_ASYMMETRIC:
                        pd_idx = data_disks - sector_div(stripe2, raid_disks);
                        if (*dd_idx >= pd_idx)
                                (*dd_idx)++;
                        break;
                case ALGORITHM_RIGHT_ASYMMETRIC:
                        pd_idx = sector_div(stripe2, raid_disks);
                        if (*dd_idx >= pd_idx)
                                (*dd_idx)++;
                        break;
                case ALGORITHM_LEFT_SYMMETRIC:
                        pd_idx = data_disks - sector_div(stripe2, raid_disks);
                        *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
                        break;
                case ALGORITHM_RIGHT_SYMMETRIC:
                        pd_idx = sector_div(stripe2, raid_disks);
                        *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
                        break;
                case ALGORITHM_PARITY_0:
                        pd_idx = 0;
                        (*dd_idx)++;
                        break;
                case ALGORITHM_PARITY_N:
                        pd_idx = data_disks;
                        break;
                default:
                        BUG();
                }
                break;
        case 6:

                switch (algorithm) {
                case ALGORITHM_LEFT_ASYMMETRIC:
                        pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
                        qd_idx = pd_idx + 1;
                        if (pd_idx == raid_disks-1) {
                                (*dd_idx)++;    /* Q D D D P */
                                qd_idx = 0;
                        } else if (*dd_idx >= pd_idx)
                                (*dd_idx) += 2; /* D D P Q D */
                        break;
                case ALGORITHM_RIGHT_ASYMMETRIC:
                        pd_idx = sector_div(stripe2, raid_disks);
                        qd_idx = pd_idx + 1;
                        if (pd_idx == raid_disks-1) {
                                (*dd_idx)++;    /* Q D D D P */
                                qd_idx = 0;
                        } else if (*dd_idx >= pd_idx)
                                (*dd_idx) += 2; /* D D P Q D */
                        break;
                case ALGORITHM_LEFT_SYMMETRIC:
                        pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
                        qd_idx = (pd_idx + 1) % raid_disks;
                        *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
                        break;
                case ALGORITHM_RIGHT_SYMMETRIC:
                        pd_idx = sector_div(stripe2, raid_disks);
                        qd_idx = (pd_idx + 1) % raid_disks;
                        *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
                        break;

                case ALGORITHM_PARITY_0:
                        pd_idx = 0;
                        qd_idx = 1;
                        (*dd_idx) += 2;
                        break;
                case ALGORITHM_PARITY_N:
                        pd_idx = data_disks;
                        qd_idx = data_disks + 1;
                        break;

                case ALGORITHM_ROTATING_ZERO_RESTART:
                        /* Exactly the same as RIGHT_ASYMMETRIC, but order
                         * of blocks for computing Q is different.
                         */
                        pd_idx = sector_div(stripe2, raid_disks);
                        qd_idx = pd_idx + 1;
                        if (pd_idx == raid_disks-1) {
                                (*dd_idx)++;    /* Q D D D P */
                                qd_idx = 0;
                        } else if (*dd_idx >= pd_idx)
                                (*dd_idx) += 2; /* D D P Q D */
                        ddf_layout = 1;
                        break;

                case ALGORITHM_ROTATING_N_RESTART:
                        /* Same as left_asymmetric, but first stripe is
                         * D D D P Q  rather than
                         * Q D D D P
                         */
                        stripe2 += 1;
                        pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
                        qd_idx = pd_idx + 1;
                        if (pd_idx == raid_disks-1) {
                                (*dd_idx)++;    /* Q D D D P */
                                qd_idx = 0;
                        } else if (*dd_idx >= pd_idx)
                                (*dd_idx) += 2; /* D D P Q D */
                        ddf_layout = 1;
                        break;

                case ALGORITHM_ROTATING_N_CONTINUE:
                        /* Same as left_symmetric but Q is before P */
                        pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
                        qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
                        *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
                        ddf_layout = 1;
                        break;

                case ALGORITHM_LEFT_ASYMMETRIC_6:
                        /* RAID5 left_asymmetric, with Q on last device */
                        pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
                        if (*dd_idx >= pd_idx)
                                (*dd_idx)++;
                        qd_idx = raid_disks - 1;
                        break;

                case ALGORITHM_RIGHT_ASYMMETRIC_6:
                        /* RAID5 right_asymmetric, with Q on last device */
                        pd_idx = sector_div(stripe2, raid_disks-1);
                        if (*dd_idx >= pd_idx)
                                (*dd_idx)++;
                        qd_idx = raid_disks - 1;
                        break;

                case ALGORITHM_LEFT_SYMMETRIC_6:
                        /* RAID5 left_symmetric, with Q on last device */
                        pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
                        *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
                        qd_idx = raid_disks - 1;
                        break;

                case ALGORITHM_RIGHT_SYMMETRIC_6:
                        /* RAID5 right_symmetric, with Q on last device */
                        pd_idx = sector_div(stripe2, raid_disks-1);
                        *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
                        qd_idx = raid_disks - 1;
                        break;

                case ALGORITHM_PARITY_0_6:
                        /* P on first device, Q on last device */
                        pd_idx = 0;
                        (*dd_idx)++;
                        qd_idx = raid_disks - 1;
                        break;

                default:
                        BUG();
                }
                break;
        }

        /* Record the layout in the caller's stripe_head, if one was given */
        if (sh) {
                sh->pd_idx = pd_idx;
                sh->qd_idx = qd_idx;
                sh->ddf_layout = ddf_layout;
        }
        /*
         * Finally, compute the new sector number
         */
        new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
        return new_sector;
}
2732
/*
 * Inverse of raid5_compute_sector(): given stripe @sh and device index @i,
 * work out the array sector stored at sh->sector on that device.
 * @previous selects conf->prev_chunk_sectors / conf->prev_algo instead of
 * the current chunk size and algorithm.
 *
 * Returns 0 when @i is the P device (or, for level 6, the Q device), and
 * also when the result fails the cross-check against
 * raid5_compute_sector() (an error is logged in that case); otherwise
 * returns the array sector.  The return value alone therefore does not
 * distinguish these cases from a genuine sector 0.
 */
sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
{
        struct r5conf *conf = sh->raid_conf;
        int raid_disks = sh->disks;
        int data_disks = raid_disks - conf->max_degraded;
        sector_t new_sector = sh->sector, check;
        int sectors_per_chunk = previous ? conf->prev_chunk_sectors
                                         : conf->chunk_sectors;
        int algorithm = previous ? conf->prev_algo
                                 : conf->algorithm;
        sector_t stripe;
        int chunk_offset;
        sector_t chunk_number;
        /* dd_idx keeps the original device index; i is rewritten below */
        int dummy1, dd_idx = i;
        sector_t r_sector;
        /* scratch stripe_head, only used to receive pd_idx/qd_idx for the check */
        struct stripe_head sh2;

        /* split the device sector into stripe number and offset in chunk */
        chunk_offset = sector_div(new_sector, sectors_per_chunk);
        stripe = new_sector;

        if (i == sh->pd_idx)
                return 0;
        /*
         * Turn the device index i into the index of the data chunk
         * within the stripe by undoing the layout-specific placement.
         */
        switch(conf->level) {
        case 4: break;
        case 5:
                switch (algorithm) {
                case ALGORITHM_LEFT_ASYMMETRIC:
                case ALGORITHM_RIGHT_ASYMMETRIC:
                        if (i > sh->pd_idx)
                                i--;
                        break;
                case ALGORITHM_LEFT_SYMMETRIC:
                case ALGORITHM_RIGHT_SYMMETRIC:
                        if (i < sh->pd_idx)
                                i += raid_disks;
                        i -= (sh->pd_idx + 1);
                        break;
                case ALGORITHM_PARITY_0:
                        i -= 1;
                        break;
                case ALGORITHM_PARITY_N:
                        break;
                default:
                        BUG();
                }
                break;
        case 6:
                if (i == sh->qd_idx)
                        return 0; /* It is the Q disk */
                switch (algorithm) {
                case ALGORITHM_LEFT_ASYMMETRIC:
                case ALGORITHM_RIGHT_ASYMMETRIC:
                case ALGORITHM_ROTATING_ZERO_RESTART:
                case ALGORITHM_ROTATING_N_RESTART:
                        if (sh->pd_idx == raid_disks-1)
                                i--;    /* Q D D D P */
                        else if (i > sh->pd_idx)
                                i -= 2; /* D D P Q D */
                        break;
                case ALGORITHM_LEFT_SYMMETRIC:
                case ALGORITHM_RIGHT_SYMMETRIC:
                        if (sh->pd_idx == raid_disks-1)
                                i--; /* Q D D D P */
                        else {
                                /* D D P Q D */
                                if (i < sh->pd_idx)
                                        i += raid_disks;
                                i -= (sh->pd_idx + 2);
                        }
                        break;
                case ALGORITHM_PARITY_0:
                        i -= 2;
                        break;
                case ALGORITHM_PARITY_N:
                        break;
                case ALGORITHM_ROTATING_N_CONTINUE:
                        /* Like left_symmetric, but Q is placed just before P */
                        if (sh->pd_idx == 0)
                                i--;    /* P D D D Q */
                        else {
                                /* D D Q P D */
                                if (i < sh->pd_idx)
                                        i += raid_disks;
                                i -= (sh->pd_idx + 1);
                        }
                        break;
                case ALGORITHM_LEFT_ASYMMETRIC_6:
                case ALGORITHM_RIGHT_ASYMMETRIC_6:
                        if (i > sh->pd_idx)
                                i--;
                        break;
                case ALGORITHM_LEFT_SYMMETRIC_6:
                case ALGORITHM_RIGHT_SYMMETRIC_6:
                        if (i < sh->pd_idx)
                                i += data_disks + 1;
                        i -= (sh->pd_idx + 1);
                        break;
                case ALGORITHM_PARITY_0_6:
                        i -= 1;
                        break;
                default:
                        BUG();
                }
                break;
        }

        /* reassemble the array sector from stripe, data index and offset */
        chunk_number = stripe * data_disks + i;
        r_sector = chunk_number * sectors_per_chunk + chunk_offset;

        /*
         * Sanity check: mapping r_sector forward again must give back the
         * same device sector, device index and parity positions.
         */
        check = raid5_compute_sector(conf, r_sector,
                                     previous, &dummy1, &sh2);
        if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
                || sh2.qd_idx != sh->qd_idx) {
                printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n",
                       mdname(conf->mddev));
                return 0;
        }
        return r_sector;
}
2852
2853 static void
2854 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2855                          int rcw, int expand)
2856 {
2857         int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
2858         struct r5conf *conf = sh->raid_conf;
2859         int level = conf->level;
2860
2861         if (rcw) {
2862
2863                 for (i = disks; i--; ) {
2864                         struct r5dev *dev = &sh->dev[i];
2865
2866                         if (dev->towrite) {
2867                                 set_bit(R5_LOCKED, &dev->flags);
2868                                 set_bit(R5_Wantdrain, &dev->flags);
2869                                 if (!expand)
2870                                         clear_bit(R5_UPTODATE, &dev->flags);
2871                                 s->locked++;
2872                         }
2873                 }
2874                 /* if we are not expanding this is a proper write request, and
2875                  * there will be bios with new data to be drained into the
2876                  * stripe cache
2877                  */
2878                 if (!expand) {
2879                         if (!s->locked)
2880                                 /* False alarm, nothing to do */
2881                                 return;
2882                         sh->reconstruct_state = reconstruct_state_drain_run;
2883                         set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2884                 } else
2885                         sh->reconstruct_state = reconstruct_state_run;
2886
2887                 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2888
2889                 if (s->locked + conf->max_degraded == disks)
2890                         if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2891                                 atomic_inc(&conf->pending_full_writes);
2892         } else {
2893                 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
2894                         test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
2895                 BUG_ON(level == 6 &&
2896                         (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
2897                            test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
2898
2899                 for (i = disks; i--; ) {
2900                         struct r5dev *dev = &sh->dev[i];
2901                         if (i == pd_idx || i == qd_idx)
2902                                 continue;
2903
2904                         if (dev->towrite &&
2905                             (test_bit(R5_UPTODATE, &dev->flags) ||
2906                              test_bit(R5_Wantcompute, &dev->flags))) {
2907                                 set_bit(R5_Wantdrain, &dev->flags);
2908                                 set_bit(R5_LOCKED, &dev->flags);
2909                                 clear_bit(R5_UPTODATE, &dev->flags);
2910                                 s->locked++;
2911                         }
2912                 }
2913                 if (!s->locked)
2914                         /* False alarm - nothing to do */
2915                         return;
2916                 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
2917                 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
2918                 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2919                 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2920         }
2921
2922         /* keep the parity disk(s) locked while asynchronous operations
2923          * are in flight
2924          */
2925         set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
2926         clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2927         s->locked++;
2928
2929         if (level == 6) {
2930                 int qd_idx = sh->qd_idx;
2931                 struct r5dev *dev = &sh->dev[qd_idx];
2932
2933                 set_bit(R5_LOCKED, &dev->flags);
2934                 clear_bit(R5_UPTODATE, &dev->flags);
2935                 s->locked++;
2936         }
2937
2938         pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
2939                 __func__, (unsigned long long)sh->sector,
2940                 s->locked, s->ops_request);
2941 }
2942
2943 /*
 * Each stripe/dev can have one or more bios attached.
2945  * toread/towrite point to the first in a chain.
2946  * The bi_next chain must be in order.
2947  */
2948 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
2949                           int forwrite, int previous)
2950 {
2951         struct bio **bip;
2952         struct r5conf *conf = sh->raid_conf;
2953         int firstwrite=0;
2954
2955         pr_debug("adding bi b#%llu to stripe s#%llu\n",
2956                 (unsigned long long)bi->bi_iter.bi_sector,
2957                 (unsigned long long)sh->sector);
2958
2959         /*
2960          * If several bio share a stripe. The bio bi_phys_segments acts as a
2961          * reference count to avoid race. The reference count should already be
2962          * increased before this function is called (for example, in
2963          * make_request()), so other bio sharing this stripe will not free the
2964          * stripe. If a stripe is owned by one stripe, the stripe lock will
2965          * protect it.
2966          */
2967         spin_lock_irq(&sh->stripe_lock);
2968         /* Don't allow new IO added to stripes in batch list */
2969         if (sh->batch_head)
2970                 goto overlap;
2971         if (forwrite) {
2972                 bip = &sh->dev[dd_idx].towrite;
2973                 if (*bip == NULL)
2974                         firstwrite = 1;
2975         } else
2976                 bip = &sh->dev[dd_idx].toread;
2977         while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
2978                 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
2979                         goto overlap;
2980                 bip = & (*bip)->bi_next;
2981         }
2982         if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
2983                 goto overlap;
2984
2985         if (!forwrite || previous)
2986                 clear_bit(STRIPE_BATCH_READY, &sh->state);
2987
2988         BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
2989         if (*bip)
2990                 bi->bi_next = *bip;
2991         *bip = bi;
2992         raid5_inc_bi_active_stripes(bi);
2993
2994         if (forwrite) {
2995                 /* check if page is covered */
2996                 sector_t sector = sh->dev[dd_idx].sector;
2997                 for (bi=sh->dev[dd_idx].towrite;
2998                      sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
2999                              bi && bi->bi_iter.bi_sector <= sector;
3000                      bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
3001                         if (bio_end_sector(bi) >= sector)
3002                                 sector = bio_end_sector(bi);
3003                 }
3004                 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
3005                         if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
3006                                 sh->overwrite_disks++;
3007         }
3008
3009         pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
3010                 (unsigned long long)(*bip)->bi_iter.bi_sector,
3011                 (unsigned long long)sh->sector, dd_idx);
3012
3013         if (conf->mddev->bitmap && firstwrite) {
3014                 /* Cannot hold spinlock over bitmap_startwrite,
3015                  * but must ensure this isn't added to a batch until
3016                  * we have added to the bitmap and set bm_seq.
3017                  * So set STRIPE_BITMAP_PENDING to prevent
3018                  * batching.
3019                  * If multiple add_stripe_bio() calls race here they
3020                  * much all set STRIPE_BITMAP_PENDING.  So only the first one
3021                  * to complete "bitmap_startwrite" gets to set
3022                  * STRIPE_BIT_DELAY.  This is important as once a stripe
3023                  * is added to a batch, STRIPE_BIT_DELAY cannot be changed
3024                  * any more.
3025                  */
3026                 set_bit(STRIPE_BITMAP_PENDING, &sh->state);
3027                 spin_unlock_irq(&sh->stripe_lock);
3028                 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
3029                                   STRIPE_SECTORS, 0);
3030                 spin_lock_irq(&sh->stripe_lock);
3031                 clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
3032                 if (!sh->batch_head) {
3033                         sh->bm_seq = conf->seq_flush+1;
3034                         set_bit(STRIPE_BIT_DELAY, &sh->state);
3035                 }
3036         }
3037         spin_unlock_irq(&sh->stripe_lock);
3038
3039         if (stripe_can_batch(sh))
3040                 stripe_add_to_batch_list(conf, sh);
3041         return 1;
3042
3043  overlap:
3044         set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
3045         spin_unlock_irq(&sh->stripe_lock);
3046         return 0;
3047 }
3048
/* Forward declaration of the file-local end_reshape(); its body is not in this part of the file. */
static void end_reshape(struct r5conf *conf);
3050
3051 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
3052                             struct stripe_head *sh)
3053 {
3054         int sectors_per_chunk =
3055                 previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
3056         int dd_idx;
3057         int chunk_offset = sector_div(stripe, sectors_per_chunk);
3058         int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
3059
3060         raid5_compute_sector(conf,
3061                              stripe * (disks - conf->max_degraded)
3062                              *sectors_per_chunk + chunk_offset,
3063                              previous,
3064                              &dd_idx, sh);
3065 }
3066
3067 static void
3068 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
3069                                 struct stripe_head_state *s, int disks,
3070                                 struct bio_list *return_bi)
3071 {
3072         int i;
3073         BUG_ON(sh->batch_head);
3074         for (i = disks; i--; ) {
3075                 struct bio *bi;
3076                 int bitmap_end = 0;
3077
3078                 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
3079                         struct md_rdev *rdev;
3080                         rcu_read_lock();
3081                         rdev = rcu_dereference(conf->disks[i].rdev);
3082                         if (rdev && test_bit(In_sync, &rdev->flags))
3083                                 atomic_inc(&rdev->nr_pending);
3084                         else
3085                                 rdev = NULL;
3086                         rcu_read_unlock();
3087                         if (rdev) {
3088                                 if (!rdev_set_badblocks(
3089                                             rdev,
3090                                             sh->sector,
3091                                             STRIPE_SECTORS, 0))
3092                                         md_error(conf->mddev, rdev);
3093                                 rdev_dec_pending(rdev, conf->mddev);
3094                         }
3095                 }
3096                 spin_lock_irq(&sh->stripe_lock);
3097                 /* fail all writes first */
3098                 bi = sh->dev[i].towrite;
3099                 sh->dev[i].towrite = NULL;
3100                 sh->overwrite_disks = 0;
3101                 spin_unlock_irq(&sh->stripe_lock);
3102                 if (bi)
3103                         bitmap_end = 1;
3104
3105                 r5l_stripe_write_finished(sh);
3106
3107                 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3108                         wake_up(&conf->wait_for_overlap);
3109
3110                 while (bi && bi->bi_iter.bi_sector <
3111                         sh->dev[i].sector + STRIPE_SECTORS) {
3112                         struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
3113
3114                         bi->bi_error = -EIO;
3115                         if (!raid5_dec_bi_active_stripes(bi)) {
3116                                 md_write_end(conf->mddev);
3117                                 bio_list_add(return_bi, bi);
3118                         }
3119                         bi = nextbi;
3120                 }
3121                 if (bitmap_end)
3122                         bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3123                                 STRIPE_SECTORS, 0, 0);
3124                 bitmap_end = 0;
3125                 /* and fail all 'written' */
3126                 bi = sh->dev[i].written;
3127                 sh->dev[i].written = NULL;
3128                 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
3129                         WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
3130                         sh->dev[i].page = sh->dev[i].orig_page;
3131                 }
3132
3133                 if (bi) bitmap_end = 1;
3134                 while (bi && bi->bi_iter.bi_sector <
3135                        sh->dev[i].sector + STRIPE_SECTORS) {
3136                         struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
3137
3138                         bi->bi_error = -EIO;
3139                         if (!raid5_dec_bi_active_stripes(bi)) {
3140                                 md_write_end(conf->mddev);
3141                                 bio_list_add(return_bi, bi);
3142                         }
3143                         bi = bi2;
3144                 }
3145
3146                 /* fail any reads if this device is non-operational and
3147                  * the data has not reached the cache yet.
3148                  */
3149                 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
3150                     s->failed > conf->max_degraded &&
3151                     (!test_bit(R5_Insync, &sh->dev[i].flags) ||
3152                       test_bit(R5_ReadError, &sh->dev[i].flags))) {
3153                         spin_lock_irq(&sh->stripe_lock);
3154                         bi = sh->dev[i].toread;
3155                         sh->dev[i].toread = NULL;
3156                         spin_unlock_irq(&sh->stripe_lock);
3157                         if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3158                                 wake_up(&conf->wait_for_overlap);
3159                         if (bi)
3160                                 s->to_read--;
3161                         while (bi && bi->bi_iter.bi_sector <
3162                                sh->dev[i].sector + STRIPE_SECTORS) {
3163                                 struct bio *nextbi =
3164                                         r5_next_bio(bi, sh->dev[i].sector);
3165
3166                                 bi->bi_error = -EIO;
3167