1 /*
2    md.c : Multiple Devices driver for Linux
3      Copyright (C) 1998, 1999, 2000 Ingo Molnar
4
5      completely rewritten, based on the MD driver code from Marc Zyngier
6
7    Changes:
8
9    - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10    - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11    - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12    - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13    - kmod support by: Cyrus Durgin
14    - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15    - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16
17    - lots of fixes and improvements to the RAID1/RAID5 and generic
18      RAID code (such as request based resynchronization):
19
20      Neil Brown <neilb@cse.unsw.edu.au>.
21
22    - persistent bitmap code
23      Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24
25    This program is free software; you can redistribute it and/or modify
26    it under the terms of the GNU General Public License as published by
27    the Free Software Foundation; either version 2, or (at your option)
28    any later version.
29
30    You should have received a copy of the GNU General Public License
31    (for example /usr/src/linux/COPYING); if not, write to the Free
32    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33 */
34
35 #include <linux/kthread.h>
36 #include <linux/blkdev.h>
37 #include <linux/sysctl.h>
38 #include <linux/seq_file.h>
39 #include <linux/fs.h>
40 #include <linux/poll.h>
41 #include <linux/ctype.h>
42 #include <linux/string.h>
43 #include <linux/hdreg.h>
44 #include <linux/proc_fs.h>
45 #include <linux/random.h>
46 #include <linux/module.h>
47 #include <linux/reboot.h>
48 #include <linux/file.h>
49 #include <linux/compat.h>
50 #include <linux/delay.h>
51 #include <linux/raid/md_p.h>
52 #include <linux/raid/md_u.h>
53 #include <linux/slab.h>
54 #include "md.h"
55 #include "bitmap.h"
56 #include "md-cluster.h"
57
58 #ifndef MODULE
59 static void autostart_arrays(int part);
60 #endif
61
62 /* pers_list is a list of registered personalities protected
63  * by pers_lock.
64  * pers_lock also serves to protect accesses to
65  * mddev->thread when the mutex cannot be held.
66  */
67 static LIST_HEAD(pers_list);
68 static DEFINE_SPINLOCK(pers_lock);
69
70 struct md_cluster_operations *md_cluster_ops;
71 EXPORT_SYMBOL(md_cluster_ops);
72 struct module *md_cluster_mod;
73 EXPORT_SYMBOL(md_cluster_mod);
74
75 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
76 static struct workqueue_struct *md_wq;
77 static struct workqueue_struct *md_misc_wq;
78
79 static int remove_and_add_spares(struct mddev *mddev,
80                                  struct md_rdev *this);
81 static void mddev_detach(struct mddev *mddev);
82
83 /*
84  * Default number of read corrections we'll attempt on an rdev
85  * before ejecting it from the array. We divide the read error
86  * count by 2 for every hour elapsed between read errors.
87  */
88 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
89 /*
90  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
91  * is 1000 KB/sec, so the extra system load does not show up that much.
92  * Increase it if you want to have more _guaranteed_ speed. Note that
93  * the RAID driver will use the maximum available bandwidth if the IO
94  * subsystem is idle. There is also an 'absolute maximum' reconstruction
95  * speed limit - in case reconstruction slows down your system despite
96  * idle IO detection.
97  *
98  * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
99  * or /sys/block/mdX/md/sync_speed_{min,max}
100  */
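/*
 * Illustrative tuning sketch (not part of the driver): assuming the
 * paths named above, an administrator could raise the guaranteed floor
 * for a hypothetical array md0 with:
 *
 *   echo 50000 > /proc/sys/dev/raid/speed_limit_min
 *   echo 50000 > /sys/block/md0/md/sync_speed_min
 *
 * Values are interpreted as KB/sec, matching the defaults below.
 */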
101
102 static int sysctl_speed_limit_min = 1000;
103 static int sysctl_speed_limit_max = 200000;
104 static inline int speed_min(struct mddev *mddev)
105 {
106         return mddev->sync_speed_min ?
107                 mddev->sync_speed_min : sysctl_speed_limit_min;
108 }
109
110 static inline int speed_max(struct mddev *mddev)
111 {
112         return mddev->sync_speed_max ?
113                 mddev->sync_speed_max : sysctl_speed_limit_max;
114 }
115
116 static struct ctl_table_header *raid_table_header;
117
118 static struct ctl_table raid_table[] = {
119         {
120                 .procname       = "speed_limit_min",
121                 .data           = &sysctl_speed_limit_min,
122                 .maxlen         = sizeof(int),
123                 .mode           = S_IRUGO|S_IWUSR,
124                 .proc_handler   = proc_dointvec,
125         },
126         {
127                 .procname       = "speed_limit_max",
128                 .data           = &sysctl_speed_limit_max,
129                 .maxlen         = sizeof(int),
130                 .mode           = S_IRUGO|S_IWUSR,
131                 .proc_handler   = proc_dointvec,
132         },
133         { }
134 };
135
136 static struct ctl_table raid_dir_table[] = {
137         {
138                 .procname       = "raid",
139                 .maxlen         = 0,
140                 .mode           = S_IRUGO|S_IXUGO,
141                 .child          = raid_table,
142         },
143         { }
144 };
145
146 static struct ctl_table raid_root_table[] = {
147         {
148                 .procname       = "dev",
149                 .maxlen         = 0,
150                 .mode           = 0555,
151                 .child          = raid_dir_table,
152         },
153         {  }
154 };
155
156 static const struct block_device_operations md_fops;
157
158 static int start_readonly;
159
160 /* bio_clone_mddev
161  * like bio_clone, but with a local bio set
162  */
163
164 struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
165                             struct mddev *mddev)
166 {
167         struct bio *b;
168
169         if (!mddev || !mddev->bio_set)
170                 return bio_alloc(gfp_mask, nr_iovecs);
171
172         b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
173         if (!b)
174                 return NULL;
175         return b;
176 }
177 EXPORT_SYMBOL_GPL(bio_alloc_mddev);
178
179 struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
180                             struct mddev *mddev)
181 {
182         if (!mddev || !mddev->bio_set)
183                 return bio_clone(bio, gfp_mask);
184
185         return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
186 }
187 EXPORT_SYMBOL_GPL(bio_clone_mddev);
188
189 /*
190  * We have a system wide 'event count' that is incremented
191  * on any 'interesting' event, and readers of /proc/mdstat
192  * can use 'poll' or 'select' to find out when the event
193  * count increases.
194  *
195  * Events are:
196  *  start array, stop array, error, add device, remove device,
197  *  start build, activate spare
198  */
199 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
200 static atomic_t md_event_count;
201 void md_new_event(struct mddev *mddev)
202 {
203         atomic_inc(&md_event_count);
204         wake_up(&md_event_waiters);
205 }
206 EXPORT_SYMBOL_GPL(md_new_event);
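/*
 * Illustrative userspace sketch (an assumption about typical monitoring
 * code, not part of the driver): a reader can block on /proc/mdstat with
 * select()'s exception set, then re-read the file once the event count
 * changes:
 *
 *   int fd = open("/proc/mdstat", O_RDONLY);
 *   fd_set ex;
 *   FD_ZERO(&ex);
 *   FD_SET(fd, &ex);
 *   select(fd + 1, NULL, NULL, &ex, NULL);   // returns after md_new_event()
 *   lseek(fd, 0, SEEK_SET);                  // then read() the new contents
 */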
207
208 /* Alternate version that can be called from interrupts
209  * when calling sysfs_notify isn't needed.
210  */
211 static void md_new_event_inintr(struct mddev *mddev)
212 {
213         atomic_inc(&md_event_count);
214         wake_up(&md_event_waiters);
215 }
216
217 /*
218  * Enables iteration over all existing md arrays.
219  * all_mddevs_lock protects this list.
220  */
221 static LIST_HEAD(all_mddevs);
222 static DEFINE_SPINLOCK(all_mddevs_lock);
223
224 /*
225  * iterates through all used mddevs in the system.
226  * We take care to grab the all_mddevs_lock whenever navigating
227  * the list, and to always hold a refcount when unlocked.
228  * Any code which breaks out of this loop while owning
229  * a reference to the current mddev must mddev_put it.
230  */
231 #define for_each_mddev(_mddev,_tmp)                                     \
232                                                                         \
233         for (({ spin_lock(&all_mddevs_lock);                            \
234                 _tmp = all_mddevs.next;                                 \
235                 _mddev = NULL;});                                       \
236              ({ if (_tmp != &all_mddevs)                                \
237                         mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
238                 spin_unlock(&all_mddevs_lock);                          \
239                 if (_mddev) mddev_put(_mddev);                          \
240                 _mddev = list_entry(_tmp, struct mddev, all_mddevs);    \
241                 _tmp != &all_mddevs;});                                 \
242              ({ spin_lock(&all_mddevs_lock);                            \
243                 _tmp = _tmp->next;})                                    \
244                 )
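/*
 * Usage sketch (illustrative): the two arguments are an mddev pointer and
 * a scratch list pointer, e.g.
 *
 *   struct mddev *mddev;
 *   struct list_head *tmp;
 *
 *   for_each_mddev(mddev, tmp)
 *           md_wakeup_thread(mddev->thread);
 *
 * A caller that breaks out early still holds a reference and must drop it
 * with mddev_put(), as the comment above notes.
 */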
245
246 /* Rather than calling directly into the personality make_request function,
247  * IO requests come here first so that we can check if the device is
248  * being suspended pending a reconfiguration.
249  * We hold a refcount over the call to ->make_request.  By the time that
250  * call has finished, the bio has been linked into some internal structure
251  * and so is visible to ->quiesce(), so we don't need the refcount any more.
252  */
253 static void md_make_request(struct request_queue *q, struct bio *bio)
254 {
255         const int rw = bio_data_dir(bio);
256         struct mddev *mddev = q->queuedata;
257         unsigned int sectors;
258         int cpu;
259
260         blk_queue_split(q, &bio, q->bio_split);
261
262         if (mddev == NULL || mddev->pers == NULL
263             || !mddev->ready) {
264                 bio_io_error(bio);
265                 return;
266         }
267         if (mddev->ro == 1 && unlikely(rw == WRITE)) {
268                 if (bio_sectors(bio) != 0)
269                         bio->bi_error = -EROFS;
270                 bio_endio(bio);
271                 return;
272         }
273         smp_rmb(); /* Ensure implications of  'active' are visible */
274         rcu_read_lock();
275         if (mddev->suspended) {
276                 DEFINE_WAIT(__wait);
277                 for (;;) {
278                         prepare_to_wait(&mddev->sb_wait, &__wait,
279                                         TASK_UNINTERRUPTIBLE);
280                         if (!mddev->suspended)
281                                 break;
282                         rcu_read_unlock();
283                         schedule();
284                         rcu_read_lock();
285                 }
286                 finish_wait(&mddev->sb_wait, &__wait);
287         }
288         atomic_inc(&mddev->active_io);
289         rcu_read_unlock();
290
291         /*
292          * save the sectors now since our bio can
293          * go away inside make_request
294          */
295         sectors = bio_sectors(bio);
296         mddev->pers->make_request(mddev, bio);
297
298         cpu = part_stat_lock();
299         part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
300         part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
301         part_stat_unlock();
302
303         if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
304                 wake_up(&mddev->sb_wait);
305 }
306
307 /* mddev_suspend makes sure no new requests are submitted
308  * to the device, and that any requests that have been submitted
309  * are completely handled.
310  * Once mddev_detach() is called and completes, the module will be
311  * completely unused.
312  */
313 void mddev_suspend(struct mddev *mddev)
314 {
315         BUG_ON(mddev->suspended);
316         mddev->suspended = 1;
317         synchronize_rcu();
318         wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
319         mddev->pers->quiesce(mddev, 1);
320
321         del_timer_sync(&mddev->safemode_timer);
322 }
323 EXPORT_SYMBOL_GPL(mddev_suspend);
324
325 void mddev_resume(struct mddev *mddev)
326 {
327         mddev->suspended = 0;
328         wake_up(&mddev->sb_wait);
329         mddev->pers->quiesce(mddev, 0);
330
331         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
332         md_wakeup_thread(mddev->thread);
333         md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
334 }
335 EXPORT_SYMBOL_GPL(mddev_resume);
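/*
 * Illustrative pairing (an assumption about the typical caller pattern,
 * not a new interface): personalities quiesce the array around a
 * reconfiguration roughly as
 *
 *   mddev_suspend(mddev);
 *   ... apply the new configuration ...
 *   mddev_resume(mddev);
 */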
336
337 int mddev_congested(struct mddev *mddev, int bits)
338 {
339         struct md_personality *pers = mddev->pers;
340         int ret = 0;
341
342         rcu_read_lock();
343         if (mddev->suspended)
344                 ret = 1;
345         else if (pers && pers->congested)
346                 ret = pers->congested(mddev, bits);
347         rcu_read_unlock();
348         return ret;
349 }
350 EXPORT_SYMBOL_GPL(mddev_congested);
351 static int md_congested(void *data, int bits)
352 {
353         struct mddev *mddev = data;
354         return mddev_congested(mddev, bits);
355 }
356
357 /*
358  * Generic flush handling for md
359  */
360
361 static void md_end_flush(struct bio *bio)
362 {
363         struct md_rdev *rdev = bio->bi_private;
364         struct mddev *mddev = rdev->mddev;
365
366         rdev_dec_pending(rdev, mddev);
367
368         if (atomic_dec_and_test(&mddev->flush_pending)) {
369                 /* The pre-request flush has finished */
370                 queue_work(md_wq, &mddev->flush_work);
371         }
372         bio_put(bio);
373 }
374
375 static void md_submit_flush_data(struct work_struct *ws);
376
377 static void submit_flushes(struct work_struct *ws)
378 {
379         struct mddev *mddev = container_of(ws, struct mddev, flush_work);
380         struct md_rdev *rdev;
381
382         INIT_WORK(&mddev->flush_work, md_submit_flush_data);
383         atomic_set(&mddev->flush_pending, 1);
384         rcu_read_lock();
385         rdev_for_each_rcu(rdev, mddev)
386                 if (rdev->raid_disk >= 0 &&
387                     !test_bit(Faulty, &rdev->flags)) {
388                         /* Take two references, one is dropped
389                          * when request finishes, one after
390                          * we reclaim rcu_read_lock
391                          */
392                         struct bio *bi;
393                         atomic_inc(&rdev->nr_pending);
394                         atomic_inc(&rdev->nr_pending);
395                         rcu_read_unlock();
396                         bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
397                         bi->bi_end_io = md_end_flush;
398                         bi->bi_private = rdev;
399                         bi->bi_bdev = rdev->bdev;
400                         atomic_inc(&mddev->flush_pending);
401                         submit_bio(WRITE_FLUSH, bi);
402                         rcu_read_lock();
403                         rdev_dec_pending(rdev, mddev);
404                 }
405         rcu_read_unlock();
406         if (atomic_dec_and_test(&mddev->flush_pending))
407                 queue_work(md_wq, &mddev->flush_work);
408 }
409
410 static void md_submit_flush_data(struct work_struct *ws)
411 {
412         struct mddev *mddev = container_of(ws, struct mddev, flush_work);
413         struct bio *bio = mddev->flush_bio;
414
415         if (bio->bi_iter.bi_size == 0)
416                 /* an empty barrier - all done */
417                 bio_endio(bio);
418         else {
419                 bio->bi_rw &= ~REQ_FLUSH;
420                 mddev->pers->make_request(mddev, bio);
421         }
422
423         mddev->flush_bio = NULL;
424         wake_up(&mddev->sb_wait);
425 }
426
427 void md_flush_request(struct mddev *mddev, struct bio *bio)
428 {
429         spin_lock_irq(&mddev->lock);
430         wait_event_lock_irq(mddev->sb_wait,
431                             !mddev->flush_bio,
432                             mddev->lock);
433         mddev->flush_bio = bio;
434         spin_unlock_irq(&mddev->lock);
435
436         INIT_WORK(&mddev->flush_work, submit_flushes);
437         queue_work(md_wq, &mddev->flush_work);
438 }
439 EXPORT_SYMBOL(md_flush_request);
440
441 void md_unplug(struct blk_plug_cb *cb, bool from_schedule)
442 {
443         struct mddev *mddev = cb->data;
444         md_wakeup_thread(mddev->thread);
445         kfree(cb);
446 }
447 EXPORT_SYMBOL(md_unplug);
448
449 static inline struct mddev *mddev_get(struct mddev *mddev)
450 {
451         atomic_inc(&mddev->active);
452         return mddev;
453 }
454
455 static void mddev_delayed_delete(struct work_struct *ws);
456
457 static void mddev_put(struct mddev *mddev)
458 {
459         struct bio_set *bs = NULL;
460
461         if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
462                 return;
463         if (!mddev->raid_disks && list_empty(&mddev->disks) &&
464             mddev->ctime == 0 && !mddev->hold_active) {
465                 /* Array is not configured at all, and not held active,
466                  * so destroy it */
467                 list_del_init(&mddev->all_mddevs);
468                 bs = mddev->bio_set;
469                 mddev->bio_set = NULL;
470                 if (mddev->gendisk) {
471                         /* We did a probe so we need to clean up.  Call
472                          * queue_work inside the spinlock so that
473                          * flush_workqueue() after mddev_find will
474                          * succeed in waiting for the work to be done.
475                          */
476                         INIT_WORK(&mddev->del_work, mddev_delayed_delete);
477                         queue_work(md_misc_wq, &mddev->del_work);
478                 } else
479                         kfree(mddev);
480         }
481         spin_unlock(&all_mddevs_lock);
482         if (bs)
483                 bioset_free(bs);
484 }
485
486 static void md_safemode_timeout(unsigned long data);
487
488 void mddev_init(struct mddev *mddev)
489 {
490         mutex_init(&mddev->open_mutex);
491         mutex_init(&mddev->reconfig_mutex);
492         mutex_init(&mddev->bitmap_info.mutex);
493         INIT_LIST_HEAD(&mddev->disks);
494         INIT_LIST_HEAD(&mddev->all_mddevs);
495         setup_timer(&mddev->safemode_timer, md_safemode_timeout,
496                     (unsigned long) mddev);
497         atomic_set(&mddev->active, 1);
498         atomic_set(&mddev->openers, 0);
499         atomic_set(&mddev->active_io, 0);
500         spin_lock_init(&mddev->lock);
501         atomic_set(&mddev->flush_pending, 0);
502         init_waitqueue_head(&mddev->sb_wait);
503         init_waitqueue_head(&mddev->recovery_wait);
504         mddev->reshape_position = MaxSector;
505         mddev->reshape_backwards = 0;
506         mddev->last_sync_action = "none";
507         mddev->resync_min = 0;
508         mddev->resync_max = MaxSector;
509         mddev->level = LEVEL_NONE;
510 }
511 EXPORT_SYMBOL_GPL(mddev_init);
512
513 static struct mddev *mddev_find(dev_t unit)
514 {
515         struct mddev *mddev, *new = NULL;
516
517         if (unit && MAJOR(unit) != MD_MAJOR)
518                 unit &= ~((1<<MdpMinorShift)-1);
519
520  retry:
521         spin_lock(&all_mddevs_lock);
522
523         if (unit) {
524                 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
525                         if (mddev->unit == unit) {
526                                 mddev_get(mddev);
527                                 spin_unlock(&all_mddevs_lock);
528                                 kfree(new);
529                                 return mddev;
530                         }
531
532                 if (new) {
533                         list_add(&new->all_mddevs, &all_mddevs);
534                         spin_unlock(&all_mddevs_lock);
535                         new->hold_active = UNTIL_IOCTL;
536                         return new;
537                 }
538         } else if (new) {
539                 /* find an unused unit number */
540                 static int next_minor = 512;
541                 int start = next_minor;
542                 int is_free = 0;
543                 int dev = 0;
544                 while (!is_free) {
545                         dev = MKDEV(MD_MAJOR, next_minor);
546                         next_minor++;
547                         if (next_minor > MINORMASK)
548                                 next_minor = 0;
549                         if (next_minor == start) {
550                                 /* Oh dear, all in use. */
551                                 spin_unlock(&all_mddevs_lock);
552                                 kfree(new);
553                                 return NULL;
554                         }
555
556                         is_free = 1;
557                         list_for_each_entry(mddev, &all_mddevs, all_mddevs)
558                                 if (mddev->unit == dev) {
559                                         is_free = 0;
560                                         break;
561                                 }
562                 }
563                 new->unit = dev;
564                 new->md_minor = MINOR(dev);
565                 new->hold_active = UNTIL_STOP;
566                 list_add(&new->all_mddevs, &all_mddevs);
567                 spin_unlock(&all_mddevs_lock);
568                 return new;
569         }
570         spin_unlock(&all_mddevs_lock);
571
572         new = kzalloc(sizeof(*new), GFP_KERNEL);
573         if (!new)
574                 return NULL;
575
576         new->unit = unit;
577         if (MAJOR(unit) == MD_MAJOR)
578                 new->md_minor = MINOR(unit);
579         else
580                 new->md_minor = MINOR(unit) >> MdpMinorShift;
581
582         mddev_init(new);
583
584         goto retry;
585 }
586
587 static struct attribute_group md_redundancy_group;
588
589 void mddev_unlock(struct mddev *mddev)
590 {
591         if (mddev->to_remove) {
592                 /* These cannot be removed under reconfig_mutex as
593                  * an access to the files will try to take reconfig_mutex
594                  * while holding the file unremovable, which leads to
595                  * a deadlock.
596                  * So set sysfs_active while the removal is happening,
597                  * and anything else which might set ->to_remove or may
598                  * otherwise change the sysfs namespace will fail with
599                  * -EBUSY if sysfs_active is still set.
600                  * We set sysfs_active under reconfig_mutex and elsewhere
601                  * test it under the same mutex to ensure its correct value
602                  * is seen.
603                  */
604                 struct attribute_group *to_remove = mddev->to_remove;
605                 mddev->to_remove = NULL;
606                 mddev->sysfs_active = 1;
607                 mutex_unlock(&mddev->reconfig_mutex);
608
609                 if (mddev->kobj.sd) {
610                         if (to_remove != &md_redundancy_group)
611                                 sysfs_remove_group(&mddev->kobj, to_remove);
612                         if (mddev->pers == NULL ||
613                             mddev->pers->sync_request == NULL) {
614                                 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
615                                 if (mddev->sysfs_action)
616                                         sysfs_put(mddev->sysfs_action);
617                                 mddev->sysfs_action = NULL;
618                         }
619                 }
620                 mddev->sysfs_active = 0;
621         } else
622                 mutex_unlock(&mddev->reconfig_mutex);
623
624         /* As we've dropped the mutex we need a spinlock to
625          * make sure the thread doesn't disappear
626          */
627         spin_lock(&pers_lock);
628         md_wakeup_thread(mddev->thread);
629         spin_unlock(&pers_lock);
630 }
631 EXPORT_SYMBOL_GPL(mddev_unlock);
632
633 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
634 {
635         struct md_rdev *rdev;
636
637         rdev_for_each_rcu(rdev, mddev)
638                 if (rdev->desc_nr == nr)
639                         return rdev;
640
641         return NULL;
642 }
643 EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
644
645 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
646 {
647         struct md_rdev *rdev;
648
649         rdev_for_each(rdev, mddev)
650                 if (rdev->bdev->bd_dev == dev)
651                         return rdev;
652
653         return NULL;
654 }
655
656 static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
657 {
658         struct md_rdev *rdev;
659
660         rdev_for_each_rcu(rdev, mddev)
661                 if (rdev->bdev->bd_dev == dev)
662                         return rdev;
663
664         return NULL;
665 }
666
667 static struct md_personality *find_pers(int level, char *clevel)
668 {
669         struct md_personality *pers;
670         list_for_each_entry(pers, &pers_list, list) {
671                 if (level != LEVEL_NONE && pers->level == level)
672                         return pers;
673                 if (strcmp(pers->name, clevel)==0)
674                         return pers;
675         }
676         return NULL;
677 }
678
679 /* return the offset of the super block in 512byte sectors */
680 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
681 {
682         sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
683         return MD_NEW_SIZE_SECTORS(num_sectors);
684 }
685
686 static int alloc_disk_sb(struct md_rdev *rdev)
687 {
688         rdev->sb_page = alloc_page(GFP_KERNEL);
689         if (!rdev->sb_page) {
690                 printk(KERN_ALERT "md: out of memory.\n");
691                 return -ENOMEM;
692         }
693
694         return 0;
695 }
696
697 void md_rdev_clear(struct md_rdev *rdev)
698 {
699         if (rdev->sb_page) {
700                 put_page(rdev->sb_page);
701                 rdev->sb_loaded = 0;
702                 rdev->sb_page = NULL;
703                 rdev->sb_start = 0;
704                 rdev->sectors = 0;
705         }
706         if (rdev->bb_page) {
707                 put_page(rdev->bb_page);
708                 rdev->bb_page = NULL;
709         }
710         kfree(rdev->badblocks.page);
711         rdev->badblocks.page = NULL;
712 }
713 EXPORT_SYMBOL_GPL(md_rdev_clear);
714
715 static void super_written(struct bio *bio)
716 {
717         struct md_rdev *rdev = bio->bi_private;
718         struct mddev *mddev = rdev->mddev;
719
720         if (bio->bi_error) {
721                 printk("md: super_written gets error=%d\n", bio->bi_error);
722                 md_error(mddev, rdev);
723         }
724
725         if (atomic_dec_and_test(&mddev->pending_writes))
726                 wake_up(&mddev->sb_wait);
727         bio_put(bio);
728 }
729
730 void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
731                    sector_t sector, int size, struct page *page)
732 {
733         /* write first size bytes of page to sector of rdev
734          * Increment mddev->pending_writes before returning
735          * and decrement it on completion, waking up sb_wait
736          * if zero is reached.
737          * If an error occurred, call md_error
738          */
739         struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
740
741         bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
742         bio->bi_iter.bi_sector = sector;
743         bio_add_page(bio, page, size, 0);
744         bio->bi_private = rdev;
745         bio->bi_end_io = super_written;
746
747         atomic_inc(&mddev->pending_writes);
748         submit_bio(WRITE_FLUSH_FUA, bio);
749 }
750
751 void md_super_wait(struct mddev *mddev)
752 {
753         /* wait for all superblock writes that were scheduled to complete */
754         wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
755 }
756
757 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
758                  struct page *page, int rw, bool metadata_op)
759 {
760         struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
761         int ret;
762
763         bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
764                 rdev->meta_bdev : rdev->bdev;
765         if (metadata_op)
766                 bio->bi_iter.bi_sector = sector + rdev->sb_start;
767         else if (rdev->mddev->reshape_position != MaxSector &&
768                  (rdev->mddev->reshape_backwards ==
769                   (sector >= rdev->mddev->reshape_position)))
770                 bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
771         else
772                 bio->bi_iter.bi_sector = sector + rdev->data_offset;
773         bio_add_page(bio, page, size, 0);
774         submit_bio_wait(rw, bio);
775
776         ret = !bio->bi_error;
777         bio_put(bio);
778         return ret;
779 }
780 EXPORT_SYMBOL_GPL(sync_page_io);
781
782 static int read_disk_sb(struct md_rdev *rdev, int size)
783 {
784         char b[BDEVNAME_SIZE];
785
786         if (rdev->sb_loaded)
787                 return 0;
788
789         if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
790                 goto fail;
791         rdev->sb_loaded = 1;
792         return 0;
793
794 fail:
795         printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
796                 bdevname(rdev->bdev,b));
797         return -EINVAL;
798 }
799
800 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
801 {
802         return  sb1->set_uuid0 == sb2->set_uuid0 &&
803                 sb1->set_uuid1 == sb2->set_uuid1 &&
804                 sb1->set_uuid2 == sb2->set_uuid2 &&
805                 sb1->set_uuid3 == sb2->set_uuid3;
806 }
807
808 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
809 {
810         int ret;
811         mdp_super_t *tmp1, *tmp2;
812
813         tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
814         tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
815
816         if (!tmp1 || !tmp2) {
817                 ret = 0;
818                 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
819                 goto abort;
820         }
821
822         *tmp1 = *sb1;
823         *tmp2 = *sb2;
824
825         /*
826          * nr_disks is not constant
827          */
828         tmp1->nr_disks = 0;
829         tmp2->nr_disks = 0;
830
831         ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
832 abort:
833         kfree(tmp1);
834         kfree(tmp2);
835         return ret;
836 }
837
838 static u32 md_csum_fold(u32 csum)
839 {
840         csum = (csum & 0xffff) + (csum >> 16);
841         return (csum & 0xffff) + (csum >> 16);
842 }
843
844 static unsigned int calc_sb_csum(mdp_super_t *sb)
845 {
846         u64 newcsum = 0;
847         u32 *sb32 = (u32*)sb;
848         int i;
849         unsigned int disk_csum, csum;
850
851         disk_csum = sb->sb_csum;
852         sb->sb_csum = 0;
853
854         for (i = 0; i < MD_SB_BYTES/4 ; i++)
855                 newcsum += sb32[i];
856         csum = (newcsum & 0xffffffff) + (newcsum>>32);
857
858 #ifdef CONFIG_ALPHA
859         /* This used to use csum_partial, which was wrong for several
860          * reasons including that different results are returned on
861          * different architectures.  It isn't critical that we get exactly
862          * the same return value as before (we always csum_fold before
863          * testing, and that removes any differences).  However as we
864          * know that csum_partial always returned a 16bit value on
865          * alphas, do a fold to maximise conformity to previous behaviour.
866          */
867         sb->sb_csum = md_csum_fold(disk_csum);
868 #else
869         sb->sb_csum = disk_csum;
870 #endif
871         return csum;
872 }
873
874 /*
875  * Handle superblock details.
876  * We want to be able to handle multiple superblock formats
877  * so we have a common interface to them all, and an array of
878  * different handlers.
879  * We rely on user-space to write the initial superblock, and support
880  * reading and updating of superblocks.
881  * Interface methods are:
882  *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
883  *      loads and validates a superblock on dev.
884  *      if refdev != NULL, compare superblocks on both devices
885  *    Return:
886  *      0 - dev has a superblock that is compatible with refdev
887  *      1 - dev has a superblock that is compatible and newer than refdev
888  *          so dev should be used as the refdev in future
889  *     -EINVAL superblock incompatible or invalid
890  *     -othererror e.g. -EIO
891  *
892  *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
893  *      Verify that dev is acceptable into mddev.
894  *       The first time, mddev->raid_disks will be 0, and data from
895  *       dev should be merged in.  Subsequent calls check that dev
896  *       is new enough.  Return 0 or -EINVAL
897  *
898  *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
899  *     Update the superblock for rdev with data in mddev
900  *     This does not write to disc.
901  *
902  */
903
904 struct super_type  {
905         char                *name;
906         struct module       *owner;
907         int                 (*load_super)(struct md_rdev *rdev,
908                                           struct md_rdev *refdev,
909                                           int minor_version);
910         int                 (*validate_super)(struct mddev *mddev,
911                                               struct md_rdev *rdev);
912         void                (*sync_super)(struct mddev *mddev,
913                                           struct md_rdev *rdev);
914         unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
915                                                 sector_t num_sectors);
916         int                 (*allow_new_offset)(struct md_rdev *rdev,
917                                                 unsigned long long new_offset);
918 };
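/*
 * Hedged sketch: the handlers are collected later in this file into an
 * array indexed by major_version, roughly of the form
 *
 *   static struct super_type super_types[] = {
 *           [0] = { .name = "0.90.0", .owner = THIS_MODULE,
 *                   .load_super = super_90_load,
 *                   .validate_super = super_90_validate,
 *                   .sync_super = super_90_sync, ... },
 *           [1] = { .name = "md-1", ... },
 *   };
 */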
919
920 /*
921  * Check that the given mddev has no bitmap.
922  *
923  * This function is called from the run method of all personalities that do not
924  * support bitmaps. It prints an error message and returns non-zero if mddev
925  * has a bitmap. Otherwise, it returns 0.
926  *
927  */
928 int md_check_no_bitmap(struct mddev *mddev)
929 {
930         if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
931                 return 0;
932         printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
933                 mdname(mddev), mddev->pers->name);
934         return 1;
935 }
936 EXPORT_SYMBOL(md_check_no_bitmap);
937
938 /*
939  * load_super for 0.90.0
940  */
941 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
942 {
943         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
944         mdp_super_t *sb;
945         int ret;
946
947         /*
948          * Calculate the position of the superblock (512byte sectors),
949          * it's at the end of the disk.
950          *
951          * It also happens to be a multiple of 4Kb.
952          */
953         rdev->sb_start = calc_dev_sboffset(rdev);
954
955         ret = read_disk_sb(rdev, MD_SB_BYTES);
956         if (ret) return ret;
957
958         ret = -EINVAL;
959
960         bdevname(rdev->bdev, b);
961         sb = page_address(rdev->sb_page);
962
963         if (sb->md_magic != MD_SB_MAGIC) {
964                 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
965                        b);
966                 goto abort;
967         }
968
969         if (sb->major_version != 0 ||
970             sb->minor_version < 90 ||
971             sb->minor_version > 91) {
972                 printk(KERN_WARNING "Bad version number %d.%d on %s\n",
973                         sb->major_version, sb->minor_version,
974                         b);
975                 goto abort;
976         }
977
978         if (sb->raid_disks <= 0)
979                 goto abort;
980
981         if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
982                 printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
983                         b);
984                 goto abort;
985         }
986
987         rdev->preferred_minor = sb->md_minor;
988         rdev->data_offset = 0;
989         rdev->new_data_offset = 0;
990         rdev->sb_size = MD_SB_BYTES;
991         rdev->badblocks.shift = -1;
992
993         if (sb->level == LEVEL_MULTIPATH)
994                 rdev->desc_nr = -1;
995         else
996                 rdev->desc_nr = sb->this_disk.number;
997
998         if (!refdev) {
999                 ret = 1;
1000         } else {
1001                 __u64 ev1, ev2;
1002                 mdp_super_t *refsb = page_address(refdev->sb_page);
1003                 if (!uuid_equal(refsb, sb)) {
1004                         printk(KERN_WARNING "md: %s has different UUID to %s\n",
1005                                 b, bdevname(refdev->bdev,b2));
1006                         goto abort;
1007                 }
1008                 if (!sb_equal(refsb, sb)) {
1009                         printk(KERN_WARNING "md: %s has same UUID"
1010                                " but different superblock to %s\n",
1011                                b, bdevname(refdev->bdev, b2));
1012                         goto abort;
1013                 }
1014                 ev1 = md_event(sb);
1015                 ev2 = md_event(refsb);
1016                 if (ev1 > ev2)
1017                         ret = 1;
1018                 else
1019                         ret = 0;
1020         }
1021         rdev->sectors = rdev->sb_start;
1022         /* Limit to 4TB as metadata cannot record more than that.
1023          * (not needed for Linear and RAID0 as metadata doesn't
1024          * record this size)
1025          */
1026         if (rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1027                 rdev->sectors = (2ULL << 32) - 2;
1028
1029         if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1030                 /* "this cannot possibly happen" ... */
1031                 ret = -EINVAL;
1032
1033  abort:
1034         return ret;
1035 }
1036
1037 /*
1038  * validate_super for 0.90.0
1039  */
1040 static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1041 {
1042         mdp_disk_t *desc;
1043         mdp_super_t *sb = page_address(rdev->sb_page);
1044         __u64 ev1 = md_event(sb);
1045
1046         rdev->raid_disk = -1;
1047         clear_bit(Faulty, &rdev->flags);
1048         clear_bit(In_sync, &rdev->flags);
1049         clear_bit(Bitmap_sync, &rdev->flags);
1050         clear_bit(WriteMostly, &rdev->flags);
1051
1052         if (mddev->raid_disks == 0) {
1053                 mddev->major_version = 0;
1054                 mddev->minor_version = sb->minor_version;
1055                 mddev->patch_version = sb->patch_version;
1056                 mddev->external = 0;
1057                 mddev->chunk_sectors = sb->chunk_size >> 9;
1058                 mddev->ctime = sb->ctime;
1059                 mddev->utime = sb->utime;
1060                 mddev->level = sb->level;
1061                 mddev->clevel[0] = 0;
1062                 mddev->layout = sb->layout;
1063                 mddev->raid_disks = sb->raid_disks;
1064                 mddev->dev_sectors = ((sector_t)sb->size) * 2;
1065                 mddev->events = ev1;
1066                 mddev->bitmap_info.offset = 0;
1067                 mddev->bitmap_info.space = 0;
1068                 /* bitmap can use 60 K after the 4K superblocks */
1069                 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1070                 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1071                 mddev->reshape_backwards = 0;
1072
1073                 if (mddev->minor_version >= 91) {
1074                         mddev->reshape_position = sb->reshape_position;
1075                         mddev->delta_disks = sb->delta_disks;
1076                         mddev->new_level = sb->new_level;
1077                         mddev->new_layout = sb->new_layout;
1078                         mddev->new_chunk_sectors = sb->new_chunk >> 9;
1079                         if (mddev->delta_disks < 0)
1080                                 mddev->reshape_backwards = 1;
1081                 } else {
1082                         mddev->reshape_position = MaxSector;
1083                         mddev->delta_disks = 0;
1084                         mddev->new_level = mddev->level;
1085                         mddev->new_layout = mddev->layout;
1086                         mddev->new_chunk_sectors = mddev->chunk_sectors;
1087                 }
1088
1089                 if (sb->state & (1<<MD_SB_CLEAN))
1090                         mddev->recovery_cp = MaxSector;
1091                 else {
1092                         if (sb->events_hi == sb->cp_events_hi &&
1093                                 sb->events_lo == sb->cp_events_lo) {
1094                                 mddev->recovery_cp = sb->recovery_cp;
1095                         } else
1096                                 mddev->recovery_cp = 0;
1097                 }
1098
1099                 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1100                 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1101                 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1102                 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1103
1104                 mddev->max_disks = MD_SB_DISKS;
1105
1106                 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1107                     mddev->bitmap_info.file == NULL) {
1108                         mddev->bitmap_info.offset =
1109                                 mddev->bitmap_info.default_offset;
1110                         mddev->bitmap_info.space =
1111                                 mddev->bitmap_info.default_space;
1112                 }
1113
1114         } else if (mddev->pers == NULL) {
1115                 /* Insist on good event counter while assembling, except
1116                  * for spares (which don't need an event count) */
1117                 ++ev1;
1118                 if (sb->disks[rdev->desc_nr].state & (
1119                             (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1120                         if (ev1 < mddev->events)
1121                                 return -EINVAL;
1122         } else if (mddev->bitmap) {
1123                 /* if adding to array with a bitmap, then we can accept an
1124                  * older device ... but not too old.
1125                  */
1126                 if (ev1 < mddev->bitmap->events_cleared)
1127                         return 0;
1128                 if (ev1 < mddev->events)
1129                         set_bit(Bitmap_sync, &rdev->flags);
1130         } else {
1131                 if (ev1 < mddev->events)
1132                         /* just a hot-add of a new device, leave raid_disk at -1 */
1133                         return 0;
1134         }
1135
1136         if (mddev->level != LEVEL_MULTIPATH) {
1137                 desc = sb->disks + rdev->desc_nr;
1138
1139                 if (desc->state & (1<<MD_DISK_FAULTY))
1140                         set_bit(Faulty, &rdev->flags);
1141                 else if (desc->state & (1<<MD_DISK_SYNC) /* &&
1142                             desc->raid_disk < mddev->raid_disks */) {
1143                         set_bit(In_sync, &rdev->flags);
1144                         rdev->raid_disk = desc->raid_disk;
1145                         rdev->saved_raid_disk = desc->raid_disk;
1146                 } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1147                         /* active but not in sync implies recovery up to
1148                          * reshape position.  We don't know exactly where
1149                          * that is, so set to zero for now */
1150                         if (mddev->minor_version >= 91) {
1151                                 rdev->recovery_offset = 0;
1152                                 rdev->raid_disk = desc->raid_disk;
1153                         }
1154                 }
1155                 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1156                         set_bit(WriteMostly, &rdev->flags);
1157         } else /* MULTIPATH are always insync */
1158                 set_bit(In_sync, &rdev->flags);
1159         return 0;
1160 }
1161
1162 /*
1163  * sync_super for 0.90.0
1164  */
1165 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1166 {
1167         mdp_super_t *sb;
1168         struct md_rdev *rdev2;
1169         int next_spare = mddev->raid_disks;
1170
1171         /* make rdev->sb match mddev data..
1172          *
1173          * 1/ zero out disks
1174          * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1175          * 3/ any empty disks < next_spare become removed
1176          *
1177          * disks[0] gets initialised to REMOVED because
1178          * we cannot be sure from other fields if it has
1179          * been initialised or not.
1180          */
1181         int i;
1182         int active=0, working=0,failed=0,spare=0,nr_disks=0;
1183
1184         rdev->sb_size = MD_SB_BYTES;
1185
1186         sb = page_address(rdev->sb_page);
1187
1188         memset(sb, 0, sizeof(*sb));
1189
1190         sb->md_magic = MD_SB_MAGIC;
1191         sb->major_version = mddev->major_version;
1192         sb->patch_version = mddev->patch_version;
1193         sb->gvalid_words  = 0; /* ignored */
1194         memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1195         memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1196         memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1197         memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1198
1199         sb->ctime = mddev->ctime;
1200         sb->level = mddev->level;
1201         sb->size = mddev->dev_sectors / 2;
1202         sb->raid_disks = mddev->raid_disks;
1203         sb->md_minor = mddev->md_minor;
1204         sb->not_persistent = 0;
1205         sb->utime = mddev->utime;
1206         sb->state = 0;
1207         sb->events_hi = (mddev->events>>32);
1208         sb->events_lo = (u32)mddev->events;
1209
1210         if (mddev->reshape_position == MaxSector)
1211                 sb->minor_version = 90;
1212         else {
1213                 sb->minor_version = 91;
1214                 sb->reshape_position = mddev->reshape_position;
1215                 sb->new_level = mddev->new_level;
1216                 sb->delta_disks = mddev->delta_disks;
1217                 sb->new_layout = mddev->new_layout;
1218                 sb->new_chunk = mddev->new_chunk_sectors << 9;
1219         }
1220         mddev->minor_version = sb->minor_version;
1221         if (mddev->in_sync)
1222         {
1223                 sb->recovery_cp = mddev->recovery_cp;
1224                 sb->cp_events_hi = (mddev->events>>32);
1225                 sb->cp_events_lo = (u32)mddev->events;
1226                 if (mddev->recovery_cp == MaxSector)
1227                         sb->state = (1<< MD_SB_CLEAN);
1228         } else
1229                 sb->recovery_cp = 0;
1230
1231         sb->layout = mddev->layout;
1232         sb->chunk_size = mddev->chunk_sectors << 9;
1233
1234         if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1235                 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1236
1237         sb->disks[0].state = (1<<MD_DISK_REMOVED);
1238         rdev_for_each(rdev2, mddev) {
1239                 mdp_disk_t *d;
1240                 int desc_nr;
1241                 int is_active = test_bit(In_sync, &rdev2->flags);
1242
1243                 if (rdev2->raid_disk >= 0 &&
1244                     sb->minor_version >= 91)
1245                         /* we have nowhere to store the recovery_offset,
1246                          * but if it is not below the reshape_position,
1247                          * we can piggy-back on that.
1248                          */
1249                         is_active = 1;
1250                 if (rdev2->raid_disk < 0 ||
1251                     test_bit(Faulty, &rdev2->flags))
1252                         is_active = 0;
1253                 if (is_active)
1254                         desc_nr = rdev2->raid_disk;
1255                 else
1256                         desc_nr = next_spare++;
1257                 rdev2->desc_nr = desc_nr;
1258                 d = &sb->disks[rdev2->desc_nr];
1259                 nr_disks++;
1260                 d->number = rdev2->desc_nr;
1261                 d->major = MAJOR(rdev2->bdev->bd_dev);
1262                 d->minor = MINOR(rdev2->bdev->bd_dev);
1263                 if (is_active)
1264                         d->raid_disk = rdev2->raid_disk;
1265                 else
1266                         d->raid_disk = rdev2->desc_nr; /* compatibility */
1267                 if (test_bit(Faulty, &rdev2->flags))
1268                         d->state = (1<<MD_DISK_FAULTY);
1269                 else if (is_active) {
1270                         d->state = (1<<MD_DISK_ACTIVE);
1271                         if (test_bit(In_sync, &rdev2->flags))
1272                                 d->state |= (1<<MD_DISK_SYNC);
1273                         active++;
1274                         working++;
1275                 } else {
1276                         d->state = 0;
1277                         spare++;
1278                         working++;
1279                 }
1280                 if (test_bit(WriteMostly, &rdev2->flags))
1281                         d->state |= (1<<MD_DISK_WRITEMOSTLY);
1282         }
1283         /* now set the "removed" and "faulty" bits on any missing devices */
1284         for (i=0 ; i < mddev->raid_disks ; i++) {
1285                 mdp_disk_t *d = &sb->disks[i];
1286                 if (d->state == 0 && d->number == 0) {
1287                         d->number = i;
1288                         d->raid_disk = i;
1289                         d->state = (1<<MD_DISK_REMOVED);
1290                         d->state |= (1<<MD_DISK_FAULTY);
1291                         failed++;
1292                 }
1293         }
1294         sb->nr_disks = nr_disks;
1295         sb->active_disks = active;
1296         sb->working_disks = working;
1297         sb->failed_disks = failed;
1298         sb->spare_disks = spare;
1299
1300         sb->this_disk = sb->disks[rdev->desc_nr];
1301         sb->sb_csum = calc_sb_csum(sb);
1302 }
1303
1304 /*
1305  * rdev_size_change for 0.90.0
1306  */
1307 static unsigned long long
1308 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1309 {
1310         if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1311                 return 0; /* component must fit device */
1312         if (rdev->mddev->bitmap_info.offset)
1313                 return 0; /* can't move bitmap */
1314         rdev->sb_start = calc_dev_sboffset(rdev);
1315         if (!num_sectors || num_sectors > rdev->sb_start)
1316                 num_sectors = rdev->sb_start;
1317         /* Limit to 4TB as metadata cannot record more than that.
1318          * 4TB == 2^32 KB, or 2*2^32 sectors.
1319          */
1320         if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1321                 num_sectors = (2ULL << 32) - 2;
1322         md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1323                        rdev->sb_page);
1324         md_super_wait(rdev->mddev);
1325         return num_sectors;
1326 }
1327
1328 static int
1329 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1330 {
1331         /* non-zero offset changes not possible with v0.90 */
1332         return new_offset == 0;
1333 }
1334
1335 /*
1336  * version 1 superblock
1337  */
1338
1339 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
1340 {
1341         __le32 disk_csum;
1342         u32 csum;
1343         unsigned long long newcsum;
1344         int size = 256 + le32_to_cpu(sb->max_dev)*2;
1345         __le32 *isuper = (__le32*)sb;
1346
1347         disk_csum = sb->sb_csum;
1348         sb->sb_csum = 0;
1349         newcsum = 0;
1350         for (; size >= 4; size -= 4)
1351                 newcsum += le32_to_cpu(*isuper++);
1352
1353         if (size == 2)
1354                 newcsum += le16_to_cpu(*(__le16*) isuper);
1355
1356         csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1357         sb->sb_csum = disk_csum;
1358         return cpu_to_le32(csum);
1359 }
1360
1361 static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
1362                             int acknowledged);
1363 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1364 {
1365         struct mdp_superblock_1 *sb;
1366         int ret;
1367         sector_t sb_start;
1368         sector_t sectors;
1369         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1370         int bmask;
1371
1372         /*
1373          * Calculate the position of the superblock in 512byte sectors.
1374          * It is always aligned to a 4K boundary and
1375          * depending on minor_version, it can be:
1376          * 0: At least 8K, but less than 12K, from end of device
1377          * 1: At start of device
1378          * 2: 4K from start of device.
1379          */
1380         switch(minor_version) {
1381         case 0:
1382                 sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
1383                 sb_start -= 8*2;
1384                 sb_start &= ~(sector_t)(4*2-1);
1385                 break;
1386         case 1:
1387                 sb_start = 0;
1388                 break;
1389         case 2:
1390                 sb_start = 8;
1391                 break;
1392         default:
1393                 return -EINVAL;
1394         }
1395         rdev->sb_start = sb_start;
1396
1397         /* superblock is rarely larger than 1K, but it can be larger,
1398          * and it is safe to read 4k, so we do that
1399          */
1400         ret = read_disk_sb(rdev, 4096);
1401         if (ret) return ret;
1402
1403         sb = page_address(rdev->sb_page);
1404
1405         if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1406             sb->major_version != cpu_to_le32(1) ||
1407             le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1408             le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1409             (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1410                 return -EINVAL;
1411
1412         if (calc_sb_1_csum(sb) != sb->sb_csum) {
1413                 printk("md: invalid superblock checksum on %s\n",
1414                         bdevname(rdev->bdev,b));
1415                 return -EINVAL;
1416         }
1417         if (le64_to_cpu(sb->data_size) < 10) {
1418                 printk("md: data_size too small on %s\n",
1419                        bdevname(rdev->bdev,b));
1420                 return -EINVAL;
1421         }
1422         if (sb->pad0 ||
1423             sb->pad3[0] ||
1424             memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1425                 /* Some padding is non-zero, might be a new feature */
1426                 return -EINVAL;
1427
1428         rdev->preferred_minor = 0xffff;
1429         rdev->data_offset = le64_to_cpu(sb->data_offset);
1430         rdev->new_data_offset = rdev->data_offset;
1431         if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1432             (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1433                 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1434         atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1435
1436         rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1437         bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1438         if (rdev->sb_size & bmask)
1439                 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1440
1441         if (minor_version
1442             && rdev->data_offset < sb_start + (rdev->sb_size/512))
1443                 return -EINVAL;
1444         if (minor_version
1445             && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1446                 return -EINVAL;
1447
1448         if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1449                 rdev->desc_nr = -1;
1450         else
1451                 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1452
1453         if (!rdev->bb_page) {
1454                 rdev->bb_page = alloc_page(GFP_KERNEL);
1455                 if (!rdev->bb_page)
1456                         return -ENOMEM;
1457         }
1458         if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1459             rdev->badblocks.count == 0) {
1460                 /* need to load the bad block list.
1461                  * Currently we limit it to one page.
1462                  */
1463                 s32 offset;
1464                 sector_t bb_sector;
1465                 u64 *bbp;
1466                 int i;
1467                 int sectors = le16_to_cpu(sb->bblog_size);
1468                 if (sectors > (PAGE_SIZE / 512))
1469                         return -EINVAL;
1470                 offset = le32_to_cpu(sb->bblog_offset);
1471                 if (offset == 0)
1472                         return -EINVAL;
1473                 bb_sector = (long long)offset;
1474                 if (!sync_page_io(rdev, bb_sector, sectors << 9,
1475                                   rdev->bb_page, READ, true))
1476                         return -EIO;
1477                 bbp = (u64 *)page_address(rdev->bb_page);
1478                 rdev->badblocks.shift = sb->bblog_shift;
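                     /*
                      * Each 64-bit entry packs a bad range as
                      * (start_sector << 10) | length: the low 10 bits hold
                      * the length and the remaining bits the start sector,
                      * both scaled by bblog_shift below.  An all-ones entry
                      * marks the end of the list.
                      */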
1479                 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1480                         u64 bb = le64_to_cpu(*bbp);
1481                         int count = bb & (0x3ff);
1482                         u64 sector = bb >> 10;
1483                         sector <<= sb->bblog_shift;
1484                         count <<= sb->bblog_shift;
1485                         if (bb + 1 == 0)
1486                                 break;
1487                         if (md_set_badblocks(&rdev->badblocks,
1488                                              sector, count, 1) == 0)
1489                                 return -EINVAL;
1490                 }
1491         } else if (sb->bblog_offset != 0)
1492                 rdev->badblocks.shift = 0;
1493
1494         if (!refdev) {
1495                 ret = 1;
1496         } else {
1497                 __u64 ev1, ev2;
1498                 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1499
1500                 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1501                     sb->level != refsb->level ||
1502                     sb->layout != refsb->layout ||
1503                     sb->chunksize != refsb->chunksize) {
1504                         printk(KERN_WARNING "md: %s has strangely different"
1505                                 " superblock to %s\n",
1506                                 bdevname(rdev->bdev,b),
1507                                 bdevname(refdev->bdev,b2));
1508                         return -EINVAL;
1509                 }
1510                 ev1 = le64_to_cpu(sb->events);
1511                 ev2 = le64_to_cpu(refsb->events);
1512
1513                 if (ev1 > ev2)
1514                         ret = 1;
1515                 else
1516                         ret = 0;
1517         }
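             /*
              * At this point ret records freshness: 1 means this device holds
              * the newest superblock seen so far (or there was no reference
              * device), 0 means the reference device is at least as new.
              */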
1518         if (minor_version) {
1519                 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
1520                 sectors -= rdev->data_offset;
1521         } else
1522                 sectors = rdev->sb_start;
1523         if (sectors < le64_to_cpu(sb->data_size))
1524                 return -EINVAL;
1525         rdev->sectors = le64_to_cpu(sb->data_size);
1526         return ret;
1527 }
1528
1529 static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1530 {
1531         struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1532         __u64 ev1 = le64_to_cpu(sb->events);
1533
1534         rdev->raid_disk = -1;
1535         clear_bit(Faulty, &rdev->flags);
1536         clear_bit(In_sync, &rdev->flags);
1537         clear_bit(Bitmap_sync, &rdev->flags);
1538         clear_bit(WriteMostly, &rdev->flags);
1539
1540         if (mddev->raid_disks == 0) {
1541                 mddev->major_version = 1;
1542                 mddev->patch_version = 0;
1543                 mddev->external = 0;
1544                 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1545                 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1546                 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1547                 mddev->level = le32_to_cpu(sb->level);
1548                 mddev->clevel[0] = 0;
1549                 mddev->layout = le32_to_cpu(sb->layout);
1550                 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1551                 mddev->dev_sectors = le64_to_cpu(sb->size);
1552                 mddev->events = ev1;
1553                 mddev->bitmap_info.offset = 0;
1554                 mddev->bitmap_info.space = 0;
1555                 /* Default location for the bitmap is 1K after the
1556                  * superblock, using 3K - 4K in total.
1557                  */
1558                 mddev->bitmap_info.default_offset = 1024 >> 9;
1559                 mddev->bitmap_info.default_space = (4096-1024) >> 9;
1560                 mddev->reshape_backwards = 0;
1561
1562                 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1563                 memcpy(mddev->uuid, sb->set_uuid, 16);
1564
1565                 mddev->max_disks =  (4096-256)/2;
1566
1567                 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1568                     mddev->bitmap_info.file == NULL) {
1569                         mddev->bitmap_info.offset =
1570                                 (__s32)le32_to_cpu(sb->bitmap_offset);
1571                         /* Metadata doesn't record how much space is available.
1572                          * For 1.0, we assume the bitmap can use up to the
1573                          * superblock if it sits before it, else up to 4K beyond
1574                          * the superblock.  For others, assume no change is possible.
1575                          */
1576                         if (mddev->minor_version > 0)
1577                                 mddev->bitmap_info.space = 0;
1578                         else if (mddev->bitmap_info.offset > 0)
1579                                 mddev->bitmap_info.space =
1580                                         8 - mddev->bitmap_info.offset;
1581                         else
1582                                 mddev->bitmap_info.space =
1583                                         -mddev->bitmap_info.offset;
1584                 }
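                     /*
                      * E.g. with 1.0 metadata (superblock at the end of the
                      * device), a bitmap_offset of -16 leaves 16 sectors (8K)
                      * of space before the superblock, while an offset of 2
                      * leaves 8 - 2 = 6 sectors (3K), i.e. up to 4K past the
                      * start of the superblock.
                      */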
1585
1586                 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1587                         mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1588                         mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1589                         mddev->new_level = le32_to_cpu(sb->new_level);
1590                         mddev->new_layout = le32_to_cpu(sb->new_layout);
1591                         mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1592                         if (mddev->delta_disks < 0 ||
1593                             (mddev->delta_disks == 0 &&
1594                              (le32_to_cpu(sb->feature_map)
1595                               & MD_FEATURE_RESHAPE_BACKWARDS)))
1596                                 mddev->reshape_backwards = 1;
1597                 } else {
1598                         mddev->reshape_position = MaxSector;
1599                         mddev->delta_disks = 0;
1600                         mddev->new_level = mddev->level;
1601                         mddev->new_layout = mddev->layout;
1602                         mddev->new_chunk_sectors = mddev->chunk_sectors;
1603                 }
1604
1605         } else if (mddev->pers == NULL) {
1606                 /* Insist on a good event counter while assembling, except for
1607                  * spares (which don't need an event count) */
1608                 ++ev1;
1609                 if (rdev->desc_nr >= 0 &&
1610                     rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1611                     (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1612                      le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1613                         if (ev1 < mddev->events)
1614                                 return -EINVAL;
1615         } else if (mddev->bitmap) {
1616                 /* If adding to array with a bitmap, then we can accept an
1617                  * older device, but not too old.
1618                  */
1619                 if (ev1 < mddev->bitmap->events_cleared)
1620                         return 0;
1621                 if (ev1 < mddev->events)
1622                         set_bit(Bitmap_sync, &rdev->flags);
1623         } else {
1624                 if (ev1 < mddev->events)
1625                         /* just a hot-add of a new device, leave raid_disk at -1 */
1626                         return 0;
1627         }
1628         if (mddev->level != LEVEL_MULTIPATH) {
1629                 int role;
1630                 if (rdev->desc_nr < 0 ||
1631                     rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1632                         role = MD_DISK_ROLE_SPARE;
1633                         rdev->desc_nr = -1;
1634                 } else
1635                         role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1636                 switch(role) {
1637                 case MD_DISK_ROLE_SPARE: /* spare */
1638                         break;
1639                 case MD_DISK_ROLE_FAULTY: /* faulty */
1640                         set_bit(Faulty, &rdev->flags);
1641                         break;
1642                 case MD_DISK_ROLE_JOURNAL: /* journal device */
1643                         if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
1644                                 /* journal device without journal feature */
1645                                 printk(KERN_WARNING
1646                                   "md: journal device provided without journal feature, ignoring the device\n");
1647                                 return -EINVAL;
1648                         }
1649                         set_bit(Journal, &rdev->flags);
1650                         rdev->journal_tail = le64_to_cpu(sb->journal_tail);
1651                         if (mddev->recovery_cp == MaxSector)
1652                                 set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
1653                         rdev->raid_disk = mddev->raid_disks;
1654                         break;
1655                 default:
1656                         rdev->saved_raid_disk = role;
1657                         if ((le32_to_cpu(sb->feature_map) &
1658                              MD_FEATURE_RECOVERY_OFFSET)) {
1659                                 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1660                                 if (!(le32_to_cpu(sb->feature_map) &
1661                                       MD_FEATURE_RECOVERY_BITMAP))
1662                                         rdev->saved_raid_disk = -1;
1663                         } else
1664                                 set_bit(In_sync, &rdev->flags);
1665                         rdev->raid_disk = role;
1666                         break;
1667                 }
1668                 if (sb->devflags & WriteMostly1)
1669                         set_bit(WriteMostly, &rdev->flags);
1670                 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1671                         set_bit(Replacement, &rdev->flags);
1672                 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
1673                         set_bit(MD_HAS_JOURNAL, &mddev->flags);
1674         } else /* MULTIPATH devices are always in_sync */
1675                 set_bit(In_sync, &rdev->flags);
1676
1677         return 0;
1678 }
1679
1680 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1681 {
1682         struct mdp_superblock_1 *sb;
1683         struct md_rdev *rdev2;
1684         int max_dev, i;
1685         /* make rdev->sb match mddev and rdev data. */
1686
1687         sb = page_address(rdev->sb_page);
1688
1689         sb->feature_map = 0;
1690         sb->pad0 = 0;
1691         sb->recovery_offset = cpu_to_le64(0);
1692         memset(sb->pad3, 0, sizeof(sb->pad3));
1693
1694         sb->utime = cpu_to_le64((__u64)mddev->utime);
1695         sb->events = cpu_to_le64(mddev->events);
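             /*
              * Record how clean the array is: an in_sync array stores its
              * resync checkpoint, a journal-clean array is marked fully
              * resynced (MaxSector), and anything else must resync from the
              * start (0).
              */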
1696         if (mddev->in_sync)
1697                 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1698         else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
1699                 sb->resync_offset = cpu_to_le64(MaxSector);
1700         else
1701                 sb->resync_offset = cpu_to_le64(0);
1702
1703         sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1704
1705         sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1706         sb->size = cpu_to_le64(mddev->dev_sectors);
1707         sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1708         sb->level = cpu_to_le32(mddev->level);
1709         sb->layout = cpu_to_le32(mddev->layout);
1710
1711         if (test_bit(WriteMostly, &rdev->flags))
1712                 sb->devflags |= WriteMostly1;
1713         else
1714                 sb->devflags &= ~WriteMostly1;
1715         sb->data_offset = cpu_to_le64(rdev->data_offset);
1716         sb->data_size = cpu_to_le64(rdev->sectors);
1717
1718         if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1719                 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1720                 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1721         }
1722
1723         if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
1724             !test_bit(In_sync, &rdev->flags)) {
1725                 sb->feature_map |=
1726                         cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1727                 sb->recovery_offset =
1728                         cpu_to_le64(rdev->recovery_offset);
1729                 if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
1730                         sb->feature_map |=
1731                                 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
1732         }
1733         /* Note: recovery_offset and journal_tail share space  */
1734         if (test_bit(Journal, &rdev->flags))
1735                 sb->journal_tail = cpu_to_le64(rdev->journal_tail);
1736         if (test_bit(Replacement, &rdev->flags))
1737                 sb->feature_map |=
1738                         cpu_to_le32(MD_FEATURE_REPLACEMENT);
1739
1740         if (mddev->reshape_position != MaxSector) {
1741                 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1742                 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1743                 sb->new_layout = cpu_to_le32(mddev->new_layout);
1744                 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1745                 sb->new_level = cpu_to_le32(mddev->new_level);
1746                 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1747                 if (mddev->delta_disks == 0 &&
1748                     mddev->reshape_backwards)
1749                         sb->feature_map
1750                                 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
1751                 if (rdev->new_data_offset != rdev->data_offset) {
1752                         sb->feature_map
1753                                 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
1754                         sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
1755                                                              - rdev->data_offset));
1756                 }
1757         }
1758
1759         if (mddev_is_clustered(mddev))
1760                 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
1761
1762         if (rdev->badblocks.count == 0)
1763                 /* Nothing to do for bad blocks */;
1764         else if (sb->bblog_offset == 0)
1765                 /* Cannot record bad blocks on this device */
1766                 md_error(mddev, rdev);
1767         else {
1768                 struct badblocks *bb = &rdev->badblocks;
1769                 u64 *bbp = (u64 *)page_address(rdev->bb_page);
1770                 u64 *p = bb->page;
1771                 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
1772                 if (bb->changed) {
1773                         unsigned seq;
1774
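                             /*
                              * Copy the in-memory bad-block table into the
                              * superblock page under a seqlock read section: if
                              * the table changes while we copy, read_seqretry()
                              * notices and the copy is restarted.  Unused slots
                              * keep the 0xff fill, which is the end-of-list
                              * marker.
                              */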
1775 retry:
1776                         seq = read_seqbegin(&bb->lock);
1777
1778                         memset(bbp, 0xff, PAGE_SIZE);
1779
1780                         for (i = 0 ; i < bb->count ; i++) {
1781                                 u64 internal_bb = p[i];
1782                                 u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1783                                                 | BB_LEN(internal_bb));
1784                                 bbp[i] = cpu_to_le64(store_bb);
1785                         }
1786                         bb->changed = 0;
1787                         if (read_seqretry(&bb->lock, seq))
1788                                 goto retry;
1789
1790                         bb->sector = (rdev->sb_start +
1791                                       (int)le32_to_cpu(sb->bblog_offset));
1792                         bb->size = le16_to_cpu(sb->bblog_size);
1793                 }
1794         }
1795
1796         max_dev = 0;
1797         rdev_for_each(rdev2, mddev)
1798                 if (rdev2->desc_nr+1 > max_dev)
1799                         max_dev = rdev2->desc_nr+1;
1800
1801         if (max_dev > le32_to_cpu(sb->max_dev)) {
1802                 int bmask;
1803                 sb->max_dev = cpu_to_le32(max_dev);
1804                 rdev->sb_size = max_dev * 2 + 256;
1805                 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1806                 if (rdev->sb_size & bmask)
1807                         rdev->sb_size = (rdev->sb_size | bmask) + 1;
1808         } else
1809                 max_dev = le32_to_cpu(sb->max_dev);
1810
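             /*
              * Every slot up to max_dev defaults to "faulty"; the
              * rdev_for_each() loop below then records the real role of each
              * member device, so slots with no device stay marked faulty.
              */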
1811         for (i = 0; i < max_dev; i++)
1812                 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
1813
1814         if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
1815                 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
1816
1817         rdev_for_each(rdev2, mddev) {
1818                 i = rdev2->desc_nr;
1819                 if (test_bit(Faulty, &rdev2->flags))
1820                         sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
1821                 else if (test_bit(In_sync, &rdev2->flags))
1822                         sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1823                 else if (test_bit(Journal, &rdev2->flags))
1824                         sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
1825                 else if (rdev2->raid_disk >= 0)
1826                         sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1827                 else
1828                         sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
1829         }
1830
1831         sb->sb_csum = calc_sb_1_csum(sb);
1832 }
1833
1834 static unsigned long long
1835 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1836 {
1837         struct mdp_superblock_1 *sb;
1838         sector_t max_sectors;
1839         if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1840                 return 0; /* component must fit device */
1841         if (rdev->data_offset != rdev->new_data_offset)
1842                 return 0; /* too confusing */
1843         if (rdev->sb_start < rdev->data_offset) {
1844                 /* minor versions 1 and 2; superblock before data */
1845                 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
1846                 max_sectors -= rdev->data_offset;
1847                 if (!num_sectors || num_sectors > max_sectors)
1848                         num_sectors = max_sectors;
1849         } else if (rdev->mddev->bitmap_info.offset) {
1850                 /* minor version 0 with bitmap we can't move */
1851                 return 0;
1852         } else {
1853                 /* minor version 0; superblock after data */
1854                 sector_t sb_start;
1855                 sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
1856                 sb_start &= ~(sector_t)(4*2 - 1);
1857                 max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1858                 if (!num_sectors || num_sectors > max_sectors)
1859                         num_sectors = max_sectors;
1860                 rdev->sb_start = sb_start;
1861         }
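             /*
              * Update the superblock with the agreed size and write it out
              * synchronously so the change is durable before the new size is
              * reported to the caller.
              */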
1862         sb = page_address(rdev->sb_page);
1863         sb->data_size = cpu_to_le64(num_sectors);
1864         sb->super_offset = cpu_to_le64(rdev->sb_start);
1865         sb->sb_csum = calc_sb_1_csum(sb);
1866         md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1867                        rdev->sb_page);
1868         md_super_wait(rdev->mddev);
1869         return num_sectors;
1870
1871 }
1872
1873 static int
1874 super_1_allow_new_offset(struct md_rdev *rdev,
1875                          unsigned long long new_offset)
1876 {
1877         /* All necessary checks on new >= old have been done */
1878         struct bitmap *bitmap;
1879         if (new_offset >= rdev->data_offset)
1880                 return 1;
1881
1882         /* with 1.0 metadata, there is no metadata before the data
1883          * to tread on, so we can always move the data start back */
1884         if (rdev->mddev->minor_version == 0)
1885                 return 1;
1886
1887         /* otherwise we must be sure not to step on
1888          * any metadata, so stay:
1889          * 36K beyond start of superblock
1890          * beyond end of badblocks
1891          * beyond write-intent bitmap
1892          */
1893         if (rdev->sb_start + (32+4)*2 > new_offset)
1894                 return 0;
1895         bitmap = rdev->mddev->bitmap;
1896         if (bitmap && !rdev->mddev->bitmap_info.file &&
1897             rdev->sb_start + rdev->mddev->bitmap_info.offset +
1898             bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
1899                 return 0;
1900         if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
1901                 return 0;
1902
1903         return 1;
1904 }
1905
1906 static struct super_type super_types[] = {
1907         [0] = {
1908                 .name   = "0.90.0",
1909                 .owner  = THIS_MODULE,
1910                 .load_super         = super_90_load,
1911                 .validate_super     = super_90_validate,
1912                 .sync_super         = super_90_sync,
1913                 .rdev_size_change   = super_90_rdev_size_change,
1914                 .allow_new_offset   = super_90_allow_new_offset,
1915         },
1916         [1] = {
1917                 .name   = "md-1",
1918                 .owner  = THIS_MODULE,
1919                 .load_super         = super_1_load,
1920                 .validate_super     = super_1_validate,
1921                 .sync_super         = super_1_sync,
1922                 .rdev_size_change   = super_1_rdev_size_change,
1923                 .allow_new_offset   = super_1_allow_new_offset,
1924         },
1925 };
1926
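/*
 * super_types[] is indexed by mddev->major_version: entry 0 handles the
 * legacy 0.90 format and entry 1 the 1.x formats.  sync_super() dispatches
 * through this table unless the array supplies its own sync_super hook.
 */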
1927 static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
1928 {
1929         if (mddev->sync_super) {
1930                 mddev->sync_super(mddev, rdev);
1931                 return;
1932         }
1933
1934         BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
1935
1936         super_types[mddev->major_version].sync_super(mddev, rdev);
1937 }
1938
1939 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
1940 {
1941         struct md_rdev *rdev, *rdev2;
1942
1943         rcu_read_lock();
1944         rdev_for_each_rcu(rdev, mddev1) {
1945                 if (test_bit(Faulty, &rdev->flags) ||
1946                     test_bit(Journal, &rdev->flags) ||
1947                     rdev->raid_disk == -1)
1948                         continue;
1949                 rdev_for_each_rcu(rdev2, mddev2) {
1950                         if (test_bit(Faulty, &rdev2->flags) ||
1951                             test_bit(Journal, &rdev2->flags) ||
1952                             rdev2->raid_disk == -1)
1953                                 continue;
1954                         if (rdev->bdev->bd_contains ==
1955                             rdev2->bdev->bd_contains) {
1956                                 rcu_read_unlock();
1957                                 return 1;
1958                         }
1959                 }
1960         }
1961         rcu_read_unlock();
1962         return 0;
1963 }
1964
1965 static LIST_HEAD(pending_raid_disks);
1966
1967 /*
1968  * Try to register data integrity profile for an mddev
1969  *
1970  * This is called when an array is started and after a disk has been kicked
1971  * from the array. It only succeeds if all working and active component devices
1972  * are integrity capable with matching profiles.
1973  */
1974 int md_integrity_register(struct mddev *mddev)
1975 {
1976         struct md_rdev *rdev, *reference = NULL;
1977
1978         if (list_empty(&mddev->disks))
1979                 return 0; /* nothing to do */
1980         if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
1981                 return 0; /* shouldn't register, or already is */
1982         rdev_for_each(rdev, mddev) {
1983                 /* skip spares and non-functional disks */
1984                 if (test_bit(Faulty, &rdev->flags))
1985                         continue;
1986                 if (rdev->raid_disk < 0)
1987                         continue;
1988                 if (!reference) {
1989                         /* Use the first rdev as the reference */
1990                         reference = rdev;
1991                         continue;
1992                 }
1993                 /* does this rdev's profile match the reference profile? */
1994                 if (blk_integrity_compare(reference->bdev->bd_disk,
1995                                 rdev->bdev->bd_disk) < 0)
1996                         return -EINVAL;
1997         }
1998         if (!reference || !bdev_get_integrity(reference->bdev))
1999                 return 0;
2000         /*
2001          * All component devices are integrity capable and have matching
2002          * profiles, register the common profile for the md device.
2003          */
2004         blk_integrity_register(mddev->gendisk,
2005                                bdev_get_integrity(reference->bdev));
2006
2007         printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
2008         if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
2009                 printk(KERN_ERR "md: failed to create integrity pool for %s\n",
2010                        mdname(mddev));
2011                 return -EINVAL;
2012         }
2013         return 0;
2014 }
2015 EXPORT_SYMBOL(md_integrity_register);
2016
2017 /* Disable data integrity if a non-capable or non-matching disk is being added */
2018 void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
2019 {
2020         struct blk_integrity *bi_rdev;
2021         struct blk_integrity *bi_mddev;
2022
2023         if (!mddev->gendisk)
2024                 return;
2025
2026         bi_rdev = bdev_get_integrity(rdev->bdev);
2027         bi_mddev = blk_get_integrity(mddev->gendisk);
2028
2029         if (!bi_mddev) /* nothing to do */
2030                 return;
2031         if (rdev->raid_disk < 0) /* skip spares */
2032                 return;
2033         if (bi_rdev && blk_integrity_compare(mddev->gendisk,
2034                                              rdev->bdev->bd_disk) >= 0)
2035                 return;
2036         WARN_ON_ONCE(!mddev->suspended);
2037         printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
2038         blk_integrity_unregister(mddev->gendisk);
2039 }
2040 EXPORT_SYMBOL(md_integrity_add_rdev);
2041
2042 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
2043 {
2044         char b[BDEVNAME_SIZE];
2045         struct kobject *ko;
2046         int err;
2047
2048         /* prevent duplicates */
2049         if (find_rdev(mddev, rdev->bdev->bd_dev))
2050                 return -EEXIST;
2051
2052         /* make sure rdev->sectors exceeds mddev->dev_sectors */
2053         if (rdev->sectors && (mddev->dev_sectors == 0 ||
2054                         rdev->sectors < mddev->dev_sectors)) {
2055                 if (mddev->pers) {
2056                         /* Cannot change size, so fail.
2057                          * If mddev->level <= 0, then we don't care
2058                          * about aligning sizes (e.g. linear).
2059                          */
2060                         if (mddev->level > 0)
2061                                 return -ENOSPC;
2062                 } else
2063                         mddev->dev_sectors = rdev->sectors;
2064         }
2065
2066         /* Verify rdev->desc_nr is unique.
2067          * If it is -1, assign a free number, else
2068          * check number is not in use
2069          */
2070         rcu_read_lock();
2071         if (rdev->desc_nr < 0) {
2072                 int choice = 0;
2073                 if (mddev->pers)
2074                         choice = mddev->raid_disks;
2075                 while (md_find_rdev_nr_rcu(mddev, choice))
2076                         choice++;
2077                 rdev->desc_nr = choice;
2078         } else {
2079                 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
2080                         rcu_read_unlock();
2081                         return -EBUSY;
2082                 }
2083         }
2084         rcu_read_unlock();
2085         if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2086                 printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
2087                        mdname(mddev), mddev->max_disks);
2088                 return -EBUSY;
2089         }
2090         bdevname(rdev->bdev,b);
2091         strreplace(b, '/', '!');
2092
2093         rdev->mddev = mddev;
2094         printk(KERN_INFO "md: bind<%s>\n", b);
2095
2096         if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2097                 goto fail;
2098
2099         ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
2100         if (sysfs_create_link(&rdev->kobj, ko, "block"))
2101                 /* failure here is OK */;
2102         rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2103
2104         list_add_rcu(&rdev->same_set, &mddev->disks);
2105         bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2106
2107         /* May as well allow recovery to be retried once */
2108         mddev->recovery_disabled++;
2109
2110         return 0;
2111
2112  fail:
2113         printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
2114                b, mdname(mddev));
2115         return err;
2116 }
2117
2118 static void md_delayed_delete(struct work_struct *ws)
2119 {
2120         struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
2121         kobject_del(&rdev->kobj);
2122         kobject_put(&rdev->kobj);
2123 }
2124
2125 static void unbind_rdev_from_array(struct md_rdev *rdev)
2126 {
2127         char b[BDEVNAME_SIZE];
2128
2129         bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2130         list_del_rcu(&rdev->same_set);
2131         printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
2132         rdev->mddev = NULL;
2133         sysfs_remove_link(&rdev->kobj, "block");
2134         sysfs_put(rdev->sysfs_state);
2135         rdev->sysfs_state = NULL;
2136         rdev->badblocks.count = 0;
2137         /* We need to delay this, otherwise we can deadlock when
2138          * writing 'remove' to "dev/state".  We also need
2139          * to delay it due to rcu usage.
2140          */
2141         synchronize_rcu();
2142         INIT_WORK(&rdev->del_work, md_delayed_delete);
2143         kobject_get(&rdev->kobj);
2144         queue_work(md_misc_wq, &rdev->del_work);
2145 }
2146
2147 /*
2148  * prevent the device from being mounted, repartitioned or
2149  * otherwise reused by a RAID array (or any other kernel
2150  * subsystem), by bd_claiming the device.
2151  */
2152 static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
2153 {
2154         int err = 0;
2155         struct block_device *bdev;
2156         char b[BDEVNAME_SIZE];
2157
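             /*
              * Open the device exclusively; the holder passed to
              * blkdev_get_by_dev() is what later exclusive opens are checked
              * against, so a "shared" claim uses a common cookie (the address
              * of lock_rdev) while a private claim uses the rdev itself.
              */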
2158         bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2159                                  shared ? (struct md_rdev *)lock_rdev : rdev);
2160         if (IS_ERR(bdev)) {
2161                 printk(KERN_ERR "md: could not open %s.\n",
2162                         __bdevname(dev, b));
2163                 return PTR_ERR(bdev);
2164         }
2165         rdev->bdev = bdev;
2166         return err;
2167 }
2168
2169 static void unlock_rdev(struct md_rdev *rdev)
2170 {
2171         struct block_device *bdev = rdev->bdev;
2172         rdev->bdev = NULL;
2173         blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2174 }
2175
2176 void md_autodetect_dev(dev_t dev);
2177
2178 static void export_rdev(struct md_rdev *rdev)
2179 {
2180         char b[BDEVNAME_SIZE];
2181
2182         printk(KERN_INFO "md: export_rdev(%s)\n",
2183                 bdevname(rdev->bdev,b));
2184         md_rdev_clear(rdev);
2185 #ifndef MODULE
2186         if (test_bit(AutoDetected, &rdev->flags))
2187                 md_autodetect_dev(rdev->bdev->bd_dev);
2188 #endif
2189         unlock_rdev(rdev);
2190         kobject_put(&rdev->kobj);
2191 }
2192
2193 void md_kick_rdev_from_array(struct md_rdev *rdev)
2194 {
2195         unbind_rdev_from_array(rdev);
2196         export_rdev(rdev);
2197 }
2198 EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
2199
2200 static void export_array(struct mddev *mddev)
2201 {
2202         struct md_rdev *rdev;
2203
2204         while (!list_empty(&mddev->disks)) {
2205                 rdev = list_first_entry(&mddev->disks, struct md_rdev,
2206                                         same_set);
2207                 md_kick_rdev_from_array(rdev);
2208         }
2209         mddev->raid_disks = 0;
2210         mddev->major_version = 0;
2211 }
2212
2213 static void sync_sbs(struct mddev *mddev, int nospares)
2214 {
2215         /* Update each superblock (in-memory image), but
2216          * if we are allowed to, skip spares which already
2217          * have the right event counter, or have one earlier
2218          * (which would mean they aren't being marked as dirty
2219          * with the rest of the array)
2220          */
2221         struct md_rdev *rdev;
2222         rdev_for_each(rdev, mddev) {
2223                 if (rdev->sb_events == mddev->events ||
2224                     (nospares &&
2225                      rdev->raid_disk < 0 &&
2226                      rdev->sb_events+1 == mddev->events)) {
2227                         /* Don't update this superblock */
2228                         rdev->sb_loaded = 2;
2229                 } else {
2230                         sync_super(mddev, rdev);
2231                         rdev->sb_loaded = 1;
2232                 }
2233         }
2234 }
2235
2236 static bool does_sb_need_changing(struct mddev *mddev)
2237 {
2238         struct md_rdev *rdev;
2239         struct mdp_superblock_1 *sb;
2240         int role;
2241
2242         /* Find a good rdev */
2243         rdev_for_each(rdev, mddev)
2244                 if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
2245                         break;
2246
2247         /* No good device found. */
2248         if (!rdev)
2249                 return false;
2250
2251         sb = page_address(rdev->sb_page);
2252         /* Check if a device has become faulty or a spare become active */
2253         rdev_for_each(rdev, mddev) {
2254                 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2255                 /* Device activated? */
2256                 if (role == 0xffff && rdev->raid_disk >=0 &&
2257                     !test_bit(Faulty, &rdev->flags))
2258                         return true;
2259                 /* Device turned faulty? */
2260                 if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
2261                         return true;
2262         }
2263
2264         /* Check if any mddev parameters have changed */
2265         if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2266             (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2267             (mddev->layout != le32_to_cpu(sb->layout)) ||
2268             (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2269             (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2270                 return true;
2271
2272         return false;
2273 }
2274
2275 void md_update_sb(struct mddev *mddev, int force_change)
2276 {
2277         struct md_rdev *rdev;
2278         int sync_req;
2279         int nospares = 0;
2280         int any_badblocks_changed = 0;
2281         int ret = -1;
2282
2283         if (mddev->ro) {
2284                 if (force_change)
2285                         set_bit(MD_CHANGE_DEVS, &mddev->flags);
2286                 return;
2287         }
2288
2289         if (mddev_is_clustered(mddev)) {
2290                 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
2291                         force_change = 1;
2292                 ret = md_cluster_ops->metadata_update_start(mddev);
2293                 /* Has someone else updated the sb? */
2294                 if (!does_sb_need_changing(mddev)) {
2295                         if (ret == 0)
2296                                 md_cluster_ops->metadata_update_cancel(mddev);
2297                         clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2298                         return;
2299                 }
2300         }
2301 repeat:
2302         /* First make sure individual recovery_offsets are correct */
2303         rdev_for_each(rdev, mddev) {
2304                 if (rdev->raid_disk >= 0 &&
2305                     mddev->delta_disks >= 0 &&
2306                     !test_bit(Journal, &rdev->flags) &&
2307                     !test_bit(In_sync, &rdev->flags) &&
2308                     mddev->curr_resync_completed > rdev->recovery_offset)
2309                                 rdev->recovery_offset = mddev->curr_resync_completed;
2310
2311         }
2312         if (!mddev->persistent) {
2313                 clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2314                 clear_bit(MD_CHANGE_DEVS, &mddev->flags);
2315                 if (!mddev->external) {
2316                         clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2317                         rdev_for_each(rdev, mddev) {
2318                                 if (rdev->badblocks.changed) {
2319                                         rdev->badblocks.changed = 0;
2320                                         md_ack_all_badblocks(&rdev->badblocks);
2321                                         md_error(mddev, rdev);
2322                                 }
2323                                 clear_bit(Blocked, &rdev->flags);
2324                                 clear_bit(BlockedBadBlocks, &rdev->flags);
2325                                 wake_up(&rdev->blocked_wait);
2326                         }
2327                 }
2328                 wake_up(&mddev->sb_wait);
2329                 return;
2330         }
2331
2332         spin_lock(&mddev->lock);
2333
2334         mddev->utime = get_seconds();
2335
2336         if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
2337                 force_change = 1;
2338         if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
2339                 /* just a clean <-> dirty transition; possibly leave spares alone,
2340                  * though if 'events' isn't the right even/odd, we will have to do
2341                  * the spares after all
2342                  */
2343                 nospares = 1;
2344         if (force_change)
2345                 nospares = 0;
2346         if (mddev->degraded)
2347                 /* If the array is degraded, then skipping spares is both
2348                  * dangerous and fairly pointless.
2349                  * Dangerous because a device that was removed from the array
2350                  * might have an event count that still looks up-to-date,
2351                  * so it can be re-added without a resync.
2352                  * Pointless because if there are any spares to skip,
2353                  * then a recovery will happen and soon the array won't
2354                  * be degraded any more, and the spare can go back to sleep.
2355                  */
2356                 nospares = 0;
2357
2358         sync_req = mddev->in_sync;
2359
2360         /* If this is just a dirty<->clean transition, and the array is clean
2361          * and 'events' is odd, we can roll back to the previous clean state */
2362         if (nospares
2363             && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2364             && mddev->can_decrease_events
2365             && mddev->events != 1) {
2366                 mddev->events--;
2367                 mddev->can_decrease_events = 0;
2368         } else {
2369                 /* otherwise we have to go forward and ... */
2370                 mddev->events++;
2371                 mddev->can_decrease_events = nospares;
2372         }
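             /*
              * Keeping 'events' low when possible means spares whose
              * superblocks already carry that count need not be rewritten for
              * a mere clean/dirty transition; see the nospares handling in
              * sync_sbs().
              */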
2373
2374         /*
2375          * This 64-bit counter should never wrap.
2376          * Either we are in around ~1 trillion A.C., assuming
2377          * 1 reboot per second, or we have a bug...
2378          */
2379         WARN_ON(mddev->events == 0);
2380
2381         rdev_for_each(rdev, mddev) {
2382                 if (rdev->badblocks.changed)
2383                         any_badblocks_changed++;
2384                 if (test_bit(Faulty, &rdev->flags))
2385                         set_bit(FaultRecorded, &rdev->flags);
2386         }
2387
2388         sync_sbs(mddev, nospares);
2389         spin_unlock(&mddev->lock);
2390
2391         pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2392                  mdname(mddev), mddev->in_sync);
2393
2394         bitmap_update_sb(mddev->bitmap);
2395         rdev_for_each(rdev, mddev) {
2396                 char b[BDEVNAME_SIZE];
2397
2398                 if (rdev->sb_loaded != 1)
2399                         continue; /* no noise on spare devices */
2400
2401                 if (!test_bit(Faulty, &rdev->flags)) {
2402                         md_super_write(mddev,rdev,
2403                                        rdev->sb_start, rdev->sb_size,
2404                                        rdev->sb_page);
2405                         pr_debug("md: (write) %s's sb offset: %llu\n",
2406                                  bdevname(rdev->bdev, b),
2407                                  (unsigned long long)rdev->sb_start);
2408                         rdev->sb_events = mddev->events;
2409                         if (rdev->badblocks.size) {
2410                                 md_super_write(mddev, rdev,
2411                                                rdev->badblocks.sector,
2412                                                rdev->badblocks.size << 9,
2413                                                rdev->bb_page);
2414                                 rdev->badblocks.size = 0;
2415                         }
2416
2417                 } else
2418                         pr_debug("md: %s (skipping faulty)\n",
2419                                  bdevname(rdev->bdev, b));
2420
2421                 if (mddev->level == LEVEL_MULTIPATH)
2422                         /* only need to write one superblock... */
2423                         break;
2424         }
2425         md_super_wait(mddev);
2426         /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
2427
2428         spin_lock(&mddev->lock);
2429         if (mddev->in_sync != sync_req ||
2430             test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
2431                 /* have to write it out again */
2432                 spin_unlock(&mddev->lock);
2433                 goto repeat;
2434         }
2435         clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2436         spin_unlock(&mddev->lock);
2437         wake_up(&mddev->sb_wait);
2438         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2439                 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2440
2441         rdev_for_each(rdev, mddev) {
2442                 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2443                         clear_bit(Blocked, &rdev->flags);
2444
2445                 if (any_badblocks_changed)
2446                         md_ack_all_badblocks(&rdev->badblocks);
2447                 clear_bit(BlockedBadBlocks, &rdev->flags);
2448                 wake_up(&rdev->blocked_wait);
2449         }
2450
2451         if (mddev_is_clustered(mddev) && ret == 0)
2452                 md_cluster_ops->metadata_update_finish(mddev);
2453 }
2454 EXPORT_SYMBOL(md_update_sb);
2455
2456 static int add_bound_rdev(struct md_rdev *rdev)
2457 {
2458         struct mddev *mddev = rdev->mddev;
2459         int err = 0;
2460
2461         if (!mddev->pers->hot_remove_disk) {
2462                 /* If there is hot_add_disk but no hot_remove_disk
2463                  * then added disks are for geometry changes
2464                  * and should be added immediately.
2465                  */
2466                 super_types[mddev->major_version].
2467                         validate_super(mddev, rdev);
2468                 err = mddev->pers->hot_add_disk(mddev, rdev);
2469                 if (err) {
2470                         unbind_rdev_from_array(rdev);
2471                         export_rdev(rdev);
2472                         return err;
2473                 }
2474         }
2475         sysfs_notify_dirent_safe(rdev->sysfs_state);
2476
2477         set_bit(MD_CHANGE_DEVS, &mddev->flags);
2478         if (mddev->degraded)
2479                 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2480         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2481         md_new_event(mddev);
2482         md_wakeup_thread(mddev->thread);
2483         return 0;
2484 }
2485
2486 /* Words written to sysfs files may, or may not, be \n terminated.
2487  * We want to accept either case.  For this we use cmd_match.
2488  */
2489 static int cmd_match(const char *cmd, const char *str)
2490 {
2491         /* See if cmd, written into a sysfs file, matches
2492          * str.  They must either be the same, or cmd can
2493          * have a trailing newline
2494          */
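             /*
              * For example, cmd_match("faulty\n", "faulty") returns 1, while
              * cmd_match("fault", "faulty") returns 0.
              */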
2495         while (*cmd && *str && *cmd == *str) {
2496                 cmd++;
2497                 str++;
2498         }
2499         if (*cmd == '\n')
2500                 cmd++;
2501         if (*str || *cmd)
2502                 return 0;
2503         return 1;
2504 }
2505
2506 struct rdev_sysfs_entry {
2507         struct attribute attr;
2508         ssize_t (*show)(struct md_rdev *, char *);
2509         ssize_t (*store)(struct md_rdev *, const char *, size_t);
2510 };
2511
2512 static ssize_t
2513 state_show(struct md_rdev *rdev, char *page)
2514 {
2515         char *sep = "";
2516         size_t len = 0;
2517         unsigned long flags = ACCESS_ONCE(rdev->flags);
2518
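             /*
              * Build a comma-separated list of the device's state flags,
              * e.g. "in_sync,write_mostly\n".
              */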
2519         if (test_bit(Faulty, &flags) ||
2520             rdev->badblocks.unacked_exist) {
2521                 len+= sprintf(page+len, "%sfaulty",sep);
2522                 sep = ",";
2523         }
2524         if (test_bit(In_sync, &flags)) {
2525                 len += sprintf(page+len, "%sin_sync",sep);
2526                 sep = ",";
2527         }
2528         if (test_bit(Journal, &flags)) {
2529                 len += sprintf(page+len, "%sjournal",sep);
2530                 sep = ",";
2531         }
2532         if (test_bit(WriteMostly, &flags)) {
2533                 len += sprintf(page+len, "%swrite_mostly",sep);
2534                 sep = ",";
2535         }
2536         if (test_bit(Blocked, &flags) ||
2537             (rdev->badblocks.unacked_exist
2538              && !test_bit(Faulty, &flags))) {
2539                 len += sprintf(page+len, "%sblocked", sep);
2540                 sep = ",";
2541         }
2542         if (!test_bit(Faulty, &flags) &&
2543             !test_bit(Journal, &flags) &&
2544             !test_bit(In_sync, &flags)) {
2545                 len += sprintf(page+len, "%sspare", sep);
2546                 sep = ",";
2547         }
2548         if (test_bit(WriteErrorSeen, &flags)) {
2549                 len += sprintf(page+len, "%swrite_error", sep);
2550                 sep = ",";
2551         }
2552         if (test_bit(WantReplacement, &flags)) {
2553                 len += sprintf(page+len, "%swant_replacement", sep);
2554                 sep = ",";
2555         }
2556         if (test_bit(Replacement, &flags)) {
2557                 len += sprintf(page+len, "%sreplacement", sep);
2558                 sep = ",";
2559         }
2560
2561         return len+sprintf(page+len, "\n");
2562 }
2563
2564 static ssize_t
2565 state_store(struct md_rdev *rdev, const char *buf, size_t len)
2566 {
2567         /* can write
2568          *  faulty  - simulates an error
2569          *  remove  - disconnects the device
2570          *  writemostly - sets write_mostly
2571          *  -writemostly - clears write_mostly
2572          *  blocked - sets the Blocked flag
2573          *  -blocked - clears Blocked and possibly simulates an error
2574          *  insync - sets In_sync provided the device isn't active
2575          *  -insync - clears In_sync for a device with a slot assigned,
2576          *            so that it gets rebuilt based on bitmap
2577          *  write_error - sets WriteErrorSeen
2578          *  -write_error - clears WriteErrorSeen
2579          */
2580         int err = -EINVAL;
2581         if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2582                 md_error(rdev->mddev, rdev);
2583                 if (test_bit(Faulty, &rdev->flags))
2584                         err = 0;
2585                 else
2586                         err = -EBUSY;
2587         } else if (cmd_match(buf, "remove")) {
2588                 if (rdev->raid_disk >= 0)
2589                         err = -EBUSY;
2590                 else {
2591                         struct mddev *mddev = rdev->mddev;
2592                         err = 0;
2593                         if (mddev_is_clustered(mddev))
2594                                 err = md_cluster_ops->remove_disk(mddev, rdev);
2595
2596                         if (err == 0) {
2597                                 md_kick_rdev_from_array(rdev);
2598                                 if (mddev->pers)
2599                                         md_update_sb(mddev, 1);
2600                                 md_new_event(mddev);
2601                         }
2602                 }
2603         } else if (cmd_match(buf, "writemostly")) {
2604                 set_bit(WriteMostly, &rdev->flags);
2605                 err = 0;
2606         } else if (cmd_match(buf, "-writemostly")) {
2607                 clear_bit(WriteMostly, &rdev->flags);
2608                 err = 0;
2609         } else if (cmd_match(buf, "blocked")) {
2610                 set_bit(Blocked, &rdev->flags);
2611                 err = 0;
2612         } else if (cmd_match(buf, "-blocked")) {
2613                 if (!test_bit(Faulty, &rdev->flags) &&
2614                     rdev->badblocks.unacked_exist) {
2615                         /* metadata handler doesn't understand badblocks,
2616                          * so we need to fail the device
2617                          */
2618                         md_error(rdev->mddev, rdev);
2619                 }
2620                 clear_bit(Blocked, &rdev->flags);
2621                 clear_bit(BlockedBadBlocks, &rdev->flags);
2622                 wake_up(&rdev->blocked_wait);
2623                 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2624                 md_wakeup_thread(rdev->mddev->thread);
2625
2626                 err = 0;
2627         } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2628                 set_bit(In_sync, &rdev->flags);
2629                 err = 0;
2630         } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
2631                    !test_bit(Journal, &rdev->flags)) {
2632                 if (rdev->mddev->pers == NULL) {
2633                         clear_bit(In_sync, &rdev->flags);
2634                         rdev->saved_raid_disk = rdev->raid_disk;
2635                         rdev->raid_disk = -1;
2636                         err = 0;
2637                 }
2638         } else if (cmd_match(buf, "write_error")) {
2639                 set_bit(WriteErrorSeen, &rdev->flags);
2640                 err = 0;
2641         } else if (cmd_match(buf, "-write_error")) {
2642                 clear_bit(WriteErrorSeen, &rdev->flags);
2643                 err = 0;
2644         } else if (cmd_match(buf, "want_replacement")) {
2645                 /* Any non-spare device that is not a replacement can
2646                  * become want_replacement at any time, but we then need to
2647                  * check if recovery is needed.
2648                  */
2649                 if (rdev->raid_disk >= 0 &&
2650                     !test_bit(Journal, &rdev->flags) &&
2651                     !test_bit(Replacement, &rdev->flags))
2652                         set_bit(WantReplacement, &rdev->flags);
2653                 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2654                 md_wakeup_thread(rdev->mddev->thread);
2655                 err = 0;
2656         } else if (cmd_match(buf, "-want_replacement")) {
2657                 /* Clearing 'want_replacement' is always allowed.
2658                  * Once replacement starts it is too late though.
2659                  */
2660                 err = 0;
2661                 clear_bit(WantReplacement, &rdev->flags);
2662         } else if (cmd_match(buf, "replacement")) {
2663                 /* Can only set a device as a replacement when array has not
2664                  * yet been started.  Once running, replacement is automatic
2665                  * from spares, or by assigning 'slot'.
2666                  */
2667                 if (rdev->mddev->pers)
2668                         err = -EBUSY;
2669                 else {
2670                         set_bit(Replacement, &rdev->flags);
2671                         err = 0;
2672                 }
2673         } else if (cmd_match(buf, "-replacement")) {
2674                 /* Similarly, can only clear Replacement before start */
2675                 if (rdev->mddev->pers)
2676                         err = -EBUSY;
2677                 else {
2678                         clear_bit(Replacement, &rdev->flags);
2679                         err = 0;
2680                 }
2681         } else if (cmd_match(buf, "re-add")) {
2682                 if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) {
2683                         /* clear_bit is performed _after_ all the devices
2684                          * have their local Faulty bit cleared. If any writes
2685                          * happen in the meantime in the local node, they
2686                          * will land in the local bitmap, which will be synced
2687                          * by this node eventually
2688                          */
2689                         if (!mddev_is_clustered(rdev->mddev) ||
2690                             (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
2691                                 clear_bit(Faulty, &rdev->flags);
2692                                 err = add_bound_rdev(rdev);
2693                         }
2694                 } else
2695                         err = -EBUSY;
2696         }
2697         if (!err)
2698                 sysfs_notify_dirent_safe(rdev->sysfs_state);
2699         return err ? err : len;
2700 }
2701 static struct rdev_sysfs_entry rdev_state =
2702 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
2703
2704 static ssize_t
2705 errors_show(struct md_rdev *rdev, char *page)
2706 {
2707         return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2708 }
2709
2710 static ssize_t
2711 errors_store(struct md_rdev *rdev, const char *buf, size_t len)
2712 {
2713         unsigned int n;
2714         int rv;
2715
2716         rv = kstrtouint(buf, 10, &n);
2717         if (rv < 0)
2718                 return rv;
2719         atomic_set(&rdev->corrected_errors, n);
2720         return len;
2721 }
2722 static struct rdev_sysfs_entry rdev_errors =
2723 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2724
2725 static ssize_t
2726 slot_show(struct md_rdev *rdev, char *page)
2727 {
2728         if (test_bit(Journal, &rdev->flags))
2729                 return sprintf(page, "journal\n");
2730         else if (rdev->raid_disk < 0)
2731                 return sprintf(page, "none\n");
2732         else
2733                 return sprintf(page, "%d\n", rdev->raid_disk);
2734 }
2735
2736 static ssize_t
2737 slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2738 {
2739         int slot;
2740         int err;
2741
2742         if (test_bit(Journal, &rdev->flags))
2743                 return -EBUSY;
2744         if (strncmp(buf, "none", 4)==0)
2745                 slot = -1;
2746         else {
2747                 err = kstrtouint(buf, 10, (unsigned int *)&slot);
2748                 if (err < 0)
2749                         return err;
2750         }
2751         if (rdev->mddev->pers && slot == -1) {
2752         /* Setting 'slot' on an active array also requires
2753          * updating the 'rd%d' link, and communicating
2754          * with the personality via ->hot_*_disk.
2755                  * For now we only support removing
2756                  * failed/spare devices.  This normally happens automatically,
2757                  * but not when the metadata is externally managed.
2758                  */
2759                 if (rdev->raid_disk == -1)
2760                         return -EEXIST;
2761                 /* personality does all needed checks */
2762                 if (rdev->mddev->pers->hot_remove_disk == NULL)
2763                         return -EINVAL;
2764                 clear_bit(Blocked, &rdev->flags);
2765                 remove_and_add_spares(rdev->mddev, rdev);
2766                 if (rdev->raid_disk >= 0)
2767                         return -EBUSY;
2768                 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2769                 md_wakeup_thread(rdev->mddev->thread);
2770         } else if (rdev->mddev->pers) {
2771                 /* Activating a spare .. or possibly reactivating
2772                  * if we ever get bitmaps working here.
2773                  */
2774
2775                 if (rdev->raid_disk != -1)
2776                         return -EBUSY;
2777
2778                 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
2779                         return -EBUSY;
2780
2781                 if (rdev->mddev->pers->hot_add_disk == NULL)
2782                         return -EINVAL;
2783
2784                 if (slot >= rdev->mddev->raid_disks &&
2785                     slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2786                         return -ENOSPC;
2787
2788                 rdev->raid_disk = slot;
2789                 if (test_bit(In_sync, &rdev->flags))
2790                         rdev->saved_raid_disk = slot;
2791                 else
2792                         rdev->saved_raid_disk = -1;
2793                 clear_bit(In_sync, &rdev->flags);
2794                 clear_bit(Bitmap_sync, &rdev->flags);
2795                 remove_and_add_spares(rdev->mddev, rdev);
2796                 if (rdev->raid_disk == -1)
2797                         return -EBUSY;
2798                 /* don't wake anyone up; leave that to userspace. */
2799         } else {
2800                 if (slot >= rdev->mddev->raid_disks &&
2801                     slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2802                         return -ENOSPC;
2803                 rdev->raid_disk = slot;
2804                 /* assume it is working */
2805                 clear_bit(Faulty, &rdev->flags);
2806                 clear_bit(WriteMostly, &rdev->flags);
2807                 set_bit(In_sync, &rdev->flags);
2808                 sysfs_notify_dirent_safe(rdev->sysfs_state);
2809         }
2810         return len;
2811 }
2812
2813 static struct rdev_sysfs_entry rdev_slot =
2814 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
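/*
 * Illustrative usage (device names are only an example): on a running
 * array, writing a slot number activates this device as a spare in that
 * slot, and writing "none" removes a failed or spare device:
 *
 *     echo 2 > /sys/block/md0/md/dev-sdc1/slot
 *     echo none > /sys/block/md0/md/dev-sdc1/slot
 *
 * On an array that has not been started (externally managed metadata),
 * the write simply records the slot and marks the device In_sync.
 */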
2815
2816 static ssize_t
2817 offset_show(struct md_rdev *rdev, char *page)
2818 {
2819         return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2820 }
2821
2822 static ssize_t
2823 offset_store(struct md_rdev *rdev, const char *buf, size_t len)
2824 {
2825         unsigned long long offset;
2826         if (kstrtoull(buf, 10, &offset) < 0)
2827                 return -EINVAL;
2828         if (rdev->mddev->pers && rdev->raid_disk >= 0)
2829                 return -EBUSY;
2830         if (rdev->sectors && rdev->mddev->external)
2831                 /* Must set offset before size, so overlap checks
2832                  * can be sane */
2833                 return -EBUSY;
2834         rdev->data_offset = offset;
2835         rdev->new_data_offset = offset;
2836         return len;
2837 }
2838
2839 static struct rdev_sysfs_entry rdev_offset =
2840 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2841
2842 static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
2843 {
2844         return sprintf(page, "%llu\n",
2845                        (unsigned long long)rdev->new_data_offset);
2846 }
2847
2848 static ssize_t new_offset_store(struct md_rdev *rdev,
2849                                 const char *buf, size_t len)
2850 {
2851         unsigned long long new_offset;
2852         struct mddev *mddev = rdev->mddev;
2853
2854         if (kstrtoull(buf, 10, &new_offset) < 0)
2855                 return -EINVAL;
2856
2857         if (mddev->sync_thread ||
2858             test_bit(MD_RECOVERY_RUNNING,&mddev->recovery))
2859                 return -EBUSY;
2860         if (new_offset == rdev->data_offset)
2861                 /* reset is always permitted */
2862                 ;
2863         else if (new_offset > rdev->data_offset) {
2864                 /* must not push array size beyond rdev_sectors */
2865                 if (new_offset - rdev->data_offset
2866                     + mddev->dev_sectors > rdev->sectors)
2867                         return -E2BIG;
2868         }
2869         /* Metadata worries about other space details. */
2870
2871         /* decreasing the offset is inconsistent with a backwards
2872          * reshape.
2873          */
2874         if (new_offset < rdev->data_offset &&
2875             mddev->reshape_backwards)
2876                 return -EINVAL;
2877         /* Increasing offset is inconsistent with forwards
2878          * reshape.  reshape_direction should be set to
2879          * 'backwards' first.
2880          */
2881         if (new_offset > rdev->data_offset &&
2882             !mddev->reshape_backwards)
2883                 return -EINVAL;
2884
2885         if (mddev->pers && mddev->persistent &&
2886             !super_types[mddev->major_version]
2887             .allow_new_offset(rdev, new_offset))
2888                 return -E2BIG;
2889         rdev->new_data_offset = new_offset;
2890         if (new_offset > rdev->data_offset)
2891                 mddev->reshape_backwards = 1;
2892         else if (new_offset < rdev->data_offset)
2893                 mddev->reshape_backwards = 0;
2894
2895         return len;
2896 }
2897 static struct rdev_sysfs_entry rdev_new_offset =
2898 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
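/*
 * Rough sketch of the intended flow, using the attribute names mentioned
 * in the comments above (paths and values are only examples): moving a
 * member's data to a larger offset requires flipping the reshape
 * direction first,
 *
 *     echo backwards > /sys/block/md0/md/reshape_direction
 *     echo 4096 > /sys/block/md0/md/dev-sdb1/new_offset
 *
 * otherwise new_offset_store() rejects the change with -EINVAL.
 */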
2899
2900 static ssize_t
2901 rdev_size_show(struct md_rdev *rdev, char *page)
2902 {
2903         return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2904 }
2905
2906 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2907 {
2908         /* check if two start/length pairs overlap */
2909         if (s1+l1 <= s2)
2910                 return 0;
2911         if (s2+l2 <= s1)
2912                 return 0;
2913         return 1;
2914 }
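/*
 * For example, overlaps(0, 100, 50, 100) returns 1 because the second
 * range starts inside the first, while overlaps(0, 100, 100, 50) returns
 * 0 because the two ranges merely touch at sector 100.
 */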
2915
2916 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
2917 {
2918         unsigned long long blocks;
2919         sector_t new;
2920
2921         if (kstrtoull(buf, 10, &blocks) < 0)
2922                 return -EINVAL;
2923
2924         if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
2925                 return -EINVAL; /* sector conversion overflow */
2926
2927         new = blocks * 2;
2928         if (new != blocks * 2)
2929                 return -EINVAL; /* unsigned long long to sector_t overflow */
2930
2931         *sectors = new;
2932         return 0;
2933 }
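/*
 * For example, "1024" (1024 1K blocks, i.e. 1MiB) converts to 2048
 * sectors.  The top-bit test rejects values whose doubling would overflow
 * unsigned long long, and the comparison after the multiplication catches
 * truncation when sector_t is narrower than unsigned long long.
 */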
2934
2935 static ssize_t
2936 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
2937 {
2938         struct mddev *my_mddev = rdev->mddev;
2939         sector_t oldsectors = rdev->sectors;
2940         sector_t sectors;
2941
2942         if (test_bit(Journal, &rdev->flags))
2943                 return -EBUSY;
2944         if (strict_blocks_to_sectors(buf, &sectors) < 0)
2945                 return -EINVAL;
2946         if (rdev->data_offset != rdev->new_data_offset)
2947                 return -EINVAL; /* too confusing */
2948         if (my_mddev->pers && rdev->raid_disk >= 0) {
2949                 if (my_mddev->persistent) {
2950                         sectors = super_types[my_mddev->major_version].
2951                                 rdev_size_change(rdev, sectors);
2952                         if (!sectors)
2953                                 return -EBUSY;
2954                 } else if (!sectors)
2955                         sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
2956                                 rdev->data_offset;
2957                 if (!my_mddev->pers->resize)
2958                         /* Cannot change size for RAID0 or Linear etc */
2959                         return -EINVAL;
2960         }
2961         if (sectors < my_mddev->dev_sectors)
2962                 return -EINVAL; /* component must fit device */
2963
2964         rdev->sectors = sectors;
2965         if (sectors > oldsectors && my_mddev->external) {
2966                 /* Need to check that all other rdevs with the same
2967                  * ->bdev do not overlap.  'rcu' is sufficient to walk
2968                  * the rdev lists safely.
2969                  * This check does not provide a hard guarantee, it
2970                  * just helps avoid dangerous mistakes.
2971                  */
2972                 struct mddev *mddev;
2973                 int overlap = 0;
2974                 struct list_head *tmp;
2975
2976                 rcu_read_lock();
2977                 for_each_mddev(mddev, tmp) {
2978                         struct md_rdev *rdev2;
2979
2980                         rdev_for_each(rdev2, mddev)
2981                                 if (rdev->bdev == rdev2->bdev &&
2982                                     rdev != rdev2 &&
2983                                     overlaps(rdev->data_offset, rdev->sectors,
2984                                              rdev2->data_offset,
2985                                              rdev2->sectors)) {
2986                                         overlap = 1;
2987                                         break;
2988                                 }
2989                         if (overlap) {
2990                                 mddev_put(mddev);
2991                                 break;
2992                         }
2993                 }
2994                 rcu_read_unlock();
2995                 if (overlap) {
2996                         /* Someone else could have slipped in a size
2997                          * change here, but doing so is just silly.
2998                          * We put oldsectors back because we *know* it is
2999                          * safe, and trust userspace not to race with
3000                          * itself
3001                          */
3002                         rdev->sectors = oldsectors;
3003                         return -EBUSY;
3004                 }
3005         }
3006         return len;
3007 }
3008
3009 static struct rdev_sysfs_entry rdev_size =
3010 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
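/*
 * Note that the "size" attribute is exposed in 1K blocks: rdev_size_show()
 * halves the sector count and rdev_size_store() doubles the written value
 * via strict_blocks_to_sectors() above.
 */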
3011
3012 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3013 {
3014         unsigned long long recovery_start = rdev->recovery_offset;
3015
3016         if (test_bit(In_sync, &rdev->flags) ||
3017             recovery_start == MaxSector)
3018                 return sprintf(page, "none\n");
3019
3020         return sprintf(page, "%llu\n", recovery_start);
3021 }
3022
3023 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3024 {
3025         unsigned long long recovery_start;
3026
3027         if (cmd_match(buf, "none"))
3028                 recovery_start = MaxSector;
3029         else if (kstrtoull(buf, 10, &recovery_start))
3030                 return -EINVAL;
3031
3032         if (rdev->mddev->pers &&
3033             rdev->raid_disk >= 0)
3034                 return -EBUSY;
3035
3036         rdev->recovery_offset = recovery_start;
3037         if (recovery_start == MaxSector)
3038                 set_bit(In_sync, &rdev->flags);
3039         else
3040                 clear_bit(In_sync, &rdev->flags);
3041         return len;
3042 }
3043
3044 static struct rdev_sysfs_entry rdev_recovery_start =
3045 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
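/*
 * Illustrative usage (path and value are only examples): while the device
 * is not active in a started array, userspace can record how far a
 * partially recovered device has progressed, or declare it fully in sync:
 *
 *     echo 976773168 > /sys/block/md0/md/dev-sdd1/recovery_start
 *     echo none > /sys/block/md0/md/dev-sdd1/recovery_start
 *
 * Writing "none" stores MaxSector and sets the In_sync flag.
 */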
3046
3047 static ssize_t
3048 badblocks_show(struct badblocks *bb, char *page, int unack);
3049 static ssize_t
3050 badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
3051
3052 static ssize_t bb_show(struct md_rdev *rdev, char *page)
3053 {
3054         return badblocks_show(&rdev->badblocks, page, 0);
3055 }
3056 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3057 {
3058         int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3059         /* Maybe that ack was all we needed */
3060         if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3061                 wake_up(&rdev->blocked_wait);
3062         return rv;
3063 }
3064 static struct rdev_sysfs_entry rdev_bad_blocks =
3065 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3066
3067 static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3068 {
3069         return badblocks_show(&rdev->badblocks, page, 1);
3070 }
3071 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3072 {
3073         return badblocks_store(&rdev->badblocks, page, len, 1);
3074 }
3075 static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3076 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
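/*
 * Both attributes operate on the same badblocks table and differ only in
 * the 'unack' flag passed to badblocks_show()/badblocks_store().  A write
 * to "bad_blocks" may itself be the acknowledgement a blocked writer was
 * waiting for, hence the wake_up() in bb_store().
 */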
3077
3078 static struct attribute *rdev_default_attrs[] = {
3079         &rdev_state.attr,
3080         &rdev_errors.attr,
3081         &rdev_slot.attr,
3082         &rdev_offset.attr,
3083         &rdev_new_offset.attr,
3084         &rdev_size.attr,
3085         &rdev_recovery_start.attr,
3086         &rdev_bad_blocks.attr,
3087         &rdev_unack_bad_blocks.attr,
3088         NULL,
3089 };
3090 static ssize_t
3091 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3092 {
3093         struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3094         struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3095
3096         if (!entry->show)
3097                 return -EIO;
3098         if (!rdev->mddev)
3099                 return -EBUSY;
3100         return entry->show(rdev, page);
3101 }
3102
3103 static ssize_t
3104 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3105               const char *page, size_t length)
3106 {
3107         struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3108         struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3109         ssize_t rv;
3110         struct mddev *mddev = rdev->mddev;
3111
3112         if (!entry->store)
3113                 return -EIO;
3114         if (!capable(CAP_SYS_ADMIN))
3115                 return -EACCES;
3116         rv = mddev ? mddev_lock(mddev): -EBUSY;
3117         if (!rv) {
3118                 if (rdev->mddev == NULL)
3119                         rv = -EBUSY;
3120                 else
3121                         rv = entry->store(rdev, page, length);
3122                 mddev_unlock(mddev);
3123         }
3124         return rv;
3125 }
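/*
 * The store path takes mddev_lock() and then re-checks rdev->mddev: the
 * device can be detached from the array while we wait for the lock, in
 * which case the write is refused with -EBUSY rather than acting on a
 * stale rdev.
 */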
3126
3127 static void rdev_free(struct kobject *ko)
3128 {
3129         struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3130         kfree(rdev);
3131 }
3132 static const struct sysfs_ops rdev_sysfs_ops = {
3133         .show           = rdev_attr_show,
3134         .store          = rdev_attr_store,
3135 };
3136 static struct kobj_type rdev_ktype = {
3137         .release        = rdev_free,
3138         .sysfs_ops      = &rdev_sysfs_ops,
3139         .default_attrs  = rdev_default_attrs,
3140 };
3141
3142 int md_rdev_init(struct md_rdev *rdev)
3143 {
3144         rdev->desc_nr = -1;
3145         rdev->saved_raid_disk = -1;
3146         rdev->raid_disk = -1;
3147         rdev->flags = 0;
3148         rdev->data_offset = 0;
3149         rdev->new_data_offset = 0;
3150         rdev->sb_events = 0;
3151         rdev->last_read_error.tv_sec  = 0;
3152         rdev->last_read_error.tv_nsec = 0;
3153         rdev->sb_loaded = 0;
3154         rdev->bb_page = NULL;
3155         atomic_set(&rdev->nr_pending, 0);
3156         atomic_set(&rdev->read_errors, 0);
3157         atomic_set(&rdev->corrected_errors, 0);
3158
3159         INIT_LIST_HEAD(&rdev->same_set);
3160         init_waitqueue_head(&rdev->blocked_wait);
3161
3162         /* Add space to store bad block list.
3163          * This reserves the space even on arrays where it cannot
3164          * be used - I wonder if that matters
3165          */
3166         rdev->badblocks.count = 0;
3167         rdev->badblocks.shift = -1; /* disabled until explicitly enabled */
3168         rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
3169         seqlock_init(&rdev->badblocks.lock);
3170         if (rdev->badblocks.page == NULL)
3171                 return -ENOMEM;
3172
3173         return 0;
3174 }
3175 EXPORT_SYMBOL_GPL(md_rdev_init);
3176 /*
3177  * Import a device. If 'super_format' >= 0, then sanity check the superblock
3178  *
3179  * mark the device faulty if:
3180  *
3181  *   - the device is nonexistent (zero size)
3182  *   - the device has no valid superblock
3183  *
3184  * a faulty rdev _never_ has rdev->sb set.
3185  */
3186 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
3187 {
3188         char b[BDEVNAME_SIZE];
3189         int err;
3190         struct md_rdev *rdev;
3191         sector_t size;
3192
3193         rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3194         if (!rdev) {
3195                 printk(KERN_ERR "md: could not alloc mem for new device!\n");
3196                 return ERR_PTR(-ENOMEM);
3197         }
3198
3199         err = md_rdev_init(rdev);
3200         if (err)
3201                 goto abort_free;
3202         err = alloc_disk_sb(rdev);
3203         if (err)
3204                 goto abort_free;
3205
3206         err = lock_rdev(rdev, newdev, super_format == -2);
3207         if (err)
3208                 goto abort_free;
3209
3210         kobject_init(&rdev->kobj, &rdev_ktype);
3211
3212         size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
3213         if (!size) {
3214                 printk(KERN_WARNING
3215                         "md: %s has zero or unknown size, marking faulty!\n",
3216                         bdevname(rdev->bdev,b));
3217                 err = -EINVAL;
3218                 goto abort_free;
3219         }
3220
3221         if (super_format >= 0) {
3222                 err = super_types[super_format].
3223                         load_super(rdev, NULL, super_minor);
3224                 if (err == -EINVAL) {
3225                         printk(KERN_WARNING
3226                                 "md: %s does not have a valid v%d.%d "
3227                                "superblock, not importing!\n",
3228                                 bdevname(rdev->bdev,b),
3229                                super_format, super_minor);
3230                         goto abort_free;
3231                 }
3232                 if (err < 0) {
3233                         printk(KERN_WARNING
3234                                 "md: could not read %s's sb, not importing!\n",
3235                                 bdevname(rdev->bdev,b));
3236                         goto abort_free;
3237                 }
3238         }
3239
3240         return rdev;
3241
3242 abort_free:
3243         if (rdev->bdev)
3244                 unlock_rdev(rdev);
3245         md_rdev_clear(rdev);
3246         kfree(rdev);
3247         return ERR_PTR(err);
3248 }
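/*
 * On any failure the partially constructed rdev is unlocked (if a bdev
 * was claimed), cleared and freed, and an ERR_PTR() is returned, so
 * callers are expected to check the result with IS_ERR() rather than
 * for NULL.
 */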
3249
3250 /*
3251  * Check a full RAID array for plausibility
3252  */
3253
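/*
 * analyze_sbs() loads the superblock of every member, remembers the
 * freshest one (in practice the one with the highest event count),
 * validates the array-wide metadata against it, and then kicks out any
 * device whose superblock is stale, inconsistent, or beyond max_disks.
 */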
3254 static void analyze_sbs(struct mddev *mddev)
3255 {
3256         int i;
3257         struct md_rdev *rdev, *freshest, *tmp;
3258         char b[BDEVNAME_SIZE];
3259
3260         freshest = NULL;
3261         rdev_for_each_safe(rdev, tmp, mddev)
3262                 switch (super_types[mddev->major_version].
3263                         load_super(rdev, freshest, mddev->minor_version)) {
3264                 case 1:
3265                         freshest = rdev;
3266                         break;
3267                 case 0:
3268                         break;
3269                 default:
3270                         printk(KERN_ERR
3271                                 "md: fatal superblock inconsistency in %s"
3272                                 " -- removing from array\n",
3273                                 bdevname(rdev->bdev,b));
3274                         md_kick_rdev_from_array(rdev);
3275                 }
3276
3277         super_types[mddev->major_version].
3278                 validate_super(mddev, freshest);
3279
3280         i = 0;
3281         rdev_for_each_safe(rdev, tmp, mddev) {
3282                 if (mddev->max_disks &&
3283                     (rdev->desc_nr >= mddev->max_disks ||
3284                      i > mddev->max_disks)) {
3285                         printk(KERN_WARNING
3286                                "md: %s: %s: only %d devices permitted\n",
3287                                mdname(mddev), bdevname(rdev->bdev, b),
3288                                mddev->max_disks);
3289                         md_kick_rdev_from_array(rdev);
3290                         continue;
3291                 }
3292                 if (rdev != freshest) {
3293                         if (super_types[mddev->major_version].
3294                             validate_super(mddev, rdev)) {
3295                                 printk(KERN_WARNING "md: kicking non-fresh %s"
3296                                         " from array!\n",
3297                                         bdevname(rdev->bdev,b));
3298                                 md_kick_rdev_from_array(rdev);
3299                                 continue;
3300                         }
3301                 }
3302                 if (mddev->level == LEVEL_MULTIPATH) {
3303                         rdev->desc_nr = i++;
3304                         rdev->raid_disk = rdev->desc_nr;
3305                         set_bit(In_sync, &rdev->flags);
3306                 } else if (rdev->raid_disk >=
3307                             (mddev->raid_disks - min(0, mddev->delta_disks)) &&
3308                            !test_bit(Journal, &rdev->flags)) {
3309                         rdev->raid_disk = -1;
3310                         clear_bit(In_sync, &rdev->flags);
3311                 }
3312         }
3313 }
3314
3315 /* Read a fixed-point number.
3316  * Numbers in sysfs attributes should be in "standard" units where
3317  * possible, so time should be in seconds.
3318  * However we internally use a much smaller unit such as
3319  * milliseconds or jiffies.
3320  * This function takes a decimal number with a possible fractional
3321  * component, and produces an integer which is the result of
3322  * multiplying that number by 10^'scale',
3323  * all without any floating-point arithmetic.
3324  */
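/*
 * For example, with scale 3 the string "1.5" yields 1500 and "0.125"
 * yields 125, so a value entered in seconds can be stored internally in
 * milliseconds without any floating-point arithmetic.
 */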
3325 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3326 {
3327         unsigned long result = 0;
3328         long decimals = -1;
3329         while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3330                 if (*cp == '.')
3331                         decimals = 0;
3332                 else if (decimals < scale) {
3333                         unsigned int value;
3334                         value = *cp - '0';
3335                         result = result * 10 + value;