git.kernelconcepts.de Git - karo-tx-linux.git/blobdiff - drivers/md/raid1.c
Merge remote-tracking branch 'jwb/next' into next
[karo-tx-linux.git] / drivers / md / raid1.c
index ede2461e79c51e8d3bd1ac04fcac4c7a974bbbbe..a368db2431a596020a98a3e64f67685ccef9c6cc 100644 (file)
@@ -135,7 +135,7 @@ out_free_pages:
                        put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
        j = -1;
 out_free_bio:
-       while ( ++j < pi->raid_disks )
+       while (++j < pi->raid_disks)
                bio_put(r1_bio->bios[j]);
        r1bio_pool_free(r1_bio, data);
        return NULL;
@@ -164,7 +164,7 @@ static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
 {
        int i;
 
-       for (i = 0; i < conf->raid_disks; i++) {
+       for (i = 0; i < conf->raid_disks * 2; i++) {
                struct bio **bio = r1_bio->bios + i;
                if (!BIO_SPECIAL(*bio))
                        bio_put(*bio);
@@ -185,7 +185,7 @@ static void put_buf(struct r1bio *r1_bio)
        struct r1conf *conf = r1_bio->mddev->private;
        int i;
 
-       for (i=0; i<conf->raid_disks; i++) {
+       for (i = 0; i < conf->raid_disks * 2; i++) {
                struct bio *bio = r1_bio->bios[i];
                if (bio->bi_end_io)
                        rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
@@ -277,13 +277,14 @@ static inline void update_head_pos(int disk, struct r1bio *r1_bio)
 static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
 {
        int mirror;
-       int raid_disks = r1_bio->mddev->raid_disks;
+       struct r1conf *conf = r1_bio->mddev->private;
+       int raid_disks = conf->raid_disks;
 
-       for (mirror = 0; mirror < raid_disks; mirror++)
+       for (mirror = 0; mirror < raid_disks * 2; mirror++)
                if (r1_bio->bios[mirror] == bio)
                        break;
 
-       BUG_ON(mirror == raid_disks);
+       BUG_ON(mirror == raid_disks * 2);
        update_head_pos(mirror, r1_bio);
 
        return mirror;
@@ -390,6 +391,11 @@ static void raid1_end_write_request(struct bio *bio, int error)
        if (!uptodate) {
                set_bit(WriteErrorSeen,
                        &conf->mirrors[mirror].rdev->flags);
+               if (!test_and_set_bit(WantReplacement,
+                                     &conf->mirrors[mirror].rdev->flags))
+                       set_bit(MD_RECOVERY_NEEDED, &
+                               conf->mddev->recovery);
+
                set_bit(R1BIO_WriteError, &r1_bio->state);
        } else {
                /*
@@ -505,7 +511,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                start_disk = conf->last_used;
        }
 
-       for (i = 0 ; i < conf->raid_disks ; i++) {
+       for (i = 0 ; i < conf->raid_disks * 2 ; i++) {
                sector_t dist;
                sector_t first_bad;
                int bad_sectors;
@@ -525,8 +531,17 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                if (test_bit(WriteMostly, &rdev->flags)) {
                        /* Don't balance among write-mostly, just
                         * use the first as a last resort */
-                       if (best_disk < 0)
+                       if (best_disk < 0) {
+                               if (is_badblock(rdev, this_sector, sectors,
+                                               &first_bad, &bad_sectors)) {
+                                       if (first_bad < this_sector)
+                                               /* Cannot use this */
+                                               continue;
+                                       best_good_sectors = first_bad - this_sector;
+                               } else
+                                       best_good_sectors = sectors;
                                best_disk = disk;
+                       }
                        continue;
                }
                /* This is a reasonable device to use.  It might
@@ -609,7 +624,7 @@ int md_raid1_congested(struct mddev *mddev, int bits)
                return 1;
 
        rcu_read_lock();
-       for (i = 0; i < mddev->raid_disks; i++) {
+       for (i = 0; i < conf->raid_disks; i++) {
                struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
                if (rdev && !test_bit(Faulty, &rdev->flags)) {
                        struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -974,7 +989,7 @@ read_again:
         */
        plugged = mddev_check_plugged(mddev);
 
-       disks = conf->raid_disks;
+       disks = conf->raid_disks * 2;
  retry_write:
        blocked_rdev = NULL;
        rcu_read_lock();
@@ -988,7 +1003,8 @@ read_again:
                }
                r1_bio->bios[i] = NULL;
                if (!rdev || test_bit(Faulty, &rdev->flags)) {
-                       set_bit(R1BIO_Degraded, &r1_bio->state);
+                       if (i < conf->raid_disks)
+                               set_bit(R1BIO_Degraded, &r1_bio->state);
                        continue;
                }
 
@@ -1263,6 +1279,25 @@ static int raid1_spare_active(struct mddev *mddev)
         */
        for (i = 0; i < conf->raid_disks; i++) {
                struct md_rdev *rdev = conf->mirrors[i].rdev;
+               struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
+               if (repl
+                   && repl->recovery_offset == MaxSector
+                   && !test_bit(Faulty, &repl->flags)
+                   && !test_and_set_bit(In_sync, &repl->flags)) {
+                       /* replacement has just become active */
+                       if (!rdev ||
+                           !test_and_clear_bit(In_sync, &rdev->flags))
+                               count++;
+                       if (rdev) {
+                               /* Replaced device not technically
+                                * faulty, but we need to be sure
+                                * it gets removed and never re-added
+                                */
+                               set_bit(Faulty, &rdev->flags);
+                               sysfs_notify_dirent_safe(
+                                       rdev->sysfs_state);
+                       }
+               }
                if (rdev
                    && !test_bit(Faulty, &rdev->flags)
                    && !test_and_set_bit(In_sync, &rdev->flags)) {
@@ -1286,7 +1321,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
        int mirror = 0;
        struct mirror_info *p;
        int first = 0;
-       int last = mddev->raid_disks - 1;
+       int last = conf->raid_disks - 1;
 
        if (mddev->recovery_disabled == conf->recovery_disabled)
                return -EBUSY;
@@ -1294,8 +1329,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
        if (rdev->raid_disk >= 0)
                first = last = rdev->raid_disk;
 
-       for (mirror = first; mirror <= last; mirror++)
-               if ( !(p=conf->mirrors+mirror)->rdev) {
+       for (mirror = first; mirror <= last; mirror++) {
+               p = conf->mirrors+mirror;
+               if (!p->rdev) {
 
                        disk_stack_limits(mddev->gendisk, rdev->bdev,
                                          rdev->data_offset << 9);
@@ -1322,21 +1358,35 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
                        rcu_assign_pointer(p->rdev, rdev);
                        break;
                }
+               if (test_bit(WantReplacement, &p->rdev->flags) &&
+                   p[conf->raid_disks].rdev == NULL) {
+                       /* Add this device as a replacement */
+                       clear_bit(In_sync, &rdev->flags);
+                       set_bit(Replacement, &rdev->flags);
+                       rdev->raid_disk = mirror;
+                       err = 0;
+                       conf->fullsync = 1;
+                       rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
+                       break;
+               }
+       }
        md_integrity_add_rdev(rdev, mddev);
        print_conf(conf);
        return err;
 }
 
-static int raid1_remove_disk(struct mddev *mddev, int number)
+static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
        struct r1conf *conf = mddev->private;
        int err = 0;
-       struct md_rdev *rdev;
+       int number = rdev->raid_disk;
        struct mirror_info *p = conf->mirrors+ number;
 
+       if (rdev != p->rdev)
+               p = conf->mirrors + conf->raid_disks + number;
+
        print_conf(conf);
-       rdev = p->rdev;
-       if (rdev) {
+       if (rdev == p->rdev) {
                if (test_bit(In_sync, &rdev->flags) ||
                    atomic_read(&rdev->nr_pending)) {
                        err = -EBUSY;
@@ -1358,7 +1408,21 @@ static int raid1_remove_disk(struct mddev *mddev, int number)
                        err = -EBUSY;
                        p->rdev = rdev;
                        goto abort;
-               }
+               } else if (conf->mirrors[conf->raid_disks + number].rdev) {
+                       /* We just removed a device that is being replaced.
+                        * Move down the replacement.  We drain all IO before
+                        * doing this to avoid confusion.
+                        */
+                       struct md_rdev *repl =
+                               conf->mirrors[conf->raid_disks + number].rdev;
+                       raise_barrier(conf);
+                       clear_bit(Replacement, &repl->flags);
+                       p->rdev = repl;
+                       conf->mirrors[conf->raid_disks + number].rdev = NULL;
+                       lower_barrier(conf);
+                       clear_bit(WantReplacement, &rdev->flags);
+               } else
+                       clear_bit(WantReplacement, &rdev->flags);
                err = md_integrity_register(mddev);
        }
 abort:
@@ -1411,6 +1475,10 @@ static void end_sync_write(struct bio *bio, int error)
                } while (sectors_to_go > 0);
                set_bit(WriteErrorSeen,
                        &conf->mirrors[mirror].rdev->flags);
+               if (!test_and_set_bit(WantReplacement,
+                                     &conf->mirrors[mirror].rdev->flags))
+                       set_bit(MD_RECOVERY_NEEDED, &
+                               mddev->recovery);
                set_bit(R1BIO_WriteError, &r1_bio->state);
        } else if (is_badblock(conf->mirrors[mirror].rdev,
                               r1_bio->sector,
@@ -1441,8 +1509,13 @@ static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
        if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
                /* success */
                return 1;
-       if (rw == WRITE)
+       if (rw == WRITE) {
                set_bit(WriteErrorSeen, &rdev->flags);
+               if (!test_and_set_bit(WantReplacement,
+                                     &rdev->flags))
+                       set_bit(MD_RECOVERY_NEEDED, &
+                               rdev->mddev->recovery);
+       }
        /* need to record an error - either for the block or the device */
        if (!rdev_set_badblocks(rdev, sector, sectors, 0))
                md_error(rdev->mddev, rdev);
@@ -1493,7 +1566,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
                                }
                        }
                        d++;
-                       if (d == conf->raid_disks)
+                       if (d == conf->raid_disks * 2)
                                d = 0;
                } while (!success && d != r1_bio->read_disk);
 
@@ -1510,7 +1583,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
                               mdname(mddev),
                               bdevname(bio->bi_bdev, b),
                               (unsigned long long)r1_bio->sector);
-                       for (d = 0; d < conf->raid_disks; d++) {
+                       for (d = 0; d < conf->raid_disks * 2; d++) {
                                rdev = conf->mirrors[d].rdev;
                                if (!rdev || test_bit(Faulty, &rdev->flags))
                                        continue;
@@ -1536,7 +1609,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
                /* write it back and re-read */
                while (d != r1_bio->read_disk) {
                        if (d == 0)
-                               d = conf->raid_disks;
+                               d = conf->raid_disks * 2;
                        d--;
                        if (r1_bio->bios[d]->bi_end_io != end_sync_read)
                                continue;
@@ -1551,7 +1624,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
                d = start;
                while (d != r1_bio->read_disk) {
                        if (d == 0)
-                               d = conf->raid_disks;
+                               d = conf->raid_disks * 2;
                        d--;
                        if (r1_bio->bios[d]->bi_end_io != end_sync_read)
                                continue;
@@ -1584,7 +1657,7 @@ static int process_checks(struct r1bio *r1_bio)
        int primary;
        int i;
 
-       for (primary = 0; primary < conf->raid_disks; primary++)
+       for (primary = 0; primary < conf->raid_disks * 2; primary++)
                if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
                    test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
                        r1_bio->bios[primary]->bi_end_io = NULL;
@@ -1592,7 +1665,7 @@ static int process_checks(struct r1bio *r1_bio)
                        break;
                }
        r1_bio->read_disk = primary;
-       for (i = 0; i < conf->raid_disks; i++) {
+       for (i = 0; i < conf->raid_disks * 2; i++) {
                int j;
                int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
                struct bio *pbio = r1_bio->bios[primary];
@@ -1656,7 +1729,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
 {
        struct r1conf *conf = mddev->private;
        int i;
-       int disks = conf->raid_disks;
+       int disks = conf->raid_disks * 2;
        struct bio *bio, *wbio;
 
        bio = r1_bio->bios[r1_bio->read_disk];
@@ -1737,7 +1810,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
                                success = 1;
                        else {
                                d++;
-                               if (d == conf->raid_disks)
+                               if (d == conf->raid_disks * 2)
                                        d = 0;
                        }
                } while (!success && d != read_disk);
@@ -1753,7 +1826,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
                start = d;
                while (d != read_disk) {
                        if (d==0)
-                               d = conf->raid_disks;
+                               d = conf->raid_disks * 2;
                        d--;
                        rdev = conf->mirrors[d].rdev;
                        if (rdev &&
@@ -1765,7 +1838,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
                while (d != read_disk) {
                        char b[BDEVNAME_SIZE];
                        if (d==0)
-                               d = conf->raid_disks;
+                               d = conf->raid_disks * 2;
                        d--;
                        rdev = conf->mirrors[d].rdev;
                        if (rdev &&
@@ -1887,7 +1960,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
 {
        int m;
        int s = r1_bio->sectors;
-       for (m = 0; m < conf->raid_disks ; m++) {
+       for (m = 0; m < conf->raid_disks * 2 ; m++) {
                struct md_rdev *rdev = conf->mirrors[m].rdev;
                struct bio *bio = r1_bio->bios[m];
                if (bio->bi_end_io == NULL)
@@ -1909,7 +1982,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
 static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 {
        int m;
-       for (m = 0; m < conf->raid_disks ; m++)
+       for (m = 0; m < conf->raid_disks * 2 ; m++)
                if (r1_bio->bios[m] == IO_MADE_GOOD) {
                        struct md_rdev *rdev = conf->mirrors[m].rdev;
                        rdev_clear_badblocks(rdev,
@@ -2184,7 +2257,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
        r1_bio->state = 0;
        set_bit(R1BIO_IsSync, &r1_bio->state);
 
-       for (i=0; i < conf->raid_disks; i++) {
+       for (i = 0; i < conf->raid_disks * 2; i++) {
                struct md_rdev *rdev;
                bio = r1_bio->bios[i];
 
@@ -2203,7 +2276,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
                rdev = rcu_dereference(conf->mirrors[i].rdev);
                if (rdev == NULL ||
                    test_bit(Faulty, &rdev->flags)) {
-                       still_degraded = 1;
+                       if (i < conf->raid_disks)
+                               still_degraded = 1;
                } else if (!test_bit(In_sync, &rdev->flags)) {
                        bio->bi_rw = WRITE;
                        bio->bi_end_io = end_sync_write;
@@ -2254,7 +2328,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
                 * need to mark them bad on all write targets
                 */
                int ok = 1;
-               for (i = 0 ; i < conf->raid_disks ; i++)
+               for (i = 0 ; i < conf->raid_disks * 2 ; i++)
                        if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
                                struct md_rdev *rdev =
                                        rcu_dereference(conf->mirrors[i].rdev);
@@ -2323,7 +2397,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
                                len = sync_blocks<<9;
                }
 
-               for (i=0 ; i < conf->raid_disks; i++) {
+               for (i = 0 ; i < conf->raid_disks * 2; i++) {
                        bio = r1_bio->bios[i];
                        if (bio->bi_end_io) {
                                page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
@@ -2356,7 +2430,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
         */
        if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
                atomic_set(&r1_bio->remaining, read_targets);
-               for (i=0; i<conf->raid_disks; i++) {
+               for (i = 0; i < conf->raid_disks * 2; i++) {
                        bio = r1_bio->bios[i];
                        if (bio->bi_end_io == end_sync_read) {
                                md_sync_acct(bio->bi_bdev, nr_sectors);
@@ -2393,7 +2467,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
        if (!conf)
                goto abort;
 
-       conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
+       conf->mirrors = kzalloc(sizeof(struct mirror_info)
+                               * mddev->raid_disks * 2,
                                 GFP_KERNEL);
        if (!conf->mirrors)
                goto abort;
@@ -2405,7 +2480,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
        conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
        if (!conf->poolinfo)
                goto abort;
-       conf->poolinfo->raid_disks = mddev->raid_disks;
+       conf->poolinfo->raid_disks = mddev->raid_disks * 2;
        conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
                                          r1bio_pool_free,
                                          conf->poolinfo);
@@ -2414,14 +2489,20 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 
        conf->poolinfo->mddev = mddev;
 
+       err = -EINVAL;
        spin_lock_init(&conf->device_lock);
        list_for_each_entry(rdev, &mddev->disks, same_set) {
                int disk_idx = rdev->raid_disk;
                if (disk_idx >= mddev->raid_disks
                    || disk_idx < 0)
                        continue;
-               disk = conf->mirrors + disk_idx;
+               if (test_bit(Replacement, &rdev->flags))
+                       disk = conf->mirrors + conf->raid_disks + disk_idx;
+               else
+                       disk = conf->mirrors + disk_idx;
 
+               if (disk->rdev)
+                       goto abort;
                disk->rdev = rdev;
 
                disk->head_position = 0;
@@ -2437,11 +2518,27 @@ static struct r1conf *setup_conf(struct mddev *mddev)
        conf->pending_count = 0;
        conf->recovery_disabled = mddev->recovery_disabled - 1;
 
+       err = -EIO;
        conf->last_used = -1;
-       for (i = 0; i < conf->raid_disks; i++) {
+       for (i = 0; i < conf->raid_disks * 2; i++) {
 
                disk = conf->mirrors + i;
 
+               if (i < conf->raid_disks &&
+                   disk[conf->raid_disks].rdev) {
+                       /* This slot has a replacement. */
+                       if (!disk->rdev) {
+                               /* No original, just make the replacement
+                                * a recovering spare
+                                */
+                               disk->rdev =
+                                       disk[conf->raid_disks].rdev;
+                               disk[conf->raid_disks].rdev = NULL;
+                       } else if (!test_bit(In_sync, &disk->rdev->flags))
+                               /* Original is not in_sync - bad */
+                               goto abort;
+               }
+
                if (!disk->rdev ||
                    !test_bit(In_sync, &disk->rdev->flags)) {
                        disk->head_position = 0;
@@ -2455,7 +2552,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
                        conf->last_used = i;
        }
 
-       err = -EIO;
        if (conf->last_used < 0) {
                printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
                       mdname(mddev));
@@ -2665,7 +2761,7 @@ static int raid1_reshape(struct mddev *mddev)
        if (!newpoolinfo)
                return -ENOMEM;
        newpoolinfo->mddev = mddev;
-       newpoolinfo->raid_disks = raid_disks;
+       newpoolinfo->raid_disks = raid_disks * 2;
 
        newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
                                 r1bio_pool_free, newpoolinfo);
@@ -2673,7 +2769,8 @@ static int raid1_reshape(struct mddev *mddev)
                kfree(newpoolinfo);
                return -ENOMEM;
        }
-       newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
+       newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2,
+                            GFP_KERNEL);
        if (!newmirrors) {
                kfree(newpoolinfo);
                mempool_destroy(newpool);