md: Avoid waking up a thread after it has been freed.
diff --git a/drivers/md/md.c b/drivers/md/md.c
index dfc9425db70bc7214f0735a4e3b8c9f5cf366176..5c95ccb595007cab1aba9859cc4da62cad80536f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
 static void autostart_arrays(int part);
 #endif
 
+/* pers_list is a list of registered personalities protected
+ * by pers_lock.
+ * pers_lock does extra service to protect accesses to
+ * mddev->thread when the mutex cannot be held.
+ */
 static LIST_HEAD(pers_list);
 static DEFINE_SPINLOCK(pers_lock);
 
@@ -215,6 +220,55 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
 }
 EXPORT_SYMBOL_GPL(bio_clone_mddev);
 
+void md_trim_bio(struct bio *bio, int offset, int size)
+{
+       /* 'bio' is a cloned bio which we need to trim to match
+        * the given offset and size.
+        * This requires adjusting bi_sector, bi_size, and bi_io_vec
+        */
+       int i;
+       struct bio_vec *bvec;
+       int sofar = 0;
+
+       size <<= 9;
+       if (offset == 0 && size == bio->bi_size)
+               return;
+
+       bio->bi_sector += offset;
+       bio->bi_size = size;
+       offset <<= 9;
+       clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+
+       while (bio->bi_idx < bio->bi_vcnt &&
+              bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
+               /* remove this whole bio_vec */
+               offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
+               bio->bi_idx++;
+       }
+       if (bio->bi_idx < bio->bi_vcnt) {
+               bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
+               bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
+       }
+       /* avoid any complications with bi_idx being non-zero*/
+       if (bio->bi_idx) {
+               memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
+                       (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
+               bio->bi_vcnt -= bio->bi_idx;
+               bio->bi_idx = 0;
+       }
+       /* Make sure vcnt and last bv are not too big */
+       bio_for_each_segment(bvec, bio, i) {
+               if (sofar + bvec->bv_len > size)
+                       bvec->bv_len = size - sofar;
+               if (bvec->bv_len == 0) {
+                       bio->bi_vcnt = i;
+                       break;
+               }
+               sofar += bvec->bv_len;
+       }
+}
+EXPORT_SYMBOL_GPL(md_trim_bio);
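For illustration only (a sketch, not part of the patch): a caller that wants to
resubmit just part of a request could clone the incoming bio and trim the clone.
'skip' and 'len' are placeholder variables here; both the offset and the size
arguments are in 512-byte sectors, relative to the clone.

	struct bio *split = bio_clone_mddev(bio, GFP_NOIO, mddev);
	/* keep 'len' sectors of the clone, starting 'skip' sectors in */
	md_trim_bio(split, skip, len);
	generic_make_request(split);
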
+
 /*
  * We have a system wide 'event count' that is incremented
  * on any 'interesting' event, and readers of /proc/mdstat
@@ -690,7 +744,12 @@ static void mddev_unlock(mddev_t * mddev)
        } else
                mutex_unlock(&mddev->reconfig_mutex);
 
+       /* As we've dropped the mutex we need a spinlock to
+        * make sure the thread doesn't disappear
+        */
+       spin_lock(&pers_lock);
        md_wakeup_thread(mddev->thread);
+       spin_unlock(&pers_lock);
 }
 
 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
@@ -757,6 +816,10 @@ static void free_disk_sb(mdk_rdev_t * rdev)
                rdev->sb_start = 0;
                rdev->sectors = 0;
        }
+       if (rdev->bb_page) {
+               put_page(rdev->bb_page);
+               rdev->bb_page = NULL;
+       }
 }
 
 
@@ -795,7 +858,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
        bio->bi_end_io = super_written;
 
        atomic_inc(&mddev->pending_writes);
-       submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio);
+       submit_bio(WRITE_FLUSH_FUA, bio);
 }
 
 void md_super_wait(mddev_t *mddev)
@@ -1025,7 +1088,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
        ret = -EINVAL;
 
        bdevname(rdev->bdev, b);
-       sb = (mdp_super_t*)page_address(rdev->sb_page);
+       sb = page_address(rdev->sb_page);
 
        if (sb->md_magic != MD_SB_MAGIC) {
                printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
@@ -1054,6 +1117,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
        rdev->preferred_minor = sb->md_minor;
        rdev->data_offset = 0;
        rdev->sb_size = MD_SB_BYTES;
+       rdev->badblocks.shift = -1;
 
        if (sb->level == LEVEL_MULTIPATH)
                rdev->desc_nr = -1;
@@ -1064,7 +1128,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
                ret = 1;
        } else {
                __u64 ev1, ev2;
-               mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
+               mdp_super_t *refsb = page_address(refdev->sb_page);
                if (!uuid_equal(refsb, sb)) {
                        printk(KERN_WARNING "md: %s has different UUID to %s\n",
                                b, bdevname(refdev->bdev,b2));
@@ -1084,8 +1148,11 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
                        ret = 0;
        }
        rdev->sectors = rdev->sb_start;
+       /* Limit to 4TB as metadata cannot record more than that */
+       if (rdev->sectors >= (2ULL << 32))
+               rdev->sectors = (2ULL << 32) - 2;
 
-       if (rdev->sectors < sb->size * 2 && sb->level > 1)
+       if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
                /* "this cannot possibly happen" ... */
                ret = -EINVAL;
 
@@ -1099,7 +1166,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 {
        mdp_disk_t *desc;
-       mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
+       mdp_super_t *sb = page_address(rdev->sb_page);
        __u64 ev1 = md_event(sb);
 
        rdev->raid_disk = -1;
@@ -1119,7 +1186,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                mddev->clevel[0] = 0;
                mddev->layout = sb->layout;
                mddev->raid_disks = sb->raid_disks;
-               mddev->dev_sectors = sb->size * 2;
+               mddev->dev_sectors = ((sector_t)sb->size) * 2;
                mddev->events = ev1;
                mddev->bitmap_info.offset = 0;
                mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
@@ -1230,7 +1297,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
        rdev->sb_size = MD_SB_BYTES;
 
-       sb = (mdp_super_t*)page_address(rdev->sb_page);
+       sb = page_address(rdev->sb_page);
 
        memset(sb, 0, sizeof(*sb));
 
@@ -1361,6 +1428,11 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
        rdev->sb_start = calc_dev_sboffset(rdev);
        if (!num_sectors || num_sectors > rdev->sb_start)
                num_sectors = rdev->sb_start;
+       /* Limit to 4TB as metadata cannot record more than that.
+        * 4TB == 2^32 KB, or 2*2^32 sectors.
+        */
+       if (num_sectors >= (2ULL << 32))
+               num_sectors = (2ULL << 32) - 2;
        md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
                       rdev->sb_page);
        md_super_wait(rdev->mddev);
@@ -1395,6 +1467,8 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
        return cpu_to_le32(csum);
 }
 
+static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
+                           int acknowledged);
 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 {
        struct mdp_superblock_1 *sb;
@@ -1435,7 +1509,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
        if (ret) return ret;
 
 
-       sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+       sb = page_address(rdev->sb_page);
 
        if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
            sb->major_version != cpu_to_le32(1) ||
@@ -1473,12 +1547,52 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
        else
                rdev->desc_nr = le32_to_cpu(sb->dev_number);
 
+       if (!rdev->bb_page) {
+               rdev->bb_page = alloc_page(GFP_KERNEL);
+               if (!rdev->bb_page)
+                       return -ENOMEM;
+       }
+       if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
+           rdev->badblocks.count == 0) {
+               /* need to load the bad block list.
+                * Currently we limit it to one page.
+                */
+               s32 offset;
+               sector_t bb_sector;
+               u64 *bbp;
+               int i;
+               int sectors = le16_to_cpu(sb->bblog_size);
+               if (sectors > (PAGE_SIZE / 512))
+                       return -EINVAL;
+               offset = le32_to_cpu(sb->bblog_offset);
+               if (offset == 0)
+                       return -EINVAL;
+               bb_sector = (long long)offset;
+               if (!sync_page_io(rdev, bb_sector, sectors << 9,
+                                 rdev->bb_page, READ, true))
+                       return -EIO;
+               bbp = (u64 *)page_address(rdev->bb_page);
+               rdev->badblocks.shift = sb->bblog_shift;
+               for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
+                       u64 bb = le64_to_cpu(*bbp);
+                       int count = bb & (0x3ff);
+                       u64 sector = bb >> 10;
+                       sector <<= sb->bblog_shift;
+                       count <<= sb->bblog_shift;
+                       if (bb + 1 == 0)
+                               break;
+                       if (md_set_badblocks(&rdev->badblocks,
+                                            sector, count, 1) == 0)
+                               return -EINVAL;
+               }
+       } else if (sb->bblog_offset == 0)
+               rdev->badblocks.shift = -1;
+
        if (!refdev) {
                ret = 1;
        } else {
                __u64 ev1, ev2;
-               struct mdp_superblock_1 *refsb = 
-                       (struct mdp_superblock_1*)page_address(refdev->sb_page);
+               struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
 
                if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
                    sb->level != refsb->level ||
@@ -1513,7 +1627,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 
 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 {
-       struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+       struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
        __u64 ev1 = le64_to_cpu(sb->events);
 
        rdev->raid_disk = -1;
@@ -1619,13 +1733,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
        int max_dev, i;
        /* make rdev->sb match mddev and rdev data. */
 
-       sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+       sb = page_address(rdev->sb_page);
 
        sb->feature_map = 0;
        sb->pad0 = 0;
        sb->recovery_offset = cpu_to_le64(0);
        memset(sb->pad1, 0, sizeof(sb->pad1));
-       memset(sb->pad2, 0, sizeof(sb->pad2));
        memset(sb->pad3, 0, sizeof(sb->pad3));
 
        sb->utime = cpu_to_le64((__u64)mddev->utime);
@@ -1643,6 +1756,11 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
        sb->level = cpu_to_le32(mddev->level);
        sb->layout = cpu_to_le32(mddev->layout);
 
+       if (test_bit(WriteMostly, &rdev->flags))
+               sb->devflags |= WriteMostly1;
+       else
+               sb->devflags &= ~WriteMostly1;
+
        if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
                sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
                sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
@@ -1665,6 +1783,40 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
                sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
        }
 
+       if (rdev->badblocks.count == 0)
+               /* Nothing to do for bad blocks*/ ;
+       else if (sb->bblog_offset == 0)
+               /* Cannot record bad blocks on this device */
+               md_error(mddev, rdev);
+       else {
+               struct badblocks *bb = &rdev->badblocks;
+               u64 *bbp = (u64 *)page_address(rdev->bb_page);
+               u64 *p = bb->page;
+               sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
+               if (bb->changed) {
+                       unsigned seq;
+
+retry:
+                       seq = read_seqbegin(&bb->lock);
+
+                       memset(bbp, 0xff, PAGE_SIZE);
+
+                       for (i = 0 ; i < bb->count ; i++) {
+                               u64 internal_bb = *p++;
+                               u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
+                                               | BB_LEN(internal_bb));
+                               *bbp++ = cpu_to_le64(store_bb);
+                       }
+                       if (read_seqretry(&bb->lock, seq))
+                               goto retry;
+
+                       bb->sector = (rdev->sb_start +
+                                     (int)le32_to_cpu(sb->bblog_offset));
+                       bb->size = le16_to_cpu(sb->bblog_size);
+                       bb->changed = 0;
+               }
+       }
+
        max_dev = 0;
        list_for_each_entry(rdev2, &mddev->disks, same_set)
                if (rdev2->desc_nr+1 > max_dev)
@@ -1724,7 +1876,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
                        num_sectors = max_sectors;
                rdev->sb_start = sb_start;
        }
-       sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
+       sb = page_address(rdev->sb_page);
        sb->data_size = cpu_to_le64(num_sectors);
        sb->super_offset = rdev->sb_start;
        sb->sb_csum = calc_sb_1_csum(sb);
@@ -1922,7 +2074,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
        bd_link_disk_holder(rdev->bdev, mddev->gendisk);
 
        /* May as well allow recovery to be retried once */
-       mddev->recovery_disabled = 0;
+       mddev->recovery_disabled++;
 
        return 0;
 
@@ -1953,6 +2105,9 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
        sysfs_remove_link(&rdev->kobj, "block");
        sysfs_put(rdev->sysfs_state);
        rdev->sysfs_state = NULL;
+       kfree(rdev->badblocks.page);
+       rdev->badblocks.count = 0;
+       rdev->badblocks.page = NULL;
        /* We need to delay this, otherwise we can deadlock when
         * writing to 'remove' to "dev/state".  We also need
         * to delay it due to rcu usage.
@@ -2127,10 +2282,10 @@ static void print_rdev(mdk_rdev_t *rdev, int major_version)
                printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
                switch (major_version) {
                case 0:
-                       print_sb_90((mdp_super_t*)page_address(rdev->sb_page));
+                       print_sb_90(page_address(rdev->sb_page));
                        break;
                case 1:
-                       print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page));
+                       print_sb_1(page_address(rdev->sb_page));
                        break;
                }
        } else
@@ -2194,6 +2349,7 @@ static void md_update_sb(mddev_t * mddev, int force_change)
        mdk_rdev_t *rdev;
        int sync_req;
        int nospares = 0;
+       int any_badblocks_changed = 0;
 
 repeat:
        /* First make sure individual recovery_offsets are correct */
@@ -2208,8 +2364,18 @@ repeat:
        if (!mddev->persistent) {
                clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
                clear_bit(MD_CHANGE_DEVS, &mddev->flags);
-               if (!mddev->external)
+               if (!mddev->external) {
                        clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+                       list_for_each_entry(rdev, &mddev->disks, same_set) {
+                               if (rdev->badblocks.changed) {
+                                       md_ack_all_badblocks(&rdev->badblocks);
+                                       md_error(mddev, rdev);
+                               }
+                               clear_bit(Blocked, &rdev->flags);
+                               clear_bit(BlockedBadBlocks, &rdev->flags);
+                               wake_up(&rdev->blocked_wait);
+                       }
+               }
                wake_up(&mddev->sb_wait);
                return;
        }
@@ -2265,6 +2431,14 @@ repeat:
                MD_BUG();
                mddev->events --;
        }
+
+       list_for_each_entry(rdev, &mddev->disks, same_set) {
+               if (rdev->badblocks.changed)
+                       any_badblocks_changed++;
+               if (test_bit(Faulty, &rdev->flags))
+                       set_bit(FaultRecorded, &rdev->flags);
+       }
+
        sync_sbs(mddev, nospares);
        spin_unlock_irq(&mddev->write_lock);
 
@@ -2290,6 +2464,13 @@ repeat:
                                bdevname(rdev->bdev,b),
                                (unsigned long long)rdev->sb_start);
                        rdev->sb_events = mddev->events;
+                       if (rdev->badblocks.size) {
+                               md_super_write(mddev, rdev,
+                                              rdev->badblocks.sector,
+                                              rdev->badblocks.size << 9,
+                                              rdev->bb_page);
+                               rdev->badblocks.size = 0;
+                       }
 
                } else
                        dprintk(")\n");
@@ -2313,6 +2494,15 @@ repeat:
        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
                sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 
+       list_for_each_entry(rdev, &mddev->disks, same_set) {
+               if (test_and_clear_bit(FaultRecorded, &rdev->flags))
+                       clear_bit(Blocked, &rdev->flags);
+
+               if (any_badblocks_changed)
+                       md_ack_all_badblocks(&rdev->badblocks);
+               clear_bit(BlockedBadBlocks, &rdev->flags);
+               wake_up(&rdev->blocked_wait);
+       }
 }
 
 /* words written to sysfs files may, or may not, be \n terminated.
@@ -2347,7 +2537,8 @@ state_show(mdk_rdev_t *rdev, char *page)
        char *sep = "";
        size_t len = 0;
 
-       if (test_bit(Faulty, &rdev->flags)) {
+       if (test_bit(Faulty, &rdev->flags) ||
+           rdev->badblocks.unacked_exist) {
                len+= sprintf(page+len, "%sfaulty",sep);
                sep = ",";
        }
@@ -2359,7 +2550,8 @@ state_show(mdk_rdev_t *rdev, char *page)
                len += sprintf(page+len, "%swrite_mostly",sep);
                sep = ",";
        }
-       if (test_bit(Blocked, &rdev->flags)) {
+       if (test_bit(Blocked, &rdev->flags) ||
+           rdev->badblocks.unacked_exist) {
                len += sprintf(page+len, "%sblocked", sep);
                sep = ",";
        }
@@ -2368,6 +2560,10 @@ state_show(mdk_rdev_t *rdev, char *page)
                len += sprintf(page+len, "%sspare", sep);
                sep = ",";
        }
+       if (test_bit(WriteErrorSeen, &rdev->flags)) {
+               len += sprintf(page+len, "%swrite_error", sep);
+               sep = ",";
+       }
        return len+sprintf(page+len, "\n");
 }
 
@@ -2375,18 +2571,23 @@ static ssize_t
 state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 {
        /* can write
-        *  faulty  - simulates and error
+        *  faulty  - simulates an error
         *  remove  - disconnects the device
         *  writemostly - sets write_mostly
         *  -writemostly - clears write_mostly
-        *  blocked - sets the Blocked flag
-        *  -blocked - clears the Blocked flag
+        *  blocked - sets the Blocked flags
+        *  -blocked - clears the Blocked and possibly simulates an error
         *  insync - sets Insync providing device isn't active
+        *  write_error - sets WriteErrorSeen
+        *  -write_error - clears WriteErrorSeen
         */
        int err = -EINVAL;
        if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
                md_error(rdev->mddev, rdev);
-               err = 0;
+               if (test_bit(Faulty, &rdev->flags))
+                       err = 0;
+               else
+                       err = -EBUSY;
        } else if (cmd_match(buf, "remove")) {
                if (rdev->raid_disk >= 0)
                        err = -EBUSY;
@@ -2408,7 +2609,15 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
                set_bit(Blocked, &rdev->flags);
                err = 0;
        } else if (cmd_match(buf, "-blocked")) {
+               if (!test_bit(Faulty, &rdev->flags) &&
+                   rdev->badblocks.unacked_exist) {
+                       /* metadata handler doesn't understand badblocks,
+                        * so we need to fail the device
+                        */
+                       md_error(rdev->mddev, rdev);
+               }
                clear_bit(Blocked, &rdev->flags);
+               clear_bit(BlockedBadBlocks, &rdev->flags);
                wake_up(&rdev->blocked_wait);
                set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
                md_wakeup_thread(rdev->mddev->thread);
@@ -2417,6 +2626,12 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
        } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
                set_bit(In_sync, &rdev->flags);
                err = 0;
+       } else if (cmd_match(buf, "write_error")) {
+               set_bit(WriteErrorSeen, &rdev->flags);
+               err = 0;
+       } else if (cmd_match(buf, "-write_error")) {
+               clear_bit(WriteErrorSeen, &rdev->flags);
+               err = 0;
        }
        if (!err)
                sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -2459,7 +2674,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 {
        char *e;
        int err;
-       char nm[20];
        int slot = simple_strtoul(buf, &e, 10);
        if (strncmp(buf, "none", 4)==0)
                slot = -1;
@@ -2482,8 +2696,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
                        hot_remove_disk(rdev->mddev, rdev->raid_disk);
                if (err)
                        return err;
-               sprintf(nm, "rd%d", rdev->raid_disk);
-               sysfs_remove_link(&rdev->mddev->kobj, nm);
+               sysfs_unlink_rdev(rdev->mddev, rdev);
                rdev->raid_disk = -1;
                set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
                md_wakeup_thread(rdev->mddev->thread);
@@ -2522,8 +2735,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
                        return err;
                } else
                        sysfs_notify_dirent_safe(rdev->sysfs_state);
-               sprintf(nm, "rd%d", rdev->raid_disk);
-               if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
+               if (sysfs_link_rdev(rdev->mddev, rdev))
                        /* failure here is OK */;
                /* don't wakeup anyone, leave that to userspace. */
        } else {
@@ -2712,6 +2924,39 @@ static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t le
 static struct rdev_sysfs_entry rdev_recovery_start =
 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
 
+
+static ssize_t
+badblocks_show(struct badblocks *bb, char *page, int unack);
+static ssize_t
+badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
+
+static ssize_t bb_show(mdk_rdev_t *rdev, char *page)
+{
+       return badblocks_show(&rdev->badblocks, page, 0);
+}
+static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len)
+{
+       int rv = badblocks_store(&rdev->badblocks, page, len, 0);
+       /* Maybe that ack was all we needed */
+       if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
+               wake_up(&rdev->blocked_wait);
+       return rv;
+}
+static struct rdev_sysfs_entry rdev_bad_blocks =
+__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
+
+
+static ssize_t ubb_show(mdk_rdev_t *rdev, char *page)
+{
+       return badblocks_show(&rdev->badblocks, page, 1);
+}
+static ssize_t ubb_store(mdk_rdev_t *rdev, const char *page, size_t len)
+{
+       return badblocks_store(&rdev->badblocks, page, len, 1);
+}
+static struct rdev_sysfs_entry rdev_unack_bad_blocks =
+__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
+
 static struct attribute *rdev_default_attrs[] = {
        &rdev_state.attr,
        &rdev_errors.attr,
@@ -2719,6 +2964,8 @@ static struct attribute *rdev_default_attrs[] = {
        &rdev_offset.attr,
        &rdev_size.attr,
        &rdev_recovery_start.attr,
+       &rdev_bad_blocks.attr,
+       &rdev_unack_bad_blocks.attr,
        NULL,
 };
 static ssize_t
@@ -2782,7 +3029,7 @@ static struct kobj_type rdev_ktype = {
        .default_attrs  = rdev_default_attrs,
 };
 
-void md_rdev_init(mdk_rdev_t *rdev)
+int md_rdev_init(mdk_rdev_t *rdev)
 {
        rdev->desc_nr = -1;
        rdev->saved_raid_disk = -1;
@@ -2792,12 +3039,27 @@ void md_rdev_init(mdk_rdev_t *rdev)
        rdev->sb_events = 0;
        rdev->last_read_error.tv_sec  = 0;
        rdev->last_read_error.tv_nsec = 0;
+       rdev->sb_loaded = 0;
+       rdev->bb_page = NULL;
        atomic_set(&rdev->nr_pending, 0);
        atomic_set(&rdev->read_errors, 0);
        atomic_set(&rdev->corrected_errors, 0);
 
        INIT_LIST_HEAD(&rdev->same_set);
        init_waitqueue_head(&rdev->blocked_wait);
+
+       /* Add space to store bad block list.
+        * This reserves the space even on arrays where it cannot
+        * be used - I wonder if that matters
+        */
+       rdev->badblocks.count = 0;
+       rdev->badblocks.shift = 0;
+       rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
+       seqlock_init(&rdev->badblocks.lock);
+       if (rdev->badblocks.page == NULL)
+               return -ENOMEM;
+
+       return 0;
 }
 EXPORT_SYMBOL_GPL(md_rdev_init);
 /*
@@ -2823,8 +3085,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
                return ERR_PTR(-ENOMEM);
        }
 
-       md_rdev_init(rdev);
-       if ((err = alloc_disk_sb(rdev)))
+       err = md_rdev_init(rdev);
+       if (err)
+               goto abort_free;
+       err = alloc_disk_sb(rdev);
+       if (err)
                goto abort_free;
 
        err = lock_rdev(rdev, newdev, super_format == -2);
@@ -2860,15 +3125,17 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
                        goto abort_free;
                }
        }
+       if (super_format == -1)
+               /* hot-add for 0.90, or non-persistent: so no badblocks */
+               rdev->badblocks.shift = -1;
 
        return rdev;
 
 abort_free:
-       if (rdev->sb_page) {
-               if (rdev->bdev)
-                       unlock_rdev(rdev);
-               free_disk_sb(rdev);
-       }
+       if (rdev->bdev)
+               unlock_rdev(rdev);
+       free_disk_sb(rdev);
+       kfree(rdev->badblocks.page);
        kfree(rdev);
        return ERR_PTR(err);
 }
@@ -3149,15 +3416,13 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
        }
 
        list_for_each_entry(rdev, &mddev->disks, same_set) {
-               char nm[20];
                if (rdev->raid_disk < 0)
                        continue;
                if (rdev->new_raid_disk >= mddev->raid_disks)
                        rdev->new_raid_disk = -1;
                if (rdev->new_raid_disk == rdev->raid_disk)
                        continue;
-               sprintf(nm, "rd%d", rdev->raid_disk);
-               sysfs_remove_link(&mddev->kobj, nm);
+               sysfs_unlink_rdev(mddev, rdev);
        }
        list_for_each_entry(rdev, &mddev->disks, same_set) {
                if (rdev->raid_disk < 0)
@@ -3168,11 +3433,10 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
                if (rdev->raid_disk < 0)
                        clear_bit(In_sync, &rdev->flags);
                else {
-                       char nm[20];
-                       sprintf(nm, "rd%d", rdev->raid_disk);
-                       if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
-                               printk("md: cannot register %s for %s after level change\n",
-                                      nm, mdname(mddev));
+                       if (sysfs_link_rdev(mddev, rdev))
+                               printk(KERN_WARNING "md: cannot register rd%d"
+                                      " for %s after level change\n",
+                                      rdev->raid_disk, mdname(mddev));
                }
        }
 
@@ -4504,7 +4768,8 @@ int md_run(mddev_t *mddev)
        }
 
        if (mddev->bio_set == NULL)
-               mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev));
+               mddev->bio_set = bioset_create(BIO_POOL_SIZE,
+                                              sizeof(mddev_t *));
 
        spin_lock(&pers_lock);
        pers = find_pers(mddev->level, mddev->clevel);
@@ -4621,12 +4886,9 @@ int md_run(mddev_t *mddev)
        smp_wmb();
        mddev->ready = 1;
        list_for_each_entry(rdev, &mddev->disks, same_set)
-               if (rdev->raid_disk >= 0) {
-                       char nm[20];
-                       sprintf(nm, "rd%d", rdev->raid_disk);
-                       if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
+               if (rdev->raid_disk >= 0)
+                       if (sysfs_link_rdev(mddev, rdev))
                                /* failure here is OK */;
-               }
        
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        
@@ -4854,11 +5116,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
                sysfs_notify_dirent_safe(mddev->sysfs_state);
 
                list_for_each_entry(rdev, &mddev->disks, same_set)
-                       if (rdev->raid_disk >= 0) {
-                               char nm[20];
-                               sprintf(nm, "rd%d", rdev->raid_disk);
-                               sysfs_remove_link(&mddev->kobj, nm);
-                       }
+                       if (rdev->raid_disk >= 0)
+                               sysfs_unlink_rdev(mddev, rdev);
 
                set_capacity(disk, 0);
                mutex_unlock(&mddev->open_mutex);
@@ -5750,6 +6009,8 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev)
                return -ENODEV;
 
        md_error(mddev, rdev);
+       if (!test_bit(Faulty, &rdev->flags))
+               return -EBUSY;
        return 0;
 }
 
@@ -6178,11 +6439,18 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
        return thread;
 }
 
-void md_unregister_thread(mdk_thread_t *thread)
+void md_unregister_thread(mdk_thread_t **threadp)
 {
+       mdk_thread_t *thread = *threadp;
        if (!thread)
                return;
        dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
+       /* Locking ensures that mddev_unlock does not wake_up a
+        * non-existent thread
+        */
+       spin_lock(&pers_lock);
+       *threadp = NULL;
+       spin_unlock(&pers_lock);
 
        kthread_stop(thread->tsk);
        kfree(thread);
@@ -6198,18 +6466,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
        if (!rdev || test_bit(Faulty, &rdev->flags))
                return;
 
-       if (mddev->external)
-               set_bit(Blocked, &rdev->flags);
-/*
-       dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
-               mdname(mddev),
-               MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
-               __builtin_return_address(0),__builtin_return_address(1),
-               __builtin_return_address(2),__builtin_return_address(3));
-*/
-       if (!mddev->pers)
-               return;
-       if (!mddev->pers->error_handler)
+       if (!mddev->pers || !mddev->pers->error_handler)
                return;
        mddev->pers->error_handler(mddev,rdev);
        if (mddev->degraded)
@@ -6933,11 +7190,14 @@ void md_do_sync(mddev_t *mddev)
                        atomic_add(sectors, &mddev->recovery_active);
                }
 
+               if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+                       break;
+
                j += sectors;
                if (j>1) mddev->curr_resync = j;
                mddev->curr_mark_cnt = io_sectors;
                if (last_check == 0)
-                       /* this is the earliers that rebuilt will be
+                       /* this is the earliest that rebuild will be
                         * visible in /proc/mdstat
                         */
                        md_new_event(mddev);
@@ -6946,10 +7206,6 @@ void md_do_sync(mddev_t *mddev)
                        continue;
 
                last_check = io_sectors;
-
-               if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
-                       break;
-
        repeat:
                if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
                        /* step marks */
@@ -7067,29 +7323,23 @@ static int remove_and_add_spares(mddev_t *mddev)
                    atomic_read(&rdev->nr_pending)==0) {
                        if (mddev->pers->hot_remove_disk(
                                    mddev, rdev->raid_disk)==0) {
-                               char nm[20];
-                               sprintf(nm,"rd%d", rdev->raid_disk);
-                               sysfs_remove_link(&mddev->kobj, nm);
+                               sysfs_unlink_rdev(mddev, rdev);
                                rdev->raid_disk = -1;
                        }
                }
 
-       if (mddev->degraded && !mddev->recovery_disabled) {
+       if (mddev->degraded) {
                list_for_each_entry(rdev, &mddev->disks, same_set) {
                        if (rdev->raid_disk >= 0 &&
                            !test_bit(In_sync, &rdev->flags) &&
-                           !test_bit(Faulty, &rdev->flags) &&
-                           !test_bit(Blocked, &rdev->flags))
+                           !test_bit(Faulty, &rdev->flags))
                                spares++;
                        if (rdev->raid_disk < 0
                            && !test_bit(Faulty, &rdev->flags)) {
                                rdev->recovery_offset = 0;
                                if (mddev->pers->
                                    hot_add_disk(mddev, rdev) == 0) {
-                                       char nm[20];
-                                       sprintf(nm, "rd%d", rdev->raid_disk);
-                                       if (sysfs_create_link(&mddev->kobj,
-                                                             &rdev->kobj, nm))
+                                       if (sysfs_link_rdev(mddev, rdev))
                                                /* failure here is OK */;
                                        spares++;
                                        md_new_event(mddev);
@@ -7107,8 +7357,7 @@ static void reap_sync_thread(mddev_t *mddev)
        mdk_rdev_t *rdev;
 
        /* resync has finished, collect result */
-       md_unregister_thread(mddev->sync_thread);
-       mddev->sync_thread = NULL;
+       md_unregister_thread(&mddev->sync_thread);
        if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
            !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
                /* success...*/
@@ -7138,6 +7387,8 @@ static void reap_sync_thread(mddev_t *mddev)
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        sysfs_notify_dirent_safe(mddev->sysfs_action);
        md_new_event(mddev);
+       if (mddev->event_work.func)
+               queue_work(md_misc_wq, &mddev->event_work);
 }
 
 /*
@@ -7170,9 +7421,6 @@ void md_check_recovery(mddev_t *mddev)
        if (mddev->bitmap)
                bitmap_daemon_work(mddev);
 
-       if (mddev->ro)
-               return;
-
        if (signal_pending(current)) {
                if (mddev->pers->sync_request && !mddev->external) {
                        printk(KERN_INFO "md: %s in immediate safe mode\n",
@@ -7209,9 +7457,7 @@ void md_check_recovery(mddev_t *mddev)
                                    atomic_read(&rdev->nr_pending)==0) {
                                        if (mddev->pers->hot_remove_disk(
                                                    mddev, rdev->raid_disk)==0) {
-                                               char nm[20];
-                                               sprintf(nm,"rd%d", rdev->raid_disk);
-                                               sysfs_remove_link(&mddev->kobj, nm);
+                                               sysfs_unlink_rdev(mddev, rdev);
                                                rdev->raid_disk = -1;
                                        }
                                }
@@ -7331,12 +7577,499 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
 {
        sysfs_notify_dirent_safe(rdev->sysfs_state);
        wait_event_timeout(rdev->blocked_wait,
-                          !test_bit(Blocked, &rdev->flags),
+                          !test_bit(Blocked, &rdev->flags) &&
+                          !test_bit(BlockedBadBlocks, &rdev->flags),
                           msecs_to_jiffies(5000));
        rdev_dec_pending(rdev, mddev);
 }
 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
 
+
+/* Bad block management.
+ * We can record which blocks on each device are 'bad' and so just
+ * fail those blocks, or that stripe, rather than the whole device.
+ * Entries in the bad-block table are 64bits wide.  This comprises:
+ * Length of bad-range, in sectors: 0-511 for lengths 1-512
+ * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
+ *  A 'shift' can be set so that larger blocks are tracked and
+ *  consequently larger devices can be covered.
+ * 'Acknowledged' flag - 1 bit. - the most significant bit.
+ *
+ * Locking of the bad-block table uses a seqlock so md_is_badblock
+ * might need to retry if it is very unlucky.
+ * We will sometimes want to check for bad blocks in a bi_end_io function,
+ * so we use the write_seqlock_irq variant.
+ *
+ * When looking for a bad block we specify a range and want to
+ * know if any block in the range is bad.  So we binary-search
+ * to the last range that starts at-or-before the given endpoint,
+ * (or "before the sector after the target range")
+ * then see if it ends after the given start.
+ * We return
+ *  0 if there are no known bad blocks in the range
+ *  1 if there are known bad blocks which are all acknowledged
+ * -1 if there are bad blocks which have not yet been acknowledged in metadata.
+ * plus the start/length of the first bad section we overlap.
+ */
+int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
+                  sector_t *first_bad, int *bad_sectors)
+{
+       int hi;
+       int lo = 0;
+       u64 *p = bb->page;
+       int rv = 0;
+       sector_t target = s + sectors;
+       unsigned seq;
+
+       if (bb->shift > 0) {
+               /* round the start down, and the end up */
+               s >>= bb->shift;
+               target += (1<<bb->shift) - 1;
+               target >>= bb->shift;
+               sectors = target - s;
+       }
+       /* 'target' is now the first block after the bad range */
+
+retry:
+       seq = read_seqbegin(&bb->lock);
+
+       hi = bb->count;
+
+       /* Binary search between lo and hi for 'target'
+        * i.e. for the last range that starts before 'target'
+        */
+       /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
+        * are known not to be the last range before target.
+        * VARIANT: hi-lo is the number of possible
+        * ranges, and decreases until it reaches 1
+        */
+       while (hi - lo > 1) {
+               int mid = (lo + hi) / 2;
+               sector_t a = BB_OFFSET(p[mid]);
+               if (a < target)
+                       /* This could still be the one, earlier ranges
+                        * could not. */
+                       lo = mid;
+               else
+                       /* This and later ranges are definitely out. */
+                       hi = mid;
+       }
+       /* 'lo' might be the last that started before target, but 'hi' isn't */
+       if (hi > lo) {
+               /* need to check all ranges that end after 's' to see if
+                * any are unacknowledged.
+                */
+               while (lo >= 0 &&
+                      BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+                       if (BB_OFFSET(p[lo]) < target) {
+                               /* starts before the end, and finishes after
+                                * the start, so they must overlap
+                                */
+                               if (rv != -1 && BB_ACK(p[lo]))
+                                       rv = 1;
+                               else
+                                       rv = -1;
+                               *first_bad = BB_OFFSET(p[lo]);
+                               *bad_sectors = BB_LEN(p[lo]);
+                       }
+                       lo--;
+               }
+       }
+
+       if (read_seqretry(&bb->lock, seq))
+               goto retry;
+
+       return rv;
+}
+EXPORT_SYMBOL_GPL(md_is_badblock);
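For reference, one entry of the in-memory table described in the block comment
above md_is_badblock() packs into a u64 roughly like this (the real
BB_OFFSET/BB_LEN/BB_ACK/BB_MAKE helpers live in md.h; this is only an
illustration with assumed variable names):

	/* bit  63    : acknowledged flag
	 * bits 62..9 : start, in units of (1 << shift) sectors
	 * bits  8..0 : length - 1, i.e. lengths 1..512
	 */
	u64 entry = ((u64)!!ack << 63) | ((u64)start << 9) | (u64)(len - 1);

Note that the on-disk log read in super_1_load() and written in super_1_sync()
uses a different packing (a 10-bit length and 54-bit sector, in units scaled by
bblog_shift), so entries are converted when loading and saving.
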
+
+/*
+ * Add a range of bad blocks to the table.
+ * This might extend the table, or might contract it
+ * if two adjacent ranges can be merged.
+ * We binary-search to find the 'insertion' point, then
+ * decide how best to handle it.
+ */
+static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
+                           int acknowledged)
+{
+       u64 *p;
+       int lo, hi;
+       int rv = 1;
+
+       if (bb->shift < 0)
+               /* badblocks are disabled */
+               return 0;
+
+       if (bb->shift) {
+               /* round the start down, and the end up */
+               sector_t next = s + sectors;
+               s >>= bb->shift;
+               next += (1<<bb->shift) - 1;
+               next >>= bb->shift;
+               sectors = next - s;
+       }
+
+       write_seqlock_irq(&bb->lock);
+
+       p = bb->page;
+       lo = 0;
+       hi = bb->count;
+       /* Find the last range that starts at-or-before 's' */
+       while (hi - lo > 1) {
+               int mid = (lo + hi) / 2;
+               sector_t a = BB_OFFSET(p[mid]);
+               if (a <= s)
+                       lo = mid;
+               else
+                       hi = mid;
+       }
+       if (hi > lo && BB_OFFSET(p[lo]) > s)
+               hi = lo;
+
+       if (hi > lo) {
+               /* we found a range that might merge with the start
+                * of our new range
+                */
+               sector_t a = BB_OFFSET(p[lo]);
+               sector_t e = a + BB_LEN(p[lo]);
+               int ack = BB_ACK(p[lo]);
+               if (e >= s) {
+                       /* Yes, we can merge with a previous range */
+                       if (s == a && s + sectors >= e)
+                               /* new range covers old */
+                               ack = acknowledged;
+                       else
+                               ack = ack && acknowledged;
+
+                       if (e < s + sectors)
+                               e = s + sectors;
+                       if (e - a <= BB_MAX_LEN) {
+                               p[lo] = BB_MAKE(a, e-a, ack);
+                               s = e;
+                       } else {
+                               /* does not all fit in one range,
+                                * make p[lo] maximal
+                                */
+                               if (BB_LEN(p[lo]) != BB_MAX_LEN)
+                                       p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
+                               s = a + BB_MAX_LEN;
+                       }
+                       sectors = e - s;
+               }
+       }
+       if (sectors && hi < bb->count) {
+               /* 'hi' points to the first range that starts after 's'.
+                * Maybe we can merge with the start of that range */
+               sector_t a = BB_OFFSET(p[hi]);
+               sector_t e = a + BB_LEN(p[hi]);
+               int ack = BB_ACK(p[hi]);
+               if (a <= s + sectors) {
+                       /* merging is possible */
+                       if (e <= s + sectors) {
+                               /* full overlap */
+                               e = s + sectors;
+                               ack = acknowledged;
+                       } else
+                               ack = ack && acknowledged;
+
+                       a = s;
+                       if (e - a <= BB_MAX_LEN) {
+                               p[hi] = BB_MAKE(a, e-a, ack);
+                               s = e;
+                       } else {
+                               p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
+                               s = a + BB_MAX_LEN;
+                       }
+                       sectors = e - s;
+                       lo = hi;
+                       hi++;
+               }
+       }
+       if (sectors == 0 && hi < bb->count) {
+               /* we might be able to combine lo and hi */
+               /* Note: 's' is at the end of 'lo' */
+               sector_t a = BB_OFFSET(p[hi]);
+               int lolen = BB_LEN(p[lo]);
+               int hilen = BB_LEN(p[hi]);
+               int newlen = lolen + hilen - (s - a);
+               if (s >= a && newlen < BB_MAX_LEN) {
+                       /* yes, we can combine them */
+                       int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
+                       p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
+                       memmove(p + hi, p + hi + 1,
+                               (bb->count - hi - 1) * 8);
+                       bb->count--;
+               }
+       }
+       while (sectors) {
+               /* didn't merge (it all).
+                * Need to add a range just before 'hi' */
+               if (bb->count >= MD_MAX_BADBLOCKS) {
+                       /* No room for more */
+                       rv = 0;
+                       break;
+               } else {
+                       int this_sectors = sectors;
+                       memmove(p + hi + 1, p + hi,
+                               (bb->count - hi) * 8);
+                       bb->count++;
+
+                       if (this_sectors > BB_MAX_LEN)
+                               this_sectors = BB_MAX_LEN;
+                       p[hi] = BB_MAKE(s, this_sectors, acknowledged);
+                       sectors -= this_sectors;
+                       s += this_sectors;
+               }
+       }
+
+       bb->changed = 1;
+       if (!acknowledged)
+               bb->unacked_exist = 1;
+       write_sequnlock_irq(&bb->lock);
+
+       return rv;
+}
+
+int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
+                      int acknowledged)
+{
+       int rv = md_set_badblocks(&rdev->badblocks,
+                                 s + rdev->data_offset, sectors, acknowledged);
+       if (rv) {
+               /* Make sure they get written out promptly */
+               set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
+               md_wakeup_thread(rdev->mddev->thread);
+       }
+       return rv;
+}
+EXPORT_SYMBOL_GPL(rdev_set_badblocks);
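A hedged sketch of how a personality's write-error path might use this helper
(illustrative only; 'mddev', 'rdev', 's' and 'sectors' are assumed variables,
with 's' relative to the start of the data area):

	/* a write of 'sectors' sectors at 's' failed: try to record the
	 * range as bad (unacknowledged) rather than failing the device
	 */
	if (!rdev_set_badblocks(rdev, s, sectors, 0))
		/* no room left in the table (or badblocks disabled) */
		md_error(mddev, rdev);
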
+
+/*
+ * Remove a range of bad blocks from the table.
+ * This may involve extending the table if we split a region,
+ * but it must not fail.  So if the table becomes full, we just
+ * drop the remove request.
+ */
+static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
+{
+       u64 *p;
+       int lo, hi;
+       sector_t target = s + sectors;
+       int rv = 0;
+
+       if (bb->shift > 0) {
+               /* When clearing we round the start up and the end down.
+                * This should not matter as the shift should align with
+                * the block size and no rounding should ever be needed.
+                * However it is better to think a block is bad when it
+                * isn't than to think a block is not bad when it is.
+                */
+               s += (1<<bb->shift) - 1;
+               s >>= bb->shift;
+               target >>= bb->shift;
+               sectors = target - s;
+       }
+
+       write_seqlock_irq(&bb->lock);
+
+       p = bb->page;
+       lo = 0;
+       hi = bb->count;
+       /* Find the last range that starts before 'target' */
+       while (hi - lo > 1) {
+               int mid = (lo + hi) / 2;
+               sector_t a = BB_OFFSET(p[mid]);
+               if (a < target)
+                       lo = mid;
+               else
+                       hi = mid;
+       }
+       if (hi > lo) {
+               /* p[lo] is the last range that could overlap the
+                * current range.  Earlier ranges could also overlap,
+                * but only this one can overlap the end of the range.
+                */
+               if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
+                       /* Partial overlap, leave the tail of this range */
+                       int ack = BB_ACK(p[lo]);
+                       sector_t a = BB_OFFSET(p[lo]);
+                       sector_t end = a + BB_LEN(p[lo]);
+
+                       if (a < s) {
+                               /* we need to split this range */
+                               if (bb->count >= MD_MAX_BADBLOCKS) {
+                                       rv = 0;
+                                       goto out;
+                               }
+                               memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
+                               bb->count++;
+                               p[lo] = BB_MAKE(a, s-a, ack);
+                               lo++;
+                       }
+                       p[lo] = BB_MAKE(target, end - target, ack);
+                       /* there is no longer an overlap */
+                       hi = lo;
+                       lo--;
+               }
+               while (lo >= 0 &&
+                      BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+                       /* This range does overlap */
+                       if (BB_OFFSET(p[lo]) < s) {
+                               /* Keep the early parts of this range. */
+                               int ack = BB_ACK(p[lo]);
+                               sector_t start = BB_OFFSET(p[lo]);
+                               p[lo] = BB_MAKE(start, s - start, ack);
+                               /* now low doesn't overlap, so.. */
+                               break;
+                       }
+                       lo--;
+               }
+               /* 'lo' is strictly before, 'hi' is strictly after,
+                * anything between needs to be discarded
+                */
+               if (hi - lo > 1) {
+                       memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
+                       bb->count -= (hi - lo - 1);
+               }
+       }
+
+       bb->changed = 1;
+out:
+       write_sequnlock_irq(&bb->lock);
+       return rv;
+}
+
+int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors)
+{
+       return md_clear_badblocks(&rdev->badblocks,
+                                 s + rdev->data_offset,
+                                 sectors);
+}
+EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
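Conversely (again only a sketch with the same assumed variables), once
known-good data has been rewritten over a previously-bad range it can be
dropped from the table; bb->changed gets set so the on-disk log is refreshed
by a later metadata update:

	rdev_clear_badblocks(rdev, s, sectors);
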
+
+/*
+ * Acknowledge all bad blocks in a list.
+ * This only succeeds if ->changed is clear.  It is used by
+ * in-kernel metadata updates
+ */
+void md_ack_all_badblocks(struct badblocks *bb)
+{
+       if (bb->page == NULL || bb->changed)
+               /* no point even trying */
+               return;
+       write_seqlock_irq(&bb->lock);
+
+       if (bb->changed == 0) {
+               u64 *p = bb->page;
+               int i;
+               for (i = 0; i < bb->count ; i++) {
+                       if (!BB_ACK(p[i])) {
+                               sector_t start = BB_OFFSET(p[i]);
+                               int len = BB_LEN(p[i]);
+                               p[i] = BB_MAKE(start, len, 1);
+                       }
+               }
+               bb->unacked_exist = 0;
+       }
+       write_sequnlock_irq(&bb->lock);
+}
+EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
+
+/* sysfs access to bad-blocks list.
+ * We present two files.
+ * 'bad_blocks' lists sector numbers and lengths of ranges that
+ *    are recorded as bad.  The list is truncated to fit within
+ *    the one-page limit of sysfs.
+ *    Writing "sector length" to this file adds an acknowledged
+ *    bad block to the list.
+ * 'unacknowledged_bad_blocks' lists bad blocks that have not yet
+ *    been acknowledged.  Writing to this file adds bad blocks
+ *    without acknowledging them.  This is largely for testing.
+ */
+
+static ssize_t
+badblocks_show(struct badblocks *bb, char *page, int unack)
+{
+       size_t len;
+       int i;
+       u64 *p = bb->page;
+       unsigned seq;
+
+       if (bb->shift < 0)
+               return 0;
+
+retry:
+       seq = read_seqbegin(&bb->lock);
+
+       len = 0;
+       i = 0;
+
+       while (len < PAGE_SIZE && i < bb->count) {
+               sector_t s = BB_OFFSET(p[i]);
+               unsigned int length = BB_LEN(p[i]);
+               int ack = BB_ACK(p[i]);
+               i++;
+
+               if (unack && ack)
+                       continue;
+
+               len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
+                               (unsigned long long)s << bb->shift,
+                               length << bb->shift);
+       }
+       if (unack && len == 0)
+               bb->unacked_exist = 0;
+
+       if (read_seqretry(&bb->lock, seq))
+               goto retry;
+
+       return len;
+}
+
+#define DO_DEBUG 1
+
+static ssize_t
+badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
+{
+       unsigned long long sector;
+       int length;
+       char newline;
+#ifdef DO_DEBUG
+       /* Allow clearing via sysfs *only* for testing/debugging.
+        * Normally only a successful write may clear a badblock
+        */
+       int clear = 0;
+       if (page[0] == '-') {
+               clear = 1;
+               page++;
+       }
+#endif /* DO_DEBUG */
+
+       switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
+       case 3:
+               if (newline != '\n')
+                       return -EINVAL;
+       case 2:
+               if (length <= 0)
+                       return -EINVAL;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+#ifdef DO_DEBUG
+       if (clear) {
+               md_clear_badblocks(bb, sector, length);
+               return len;
+       }
+#endif /* DO_DEBUG */
+       if (md_set_badblocks(bb, sector, length, !unack))
+               return len;
+       else
+               return -ENOSPC;
+}
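To make the accepted input concrete (illustrative examples only): writing
"2000000 64" records 64 sectors starting at sector 2000000; a trailing newline
is also accepted; and with DO_DEBUG a leading '-' ("-2000000 64") clears the
range again.  A write to bad_blocks stores the range as acknowledged, a write
to unacknowledged_bad_blocks stores it as unacknowledged; a non-positive length
or any other format returns -EINVAL.
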
+
 static int md_notify_reboot(struct notifier_block *this,
                            unsigned long code, void *x)
 {