Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md
author    Linus Torvalds <torvalds@linux-foundation.org>
          Fri, 24 Feb 2017 22:42:19 +0000 (14:42 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Fri, 24 Feb 2017 22:42:19 +0000 (14:42 -0800)
Pull md updates from Shaohua Li:
 "Mainly fixes bugs and improves performance:

   - Improve scalability for raid1 from Coly

   - Improve raid5-cache read performance, disk efficiency and IO
     pattern from Song and me

   - Fix a race condition of disk hotplug for linear from Coly

   - A few cleanup patches from Ming and Byungchul

   - Fix a memory leak from Neil

   - Fix WRITE SAME IO failure from me

   - Add doc for raid5-cache from me"

* 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md: (23 commits)
  md/raid1: fix write behind issues introduced by bio_clone_bioset_partial
  md/raid1: handle flush request correctly
  md/linear: shutup lockdep warnning
  md/raid1: fix a use-after-free bug
  RAID1: avoid unnecessary spin locks in I/O barrier code
  RAID1: a new I/O barrier implementation to remove resync window
  md/raid5: Don't reinvent the wheel but use existing llist API
  md: fast clone bio in bio_clone_mddev()
  md: remove unnecessary check on mddev
  md/raid1: use bio_clone_bioset_partial() in case of write behind
  md: fail if mddev->bio_set can't be created
  block: introduce bio_clone_bioset_partial()
  md: disable WRITE SAME if it fails in underlayer disks
  md/raid5-cache: exclude reclaiming stripes in reclaim check
  md/raid5-cache: stripe reclaim only counts valid stripes
  MD: add doc for raid5-cache
  Documentation: move MD related doc into a separate dir
  md: ensure md devices are freed before module is unloaded.
  md/r5cache: improve journal device efficiency
  md/r5cache: enable chunk_aligned_read with write back cache
  ...

20 files changed:
Documentation/00-INDEX
Documentation/admin-guide/md.rst
Documentation/md/md-cluster.txt [moved from Documentation/md-cluster.txt with 100% similarity]
Documentation/md/raid5-cache.txt [new file with mode: 0644]
block/bio.c
drivers/md/faulty.c
drivers/md/linear.c
drivers/md/linear.h
drivers/md/md.c
drivers/md/md.h
drivers/md/multipath.c
drivers/md/raid0.c
drivers/md/raid1.c
drivers/md/raid1.h
drivers/md/raid10.c
drivers/md/raid5-cache.c
drivers/md/raid5.c
drivers/md/raid5.h
include/linux/bio.h
lib/radix-tree.c

index c8a8eb1a2b119c064f038559fa67f6511a31bce6..793acf999e9eac87057af3214ca1f98ad65b922f 100644 (file)
@@ -270,8 +270,8 @@ m68k/
        - directory with info about Linux on Motorola 68k architecture.
 mailbox.txt
        - How to write drivers for the common mailbox framework (IPC).
-md-cluster.txt
-       - info on shared-device RAID MD cluster.
+md/
+       - directory with info about Linux Software RAID.
 media/
        - info on media drivers: uAPI, kAPI and driver documentation.
 memory-barriers.txt
index e449fb5f277c25b9b31800561f73a2d2e0d63593..1e61bf50595c84c936cfe8788bb37114a6f7d5f0 100644 (file)
@@ -725,3 +725,8 @@ These currently include:
       to 1.  Setting this to 0 disables bypass accounting and
       requires preread stripes to wait until all full-width stripe-
       writes are complete.  Valid values are 0 to stripe_cache_size.
+
+  journal_mode (currently raid5 only)
+      The cache mode for raid5. raid5 can include an extra disk for
+      caching. The mode can be "write-through" or "write-back". The
+      default is "write-through".
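
For illustration, here is a minimal user-space sketch (not part of this commit) that drives the new attribute; it assumes the array is md0, was created with a write journal, and that the program runs as root:

/* Minimal user-space sketch (not from this commit): read and switch
 * the raid5 journal_mode attribute through sysfs. Assumes /dev/md0
 * exists and has a journal device. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/block/md0/md/journal_mode";
	char mode[64];
	FILE *f = fopen(path, "r");

	if (!f || !fgets(mode, sizeof(mode), f)) {
		perror(path);
		return 1;
	}
	fclose(f);
	printf("current: %s", mode);	/* prints the raw attribute contents */

	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return 1;
	}
	fputs("write-back", f);		/* request write-back caching */
	fclose(f);
	return 0;
}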
diff --git a/Documentation/md/raid5-cache.txt b/Documentation/md/raid5-cache.txt
new file mode 100644 (file)
index 0000000..2b210f2
--- /dev/null
@@ -0,0 +1,109 @@
+RAID5 cache
+
+RAID 4/5/6 can include an extra disk for data caching besides the normal RAID
+disks. The role of the RAID disks isn't changed by the cache disk. The cache
+disk caches data destined for the RAID disks. The cache can be in write-through
+(supported since 4.4) or write-back mode (supported since 4.10). mdadm
+(supported since 3.4) has a new option '--write-journal' to create an array
+with a cache. Please refer to the mdadm manual for details. By default (when
+the RAID array starts), the cache is in write-through mode. A user can switch
+it to write-back mode by:
+
+echo "write-back" > /sys/block/md0/md/journal_mode
+
+And switch it back to write-through mode by:
+
+echo "write-through" > /sys/block/md0/md/journal_mode
+
+In both modes, all writes to the array will hit the cache disk first. This
+means the cache disk must be fast and able to sustain the array's full write
+load.
+
+-------------------------------------
+write-through mode:
+
+This mode mainly fixes the 'write hole' issue. For a RAID 4/5/6 array, an
+unclean shutdown can leave the data in some stripes in an inconsistent state,
+e.g., data and parity don't match. The reason is that a stripe write involves
+several RAID disks, and it's possible that the writes haven't hit all RAID
+disks before the unclean shutdown. We call an array degraded if it has
+inconsistent data. MD tries to resync the array to bring it back to a normal
+state. But before the resync completes, any system crash risks real data
+corruption in the RAID array. This problem is called the 'write hole'.
+
+The write-through cache stores all data on the cache disk first. After the
+data is safe on the cache disk, the data is flushed onto the RAID disks. This
+two-step write guarantees that MD can recover correct data after an unclean
+shutdown even if the array is degraded. Thus the cache can close the 'write
+hole'.
+
+In write-through mode, MD reports IO completion to the upper layer (usually
+filesystems) after the data is safe on the RAID disks, so a cache disk failure
+doesn't cause data loss. Of course, a cache disk failure means the array is
+exposed to the 'write hole' again.
+
+In write-through mode, the cache disk isn't required to be big. Several
+hundred megabytes are enough.
+
+--------------------------------------
+write-back mode:
+
+Write-back mode fixes the 'write hole' issue too, since all write data is
+cached on the cache disk. But the main goal of the 'write-back' cache is to
+speed up writes. If a write crosses all RAID disks of a stripe, we call it a
+full-stripe write. For non-full-stripe writes, MD must read old data before
+the new parity can be calculated. These synchronous reads hurt write
+throughput. Some writes which are sequential but not dispatched at the same
+time suffer from this overhead too. The write-back cache aggregates the data
+and flushes it to the RAID disks only after the data becomes a full-stripe
+write. This completely avoids the overhead, so it's very helpful for some
+workloads. A typical example is a workload that does sequential writes
+followed by fsync.
+
+In write-back mode, MD reports IO completion to the upper layer (usually
+filesystems) right after the data hits the cache disk. The data is flushed to
+the RAID disks later, after specific conditions are met. So a cache disk
+failure will cause data loss.
+
+In write-back mode, MD also caches data in memory. The memory cache includes
+the same data stored on the cache disk, so a power loss doesn't cause data
+loss. The memory cache size has a performance impact on the array; a bigger
+size is recommended. A user can configure the size by:
+
+echo "2048" > /sys/block/md0/md/stripe_cache_size
+
+A cache disk that is too small makes write aggregation less efficient in this
+mode, depending on the workload. It's recommended to use a cache disk of at
+least several gigabytes in write-back mode.
+
+--------------------------------------
+The implementation:
+
+The write-through and write-back cache use the same disk format. The cache
+disk is organized as a simple write log. The log consists of 'meta data' and
+'data' pairs. The meta data describes the data. It also includes a checksum
+and a sequence ID for recovery identification. Data can be IO data or parity
+data. Data is checksummed too. The checksum is stored in the meta data ahead
+of the data. The checksum is an optimization, because MD can then write meta
+and data freely without worrying about their ordering. The MD superblock has
+a field pointing to the valid meta data at the log head.
+
+The log implementation is pretty straightforward. The difficult part is the
+order in which MD writes data to the cache disk and the RAID disks.
+Specifically, in write-through mode, MD calculates parity for IO data, writes
+both IO data and parity to the log, writes the data and parity to the RAID
+disks after the data and parity have settled down in the log, and finally the
+IO is finished. Reads just read from the RAID disks as usual.
+
+In write-back mode, MD writes IO data to the log and reports IO completion.
+The data is also fully cached in memory at that time, which means reads must
+query the memory cache. If some conditions are met, MD will flush the data to
+the RAID disks. MD will calculate parity for the data and write the parity
+into the log. After this is finished, MD will write both data and parity onto
+the RAID disks, then MD can release the memory cache. The flush conditions
+could be: the stripe becomes a full-stripe write, free cache disk space is
+low, or free in-kernel memory cache space is low.
+
+After an unclean shutdown, MD does recovery. MD reads all meta data and data
+from the log. The sequence ID and checksum help detect corrupted meta data
+and data. If MD finds a stripe with data and valid parities (1 parity for
+raid4/5 and 2 for raid6), MD will write the data and parities to the RAID
+disks. If the parities are incomplete, they are discarded. If part of the
+data is corrupted, it is discarded too. MD then loads the valid data and
+writes it to the RAID disks in the normal way.
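
To make the 'meta data'/'data' pairing above concrete, a simplified sketch of a log metadata header follows. The real on-disk layout is struct r5l_meta_block in drivers/md/raid5-cache.c; the field names and sizes here are assumptions for illustration, not the actual format.

/* Simplified sketch of a log metadata header; the real on-disk
 * format is struct r5l_meta_block in drivers/md/raid5-cache.c and
 * differs in detail -- treat every field here as an assumption. */
#include <linux/types.h>

struct r5cache_meta_sketch {
	__le32 magic;		/* identifies a valid meta block           */
	__le32 checksum;	/* checksum of this meta block             */
	__le64 seq;		/* sequence ID; recovery stops at a gap    */
	__le64 position;	/* where this block sits in the log        */
	/* followed by payload descriptors for the data/parity pages
	 * written after this block, each carrying its own data
	 * checksum -- which is what lets MD write meta and data in
	 * any order */
};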
index 4b564d0c3e29a4c15becf545a996794b296ae622..5eec5e08417f6ff1989e3e2a07b31c62901953d5 100644 (file)
@@ -625,21 +625,20 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
 }
 EXPORT_SYMBOL(bio_clone_fast);
 
-/**
- *     bio_clone_bioset - clone a bio
- *     @bio_src: bio to clone
- *     @gfp_mask: allocation priority
- *     @bs: bio_set to allocate from
- *
- *     Clone bio. Caller will own the returned bio, but not the actual data it
- *     points to. Reference count of returned bio will be one.
- */
-struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
-                            struct bio_set *bs)
+static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
+                                     struct bio_set *bs, int offset,
+                                     int size)
 {
        struct bvec_iter iter;
        struct bio_vec bv;
        struct bio *bio;
+       struct bvec_iter iter_src = bio_src->bi_iter;
+
+       /* for supporting partial clone */
+       if (offset || size != bio_src->bi_iter.bi_size) {
+               bio_advance_iter(bio_src, &iter_src, offset);
+               iter_src.bi_size = size;
+       }
 
        /*
         * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
@@ -663,7 +662,8 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
         *    __bio_clone_fast() anyways.
         */
 
-       bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
+       bio = bio_alloc_bioset(gfp_mask, __bio_segments(bio_src,
+                              &iter_src), bs);
        if (!bio)
                return NULL;
        bio->bi_bdev            = bio_src->bi_bdev;
@@ -680,7 +680,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
                bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
                break;
        default:
-               bio_for_each_segment(bv, bio_src, iter)
+               __bio_for_each_segment(bv, bio_src, iter, iter_src)
                        bio->bi_io_vec[bio->bi_vcnt++] = bv;
                break;
        }
@@ -699,8 +699,43 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 
        return bio;
 }
+
+/**
+ *     bio_clone_bioset - clone a bio
+ *     @bio_src: bio to clone
+ *     @gfp_mask: allocation priority
+ *     @bs: bio_set to allocate from
+ *
+ *     Clone bio. Caller will own the returned bio, but not the actual data it
+ *     points to. Reference count of returned bio will be one.
+ */
+struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
+                            struct bio_set *bs)
+{
+       return __bio_clone_bioset(bio_src, gfp_mask, bs, 0,
+                                 bio_src->bi_iter.bi_size);
+}
 EXPORT_SYMBOL(bio_clone_bioset);
 
+/**
+ *     bio_clone_bioset_partial - clone a partial bio
+ *     @bio_src: bio to clone
+ *     @gfp_mask: allocation priority
+ *     @bs: bio_set to allocate from
+ *     @offset: byte offset in @bio_src to start cloning from
+ *     @size: size for the cloned bio
+ *
+ *     Clone bio. Caller will own the returned bio, but not the actual data it
+ *     points to. Reference count of returned bio will be one.
+ */
+struct bio *bio_clone_bioset_partial(struct bio *bio_src, gfp_t gfp_mask,
+                                    struct bio_set *bs, int offset,
+                                    int size)
+{
+       return __bio_clone_bioset(bio_src, gfp_mask, bs, offset, size);
+}
+EXPORT_SYMBOL(bio_clone_bioset_partial);
+
 /**
  *     bio_add_pc_page -       attempt to add page to bio
  *     @q: the target queue
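
A kernel-style usage sketch of the new helper (illustrative, not from this commit): @offset and @size are byte counts, so sector values must be shifted by 9, exactly as the raid1 write-behind caller later in this diff does.

/* Illustrative sketch (not from this commit): clone only a
 * sub-range of a bio. Assumes <linux/bio.h> and a caller-owned
 * bio_set, mirroring how raid1 uses mddev->bio_set. */
static struct bio *clone_sub_bio(struct bio *src, struct bio_set *bs,
				 sector_t skip_sectors, sector_t nr_sectors)
{
	/* offset and size are in bytes, hence the << 9 conversions */
	return bio_clone_bioset_partial(src, GFP_NOIO, bs,
					skip_sectors << 9,
					nr_sectors << 9);
}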
index 685aa2d77e2526935f8f2f416ad8d8681b6c7b14..b0536cfd8e174b83a53d49391552b0c7ec64aef8 100644 (file)
@@ -214,7 +214,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio)
                }
        }
        if (failit) {
-               struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev);
+               struct bio *b = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
 
                b->bi_bdev = conf->rdev->bdev;
                b->bi_private = bio;
index f1c7bbac31a580bb6f708b614696f17404badc0d..3e38e0207a3eb44339ad6431dc3557ae27d05612 100644 (file)
@@ -53,18 +53,26 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
        return conf->disks + lo;
 }
 
+/*
+ * In linear_congested() conf->raid_disks is used as a copy of
+ * mddev->raid_disks to iterate conf->disks[]. Because conf->raid_disks
+ * and conf->disks[] are created in linear_conf(), they are always
+ * consistent with each other, but mddev->raid_disks may not be.
+ */
 static int linear_congested(struct mddev *mddev, int bits)
 {
        struct linear_conf *conf;
        int i, ret = 0;
 
-       conf = mddev->private;
+       rcu_read_lock();
+       conf = rcu_dereference(mddev->private);
 
-       for (i = 0; i < mddev->raid_disks && !ret ; i++) {
+       for (i = 0; i < conf->raid_disks && !ret ; i++) {
                struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
                ret |= bdi_congested(q->backing_dev_info, bits);
        }
 
+       rcu_read_unlock();
        return ret;
 }
 
@@ -144,6 +152,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
                        conf->disks[i-1].end_sector +
                        conf->disks[i].rdev->sectors;
 
+       /*
+        * conf->raid_disks is a copy of mddev->raid_disks. The reason to
+        * keep a copy of mddev->raid_disks in struct linear_conf is that
+        * mddev->raid_disks may not be consistent with the number of
+        * pointers in conf->disks[] when it is updated in linear_add()
+        * while the old conf->disks[] array is still being iterated in
+        * linear_congested(). Here conf->raid_disks is always consistent
+        * with the number of pointers in the conf->disks[] array, and
+        * mddev->private is updated with rcu_assign_pointer() in
+        * linear_add(), so such a race is avoided.
+        */
+       conf->raid_disks = raid_disks;
+
        return conf;
 
 out:
@@ -196,15 +217,24 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
        if (!newconf)
                return -ENOMEM;
 
+       /* newconf->raid_disks already keeps a copy of the increased
+        * value of mddev->raid_disks; WARN_ONCE() is just used to make
+        * sure of this. It is possible that oldconf is still referenced
+        * in linear_congested(), therefore kfree_rcu() is used to free
+        * oldconf only after no one uses it anymore.
+        */
        mddev_suspend(mddev);
-       oldconf = mddev->private;
+       oldconf = rcu_dereference_protected(mddev->private,
+                       lockdep_is_held(&mddev->reconfig_mutex));
        mddev->raid_disks++;
-       mddev->private = newconf;
+       WARN_ONCE(mddev->raid_disks != newconf->raid_disks,
+               "copied raid_disks doesn't match mddev->raid_disks");
+       rcu_assign_pointer(mddev->private, newconf);
        md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
        set_capacity(mddev->gendisk, mddev->array_sectors);
        mddev_resume(mddev);
        revalidate_disk(mddev->gendisk);
-       kfree(oldconf);
+       kfree_rcu(oldconf, rcu);
        return 0;
 }
 
@@ -262,6 +292,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
                                trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
                                                      split, disk_devt(mddev->gendisk),
                                                      bio_sector);
+                       mddev_check_writesame(mddev, split);
                        generic_make_request(split);
                }
        } while (split != bio);
index b685ddd7d7f76c25553ce88abd049fd9ead545e6..8d392e6098b3295ddbebac59e418a27ae21712e5 100644 (file)
@@ -10,6 +10,7 @@ struct linear_conf
 {
        struct rcu_head         rcu;
        sector_t                array_sectors;
+       int                     raid_disks; /* a copy of mddev->raid_disks */
        struct dev_info         disks[0];
 };
 #endif
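
The linear.c/linear.h changes above follow the classic RCU publish/read pattern; the generic distillation below is illustrative (names are not the md code itself), assuming the updater is serialized by a mutex as linear_add() is by mddev->reconfig_mutex.

#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Generic sketch of the RCU publish/read pattern linear.c adopts. */
struct cfg {
	struct rcu_head rcu;
	int nr;			/* private copy of the element count */
};

static struct cfg __rcu *active;
static DEFINE_MUTEX(cfg_lock);

static void reader(void)
{
	struct cfg *c;

	rcu_read_lock();
	c = rcu_dereference(active);
	/* iterate using c->nr: it always matches this snapshot,
	 * unlike a separately-updated global count */
	rcu_read_unlock();
}

static void update(struct cfg *newc)
{
	struct cfg *old;

	mutex_lock(&cfg_lock);
	old = rcu_dereference_protected(active,
					lockdep_is_held(&cfg_lock));
	rcu_assign_pointer(active, newc);	/* publish new config */
	mutex_unlock(&cfg_lock);
	kfree_rcu(old, rcu);	/* freed once all readers are done */
}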
index ba485dcf1064dd463bdb93edd85157b247d0fcb5..985374f20e2e3f4d78ac1d7d77213b2e1ad7e80c 100644 (file)
@@ -190,16 +190,6 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 }
 EXPORT_SYMBOL_GPL(bio_alloc_mddev);
 
-struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
-                           struct mddev *mddev)
-{
-       if (!mddev || !mddev->bio_set)
-               return bio_clone(bio, gfp_mask);
-
-       return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
-}
-EXPORT_SYMBOL_GPL(bio_clone_mddev);
-
 /*
  * We have a system wide 'event count' that is incremented
  * on any 'interesting' event, and readers of /proc/mdstat
@@ -5228,8 +5218,11 @@ int md_run(struct mddev *mddev)
                sysfs_notify_dirent_safe(rdev->sysfs_state);
        }
 
-       if (mddev->bio_set == NULL)
+       if (mddev->bio_set == NULL) {
                mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);
+               if (!mddev->bio_set)
+                       return -ENOMEM;
+       }
 
        spin_lock(&pers_lock);
        pers = find_pers(mddev->level, mddev->clevel);
@@ -8980,7 +8973,14 @@ static __exit void md_exit(void)
 
        for_each_mddev(mddev, tmp) {
                export_array(mddev);
+               mddev->ctime = 0;
                mddev->hold_active = 0;
+               /*
+                * for_each_mddev() will call mddev_put() at the end of each
+                * iteration.  As the mddev is now fully clear, this will
+                * schedule the mddev for destruction by a workqueue, and the
+                * destroy_workqueue() below will wait for that to complete.
+                */
        }
        destroy_workqueue(md_misc_wq);
        destroy_workqueue(md_wq);
index 2a514036a83dc0da07c0966b7fe247c18356bbbf..b8859cbf84b618b39ed3d92a2887e8764c403919 100644 (file)
@@ -673,8 +673,6 @@ extern void md_rdev_clear(struct md_rdev *rdev);
 
 extern void mddev_suspend(struct mddev *mddev);
 extern void mddev_resume(struct mddev *mddev);
-extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
-                                  struct mddev *mddev);
 extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
                                   struct mddev *mddev);
 
@@ -710,4 +708,11 @@ static inline void mddev_clear_unsupported_flags(struct mddev *mddev,
 {
        mddev->flags &= ~unsupported_flags;
 }
+
+static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio)
+{
+       if (bio_op(bio) == REQ_OP_WRITE_SAME &&
+           !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
+               mddev->queue->limits.max_write_same_sectors = 0;
+}
 #endif /* _MD_MD_H */
index d457afa672d57a172965aa000913fa6ff6625878..79a12b59250bbca870be857eb7cf350c0c9b53ad 100644 (file)
@@ -138,6 +138,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
        mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT;
        mp_bh->bio.bi_end_io = multipath_end_request;
        mp_bh->bio.bi_private = mp_bh;
+       mddev_check_writesame(mddev, &mp_bh->bio);
        generic_make_request(&mp_bh->bio);
        return;
 }
index d6585239bff22809edbcaf3881dc2f2ae0a2f41e..93347ca7c7a617e097ccafcbedbecdfa396d4968 100644 (file)
@@ -503,6 +503,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
                                trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
                                                      split, disk_devt(mddev->gendisk),
                                                      bio_sector);
+                       mddev_check_writesame(mddev, split);
                        generic_make_request(split);
                }
        } while (split != bio);
index 830ff2b203463ef075d53a6c7a2ae22e0ec2c7d9..7453d94eeed700c8ac30da1b8d7857b4788fdbd5 100644 (file)
@@ -71,9 +71,8 @@
  */
 static int max_queued_requests = 1024;
 
-static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
-                         sector_t bi_sector);
-static void lower_barrier(struct r1conf *conf);
+static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
+static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
 
 #define raid1_log(md, fmt, args...)                            \
        do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
@@ -100,7 +99,6 @@ static void r1bio_pool_free(void *r1_bio, void *data)
 #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
 #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
 #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
-#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
 
 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
@@ -205,6 +203,7 @@ static void free_r1bio(struct r1bio *r1_bio)
 static void put_buf(struct r1bio *r1_bio)
 {
        struct r1conf *conf = r1_bio->mddev->private;
+       sector_t sect = r1_bio->sector;
        int i;
 
        for (i = 0; i < conf->raid_disks * 2; i++) {
@@ -215,7 +214,7 @@ static void put_buf(struct r1bio *r1_bio)
 
        mempool_free(r1_bio, conf->r1buf_pool);
 
-       lower_barrier(conf);
+       lower_barrier(conf, sect);
 }
 
 static void reschedule_retry(struct r1bio *r1_bio)
@@ -223,10 +222,12 @@ static void reschedule_retry(struct r1bio *r1_bio)
        unsigned long flags;
        struct mddev *mddev = r1_bio->mddev;
        struct r1conf *conf = mddev->private;
+       int idx;
 
+       idx = sector_to_idx(r1_bio->sector);
        spin_lock_irqsave(&conf->device_lock, flags);
        list_add(&r1_bio->retry_list, &conf->retry_list);
-       conf->nr_queued ++;
+       atomic_inc(&conf->nr_queued[idx]);
        spin_unlock_irqrestore(&conf->device_lock, flags);
 
        wake_up(&conf->wait_barrier);
@@ -243,7 +244,6 @@ static void call_bio_endio(struct r1bio *r1_bio)
        struct bio *bio = r1_bio->master_bio;
        int done;
        struct r1conf *conf = r1_bio->mddev->private;
-       sector_t start_next_window = r1_bio->start_next_window;
        sector_t bi_sector = bio->bi_iter.bi_sector;
 
        if (bio->bi_phys_segments) {
@@ -269,7 +269,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
                 * Wake up any possible resync thread that waits for the device
                 * to go idle.
                 */
-               allow_barrier(conf, start_next_window, bi_sector);
+               allow_barrier(conf, bi_sector);
        }
 }
 
@@ -517,6 +517,25 @@ static void raid1_end_write_request(struct bio *bio)
                bio_put(to_put);
 }
 
+static sector_t align_to_barrier_unit_end(sector_t start_sector,
+                                         sector_t sectors)
+{
+       sector_t len;
+
+       WARN_ON(sectors == 0);
+       /*
+        * len is the number of sectors from start_sector to the end of the
+        * barrier unit which start_sector belongs to.
+        */
+       len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) -
+             start_sector;
+
+       if (len > sectors)
+               len = sectors;
+
+       return len;
+}
+
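
raid1.h is not shown in full in this excerpt; per the upstream patch, the barrier-unit and bucket definitions that align_to_barrier_unit_end() and sector_to_idx() rely on look like the sketch below. Treat the exact values as assumptions here.

/* Sketch of the definitions behind the barrier buckets (values are
 * assumptions, since raid1.h is not shown in this excerpt). */
#define BARRIER_UNIT_SECTOR_BITS	17
#define BARRIER_UNIT_SECTOR_SIZE	(1<<17)	/* 64 MiB per barrier unit */
#define BARRIER_BUCKETS_NR_BITS		(PAGE_SHIFT - ilog2(sizeof(atomic_t)))
#define BARRIER_BUCKETS_NR		(1<<BARRIER_BUCKETS_NR_BITS)

static inline int sector_to_idx(sector_t sector)
{
	/* hash the barrier-unit number of 'sector' into a bucket */
	return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS,
			 BARRIER_BUCKETS_NR_BITS);
}

Worked example for align_to_barrier_unit_end() above, under these values: a 2048-sector write starting at sector 130048 lies 1024 sectors short of the 131072 boundary, so len = round_up(130049, 131072) - 130048 = 1024 and the bio is split there; the remaining 1024 sectors start exactly on the boundary and fit within the next unit.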
 /*
  * This routine returns the disk from which the requested read should
  * be done. There is a per-array 'next expected sequential IO' sector
@@ -813,168 +832,228 @@ static void flush_pending_writes(struct r1conf *conf)
  */
 static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
 {
+       int idx = sector_to_idx(sector_nr);
+
        spin_lock_irq(&conf->resync_lock);
 
        /* Wait until no block IO is waiting */
-       wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
+       wait_event_lock_irq(conf->wait_barrier,
+                           !atomic_read(&conf->nr_waiting[idx]),
                            conf->resync_lock);
 
        /* block any new IO from starting */
-       conf->barrier++;
-       conf->next_resync = sector_nr;
+       atomic_inc(&conf->barrier[idx]);
+       /*
+        * In raise_barrier() we first increase conf->barrier[idx], then
+        * check conf->nr_pending[idx]. In _wait_barrier() we first
+        * increase conf->nr_pending[idx], then check conf->barrier[idx].
+        * A memory barrier is needed here to make sure conf->nr_pending[idx]
+        * won't be fetched before conf->barrier[idx] is increased. Otherwise
+        * there will be a race between raise_barrier() and _wait_barrier().
+        */
+       smp_mb__after_atomic();
 
        /* For these conditions we must wait:
         * A: while the array is in frozen state
-        * B: while barrier >= RESYNC_DEPTH, meaning resync reach
-        *    the max count which allowed.
-        * C: next_resync + RESYNC_SECTORS > start_next_window, meaning
-        *    next resync will reach to the window which normal bios are
-        *    handling.
-        * D: while there are any active requests in the current window.
+        * B: while conf->nr_pending[idx] is not 0, meaning regular I/O
+        *    exists in the corresponding I/O barrier bucket.
+        * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning the max
+        *    resync count allowed on the current I/O barrier bucket has
+        *    been reached.
         */
        wait_event_lock_irq(conf->wait_barrier,
                            !conf->array_frozen &&
-                           conf->barrier < RESYNC_DEPTH &&
-                           conf->current_window_requests == 0 &&
-                           (conf->start_next_window >=
-                            conf->next_resync + RESYNC_SECTORS),
+                            !atomic_read(&conf->nr_pending[idx]) &&
+                            atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH,
                            conf->resync_lock);
 
-       conf->nr_pending++;
+       atomic_inc(&conf->nr_pending[idx]);
        spin_unlock_irq(&conf->resync_lock);
 }
 
-static void lower_barrier(struct r1conf *conf)
+static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
 {
-       unsigned long flags;
-       BUG_ON(conf->barrier <= 0);
-       spin_lock_irqsave(&conf->resync_lock, flags);
-       conf->barrier--;
-       conf->nr_pending--;
-       spin_unlock_irqrestore(&conf->resync_lock, flags);
+       int idx = sector_to_idx(sector_nr);
+
+       BUG_ON(atomic_read(&conf->barrier[idx]) <= 0);
+
+       atomic_dec(&conf->barrier[idx]);
+       atomic_dec(&conf->nr_pending[idx]);
        wake_up(&conf->wait_barrier);
 }
 
-static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
+static void _wait_barrier(struct r1conf *conf, int idx)
 {
-       bool wait = false;
+       /*
+        * We need to increase conf->nr_pending[idx] very early here,
+        * so that raise_barrier() will be blocked while it waits for
+        * conf->nr_pending[idx] to be 0. This way we avoid holding
+        * conf->resync_lock when there is no barrier raised in the same
+        * barrier unit bucket. Also, if the array is frozen, I/O
+        * should be blocked until the array is unfrozen.
+        */
+       atomic_inc(&conf->nr_pending[idx]);
+       /*
+        * In _wait_barrier() we first increase conf->nr_pending[idx], then
+        * check conf->barrier[idx]. In raise_barrier() we first increase
+        * conf->barrier[idx], then check conf->nr_pending[idx]. A memory
+        * barrier is necessary here to make sure conf->barrier[idx] won't be
+        * fetched before conf->nr_pending[idx] is increased. Otherwise there
+        * will be a race between _wait_barrier() and raise_barrier().
+        */
+       smp_mb__after_atomic();
 
-       if (conf->array_frozen || !bio)
-               wait = true;
-       else if (conf->barrier && bio_data_dir(bio) == WRITE) {
-               if ((conf->mddev->curr_resync_completed
-                    >= bio_end_sector(bio)) ||
-                   (conf->start_next_window + NEXT_NORMALIO_DISTANCE
-                    <= bio->bi_iter.bi_sector))
-                       wait = false;
-               else
-                       wait = true;
-       }
+       /*
+        * Don't worry about checking two atomic_t variables at the same
+        * time here. If, while we check conf->barrier[idx], the array
+        * becomes frozen (conf->array_frozen is 1) and conf->barrier[idx]
+        * is 0, it is safe to return and let the I/O continue. Because
+        * the array is frozen, all I/O returned here will eventually
+        * complete or be queued, and no race will happen. See the code
+        * comment in freeze_array().
+        */
+       if (!READ_ONCE(conf->array_frozen) &&
+           !atomic_read(&conf->barrier[idx]))
+               return;
 
-       return wait;
+       /*
+        * After holding conf->resync_lock, conf->nr_pending[idx]
+        * should be decreased before waiting for the barrier to drop.
+        * Otherwise, we may encounter a race condition because
+        * raise_barrier() might be waiting for conf->nr_pending[idx]
+        * to be 0 at the same time.
+        */
+       spin_lock_irq(&conf->resync_lock);
+       atomic_inc(&conf->nr_waiting[idx]);
+       atomic_dec(&conf->nr_pending[idx]);
+       /*
+        * In case freeze_array() is waiting for
+        * get_unqueued_pending() == extra
+        */
+       wake_up(&conf->wait_barrier);
+       /* Wait for the barrier in the same barrier unit bucket to drop. */
+       wait_event_lock_irq(conf->wait_barrier,
+                           !conf->array_frozen &&
+                            !atomic_read(&conf->barrier[idx]),
+                           conf->resync_lock);
+       atomic_inc(&conf->nr_pending[idx]);
+       atomic_dec(&conf->nr_waiting[idx]);
+       spin_unlock_irq(&conf->resync_lock);
 }
 
-static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
+static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
 {
-       sector_t sector = 0;
+       int idx = sector_to_idx(sector_nr);
 
-       spin_lock_irq(&conf->resync_lock);
-       if (need_to_wait_for_sync(conf, bio)) {
-               conf->nr_waiting++;
-               /* Wait for the barrier to drop.
-                * However if there are already pending
-                * requests (preventing the barrier from
-                * rising completely), and the
-                * per-process bio queue isn't empty,
-                * then don't wait, as we need to empty
-                * that queue to allow conf->start_next_window
-                * to increase.
-                */
-               raid1_log(conf->mddev, "wait barrier");
-               wait_event_lock_irq(conf->wait_barrier,
-                                   !conf->array_frozen &&
-                                   (!conf->barrier ||
-                                    ((conf->start_next_window <
-                                      conf->next_resync + RESYNC_SECTORS) &&
-                                     current->bio_list &&
-                                     !bio_list_empty(current->bio_list))),
-                                   conf->resync_lock);
-               conf->nr_waiting--;
-       }
-
-       if (bio && bio_data_dir(bio) == WRITE) {
-               if (bio->bi_iter.bi_sector >= conf->next_resync) {
-                       if (conf->start_next_window == MaxSector)
-                               conf->start_next_window =
-                                       conf->next_resync +
-                                       NEXT_NORMALIO_DISTANCE;
-
-                       if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
-                           <= bio->bi_iter.bi_sector)
-                               conf->next_window_requests++;
-                       else
-                               conf->current_window_requests++;
-                       sector = conf->start_next_window;
-               }
-       }
+       /*
+        * Very similar to _wait_barrier(). The difference is that for read
+        * I/O we don't need to wait for sync I/O, but if the whole array
+        * is frozen, the read I/O still has to wait until the array is
+        * unfrozen. Since there is no ordering requirement with
+        * conf->barrier[idx] here, a memory barrier is unnecessary as well.
+        */
+       atomic_inc(&conf->nr_pending[idx]);
 
-       conf->nr_pending++;
+       if (!READ_ONCE(conf->array_frozen))
+               return;
+
+       spin_lock_irq(&conf->resync_lock);
+       atomic_inc(&conf->nr_waiting[idx]);
+       atomic_dec(&conf->nr_pending[idx]);
+       /*
+        * In case freeze_array() is waiting for
+        * get_unqueued_pending() == extra
+        */
+       wake_up(&conf->wait_barrier);
+       /* Wait for array to be unfrozen */
+       wait_event_lock_irq(conf->wait_barrier,
+                           !conf->array_frozen,
+                           conf->resync_lock);
+       atomic_inc(&conf->nr_pending[idx]);
+       atomic_dec(&conf->nr_waiting[idx]);
        spin_unlock_irq(&conf->resync_lock);
-       return sector;
 }
 
-static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
-                         sector_t bi_sector)
+static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
 {
-       unsigned long flags;
+       int idx = sector_to_idx(sector_nr);
 
-       spin_lock_irqsave(&conf->resync_lock, flags);
-       conf->nr_pending--;
-       if (start_next_window) {
-               if (start_next_window == conf->start_next_window) {
-                       if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
-                           <= bi_sector)
-                               conf->next_window_requests--;
-                       else
-                               conf->current_window_requests--;
-               } else
-                       conf->current_window_requests--;
-
-               if (!conf->current_window_requests) {
-                       if (conf->next_window_requests) {
-                               conf->current_window_requests =
-                                       conf->next_window_requests;
-                               conf->next_window_requests = 0;
-                               conf->start_next_window +=
-                                       NEXT_NORMALIO_DISTANCE;
-                       } else
-                               conf->start_next_window = MaxSector;
-               }
-       }
-       spin_unlock_irqrestore(&conf->resync_lock, flags);
+       _wait_barrier(conf, idx);
+}
+
+static void wait_all_barriers(struct r1conf *conf)
+{
+       int idx;
+
+       for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
+               _wait_barrier(conf, idx);
+}
+
+static void _allow_barrier(struct r1conf *conf, int idx)
+{
+       atomic_dec(&conf->nr_pending[idx]);
        wake_up(&conf->wait_barrier);
 }
 
+static void allow_barrier(struct r1conf *conf, sector_t sector_nr)
+{
+       int idx = sector_to_idx(sector_nr);
+
+       _allow_barrier(conf, idx);
+}
+
+static void allow_all_barriers(struct r1conf *conf)
+{
+       int idx;
+
+       for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
+               _allow_barrier(conf, idx);
+}
+
+/* conf->resync_lock should be held */
+static int get_unqueued_pending(struct r1conf *conf)
+{
+       int idx, ret;
+
+       for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
+               ret += atomic_read(&conf->nr_pending[idx]) -
+                       atomic_read(&conf->nr_queued[idx]);
+
+       return ret;
+}
+
 static void freeze_array(struct r1conf *conf, int extra)
 {
-       /* stop syncio and normal IO and wait for everything to
+       /* Stop sync I/O and normal I/O and wait for everything to
         * go quiet.
-        * We wait until nr_pending match nr_queued+extra
-        * This is called in the context of one normal IO request
-        * that has failed. Thus any sync request that might be pending
-        * will be blocked by nr_pending, and we need to wait for
-        * pending IO requests to complete or be queued for re-try.
-        * Thus the number queued (nr_queued) plus this request (extra)
-        * must match the number of pending IOs (nr_pending) before
-        * we continue.
+        * This is called in two situations:
+        * 1) management command handlers (reshape, remove disk, quiesce).
+        * 2) one normal I/O request failed.
+        *
+        * After array_frozen is set to 1, new sync IO will be blocked at
+        * raise_barrier(), and new normal I/O will be blocked at
+        * _wait_barrier() or wait_read_barrier(). The in-flight I/Os will
+        * either complete or be queued. When everything goes quiet, only
+        * queued I/Os are left.
+        *
+        * Every in-flight I/O contributes to conf->nr_pending[idx], where
+        * idx is the barrier bucket index which this I/O request hits. When
+        * all sync and normal I/O are queued, the sum of all
+        * conf->nr_pending[] will match the sum of all conf->nr_queued[].
+        * But a normal I/O failure is an exception: in handle_read_error(),
+        * we may call freeze_array() before trying to fix the read error.
+        * In this case the failed read I/O is not queued, so
+        * get_unqueued_pending() == 1.
+        *
+        * Therefore, before this function returns, we need to wait until
+        * get_unqueued_pending(conf) equals extra. For the normal I/O
+        * context, extra is 1; in other situations, extra is 0.
+        */
        spin_lock_irq(&conf->resync_lock);
        conf->array_frozen = 1;
        raid1_log(conf->mddev, "wait freeze");
-       wait_event_lock_irq_cmd(conf->wait_barrier,
-                               conf->nr_pending == conf->nr_queued+extra,
-                               conf->resync_lock,
-                               flush_pending_writes(conf));
+       wait_event_lock_irq_cmd(
+               conf->wait_barrier,
+               get_unqueued_pending(conf) == extra,
+               conf->resync_lock,
+               flush_pending_writes(conf));
        spin_unlock_irq(&conf->resync_lock);
 }
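
The pairing that raise_barrier() and _wait_barrier() rely on distills to the sketch below (illustrative names, a single bucket, and a busy-wait where md actually sleeps on conf->wait_barrier).

static atomic_t barrier_cnt;	/* resync barriers raised */
static atomic_t pending_cnt;	/* regular I/O in flight  */

static void resync_raise(void)	/* cf. raise_barrier() */
{
	atomic_inc(&barrier_cnt);
	smp_mb__after_atomic();	/* pairs with the barrier in io_enter() */
	while (atomic_read(&pending_cnt))
		cpu_relax();	/* md sleeps on a waitqueue instead */
}

static bool io_enter(void)	/* cf. the _wait_barrier() fast path */
{
	atomic_inc(&pending_cnt);
	smp_mb__after_atomic();	/* pairs with the barrier in resync_raise() */
	if (atomic_read(&barrier_cnt)) {
		atomic_dec(&pending_cnt);	/* back off to the locked slow path */
		return false;
	}
	return true;		/* fast path: no spinlock taken */
}

The two full barriers guarantee that at least one side observes the other's increment, so regular I/O can never slip past a raised barrier while the resync thread believes the bucket is idle.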
 static void unfreeze_array(struct r1conf *conf)
@@ -982,8 +1061,8 @@ static void unfreeze_array(struct r1conf *conf)
        /* reverse the effect of the freeze */
        spin_lock_irq(&conf->resync_lock);
        conf->array_frozen = 0;
-       wake_up(&conf->wait_barrier);
        spin_unlock_irq(&conf->resync_lock);
+       wake_up(&conf->wait_barrier);
 }
 
 /* duplicate the data pages for behind I/O
@@ -1070,11 +1149,28 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
        kfree(plug);
 }
 
-static void raid1_read_request(struct mddev *mddev, struct bio *bio,
-                                struct r1bio *r1_bio)
+static inline struct r1bio *
+alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled)
+{
+       struct r1conf *conf = mddev->private;
+       struct r1bio *r1_bio;
+
+       r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+
+       r1_bio->master_bio = bio;
+       r1_bio->sectors = bio_sectors(bio) - sectors_handled;
+       r1_bio->state = 0;
+       r1_bio->mddev = mddev;
+       r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
+
+       return r1_bio;
+}
+
+static void raid1_read_request(struct mddev *mddev, struct bio *bio)
 {
        struct r1conf *conf = mddev->private;
        struct raid1_info *mirror;
+       struct r1bio *r1_bio;
        struct bio *read_bio;
        struct bitmap *bitmap = mddev->bitmap;
        const int op = bio_op(bio);
@@ -1083,8 +1179,29 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
        int max_sectors;
        int rdisk;
 
-       wait_barrier(conf, bio);
+       /*
+        * Still need barrier for READ in case that whole
+        * array is frozen.
+        */
+       wait_read_barrier(conf, bio->bi_iter.bi_sector);
+
+       r1_bio = alloc_r1bio(mddev, bio, 0);
 
+       /*
+        * We might need to issue multiple reads to different
+        * devices if there are bad blocks around, so we keep
+        * track of the number of reads in bio->bi_phys_segments.
+        * If this is 0, there is only one r1_bio and no locking
+        * will be needed when requests complete.  If it is
+        * non-zero, then it is the number of not-completed requests.
+        */
+       bio->bi_phys_segments = 0;
+       bio_clear_flag(bio, BIO_SEG_VALID);
+
+       /*
+        * make_request() can abort the operation when read-ahead is being
+        * used and no empty request is available.
+        */
 read_again:
        rdisk = read_balance(conf, r1_bio, &max_sectors);
 
@@ -1106,9 +1223,8 @@ read_again:
                           atomic_read(&bitmap->behind_writes) == 0);
        }
        r1_bio->read_disk = rdisk;
-       r1_bio->start_next_window = 0;
 
-       read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+       read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
        bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
                 max_sectors);
 
@@ -1151,22 +1267,16 @@ read_again:
                 */
                reschedule_retry(r1_bio);
 
-               r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
-
-               r1_bio->master_bio = bio;
-               r1_bio->sectors = bio_sectors(bio) - sectors_handled;
-               r1_bio->state = 0;
-               r1_bio->mddev = mddev;
-               r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
+               r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
                goto read_again;
        } else
                generic_make_request(read_bio);
 }
 
-static void raid1_write_request(struct mddev *mddev, struct bio *bio,
-                               struct r1bio *r1_bio)
+static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 {
        struct r1conf *conf = mddev->private;
+       struct r1bio *r1_bio;
        int i, disks;
        struct bitmap *bitmap = mddev->bitmap;
        unsigned long flags;
@@ -1176,7 +1286,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
        int first_clone;
        int sectors_handled;
        int max_sectors;
-       sector_t start_next_window;
 
        /*
         * Register the new request and wait if the reconstruction
@@ -1212,7 +1321,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                }
                finish_wait(&conf->wait_barrier, &w);
        }
-       start_next_window = wait_barrier(conf, bio);
+       wait_barrier(conf, bio->bi_iter.bi_sector);
+
+       r1_bio = alloc_r1bio(mddev, bio, 0);
+
+       /* We might need to issue multiple writes to different
+        * devices if there are bad blocks around, so we keep
+        * track of the number of writes in bio->bi_phys_segments.
+        * If this is 0, there is only one r1_bio and no locking
+        * will be needed when requests complete.  If it is
+        * non-zero, then it is the number of not-completed requests.
+        */
+       bio->bi_phys_segments = 0;
+       bio_clear_flag(bio, BIO_SEG_VALID);
 
        if (conf->pending_count >= max_queued_requests) {
                md_wakeup_thread(mddev->thread);
@@ -1233,7 +1354,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 
        disks = conf->raid_disks * 2;
  retry_write:
-       r1_bio->start_next_window = start_next_window;
        blocked_rdev = NULL;
        rcu_read_lock();
        max_sectors = r1_bio->sectors;
@@ -1300,25 +1420,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
        if (unlikely(blocked_rdev)) {
                /* Wait for this device to become unblocked */
                int j;
-               sector_t old = start_next_window;
 
                for (j = 0; j < i; j++)
                        if (r1_bio->bios[j])
                                rdev_dec_pending(conf->mirrors[j].rdev, mddev);
                r1_bio->state = 0;
-               allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector);
+               allow_barrier(conf, bio->bi_iter.bi_sector);
                raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
                md_wait_for_blocked_rdev(blocked_rdev, mddev);
-               start_next_window = wait_barrier(conf, bio);
-               /*
-                * We must make sure the multi r1bios of bio have
-                * the same value of bi_phys_segments
-                */
-               if (bio->bi_phys_segments && old &&
-                   old != start_next_window)
-                       /* Wait for the former r1bio(s) to complete */
-                       wait_event(conf->wait_barrier,
-                                  bio->bi_phys_segments == 1);
+               wait_barrier(conf, bio->bi_iter.bi_sector);
                goto retry_write;
        }
 
@@ -1341,13 +1451,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 
        first_clone = 1;
        for (i = 0; i < disks; i++) {
-               struct bio *mbio;
+               struct bio *mbio = NULL;
+               sector_t offset;
                if (!r1_bio->bios[i])
                        continue;
 
-               mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-               bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector,
-                        max_sectors);
+               offset = r1_bio->sector - bio->bi_iter.bi_sector;
 
                if (first_clone) {
                        /* do behind I/O ?
@@ -1357,8 +1466,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                        if (bitmap &&
                            (atomic_read(&bitmap->behind_writes)
                             < mddev->bitmap_info.max_write_behind) &&
-                           !waitqueue_active(&bitmap->behind_wait))
+                           !waitqueue_active(&bitmap->behind_wait)) {
+                               mbio = bio_clone_bioset_partial(bio, GFP_NOIO,
+                                                               mddev->bio_set,
+                                                               offset << 9,
+                                                               max_sectors << 9);
                                alloc_behind_pages(mbio, r1_bio);
+                       }
 
                        bitmap_startwrite(bitmap, r1_bio->sector,
                                          r1_bio->sectors,
@@ -1366,6 +1480,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                                                   &r1_bio->state));
                        first_clone = 0;
                }
+
+               if (!mbio) {
+                       if (r1_bio->behind_bvecs)
+                               mbio = bio_clone_bioset_partial(bio, GFP_NOIO,
+                                                               mddev->bio_set,
+                                                               offset << 9,
+                                                               max_sectors << 9);
+                       else {
+                               mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+                               bio_trim(mbio, offset, max_sectors);
+                       }
+               }
+
                if (r1_bio->behind_bvecs) {
                        struct bio_vec *bvec;
                        int j;
@@ -1385,8 +1512,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                                   conf->mirrors[i].rdev->data_offset);
                mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
                mbio->bi_end_io = raid1_end_write_request;
-               mbio->bi_opf = bio_op(bio) |
-                       (bio->bi_opf & (REQ_SYNC | REQ_PREFLUSH | REQ_FUA));
+               mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA));
                if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) &&
                    !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) &&
                    conf->raid_disks - mddev->degraded > 1)
@@ -1427,12 +1553,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                /* We need another r1_bio.  It has already been counted
                 * in bio->bi_phys_segments
                 */
-               r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
-               r1_bio->master_bio = bio;
-               r1_bio->sectors = bio_sectors(bio) - sectors_handled;
-               r1_bio->state = 0;
-               r1_bio->mddev = mddev;
-               r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
+               r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
                goto retry_write;
        }
 
@@ -1444,36 +1565,30 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 
 static void raid1_make_request(struct mddev *mddev, struct bio *bio)
 {
-       struct r1conf *conf = mddev->private;
-       struct r1bio *r1_bio;
+       struct bio *split;
+       sector_t sectors;
 
-       /*
-        * make_request() can abort the operation when read-ahead is being
-        * used and no empty request is available.
-        *
-        */
-       r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
-
-       r1_bio->master_bio = bio;
-       r1_bio->sectors = bio_sectors(bio);
-       r1_bio->state = 0;
-       r1_bio->mddev = mddev;
-       r1_bio->sector = bio->bi_iter.bi_sector;
+       if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
+               md_flush_request(mddev, bio);
+               return;
+       }
 
-       /*
-        * We might need to issue multiple reads to different devices if there
-        * are bad blocks around, so we keep track of the number of reads in
-        * bio->bi_phys_segments.  If this is 0, there is only one r1_bio and
-        * no locking will be needed when requests complete.  If it is
-        * non-zero, then it is the number of not-completed requests.
-        */
-       bio->bi_phys_segments = 0;
-       bio_clear_flag(bio, BIO_SEG_VALID);
+       /* if bio exceeds barrier unit boundary, split it */
+       do {
+               sectors = align_to_barrier_unit_end(
+                               bio->bi_iter.bi_sector, bio_sectors(bio));
+               if (sectors < bio_sectors(bio)) {
+                       split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
+                       bio_chain(split, bio);
+               } else {
+                       split = bio;
+               }
 
-       if (bio_data_dir(bio) == READ)
-               raid1_read_request(mddev, bio, r1_bio);
-       else
-               raid1_write_request(mddev, bio, r1_bio);
+               if (bio_data_dir(split) == READ)
+                       raid1_read_request(mddev, split);
+               else
+                       raid1_write_request(mddev, split);
+       } while (split != bio);
 }
 
 static void raid1_status(struct seq_file *seq, struct mddev *mddev)
@@ -1564,19 +1679,11 @@ static void print_conf(struct r1conf *conf)
 
 static void close_sync(struct r1conf *conf)
 {
-       wait_barrier(conf, NULL);
-       allow_barrier(conf, 0, 0);
+       wait_all_barriers(conf);
+       allow_all_barriers(conf);
 
        mempool_destroy(conf->r1buf_pool);
        conf->r1buf_pool = NULL;
-
-       spin_lock_irq(&conf->resync_lock);
-       conf->next_resync = MaxSector - 2 * NEXT_NORMALIO_DISTANCE;
-       conf->start_next_window = MaxSector;
-       conf->current_window_requests +=
-               conf->next_window_requests;
-       conf->next_window_requests = 0;
-       spin_unlock_irq(&conf->resync_lock);
 }
 
 static int raid1_spare_active(struct mddev *mddev)
@@ -2273,7 +2380,8 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
 
                        wbio->bi_vcnt = vcnt;
                } else {
-                       wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
+                       wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
+                                             mddev->bio_set);
                }
 
                bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
@@ -2323,8 +2431,9 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
 
 static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 {
-       int m;
+       int m, idx;
        bool fail = false;
+
        for (m = 0; m < conf->raid_disks * 2 ; m++)
                if (r1_bio->bios[m] == IO_MADE_GOOD) {
                        struct md_rdev *rdev = conf->mirrors[m].rdev;
@@ -2350,8 +2459,14 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
        if (fail) {
                spin_lock_irq(&conf->device_lock);
                list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
-               conf->nr_queued++;
+               idx = sector_to_idx(r1_bio->sector);
+               atomic_inc(&conf->nr_queued[idx]);
                spin_unlock_irq(&conf->device_lock);
+               /*
+                * In case freeze_array() is waiting for condition
+                * get_unqueued_pending() == extra to be true.
+                */
+               wake_up(&conf->wait_barrier);
                md_wakeup_thread(conf->mddev->thread);
        } else {
                if (test_bit(R1BIO_WriteError, &r1_bio->state))
@@ -2411,7 +2526,8 @@ read_more:
                const unsigned long do_sync
                        = r1_bio->master_bio->bi_opf & REQ_SYNC;
                r1_bio->read_disk = disk;
-               bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
+               bio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
+                                    mddev->bio_set);
                bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector,
                         max_sectors);
                r1_bio->bios[r1_bio->read_disk] = bio;
@@ -2445,15 +2561,8 @@ read_more:
                        generic_make_request(bio);
                        bio = NULL;
 
-                       r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
-
-                       r1_bio->master_bio = mbio;
-                       r1_bio->sectors = bio_sectors(mbio) - sectors_handled;
-                       r1_bio->state = 0;
+                       r1_bio = alloc_r1bio(mddev, mbio, sectors_handled);
                        set_bit(R1BIO_ReadError, &r1_bio->state);
-                       r1_bio->mddev = mddev;
-                       r1_bio->sector = mbio->bi_iter.bi_sector +
-                               sectors_handled;
 
                        goto read_more;
                } else {
@@ -2472,6 +2581,7 @@ static void raid1d(struct md_thread *thread)
        struct r1conf *conf = mddev->private;
        struct list_head *head = &conf->retry_list;
        struct blk_plug plug;
+       int idx;
 
        md_check_recovery(mddev);
 
@@ -2479,17 +2589,15 @@ static void raid1d(struct md_thread *thread)
            !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
                LIST_HEAD(tmp);
                spin_lock_irqsave(&conf->device_lock, flags);
-               if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
-                       while (!list_empty(&conf->bio_end_io_list)) {
-                               list_move(conf->bio_end_io_list.prev, &tmp);
-                               conf->nr_queued--;
-                       }
-               }
+               if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
+                       list_splice_init(&conf->bio_end_io_list, &tmp);
                spin_unlock_irqrestore(&conf->device_lock, flags);
                while (!list_empty(&tmp)) {
                        r1_bio = list_first_entry(&tmp, struct r1bio,
                                                  retry_list);
                        list_del(&r1_bio->retry_list);
+                       idx = sector_to_idx(r1_bio->sector);
+                       atomic_dec(&conf->nr_queued[idx]);
                        if (mddev->degraded)
                                set_bit(R1BIO_Degraded, &r1_bio->state);
                        if (test_bit(R1BIO_WriteError, &r1_bio->state))
@@ -2510,7 +2618,8 @@ static void raid1d(struct md_thread *thread)
                }
                r1_bio = list_entry(head->prev, struct r1bio, retry_list);
                list_del(head->prev);
-               conf->nr_queued--;
+               idx = sector_to_idx(r1_bio->sector);
+               atomic_dec(&conf->nr_queued[idx]);
                spin_unlock_irqrestore(&conf->device_lock, flags);
 
                mddev = r1_bio->mddev;
@@ -2549,7 +2658,6 @@ static int init_resync(struct r1conf *conf)
                                          conf->poolinfo);
        if (!conf->r1buf_pool)
                return -ENOMEM;
-       conf->next_resync = 0;
        return 0;
 }
 
@@ -2578,6 +2686,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
        int still_degraded = 0;
        int good_sectors = RESYNC_SECTORS;
        int min_bad = 0; /* number of sectors that are bad in all devices */
+       int idx = sector_to_idx(sector_nr);
 
        if (!conf->r1buf_pool)
                if (init_resync(conf))
@@ -2627,7 +2736,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
         * If there is non-resync activity waiting for a turn, then let it
         * through before starting on this new sync request.
         */
-       if (conf->nr_waiting)
+       if (atomic_read(&conf->nr_waiting[idx]))
                schedule_timeout_uninterruptible(1);
 
        /* we are incrementing sector_nr below. To be safe, we check against
@@ -2654,6 +2763,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
        r1_bio->sector = sector_nr;
        r1_bio->state = 0;
        set_bit(R1BIO_IsSync, &r1_bio->state);
+       /* make sure good_sectors won't go across barrier unit boundary */
+       good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors);
 
        for (i = 0; i < conf->raid_disks * 2; i++) {
                struct md_rdev *rdev;
@@ -2884,6 +2995,26 @@ static struct r1conf *setup_conf(struct mddev *mddev)
        if (!conf)
                goto abort;
 
+       conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR,
+                                  sizeof(atomic_t), GFP_KERNEL);
+       if (!conf->nr_pending)
+               goto abort;
+
+       conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR,
+                                  sizeof(atomic_t), GFP_KERNEL);
+       if (!conf->nr_waiting)
+               goto abort;
+
+       conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR,
+                                 sizeof(atomic_t), GFP_KERNEL);
+       if (!conf->nr_queued)
+               goto abort;
+
+       conf->barrier = kcalloc(BARRIER_BUCKETS_NR,
+                               sizeof(atomic_t), GFP_KERNEL);
+       if (!conf->barrier)
+               goto abort;
+
        conf->mirrors = kzalloc(sizeof(struct raid1_info)
                                * mddev->raid_disks * 2,
                                 GFP_KERNEL);
@@ -2939,9 +3070,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
        conf->pending_count = 0;
        conf->recovery_disabled = mddev->recovery_disabled - 1;
 
-       conf->start_next_window = MaxSector;
-       conf->current_window_requests = conf->next_window_requests = 0;
-
        err = -EIO;
        for (i = 0; i < conf->raid_disks * 2; i++) {
 
@@ -2984,6 +3112,10 @@ static struct r1conf *setup_conf(struct mddev *mddev)
                kfree(conf->mirrors);
                safe_put_page(conf->tmppage);
                kfree(conf->poolinfo);
+               kfree(conf->nr_pending);
+               kfree(conf->nr_waiting);
+               kfree(conf->nr_queued);
+               kfree(conf->barrier);
                kfree(conf);
        }
        return ERR_PTR(err);
@@ -3085,6 +3217,10 @@ static void raid1_free(struct mddev *mddev, void *priv)
        kfree(conf->mirrors);
        safe_put_page(conf->tmppage);
        kfree(conf->poolinfo);
+       kfree(conf->nr_pending);
+       kfree(conf->nr_waiting);
+       kfree(conf->nr_queued);
+       kfree(conf->barrier);
        kfree(conf);
 }
 
index c52ef424a24b2313949971143a162c959e8f068d..dd22a37d0d8332e12785b9c270445aba09cce576 100644 (file)
@@ -1,6 +1,30 @@
 #ifndef _RAID1_H
 #define _RAID1_H
 
+/*
+ * each barrier unit size is 64MB for now
+ * note: it must be larger than RESYNC_DEPTH
+ */
+#define BARRIER_UNIT_SECTOR_BITS       17
+#define BARRIER_UNIT_SECTOR_SIZE       (1<<17)
+/*
+ * In struct r1conf, the following members are related to I/O barrier
+ * buckets,
+ *     atomic_t        *nr_pending;
+ *     atomic_t        *nr_waiting;
+ *     atomic_t        *nr_queued;
+ *     atomic_t        *barrier;
+ * Each of them points to an array of atomic_t variables; each array is
+ * designed to have BARRIER_BUCKETS_NR elements and occupy a single
+ * memory page. The data width of atomic_t variables is 4 bytes, equal
+ * to 1<<(ilog2(sizeof(atomic_t))); BARRIER_BUCKETS_NR_BITS is defined
+ * as (PAGE_SHIFT - ilog2(sizeof(atomic_t))) to make sure an array of
+ * atomic_t variables with BARRIER_BUCKETS_NR elements exactly
+ * occupies a single memory page.
+ */
+#define BARRIER_BUCKETS_NR_BITS                (PAGE_SHIFT - ilog2(sizeof(atomic_t)))
+#define BARRIER_BUCKETS_NR             (1<<BARRIER_BUCKETS_NR_BITS)
+
 struct raid1_info {
        struct md_rdev  *rdev;
        sector_t        head_position;
@@ -35,25 +59,6 @@ struct r1conf {
                                                 */
        int                     raid_disks;
 
-       /* During resync, read_balancing is only allowed on the part
-        * of the array that has been resynced.  'next_resync' tells us
-        * where that is.
-        */
-       sector_t                next_resync;
-
-       /* When raid1 starts resync, we divide array into four partitions
-        * |---------|--------------|---------------------|-------------|
-        *        next_resync   start_next_window       end_window
-        * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE
-        * end_window = start_next_window + NEXT_NORMALIO_DISTANCE
-        * current_window_requests means the count of normalIO between
-        *   start_next_window and end_window.
-        * next_window_requests means the count of normalIO after end_window.
-        * */
-       sector_t                start_next_window;
-       int                     current_window_requests;
-       int                     next_window_requests;
-
        spinlock_t              device_lock;
 
        /* list of 'struct r1bio' that need to be processed by raid1d,
@@ -79,10 +84,10 @@ struct r1conf {
         */
        wait_queue_head_t       wait_barrier;
        spinlock_t              resync_lock;
-       int                     nr_pending;
-       int                     nr_waiting;
-       int                     nr_queued;
-       int                     barrier;
+       atomic_t                *nr_pending;
+       atomic_t                *nr_waiting;
+       atomic_t                *nr_queued;
+       atomic_t                *barrier;
        int                     array_frozen;
 
        /* Set to 1 if a full sync is needed, (fresh device added).
@@ -135,7 +140,6 @@ struct r1bio {
                                                 * in this BehindIO request
                                                 */
        sector_t                sector;
-       sector_t                start_next_window;
        int                     sectors;
        unsigned long           state;
        struct mddev            *mddev;
@@ -185,4 +189,10 @@ enum r1bio_state {
        R1BIO_WriteError,
        R1BIO_FailFast,
 };
+
+static inline int sector_to_idx(sector_t sector)
+{
+       return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS,
+                        BARRIER_BUCKETS_NR_BITS);
+}
 #endif
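
To make the bucket arithmetic above concrete: assuming 4 kB pages (PAGE_SHIFT = 12) and sizeof(atomic_t) = 4, BARRIER_BUCKETS_NR_BITS works out to 10, so each of the four counter arrays has 1024 entries and fills exactly one page, and sector_to_idx() hashes the 64 MB barrier-unit number of a sector into one of those 1024 buckets. The following is a minimal userspace sketch of that mapping; the multiplicative hash is only a hypothetical stand-in for the kernel's hash_long(), not the real implementation.

    /* sketch: hash a sector into a barrier bucket (userspace approximation) */
    #include <stdio.h>
    #include <stdint.h>

    #define BARRIER_UNIT_SECTOR_BITS   17                 /* 64MB units, 512-byte sectors */
    #define PAGE_SHIFT                 12                 /* assumption: 4kB pages */
    #define BARRIER_BUCKETS_NR_BITS    (PAGE_SHIFT - 2)   /* sizeof(atomic_t) == 4 */
    #define BARRIER_BUCKETS_NR         (1 << BARRIER_BUCKETS_NR_BITS)

    /* crude stand-in for the kernel's hash_long(); illustration only */
    static unsigned int hash_bits(uint64_t val, unsigned int bits)
    {
            return (unsigned int)((val * 0x9e3779b97f4a7c15ULL) >> (64 - bits));
    }

    static int sector_to_idx(uint64_t sector)
    {
            return hash_bits(sector >> BARRIER_UNIT_SECTOR_BITS,
                             BARRIER_BUCKETS_NR_BITS);
    }

    int main(void)
    {
            /* sectors 0 and 1000 sit in the same 64MB unit, so they share a bucket;
             * sector 1<<20 (512MB into the array) lands in a different unit */
            printf("%d %d %d\n", sector_to_idx(0), sector_to_idx(1000),
                   sector_to_idx((uint64_t)1 << 20));
            return 0;
    }

The point of the per-bucket counters is that normal I/O and the resync barrier only interact within the bucket their 64 MB unit hashes to, instead of serializing on a single window for the whole array.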
index 6bc5c2a85160e2654050716ef9270c1de3e903a3..063c43d83b72c2f0f753edb7b08f8dd608fa15ad 100644 (file)
@@ -1132,7 +1132,7 @@ read_again:
        }
        slot = r10_bio->read_slot;
 
-       read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+       read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
        bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
                 max_sectors);
 
@@ -1406,7 +1406,7 @@ retry_write:
                int d = r10_bio->devs[i].devnum;
                if (r10_bio->devs[i].bio) {
                        struct md_rdev *rdev = conf->mirrors[d].rdev;
-                       mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+                       mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
                        bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
                                 max_sectors);
                        r10_bio->devs[i].bio = mbio;
@@ -1457,7 +1457,7 @@ retry_write:
                                smp_mb();
                                rdev = conf->mirrors[d].rdev;
                        }
-                       mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+                       mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
                        bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
                                 max_sectors);
                        r10_bio->devs[i].repl_bio = mbio;
@@ -2565,7 +2565,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
                if (sectors > sect_to_write)
                        sectors = sect_to_write;
                /* Write at 'sector' for 'sectors' */
-               wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+               wbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
                bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
                wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
                wbio->bi_iter.bi_sector = wsector +
@@ -2641,8 +2641,7 @@ read_more:
                           mdname(mddev),
                           bdevname(rdev->bdev, b),
                           (unsigned long long)r10_bio->sector);
-       bio = bio_clone_mddev(r10_bio->master_bio,
-                             GFP_NOIO, mddev);
+       bio = bio_clone_fast(r10_bio->master_bio, GFP_NOIO, mddev->bio_set);
        bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
        r10_bio->devs[slot].bio = bio;
        r10_bio->devs[slot].rdev = rdev;
index 302dea3296ba5ccd07740365314f45d74df49ec2..3f307be01b10cc70eb7b08bc31b9a2a3717372b8 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/crc32c.h>
 #include <linux/random.h>
 #include <linux/kthread.h>
+#include <linux/types.h>
 #include "md.h"
 #include "raid5.h"
 #include "bitmap.h"
@@ -164,8 +165,59 @@ struct r5l_log {
        struct work_struct deferred_io_work;
        /* to disable write back during in degraded mode */
        struct work_struct disable_writeback_work;
+
+       /* for chunk_aligned_read in write-back mode, details below */
+       spinlock_t tree_lock;
+       struct radix_tree_root big_stripe_tree;
 };
 
+/*
+ * Enable chunk_aligned_read() with write back cache.
+ *
+ * Each chunk may contain more than one stripe (for example, a 256kB
+ * chunk contains 64 4kB pages, so this chunk contains 64 stripes). For
+ * chunk_aligned_read, these stripes are grouped into one "big_stripe".
+ * For each big_stripe, we count how many stripes of this big_stripe
+ * are in the write back cache. This count is tracked in a radix tree
+ * (big_stripe_tree). We use the radix_tree item pointer as the counter.
+ * r5c_tree_index() is used to calculate keys for the radix tree.
+ *
+ * chunk_aligned_read() calls r5c_big_stripe_cached() to look up the
+ * big_stripe of each chunk in the tree. If this big_stripe is in the
+ * tree, chunk_aligned_read() aborts. This lookup is protected by
+ * rcu_read_lock().
+ *
+ * It is necessary to remember whether a stripe is counted in
+ * big_stripe_tree. Instead of adding a new flag, we reuse the existing flags:
+ * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these
+ * two flags is set, the stripe is counted in big_stripe_tree. This
+ * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to
+ * r5c_try_caching_write(), and moving the clear_bit of
+ * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to
+ * r5c_finish_stripe_write_out().
+ */
+
+/*
+ * The radix tree requires the lowest 2 bits of the data pointer to be 2b'00,
+ * so it is necessary to left shift the counter by 2 bits before using it
+ * as the data pointer of the tree.
+ */
+#define R5C_RADIX_COUNT_SHIFT 2
+
+/*
+ * calculate key for big_stripe_tree
+ *
+ * sect: align_bi->bi_iter.bi_sector or sh->sector
+ */
+static inline sector_t r5c_tree_index(struct r5conf *conf,
+                                     sector_t sect)
+{
+       sector_t offset;
+
+       offset = sector_div(sect, conf->chunk_sectors);
+       return sect;
+}
+
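
The counter-in-pointer encoding described above can be shown in isolation: the per-chunk stripe count is stored directly as the radix-tree item, shifted left by R5C_RADIX_COUNT_SHIFT so the two low bits stay clear as the tree requires. Below is a small, self-contained sketch (plain C, hypothetical helper names, not the kernel code) of the encode/decode step used when a stripe is added to or removed from big_stripe_tree.

    #include <stdio.h>
    #include <stdint.h>

    #define R5C_RADIX_COUNT_SHIFT 2   /* low 2 bits of an item pointer must be 0 */

    /* encode a counter as an item "pointer" suitable for the tree */
    static void *count_to_item(uintptr_t count)
    {
            return (void *)(count << R5C_RADIX_COUNT_SHIFT);
    }

    /* recover the counter from a stored item */
    static uintptr_t item_to_count(void *item)
    {
            return (uintptr_t)item >> R5C_RADIX_COUNT_SHIFT;
    }

    int main(void)
    {
            void *item;

            item = count_to_item(1);                          /* first cached stripe of a chunk */
            item = count_to_item(item_to_count(item) + 1);    /* second stripe cached */
            printf("cached stripes in this big_stripe: %lu\n",
                   (unsigned long)item_to_count(item));       /* prints 2 */

            item = count_to_item(item_to_count(item) - 1);    /* one stripe written out */
            if (item_to_count(item) == 0)
                    printf("last stripe gone: delete the tree slot\n");
            else
                    printf("still cached: keep the slot (count=%lu)\n",
                           (unsigned long)item_to_count(item));
            return 0;
    }

This mirrors the update logic in r5c_try_caching_write() (insert with count 1, or replace with count + 1) and r5c_finish_stripe_write_out() (delete the slot when the count drops to zero, otherwise replace with count - 1).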
 /*
  * an IO range starts from a meta data block and ends at the next meta data
  * block. The io unit's meta data block tracks the data/parity that follows it. io
@@ -337,17 +389,30 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
 /*
  * Total log space (in sectors) needed to flush all data in cache
  *
- * Currently, writing-out phase automatically includes all pending writes
- * to the same sector. So the reclaim of each stripe takes up to
- * (conf->raid_disks + 1) pages of log space.
+ * To avoid deadlock due to log space, it is necessary to reserve log
+ * space to flush critical stripes (stripes that occupy log space near
+ * last_checkpoint). This function helps check how much log space is
+ * required to flush all cached stripes.
  *
- * To totally avoid deadlock due to log space, the code reserves
- * (conf->raid_disks + 1) pages for each stripe in cache, which is not
- * necessary in most cases.
+ * To reduce log space requirements, two mechanisms are used to give cache
+ * flushes higher priority:
+ *    1. In handle_stripe_dirtying() and schedule_reconstruction(),
+ *       stripes ALREADY in journal can be flushed w/o pending writes;
+ *    2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal
+ *       can be delayed (r5l_add_no_space_stripe).
  *
- * To improve this, we will need writing-out phase to be able to NOT include
- * pending writes, which will reduce the requirement to
- * (conf->max_degraded + 1) pages per stripe in cache.
+ * In cache flush, the stripe goes through 1 and then 2. For a stripe that
+ * already passed 1, flushing it requires at most (conf->max_degraded + 1)
+ * pages of journal space. For a stripe that has not passed 1, flushing it
+ * requires (conf->raid_disks + 1) pages of journal space. There are at
+ * most (conf->group_cnt + 1) stripes that have passed 1. So the total
+ * journal space required to flush all cached stripes (in pages) is:
+ *
+ *     (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) +
+ *     (group_cnt + 1) * (raid_disks + 1)
+ * or
+ *     (stripe_in_journal_count) * (max_degraded + 1) +
+ *     (group_cnt + 1) * (raid_disks - max_degraded)
  */
 static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
 {
@@ -356,8 +421,9 @@ static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
        if (!r5c_is_writeback(log))
                return 0;
 
-       return BLOCK_SECTORS * (conf->raid_disks + 1) *
-               atomic_read(&log->stripe_in_journal_count);
+       return BLOCK_SECTORS *
+               ((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) +
+                (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1));
 }
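
As a hedged worked example of the new bound (hypothetical numbers, not taken from the patch): for a 5-disk raid5 array (max_degraded = 1) with group_cnt = 2 and 100 stripes currently in the journal, the estimate is (1 + 1) * 100 + (5 - 1) * (2 + 1) = 212 pages, i.e. 212 * BLOCK_SECTORS sectors (1696 sectors with 4 kB pages), whereas the old per-stripe worst case of (raid_disks + 1) pages would have reserved (5 + 1) * 100 = 600 pages.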
 
 /*
@@ -412,16 +478,6 @@ void r5c_make_stripe_write_out(struct stripe_head *sh)
 
        if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                atomic_inc(&conf->preread_active_stripes);
-
-       if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
-               BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
-               atomic_dec(&conf->r5c_cached_partial_stripes);
-       }
-
-       if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
-               BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
-               atomic_dec(&conf->r5c_cached_full_stripes);
-       }
 }
 
 static void r5c_handle_data_cached(struct stripe_head *sh)
@@ -1271,6 +1327,10 @@ static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
        atomic_inc(&conf->active_stripes);
        r5c_make_stripe_write_out(sh);
 
+       if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
+               atomic_inc(&conf->r5c_flushing_partial_stripes);
+       else
+               atomic_inc(&conf->r5c_flushing_full_stripes);
        raid5_release_stripe(sh);
 }
 
@@ -1313,12 +1373,16 @@ static void r5c_do_reclaim(struct r5conf *conf)
        unsigned long flags;
        int total_cached;
        int stripes_to_flush;
+       int flushing_partial, flushing_full;
 
        if (!r5c_is_writeback(log))
                return;
 
+       flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes);
+       flushing_full = atomic_read(&conf->r5c_flushing_full_stripes);
        total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
-               atomic_read(&conf->r5c_cached_full_stripes);
+               atomic_read(&conf->r5c_cached_full_stripes) -
+               flushing_full - flushing_partial;
 
        if (total_cached > conf->min_nr_stripes * 3 / 4 ||
            atomic_read(&conf->empty_inactive_list_nr) > 0)
@@ -1328,7 +1392,7 @@ static void r5c_do_reclaim(struct r5conf *conf)
                 */
                stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
        else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
-                atomic_read(&conf->r5c_cached_full_stripes) >
+                atomic_read(&conf->r5c_cached_full_stripes) - flushing_full >
                 R5C_FULL_STRIPE_FLUSH_BATCH)
                /*
                 * if stripe cache pressure is moderate, or if there are many full
@@ -1362,9 +1426,9 @@ static void r5c_do_reclaim(struct r5conf *conf)
                            !test_bit(STRIPE_HANDLE, &sh->state) &&
                            atomic_read(&sh->count) == 0) {
                                r5c_flush_stripe(conf, sh);
+                               if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
+                                       break;
                        }
-                       if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
-                               break;
                }
                spin_unlock(&conf->device_lock);
                spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
@@ -2320,6 +2384,10 @@ int r5c_try_caching_write(struct r5conf *conf,
        int i;
        struct r5dev *dev;
        int to_cache = 0;
+       void **pslot;
+       sector_t tree_index;
+       int ret;
+       uintptr_t refcount;
 
        BUG_ON(!r5c_is_writeback(log));
 
@@ -2364,6 +2432,44 @@ int r5c_try_caching_write(struct r5conf *conf,
                }
        }
 
+       /* if the stripe is not counted in big_stripe_tree, add it now */
+       if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
+           !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+               tree_index = r5c_tree_index(conf, sh->sector);
+               spin_lock(&log->tree_lock);
+               pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
+                                              tree_index);
+               if (pslot) {
+                       refcount = (uintptr_t)radix_tree_deref_slot_protected(
+                               pslot, &log->tree_lock) >>
+                               R5C_RADIX_COUNT_SHIFT;
+                       radix_tree_replace_slot(
+                               &log->big_stripe_tree, pslot,
+                               (void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT));
+               } else {
+                       /*
+                        * this radix_tree_insert can fail safely, so no
+                        * need to call radix_tree_preload()
+                        */
+                       ret = radix_tree_insert(
+                               &log->big_stripe_tree, tree_index,
+                               (void *)(1 << R5C_RADIX_COUNT_SHIFT));
+                       if (ret) {
+                               spin_unlock(&log->tree_lock);
+                               r5c_make_stripe_write_out(sh);
+                               return -EAGAIN;
+                       }
+               }
+               spin_unlock(&log->tree_lock);
+
+               /*
+                * set STRIPE_R5C_PARTIAL_STRIPE, this shows the stripe is
+                * counted in the radix tree
+                */
+               set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
+               atomic_inc(&conf->r5c_cached_partial_stripes);
+       }
+
        for (i = disks; i--; ) {
                dev = &sh->dev[i];
                if (dev->towrite) {
@@ -2438,17 +2544,20 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
                                 struct stripe_head *sh,
                                 struct stripe_head_state *s)
 {
+       struct r5l_log *log = conf->log;
        int i;
        int do_wakeup = 0;
+       sector_t tree_index;
+       void **pslot;
+       uintptr_t refcount;
 
-       if (!conf->log ||
-           !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
+       if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
                return;
 
        WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
        clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
 
-       if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
+       if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
                return;
 
        for (i = sh->disks; i--; ) {
@@ -2470,12 +2579,45 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
        if (do_wakeup)
                wake_up(&conf->wait_for_overlap);
 
-       spin_lock_irq(&conf->log->stripe_in_journal_lock);
+       spin_lock_irq(&log->stripe_in_journal_lock);
        list_del_init(&sh->r5c);
-       spin_unlock_irq(&conf->log->stripe_in_journal_lock);
+       spin_unlock_irq(&log->stripe_in_journal_lock);
        sh->log_start = MaxSector;
-       atomic_dec(&conf->log->stripe_in_journal_count);
-       r5c_update_log_state(conf->log);
+
+       atomic_dec(&log->stripe_in_journal_count);
+       r5c_update_log_state(log);
+
+       /* stop counting this stripe in big_stripe_tree */
+       if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) ||
+           test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+               tree_index = r5c_tree_index(conf, sh->sector);
+               spin_lock(&log->tree_lock);
+               pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
+                                              tree_index);
+               BUG_ON(pslot == NULL);
+               refcount = (uintptr_t)radix_tree_deref_slot_protected(
+                       pslot, &log->tree_lock) >>
+                       R5C_RADIX_COUNT_SHIFT;
+               if (refcount == 1)
+                       radix_tree_delete(&log->big_stripe_tree, tree_index);
+               else
+                       radix_tree_replace_slot(
+                               &log->big_stripe_tree, pslot,
+                               (void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT));
+               spin_unlock(&log->tree_lock);
+       }
+
+       if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
+               BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
+               atomic_dec(&conf->r5c_flushing_partial_stripes);
+               atomic_dec(&conf->r5c_cached_partial_stripes);
+       }
+
+       if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+               BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
+               atomic_dec(&conf->r5c_flushing_full_stripes);
+               atomic_dec(&conf->r5c_cached_full_stripes);
+       }
 }
 
 int
@@ -2535,6 +2677,22 @@ r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
        return 0;
 }
 
+/* check whether this big stripe is in write back cache. */
+bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
+{
+       struct r5l_log *log = conf->log;
+       sector_t tree_index;
+       void *slot;
+
+       if (!log)
+               return false;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+       tree_index = r5c_tree_index(conf, sect);
+       slot = radix_tree_lookup(&log->big_stripe_tree, tree_index);
+       return slot != NULL;
+}
+
 static int r5l_load_log(struct r5l_log *log)
 {
        struct md_rdev *rdev = log->rdev;
@@ -2681,6 +2839,9 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
        if (!log->meta_pool)
                goto out_mempool;
 
+       spin_lock_init(&log->tree_lock);
+       INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);
+
        log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
                                                 log->rdev->mddev, "reclaim");
        if (!log->reclaim_thread)
index 6214e699342c87d7cdcb83e385530dff808fa918..2ce23b01dbb21da6ae17664df085c37c0a63e157 100644 (file)
@@ -281,13 +281,13 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
                                                atomic_dec(&conf->r5c_cached_partial_stripes);
                                        list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
                                        r5c_check_cached_full_stripe(conf);
-                               } else {
-                                       /* partial stripe */
-                                       if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
-                                                             &sh->state))
-                                               atomic_inc(&conf->r5c_cached_partial_stripes);
+                               } else
+                                       /*
+                                        * STRIPE_R5C_PARTIAL_STRIPE is set in
+                                        * r5c_try_caching_write(). No need to
+                                        * set it again.
+                                        */
                                        list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
-                               }
                        }
                }
        }
@@ -353,17 +353,15 @@ static void release_inactive_stripe_list(struct r5conf *conf,
 static int release_stripe_list(struct r5conf *conf,
                               struct list_head *temp_inactive_list)
 {
-       struct stripe_head *sh;
+       struct stripe_head *sh, *t;
        int count = 0;
        struct llist_node *head;
 
        head = llist_del_all(&conf->released_stripes);
        head = llist_reverse_order(head);
-       while (head) {
+       llist_for_each_entry_safe(sh, t, head, release_list) {
                int hash;
 
-               sh = llist_entry(head, struct stripe_head, release_list);
-               head = llist_next(head);
                /* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
                smp_mb();
                clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
@@ -863,6 +861,43 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
        return 1;
 }
 
+static void flush_deferred_bios(struct r5conf *conf)
+{
+       struct bio_list tmp;
+       struct bio *bio;
+
+       if (!conf->batch_bio_dispatch || !conf->group_cnt)
+               return;
+
+       bio_list_init(&tmp);
+       spin_lock(&conf->pending_bios_lock);
+       bio_list_merge(&tmp, &conf->pending_bios);
+       bio_list_init(&conf->pending_bios);
+       spin_unlock(&conf->pending_bios_lock);
+
+       while ((bio = bio_list_pop(&tmp)))
+               generic_make_request(bio);
+}
+
+static void defer_bio_issue(struct r5conf *conf, struct bio *bio)
+{
+       /*
+        * changing group_cnt will drain all bios, so this is safe
+        *
+        * A read generally means a read-modify-write, which usually means a
+        * random write, so we don't delay it
+        */
+       if (!conf->batch_bio_dispatch || !conf->group_cnt ||
+           bio_op(bio) == REQ_OP_READ) {
+               generic_make_request(bio);
+               return;
+       }
+       spin_lock(&conf->pending_bios_lock);
+       bio_list_add(&conf->pending_bios, bio);
+       spin_unlock(&conf->pending_bios_lock);
+       md_wakeup_thread(conf->mddev->thread);
+}
+
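
The two helpers above implement a simple deferred-dispatch queue: submitters append to conf->pending_bios under pending_bios_lock and wake the md thread, and raid5d later drains the whole backlog in one pass via flush_deferred_bios(), which tends to produce larger, better-ordered submissions on rotational disks (batching is switched off in setup_conf() further down when any non-journal member is non-rotational). A rough userspace sketch of the same shape, with hypothetical types standing in for struct bio, bio_list and the md thread wakeup:

    #include <stdio.h>
    #include <pthread.h>

    /* hypothetical stand-ins for struct bio / struct bio_list */
    struct fake_bio { int id; struct fake_bio *next; };
    struct fake_bio_list { struct fake_bio *head, *tail; };

    static struct fake_bio_list pending;                  /* conf->pending_bios */
    static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;

    static void submit_now(struct fake_bio *bio)          /* generic_make_request() stand-in */
    {
            printf("submit bio %d\n", bio->id);
    }

    static void defer_bio_issue(struct fake_bio *bio)     /* producer side */
    {
            pthread_mutex_lock(&pending_lock);
            bio->next = NULL;
            if (pending.tail)
                    pending.tail->next = bio;
            else
                    pending.head = bio;
            pending.tail = bio;
            pthread_mutex_unlock(&pending_lock);
            /* here the kernel wakes the md thread to run the flush below */
    }

    static void flush_deferred_bios(void)                 /* consumer side (raid5d) */
    {
            struct fake_bio *head, *next;

            pthread_mutex_lock(&pending_lock);
            head = pending.head;
            pending.head = pending.tail = NULL;
            pthread_mutex_unlock(&pending_lock);

            for (; head; head = next) {
                    next = head->next;
                    submit_now(head);                      /* one batched pass over the backlog */
            }
    }

    int main(void)
    {
            struct fake_bio a = { .id = 1 }, b = { .id = 2 };

            defer_bio_issue(&a);
            defer_bio_issue(&b);
            flush_deferred_bios();                         /* submits 1 then 2 */
            return 0;
    }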
 static void
 raid5_end_read_request(struct bio *bi);
 static void
@@ -1043,7 +1078,7 @@ again:
                                trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
                                                      bi, disk_devt(conf->mddev->gendisk),
                                                      sh->dev[i].sector);
-                       generic_make_request(bi);
+                       defer_bio_issue(conf, bi);
                }
                if (rrdev) {
                        if (s->syncing || s->expanding || s->expanded
@@ -1088,7 +1123,7 @@ again:
                                trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
                                                      rbi, disk_devt(conf->mddev->gendisk),
                                                      sh->dev[i].sector);
-                       generic_make_request(rbi);
+                       defer_bio_issue(conf, rbi);
                }
                if (!rdev && !rrdev) {
                        if (op_is_write(op))
@@ -2914,12 +2949,36 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
  *      like to flush data in journal to RAID disks first, so complex rmw
  *      is handled in the write path (handle_stripe_dirtying).
  *
+ *   2. when journal space is critical (R5C_LOG_CRITICAL=1)
+ *
+ *      It is important to be able to flush all stripes in raid5-cache.
+ *      Therefore, we need to reserve some space on the journal device for
+ *      these flushes. If the flush operation includes pending writes to the
+ *      stripe, we need to reserve (conf->raid_disks + 1) pages per stripe
+ *      for the flush out. If we exclude these pending writes from the flush
+ *      operation, we only need (conf->max_degraded + 1) pages per stripe.
+ *      Therefore, excluding pending writes in these cases enables more
+ *      efficient use of the journal device.
+ *
+ *      Note: To make sure the stripe makes progress, we only delay
+ *      towrite for stripes with data already in journal (injournal > 0).
+ *      When LOG_CRITICAL, stripes with injournal == 0 will be sent to
+ *      no_space_stripes list.
+ *
  */
-static inline bool delay_towrite(struct r5dev *dev,
-                                  struct stripe_head_state *s)
+static inline bool delay_towrite(struct r5conf *conf,
+                                struct r5dev *dev,
+                                struct stripe_head_state *s)
 {
-       return !test_bit(R5_OVERWRITE, &dev->flags) &&
-               !test_bit(R5_Insync, &dev->flags) && s->injournal;
+       /* case 1 above */
+       if (!test_bit(R5_OVERWRITE, &dev->flags) &&
+           !test_bit(R5_Insync, &dev->flags) && s->injournal)
+               return true;
+       /* case 2 above */
+       if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
+           s->injournal > 0)
+               return true;
+       return false;
 }
 
 static void
@@ -2942,7 +3001,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
 
-                       if (dev->towrite && !delay_towrite(dev, s)) {
+                       if (dev->towrite && !delay_towrite(conf, dev, s)) {
                                set_bit(R5_LOCKED, &dev->flags);
                                set_bit(R5_Wantdrain, &dev->flags);
                                if (!expand)
@@ -3694,7 +3753,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
        } else for (i = disks; i--; ) {
                /* would I have to read this buffer for read_modify_write */
                struct r5dev *dev = &sh->dev[i];
-               if (((dev->towrite && !delay_towrite(dev, s)) ||
+               if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
                     i == sh->pd_idx || i == sh->qd_idx ||
                     test_bit(R5_InJournal, &dev->flags)) &&
                    !test_bit(R5_LOCKED, &dev->flags) &&
@@ -3718,8 +3777,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
                }
        }
 
-       pr_debug("for sector %llu, rmw=%d rcw=%d\n",
-               (unsigned long long)sh->sector, rmw, rcw);
+       pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
+                (unsigned long long)sh->sector, sh->state, rmw, rcw);
        set_bit(STRIPE_HANDLE, &sh->state);
        if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
                /* prefer read-modify-write, but need to get some data */
@@ -3759,7 +3818,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
-                       if (((dev->towrite && !delay_towrite(dev, s)) ||
+                       if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
                             i == sh->pd_idx || i == sh->qd_idx ||
                             test_bit(R5_InJournal, &dev->flags)) &&
                            !test_bit(R5_LOCKED, &dev->flags) &&
@@ -4995,9 +5054,9 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
                return 0;
        }
        /*
-        * use bio_clone_mddev to make a copy of the bio
+        * use bio_clone_fast to make a copy of the bio
         */
-       align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
+       align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set);
        if (!align_bi)
                return 0;
        /*
@@ -5025,6 +5084,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
                      rdev->recovery_offset >= end_sector)))
                        rdev = NULL;
        }
+
+       if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) {
+               rcu_read_unlock();
+               bio_put(align_bi);
+               return 0;
+       }
+
        if (rdev) {
                sector_t first_bad;
                int bad_sectors;
@@ -5381,7 +5447,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
         * data on failed drives.
         */
        if (rw == READ && mddev->degraded == 0 &&
-           !r5c_is_writeback(conf->log) &&
            mddev->reshape_position == MaxSector) {
                bi = chunk_aligned_read(mddev, bi);
                if (!bi)
@@ -6126,6 +6191,8 @@ static void raid5d(struct md_thread *thread)
                mutex_unlock(&conf->cache_size_mutex);
        }
 
+       flush_deferred_bios(conf);
+
        r5l_flush_stripe_to_raid(conf->log);
 
        async_tx_issue_pending_all();
@@ -6711,6 +6778,18 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        atomic_set(&conf->active_stripes, 0);
        atomic_set(&conf->preread_active_stripes, 0);
        atomic_set(&conf->active_aligned_reads, 0);
+       bio_list_init(&conf->pending_bios);
+       spin_lock_init(&conf->pending_bios_lock);
+       conf->batch_bio_dispatch = true;
+       rdev_for_each(rdev, mddev) {
+               if (test_bit(Journal, &rdev->flags))
+                       continue;
+               if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
+                       conf->batch_bio_dispatch = false;
+                       break;
+               }
+       }
+
        conf->bypass_threshold = BYPASS_THRESHOLD;
        conf->recovery_disabled = mddev->recovery_disabled - 1;
 
@@ -6757,6 +6836,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
        atomic_set(&conf->r5c_cached_partial_stripes, 0);
        INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
+       atomic_set(&conf->r5c_flushing_full_stripes, 0);
+       atomic_set(&conf->r5c_flushing_partial_stripes, 0);
 
        conf->level = mddev->new_level;
        conf->chunk_sectors = mddev->new_chunk_sectors;
index 1440fa26e29629c4f9acc098f0fa9035f5ff1d1a..4bb27b97bf6bc48f6362461592edab19f1024140 100644 (file)
@@ -663,6 +663,8 @@ struct r5conf {
        struct list_head        r5c_full_stripe_list;
        atomic_t                r5c_cached_partial_stripes;
        struct list_head        r5c_partial_stripe_list;
+       atomic_t                r5c_flushing_full_stripes;
+       atomic_t                r5c_flushing_partial_stripes;
 
        atomic_t                empty_inactive_list_nr;
        struct llist_head       released_stripes;
@@ -684,6 +686,10 @@ struct r5conf {
        int                     group_cnt;
        int                     worker_cnt_per_group;
        struct r5l_log          *log;
+
+       struct bio_list         pending_bios;
+       spinlock_t              pending_bios_lock;
+       bool                    batch_bio_dispatch;
 };
 
 
@@ -788,4 +794,5 @@ extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
 extern void r5c_check_cached_full_stripe(struct r5conf *conf);
 extern struct md_sysfs_entry r5c_journal_mode;
 extern void r5c_update_on_rdev_error(struct mddev *mddev);
+extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
 #endif
index 7cf8a6c70a3f71c5eca6100dfda8d67f10b20e3b..8e521194f6fc4ad32138a51c962a365c74debaed 100644 (file)
@@ -183,7 +183,7 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
 
 #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len)
 
-static inline unsigned bio_segments(struct bio *bio)
+static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec)
 {
        unsigned segs = 0;
        struct bio_vec bv;
@@ -205,12 +205,17 @@ static inline unsigned bio_segments(struct bio *bio)
                break;
        }
 
-       bio_for_each_segment(bv, bio, iter)
+       __bio_for_each_segment(bv, bio, iter, *bvec)
                segs++;
 
        return segs;
 }
 
+static inline unsigned bio_segments(struct bio *bio)
+{
+       return __bio_segments(bio, &bio->bi_iter);
+}
+
 /*
  * get a reference to a bio, so it won't disappear. the intended use is
  * something like:
@@ -384,6 +389,8 @@ extern void bio_put(struct bio *);
 extern void __bio_clone_fast(struct bio *, struct bio *);
 extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
 extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
+extern struct bio *bio_clone_bioset_partial(struct bio *, gfp_t,
+                                           struct bio_set *, int, int);
 
 extern struct bio_set *fs_bio_set;
 
index 84812a9fb16fbbd1409315ea3752fb9a1e3e39ef..72fab4999c00662a187536ee66c6084eb69a8b11 100644 (file)
@@ -1102,6 +1102,7 @@ void radix_tree_replace_slot(struct radix_tree_root *root,
 {
        replace_slot(root, NULL, slot, item, true);
 }
+EXPORT_SYMBOL(radix_tree_replace_slot);
 
 /**
  * radix_tree_iter_replace - replace item in a slot