Merge tag 'dm-4.1-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 18 Apr 2015 12:14:18 +0000 (08:14 -0400)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 18 Apr 2015 12:14:18 +0000 (08:14 -0400)
Pull device mapper updates from Mike Snitzer:

 - the most extensive changes this cycle are the DM core improvements to
   add full blk-mq support to request-based DM.

    - disabled by default but users can opt in with CONFIG_DM_MQ_DEFAULT
      (see the sketch after this list)
    - depends on some blk-mq changes from Jens' for-4.1/core branch, which
      explains why this pull is built on linux-block.git

 - update DM to use name_to_dev_t() rather than open-coding a less
   capable device parser.

    - includes a couple small improvements to name_to_dev_t() that offer
      stricter constraints than DM's code provided.

 - improvements to the dm-cache "mq" cache replacement policy.

 - a DM crypt crypt_ctr() error path fix and an async crypto deadlock
   fix

 - a small efficiency improvement for DM crypt decryption by leveraging
   immutable biovecs

 - add error handling modes for corrupted blocks to DM verity

 - a new "log-writes" DM target from Josef Bacik that is meant for file
   system developers to test file system integrity at particular points
   in the life of a file system

 - a few DM log userspace cleanups and fixes

 - a few Documentation fixes (for thin, cache, crypt and switch)
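
A rough sketch of the blk-mq opt-in mentioned in the first item above (the
use_blk_mq module parameter name comes from this series; the exact
invocation is illustrative):

  # opt request-based DM into the blk-mq I/O path at module load time
  modprobe dm_mod use_blk_mq=Y

  # or equivalently, on the kernel command line:
  #   dm_mod.use_blk_mq=Y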

* tag 'dm-4.1-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (34 commits)
  dm crypt: fix missing error code return from crypt_ctr error path
  dm crypt: fix deadlock when async crypto algorithm returns -EBUSY
  dm crypt: leverage immutable biovecs when decrypting on read
  dm crypt: update URLs to new cryptsetup project page
  dm: add log writes target
  dm table: use bool function return values of true/false not 1/0
  dm verity: add error handling modes for corrupted blocks
  dm thin: remove stale 'trim' message documentation
  dm delay: use msecs_to_jiffies for time conversion
  dm log userspace base: fix compile warning
  dm log userspace transfer: match wait_for_completion_timeout return type
  dm table: fall back to getting device using name_to_dev_t()
  init: stricter checking of major:minor root= values
  init: export name_to_dev_t and mark name argument as const
  dm: add 'use_blk_mq' module param and expose in per-device ro sysfs attr
  dm: optimize dm_mq_queue_rq to _not_ use kthread if using pure blk-mq
  dm: add full blk-mq support to request-based DM
  dm: impose configurable deadline for dm_request_fn's merge heuristic
  dm sysfs: introduce ability to add writable attributes
  dm: don't start current request if it would've merged with the previous
  ...

24 files changed:
Documentation/ABI/testing/sysfs-block-dm
Documentation/device-mapper/dm-crypt.txt
Documentation/device-mapper/log-writes.txt [new file with mode: 0644]
Documentation/device-mapper/switch.txt
Documentation/device-mapper/thin-provisioning.txt
Documentation/device-mapper/verity.txt
drivers/md/Kconfig
drivers/md/Makefile
drivers/md/dm-cache-policy-mq.c
drivers/md/dm-crypt.c
drivers/md/dm-delay.c
drivers/md/dm-log-userspace-base.c
drivers/md/dm-log-userspace-transfer.c
drivers/md/dm-log-writes.c [new file with mode: 0644]
drivers/md/dm-mpath.c
drivers/md/dm-sysfs.c
drivers/md/dm-table.c
drivers/md/dm-verity.c
drivers/md/dm.c
drivers/md/dm.h
include/linux/device-mapper.h
include/linux/mount.h
include/uapi/linux/dm-ioctl.h
init/do_mounts.c

index 87ca5691e29b1d92726e01bbe5296cde6932d08d..f9f2339b9a0a88e485d91bae022b1e6a3d213f07 100644 (file)
@@ -23,3 +23,25 @@ Description: Device-mapper device suspend state.
                Contains the value 1 while the device is suspended.
                Otherwise it contains 0. Read-only attribute.
 Users:         util-linux, device-mapper udev rules
+
+What:          /sys/block/dm-<num>/dm/rq_based_seq_io_merge_deadline
+Date:          March 2015
+KernelVersion: 4.1
+Contact:       dm-devel@redhat.com
+Description:   Allow control over how long a request that is a
+               reasonable merge candidate can be queued on the request
+               queue.  The resolution of this deadline is in
+               microseconds (ranging from 1 to 100000 usecs).
+               Setting this attribute to 0 (the default) will disable
+               request-based DM's merge heuristic and associated extra
+               accounting.  This attribute is not applicable to
+               bio-based DM devices so it will only ever report 0 for
+               them.
+
+What:          /sys/block/dm-<num>/dm/use_blk_mq
+Date:          March 2015
+KernelVersion: 4.1
+Contact:       dm-devel@redhat.com
+Description:   Request-based Device-mapper blk-mq I/O path mode.
+               Contains the value 1 if the device is using blk-mq.
+               Otherwise it contains 0. Read-only attribute.
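
A quick sketch of exercising these attributes from userspace (dm-0 is a
placeholder device name; rq_based_seq_io_merge_deadline is writable, per the
"dm sysfs: introduce ability to add writable attributes" change, while
use_blk_mq is read-only):

  # read the current merge deadline (microseconds; 0 = heuristic disabled)
  cat /sys/block/dm-0/dm/rq_based_seq_io_merge_deadline

  # impose a 100 microsecond deadline on the merge heuristic
  echo 100 > /sys/block/dm-0/dm/rq_based_seq_io_merge_deadline

  # check whether this device is using the blk-mq I/O path
  cat /sys/block/dm-0/dm/use_blk_mq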
index ad697781f9ac478477cfed76978b047685eda2b6..692171fe9da0307732b96327e10848e1d12e2e26 100644 (file)
@@ -5,7 +5,7 @@ Device-Mapper's "crypt" target provides transparent encryption of block devices
 using the kernel crypto API.
 
 For a more detailed description of supported parameters see:
-http://code.google.com/p/cryptsetup/wiki/DMCrypt
+https://gitlab.com/cryptsetup/cryptsetup/wikis/DMCrypt
 
 Parameters: <cipher> <key> <iv_offset> <device path> \
              <offset> [<#opt_params> <opt_params>]
@@ -80,7 +80,7 @@ Example scripts
 ===============
 LUKS (Linux Unified Key Setup) is now the preferred way to set up disk
 encryption with dm-crypt using the 'cryptsetup' utility, see
-http://code.google.com/p/cryptsetup/
+https://gitlab.com/cryptsetup/cryptsetup
 
 [[
 #!/bin/sh
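
For reference, a minimal crypt mapping following the parameter layout shown
above might look like this (the device and the repeated-pattern 128-bit key
are placeholders, not a secure configuration):

  dmsetup create crypt1 --table \
    "0 $(blockdev --getsz /dev/sdb) crypt aes-cbc-essiv:sha256 babebabebabebabebabebabebabebabe 0 /dev/sdb 0"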
diff --git a/Documentation/device-mapper/log-writes.txt b/Documentation/device-mapper/log-writes.txt
new file mode 100644 (file)
index 0000000..c10f30c
--- /dev/null
@@ -0,0 +1,140 @@
+dm-log-writes
+=============
+
+This target takes 2 devices, one to pass all IO to normally, and one to log all
+of the write operations to.  This is intended for file system developers wishing
+to verify the integrity of metadata or data as the file system is written to.
+There is a log_write_entry written for every WRITE request and the target is
+able to take arbitrary data from userspace to insert into the log.  The data
+that is in the WRITE requests is copied into the log to make the replay happen
+exactly as it happened originally.
+
+Log Ordering
+============
+
+We log things in order of completion once we are sure the write is no longer in
+cache.  This means that normal WRITE requests are not actually logged until the
+next REQ_FLUSH request.  This lets userspace replay the log in a way that
+correlates to what is on disk rather than what is in cache, which makes it
+easier to detect improper waiting/flushing.
+
+This works by attaching all WRITE requests to a list once the write completes.
+Once we see a REQ_FLUSH request we splice this list onto the request and once
+the FLUSH request completes we log all of the WRITEs and then the FLUSH.  Only
+completed WRITEs, at the time the REQ_FLUSH is issued, are added in order to
+simulate the worst case scenario with regard to power failures.  Consider the
+following example (W means write, C means complete):
+
+W1,W2,W3,C3,C2,Wflush,C1,Cflush
+
+The log would show the following:
+
+W3,W2,flush,W1....
+
+Again, this is to simulate what is actually on disk; it allows us to detect
+cases where a power failure at a particular point in time would create an
+inconsistent file system.
+
+Any REQ_FUA requests bypass this flushing mechanism and are logged as soon as
+they complete as those requests will obviously bypass the device cache.
+
+Any REQ_DISCARD requests are treated like WRITE requests.  Otherwise we would
+have all the DISCARD requests, and then the WRITE requests and then the FLUSH
+request.  Consider the following example:
+
+WRITE block 1, DISCARD block 1, FLUSH
+
+If we logged DISCARD when it completed, the replay would look like this:
+
+DISCARD 1, WRITE 1, FLUSH
+
+which isn't quite what happened and wouldn't be caught during the log replay.
+
+Target interface
+================
+
+i) Constructor
+
+   log-writes <dev_path> <log_dev_path>
+
+   dev_path    : Device that all of the IO will go to normally.
+   log_dev_path : Device where the log entries are written to.
+
+ii) Status
+
+    <#logged entries> <highest allocated sector>
+
+    #logged entries           : Number of logged entries
+    highest allocated sector   : Highest allocated sector
+
+iii) Messages
+
+    mark <description>
+
+       You can use a dmsetup message to set an arbitrary mark in a log.
+       For example, say you want to fsck a file system after every
+       write; first you need to replay up to the mkfs mark to make sure
+       you're fsck'ing something reasonable.  You would do something like
+       this:
+
+         mkfs.btrfs -f /dev/mapper/log
+         dmsetup message log 0 mark mkfs
+         <run test>
+
+         This would allow you to replay the log up to the mkfs mark and
+         then replay from that point on, running the fsck check at
+         whatever interval you want.
+
+       Every log has a mark at the end labeled "dm-log-writes-end".
+
+Userspace component
+===================
+
+There is a userspace tool that will replay the log for you in various ways.
+It can be found here: https://github.com/josefbacik/log-writes
+
+Example usage
+=============
+
+Say you want to test fsync on your file system.  You would do something like
+this:
+
+TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc"
+dmsetup create log --table "$TABLE"
+mkfs.btrfs -f /dev/mapper/log
+dmsetup message log 0 mark mkfs
+
+mount /dev/mapper/log /mnt/btrfs-test
+<some test that does fsync at the end>
+dmsetup message log 0 mark fsync
+md5sum /mnt/btrfs-test/foo
+umount /mnt/btrfs-test
+
+dmsetup remove log
+replay-log --log /dev/sdc --replay /dev/sdb --end-mark fsync
+mount /dev/sdb /mnt/btrfs-test
+md5sum /mnt/btrfs-test/foo
+<verify md5sum's are correct>
+
+Another option is to do a complicated file system operation and verify the file
+system is consistent during the entire operation.  You could do this with:
+
+TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc"
+dmsetup create log --table "$TABLE"
+mkfs.btrfs -f /dev/mapper/log
+dmsetup message log 0 mark mkfs
+
+mount /dev/mapper/log /mnt/btrfs-test
+<fsstress to dirty the fs>
+btrfs filesystem balance /mnt/btrfs-test
+umount /mnt/btrfs-test
+dmsetup remove log
+
+replay-log --log /dev/sdc --replay /dev/sdb --end-mark mkfs
+btrfsck /dev/sdb
+replay-log --log /dev/sdc --replay /dev/sdb --start-mark mkfs \
+       --fsck "btrfsck /dev/sdb" --check fua
+
+And that will replay the log until it sees a FUA request, run the fsck command,
+and if the fsck passes, replay to the next FUA, continuing until the replay is
+complete or the fsck command exits abnormally.
index 8897d04948384289b3fca54801be9676c15ce0e5..424835e57f2713df8e349883c02bce0771aa7012 100644 (file)
@@ -47,8 +47,8 @@ consume far too much memory.
 Using this device-mapper switch target we can now build a two-layer
 device hierarchy:
 
-    Upper Tier  Determine which array member the I/O should be sent to.
-    Lower Tier  Load balance amongst paths to a particular member.
+    Upper Tier - Determine which array member the I/O should be sent to.
+    Lower Tier - Load balance amongst paths to a particular member.
 
 The lower tier consists of a single dm multipath device for each member.
 Each of these multipath devices contains the set of paths directly to
index 2f5173500bd953b32e55134012af50ac93e46ad8..4f67578b295483bcc14d48f069c6ded3581e3f94 100644 (file)
@@ -380,9 +380,6 @@ then you'll have no access to blocks mapped beyond the end.  If you
 load a target that is bigger than before, then extra blocks will be
 provisioned as and when needed.
 
-If you wish to reduce the size of your thin device and potentially
-regain some space then send the 'trim' message to the pool.
-
 ii) Status
 
      <nr mapped sectors> <highest mapped sector>
index 9884681535ee36bf03a7d7baaa54abb36360d53d..e15bc1a0fb98ab23563681210cc6ed1865234816 100644 (file)
@@ -11,6 +11,7 @@ Construction Parameters
     <data_block_size> <hash_block_size>
     <num_data_blocks> <hash_start_block>
     <algorithm> <digest> <salt>
+    [<#opt_params> <opt_params>]
 
 <version>
     This is the type of the on-disk hash format.
@@ -62,6 +63,22 @@ Construction Parameters
 <salt>
     The hexadecimal encoding of the salt value.
 
+<#opt_params>
+    Number of optional parameters. If there are no optional parameters,
+    the optional parameters section can be skipped or #opt_params can be zero.
+    Otherwise #opt_params is the number of following arguments.
+
+    Example of optional parameters section:
+        1 ignore_corruption
+
+ignore_corruption
+    Log corrupted blocks, but allow read operations to proceed normally.
+
+restart_on_corruption
+    Restart the system when a corrupted block is discovered. This option is
+    not compatible with ignore_corruption and requires user space support to
+    avoid restart loops.
+
 Theory of operation
 ===================
 
@@ -125,7 +142,7 @@ block boundary) are the hash blocks which are stored a depth at a time
 
 The full specification of kernel parameters and on-disk metadata format
 is available at the cryptsetup project's wiki page
-  http://code.google.com/p/cryptsetup/wiki/DMVerity
+  https://gitlab.com/cryptsetup/cryptsetup/wikis/DMVerity
 
 Status
 ======
@@ -142,7 +159,7 @@ Set up a device:
 
 A command line tool veritysetup is available to compute or verify
 the hash tree or activate the kernel device. This is available from
-the cryptsetup upstream repository http://code.google.com/p/cryptsetup/
+the cryptsetup upstream repository https://gitlab.com/cryptsetup/cryptsetup/
 (as a libcryptsetup extension).
 
 Create hash on the device:
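
For illustration, a verity table using the new optional-parameters section
might look like this (sizes and devices follow the construction parameters
above; <digest> and <salt> are placeholders):

  dmsetup create vroot --readonly --table \
    "0 2097152 verity 1 /dev/sda1 /dev/sda2 4096 4096 262144 1 sha256 <digest> <salt> 1 ignore_corruption"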
index 63e05e32b46269e29f8e75e03073d5587fd6d916..6ddc983417d5a8120700fbfa98508d1db3e35546 100644 (file)
@@ -196,6 +196,17 @@ config BLK_DEV_DM
 
          If unsure, say N.
 
+config DM_MQ_DEFAULT
+       bool "request-based DM: use blk-mq I/O path by default"
+       depends on BLK_DEV_DM
+       ---help---
+         This option enables the blk-mq based I/O path for request-based
+         DM devices by default.  With the option the dm_mod.use_blk_mq
+         module/boot option defaults to Y, without it to N, but it can
+         still be overridden either way.
+
+         If unsure, say N.
+
 config DM_DEBUG
        bool "Device mapper debugging support"
        depends on BLK_DEV_DM
@@ -432,4 +443,20 @@ config DM_SWITCH
 
          If unsure, say N.
 
+config DM_LOG_WRITES
+       tristate "Log writes target support"
+       depends on BLK_DEV_DM
+       ---help---
+         This device-mapper target takes two devices, one device to use
+         normally, one to log all write operations done to the first device.
+         This is for use by file system developers wishing to verify that
+         their fs is writing a consistent file system at all times by allowing
+         them to replay the log in a variety of ways and to check the
+         contents.
+
+         To compile this code as a module, choose M here: the module will
+         be called dm-log-writes.
+
+         If unsure, say N.
+
 endif # MD
index a2da532b1c2bda38d18a6324c2dfdcfcc7108df5..1863feaa584612eb862ff20d7806c8c12608b17c 100644 (file)
@@ -55,6 +55,7 @@ obj-$(CONFIG_DM_CACHE)                += dm-cache.o
 obj-$(CONFIG_DM_CACHE_MQ)      += dm-cache-mq.o
 obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o
 obj-$(CONFIG_DM_ERA)           += dm-era.o
+obj-$(CONFIG_DM_LOG_WRITES)    += dm-log-writes.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs                    += dm-uevent.o
index 13f547a4eeb61f2715845090b8e2e5ce311c73d1..3ddd1162334df3bcefe551ca86a5287b9f3ea4ca 100644 (file)
@@ -8,6 +8,7 @@
 #include "dm.h"
 
 #include <linux/hash.h>
+#include <linux/jiffies.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
@@ -124,32 +125,41 @@ static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
  * sorted queue.
  */
 #define NR_QUEUE_LEVELS 16u
+#define NR_SENTINELS NR_QUEUE_LEVELS * 3
+
+#define WRITEBACK_PERIOD HZ
 
 struct queue {
+       unsigned nr_elts;
+       bool current_writeback_sentinels;
+       unsigned long next_writeback;
        struct list_head qs[NR_QUEUE_LEVELS];
+       struct list_head sentinels[NR_SENTINELS];
 };
 
 static void queue_init(struct queue *q)
 {
        unsigned i;
 
-       for (i = 0; i < NR_QUEUE_LEVELS; i++)
+       q->nr_elts = 0;
+       q->current_writeback_sentinels = false;
+       q->next_writeback = 0;
+       for (i = 0; i < NR_QUEUE_LEVELS; i++) {
                INIT_LIST_HEAD(q->qs + i);
+               INIT_LIST_HEAD(q->sentinels + i);
+               INIT_LIST_HEAD(q->sentinels + NR_QUEUE_LEVELS + i);
+               INIT_LIST_HEAD(q->sentinels + (2 * NR_QUEUE_LEVELS) + i);
+       }
 }
 
-/*
- * Checks to see if the queue is empty.
- * FIXME: reduce cpu usage.
- */
-static bool queue_empty(struct queue *q)
+static unsigned queue_size(struct queue *q)
 {
-       unsigned i;
-
-       for (i = 0; i < NR_QUEUE_LEVELS; i++)
-               if (!list_empty(q->qs + i))
-                       return false;
+       return q->nr_elts;
+}
 
-       return true;
+static bool queue_empty(struct queue *q)
+{
+       return q->nr_elts == 0;
 }
 
 /*
@@ -157,24 +167,19 @@ static bool queue_empty(struct queue *q)
  */
 static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
 {
+       q->nr_elts++;
        list_add_tail(elt, q->qs + level);
 }
 
-static void queue_remove(struct list_head *elt)
+static void queue_remove(struct queue *q, struct list_head *elt)
 {
+       q->nr_elts--;
        list_del(elt);
 }
 
-/*
- * Shifts all regions down one level.  This has no effect on the order of
- * the queue.
- */
-static void queue_shift_down(struct queue *q)
+static bool is_sentinel(struct queue *q, struct list_head *h)
 {
-       unsigned level;
-
-       for (level = 1; level < NR_QUEUE_LEVELS; level++)
-               list_splice_init(q->qs + level, q->qs + level - 1);
+       return (h >= q->sentinels) && (h < (q->sentinels + NR_SENTINELS));
 }
 
 /*
@@ -184,10 +189,12 @@ static void queue_shift_down(struct queue *q)
 static struct list_head *queue_peek(struct queue *q)
 {
        unsigned level;
+       struct list_head *h;
 
        for (level = 0; level < NR_QUEUE_LEVELS; level++)
-               if (!list_empty(q->qs + level))
-                       return q->qs[level].next;
+               list_for_each(h, q->qs + level)
+                       if (!is_sentinel(q, h))
+                               return h;
 
        return NULL;
 }
@@ -197,16 +204,34 @@ static struct list_head *queue_pop(struct queue *q)
        struct list_head *r = queue_peek(q);
 
        if (r) {
+               q->nr_elts--;
                list_del(r);
-
-               /* have we just emptied the bottom level? */
-               if (list_empty(q->qs))
-                       queue_shift_down(q);
        }
 
        return r;
 }
 
+/*
+ * Pops an entry from a level that is not past a sentinel.
+ */
+static struct list_head *queue_pop_old(struct queue *q)
+{
+       unsigned level;
+       struct list_head *h;
+
+       for (level = 0; level < NR_QUEUE_LEVELS; level++)
+               list_for_each(h, q->qs + level) {
+                       if (is_sentinel(q, h))
+                               break;
+
+                       q->nr_elts--;
+                       list_del(h);
+                       return h;
+               }
+
+       return NULL;
+}
+
 static struct list_head *list_pop(struct list_head *lh)
 {
        struct list_head *r = lh->next;
@@ -217,6 +242,62 @@ static struct list_head *list_pop(struct list_head *lh)
        return r;
 }
 
+static struct list_head *writeback_sentinel(struct queue *q, unsigned level)
+{
+       if (q->current_writeback_sentinels)
+               return q->sentinels + NR_QUEUE_LEVELS + level;
+       else
+               return q->sentinels + 2 * NR_QUEUE_LEVELS + level;
+}
+
+static void queue_update_writeback_sentinels(struct queue *q)
+{
+       unsigned i;
+       struct list_head *h;
+
+       if (time_after(jiffies, q->next_writeback)) {
+               for (i = 0; i < NR_QUEUE_LEVELS; i++) {
+                       h = writeback_sentinel(q, i);
+                       list_del(h);
+                       list_add_tail(h, q->qs + i);
+               }
+
+               q->next_writeback = jiffies + WRITEBACK_PERIOD;
+               q->current_writeback_sentinels = !q->current_writeback_sentinels;
+       }
+}
+
+/*
+ * Sometimes we want to iterate through entries that have been pushed since
+ * a certain event.  We use sentinel entries on the queues to delimit these
+ * 'tick' events.
+ */
+static void queue_tick(struct queue *q)
+{
+       unsigned i;
+
+       for (i = 0; i < NR_QUEUE_LEVELS; i++) {
+               list_del(q->sentinels + i);
+               list_add_tail(q->sentinels + i, q->qs + i);
+       }
+}
+
+typedef void (*iter_fn)(struct list_head *, void *);
+static void queue_iterate_tick(struct queue *q, iter_fn fn, void *context)
+{
+       unsigned i;
+       struct list_head *h;
+
+       for (i = 0; i < NR_QUEUE_LEVELS; i++) {
+               list_for_each_prev(h, q->qs + i) {
+                       if (is_sentinel(q, h))
+                               break;
+
+                       fn(h, context);
+               }
+       }
+}
+
 /*----------------------------------------------------------------*/
 
 /*
@@ -232,8 +313,6 @@ struct entry {
         */
        bool dirty:1;
        unsigned hit_count;
-       unsigned generation;
-       unsigned tick;
 };
 
 /*
@@ -481,7 +560,6 @@ static bool in_cache(struct mq_policy *mq, struct entry *e)
  */
 static void push(struct mq_policy *mq, struct entry *e)
 {
-       e->tick = mq->tick;
        hash_insert(mq, e);
 
        if (in_cache(mq, e))
@@ -496,7 +574,11 @@ static void push(struct mq_policy *mq, struct entry *e)
  */
 static void del(struct mq_policy *mq, struct entry *e)
 {
-       queue_remove(&e->list);
+       if (in_cache(mq, e))
+               queue_remove(e->dirty ? &mq->cache_dirty : &mq->cache_clean, &e->list);
+       else
+               queue_remove(&mq->pre_cache, &e->list);
+
        hash_remove(e);
 }
 
@@ -518,18 +600,24 @@ static struct entry *pop(struct mq_policy *mq, struct queue *q)
        return e;
 }
 
-static struct entry *peek(struct queue *q)
+static struct entry *pop_old(struct mq_policy *mq, struct queue *q)
 {
-       struct list_head *h = queue_peek(q);
-       return h ? container_of(h, struct entry, list) : NULL;
+       struct entry *e;
+       struct list_head *h = queue_pop_old(q);
+
+       if (!h)
+               return NULL;
+
+       e = container_of(h, struct entry, list);
+       hash_remove(e);
+
+       return e;
 }
 
-/*
- * Has this entry already been updated?
- */
-static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
+static struct entry *peek(struct queue *q)
 {
-       return mq->tick == e->tick;
+       struct list_head *h = queue_peek(q);
+       return h ? container_of(h, struct entry, list) : NULL;
 }
 
 /*
@@ -583,20 +671,9 @@ static void check_generation(struct mq_policy *mq)
  * Whenever we use an entry we bump up its hit counter, and push it to the
  * back of its current level.
  */
-static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
+static void requeue(struct mq_policy *mq, struct entry *e)
 {
-       if (updated_this_tick(mq, e))
-               return;
-
-       e->hit_count++;
-       mq->hit_count++;
        check_generation(mq);
-
-       /* generation adjustment, to stop the counts increasing forever. */
-       /* FIXME: divide? */
-       /* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */
-       e->generation = mq->generation;
-
        del(mq, e);
        push(mq, e);
 }
@@ -703,7 +780,7 @@ static int cache_entry_found(struct mq_policy *mq,
                             struct entry *e,
                             struct policy_result *result)
 {
-       requeue_and_update_tick(mq, e);
+       requeue(mq, e);
 
        if (in_cache(mq, e)) {
                result->op = POLICY_HIT;
@@ -740,8 +817,6 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
        new_e->oblock = e->oblock;
        new_e->dirty = false;
        new_e->hit_count = e->hit_count;
-       new_e->generation = e->generation;
-       new_e->tick = e->tick;
 
        del(mq, e);
        free_entry(&mq->pre_cache_pool, e);
@@ -757,18 +832,16 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
                                 int data_dir, struct policy_result *result)
 {
        int r = 0;
-       bool updated = updated_this_tick(mq, e);
 
-       if ((!discarded_oblock && updated) ||
-           !should_promote(mq, e, discarded_oblock, data_dir)) {
-               requeue_and_update_tick(mq, e);
+       if (!should_promote(mq, e, discarded_oblock, data_dir)) {
+               requeue(mq, e);
                result->op = POLICY_MISS;
 
        } else if (!can_migrate)
                r = -EWOULDBLOCK;
 
        else {
-               requeue_and_update_tick(mq, e);
+               requeue(mq, e);
                r = pre_cache_to_cache(mq, e, result);
        }
 
@@ -795,7 +868,6 @@ static void insert_in_pre_cache(struct mq_policy *mq,
        e->dirty = false;
        e->oblock = oblock;
        e->hit_count = 1;
-       e->generation = mq->generation;
        push(mq, e);
 }
 
@@ -828,7 +900,6 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
        e->oblock = oblock;
        e->dirty = false;
        e->hit_count = 1;
-       e->generation = mq->generation;
        push(mq, e);
 
        result->cblock = infer_cblock(&mq->cache_pool, e);
@@ -905,12 +976,37 @@ static void mq_destroy(struct dm_cache_policy *p)
        kfree(mq);
 }
 
+static void update_pre_cache_hits(struct list_head *h, void *context)
+{
+       struct entry *e = container_of(h, struct entry, list);
+       e->hit_count++;
+}
+
+static void update_cache_hits(struct list_head *h, void *context)
+{
+       struct mq_policy *mq = context;
+       struct entry *e = container_of(h, struct entry, list);
+       e->hit_count++;
+       mq->hit_count++;
+}
+
 static void copy_tick(struct mq_policy *mq)
 {
-       unsigned long flags;
+       unsigned long flags, tick;
 
        spin_lock_irqsave(&mq->tick_lock, flags);
-       mq->tick = mq->tick_protected;
+       tick = mq->tick_protected;
+       if (tick != mq->tick) {
+               queue_iterate_tick(&mq->pre_cache, update_pre_cache_hits, mq);
+               queue_iterate_tick(&mq->cache_dirty, update_cache_hits, mq);
+               queue_iterate_tick(&mq->cache_clean, update_cache_hits, mq);
+               mq->tick = tick;
+       }
+
+       queue_tick(&mq->pre_cache);
+       queue_tick(&mq->cache_dirty);
+       queue_tick(&mq->cache_clean);
+       queue_update_writeback_sentinels(&mq->cache_dirty);
        spin_unlock_irqrestore(&mq->tick_lock, flags);
 }
 
@@ -1001,7 +1097,6 @@ static int mq_load_mapping(struct dm_cache_policy *p,
        e->oblock = oblock;
        e->dirty = false;       /* this gets corrected in a minute */
        e->hit_count = hint_valid ? hint : 1;
-       e->generation = mq->generation;
        push(mq, e);
 
        return 0;
@@ -1012,10 +1107,15 @@ static int mq_save_hints(struct mq_policy *mq, struct queue *q,
 {
        int r;
        unsigned level;
+       struct list_head *h;
        struct entry *e;
 
        for (level = 0; level < NR_QUEUE_LEVELS; level++)
-               list_for_each_entry(e, q->qs + level, list) {
+               list_for_each(h, q->qs + level) {
+                       if (is_sentinel(q, h))
+                               continue;
+
+                       e = container_of(h, struct entry, list);
                        r = fn(context, infer_cblock(&mq->cache_pool, e),
                               e->oblock, e->hit_count);
                        if (r)
@@ -1087,10 +1187,27 @@ static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
        return r;
 }
 
+#define CLEAN_TARGET_PERCENTAGE 25
+
+static bool clean_target_met(struct mq_policy *mq)
+{
+       /*
+        * Cache entries may not be populated.  So we cannot rely on the
+        * size of the clean queue.
+        */
+       unsigned nr_clean = from_cblock(mq->cache_size) - queue_size(&mq->cache_dirty);
+       unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_PERCENTAGE / 100;
+
+       return nr_clean >= target;
+}
+
 static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
                              dm_cblock_t *cblock)
 {
-       struct entry *e = pop(mq, &mq->cache_dirty);
+       struct entry *e = pop_old(mq, &mq->cache_dirty);
+
+       if (!e && !clean_target_met(mq))
+               e = pop(mq, &mq->cache_dirty);
 
        if (!e)
                return -ENODATA;
index 713a96237a80c34951302dfa4a5ea6db9f44c39b..9eeea196328acc63c3220c309399abf014dfbb4b 100644 (file)
@@ -228,7 +228,7 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
  *
  * tcw:  Compatible implementation of the block chaining mode used
  *       by the TrueCrypt device encryption system (prior to version 4.1).
- *       For more info see: http://www.truecrypt.org
+ *       For more info see: https://gitlab.com/cryptsetup/cryptsetup/wikis/TrueCryptOnDiskFormat
  *       It operates on full 512 byte sectors and uses CBC
  *       with an IV derived from initial key and the sector number.
  *       In addition, whitening value is applied on every sector, whitening
@@ -925,11 +925,10 @@ static int crypt_convert(struct crypt_config *cc,
 
                switch (r) {
                /* async */
+               case -EINPROGRESS:
                case -EBUSY:
                        wait_for_completion(&ctx->restart);
                        reinit_completion(&ctx->restart);
-                       /* fall through*/
-               case -EINPROGRESS:
                        ctx->req = NULL;
                        ctx->cc_sector++;
                        continue;
@@ -1124,15 +1123,15 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
 static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
 {
        struct crypt_config *cc = io->cc;
-       struct bio *base_bio = io->base_bio;
        struct bio *clone;
 
        /*
-        * The block layer might modify the bvec array, so always
-        * copy the required bvecs because we need the original
-        * one in order to decrypt the whole bio data *afterwards*.
+        * We need the original biovec array in order to decrypt
+        * the whole bio data *afterwards* -- thanks to immutable
+        * biovecs we don't need to worry about the block layer
+        * modifying the biovec array; so leverage bio_clone_fast().
         */
-       clone = bio_clone_bioset(base_bio, gfp, cc->bs);
+       clone = bio_clone_fast(io->base_bio, gfp, cc->bs);
        if (!clone)
                return 1;
 
@@ -1346,10 +1345,8 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
        struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
        struct crypt_config *cc = io->cc;
 
-       if (error == -EINPROGRESS) {
-               complete(&ctx->restart);
+       if (error == -EINPROGRESS)
                return;
-       }
 
        if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
                error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
@@ -1360,12 +1357,15 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
        crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
 
        if (!atomic_dec_and_test(&ctx->cc_pending))
-               return;
+               goto done;
 
        if (bio_data_dir(io->base_bio) == READ)
                kcryptd_crypt_read_done(io);
        else
                kcryptd_crypt_write_io_submit(io, 1);
+done:
+       if (!completion_done(&ctx->restart))
+               complete(&ctx->restart);
 }
 
 static void kcryptd_crypt(struct work_struct *work)
@@ -1816,6 +1816,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
                if (ret)
                        goto bad;
 
+               ret = -EINVAL;
                while (opt_params--) {
                        opt_string = dm_shift_arg(&as);
                        if (!opt_string) {
index 42c3a27a14cc3a906b5f892a6206de348b6b58ee..57b6a1901c917127eccb8247cd1015a9b588159e 100644 (file)
@@ -236,7 +236,7 @@ static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
        delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
 
        delayed->context = dc;
-       delayed->expires = expires = jiffies + (delay * HZ / 1000);
+       delayed->expires = expires = jiffies + msecs_to_jiffies(delay);
 
        mutex_lock(&delayed_bios_lock);
 
index 03177ca0b0090435256510e58c80b7dbdce3b417..058256d2eeea8b6094f50190b9a0116137b238b4 100644 (file)
@@ -17,7 +17,9 @@
 
 #define DM_LOG_USERSPACE_VSN "1.3.0"
 
-struct flush_entry {
+#define FLUSH_ENTRY_POOL_SIZE 16
+
+struct dm_dirty_log_flush_entry {
        int type;
        region_t region;
        struct list_head list;
@@ -34,22 +36,14 @@ struct flush_entry {
 struct log_c {
        struct dm_target *ti;
        struct dm_dev *log_dev;
-       uint32_t region_size;
-       region_t region_count;
-       uint64_t luid;
-       char uuid[DM_UUID_LEN];
 
        char *usr_argv_str;
        uint32_t usr_argc;
 
-       /*
-        * in_sync_hint gets set when doing is_remote_recovering.  It
-        * represents the first region that needs recovery.  IOW, the
-        * first zero bit of sync_bits.  This can be useful for to limit
-        * traffic for calls like is_remote_recovering and get_resync_work,
-        * but be take care in its use for anything else.
-        */
-       uint64_t in_sync_hint;
+       uint32_t region_size;
+       region_t region_count;
+       uint64_t luid;
+       char uuid[DM_UUID_LEN];
 
        /*
         * Mark and clear requests are held until a flush is issued
@@ -61,6 +55,15 @@ struct log_c {
        struct list_head mark_list;
        struct list_head clear_list;
 
+       /*
+        * in_sync_hint gets set when doing is_remote_recovering.  It
+        * represents the first region that needs recovery.  IOW, the
+        * first zero bit of sync_bits.  This can be useful for to limit
+        * traffic for calls like is_remote_recovering and get_resync_work,
+        * but be take care in its use for anything else.
+        */
+       uint64_t in_sync_hint;
+
        /*
         * Workqueue for flush of clear region requests.
         */
@@ -72,19 +75,11 @@ struct log_c {
         * Combine userspace flush and mark requests for efficiency.
         */
        uint32_t integrated_flush;
-};
-
-static mempool_t *flush_entry_pool;
 
-static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data)
-{
-       return kmalloc(sizeof(struct flush_entry), gfp_mask);
-}
+       mempool_t *flush_entry_pool;
+};
 
-static void flush_entry_free(void *element, void *pool_data)
-{
-       kfree(element);
-}
+static struct kmem_cache *_flush_entry_cache;
 
 static int userspace_do_request(struct log_c *lc, const char *uuid,
                                int request_type, char *data, size_t data_size,
@@ -254,6 +249,14 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
                goto out;
        }
 
+       lc->flush_entry_pool = mempool_create_slab_pool(FLUSH_ENTRY_POOL_SIZE,
+                                                       _flush_entry_cache);
+       if (!lc->flush_entry_pool) {
+               DMERR("Failed to create flush_entry_pool");
+               r = -ENOMEM;
+               goto out;
+       }
+
        /*
         * Send table string and get back any opened device.
         */
@@ -310,6 +313,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 out:
        kfree(devices_rdata);
        if (r) {
+               if (lc->flush_entry_pool)
+                       mempool_destroy(lc->flush_entry_pool);
                kfree(lc);
                kfree(ctr_str);
        } else {
@@ -338,6 +343,8 @@ static void userspace_dtr(struct dm_dirty_log *log)
        if (lc->log_dev)
                dm_put_device(lc->ti, lc->log_dev);
 
+       mempool_destroy(lc->flush_entry_pool);
+
        kfree(lc->usr_argv_str);
        kfree(lc);
 
@@ -461,7 +468,7 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
 static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
 {
        int r = 0;
-       struct flush_entry *fe;
+       struct dm_dirty_log_flush_entry *fe;
 
        list_for_each_entry(fe, flush_list, list) {
                r = userspace_do_request(lc, lc->uuid, fe->type,
@@ -481,7 +488,7 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list,
        int r = 0;
        int count;
        uint32_t type = 0;
-       struct flush_entry *fe, *tmp_fe;
+       struct dm_dirty_log_flush_entry *fe, *tmp_fe;
        LIST_HEAD(tmp_list);
        uint64_t group[MAX_FLUSH_GROUP_COUNT];
 
@@ -563,7 +570,8 @@ static int userspace_flush(struct dm_dirty_log *log)
        LIST_HEAD(clear_list);
        int mark_list_is_empty;
        int clear_list_is_empty;
-       struct flush_entry *fe, *tmp_fe;
+       struct dm_dirty_log_flush_entry *fe, *tmp_fe;
+       mempool_t *flush_entry_pool = lc->flush_entry_pool;
 
        spin_lock_irqsave(&lc->flush_lock, flags);
        list_splice_init(&lc->mark_list, &mark_list);
@@ -643,10 +651,10 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
 {
        unsigned long flags;
        struct log_c *lc = log->context;
-       struct flush_entry *fe;
+       struct dm_dirty_log_flush_entry *fe;
 
        /* Wait for an allocation, but _never_ fail */
-       fe = mempool_alloc(flush_entry_pool, GFP_NOIO);
+       fe = mempool_alloc(lc->flush_entry_pool, GFP_NOIO);
        BUG_ON(!fe);
 
        spin_lock_irqsave(&lc->flush_lock, flags);
@@ -672,7 +680,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
 {
        unsigned long flags;
        struct log_c *lc = log->context;
-       struct flush_entry *fe;
+       struct dm_dirty_log_flush_entry *fe;
 
        /*
         * If we fail to allocate, we skip the clearing of
@@ -680,7 +688,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
         * to cause the region to be resync'ed when the
         * device is activated next time.
         */
-       fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC);
+       fe = mempool_alloc(lc->flush_entry_pool, GFP_ATOMIC);
        if (!fe) {
                DMERR("Failed to allocate memory to clear region.");
                return;
@@ -733,7 +741,6 @@ static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
 static void userspace_set_region_sync(struct dm_dirty_log *log,
                                      region_t region, int in_sync)
 {
-       int r;
        struct log_c *lc = log->context;
        struct {
                region_t r;
@@ -743,12 +750,12 @@ static void userspace_set_region_sync(struct dm_dirty_log *log,
        pkg.r = region;
        pkg.i = (int64_t)in_sync;
 
-       r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
-                                (char *)&pkg, sizeof(pkg), NULL, NULL);
+       (void) userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
+                                   (char *)&pkg, sizeof(pkg), NULL, NULL);
 
        /*
         * It would be nice to be able to report failures.
-        * However, it is easy emough to detect and resolve.
+        * However, it is easy enough to detect and resolve.
         */
        return;
 }
@@ -886,18 +893,16 @@ static int __init userspace_dirty_log_init(void)
 {
        int r = 0;
 
-       flush_entry_pool = mempool_create(100, flush_entry_alloc,
-                                         flush_entry_free, NULL);
-
-       if (!flush_entry_pool) {
-               DMWARN("Unable to create flush_entry_pool:  No memory.");
+       _flush_entry_cache = KMEM_CACHE(dm_dirty_log_flush_entry, 0);
+       if (!_flush_entry_cache) {
+               DMWARN("Unable to create flush_entry_cache: No memory.");
                return -ENOMEM;
        }
 
        r = dm_ulog_tfr_init();
        if (r) {
                DMWARN("Unable to initialize userspace log communications");
-               mempool_destroy(flush_entry_pool);
+               kmem_cache_destroy(_flush_entry_cache);
                return r;
        }
 
@@ -905,7 +910,7 @@ static int __init userspace_dirty_log_init(void)
        if (r) {
                DMWARN("Couldn't register userspace dirty log type");
                dm_ulog_tfr_exit();
-               mempool_destroy(flush_entry_pool);
+               kmem_cache_destroy(_flush_entry_cache);
                return r;
        }
 
@@ -917,7 +922,7 @@ static void __exit userspace_dirty_log_exit(void)
 {
        dm_dirty_log_type_unregister(&_userspace_type);
        dm_ulog_tfr_exit();
-       mempool_destroy(flush_entry_pool);
+       kmem_cache_destroy(_flush_entry_cache);
 
        DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded");
        return;
index 39ad9664d39731022796430b0a3a30b7a8464898..fdf8ec304f8d26ae0cd45600e985ab3d845b47df 100644 (file)
@@ -172,6 +172,7 @@ int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type,
                         char *rdata, size_t *rdata_size)
 {
        int r = 0;
+       unsigned long tmo;
        size_t dummy = 0;
        int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg);
        struct dm_ulog_request *tfr = prealloced_ulog_tfr;
@@ -236,11 +237,11 @@ resend:
                goto out;
        }
 
-       r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT);
+       tmo = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT);
        spin_lock(&receiving_list_lock);
        list_del_init(&(pkg.list));
        spin_unlock(&receiving_list_lock);
-       if (!r) {
+       if (!tmo) {
                DMWARN("[%s] Request timed out: [%u/%u] - retrying",
                       (strlen(uuid) > 8) ?
                       (uuid + (strlen(uuid) - 8)) : (uuid),
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
new file mode 100644 (file)
index 0000000..93e0844
--- /dev/null
@@ -0,0 +1,825 @@
+/*
+ * Copyright (C) 2014 Facebook. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/device-mapper.h>
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+#define DM_MSG_PREFIX "log-writes"
+
+/*
+ * This target will sequentially log all writes to the target device onto the
+ * log device.  This is helpful for replaying writes to check for fs consistency
+ * at all times.  This target provides a mechanism to mark specific events to
+ * check data at a later time.  So for example you would:
+ *
+ * write data
+ * fsync
+ * dmsetup message /dev/whatever mark mymark
+ * unmount /mnt/test
+ *
+ * Then replay the log up to mymark and check the contents of the replay to
+ * verify it matches what was written.
+ *
+ * We log writes only after they have been flushed; this makes the log describe
+ * close to the order in which the data hits the actual disk, not its cache.  So
+ * for example the following sequence (W means write, C means complete)
+ *
+ * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
+ *
+ * Would result in the log looking like this:
+ *
+ * c,a,flush,fuad,b,<other writes>,<next flush>
+ *
+ * This is meant to help expose problems where file systems do not properly wait
+ * on data being written before invoking a FLUSH.  FUA bypasses cache so once it
+ * completes it is added to the log as it should be on disk.
+ *
+ * We treat DISCARDs as if they don't bypass cache so that they are logged in
+ * order of completion along with the normal writes.  If we didn't do it this
+ * way we would process all the discards first and then write all the data, when
+ * in fact we want to do the data and the discard in the order that they
+ * completed.
+ */
+#define LOG_FLUSH_FLAG (1 << 0)
+#define LOG_FUA_FLAG (1 << 1)
+#define LOG_DISCARD_FLAG (1 << 2)
+#define LOG_MARK_FLAG (1 << 3)
+
+#define WRITE_LOG_VERSION 1
+#define WRITE_LOG_MAGIC 0x6a736677736872
+
+/*
+ * The disk format for this is braindead simple.
+ *
+ * At byte 0 we have our super, followed by the following sequence for
+ * nr_entries:
+ *
+ * [   1 sector    ][  entry->nr_sectors ]
+ * [log_write_entry][    data written    ]
+ *
+ * The log_write_entry takes up a full sector so we can have arbitrary length
+ * marks and it leaves us room for extra content in the future.
+ */
+
+/*
+ * Basic info about the log for userspace.
+ */
+struct log_write_super {
+       __le64 magic;
+       __le64 version;
+       __le64 nr_entries;
+       __le32 sectorsize;
+};
+
+/*
+ * sector - the sector we wrote.
+ * nr_sectors - the number of sectors we wrote.
+ * flags - flags for this log entry.
+ * data_len - the size of the data in this log entry, this is for private log
+ * entry stuff, the MARK data provided by userspace for example.
+ */
+struct log_write_entry {
+       __le64 sector;
+       __le64 nr_sectors;
+       __le64 flags;
+       __le64 data_len;
+};
+
+struct log_writes_c {
+       struct dm_dev *dev;
+       struct dm_dev *logdev;
+       u64 logged_entries;
+       u32 sectorsize;
+       atomic_t io_blocks;
+       atomic_t pending_blocks;
+       sector_t next_sector;
+       sector_t end_sector;
+       bool logging_enabled;
+       bool device_supports_discard;
+       spinlock_t blocks_lock;
+       struct list_head unflushed_blocks;
+       struct list_head logging_blocks;
+       wait_queue_head_t wait;
+       struct task_struct *log_kthread;
+};
+
+struct pending_block {
+       int vec_cnt;
+       u64 flags;
+       sector_t sector;
+       sector_t nr_sectors;
+       char *data;
+       u32 datalen;
+       struct list_head list;
+       struct bio_vec vecs[0];
+};
+
+struct per_bio_data {
+       struct pending_block *block;
+};
+
+static void put_pending_block(struct log_writes_c *lc)
+{
+       if (atomic_dec_and_test(&lc->pending_blocks)) {
+               smp_mb__after_atomic();
+               if (waitqueue_active(&lc->wait))
+                       wake_up(&lc->wait);
+       }
+}
+
+static void put_io_block(struct log_writes_c *lc)
+{
+       if (atomic_dec_and_test(&lc->io_blocks)) {
+               smp_mb__after_atomic();
+               if (waitqueue_active(&lc->wait))
+                       wake_up(&lc->wait);
+       }
+}
+
+static void log_end_io(struct bio *bio, int err)
+{
+       struct log_writes_c *lc = bio->bi_private;
+       struct bio_vec *bvec;
+       int i;
+
+       if (err) {
+               unsigned long flags;
+
+               DMERR("Error writing log block, error=%d", err);
+               spin_lock_irqsave(&lc->blocks_lock, flags);
+               lc->logging_enabled = false;
+               spin_unlock_irqrestore(&lc->blocks_lock, flags);
+       }
+
+       bio_for_each_segment_all(bvec, bio, i)
+               __free_page(bvec->bv_page);
+
+       put_io_block(lc);
+       bio_put(bio);
+}
+
+/*
+ * Meant to be called if there is an error, it will free all the pages
+ * associated with the block.
+ */
+static void free_pending_block(struct log_writes_c *lc,
+                              struct pending_block *block)
+{
+       int i;
+
+       for (i = 0; i < block->vec_cnt; i++) {
+               if (block->vecs[i].bv_page)
+                       __free_page(block->vecs[i].bv_page);
+       }
+       kfree(block->data);
+       kfree(block);
+       put_pending_block(lc);
+}
+
+static int write_metadata(struct log_writes_c *lc, void *entry,
+                         size_t entrylen, void *data, size_t datalen,
+                         sector_t sector)
+{
+       struct bio *bio;
+       struct page *page;
+       void *ptr;
+       size_t ret;
+
+       bio = bio_alloc(GFP_KERNEL, 1);
+       if (!bio) {
+               DMERR("Couldn't alloc log bio");
+               goto error;
+       }
+       bio->bi_iter.bi_size = 0;
+       bio->bi_iter.bi_sector = sector;
+       bio->bi_bdev = lc->logdev->bdev;
+       bio->bi_end_io = log_end_io;
+       bio->bi_private = lc;
+       set_bit(BIO_UPTODATE, &bio->bi_flags);
+
+       page = alloc_page(GFP_KERNEL);
+       if (!page) {
+               DMERR("Couldn't alloc log page");
+               bio_put(bio);
+               goto error;
+       }
+
+       ptr = kmap_atomic(page);
+       memcpy(ptr, entry, entrylen);
+       if (datalen)
+               memcpy(ptr + entrylen, data, datalen);
+       memset(ptr + entrylen + datalen, 0,
+              lc->sectorsize - entrylen - datalen);
+       kunmap_atomic(ptr);
+
+       ret = bio_add_page(bio, page, lc->sectorsize, 0);
+       if (ret != lc->sectorsize) {
+               DMERR("Couldn't add page to the log block");
+               goto error_bio;
+       }
+       submit_bio(WRITE, bio);
+       return 0;
+error_bio:
+       bio_put(bio);
+       __free_page(page);
+error:
+       put_io_block(lc);
+       return -1;
+}
+
+static int log_one_block(struct log_writes_c *lc,
+                        struct pending_block *block, sector_t sector)
+{
+       struct bio *bio;
+       struct log_write_entry entry;
+       size_t ret;
+       int i;
+
+       entry.sector = cpu_to_le64(block->sector);
+       entry.nr_sectors = cpu_to_le64(block->nr_sectors);
+       entry.flags = cpu_to_le64(block->flags);
+       entry.data_len = cpu_to_le64(block->datalen);
+       if (write_metadata(lc, &entry, sizeof(entry), block->data,
+                          block->datalen, sector)) {
+               free_pending_block(lc, block);
+               return -1;
+       }
+
+       if (!block->vec_cnt)
+               goto out;
+       sector++;
+
+       bio = bio_alloc(GFP_KERNEL, block->vec_cnt);
+       if (!bio) {
+               DMERR("Couldn't alloc log bio");
+               goto error;
+       }
+       atomic_inc(&lc->io_blocks);
+       bio->bi_iter.bi_size = 0;
+       bio->bi_iter.bi_sector = sector;
+       bio->bi_bdev = lc->logdev->bdev;
+       bio->bi_end_io = log_end_io;
+       bio->bi_private = lc;
+       set_bit(BIO_UPTODATE, &bio->bi_flags);
+
+       for (i = 0; i < block->vec_cnt; i++) {
+               /*
+                * The page offset is always 0 because we allocate a new page
+                * for every bvec in the original bio for simplicity's sake.
+                */
+               ret = bio_add_page(bio, block->vecs[i].bv_page,
+                                  block->vecs[i].bv_len, 0);
+               if (ret != block->vecs[i].bv_len) {
+                       atomic_inc(&lc->io_blocks);
+                       submit_bio(WRITE, bio);
+                       bio = bio_alloc(GFP_KERNEL, block->vec_cnt - i);
+                       if (!bio) {
+                               DMERR("Couldn't alloc log bio");
+                               goto error;
+                       }
+                       bio->bi_iter.bi_size = 0;
+                       bio->bi_iter.bi_sector = sector;
+                       bio->bi_bdev = lc->logdev->bdev;
+                       bio->bi_end_io = log_end_io;
+                       bio->bi_private = lc;
+                       set_bit(BIO_UPTODATE, &bio->bi_flags);
+
+                       ret = bio_add_page(bio, block->vecs[i].bv_page,
+                                          block->vecs[i].bv_len, 0);
+                       if (ret != block->vecs[i].bv_len) {
+                               DMERR("Couldn't add page on new bio?");
+                               bio_put(bio);
+                               goto error;
+                       }
+               }
+               sector += block->vecs[i].bv_len >> SECTOR_SHIFT;
+       }
+       submit_bio(WRITE, bio);
+out:
+       kfree(block->data);
+       kfree(block);
+       put_pending_block(lc);
+       return 0;
+error:
+       free_pending_block(lc, block);
+       put_io_block(lc);
+       return -1;
+}
+
+static int log_super(struct log_writes_c *lc)
+{
+       struct log_write_super super;
+
+       super.magic = cpu_to_le64(WRITE_LOG_MAGIC);
+       super.version = cpu_to_le64(WRITE_LOG_VERSION);
+       super.nr_entries = cpu_to_le64(lc->logged_entries);
+       super.sectorsize = cpu_to_le32(lc->sectorsize);
+
+       if (write_metadata(lc, &super, sizeof(super), NULL, 0, 0)) {
+               DMERR("Couldn't write super");
+               return -1;
+       }
+
+       return 0;
+}
+
+static inline sector_t logdev_last_sector(struct log_writes_c *lc)
+{
+       return i_size_read(lc->logdev->bdev->bd_inode) >> SECTOR_SHIFT;
+}
+
+static int log_writes_kthread(void *arg)
+{
+       struct log_writes_c *lc = (struct log_writes_c *)arg;
+       sector_t sector = 0;
+
+       while (!kthread_should_stop()) {
+               bool super = false;
+               bool logging_enabled;
+               struct pending_block *block = NULL;
+               int ret;
+
+               spin_lock_irq(&lc->blocks_lock);
+               if (!list_empty(&lc->logging_blocks)) {
+                       block = list_first_entry(&lc->logging_blocks,
+                                                struct pending_block, list);
+                       list_del_init(&block->list);
+                       if (!lc->logging_enabled)
+                               goto next;
+
+                       sector = lc->next_sector;
+                       if (block->flags & LOG_DISCARD_FLAG)
+                               lc->next_sector++;
+                       else
+                               lc->next_sector += block->nr_sectors + 1;
+
+                       /*
+                        * Apparently the size of the device may not be known
+                        * right away, so handle this properly.
+                        */
+                       if (!lc->end_sector)
+                               lc->end_sector = logdev_last_sector(lc);
+                       if (lc->end_sector &&
+                           lc->next_sector >= lc->end_sector) {
+                               DMERR("Ran out of space on the logdev");
+                               lc->logging_enabled = false;
+                               goto next;
+                       }
+                       lc->logged_entries++;
+                       atomic_inc(&lc->io_blocks);
+
+                       super = (block->flags & (LOG_FUA_FLAG | LOG_MARK_FLAG));
+                       if (super)
+                               atomic_inc(&lc->io_blocks);
+               }
+next:
+               logging_enabled = lc->logging_enabled;
+               spin_unlock_irq(&lc->blocks_lock);
+               if (block) {
+                       if (logging_enabled) {
+                               ret = log_one_block(lc, block, sector);
+                               if (!ret && super)
+                                       ret = log_super(lc);
+                               if (ret) {
+                                       spin_lock_irq(&lc->blocks_lock);
+                                       lc->logging_enabled = false;
+                                       spin_unlock_irq(&lc->blocks_lock);
+                               }
+                       } else
+                               free_pending_block(lc, block);
+                       continue;
+               }
+
+               if (!try_to_freeze()) {
+                       set_current_state(TASK_INTERRUPTIBLE);
+                       if (!kthread_should_stop() &&
+                           !atomic_read(&lc->pending_blocks))
+                               schedule();
+                       __set_current_state(TASK_RUNNING);
+               }
+       }
+       return 0;
+}
+
+/*
+ * Construct a log-writes mapping:
+ * log-writes <dev_path> <log_dev_path>
+ */
+static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+       struct log_writes_c *lc;
+       struct dm_arg_set as;
+       const char *devname, *logdevname;
+
+       as.argc = argc;
+       as.argv = argv;
+
+       if (argc < 2) {
+               ti->error = "Invalid argument count";
+               return -EINVAL;
+       }
+
+       lc = kzalloc(sizeof(struct log_writes_c), GFP_KERNEL);
+       if (!lc) {
+               ti->error = "Cannot allocate context";
+               return -ENOMEM;
+       }
+       spin_lock_init(&lc->blocks_lock);
+       INIT_LIST_HEAD(&lc->unflushed_blocks);
+       INIT_LIST_HEAD(&lc->logging_blocks);
+       init_waitqueue_head(&lc->wait);
+       lc->sectorsize = 1 << SECTOR_SHIFT;
+       atomic_set(&lc->io_blocks, 0);
+       atomic_set(&lc->pending_blocks, 0);
+
+       devname = dm_shift_arg(&as);
+       if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev)) {
+               ti->error = "Device lookup failed";
+               goto bad;
+       }
+
+       logdevname = dm_shift_arg(&as);
+       if (dm_get_device(ti, logdevname, dm_table_get_mode(ti->table), &lc->logdev)) {
+               ti->error = "Log device lookup failed";
+               dm_put_device(ti, lc->dev);
+               goto bad;
+       }
+
+       lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write");
+       if (IS_ERR(lc->log_kthread)) {
+               ti->error = "Couldn't alloc kthread";
+               dm_put_device(ti, lc->dev);
+               dm_put_device(ti, lc->logdev);
+               goto bad;
+       }
+
+       /* We put the super at sector 0, start logging at sector 1 */
+       lc->next_sector = 1;
+       lc->logging_enabled = true;
+       lc->end_sector = logdev_last_sector(lc);
+       lc->device_supports_discard = true;
+
+       ti->num_flush_bios = 1;
+       ti->flush_supported = true;
+       ti->num_discard_bios = 1;
+       ti->discards_supported = true;
+       ti->per_bio_data_size = sizeof(struct per_bio_data);
+       ti->private = lc;
+       return 0;
+
+bad:
+       kfree(lc);
+       return -EINVAL;
+}
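As the header comment says, the constructor takes exactly two positional arguments: the device to pass I/O through to and the device that receives the log. A table line such as "0 409600 log-writes /dev/sdb /dev/sdc" (length and device names illustrative), loaded with dmsetup create, would therefore map 409600 sectors onto /dev/sdb while every write is additionally recorded on /dev/sdc.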
+
+static int log_mark(struct log_writes_c *lc, char *data)
+{
+       struct pending_block *block;
+       size_t maxsize = lc->sectorsize - sizeof(struct log_write_entry);
+
+       block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
+       if (!block) {
+               DMERR("Error allocating pending block");
+               return -ENOMEM;
+       }
+
+       block->data = kstrndup(data, maxsize, GFP_KERNEL);
+       if (!block->data) {
+               DMERR("Error copying mark data");
+               kfree(block);
+               return -ENOMEM;
+       }
+       atomic_inc(&lc->pending_blocks);
+       block->datalen = strlen(block->data);
+       block->flags |= LOG_MARK_FLAG;
+       spin_lock_irq(&lc->blocks_lock);
+       list_add_tail(&block->list, &lc->logging_blocks);
+       spin_unlock_irq(&lc->blocks_lock);
+       wake_up_process(lc->log_kthread);
+       return 0;
+}
+
+static void log_writes_dtr(struct dm_target *ti)
+{
+       struct log_writes_c *lc = ti->private;
+
+       spin_lock_irq(&lc->blocks_lock);
+       list_splice_init(&lc->unflushed_blocks, &lc->logging_blocks);
+       spin_unlock_irq(&lc->blocks_lock);
+
+       /*
+        * This is just nice to have since it'll update the super to include
+        * the unflushed blocks; if it fails we don't really care.
+        */
+       log_mark(lc, "dm-log-writes-end");
+       wake_up_process(lc->log_kthread);
+       wait_event(lc->wait, !atomic_read(&lc->io_blocks) &&
+                  !atomic_read(&lc->pending_blocks));
+       kthread_stop(lc->log_kthread);
+
+       WARN_ON(!list_empty(&lc->logging_blocks));
+       WARN_ON(!list_empty(&lc->unflushed_blocks));
+       dm_put_device(ti, lc->dev);
+       dm_put_device(ti, lc->logdev);
+       kfree(lc);
+}
+
+static void normal_map_bio(struct dm_target *ti, struct bio *bio)
+{
+       struct log_writes_c *lc = ti->private;
+
+       bio->bi_bdev = lc->dev->bdev;
+}
+
+static int log_writes_map(struct dm_target *ti, struct bio *bio)
+{
+       struct log_writes_c *lc = ti->private;
+       struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
+       struct pending_block *block;
+       struct bvec_iter iter;
+       struct bio_vec bv;
+       size_t alloc_size;
+       int i = 0;
+       bool flush_bio = (bio->bi_rw & REQ_FLUSH);
+       bool fua_bio = (bio->bi_rw & REQ_FUA);
+       bool discard_bio = (bio->bi_rw & REQ_DISCARD);
+
+       pb->block = NULL;
+
+       /* Don't bother doing anything if logging has been disabled */
+       if (!lc->logging_enabled)
+               goto map_bio;
+
+       /*
+        * Map reads as normal.
+        */
+       if (bio_data_dir(bio) == READ)
+               goto map_bio;
+
+       /* No sectors and not a flush?  Don't care */
+       if (!bio_sectors(bio) && !flush_bio)
+               goto map_bio;
+
+       /*
+        * Discards will have bi_size set but there's no actual data, so just
+        * allocate the size of the pending block.
+        */
+       if (discard_bio)
+               alloc_size = sizeof(struct pending_block);
+       else
+               alloc_size = sizeof(struct pending_block) + sizeof(struct bio_vec) * bio_segments(bio);
+
+       block = kzalloc(alloc_size, GFP_NOIO);
+       if (!block) {
+               DMERR("Error allocating pending block");
+               spin_lock_irq(&lc->blocks_lock);
+               lc->logging_enabled = false;
+               spin_unlock_irq(&lc->blocks_lock);
+               return -ENOMEM;
+       }
+       INIT_LIST_HEAD(&block->list);
+       pb->block = block;
+       atomic_inc(&lc->pending_blocks);
+
+       if (flush_bio)
+               block->flags |= LOG_FLUSH_FLAG;
+       if (fua_bio)
+               block->flags |= LOG_FUA_FLAG;
+       if (discard_bio)
+               block->flags |= LOG_DISCARD_FLAG;
+
+       block->sector = bio->bi_iter.bi_sector;
+       block->nr_sectors = bio_sectors(bio);
+
+       /* We don't need the data, just submit */
+       if (discard_bio) {
+               WARN_ON(flush_bio || fua_bio);
+               if (lc->device_supports_discard)
+                       goto map_bio;
+               bio_endio(bio, 0);
+               return DM_MAPIO_SUBMITTED;
+       }
+
+       /* Flush bio, splice the unflushed blocks onto this list and submit */
+       if (flush_bio && !bio_sectors(bio)) {
+               spin_lock_irq(&lc->blocks_lock);
+               list_splice_init(&lc->unflushed_blocks, &block->list);
+               spin_unlock_irq(&lc->blocks_lock);
+               goto map_bio;
+       }
+
+       /*
+        * This bio will be written out to the log much later, so copy the
+        * contents into newly allocated pages now, so the data is guaranteed
+        * to still be around when the log write finally happens.
+        *
+        * In particular the bio may come from O_DIRECT, in which case we
+        * cannot simply hold on to the page until some later point; we have
+        * to copy the contents manually.
+        */
+       bio_for_each_segment(bv, bio, iter) {
+               struct page *page;
+               void *src, *dst;
+
+               page = alloc_page(GFP_NOIO);
+               if (!page) {
+                       DMERR("Error allocating page");
+                       free_pending_block(lc, block);
+                       spin_lock_irq(&lc->blocks_lock);
+                       lc->logging_enabled = false;
+                       spin_unlock_irq(&lc->blocks_lock);
+                       return -ENOMEM;
+               }
+
+               src = kmap_atomic(bv.bv_page);
+               dst = kmap_atomic(page);
+               memcpy(dst, src + bv.bv_offset, bv.bv_len);
+               kunmap_atomic(dst);
+               kunmap_atomic(src);
+               block->vecs[i].bv_page = page;
+               block->vecs[i].bv_len = bv.bv_len;
+               block->vec_cnt++;
+               i++;
+       }
+
+       /* Had a flush with data in it, weird */
+       if (flush_bio) {
+               spin_lock_irq(&lc->blocks_lock);
+               list_splice_init(&lc->unflushed_blocks, &block->list);
+               spin_unlock_irq(&lc->blocks_lock);
+       }
+map_bio:
+       normal_map_bio(ti, bio);
+       return DM_MAPIO_REMAPPED;
+}
+
+static int normal_end_io(struct dm_target *ti, struct bio *bio, int error)
+{
+       struct log_writes_c *lc = ti->private;
+       struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
+
+       if (bio_data_dir(bio) == WRITE && pb->block) {
+               struct pending_block *block = pb->block;
+               unsigned long flags;
+
+               spin_lock_irqsave(&lc->blocks_lock, flags);
+               if (block->flags & LOG_FLUSH_FLAG) {
+                       list_splice_tail_init(&block->list, &lc->logging_blocks);
+                       list_add_tail(&block->list, &lc->logging_blocks);
+                       wake_up_process(lc->log_kthread);
+               } else if (block->flags & LOG_FUA_FLAG) {
+                       list_add_tail(&block->list, &lc->logging_blocks);
+                       wake_up_process(lc->log_kthread);
+               } else
+                       list_add_tail(&block->list, &lc->unflushed_blocks);
+               spin_unlock_irqrestore(&lc->blocks_lock, flags);
+       }
+
+       return error;
+}
+
+/*
+ * INFO format: <logged entries> <highest allocated sector>
+ */
+static void log_writes_status(struct dm_target *ti, status_type_t type,
+                             unsigned status_flags, char *result,
+                             unsigned maxlen)
+{
+       unsigned sz = 0;
+       struct log_writes_c *lc = ti->private;
+
+       switch (type) {
+       case STATUSTYPE_INFO:
+               DMEMIT("%llu %llu", lc->logged_entries,
+                      (unsigned long long)lc->next_sector - 1);
+               if (!lc->logging_enabled)
+                       DMEMIT(" logging_disabled");
+               break;
+
+       case STATUSTYPE_TABLE:
+               DMEMIT("%s %s", lc->dev->name, lc->logdev->name);
+               break;
+       }
+}
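A dmsetup status line for this target therefore ends with, say, "14 1029" for a device that has logged 14 entries and whose highest allocated log sector is 1029 (values illustrative), with " logging_disabled" appended once logging has been shut off.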
+
+static int log_writes_ioctl(struct dm_target *ti, unsigned int cmd,
+                           unsigned long arg)
+{
+       struct log_writes_c *lc = ti->private;
+       struct dm_dev *dev = lc->dev;
+       int r = 0;
+
+       /*
+        * Only pass ioctls through if the device sizes match exactly.
+        */
+       if (ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
+               r = scsi_verify_blk_ioctl(NULL, cmd);
+
+       return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
+}
+
+static int log_writes_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+                           struct bio_vec *biovec, int max_size)
+{
+       struct log_writes_c *lc = ti->private;
+       struct request_queue *q = bdev_get_queue(lc->dev->bdev);
+
+       if (!q->merge_bvec_fn)
+               return max_size;
+
+       bvm->bi_bdev = lc->dev->bdev;
+       bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);
+
+       return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static int log_writes_iterate_devices(struct dm_target *ti,
+                                     iterate_devices_callout_fn fn,
+                                     void *data)
+{
+       struct log_writes_c *lc = ti->private;
+
+       return fn(ti, lc->dev, 0, ti->len, data);
+}
+
+/*
+ * Messages supported:
+ *   mark <mark data> - log <mark data> as a mark entry.
+ */
+static int log_writes_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+       int r = -EINVAL;
+       struct log_writes_c *lc = ti->private;
+
+       if (argc != 2) {
+               DMWARN("Invalid log-writes message arguments, expected 2 arguments, got %d", argc);
+               return r;
+       }
+
+       if (!strcasecmp(argv[0], "mark"))
+               r = log_mark(lc, argv[1]);
+       else
+               DMWARN("Unrecognised log writes target message received: %s", argv[0]);
+
+       return r;
+}
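In practice a mark is injected with something like "dmsetup message <device> 0 mark my_test_point" (mark data illustrative): dm core hands this function argc == 2 with argv[0] == "mark", and log_mark() queues the string as a LOG_MARK_FLAG entry so a test harness can later correlate positions in the log with phases of a test run.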
+
+static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+       struct log_writes_c *lc = ti->private;
+       struct request_queue *q = bdev_get_queue(lc->dev->bdev);
+
+       if (!q || !blk_queue_discard(q)) {
+               lc->device_supports_discard = false;
+               limits->discard_granularity = 1 << SECTOR_SHIFT;
+               limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT);
+       }
+}
+
+static struct target_type log_writes_target = {
+       .name   = "log-writes",
+       .version = {1, 0, 0},
+       .module = THIS_MODULE,
+       .ctr    = log_writes_ctr,
+       .dtr    = log_writes_dtr,
+       .map    = log_writes_map,
+       .end_io = normal_end_io,
+       .status = log_writes_status,
+       .ioctl  = log_writes_ioctl,
+       .merge  = log_writes_merge,
+       .message = log_writes_message,
+       .iterate_devices = log_writes_iterate_devices,
+       .io_hints = log_writes_io_hints,
+};
+
+static int __init dm_log_writes_init(void)
+{
+       int r = dm_register_target(&log_writes_target);
+
+       if (r < 0)
+               DMERR("register failed %d", r);
+
+       return r;
+}
+
+static void __exit dm_log_writes_exit(void)
+{
+       dm_unregister_target(&log_writes_target);
+}
+
+module_init(dm_log_writes_init);
+module_exit(dm_log_writes_exit);
+
+MODULE_DESCRIPTION(DM_NAME " log writes target");
+MODULE_AUTHOR("Josef Bacik <jbacik@fb.com>");
+MODULE_LICENSE("GPL");
index d376dc87716ebbd64666e10d7ab54805b0e28f76..63953477a07c36e771a32d5bde686bd0f05890f1 100644 (file)
@@ -428,7 +428,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
        } else {
                /* blk-mq request-based interface */
                *__clone = blk_get_request(bdev_get_queue(bdev),
-                                          rq_data_dir(rq), GFP_KERNEL);
+                                          rq_data_dir(rq), GFP_ATOMIC);
                if (IS_ERR(*__clone))
                        /* ENOMEM, requeue */
                        return r;
@@ -1627,7 +1627,7 @@ static int __pgpath_busy(struct pgpath *pgpath)
 {
        struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
 
-       return dm_underlying_device_busy(q);
+       return blk_lld_busy(q);
 }
 
 /*
@@ -1703,7 +1703,7 @@ out:
  *---------------------------------------------------------------*/
 static struct target_type multipath_target = {
        .name = "multipath",
-       .version = {1, 8, 0},
+       .version = {1, 9, 0},
        .module = THIS_MODULE,
        .ctr = multipath_ctr,
        .dtr = multipath_dtr,
index c62c5ab6aed52018f6018f84f633124f6356e821..7e818f5f1dc4e048344b08a7b1ddf8e32304b2ac 100644 (file)
@@ -11,7 +11,7 @@
 struct dm_sysfs_attr {
        struct attribute attr;
        ssize_t (*show)(struct mapped_device *, char *);
-       ssize_t (*store)(struct mapped_device *, char *);
+       ssize_t (*store)(struct mapped_device *, const char *, size_t count);
 };
 
 #define DM_ATTR_RO(_name) \
@@ -39,6 +39,31 @@ static ssize_t dm_attr_show(struct kobject *kobj, struct attribute *attr,
        return ret;
 }
 
+#define DM_ATTR_RW(_name) \
+struct dm_sysfs_attr dm_attr_##_name = \
+       __ATTR(_name, S_IRUGO | S_IWUSR, dm_attr_##_name##_show, dm_attr_##_name##_store)
+
+static ssize_t dm_attr_store(struct kobject *kobj, struct attribute *attr,
+                            const char *page, size_t count)
+{
+       struct dm_sysfs_attr *dm_attr;
+       struct mapped_device *md;
+       ssize_t ret;
+
+       dm_attr = container_of(attr, struct dm_sysfs_attr, attr);
+       if (!dm_attr->store)
+               return -EIO;
+
+       md = dm_get_from_kobject(kobj);
+       if (!md)
+               return -EINVAL;
+
+       ret = dm_attr->store(md, page, count);
+       dm_put(md);
+
+       return ret;
+}
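With this dispatcher in place, a write to a writable attribute under /sys/block/dm-<minor>/dm/ resolves the mapped_device from the embedded kobject and hands the buffer to that attribute's ->store() method (returning -EIO for attributes that remain read-only); the first user is the rq_based_seq_io_merge_deadline attribute registered below and implemented in dm.c.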
+
 static ssize_t dm_attr_name_show(struct mapped_device *md, char *buf)
 {
        if (dm_copy_name_and_uuid(md, buf, NULL))
@@ -64,25 +89,33 @@ static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
        return strlen(buf);
 }
 
+static ssize_t dm_attr_use_blk_mq_show(struct mapped_device *md, char *buf)
+{
+       sprintf(buf, "%d\n", dm_use_blk_mq(md));
+
+       return strlen(buf);
+}
+
 static DM_ATTR_RO(name);
 static DM_ATTR_RO(uuid);
 static DM_ATTR_RO(suspended);
+static DM_ATTR_RO(use_blk_mq);
+static DM_ATTR_RW(rq_based_seq_io_merge_deadline);
 
 static struct attribute *dm_attrs[] = {
        &dm_attr_name.attr,
        &dm_attr_uuid.attr,
        &dm_attr_suspended.attr,
+       &dm_attr_use_blk_mq.attr,
+       &dm_attr_rq_based_seq_io_merge_deadline.attr,
        NULL,
 };
 
 static const struct sysfs_ops dm_sysfs_ops = {
        .show   = dm_attr_show,
+       .store  = dm_attr_store,
 };
 
-/*
- * dm kobject is embedded in mapped_device structure
- * no need to define release function here
- */
 static struct kobj_type dm_ktype = {
        .sysfs_ops      = &dm_sysfs_ops,
        .default_attrs  = dm_attrs,
index 6554d9148927771296e52cc2882f9e6f7fa68145..d9b00b8565c6dc1a36f5a3d863baa370126da593 100644 (file)
@@ -18,6 +18,8 @@
 #include <linux/mutex.h>
 #include <linux/delay.h>
 #include <linux/atomic.h>
+#include <linux/blk-mq.h>
+#include <linux/mount.h>
 
 #define DM_MSG_PREFIX "table"
 
@@ -372,23 +374,18 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
        int r;
        dev_t uninitialized_var(dev);
        struct dm_dev_internal *dd;
-       unsigned int major, minor;
        struct dm_table *t = ti->table;
-       char dummy;
+       struct block_device *bdev;
 
        BUG_ON(!t);
 
-       if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) {
-               /* Extract the major/minor numbers */
-               dev = MKDEV(major, minor);
-               if (MAJOR(dev) != major || MINOR(dev) != minor)
-                       return -EOVERFLOW;
+       /* convert the path to a device */
+       bdev = lookup_bdev(path);
+       if (IS_ERR(bdev)) {
+               dev = name_to_dev_t(path);
+               if (!dev)
+                       return -ENODEV;
        } else {
-               /* convert the path to a device */
-               struct block_device *bdev = lookup_bdev(path);
-
-               if (IS_ERR(bdev))
-                       return PTR_ERR(bdev);
                dev = bdev->bd_dev;
                bdput(bdev);
        }
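The practical effect is that dm_get_device() now accepts anything name_to_dev_t() can resolve whenever the direct path lookup fails, most notably the "major:minor" pairs this function previously parsed by hand with sscanf(), without duplicating that parsing logic here.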
@@ -939,7 +936,7 @@ bool dm_table_mq_request_based(struct dm_table *t)
        return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED;
 }
 
-static int dm_table_alloc_md_mempools(struct dm_table *t)
+static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
 {
        unsigned type = dm_table_get_type(t);
        unsigned per_bio_data_size = 0;
@@ -957,7 +954,7 @@ static int dm_table_alloc_md_mempools(struct dm_table *t)
                        per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size);
                }
 
-       t->mempools = dm_alloc_md_mempools(type, t->integrity_supported, per_bio_data_size);
+       t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size);
        if (!t->mempools)
                return -ENOMEM;
 
@@ -1127,7 +1124,7 @@ int dm_table_complete(struct dm_table *t)
                return r;
        }
 
-       r = dm_table_alloc_md_mempools(t);
+       r = dm_table_alloc_md_mempools(t, t->md);
        if (r)
                DMERR("unable to allocate mempools");
 
@@ -1339,14 +1336,14 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
                        continue;
 
                if (ti->flush_supported)
-                       return 1;
+                       return true;
 
                if (ti->type->iterate_devices &&
                    ti->type->iterate_devices(ti, device_flush_capable, &flush))
-                       return 1;
+                       return true;
        }
 
-       return 0;
+       return false;
 }
 
 static bool dm_table_discard_zeroes_data(struct dm_table *t)
@@ -1359,10 +1356,10 @@ static bool dm_table_discard_zeroes_data(struct dm_table *t)
                ti = dm_table_get_target(t, i++);
 
                if (ti->discard_zeroes_data_unsupported)
-                       return 0;
+                       return false;
        }
 
-       return 1;
+       return true;
 }
 
 static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
@@ -1408,10 +1405,10 @@ static bool dm_table_all_devices_attribute(struct dm_table *t,
 
                if (!ti->type->iterate_devices ||
                    !ti->type->iterate_devices(ti, func, NULL))
-                       return 0;
+                       return false;
        }
 
-       return 1;
+       return true;
 }
 
 static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev,
@@ -1468,14 +1465,14 @@ static bool dm_table_supports_discards(struct dm_table *t)
                        continue;
 
                if (ti->discards_supported)
-                       return 1;
+                       return true;
 
                if (ti->type->iterate_devices &&
                    ti->type->iterate_devices(ti, device_discard_capable, NULL))
-                       return 1;
+                       return true;
        }
 
-       return 0;
+       return false;
 }
 
 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
@@ -1677,20 +1674,6 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
        return r;
 }
 
-int dm_table_any_busy_target(struct dm_table *t)
-{
-       unsigned i;
-       struct dm_target *ti;
-
-       for (i = 0; i < t->num_targets; i++) {
-               ti = t->targets + i;
-               if (ti->type->busy && ti->type->busy(ti))
-                       return 1;
-       }
-
-       return 0;
-}
-
 struct mapped_device *dm_table_get_md(struct dm_table *t)
 {
        return t->md;
@@ -1709,9 +1692,13 @@ void dm_table_run_md_queue_async(struct dm_table *t)
        md = dm_table_get_md(t);
        queue = dm_get_md_queue(md);
        if (queue) {
-               spin_lock_irqsave(queue->queue_lock, flags);
-               blk_run_queue_async(queue);
-               spin_unlock_irqrestore(queue->queue_lock, flags);
+               if (queue->mq_ops)
+                       blk_mq_run_hw_queues(queue, true);
+               else {
+                       spin_lock_irqsave(queue->queue_lock, flags);
+                       blk_run_queue_async(queue);
+                       spin_unlock_irqrestore(queue->queue_lock, flags);
+               }
        }
 }
 EXPORT_SYMBOL(dm_table_run_md_queue_async);
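The split is needed because blk-mq queues have no request_fn to kick under queue_lock; for them blk_mq_run_hw_queues(queue, true) restarts the hardware queues asynchronously, while legacy request_fn queues keep the locked blk_run_queue_async() call.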
index 7a7bab8947ae3485d31c132cb3398251c7d507cf..66616db33e6fdbc14896704fe50483930ffb1cfd 100644 (file)
 
 #include <linux/module.h>
 #include <linux/device-mapper.h>
+#include <linux/reboot.h>
 #include <crypto/hash.h>
 
 #define DM_MSG_PREFIX                  "verity"
 
+#define DM_VERITY_ENV_LENGTH           42
+#define DM_VERITY_ENV_VAR_NAME         "DM_VERITY_ERR_BLOCK_NR"
+
 #define DM_VERITY_IO_VEC_INLINE                16
 #define DM_VERITY_MEMPOOL_SIZE         4
 #define DM_VERITY_DEFAULT_PREFETCH_SIZE        262144
 
 #define DM_VERITY_MAX_LEVELS           63
+#define DM_VERITY_MAX_CORRUPTED_ERRS   100
+
+#define DM_VERITY_OPT_LOGGING          "ignore_corruption"
+#define DM_VERITY_OPT_RESTART          "restart_on_corruption"
 
 static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
 
 module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
 
+enum verity_mode {
+       DM_VERITY_MODE_EIO,
+       DM_VERITY_MODE_LOGGING,
+       DM_VERITY_MODE_RESTART
+};
+
+enum verity_block_type {
+       DM_VERITY_BLOCK_TYPE_DATA,
+       DM_VERITY_BLOCK_TYPE_METADATA
+};
+
 struct dm_verity {
        struct dm_dev *data_dev;
        struct dm_dev *hash_dev;
@@ -54,6 +73,8 @@ struct dm_verity {
        unsigned digest_size;   /* digest size for the current hash algorithm */
        unsigned shash_descsize;/* the size of temporary space for crypto */
        int hash_failed;        /* set to 1 if hash of any block failed */
+       enum verity_mode mode;  /* mode for handling verification errors */
+       unsigned corrupted_errs;/* Number of errors for corrupted blocks */
 
        mempool_t *vec_mempool; /* mempool of bio vector */
 
@@ -174,6 +195,57 @@ static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
                *offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits);
 }
 
+/*
+ * Handle verification errors.
+ */
+static int verity_handle_err(struct dm_verity *v, enum verity_block_type type,
+                            unsigned long long block)
+{
+       char verity_env[DM_VERITY_ENV_LENGTH];
+       char *envp[] = { verity_env, NULL };
+       const char *type_str = "";
+       struct mapped_device *md = dm_table_get_md(v->ti->table);
+
+       /* Corruption should be visible in device status in all modes */
+       v->hash_failed = 1;
+
+       if (v->corrupted_errs >= DM_VERITY_MAX_CORRUPTED_ERRS)
+               goto out;
+
+       v->corrupted_errs++;
+
+       switch (type) {
+       case DM_VERITY_BLOCK_TYPE_DATA:
+               type_str = "data";
+               break;
+       case DM_VERITY_BLOCK_TYPE_METADATA:
+               type_str = "metadata";
+               break;
+       default:
+               BUG();
+       }
+
+       DMERR("%s: %s block %llu is corrupted", v->data_dev->name, type_str,
+               block);
+
+       if (v->corrupted_errs == DM_VERITY_MAX_CORRUPTED_ERRS)
+               DMERR("%s: reached maximum errors", v->data_dev->name);
+
+       snprintf(verity_env, DM_VERITY_ENV_LENGTH, "%s=%d,%llu",
+               DM_VERITY_ENV_VAR_NAME, type, block);
+
+       kobject_uevent_env(&disk_to_dev(dm_disk(md))->kobj, KOBJ_CHANGE, envp);
+
+out:
+       if (v->mode == DM_VERITY_MODE_LOGGING)
+               return 0;
+
+       if (v->mode == DM_VERITY_MODE_RESTART)
+               kernel_restart("dm-verity device corrupted");
+
+       return 1;
+}
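Besides the rate-limited DMERR, userspace is notified through a KOBJ_CHANGE uevent whose payload encodes the enum value and block number from the snprintf() above; a corrupted data block 12345 would produce DM_VERITY_ERR_BLOCK_NR=0,12345 (block number illustrative), which lets a monitoring daemon react to corruption even in ignore_corruption mode.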
+
 /*
  * Verify hash of a metadata block pertaining to the specified data block
  * ("block" argument) at a specified level ("level" argument).
@@ -251,11 +323,11 @@ static int verity_verify_level(struct dm_verity_io *io, sector_t block,
                        goto release_ret_r;
                }
                if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
-                       DMERR_LIMIT("metadata block %llu is corrupted",
-                               (unsigned long long)hash_block);
-                       v->hash_failed = 1;
-                       r = -EIO;
-                       goto release_ret_r;
+                       if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA,
+                                             hash_block)) {
+                               r = -EIO;
+                               goto release_ret_r;
+                       }
                } else
                        aux->hash_verified = 1;
        }
@@ -367,10 +439,9 @@ test_block_hash:
                        return r;
                }
                if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
-                       DMERR_LIMIT("data block %llu is corrupted",
-                               (unsigned long long)(io->block + b));
-                       v->hash_failed = 1;
-                       return -EIO;
+                       if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
+                                             io->block + b))
+                               return -EIO;
                }
        }
 
@@ -546,6 +617,19 @@ static void verity_status(struct dm_target *ti, status_type_t type,
                else
                        for (x = 0; x < v->salt_size; x++)
                                DMEMIT("%02x", v->salt[x]);
+               if (v->mode != DM_VERITY_MODE_EIO) {
+                       DMEMIT(" 1 ");
+                       switch (v->mode) {
+                       case DM_VERITY_MODE_LOGGING:
+                               DMEMIT(DM_VERITY_OPT_LOGGING);
+                               break;
+                       case DM_VERITY_MODE_RESTART:
+                               DMEMIT(DM_VERITY_OPT_RESTART);
+                               break;
+                       default:
+                               BUG();
+                       }
+               }
                break;
        }
 }
@@ -647,13 +731,19 @@ static void verity_dtr(struct dm_target *ti)
 static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 {
        struct dm_verity *v;
-       unsigned num;
+       struct dm_arg_set as;
+       const char *opt_string;
+       unsigned int num, opt_params;
        unsigned long long num_ll;
        int r;
        int i;
        sector_t hash_position;
        char dummy;
 
+       static struct dm_arg _args[] = {
+               {0, 1, "Invalid number of feature args"},
+       };
+
        v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
        if (!v) {
                ti->error = "Cannot allocate verity structure";
@@ -668,8 +758,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                goto bad;
        }
 
-       if (argc != 10) {
-               ti->error = "Invalid argument count: exactly 10 arguments required";
+       if (argc < 10) {
+               ti->error = "Not enough arguments";
                r = -EINVAL;
                goto bad;
        }
@@ -790,6 +880,39 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                }
        }
 
+       argv += 10;
+       argc -= 10;
+
+       /* Optional parameters */
+       if (argc) {
+               as.argc = argc;
+               as.argv = argv;
+
+               r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+               if (r)
+                       goto bad;
+
+               while (opt_params) {
+                       opt_params--;
+                       opt_string = dm_shift_arg(&as);
+                       if (!opt_string) {
+                               ti->error = "Not enough feature arguments";
+                               r = -EINVAL;
+                               goto bad;
+                       }
+
+                       if (!strcasecmp(opt_string, DM_VERITY_OPT_LOGGING))
+                               v->mode = DM_VERITY_MODE_LOGGING;
+                       else if (!strcasecmp(opt_string, DM_VERITY_OPT_RESTART))
+                               v->mode = DM_VERITY_MODE_RESTART;
+                       else {
+                               ti->error = "Invalid feature arguments";
+                               r = -EINVAL;
+                               goto bad;
+                       }
+               }
+       }
+
        v->hash_per_block_bits =
                __fls((1 << v->hash_dev_block_bits) / v->digest_size);
 
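Concretely, a verity table may now append a single feature argument after the ten fixed ones, e.g. ending in "... <root_digest> <salt> 1 ignore_corruption" (leading arguments elided) to log corruption while still returning data, or "1 restart_on_corruption" to reboot through kernel_restart(); omitting the group keeps the historical behaviour of failing reads with -EIO, and verity_status() reports the chosen mode back to userspace.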
index 8001fe9e3434734ad92c8109ef8fa860906238ce..f8c7ca3e8947378484a6d3f9745c363c78ab2879 100644 (file)
@@ -21,6 +21,9 @@
 #include <linux/delay.h>
 #include <linux/wait.h>
 #include <linux/kthread.h>
+#include <linux/ktime.h>
+#include <linux/elevator.h> /* for rq_end_sector() */
+#include <linux/blk-mq.h>
 
 #include <trace/events/block.h>
 
@@ -216,8 +219,29 @@ struct mapped_device {
 
        struct kthread_worker kworker;
        struct task_struct *kworker_task;
+
+       /* for request-based merge heuristic in dm_request_fn() */
+       unsigned seq_rq_merge_deadline_usecs;
+       int last_rq_rw;
+       sector_t last_rq_pos;
+       ktime_t last_rq_start_time;
+
+       /* for blk-mq request-based DM support */
+       struct blk_mq_tag_set tag_set;
+       bool use_blk_mq;
 };
 
+#ifdef CONFIG_DM_MQ_DEFAULT
+static bool use_blk_mq = true;
+#else
+static bool use_blk_mq = false;
+#endif
+
+bool dm_use_blk_mq(struct mapped_device *md)
+{
+       return md->use_blk_mq;
+}
+
 /*
  * For mempools pre-allocation at the table loading time.
  */
@@ -250,35 +274,35 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
  */
 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
 
-static unsigned __dm_get_reserved_ios(unsigned *reserved_ios,
+static unsigned __dm_get_module_param(unsigned *module_param,
                                      unsigned def, unsigned max)
 {
-       unsigned ios = ACCESS_ONCE(*reserved_ios);
-       unsigned modified_ios = 0;
+       unsigned param = ACCESS_ONCE(*module_param);
+       unsigned modified_param = 0;
 
-       if (!ios)
-               modified_ios = def;
-       else if (ios > max)
-               modified_ios = max;
+       if (!param)
+               modified_param = def;
+       else if (param > max)
+               modified_param = max;
 
-       if (modified_ios) {
-               (void)cmpxchg(reserved_ios, ios, modified_ios);
-               ios = modified_ios;
+       if (modified_param) {
+               (void)cmpxchg(module_param, param, modified_param);
+               param = modified_param;
        }
 
-       return ios;
+       return param;
 }
 
 unsigned dm_get_reserved_bio_based_ios(void)
 {
-       return __dm_get_reserved_ios(&reserved_bio_based_ios,
+       return __dm_get_module_param(&reserved_bio_based_ios,
                                     RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
 }
 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 
 unsigned dm_get_reserved_rq_based_ios(void)
 {
-       return __dm_get_reserved_ios(&reserved_rq_based_ios,
+       return __dm_get_module_param(&reserved_rq_based_ios,
                                     RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
 }
 EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
@@ -1017,6 +1041,11 @@ static void end_clone_bio(struct bio *clone, int error)
        blk_update_request(tio->orig, 0, nr_bytes);
 }
 
+static struct dm_rq_target_io *tio_from_request(struct request *rq)
+{
+       return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
+}
+
 /*
  * Don't touch any member of the md after calling this function because
  * the md may be freed in dm_put() at the end of this function.
@@ -1024,10 +1053,13 @@ static void end_clone_bio(struct bio *clone, int error)
  */
 static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 {
+       int nr_requests_pending;
+
        atomic_dec(&md->pending[rw]);
 
        /* nudge anyone waiting on suspend queue */
-       if (!md_in_flight(md))
+       nr_requests_pending = md_in_flight(md);
+       if (!nr_requests_pending)
                wake_up(&md->wait);
 
        /*
@@ -1036,8 +1068,13 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
         * back into ->request_fn() could deadlock attempting to grab the
         * queue lock again.
         */
-       if (run_queue)
-               blk_run_queue_async(md->queue);
+       if (run_queue) {
+               if (md->queue->mq_ops)
+                       blk_mq_run_hw_queues(md->queue, true);
+               else if (!nr_requests_pending ||
+                        (nr_requests_pending >= md->queue->nr_congestion_on))
+                       blk_run_queue_async(md->queue);
+       }
 
        /*
         * dm_put() must be at the end of this function. See the comment above
@@ -1048,13 +1085,18 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 static void free_rq_clone(struct request *clone)
 {
        struct dm_rq_target_io *tio = clone->end_io_data;
+       struct mapped_device *md = tio->md;
 
        blk_rq_unprep_clone(clone);
-       if (clone->q && clone->q->mq_ops)
+
+       if (clone->q->mq_ops)
                tio->ti->type->release_clone_rq(clone);
-       else
-               free_clone_request(tio->md, clone);
-       free_rq_tio(tio);
+       else if (!md->queue->mq_ops)
+               /* request_fn queue stacked on request_fn queue(s) */
+               free_clone_request(md, clone);
+
+       if (!md->queue->mq_ops)
+               free_rq_tio(tio);
 }
 
 /*
@@ -1083,17 +1125,22 @@ static void dm_end_request(struct request *clone, int error)
        }
 
        free_rq_clone(clone);
-       blk_end_request_all(rq, error);
+       if (!rq->q->mq_ops)
+               blk_end_request_all(rq, error);
+       else
+               blk_mq_end_request(rq, error);
        rq_completed(md, rw, true);
 }
 
 static void dm_unprep_request(struct request *rq)
 {
-       struct dm_rq_target_io *tio = rq->special;
+       struct dm_rq_target_io *tio = tio_from_request(rq);
        struct request *clone = tio->clone;
 
-       rq->special = NULL;
-       rq->cmd_flags &= ~REQ_DONTPREP;
+       if (!rq->q->mq_ops) {
+               rq->special = NULL;
+               rq->cmd_flags &= ~REQ_DONTPREP;
+       }
 
        if (clone)
                free_rq_clone(clone);
@@ -1102,18 +1149,29 @@ static void dm_unprep_request(struct request *rq)
 /*
  * Requeue the original request of a clone.
  */
-static void dm_requeue_unmapped_original_request(struct mapped_device *md,
-                                                struct request *rq)
+static void old_requeue_request(struct request *rq)
 {
-       int rw = rq_data_dir(rq);
        struct request_queue *q = rq->q;
        unsigned long flags;
 
-       dm_unprep_request(rq);
-
        spin_lock_irqsave(q->queue_lock, flags);
        blk_requeue_request(q, rq);
        spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static void dm_requeue_unmapped_original_request(struct mapped_device *md,
+                                                struct request *rq)
+{
+       int rw = rq_data_dir(rq);
+
+       dm_unprep_request(rq);
+
+       if (!rq->q->mq_ops)
+               old_requeue_request(rq);
+       else {
+               blk_mq_requeue_request(rq);
+               blk_mq_kick_requeue_list(rq->q);
+       }
 
        rq_completed(md, rw, false);
 }
@@ -1125,35 +1183,44 @@ static void dm_requeue_unmapped_request(struct request *clone)
        dm_requeue_unmapped_original_request(tio->md, tio->orig);
 }
 
-static void __stop_queue(struct request_queue *q)
-{
-       blk_stop_queue(q);
-}
-
-static void stop_queue(struct request_queue *q)
+static void old_stop_queue(struct request_queue *q)
 {
        unsigned long flags;
 
+       if (blk_queue_stopped(q))
+               return;
+
        spin_lock_irqsave(q->queue_lock, flags);
-       __stop_queue(q);
+       blk_stop_queue(q);
        spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-static void __start_queue(struct request_queue *q)
+static void stop_queue(struct request_queue *q)
 {
-       if (blk_queue_stopped(q))
-               blk_start_queue(q);
+       if (!q->mq_ops)
+               old_stop_queue(q);
+       else
+               blk_mq_stop_hw_queues(q);
 }
 
-static void start_queue(struct request_queue *q)
+static void old_start_queue(struct request_queue *q)
 {
        unsigned long flags;
 
        spin_lock_irqsave(q->queue_lock, flags);
-       __start_queue(q);
+       if (blk_queue_stopped(q))
+               blk_start_queue(q);
        spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
+static void start_queue(struct request_queue *q)
+{
+       if (!q->mq_ops)
+               old_start_queue(q);
+       else
+               blk_mq_start_stopped_hw_queues(q, true);
+}
+
 static void dm_done(struct request *clone, int error, bool mapped)
 {
        int r = error;
@@ -1192,13 +1259,20 @@ static void dm_done(struct request *clone, int error, bool mapped)
 static void dm_softirq_done(struct request *rq)
 {
        bool mapped = true;
-       struct dm_rq_target_io *tio = rq->special;
+       struct dm_rq_target_io *tio = tio_from_request(rq);
        struct request *clone = tio->clone;
+       int rw;
 
        if (!clone) {
-               blk_end_request_all(rq, tio->error);
-               rq_completed(tio->md, rq_data_dir(rq), false);
-               free_rq_tio(tio);
+               rw = rq_data_dir(rq);
+               if (!rq->q->mq_ops) {
+                       blk_end_request_all(rq, tio->error);
+                       rq_completed(tio->md, rw, false);
+                       free_rq_tio(tio);
+               } else {
+                       blk_mq_end_request(rq, tio->error);
+                       rq_completed(tio->md, rw, false);
+               }
                return;
        }
 
@@ -1214,7 +1288,7 @@ static void dm_softirq_done(struct request *rq)
  */
 static void dm_complete_request(struct request *rq, int error)
 {
-       struct dm_rq_target_io *tio = rq->special;
+       struct dm_rq_target_io *tio = tio_from_request(rq);
 
        tio->error = error;
        blk_complete_request(rq);
@@ -1233,7 +1307,7 @@ static void dm_kill_unmapped_request(struct request *rq, int error)
 }
 
 /*
- * Called with the clone's queue lock held
+ * Called with the clone's queue lock held (for non-blk-mq)
  */
 static void end_clone_request(struct request *clone, int error)
 {
@@ -1693,7 +1767,7 @@ out:
  * The request function that just remaps the bio built up by
  * dm_merge_bvec.
  */
-static void _dm_request(struct request_queue *q, struct bio *bio)
+static void dm_make_request(struct request_queue *q, struct bio *bio)
 {
        int rw = bio_data_dir(bio);
        struct mapped_device *md = q->queuedata;
@@ -1725,16 +1799,6 @@ int dm_request_based(struct mapped_device *md)
        return blk_queue_stackable(md->queue);
 }
 
-static void dm_request(struct request_queue *q, struct bio *bio)
-{
-       struct mapped_device *md = q->queuedata;
-
-       if (dm_request_based(md))
-               blk_queue_bio(q, bio);
-       else
-               _dm_request(q, bio);
-}
-
 static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
 {
        int r;
@@ -1787,15 +1851,25 @@ static int setup_clone(struct request *clone, struct request *rq,
 static struct request *clone_rq(struct request *rq, struct mapped_device *md,
                                struct dm_rq_target_io *tio, gfp_t gfp_mask)
 {
-       struct request *clone = alloc_clone_request(md, gfp_mask);
+       /*
+        * Do not allocate a clone if tio->clone was already set
+        * (see: dm_mq_queue_rq).
+        */
+       bool alloc_clone = !tio->clone;
+       struct request *clone;
 
-       if (!clone)
-               return NULL;
+       if (alloc_clone) {
+               clone = alloc_clone_request(md, gfp_mask);
+               if (!clone)
+                       return NULL;
+       } else
+               clone = tio->clone;
 
        blk_rq_init(NULL, clone);
        if (setup_clone(clone, rq, tio, gfp_mask)) {
                /* -ENOMEM */
-               free_clone_request(md, clone);
+               if (alloc_clone)
+                       free_clone_request(md, clone);
                return NULL;
        }
 
@@ -1804,6 +1878,19 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
 
 static void map_tio_request(struct kthread_work *work);
 
+static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
+                    struct mapped_device *md)
+{
+       tio->md = md;
+       tio->ti = NULL;
+       tio->clone = NULL;
+       tio->orig = rq;
+       tio->error = 0;
+       memset(&tio->info, 0, sizeof(tio->info));
+       if (md->kworker_task)
+               init_kthread_work(&tio->work, map_tio_request);
+}
+
 static struct dm_rq_target_io *prep_tio(struct request *rq,
                                        struct mapped_device *md, gfp_t gfp_mask)
 {
@@ -1815,13 +1902,7 @@ static struct dm_rq_target_io *prep_tio(struct request *rq,
        if (!tio)
                return NULL;
 
-       tio->md = md;
-       tio->ti = NULL;
-       tio->clone = NULL;
-       tio->orig = rq;
-       tio->error = 0;
-       memset(&tio->info, 0, sizeof(tio->info));
-       init_kthread_work(&tio->work, map_tio_request);
+       init_tio(tio, rq, md);
 
        table = dm_get_live_table(md, &srcu_idx);
        if (!dm_table_mq_request_based(table)) {
@@ -1865,11 +1946,11 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
  * DM_MAPIO_REQUEUE : the original request needs to be requeued
  * < 0              : the request was completed due to failure
  */
-static int map_request(struct dm_target *ti, struct request *rq,
+static int map_request(struct dm_rq_target_io *tio, struct request *rq,
                       struct mapped_device *md)
 {
        int r;
-       struct dm_rq_target_io *tio = rq->special;
+       struct dm_target *ti = tio->ti;
        struct request *clone = NULL;
 
        if (tio->clone) {
@@ -1884,7 +1965,7 @@ static int map_request(struct dm_target *ti, struct request *rq,
                }
                if (IS_ERR(clone))
                        return DM_MAPIO_REQUEUE;
-               if (setup_clone(clone, rq, tio, GFP_KERNEL)) {
+               if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
                        /* -ENOMEM */
                        ti->type->release_clone_rq(clone);
                        return DM_MAPIO_REQUEUE;
@@ -1925,15 +2006,24 @@ static void map_tio_request(struct kthread_work *work)
        struct request *rq = tio->orig;
        struct mapped_device *md = tio->md;
 
-       if (map_request(tio->ti, rq, md) == DM_MAPIO_REQUEUE)
+       if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
                dm_requeue_unmapped_original_request(md, rq);
 }
 
 static void dm_start_request(struct mapped_device *md, struct request *orig)
 {
-       blk_start_request(orig);
+       if (!orig->q->mq_ops)
+               blk_start_request(orig);
+       else
+               blk_mq_start_request(orig);
        atomic_inc(&md->pending[rq_data_dir(orig)]);
 
+       if (md->seq_rq_merge_deadline_usecs) {
+               md->last_rq_pos = rq_end_sector(orig);
+               md->last_rq_rw = rq_data_dir(orig);
+               md->last_rq_start_time = ktime_get();
+       }
+
        /*
         * Hold the md reference here for the in-flight I/O.
         * We can't rely on the reference count by device opener,
@@ -1944,6 +2034,45 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
        dm_get(md);
 }
 
+#define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000
+
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
+{
+       return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs);
+}
+
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
+                                                    const char *buf, size_t count)
+{
+       unsigned deadline;
+
+       if (!dm_request_based(md) || md->use_blk_mq)
+               return count;
+
+       if (kstrtouint(buf, 10, &deadline))
+               return -EINVAL;
+
+       if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS)
+               deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS;
+
+       md->seq_rq_merge_deadline_usecs = deadline;
+
+       return count;
+}
+
+static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md)
+{
+       ktime_t kt_deadline;
+
+       if (!md->seq_rq_merge_deadline_usecs)
+               return false;
+
+       kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC);
+       kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline);
+
+       return !ktime_after(ktime_get(), kt_deadline);
+}
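This deadline is the value exposed through the rq_based_seq_io_merge_deadline attribute wired up in dm-sysfs.c above: for example, "echo 8000 > /sys/block/dm-0/dm/rq_based_seq_io_merge_deadline" (device name illustrative) gives sequential requests an 8 ms window during which dm_request_fn() delays dispatch in the hope of further merging. Zero, the default, disables the heuristic; values are capped at MAX_SEQ_RQ_MERGE_DEADLINE_USECS; and writes are accepted but ignored for bio-based or blk-mq devices.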
+
 /*
  * q->request_fn for request-based dm.
  * Called with the queue lock held.
@@ -1967,7 +2096,7 @@ static void dm_request_fn(struct request_queue *q)
        while (!blk_queue_stopped(q)) {
                rq = blk_peek_request(q);
                if (!rq)
-                       goto delay_and_out;
+                       goto out;
 
                /* always use block 0 to find the target for flushes for now */
                pos = 0;
@@ -1986,12 +2115,17 @@ static void dm_request_fn(struct request_queue *q)
                        continue;
                }
 
+               if (dm_request_peeked_before_merge_deadline(md) &&
+                   md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
+                   md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq))
+                       goto delay_and_out;
+
                if (ti->type->busy && ti->type->busy(ti))
                        goto delay_and_out;
 
                dm_start_request(md, rq);
 
-               tio = rq->special;
+               tio = tio_from_request(rq);
                /* Establish tio->ti before queuing work (map_tio_request) */
                tio->ti = ti;
                queue_kthread_work(&md->kworker, &tio->work);
@@ -2001,33 +2135,11 @@ static void dm_request_fn(struct request_queue *q)
        goto out;
 
 delay_and_out:
-       blk_delay_queue(q, HZ / 10);
+       blk_delay_queue(q, HZ / 100);
 out:
        dm_put_live_table(md, srcu_idx);
 }
 
-int dm_underlying_device_busy(struct request_queue *q)
-{
-       return blk_lld_busy(q);
-}
-EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
-
-static int dm_lld_busy(struct request_queue *q)
-{
-       int r;
-       struct mapped_device *md = q->queuedata;
-       struct dm_table *map = dm_get_live_table_fast(md);
-
-       if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
-               r = 1;
-       else
-               r = dm_table_any_busy_target(map);
-
-       dm_put_live_table_fast(md);
-
-       return r;
-}
-
 static int dm_any_congested(void *congested_data, int bdi_bits)
 {
        int r = bdi_bits;
@@ -2110,7 +2222,7 @@ static void dm_init_md_queue(struct mapped_device *md)
 {
        /*
         * Request-based dm devices cannot be stacked on top of bio-based dm
-        * devices.  The type of this dm device has not been decided yet.
+        * devices.  The type of this dm device may not have been decided yet.
         * The type is decided at the first table loading time.
         * To prevent problematic device stacking, clear the queue flag
         * for request stacking support until then.
@@ -2118,13 +2230,21 @@ static void dm_init_md_queue(struct mapped_device *md)
         * This queue is new, so no concurrency on the queue_flags.
         */
        queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
+}
+
+static void dm_init_old_md_queue(struct mapped_device *md)
+{
+       md->use_blk_mq = false;
+       dm_init_md_queue(md);
 
+       /*
+        * Initialize aspects of the queue that aren't relevant for blk-mq
+        */
        md->queue->queuedata = md;
        md->queue->backing_dev_info.congested_fn = dm_any_congested;
        md->queue->backing_dev_info.congested_data = md;
-       blk_queue_make_request(md->queue, dm_request);
+
        blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
-       blk_queue_merge_bvec(md->queue, dm_merge_bvec);
 }
 
 /*
@@ -2156,6 +2276,7 @@ static struct mapped_device *alloc_dev(int minor)
        if (r < 0)
                goto bad_io_barrier;
 
+       md->use_blk_mq = use_blk_mq;
        md->type = DM_TYPE_NONE;
        mutex_init(&md->suspend_lock);
        mutex_init(&md->type_lock);
@@ -2267,6 +2388,8 @@ static void free_dev(struct mapped_device *md)
        del_gendisk(md->disk);
        put_disk(md->disk);
        blk_cleanup_queue(md->queue);
+       if (md->use_blk_mq)
+               blk_mq_free_tag_set(&md->tag_set);
        bdput(md->bdev);
        free_minor(minor);
 
@@ -2278,7 +2401,7 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 {
        struct dm_md_mempools *p = dm_table_get_md_mempools(t);
 
-       if (md->io_pool && md->bs) {
+       if (md->bs) {
                /* The md already has necessary mempools. */
                if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
                        /*
@@ -2310,7 +2433,7 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
        p->bs = NULL;
 
 out:
-       /* mempool bind completed, now no need any mempools in the table */
+       /* mempool bind completed, no longer need any mempools in the table */
        dm_table_free_md_mempools(t);
 }
 
@@ -2357,7 +2480,7 @@ int dm_queue_merge_is_compulsory(struct request_queue *q)
        if (!q->merge_bvec_fn)
                return 0;
 
-       if (q->make_request_fn == dm_request) {
+       if (q->make_request_fn == dm_make_request) {
                dev_md = q->queuedata;
                if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
                        return 0;
@@ -2426,7 +2549,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
         * This must be done before setting the queue restrictions,
         * because request-based dm may be run just after the setting.
         */
-       if (dm_table_request_based(t) && !blk_queue_stopped(q))
+       if (dm_table_request_based(t))
                stop_queue(q);
 
        __bind_mempools(md, t);
@@ -2508,14 +2631,6 @@ unsigned dm_get_md_type(struct mapped_device *md)
        return md->type;
 }
 
-static bool dm_md_type_request_based(struct mapped_device *md)
-{
-       unsigned table_type = dm_get_md_type(md);
-
-       return (table_type == DM_TYPE_REQUEST_BASED ||
-               table_type == DM_TYPE_MQ_REQUEST_BASED);
-}
-
 struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
 {
        return md->immutable_target_type;
@@ -2532,6 +2647,14 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
 }
 EXPORT_SYMBOL_GPL(dm_get_queue_limits);
 
+static void init_rq_based_worker_thread(struct mapped_device *md)
+{
+       /* Initialize the request-based DM worker thread */
+       init_kthread_worker(&md->kworker);
+       md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
+                                      "kdmwork-%s", dm_device_name(md));
+}
+
 /*
  * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
  */
@@ -2540,27 +2663,160 @@ static int dm_init_request_based_queue(struct mapped_device *md)
        struct request_queue *q = NULL;
 
        if (md->queue->elevator)
-               return 1;
+               return 0;
 
        /* Fully initialize the queue */
        q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
        if (!q)
-               return 0;
+               return -EINVAL;
+
+       /* disable dm_request_fn's merge heuristic by default */
+       md->seq_rq_merge_deadline_usecs = 0;
 
        md->queue = q;
-       dm_init_md_queue(md);
+       dm_init_old_md_queue(md);
        blk_queue_softirq_done(md->queue, dm_softirq_done);
        blk_queue_prep_rq(md->queue, dm_prep_fn);
-       blk_queue_lld_busy(md->queue, dm_lld_busy);
 
-       /* Also initialize the request-based DM worker thread */
-       init_kthread_worker(&md->kworker);
-       md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
-                                      "kdmwork-%s", dm_device_name(md));
+       init_rq_based_worker_thread(md);
 
        elv_register_queue(md->queue);
 
-       return 1;
+       return 0;
+}
+
+static int dm_mq_init_request(void *data, struct request *rq,
+                             unsigned int hctx_idx, unsigned int request_idx,
+                             unsigned int numa_node)
+{
+       struct mapped_device *md = data;
+       struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
+
+       /*
+        * Must initialize md member of tio, otherwise it won't
+        * be available in dm_mq_queue_rq.
+        */
+       tio->md = md;
+
+       return 0;
+}
+
+static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
+                         const struct blk_mq_queue_data *bd)
+{
+       struct request *rq = bd->rq;
+       struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
+       struct mapped_device *md = tio->md;
+       int srcu_idx;
+       struct dm_table *map = dm_get_live_table(md, &srcu_idx);
+       struct dm_target *ti;
+       sector_t pos;
+
+       /* always use block 0 to find the target for flushes for now */
+       pos = 0;
+       if (!(rq->cmd_flags & REQ_FLUSH))
+               pos = blk_rq_pos(rq);
+
+       ti = dm_table_find_target(map, pos);
+       if (!dm_target_is_valid(ti)) {
+               dm_put_live_table(md, srcu_idx);
+               DMERR_LIMIT("request attempted access beyond the end of device");
+               /*
+                * Must perform the setup that rq_completed() requires
+                * before returning BLK_MQ_RQ_QUEUE_ERROR.
+                */
+               dm_start_request(md, rq);
+               return BLK_MQ_RQ_QUEUE_ERROR;
+       }
+       dm_put_live_table(md, srcu_idx);
+
+       if (ti->type->busy && ti->type->busy(ti))
+               return BLK_MQ_RQ_QUEUE_BUSY;
+
+       dm_start_request(md, rq);
+
+       /* Init tio using md established in .init_request */
+       init_tio(tio, rq, md);
+
+       /*
+        * Establish tio->ti before queuing work (map_tio_request)
+        * or making direct call to map_request().
+        */
+       tio->ti = ti;
+
+       /* Clone the request if underlying devices aren't blk-mq */
+       if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) {
+               /* clone request is allocated at the end of the pdu */
+               tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io);
+               if (!clone_rq(rq, md, tio, GFP_ATOMIC))
+                       return BLK_MQ_RQ_QUEUE_BUSY;
+               queue_kthread_work(&md->kworker, &tio->work);
+       } else {
+               /* Direct call is fine since .queue_rq allows allocations */
+               if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
+                       dm_requeue_unmapped_original_request(md, rq);
+       }
+
+       return BLK_MQ_RQ_QUEUE_OK;
+}
+
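The tio->clone pointer arithmetic above depends on the pdu layout that
dm_init_request_based_blk_mq_queue() establishes below via cmd_size: blk-mq
allocates cmd_size bytes of per-request driver data immediately behind each
struct request, and blk_mq_rq_to_pdu() simply steps past the request.
Schematically, for the "underlying devices aren't blk-mq" case:

/*
 * One tag-set entry when cmd_size = sizeof(struct dm_rq_target_io) +
 *                                   sizeof(struct request):
 *
 *   [ struct request ][ struct dm_rq_target_io ][ struct request (clone) ]
 *                      ^ blk_mq_rq_to_pdu(rq)    ^ tio->clone
 *
 * blk_mq_rq_to_pdu() is just "(void *)rq + sizeof(*rq)", so the tio and
 * its embedded clone come for free with every request: no mempool
 * allocation on the I/O path.
 */
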
+static struct blk_mq_ops dm_mq_ops = {
+       .queue_rq = dm_mq_queue_rq,
+       .map_queue = blk_mq_map_queue,
+       .complete = dm_softirq_done,
+       .init_request = dm_mq_init_request,
+};
+
+static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
+{
+       unsigned md_type = dm_get_md_type(md);
+       struct request_queue *q;
+       int err;
+
+       memset(&md->tag_set, 0, sizeof(md->tag_set));
+       md->tag_set.ops = &dm_mq_ops;
+       md->tag_set.queue_depth = BLKDEV_MAX_RQ;
+       md->tag_set.numa_node = NUMA_NO_NODE;
+       md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+       md->tag_set.nr_hw_queues = 1;
+       if (md_type == DM_TYPE_REQUEST_BASED) {
+               /* make the memory for non-blk-mq clone part of the pdu */
+               md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request);
+       } else
+               md->tag_set.cmd_size = sizeof(struct dm_rq_target_io);
+       md->tag_set.driver_data = md;
+
+       err = blk_mq_alloc_tag_set(&md->tag_set);
+       if (err)
+               return err;
+
+       q = blk_mq_init_allocated_queue(&md->tag_set, md->queue);
+       if (IS_ERR(q)) {
+               err = PTR_ERR(q);
+               goto out_tag_set;
+       }
+       md->queue = q;
+       dm_init_md_queue(md);
+
+       /* backfill 'mq' sysfs registration normally done in blk_register_queue */
+       blk_mq_register_disk(md->disk);
+
+       if (md_type == DM_TYPE_REQUEST_BASED)
+               init_rq_based_worker_thread(md);
+
+       return 0;
+
+out_tag_set:
+       blk_mq_free_tag_set(&md->tag_set);
+       return err;
+}
+
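For contrast, a driver that does not already own a request_queue would let
blk-mq allocate one rather than initialize an existing queue. A minimal
sketch against the same 4.1-era API (the "demo" names are hypothetical, and
dm_mq_ops is reused from above purely for brevity):

static struct blk_mq_tag_set demo_set;

static struct request_queue *demo_init_queue(void)
{
	struct request_queue *q;

	memset(&demo_set, 0, sizeof(demo_set));
	demo_set.ops = &dm_mq_ops;
	demo_set.nr_hw_queues = 1;
	demo_set.queue_depth = 64;
	demo_set.numa_node = NUMA_NO_NODE;
	demo_set.cmd_size = sizeof(struct dm_rq_target_io);

	if (blk_mq_alloc_tag_set(&demo_set))
		return NULL;

	q = blk_mq_init_queue(&demo_set);	/* allocates and initializes */
	if (IS_ERR(q)) {
		blk_mq_free_tag_set(&demo_set);
		return NULL;
	}
	return q;
}

DM cannot take that shortcut: md->queue was allocated and registered when
the mapped device was created, before its type was known, which is why the
code above uses blk_mq_init_allocated_queue() and has to backfill the 'mq'
sysfs directory with blk_mq_register_disk().
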
+static unsigned filter_md_type(unsigned type, struct mapped_device *md)
+{
+       if (type == DM_TYPE_BIO_BASED)
+               return type;
+
+       return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
 }
 
 /*
@@ -2568,9 +2824,29 @@ static int dm_init_request_based_queue(struct mapped_device *md)
  */
 int dm_setup_md_queue(struct mapped_device *md)
 {
-       if (dm_md_type_request_based(md) && !dm_init_request_based_queue(md)) {
-               DMWARN("Cannot initialize queue for request-based mapped device");
-               return -EINVAL;
+       int r;
+       unsigned md_type = filter_md_type(dm_get_md_type(md), md);
+
+       switch (md_type) {
+       case DM_TYPE_REQUEST_BASED:
+               r = dm_init_request_based_queue(md);
+               if (r) {
+                       DMWARN("Cannot initialize queue for request-based mapped device");
+                       return r;
+               }
+               break;
+       case DM_TYPE_MQ_REQUEST_BASED:
+               r = dm_init_request_based_blk_mq_queue(md);
+               if (r) {
+                       DMWARN("Cannot initialize queue for request-based blk-mq mapped device");
+                       return r;
+               }
+               break;
+       case DM_TYPE_BIO_BASED:
+               dm_init_old_md_queue(md);
+               blk_queue_make_request(md->queue, dm_make_request);
+               blk_queue_merge_bvec(md->queue, dm_merge_bvec);
+               break;
        }
 
        return 0;
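
The bio-based arm above bypasses the request layer entirely: once
blk_queue_make_request() is set, every bio is handed straight to
dm_make_request() and no struct request ever exists. The 4.1-era hook
returns void; a minimal no-op sketch with hypothetical "demo" names:

#include <linux/blkdev.h>

static void demo_make_request(struct request_queue *q, struct bio *bio)
{
	bio_endio(bio, 0);	/* complete immediately; two-arg form, pre-4.3 */
}

static struct request_queue *demo_alloc_queue(void)
{
	struct request_queue *q = blk_alloc_queue(GFP_KERNEL);

	if (q)
		blk_queue_make_request(q, demo_make_request);
	return q;
}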
@@ -2654,7 +2930,7 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
        set_bit(DMF_FREEING, &md->flags);
        spin_unlock(&_minor_lock);
 
-       if (dm_request_based(md))
+       if (dm_request_based(md) && md->kworker_task)
                flush_kthread_worker(&md->kworker);
 
        /*
@@ -2908,7 +3184,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
         */
        if (dm_request_based(md)) {
                stop_queue(md->queue);
-               flush_kthread_worker(&md->kworker);
+               if (md->kworker_task)
+                       flush_kthread_worker(&md->kworker);
        }
 
        flush_workqueue(md->wq);
@@ -3206,6 +3483,7 @@ struct gendisk *dm_disk(struct mapped_device *md)
 {
        return md->disk;
 }
+EXPORT_SYMBOL_GPL(dm_disk);
 
 struct kobject *dm_kobject(struct mapped_device *md)
 {
@@ -3253,16 +3531,19 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size)
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+                                           unsigned integrity, unsigned per_bio_data_size)
 {
        struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
-       struct kmem_cache *cachep;
+       struct kmem_cache *cachep = NULL;
        unsigned int pool_size = 0;
        unsigned int front_pad;
 
        if (!pools)
                return NULL;
 
+       type = filter_md_type(type, md);
+
        switch (type) {
        case DM_TYPE_BIO_BASED:
                cachep = _io_cache;
@@ -3270,13 +3551,13 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
                front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
                break;
        case DM_TYPE_REQUEST_BASED:
+               cachep = _rq_tio_cache;
                pool_size = dm_get_reserved_rq_based_ios();
                pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
                if (!pools->rq_pool)
                        goto out;
                /* fall through to setup remaining rq-based pools */
        case DM_TYPE_MQ_REQUEST_BASED:
-               cachep = _rq_tio_cache;
                if (!pool_size)
                        pool_size = dm_get_reserved_rq_based_ios();
                front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
@@ -3284,12 +3565,14 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
                WARN_ON(per_bio_data_size != 0);
                break;
        default:
-               goto out;
+               BUG();
        }
 
-       pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
-       if (!pools->io_pool)
-               goto out;
+       if (cachep) {
+               pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
+               if (!pools->io_pool)
+                       goto out;
+       }
 
        pools->bs = bioset_create_nobvec(pool_size, front_pad);
        if (!pools->bs)
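
The reason cachep may legitimately stay NULL above: for pure blk-mq devices
the dm_rq_target_io lives in the request pdu (see the layout sketch
earlier), so no io_pool is needed; only bio-based and old request-based
tables back their per-I/O structs with a slab mempool. That slab/mempool
pattern in isolation (hypothetical names, arbitrary reserve size):

#include <linux/mempool.h>
#include <linux/slab.h>

struct demo_tio {		/* stand-in for the real per-I/O struct */
	int dummy;
};

static struct kmem_cache *demo_cache;
static mempool_t *demo_pool;

static int demo_pools_init(void)
{
	demo_cache = KMEM_CACHE(demo_tio, 0);
	if (!demo_cache)
		return -ENOMEM;

	demo_pool = mempool_create_slab_pool(16, demo_cache);	/* 16 reserved */
	if (!demo_pool) {
		kmem_cache_destroy(demo_cache);
		return -ENOMEM;
	}
	return 0;
}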
@@ -3346,6 +3629,9 @@ MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
 module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
 
+module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
+
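With this in place, request-based DM can opt in to blk-mq at build time via
CONFIG_DM_MQ_DEFAULT or at runtime by writing to
/sys/module/dm_mod/parameters/use_blk_mq before a device's table is loaded;
a companion patch in this series exposes the effective per-device value as
a read-only use_blk_mq attribute under the device's dm/ sysfs directory.
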
 MODULE_DESCRIPTION(DM_NAME " driver");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 59f53e79db8264521dc00d2563c7b58182b7e0da..6123c2bf9150cb836c1ecd80ebfe51c9f9aa82fd 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -70,7 +70,6 @@ void dm_table_presuspend_undo_targets(struct dm_table *t);
 void dm_table_postsuspend_targets(struct dm_table *t);
 int dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
-int dm_table_any_busy_target(struct dm_table *t);
 unsigned dm_table_get_type(struct dm_table *t);
 struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
 bool dm_table_request_based(struct dm_table *t);
@@ -212,6 +211,8 @@ int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
 void dm_internal_suspend(struct mapped_device *md);
 void dm_internal_resume(struct mapped_device *md);
 
+bool dm_use_blk_mq(struct mapped_device *md);
+
 int dm_io_init(void);
 void dm_io_exit(void);
 
@@ -221,7 +222,8 @@ void dm_kcopyd_exit(void);
 /*
  * Mempool operations
  */
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size);
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+                                           unsigned integrity, unsigned per_bio_data_size);
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 
 /*
@@ -235,4 +237,8 @@ static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen
        return !maxlen || strlen(result) + 1 >= maxlen;
 }
 
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf);
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
+                                                    const char *buf, size_t count);
+
 #endif
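
These two declarations back the rq_based_seq_io_merge_deadline attribute
that dm-sysfs.c registers elsewhere in this series. The store side looks
approximately like the sketch below, reconstructed from the "configurable
deadline" commit (the 100000 clamp and the blk-mq bypass follow that patch
but should be verified against the tree):

#define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000	/* cap at 100 ms */

ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
						     const char *buf, size_t count)
{
	unsigned deadline;

	if (!dm_request_based(md) || md->use_blk_mq)
		return count;	/* knob is meaningless here; accept and ignore */

	if (kstrtouint(buf, 10, &deadline))
		return -EINVAL;

	if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS)
		deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS;

	md->seq_rq_merge_deadline_usecs = deadline;

	return count;
}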
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index fd23978d93fe35a91afab5f56d3611c0324d9ceb..51cc1deb7af3a38597a665300ef9405d8dbe3cef 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -605,9 +605,4 @@ static inline unsigned long to_bytes(sector_t n)
        return (n << SECTOR_SHIFT);
 }
 
-/*-----------------------------------------------------------------
- * Helper for block layer and dm core operations
- *---------------------------------------------------------------*/
-int dm_underlying_device_busy(struct request_queue *q);
-
 #endif /* _LINUX_DEVICE_MAPPER_H */
diff --git a/include/linux/mount.h b/include/linux/mount.h
index c2c561dc011440ee83f395aba0a226ab467204a0..bca086d62b1a745f9a4ddfe12670139d9ce228d4 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -92,6 +92,6 @@ extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
 extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list);
 extern void mark_mounts_for_expiry(struct list_head *mounts);
 
-extern dev_t name_to_dev_t(char *name);
+extern dev_t name_to_dev_t(const char *name);
 
 #endif /* _LINUX_MOUNT_H */
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index 889f3a5b7b18267d91a81df449098bfd66da2b68..eac8c3641f39a629b5023782c050f84fffaa3036 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -267,9 +267,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY    _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR       4
-#define DM_VERSION_MINOR       30
+#define DM_VERSION_MINOR       31
 #define DM_VERSION_PATCHLEVEL  0
-#define DM_VERSION_EXTRA       "-ioctl (2014-12-22)"
+#define DM_VERSION_EXTRA       "-ioctl (2015-3-12)"
 
 /* Status bits */
 #define DM_READONLY_FLAG       (1 << 0) /* In/Out */
diff --git a/init/do_mounts.c b/init/do_mounts.c
index eb410083e8e075f9ca1829d0db1bf3cb70d17139..8369ffa5f33db24a12703ce74eb7ac437ada96f9 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -207,7 +207,7 @@ done:
  *     bangs.
  */
 
-dev_t name_to_dev_t(char *name)
+dev_t name_to_dev_t(const char *name)
 {
        char s[32];
        char *p;
@@ -226,8 +226,9 @@ dev_t name_to_dev_t(char *name)
 
        if (strncmp(name, "/dev/", 5) != 0) {
                unsigned maj, min;
+               char dummy;
 
-               if (sscanf(name, "%u:%u", &maj, &min) == 2) {
+               if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2) {
                        res = MKDEV(maj, min);
                        if (maj != MAJOR(res) || min != MINOR(res))
                                goto fail;
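
The trailing %c is the whole trick: it can only match if characters remain
after maj:min, pushing sscanf()'s return value to 3, which the == 2 test
rejects; the MKDEV()/MAJOR()/MINOR() round-trip then also rejects numbers
too large for dev_t's bit fields. The same pattern, as a runnable userspace
illustration (a sketch, not kernel code):

#include <stdio.h>

static int parse_devt(const char *name, unsigned *maj, unsigned *min)
{
	char dummy;

	/* accept strictly "maj:min" with nothing trailing */
	return sscanf(name, "%u:%u%c", maj, min, &dummy) == 2;
}

int main(void)
{
	unsigned maj, min;

	printf("%d\n", parse_devt("8:1", &maj, &min));	/* 1: accepted */
	printf("%d\n", parse_devt("8:1 ", &maj, &min));	/* 0: trailing junk */
	printf("%d\n", parse_devt("8", &maj, &min));	/* 0: no minor */
	return 0;
}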
@@ -286,6 +287,7 @@ fail:
 done:
        return res;
 }
+EXPORT_SYMBOL_GPL(name_to_dev_t);
 
 static int __init root_dev_setup(char *line)
 {