struct dm_thin_new_mapping;
/*
- * The pool runs in 3 modes. Ordered in degraded order for comparisons.
+ * The pool runs in 4 modes. Ordered in degraded order for comparisons.
*/
enum pool_mode {
PM_WRITE, /* metadata may be changed */
+ PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */
PM_READ_ONLY, /* metadata may not be changed */
PM_FAIL, /* all I/O fails */
};
};
static enum pool_mode get_pool_mode(struct pool *pool);
-static void out_of_data_space(struct pool *pool);
static void metadata_operation_failed(struct pool *pool, const char *op, int r);
/*
struct pool *pool;
struct dm_thin_device *td;
+ bool requeue_mode:1;
};
/*----------------------------------------------------------------*/
struct dm_thin_new_mapping *overwrite_mapping;
};
-static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
+static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
{
struct bio *bio;
struct bio_list bios;
+ unsigned long flags;
bio_list_init(&bios);
+
+ spin_lock_irqsave(&tc->pool->lock, flags);
bio_list_merge(&bios, master);
bio_list_init(master);
+ spin_unlock_irqrestore(&tc->pool->lock, flags);
while ((bio = bio_list_pop(&bios))) {
struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
static void requeue_io(struct thin_c *tc)
{
struct pool *pool = tc->pool;
+
+ requeue_bio_list(tc, &pool->deferred_bios);
+ requeue_bio_list(tc, &pool->retry_on_resume_list);
+}
+
+static void error_retry_list(struct pool *pool)
+{
+ struct bio *bio;
unsigned long flags;
+ struct bio_list bios;
+
+ bio_list_init(&bios);
spin_lock_irqsave(&pool->lock, flags);
- __requeue_bio_list(tc, &pool->deferred_bios);
- __requeue_bio_list(tc, &pool->retry_on_resume_list);
+ bio_list_merge(&bios, &pool->retry_on_resume_list);
+ bio_list_init(&pool->retry_on_resume_list);
spin_unlock_irqrestore(&pool->lock, flags);
+
+ while ((bio = bio_list_pop(&bios)))
+ bio_io_error(bio);
}
/*
}
}
+static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
+
static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
{
int r;
dm_block_t free_blocks;
struct pool *pool = tc->pool;
- if (get_pool_mode(pool) != PM_WRITE)
+ if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
return -EINVAL;
r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
}
if (!free_blocks) {
- out_of_data_space(pool);
+ set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
return -ENOSPC;
}
}
spin_unlock_irqrestore(&pool->lock, flags);
}
-static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
+static bool should_error_unserviceable_bio(struct pool *pool)
{
- /*
- * When pool is read-only, no cell locking is needed because
- * nothing is changing.
- */
- WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY);
+ enum pool_mode m = get_pool_mode(pool);
+
+ switch (m) {
+ case PM_WRITE:
+ /* Shouldn't get here */
+ DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
+ return true;
+
+ case PM_OUT_OF_DATA_SPACE:
+ return pool->pf.error_if_no_space;
- if (pool->pf.error_if_no_space)
+ case PM_READ_ONLY:
+ case PM_FAIL:
+ return true;
+ default:
+ /* Shouldn't get here */
+ DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
+ return true;
+ }
+}
+
+static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
+{
+ if (should_error_unserviceable_bio(pool))
bio_io_error(bio);
else
retry_on_resume(bio);
struct bio *bio;
struct bio_list bios;
+ if (should_error_unserviceable_bio(pool)) {
+ cell_error(pool, cell);
+ return;
+ }
+
bio_list_init(&bios);
cell_release(pool, cell, &bios);
- while ((bio = bio_list_pop(&bios)))
- handle_unserviceable_bio(pool, bio);
+ if (should_error_unserviceable_bio(pool))
+ while ((bio = bio_list_pop(&bios)))
+ bio_io_error(bio);
+ else
+ while ((bio = bio_list_pop(&bios)))
+ retry_on_resume(bio);
}
static void process_discard(struct thin_c *tc, struct bio *bio)
}
}
+static void process_bio_success(struct thin_c *tc, struct bio *bio)
+{
+ bio_endio(bio, 0);
+}
+
static void process_bio_fail(struct thin_c *tc, struct bio *bio)
{
bio_io_error(bio);
struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
struct thin_c *tc = h->tc;
+ if (tc->requeue_mode) {
+ bio_endio(bio, DM_ENDIO_REQUEUE);
+ continue;
+ }
+
/*
* If we've got no free new_mapping structs, and processing
* this bio might require one, we pause until there are some
/*----------------------------------------------------------------*/
+struct noflush_work {
+ struct work_struct worker;
+ struct thin_c *tc;
+
+ atomic_t complete;
+ wait_queue_head_t wait;
+};
+
+static void complete_noflush_work(struct noflush_work *w)
+{
+ atomic_set(&w->complete, 1);
+ wake_up(&w->wait);
+}
+
+static void do_noflush_start(struct work_struct *ws)
+{
+ struct noflush_work *w = container_of(ws, struct noflush_work, worker);
+ w->tc->requeue_mode = true;
+ requeue_io(w->tc);
+ complete_noflush_work(w);
+}
+
+static void do_noflush_stop(struct work_struct *ws)
+{
+ struct noflush_work *w = container_of(ws, struct noflush_work, worker);
+ w->tc->requeue_mode = false;
+ complete_noflush_work(w);
+}
+
+static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
+{
+ struct noflush_work w;
+
+ INIT_WORK(&w.worker, fn);
+ w.tc = tc;
+ atomic_set(&w.complete, 0);
+ init_waitqueue_head(&w.wait);
+
+ queue_work(tc->pool->wq, &w.worker);
+
+ wait_event(w.wait, atomic_read(&w.complete));
+}
+
+/*----------------------------------------------------------------*/
+
static enum pool_mode get_pool_mode(struct pool *pool)
{
return pool->pf.mode;
}
+static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
+{
+ dm_table_event(pool->ti->table);
+ DMINFO("%s: switching pool to %s mode",
+ dm_device_name(pool->pool_md), new_mode);
+}
+
static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
{
- int r;
- enum pool_mode old_mode = pool->pf.mode;
+ struct pool_c *pt = pool->ti->private;
+ bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
+ enum pool_mode old_mode = get_pool_mode(pool);
+
+ /*
+ * Never allow the pool to transition to PM_WRITE mode if user
+ * intervention is required to verify metadata and data consistency.
+ */
+ if (new_mode == PM_WRITE && needs_check) {
+ DMERR("%s: unable to switch pool to write mode until repaired.",
+ dm_device_name(pool->pool_md));
+ if (old_mode != new_mode)
+ new_mode = old_mode;
+ else
+ new_mode = PM_READ_ONLY;
+ }
+ /*
+ * If we were in PM_FAIL mode, rollback of metadata failed. We're
+ * not going to recover without a thin_repair. So we never let the
+ * pool move out of the old mode.
+ */
+ if (old_mode == PM_FAIL)
+ new_mode = old_mode;
switch (new_mode) {
case PM_FAIL:
if (old_mode != new_mode)
- DMERR("%s: switching pool to failure mode",
- dm_device_name(pool->pool_md));
+ notify_of_pool_mode_change(pool, "failure");
dm_pool_metadata_read_only(pool->pmd);
pool->process_bio = process_bio_fail;
pool->process_discard = process_bio_fail;
pool->process_prepared_mapping = process_prepared_mapping_fail;
pool->process_prepared_discard = process_prepared_discard_fail;
+
+ error_retry_list(pool);
break;
case PM_READ_ONLY:
if (old_mode != new_mode)
- DMERR("%s: switching pool to read-only mode",
- dm_device_name(pool->pool_md));
- r = dm_pool_abort_metadata(pool->pmd);
- if (r) {
- DMERR("%s: aborting transaction failed",
- dm_device_name(pool->pool_md));
- new_mode = PM_FAIL;
- set_pool_mode(pool, new_mode);
- } else {
- dm_pool_metadata_read_only(pool->pmd);
- pool->process_bio = process_bio_read_only;
- pool->process_discard = process_discard;
- pool->process_prepared_mapping = process_prepared_mapping_fail;
- pool->process_prepared_discard = process_prepared_discard_passdown;
- }
+ notify_of_pool_mode_change(pool, "read-only");
+ dm_pool_metadata_read_only(pool->pmd);
+ pool->process_bio = process_bio_read_only;
+ pool->process_discard = process_bio_success;
+ pool->process_prepared_mapping = process_prepared_mapping_fail;
+ pool->process_prepared_discard = process_prepared_discard_passdown;
+
+ error_retry_list(pool);
+ break;
+
+ case PM_OUT_OF_DATA_SPACE:
+ /*
+ * Ideally we'd never hit this state; the low water mark
+ * would trigger userland to extend the pool before we
+ * completely run out of data space. However, many small
+ * IOs to unprovisioned space can consume data space at an
+ * alarming rate. Adjust your low water mark if you're
+ * frequently seeing this mode.
+ */
+ if (old_mode != new_mode)
+ notify_of_pool_mode_change(pool, "out-of-data-space");
+ pool->process_bio = process_bio_read_only;
+ pool->process_discard = process_discard;
+ pool->process_prepared_mapping = process_prepared_mapping;
+ pool->process_prepared_discard = process_prepared_discard_passdown;
break;
case PM_WRITE:
if (old_mode != new_mode)
- DMINFO("%s: switching pool to write mode",
- dm_device_name(pool->pool_md));
+ notify_of_pool_mode_change(pool, "write");
dm_pool_metadata_read_write(pool->pmd);
pool->process_bio = process_bio;
pool->process_discard = process_discard;
}
pool->pf.mode = new_mode;
+ /*
+ * The pool mode may have changed, sync it so bind_control_target()
+ * doesn't cause an unexpected mode transition on resume.
+ */
+ pt->adjusted_pf.mode = new_mode;
}
-/*
- * Rather than calling set_pool_mode directly, use these which describe the
- * reason for mode degradation.
- */
-static void out_of_data_space(struct pool *pool)
+static void abort_transaction(struct pool *pool)
{
- DMERR_LIMIT("%s: no free data space available.",
- dm_device_name(pool->pool_md));
- set_pool_mode(pool, PM_READ_ONLY);
+ const char *dev_name = dm_device_name(pool->pool_md);
+
+ DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
+ if (dm_pool_abort_metadata(pool->pmd)) {
+ DMERR("%s: failed to abort metadata transaction", dev_name);
+ set_pool_mode(pool, PM_FAIL);
+ }
+
+ if (dm_pool_metadata_set_needs_check(pool->pmd)) {
+ DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
+ set_pool_mode(pool, PM_FAIL);
+ }
}
static void metadata_operation_failed(struct pool *pool, const char *op, int r)
{
- dm_block_t free_blocks;
-
DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
dm_device_name(pool->pool_md), op, r);
- if (r == -ENOSPC &&
- !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
- !free_blocks)
- DMERR_LIMIT("%s: no free metadata space available.",
- dm_device_name(pool->pool_md));
-
+ abort_transaction(pool);
set_pool_mode(pool, PM_READ_ONLY);
}
thin_hook_bio(tc, bio);
+ if (tc->requeue_mode) {
+ bio_endio(bio, DM_ENDIO_REQUEUE);
+ return DM_MAPIO_SUBMITTED;
+ }
+
if (get_pool_mode(tc->pool) == PM_FAIL) {
bio_io_error(bio);
return DM_MAPIO_SUBMITTED;
/*
* We want to make sure that a pool in PM_FAIL mode is never upgraded.
*/
- enum pool_mode old_mode = pool->pf.mode;
+ enum pool_mode old_mode = get_pool_mode(pool);
enum pool_mode new_mode = pt->adjusted_pf.mode;
/*
pool->pf = pt->adjusted_pf;
pool->low_water_blocks = pt->low_water_blocks;
- /*
- * If we were in PM_FAIL mode, rollback of metadata failed. We're
- * not going to recover without a thin_repair. So we never let the
- * pool move out of the old mode. On the other hand a PM_READ_ONLY
- * may have been due to a lack of metadata or data space, and may
- * now work (ie. if the underlying devices have been resized).
- */
- if (old_mode == PM_FAIL)
- new_mode = old_mode;
-
set_pool_mode(pool, new_mode);
return 0;
return -EINVAL;
} else if (data_size > sb_data_size) {
+ if (dm_pool_metadata_needs_check(pool->pmd)) {
+ DMERR("%s: unable to grow the data device until repaired.",
+ dm_device_name(pool->pool_md));
+ return 0;
+ }
+
if (sb_data_size)
DMINFO("%s: growing the data device from %llu to %llu blocks",
dm_device_name(pool->pool_md),
return -EINVAL;
} else if (metadata_dev_size > sb_metadata_dev_size) {
+ if (dm_pool_metadata_needs_check(pool->pmd)) {
+ DMERR("%s: unable to grow the metadata device until repaired.",
+ dm_device_name(pool->pool_md));
+ return 0;
+ }
+
warn_if_metadata_device_too_big(pool->md_dev);
DMINFO("%s: growing the metadata device from %llu to %llu blocks",
dm_device_name(pool->pool_md),
else
DMEMIT("- ");
- if (pool->pf.mode == PM_READ_ONLY)
+ if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
+ DMEMIT("out_of_data_space ");
+ else if (pool->pf.mode == PM_READ_ONLY)
DMEMIT("ro ");
else
DMEMIT("rw ");
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 10, 0},
+ .version = {1, 11, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
return 0;
}
-static void thin_postsuspend(struct dm_target *ti)
+static void thin_presuspend(struct dm_target *ti)
{
+ struct thin_c *tc = ti->private;
+
if (dm_noflush_suspending(ti))
- requeue_io((struct thin_c *)ti->private);
+ noflush_work(tc, do_noflush_start);
+}
+
+static void thin_postsuspend(struct dm_target *ti)
+{
+ struct thin_c *tc = ti->private;
+
+ /*
+ * The dm_noflush_suspending flag has been cleared by now, so
+ * unfortunately we must always run this.
+ */
+ noflush_work(tc, do_noflush_stop);
}
/*
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 10, 0},
+ .version = {1, 11, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
.map = thin_map,
.end_io = thin_endio,
+ .presuspend = thin_presuspend,
.postsuspend = thin_postsuspend,
.status = thin_status,
.iterate_devices = thin_iterate_devices,
dm_block_t block;
};
+struct bop_ring_buffer {
+ unsigned begin;
+ unsigned end;
+ struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1];
+};
+
+static void brb_init(struct bop_ring_buffer *brb)
+{
+ brb->begin = 0;
+ brb->end = 0;
+}
+
+static bool brb_empty(struct bop_ring_buffer *brb)
+{
+ return brb->begin == brb->end;
+}
+
+static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
+{
+ unsigned r = old + 1;
+ return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r;
+}
+
+static int brb_push(struct bop_ring_buffer *brb,
+ enum block_op_type type, dm_block_t b)
+{
+ struct block_op *bop;
+ unsigned next = brb_next(brb, brb->end);
+
+ /*
+ * We don't allow the last bop to be filled, this way we can
+ * differentiate between full and empty.
+ */
+ if (next == brb->begin)
+ return -ENOMEM;
+
+ bop = brb->bops + brb->end;
+ bop->type = type;
+ bop->block = b;
+
+ brb->end = next;
+
+ return 0;
+}
+
+static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
+{
+ struct block_op *bop;
+
+ if (brb_empty(brb))
+ return -ENODATA;
+
+ bop = brb->bops + brb->begin;
+ result->type = bop->type;
+ result->block = bop->block;
+
+ brb->begin = brb_next(brb, brb->begin);
+
+ return 0;
+}
+
+/*----------------------------------------------------------------*/
+
struct sm_metadata {
struct dm_space_map sm;
unsigned recursion_count;
unsigned allocated_this_transaction;
- unsigned nr_uncommitted;
- struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS];
+ struct bop_ring_buffer uncommitted;
struct threshold threshold;
};
static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
{
- struct block_op *op;
+ int r = brb_push(&smm->uncommitted, type, b);
- if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) {
+ if (r) {
DMERR("too many recursive allocations");
return -ENOMEM;
}
- op = smm->uncommitted + smm->nr_uncommitted++;
- op->type = type;
- op->block = b;
-
return 0;
}
return -ENOMEM;
}
- if (smm->recursion_count == 1 && smm->nr_uncommitted) {
- while (smm->nr_uncommitted && !r) {
- smm->nr_uncommitted--;
- r = commit_bop(smm, smm->uncommitted +
- smm->nr_uncommitted);
+ if (smm->recursion_count == 1) {
+ while (!brb_empty(&smm->uncommitted)) {
+ struct block_op bop;
+
+ r = brb_pop(&smm->uncommitted, &bop);
+ if (r) {
+ DMERR("bug in bop ring buffer");
+ break;
+ }
+
+ r = commit_bop(smm, &bop);
if (r)
break;
}
static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
uint32_t *result)
{
- int r, i;
+ int r;
+ unsigned i;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
unsigned adjustment = 0;
* We may have some uncommitted adjustments to add. This list
* should always be really short.
*/
- for (i = 0; i < smm->nr_uncommitted; i++) {
- struct block_op *op = smm->uncommitted + i;
+ for (i = smm->uncommitted.begin;
+ i != smm->uncommitted.end;
+ i = brb_next(&smm->uncommitted, i)) {
+ struct block_op *op = smm->uncommitted.bops + i;
if (op->block != b)
continue;
static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
dm_block_t b, int *result)
{
- int r, i, adjustment = 0;
+ int r, adjustment = 0;
+ unsigned i;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
uint32_t rc;
* We may have some uncommitted adjustments to add. This list
* should always be really short.
*/
- for (i = 0; i < smm->nr_uncommitted; i++) {
- struct block_op *op = smm->uncommitted + i;
+ for (i = smm->uncommitted.begin;
+ i != smm->uncommitted.end;
+ i = brb_next(&smm->uncommitted, i)) {
+
+ struct block_op *op = smm->uncommitted.bops + i;
if (op->block != b)
continue;
smm->begin = superblock + 1;
smm->recursion_count = 0;
smm->allocated_this_transaction = 0;
- smm->nr_uncommitted = 0;
+ brb_init(&smm->uncommitted);
threshold_init(&smm->threshold);
memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
smm->begin = 0;
smm->recursion_count = 0;
smm->allocated_this_transaction = 0;
- smm->nr_uncommitted = 0;
+ brb_init(&smm->uncommitted);
threshold_init(&smm->threshold);
memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));