X-Git-Url: https://git.kernelconcepts.de/?a=blobdiff_plain;f=drivers%2Fmd%2Fdm-thin.c;h=b9d25026ab84ac55afd5df7d67bfc177219d7def;hb=ac4c3f34a9af63092b3fbfafe34c3e966fbd96c5;hp=4843801173fe11a99519b59dd808e46e02425ee4;hpb=f17a6f78595e52761b7b2dece8c11b04b65f19dd;p=karo-tx-linux.git diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 4843801173fe..b9d25026ab84 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -11,11 +11,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #define DM_MSG_PREFIX "thin" @@ -25,7 +27,6 @@ */ #define ENDIO_HOOK_POOL_SIZE 1024 #define MAPPING_POOL_SIZE 1024 -#define PRISON_CELLS 1024 #define COMMIT_PERIOD HZ #define NO_SPACE_TIMEOUT_SECS 60 @@ -127,6 +128,53 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, /*----------------------------------------------------------------*/ +#define THROTTLE_THRESHOLD (1 * HZ) + +struct throttle { + struct rw_semaphore lock; + unsigned long threshold; + bool throttle_applied; +}; + +static void throttle_init(struct throttle *t) +{ + init_rwsem(&t->lock); + t->throttle_applied = false; +} + +static void throttle_work_start(struct throttle *t) +{ + t->threshold = jiffies + THROTTLE_THRESHOLD; +} + +static void throttle_work_update(struct throttle *t) +{ + if (!t->throttle_applied && jiffies > t->threshold) { + down_write(&t->lock); + t->throttle_applied = true; + } +} + +static void throttle_work_complete(struct throttle *t) +{ + if (t->throttle_applied) { + t->throttle_applied = false; + up_write(&t->lock); + } +} + +static void throttle_lock(struct throttle *t) +{ + down_read(&t->lock); +} + +static void throttle_unlock(struct throttle *t) +{ + up_read(&t->lock); +} + +/*----------------------------------------------------------------*/ + /* * A pool device ties together a metadata device and a data device. It * also provides the interface for creating and destroying internal @@ -155,8 +203,11 @@ struct pool_features { struct thin_c; typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio); +typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell); typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m); +#define CELL_SORT_ARRAY_SIZE 8192 + struct pool { struct list_head list; struct dm_target *ti; /* Only set if a pool target is bound */ @@ -176,6 +227,7 @@ struct pool { struct dm_kcopyd_client *copier; struct workqueue_struct *wq; + struct throttle throttle; struct work_struct worker; struct delayed_work waker; struct delayed_work no_space_timeout; @@ -198,8 +250,13 @@ struct pool { process_bio_fn process_bio; process_bio_fn process_discard; + process_cell_fn process_cell; + process_cell_fn process_discard_cell; + process_mapping_fn process_prepared_mapping; process_mapping_fn process_prepared_discard; + + struct dm_bio_prison_cell *cell_sort_array[CELL_SORT_ARRAY_SIZE]; }; static enum pool_mode get_pool_mode(struct pool *pool); @@ -234,6 +291,7 @@ struct thin_c { struct dm_thin_device *td; bool requeue_mode:1; spinlock_t lock; + struct list_head deferred_cells; struct bio_list deferred_bio_list; struct bio_list retry_on_resume_list; struct rb_root sort_bio_list; /* sorted list of deferred bios */ @@ -290,6 +348,15 @@ static void cell_release(struct pool *pool, dm_bio_prison_free_cell(pool->prison, cell); } +static void cell_visit_release(struct pool *pool, + void (*fn)(void *, struct dm_bio_prison_cell *), + void *context, + struct dm_bio_prison_cell *cell) +{ + dm_cell_visit_release(pool->prison, fn, context, cell); + dm_bio_prison_free_cell(pool->prison, cell); +} + static void cell_release_no_holder(struct pool *pool, struct dm_bio_prison_cell *cell, struct bio_list *bios) @@ -298,19 +365,6 @@ static void cell_release_no_holder(struct pool *pool, dm_bio_prison_free_cell(pool->prison, cell); } -static void cell_defer_no_holder_no_free(struct thin_c *tc, - struct dm_bio_prison_cell *cell) -{ - struct pool *pool = tc->pool; - unsigned long flags; - - spin_lock_irqsave(&tc->lock, flags); - dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list); - spin_unlock_irqrestore(&tc->lock, flags); - - wake_worker(pool); -} - static void cell_error_with_code(struct pool *pool, struct dm_bio_prison_cell *cell, int error_code) { @@ -323,6 +377,16 @@ static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell) cell_error_with_code(pool, cell, -EIO); } +static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell) +{ + cell_error_with_code(pool, cell, 0); +} + +static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell) +{ + cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE); +} + /*----------------------------------------------------------------*/ /* @@ -410,10 +474,28 @@ static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) bio_endio(bio, DM_ENDIO_REQUEUE); } +static void requeue_deferred_cells(struct thin_c *tc) +{ + struct pool *pool = tc->pool; + unsigned long flags; + struct list_head cells; + struct dm_bio_prison_cell *cell, *tmp; + + INIT_LIST_HEAD(&cells); + + spin_lock_irqsave(&tc->lock, flags); + list_splice_init(&tc->deferred_cells, &cells); + spin_unlock_irqrestore(&tc->lock, flags); + + list_for_each_entry_safe(cell, tmp, &cells, user_list) + cell_requeue(pool, cell); +} + static void requeue_io(struct thin_c *tc) { requeue_bio_list(tc, &tc->deferred_bio_list); requeue_bio_list(tc, &tc->retry_on_resume_list); + requeue_deferred_cells(tc); } static void error_thin_retry_list(struct thin_c *tc) @@ -629,33 +711,75 @@ static void overwrite_endio(struct bio *bio, int err) */ /* - * This sends the bios in the cell back to the deferred_bios list. + * This sends the bios in the cell, except the original holder, back + * to the deferred_bios list. */ -static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell) +static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) { struct pool *pool = tc->pool; unsigned long flags; spin_lock_irqsave(&tc->lock, flags); - cell_release(pool, cell, &tc->deferred_bio_list); + cell_release_no_holder(pool, cell, &tc->deferred_bio_list); spin_unlock_irqrestore(&tc->lock, flags); wake_worker(pool); } -/* - * Same as cell_defer above, except it omits the original holder of the cell. - */ -static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) +static void thin_defer_bio(struct thin_c *tc, struct bio *bio); + +struct remap_info { + struct thin_c *tc; + struct bio_list defer_bios; + struct bio_list issue_bios; +}; + +static void __inc_remap_and_issue_cell(void *context, + struct dm_bio_prison_cell *cell) { - struct pool *pool = tc->pool; - unsigned long flags; + struct remap_info *info = context; + struct bio *bio; - spin_lock_irqsave(&tc->lock, flags); - cell_release_no_holder(pool, cell, &tc->deferred_bio_list); - spin_unlock_irqrestore(&tc->lock, flags); + while ((bio = bio_list_pop(&cell->bios))) { + if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) + bio_list_add(&info->defer_bios, bio); + else { + inc_all_io_entry(info->tc->pool, bio); - wake_worker(pool); + /* + * We can't issue the bios with the bio prison lock + * held, so we add them to a list to issue on + * return from this function. + */ + bio_list_add(&info->issue_bios, bio); + } + } +} + +static void inc_remap_and_issue_cell(struct thin_c *tc, + struct dm_bio_prison_cell *cell, + dm_block_t block) +{ + struct bio *bio; + struct remap_info info; + + info.tc = tc; + bio_list_init(&info.defer_bios); + bio_list_init(&info.issue_bios); + + /* + * We have to be careful to inc any bios we're about to issue + * before the cell is released, and avoid a race with new bios + * being added to the cell. + */ + cell_visit_release(tc->pool, __inc_remap_and_issue_cell, + &info, cell); + + while ((bio = bio_list_pop(&info.defer_bios))) + thin_defer_bio(tc, bio); + + while ((bio = bio_list_pop(&info.issue_bios))) + remap_and_issue(info.tc, bio, block); } static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) @@ -706,10 +830,13 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) * the bios in the cell. */ if (bio) { - cell_defer_no_holder(tc, m->cell); + inc_remap_and_issue_cell(tc, m->cell, m->data_block); bio_endio(bio, 0); - } else - cell_defer(tc, m->cell); + } else { + inc_all_io_entry(tc->pool, m->cell->holder); + remap_and_issue(tc, m->cell->holder, m->data_block); + inc_remap_and_issue_cell(tc, m->cell, m->data_block); + } out: list_del(&m->list); @@ -842,6 +969,20 @@ static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m, } } +static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio, + dm_block_t data_block, + struct dm_thin_new_mapping *m) +{ + struct pool *pool = tc->pool; + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + + h->overwrite_mapping = m; + m->bio = bio; + save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); + inc_all_io_entry(pool, bio); + remap_and_issue(tc, bio, data_block); +} + /* * A partial copy also needs to zero the uncopied region. */ @@ -876,15 +1017,9 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, * If the whole block of data is being overwritten, we can issue the * bio immediately. Otherwise we use kcopyd to clone the data first. */ - if (io_overwrites_block(pool, bio)) { - struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); - - h->overwrite_mapping = m; - m->bio = bio; - save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); - inc_all_io_entry(pool, bio); - remap_and_issue(tc, bio, data_dest); - } else { + if (io_overwrites_block(pool, bio)) + remap_and_issue_overwrite(tc, bio, data_dest, m); + else { struct dm_io_region from, to; from.bdev = origin->bdev; @@ -953,16 +1088,10 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, if (!pool->pf.zero_new_blocks) process_prepared_mapping(m); - else if (io_overwrites_block(pool, bio)) { - struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + else if (io_overwrites_block(pool, bio)) + remap_and_issue_overwrite(tc, bio, data_block, m); - h->overwrite_mapping = m; - m->bio = bio; - save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); - inc_all_io_entry(pool, bio); - remap_and_issue(tc, bio, data_block); - - } else + else ll_zero(tc, m, data_block * pool->sectors_per_block, (data_block + 1) * pool->sectors_per_block); @@ -1143,20 +1272,21 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c retry_on_resume(bio); } -static void process_discard(struct thin_c *tc, struct bio *bio) +static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) { int r; - unsigned long flags; + struct bio *bio = cell->holder; struct pool *pool = tc->pool; - struct dm_bio_prison_cell *cell, *cell2; - struct dm_cell_key key, key2; + struct dm_bio_prison_cell *cell2; + struct dm_cell_key key2; dm_block_t block = get_bio_block(tc, bio); struct dm_thin_lookup_result lookup_result; struct dm_thin_new_mapping *m; - build_virtual_key(tc->td, block, &key); - if (bio_detain(tc->pool, &key, bio, &cell)) + if (tc->requeue_mode) { + cell_requeue(pool, cell); return; + } r = dm_thin_find_block(tc->td, block, 1, &lookup_result); switch (r) { @@ -1187,12 +1317,9 @@ static void process_discard(struct thin_c *tc, struct bio *bio) m->cell2 = cell2; m->bio = bio; - if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) { - spin_lock_irqsave(&pool->lock, flags); - list_add_tail(&m->list, &pool->prepared_discards); - spin_unlock_irqrestore(&pool->lock, flags); - wake_worker(pool); - } + if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) + pool->process_prepared_discard(m); + } else { inc_all_io_entry(pool, bio); cell_defer_no_holder(tc, cell); @@ -1227,6 +1354,19 @@ static void process_discard(struct thin_c *tc, struct bio *bio) } } +static void process_discard_bio(struct thin_c *tc, struct bio *bio) +{ + struct dm_bio_prison_cell *cell; + struct dm_cell_key key; + dm_block_t block = get_bio_block(tc, bio); + + build_virtual_key(tc->td, block, &key); + if (bio_detain(tc->pool, &key, bio, &cell)) + return; + + process_discard_cell(tc, cell); +} + static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, struct dm_cell_key *key, struct dm_thin_lookup_result *lookup_result, @@ -1255,11 +1395,53 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, } } +static void __remap_and_issue_shared_cell(void *context, + struct dm_bio_prison_cell *cell) +{ + struct remap_info *info = context; + struct bio *bio; + + while ((bio = bio_list_pop(&cell->bios))) { + if ((bio_data_dir(bio) == WRITE) || + (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA))) + bio_list_add(&info->defer_bios, bio); + else { + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));; + + h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds); + inc_all_io_entry(info->tc->pool, bio); + bio_list_add(&info->issue_bios, bio); + } + } +} + +static void remap_and_issue_shared_cell(struct thin_c *tc, + struct dm_bio_prison_cell *cell, + dm_block_t block) +{ + struct bio *bio; + struct remap_info info; + + info.tc = tc; + bio_list_init(&info.defer_bios); + bio_list_init(&info.issue_bios); + + cell_visit_release(tc->pool, __remap_and_issue_shared_cell, + &info, cell); + + while ((bio = bio_list_pop(&info.defer_bios))) + thin_defer_bio(tc, bio); + + while ((bio = bio_list_pop(&info.issue_bios))) + remap_and_issue(tc, bio, block); +} + static void process_shared_bio(struct thin_c *tc, struct bio *bio, dm_block_t block, - struct dm_thin_lookup_result *lookup_result) + struct dm_thin_lookup_result *lookup_result, + struct dm_bio_prison_cell *virt_cell) { - struct dm_bio_prison_cell *cell; + struct dm_bio_prison_cell *data_cell; struct pool *pool = tc->pool; struct dm_cell_key key; @@ -1268,19 +1450,23 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, * of being broken so we have nothing further to do here. */ build_data_key(tc->td, lookup_result->block, &key); - if (bio_detain(pool, &key, bio, &cell)) + if (bio_detain(pool, &key, bio, &data_cell)) { + cell_defer_no_holder(tc, virt_cell); return; + } - if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) - break_sharing(tc, bio, block, &key, lookup_result, cell); - else { + if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) { + break_sharing(tc, bio, block, &key, lookup_result, data_cell); + cell_defer_no_holder(tc, virt_cell); + } else { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds); inc_all_io_entry(pool, bio); - cell_defer_no_holder(tc, cell); - remap_and_issue(tc, bio, lookup_result->block); + + remap_and_issue_shared_cell(tc, data_cell, lookup_result->block); + remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block); } } @@ -1333,34 +1519,28 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block } } -static void process_bio(struct thin_c *tc, struct bio *bio) +static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) { int r; struct pool *pool = tc->pool; + struct bio *bio = cell->holder; dm_block_t block = get_bio_block(tc, bio); - struct dm_bio_prison_cell *cell; - struct dm_cell_key key; struct dm_thin_lookup_result lookup_result; - /* - * If cell is already occupied, then the block is already - * being provisioned so we have nothing further to do here. - */ - build_virtual_key(tc->td, block, &key); - if (bio_detain(pool, &key, bio, &cell)) + if (tc->requeue_mode) { + cell_requeue(pool, cell); return; + } r = dm_thin_find_block(tc->td, block, 1, &lookup_result); switch (r) { case 0: - if (lookup_result.shared) { - process_shared_bio(tc, bio, block, &lookup_result); - cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */ - } else { + if (lookup_result.shared) + process_shared_bio(tc, bio, block, &lookup_result, cell); + else { inc_all_io_entry(pool, bio); - cell_defer_no_holder(tc, cell); - remap_and_issue(tc, bio, lookup_result.block); + inc_remap_and_issue_cell(tc, cell, lookup_result.block); } break; @@ -1394,7 +1574,26 @@ static void process_bio(struct thin_c *tc, struct bio *bio) } } -static void process_bio_read_only(struct thin_c *tc, struct bio *bio) +static void process_bio(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + dm_block_t block = get_bio_block(tc, bio); + struct dm_bio_prison_cell *cell; + struct dm_cell_key key; + + /* + * If cell is already occupied, then the block is already + * being provisioned so we have nothing further to do here. + */ + build_virtual_key(tc->td, block, &key); + if (bio_detain(pool, &key, bio, &cell)) + return; + + process_cell(tc, cell); +} + +static void __process_bio_read_only(struct thin_c *tc, struct bio *bio, + struct dm_bio_prison_cell *cell) { int r; int rw = bio_data_dir(bio); @@ -1404,15 +1603,21 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio) r = dm_thin_find_block(tc->td, block, 1, &lookup_result); switch (r) { case 0: - if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) + if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) { handle_unserviceable_bio(tc->pool, bio); - else { + if (cell) + cell_defer_no_holder(tc, cell); + } else { inc_all_io_entry(tc->pool, bio); remap_and_issue(tc, bio, lookup_result.block); + if (cell) + inc_remap_and_issue_cell(tc, cell, lookup_result.block); } break; case -ENODATA: + if (cell) + cell_defer_no_holder(tc, cell); if (rw != READ) { handle_unserviceable_bio(tc->pool, bio); break; @@ -1431,11 +1636,23 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio) default: DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d", __func__, r); + if (cell) + cell_defer_no_holder(tc, cell); bio_io_error(bio); break; } } +static void process_bio_read_only(struct thin_c *tc, struct bio *bio) +{ + __process_bio_read_only(tc, bio, NULL); +} + +static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + __process_bio_read_only(tc, cell->holder, cell); +} + static void process_bio_success(struct thin_c *tc, struct bio *bio) { bio_endio(bio, 0); @@ -1446,6 +1663,16 @@ static void process_bio_fail(struct thin_c *tc, struct bio *bio) bio_io_error(bio); } +static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + cell_success(tc->pool, cell); +} + +static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + cell_error(tc->pool, cell); +} + /* * FIXME: should we also commit due to size of transaction, measured in * metadata blocks? @@ -1527,6 +1754,7 @@ static void process_thin_deferred_bios(struct thin_c *tc) struct bio *bio; struct bio_list bios; struct blk_plug plug; + unsigned count = 0; if (tc->requeue_mode) { requeue_bio_list(tc, &tc->deferred_bio_list); @@ -1568,10 +1796,97 @@ static void process_thin_deferred_bios(struct thin_c *tc) pool->process_discard(tc, bio); else pool->process_bio(tc, bio); + + if ((count++ & 127) == 0) { + throttle_work_update(&pool->throttle); + dm_pool_issue_prefetches(pool->pmd); + } } blk_finish_plug(&plug); } +static int cmp_cells(const void *lhs, const void *rhs) +{ + struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs); + struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs); + + BUG_ON(!lhs_cell->holder); + BUG_ON(!rhs_cell->holder); + + if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector) + return -1; + + if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector) + return 1; + + return 0; +} + +static unsigned sort_cells(struct pool *pool, struct list_head *cells) +{ + unsigned count = 0; + struct dm_bio_prison_cell *cell, *tmp; + + list_for_each_entry_safe(cell, tmp, cells, user_list) { + if (count >= CELL_SORT_ARRAY_SIZE) + break; + + pool->cell_sort_array[count++] = cell; + list_del(&cell->user_list); + } + + sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL); + + return count; +} + +static void process_thin_deferred_cells(struct thin_c *tc) +{ + struct pool *pool = tc->pool; + unsigned long flags; + struct list_head cells; + struct dm_bio_prison_cell *cell; + unsigned i, j, count; + + INIT_LIST_HEAD(&cells); + + spin_lock_irqsave(&tc->lock, flags); + list_splice_init(&tc->deferred_cells, &cells); + spin_unlock_irqrestore(&tc->lock, flags); + + if (list_empty(&cells)) + return; + + do { + count = sort_cells(tc->pool, &cells); + + for (i = 0; i < count; i++) { + cell = pool->cell_sort_array[i]; + BUG_ON(!cell->holder); + + /* + * If we've got no free new_mapping structs, and processing + * this bio might require one, we pause until there are some + * prepared mappings to process. + */ + if (ensure_next_mapping(pool)) { + for (j = i; j < count; j++) + list_add(&pool->cell_sort_array[j]->user_list, &cells); + + spin_lock_irqsave(&tc->lock, flags); + list_splice(&cells, &tc->deferred_cells); + spin_unlock_irqrestore(&tc->lock, flags); + return; + } + + if (cell->holder->bi_rw & REQ_DISCARD) + pool->process_discard_cell(tc, cell); + else + pool->process_cell(tc, cell); + } + } while (!list_empty(&cells)); +} + static void thin_get(struct thin_c *tc); static void thin_put(struct thin_c *tc); @@ -1620,6 +1935,7 @@ static void process_deferred_bios(struct pool *pool) tc = get_first_thin(pool); while (tc) { + process_thin_deferred_cells(tc); process_thin_deferred_bios(tc); tc = get_next_thin(pool, tc); } @@ -1653,9 +1969,15 @@ static void do_worker(struct work_struct *ws) { struct pool *pool = container_of(ws, struct pool, worker); + throttle_work_start(&pool->throttle); + dm_pool_issue_prefetches(pool->pmd); + throttle_work_update(&pool->throttle); process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping); + throttle_work_update(&pool->throttle); process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard); + throttle_work_update(&pool->throttle); process_deferred_bios(pool); + throttle_work_complete(&pool->throttle); } /* @@ -1792,6 +2114,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) dm_pool_metadata_read_only(pool->pmd); pool->process_bio = process_bio_fail; pool->process_discard = process_bio_fail; + pool->process_cell = process_cell_fail; + pool->process_discard_cell = process_cell_fail; pool->process_prepared_mapping = process_prepared_mapping_fail; pool->process_prepared_discard = process_prepared_discard_fail; @@ -1804,6 +2128,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) dm_pool_metadata_read_only(pool->pmd); pool->process_bio = process_bio_read_only; pool->process_discard = process_bio_success; + pool->process_cell = process_cell_read_only; + pool->process_discard_cell = process_cell_success; pool->process_prepared_mapping = process_prepared_mapping_fail; pool->process_prepared_discard = process_prepared_discard_passdown; @@ -1822,7 +2148,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) if (old_mode != new_mode) notify_of_pool_mode_change(pool, "out-of-data-space"); pool->process_bio = process_bio_read_only; - pool->process_discard = process_discard; + pool->process_discard = process_discard_bio; + pool->process_cell = process_cell_read_only; + pool->process_discard_cell = process_discard_cell; pool->process_prepared_mapping = process_prepared_mapping; pool->process_prepared_discard = process_prepared_discard_passdown; @@ -1835,7 +2163,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) notify_of_pool_mode_change(pool, "write"); dm_pool_metadata_read_write(pool->pmd); pool->process_bio = process_bio; - pool->process_discard = process_discard; + pool->process_discard = process_discard_bio; + pool->process_cell = process_cell; + pool->process_discard_cell = process_discard_cell; pool->process_prepared_mapping = process_prepared_mapping; pool->process_prepared_discard = process_prepared_discard; break; @@ -1895,6 +2225,29 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio) wake_worker(pool); } +static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + + throttle_lock(&pool->throttle); + thin_defer_bio(tc, bio); + throttle_unlock(&pool->throttle); +} + +static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + unsigned long flags; + struct pool *pool = tc->pool; + + throttle_lock(&pool->throttle); + spin_lock_irqsave(&tc->lock, flags); + list_add_tail(&cell->user_list, &tc->deferred_cells); + spin_unlock_irqrestore(&tc->lock, flags); + throttle_unlock(&pool->throttle); + + wake_worker(pool); +} + static void thin_hook_bio(struct thin_c *tc, struct bio *bio) { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); @@ -1915,8 +2268,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) dm_block_t block = get_bio_block(tc, bio); struct dm_thin_device *td = tc->td; struct dm_thin_lookup_result result; - struct dm_bio_prison_cell cell1, cell2; - struct dm_bio_prison_cell *cell_result; + struct dm_bio_prison_cell *virt_cell, *data_cell; struct dm_cell_key key; thin_hook_bio(tc, bio); @@ -1932,10 +2284,18 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) } if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { - thin_defer_bio(tc, bio); + thin_defer_bio_with_throttle(tc, bio); return DM_MAPIO_SUBMITTED; } + /* + * We must hold the virtual cell before doing the lookup, otherwise + * there's a race with discard. + */ + build_virtual_key(tc->td, block, &key); + if (bio_detain(tc->pool, &key, bio, &virt_cell)) + return DM_MAPIO_SUBMITTED; + r = dm_thin_find_block(td, block, 0, &result); /* @@ -1958,23 +2318,19 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * More distant ancestors are irrelevant. The * shared flag will be set in their case. */ - thin_defer_bio(tc, bio); + thin_defer_cell(tc, virt_cell); return DM_MAPIO_SUBMITTED; } - build_virtual_key(tc->td, block, &key); - if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result)) - return DM_MAPIO_SUBMITTED; - build_data_key(tc->td, result.block, &key); - if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) { - cell_defer_no_holder_no_free(tc, &cell1); + if (bio_detain(tc->pool, &key, bio, &data_cell)) { + cell_defer_no_holder(tc, virt_cell); return DM_MAPIO_SUBMITTED; } inc_all_io_entry(tc->pool, bio); - cell_defer_no_holder_no_free(tc, &cell2); - cell_defer_no_holder_no_free(tc, &cell1); + cell_defer_no_holder(tc, data_cell); + cell_defer_no_holder(tc, virt_cell); remap(tc, bio, result.block); return DM_MAPIO_REMAPPED; @@ -1986,16 +2342,13 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * of doing so. */ handle_unserviceable_bio(tc->pool, bio); + cell_defer_no_holder(tc, virt_cell); return DM_MAPIO_SUBMITTED; } /* fall through */ case -EWOULDBLOCK: - /* - * In future, the failed dm_thin_find_block above could - * provide the hint to load the metadata into cache. - */ - thin_defer_bio(tc, bio); + thin_defer_cell(tc, virt_cell); return DM_MAPIO_SUBMITTED; default: @@ -2005,6 +2358,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * pool is switched to fail-io mode. */ bio_io_error(bio); + cell_defer_no_holder(tc, virt_cell); return DM_MAPIO_SUBMITTED; } } @@ -2185,7 +2539,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, pool->sectors_per_block_shift = __ffs(block_size); pool->low_water_blocks = 0; pool_features_init(&pool->pf); - pool->prison = dm_bio_prison_create(PRISON_CELLS); + pool->prison = dm_bio_prison_create(); if (!pool->prison) { *error = "Error creating pool's bio prison"; err_p = ERR_PTR(-ENOMEM); @@ -2211,6 +2565,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, goto bad_wq; } + throttle_init(&pool->throttle); INIT_WORK(&pool->worker, do_worker); INIT_DELAYED_WORK(&pool->waker, do_waker); INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout); @@ -3169,15 +3524,42 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; + sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; + + /* + * Adjust max_sectors_kb to highest possible power-of-2 + * factor of pool->sectors_per_block. + */ + if (limits->max_hw_sectors & (limits->max_hw_sectors - 1)) + limits->max_sectors = rounddown_pow_of_two(limits->max_hw_sectors); + else + limits->max_sectors = limits->max_hw_sectors; + + if (limits->max_sectors < pool->sectors_per_block) { + while (!is_factor(pool->sectors_per_block, limits->max_sectors)) { + if ((limits->max_sectors & (limits->max_sectors - 1)) == 0) + limits->max_sectors--; + limits->max_sectors = rounddown_pow_of_two(limits->max_sectors); + } + } else if (block_size_is_power_of_two(pool)) { + /* max_sectors_kb is >= power-of-2 thinp blocksize */ + while (!is_factor(limits->max_sectors, pool->sectors_per_block)) { + if ((limits->max_sectors & (limits->max_sectors - 1)) == 0) + limits->max_sectors--; + limits->max_sectors = rounddown_pow_of_two(limits->max_sectors); + } + } /* * If the system-determined stacked limits are compatible with the * pool's blocksize (io_opt is a factor) do not override them. */ if (io_opt_sectors < pool->sectors_per_block || - do_div(io_opt_sectors, pool->sectors_per_block)) { - blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT); + !is_factor(io_opt_sectors, pool->sectors_per_block)) { + if (is_factor(pool->sectors_per_block, limits->max_sectors)) + blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT); + else + blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT); blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); } @@ -3206,7 +3588,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 13, 0}, + .version = {1, 14, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -3295,6 +3677,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) goto out_unlock; } spin_lock_init(&tc->lock); + INIT_LIST_HEAD(&tc->deferred_cells); bio_list_init(&tc->deferred_bio_list); bio_list_init(&tc->retry_on_resume_list); tc->sort_bio_list = RB_ROOT; @@ -3533,6 +3916,21 @@ err: DMEMIT("Error"); } +static int thin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct thin_c *tc = ti->private; + struct request_queue *q = bdev_get_queue(tc->pool_dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = tc->pool_dev->bdev; + bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector); + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + static int thin_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { @@ -3557,7 +3955,7 @@ static int thin_iterate_devices(struct dm_target *ti, static struct target_type thin_target = { .name = "thin", - .version = {1, 13, 0}, + .version = {1, 14, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, @@ -3567,6 +3965,7 @@ static struct target_type thin_target = { .presuspend = thin_presuspend, .postsuspend = thin_postsuspend, .status = thin_status, + .merge = thin_merge, .iterate_devices = thin_iterate_devices, };