return ret;
}
+static struct attribute_group x86_pmu_attr_group;
+
static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
x86_pmu_events_group.attrs = tmp;
}
+ if (x86_pmu.attrs) {
+ struct attribute **tmp;
+
+ tmp = merge_attr(x86_pmu_attr_group.attrs, x86_pmu.attrs);
+ if (!WARN_ON(!tmp))
+ x86_pmu_attr_group.attrs = tmp;
+ }
+
pr_info("... version: %d\n", x86_pmu.version);
pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
pr_info("... generic registers: %d\n", x86_pmu.num_counters);
void arch_perf_update_userpage(struct perf_event *event,
struct perf_event_mmap_page *userpg, u64 now)
{
- struct cyc2ns_data *data;
+ struct cyc2ns_data data;
u64 offset;
userpg->cap_user_time = 0;
if (!using_native_sched_clock() || !sched_clock_stable())
return;
- data = cyc2ns_read_begin();
+ cyc2ns_read_begin(&data);
- offset = data->cyc2ns_offset + __sched_clock_offset;
+ offset = data.cyc2ns_offset + __sched_clock_offset;
/*
* Internal timekeeping for enabled/running/stopped times
* is always in the local_clock domain.
*/
userpg->cap_user_time = 1;
- userpg->time_mult = data->cyc2ns_mul;
- userpg->time_shift = data->cyc2ns_shift;
+ userpg->time_mult = data.cyc2ns_mul;
+ userpg->time_shift = data.cyc2ns_shift;
userpg->time_offset = offset - now;
/*
userpg->time_zero = offset;
}
- cyc2ns_read_end(data);
+ cyc2ns_read_end();
}
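
For illustration only (not part of the patch): a minimal userspace sketch of how the cap_user_time fields filled in above are typically consumed. The conversion follows the formula documented for struct perf_event_mmap_page; mapping the event ring buffer and reading the seqlock around these fields are assumed and not shown.

/* Convert a raw TSC value to local_clock() nanoseconds using the
 * time_* fields exported by arch_perf_update_userpage(). */
#include <stdint.h>
#include <linux/perf_event.h>

static uint64_t tsc_to_ns(const struct perf_event_mmap_page *pc, uint64_t cyc)
{
	uint64_t quot = cyc >> pc->time_shift;
	uint64_t rem  = cyc & (((uint64_t)1 << pc->time_shift) - 1);

	return pc->time_offset + quot * pc->time_mult +
	       ((rem * pc->time_mult) >> pc->time_shift);
}
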
void
static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
-static void __blk_mq_stop_hw_queues(struct request_queue *q, bool sync);
static int blk_mq_poll_stats_bkt(const struct request *rq)
{
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
+/*
+ * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
+ * mpt3sas driver such that this function can be removed.
+ */
+void blk_mq_quiesce_queue_nowait(struct request_queue *q)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ queue_flag_set(QUEUE_FLAG_QUIESCED, q);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
+
/**
- * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished
+ * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
* @q: request queue.
*
* Note: this function does not prevent that the struct request end_io()
- * callback function is invoked. Additionally, it is not prevented that
- * new queue_rq() calls occur unless the queue has been stopped first.
+ * callback function is invoked. Once this function returns, it is
+ * guaranteed that no dispatch can happen until the queue is unquiesced
+ * via blk_mq_unquiesce_queue().
*/
void blk_mq_quiesce_queue(struct request_queue *q)
{
unsigned int i;
bool rcu = false;
- __blk_mq_stop_hw_queues(q, true);
+ blk_mq_quiesce_queue_nowait(q);
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->flags & BLK_MQ_F_BLOCKING)
- synchronize_srcu(&hctx->queue_rq_srcu);
+ synchronize_srcu(hctx->queue_rq_srcu);
else
rcu = true;
}
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
+/*
+ * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
+ * @q: request queue.
+ *
+ * This function recovers the queue to the state it was in before being
+ * quiesced by blk_mq_quiesce_queue().
+ */
+void blk_mq_unquiesce_queue(struct request_queue *q)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+
+ /* dispatch requests that were inserted while the queue was quiesced */
+ blk_mq_run_hw_queues(q, true);
+}
+EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
+
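
For illustration only (not part of the patch): a hypothetical driver-side sketch of the intended quiesce/unquiesce pairing; my_dev and my_dev_apply_settings() are invented names, only the blk-mq calls come from this change.

#include <linux/blk-mq.h>

/* Pause all dispatch while reconfiguring the device, then resume. */
static void my_dev_reconfigure(struct my_dev *dev)
{
	/* After this returns, no ->queue_rq() call is in flight or can start. */
	blk_mq_quiesce_queue(dev->queue);

	my_dev_apply_settings(dev);

	/*
	 * Clears QUEUE_FLAG_QUIESCED and reruns the hardware queues so that
	 * requests inserted while quiesced get dispatched.
	 */
	blk_mq_unquiesce_queue(dev->queue);
}
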
void blk_mq_wake_waiters(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
}
EXPORT_SYMBOL(blk_mq_can_queue);
-void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
- struct request *rq, unsigned int op)
+static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
+ unsigned int tag, unsigned int op)
{
+ struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+ struct request *rq = tags->static_rqs[tag];
+
+ rq->rq_flags = 0;
+
+ if (data->flags & BLK_MQ_REQ_INTERNAL) {
+ rq->tag = -1;
+ rq->internal_tag = tag;
+ } else {
+ if (blk_mq_tag_busy(data->hctx)) {
+ rq->rq_flags = RQF_MQ_INFLIGHT;
+ atomic_inc(&data->hctx->nr_active);
+ }
+ rq->tag = tag;
+ rq->internal_tag = -1;
+ data->hctx->tags->rqs[rq->tag] = rq;
+ }
+
INIT_LIST_HEAD(&rq->queuelist);
/* csd/requeue_work/fifo_time is initialized before use */
- rq->q = q;
- rq->mq_ctx = ctx;
+ rq->q = data->q;
+ rq->mq_ctx = data->ctx;
rq->cmd_flags = op;
- if (blk_queue_io_stat(q))
+ if (blk_queue_io_stat(data->q))
rq->rq_flags |= RQF_IO_STAT;
/* do not touch atomic flags, it needs atomic ops against the timer */
rq->cpu = -1;
rq->end_io_data = NULL;
rq->next_rq = NULL;
- ctx->rq_dispatched[op_is_sync(op)]++;
+ data->ctx->rq_dispatched[op_is_sync(op)]++;
+ return rq;
}
-EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init);
-struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
- unsigned int op)
+static struct request *blk_mq_get_request(struct request_queue *q,
+ struct bio *bio, unsigned int op,
+ struct blk_mq_alloc_data *data)
{
+ struct elevator_queue *e = q->elevator;
struct request *rq;
unsigned int tag;
- tag = blk_mq_get_tag(data);
- if (tag != BLK_MQ_TAG_FAIL) {
- struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+ blk_queue_enter_live(q);
+ data->q = q;
+ if (likely(!data->ctx))
+ data->ctx = blk_mq_get_ctx(q);
+ if (likely(!data->hctx))
+ data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
+ if (op & REQ_NOWAIT)
+ data->flags |= BLK_MQ_REQ_NOWAIT;
- rq = tags->static_rqs[tag];
+ if (e) {
+ data->flags |= BLK_MQ_REQ_INTERNAL;
- if (data->flags & BLK_MQ_REQ_INTERNAL) {
- rq->tag = -1;
- rq->internal_tag = tag;
- } else {
- if (blk_mq_tag_busy(data->hctx)) {
- rq->rq_flags = RQF_MQ_INFLIGHT;
- atomic_inc(&data->hctx->nr_active);
- }
- rq->tag = tag;
- rq->internal_tag = -1;
- data->hctx->tags->rqs[rq->tag] = rq;
- }
+ /*
+ * Flush requests are special and go directly to the
+ * dispatch list.
+ */
+ if (!op_is_flush(op) && e->type->ops.mq.limit_depth)
+ e->type->ops.mq.limit_depth(op, data);
+ }
- blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
- return rq;
+ tag = blk_mq_get_tag(data);
+ if (tag == BLK_MQ_TAG_FAIL) {
+ blk_queue_exit(q);
+ return NULL;
}
- return NULL;
+ rq = blk_mq_rq_ctx_init(data, tag, op);
+ if (!op_is_flush(op)) {
+ rq->elv.icq = NULL;
+ if (e && e->type->ops.mq.prepare_request) {
+ if (e->type->icq_cache && rq_ioc(bio))
+ blk_mq_sched_assign_ioc(rq, bio);
+
+ e->type->ops.mq.prepare_request(rq, bio);
+ rq->rq_flags |= RQF_ELVPRIV;
+ }
+ }
+ data->hctx->queued++;
+ return rq;
}
-EXPORT_SYMBOL_GPL(__blk_mq_alloc_request);
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
+struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
unsigned int flags)
{
struct blk_mq_alloc_data alloc_data = { .flags = flags };
if (ret)
return ERR_PTR(ret);
- rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
+ rq = blk_mq_get_request(q, NULL, op, &alloc_data);
blk_mq_put_ctx(alloc_data.ctx);
blk_queue_exit(q);
}
EXPORT_SYMBOL(blk_mq_alloc_request);
-struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
- unsigned int flags, unsigned int hctx_idx)
+struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
+ unsigned int op, unsigned int flags, unsigned int hctx_idx)
{
struct blk_mq_alloc_data alloc_data = { .flags = flags };
struct request *rq;
cpu = cpumask_first(alloc_data.hctx->cpumask);
alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
- rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
+ rq = blk_mq_get_request(q, NULL, op, &alloc_data);
blk_queue_exit(q);
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
-void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
- struct request *rq)
+void blk_mq_free_request(struct request *rq)
{
- const int sched_tag = rq->internal_tag;
struct request_queue *q = rq->q;
+ struct elevator_queue *e = q->elevator;
+ struct blk_mq_ctx *ctx = rq->mq_ctx;
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+ const int sched_tag = rq->internal_tag;
+ if (rq->rq_flags & RQF_ELVPRIV) {
+ if (e && e->type->ops.mq.finish_request)
+ e->type->ops.mq.finish_request(rq);
+ if (rq->elv.icq) {
+ put_io_context(rq->elv.icq->ioc);
+ rq->elv.icq = NULL;
+ }
+ }
+
+ ctx->rq_completed[rq_is_sync(rq)]++;
if (rq->rq_flags & RQF_MQ_INFLIGHT)
atomic_dec(&hctx->nr_active);
wbt_done(q->rq_wb, &rq->issue_stat);
- rq->rq_flags = 0;
clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
blk_mq_sched_restart(hctx);
blk_queue_exit(q);
}
-
-static void blk_mq_finish_hctx_request(struct blk_mq_hw_ctx *hctx,
- struct request *rq)
-{
- struct blk_mq_ctx *ctx = rq->mq_ctx;
-
- ctx->rq_completed[rq_is_sync(rq)]++;
- __blk_mq_finish_request(hctx, ctx, rq);
-}
-
-void blk_mq_finish_request(struct request *rq)
-{
- blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
-}
-EXPORT_SYMBOL_GPL(blk_mq_finish_request);
-
-void blk_mq_free_request(struct request *rq)
-{
- blk_mq_sched_put_request(rq);
-}
EXPORT_SYMBOL_GPL(blk_mq_free_request);
-inline void __blk_mq_end_request(struct request *rq, int error)
+inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{
blk_account_io_done(rq);
}
EXPORT_SYMBOL(__blk_mq_end_request);
-void blk_mq_end_request(struct request *rq, int error)
+void blk_mq_end_request(struct request *rq, blk_status_t error)
{
if (blk_update_request(rq, error, blk_rq_bytes(rq)))
BUG();
blk_queue_exit(q);
}
-/*
- * Reverse check our software queue for entries that we could potentially
- * merge with. Currently includes a hand-wavy stop count of 8, to not spend
- * too much time checking for merges.
- */
-static bool blk_mq_attempt_merge(struct request_queue *q,
- struct blk_mq_ctx *ctx, struct bio *bio)
-{
- struct request *rq;
- int checked = 8;
-
- list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
- bool merged = false;
-
- if (!checked--)
- break;
-
- if (!blk_rq_merge_ok(rq, bio))
- continue;
-
- switch (blk_try_merge(rq, bio)) {
- case ELEVATOR_BACK_MERGE:
- if (blk_mq_sched_allow_merge(q, rq, bio))
- merged = bio_attempt_back_merge(q, rq, bio);
- break;
- case ELEVATOR_FRONT_MERGE:
- if (blk_mq_sched_allow_merge(q, rq, bio))
- merged = bio_attempt_front_merge(q, rq, bio);
- break;
- case ELEVATOR_DISCARD_MERGE:
- merged = bio_attempt_discard_merge(q, rq, bio);
- break;
- default:
- continue;
- }
-
- if (merged)
- ctx->rq_merged++;
- return merged;
- }
-
- return false;
-}
-
struct flush_busy_ctx_data {
struct blk_mq_hw_ctx *hctx;
struct list_head *list;
return first != NULL;
}
- static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags,
+ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
void *key)
{
struct blk_mq_hw_ctx *hctx;
hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
- list_del(&wait->task_list);
+ list_del(&wait->entry);
clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
blk_mq_run_hw_queue(hctx, true);
return 1;
{
struct blk_mq_hw_ctx *hctx;
struct request *rq;
- int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK;
+ int errors, queued;
if (list_empty(list))
return false;
errors = queued = 0;
do {
struct blk_mq_queue_data bd;
+ blk_status_t ret;
rq = list_first_entry(list, struct request, queuelist);
if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
}
ret = q->mq_ops->queue_rq(hctx, &bd);
- switch (ret) {
- case BLK_MQ_RQ_QUEUE_OK:
- queued++;
- break;
- case BLK_MQ_RQ_QUEUE_BUSY:
+ if (ret == BLK_STS_RESOURCE) {
blk_mq_put_driver_tag_hctx(hctx, rq);
list_add(&rq->queuelist, list);
__blk_mq_requeue_request(rq);
break;
- default:
- pr_err("blk-mq: bad return on queue: %d\n", ret);
- case BLK_MQ_RQ_QUEUE_ERROR:
+ }
+
+ if (unlikely(ret != BLK_STS_OK)) {
errors++;
- blk_mq_end_request(rq, -EIO);
- break;
+ blk_mq_end_request(rq, BLK_STS_IOERR);
+ continue;
}
- if (ret == BLK_MQ_RQ_QUEUE_BUSY)
- break;
+ queued++;
} while (!list_empty(list));
hctx->dispatched[queued_to_index(queued)]++;
* - blk_mq_run_hw_queue() checks whether or not a queue has
* been stopped before rerunning a queue.
* - Some but not all block drivers stop a queue before
- * returning BLK_MQ_RQ_QUEUE_BUSY. Two exceptions are scsi-mq
+ * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
* and dm-rq.
*/
if (!blk_mq_sched_needs_restart(hctx) &&
} else {
might_sleep();
- srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
+ srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
blk_mq_sched_dispatch_requests(hctx);
- srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
+ srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
}
}
static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
unsigned long msecs)
{
- if (unlikely(blk_mq_hctx_stopped(hctx) ||
- !blk_mq_hw_queue_mapped(hctx)))
+ if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx)))
+ return;
+
+ if (unlikely(blk_mq_hctx_stopped(hctx)))
return;
if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
}
EXPORT_SYMBOL(blk_mq_queue_stopped);
-static void __blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx, bool sync)
+/*
+ * This function is often used by a driver to pause .queue_rq() when
+ * there aren't enough resources or some condition isn't satisfied, in
+ * which case BLK_STS_RESOURCE is usually returned.
+ *
+ * We do not guarantee that dispatch can be drained or blocked
+ * after blk_mq_stop_hw_queue() returns. Please use
+ * blk_mq_quiesce_queue() for that requirement.
+ */
+void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
- if (sync)
- cancel_delayed_work_sync(&hctx->run_work);
- else
- cancel_delayed_work(&hctx->run_work);
+ cancel_delayed_work(&hctx->run_work);
set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
-
-void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
-{
- __blk_mq_stop_hw_queue(hctx, false);
-}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);
-static void __blk_mq_stop_hw_queues(struct request_queue *q, bool sync)
+/*
+ * This function is often used by a driver to pause .queue_rq() when
+ * there aren't enough resources or some condition isn't satisfied, in
+ * which case BLK_STS_RESOURCE is usually returned.
+ *
+ * We do not guarantee that dispatch can be drained or blocked
+ * after blk_mq_stop_hw_queues() returns. Please use
+ * blk_mq_quiesce_queue() for that requirement.
+ */
+void blk_mq_stop_hw_queues(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i)
- __blk_mq_stop_hw_queue(hctx, sync);
-}
-
-void blk_mq_stop_hw_queues(struct request_queue *q)
-{
- __blk_mq_stop_hw_queues(q, false);
+ blk_mq_stop_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queues);
void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{
- if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
+ if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx)))
return;
/*
{
struct blk_mq_ctx *ctx = rq->mq_ctx;
+ lockdep_assert_held(&ctx->lock);
+
trace_block_rq_insert(hctx->queue, rq);
if (at_head)
{
struct blk_mq_ctx *ctx = rq->mq_ctx;
+ lockdep_assert_held(&ctx->lock);
+
__blk_mq_insert_req_list(hctx, rq, at_head);
blk_mq_hctx_mark_pending(hctx, ctx);
}
!blk_queue_nomerges(hctx->queue);
}
-static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
- struct blk_mq_ctx *ctx,
- struct request *rq, struct bio *bio)
+static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_ctx *ctx,
+ struct request *rq)
{
- if (!hctx_allow_merges(hctx) || !bio_mergeable(bio)) {
- blk_mq_bio_to_request(rq, bio);
- spin_lock(&ctx->lock);
-insert_rq:
- __blk_mq_insert_request(hctx, rq, false);
- spin_unlock(&ctx->lock);
- return false;
- } else {
- struct request_queue *q = hctx->queue;
-
- spin_lock(&ctx->lock);
- if (!blk_mq_attempt_merge(q, ctx, bio)) {
- blk_mq_bio_to_request(rq, bio);
- goto insert_rq;
- }
-
- spin_unlock(&ctx->lock);
- __blk_mq_finish_request(hctx, ctx, rq);
- return true;
- }
+ spin_lock(&ctx->lock);
+ __blk_mq_insert_request(hctx, rq, false);
+ spin_unlock(&ctx->lock);
}
static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
.last = true,
};
blk_qc_t new_cookie;
- int ret;
+ blk_status_t ret;
bool run_queue = true;
- if (blk_mq_hctx_stopped(hctx)) {
+ /* RCU or SRCU read lock is needed before checking quiesced flag */
+ if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
run_queue = false;
goto insert;
}
* would have done
*/
ret = q->mq_ops->queue_rq(hctx, &bd);
- if (ret == BLK_MQ_RQ_QUEUE_OK) {
+ switch (ret) {
+ case BLK_STS_OK:
*cookie = new_cookie;
return;
- }
-
- if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
+ case BLK_STS_RESOURCE:
+ __blk_mq_requeue_request(rq);
+ goto insert;
+ default:
*cookie = BLK_QC_T_NONE;
- blk_mq_end_request(rq, -EIO);
+ blk_mq_end_request(rq, ret);
return;
}
- __blk_mq_requeue_request(rq);
insert:
blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep);
}
might_sleep();
- srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
+ srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
__blk_mq_try_issue_directly(hctx, rq, cookie, true);
- srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
+ srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
}
}
blk_queue_bounce(q, &bio);
- blk_queue_split(q, &bio, q->bio_split);
+ blk_queue_split(q, &bio);
if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
bio_io_error(bio);
trace_block_getrq(q, bio, bio->bi_opf);
- rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
+ rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
if (unlikely(!rq)) {
__wbt_done(q->rq_wb, wb_acct);
+ if (bio->bi_opf & REQ_NOWAIT)
+ bio_wouldblock_error(bio);
return BLK_QC_T_NONE;
}
blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
blk_mq_sched_insert_request(rq, false, true, true, true);
- } else if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
+ } else {
blk_mq_put_ctx(data.ctx);
+ blk_mq_bio_to_request(rq, bio);
+ blk_mq_queue_io(data.hctx, data.ctx, rq);
blk_mq_run_hw_queue(data.hctx, true);
- } else
- blk_mq_put_ctx(data.ctx);
+ }
return cookie;
}
set->ops->exit_hctx(hctx, hctx_idx);
if (hctx->flags & BLK_MQ_F_BLOCKING)
- cleanup_srcu_struct(&hctx->queue_rq_srcu);
+ cleanup_srcu_struct(hctx->queue_rq_srcu);
blk_mq_remove_cpuhp(hctx);
blk_free_flush_queue(hctx->fq);
spin_lock_init(&hctx->lock);
INIT_LIST_HEAD(&hctx->dispatch);
hctx->queue = q;
- hctx->queue_num = hctx_idx;
hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
goto free_fq;
if (hctx->flags & BLK_MQ_F_BLOCKING)
- init_srcu_struct(&hctx->queue_rq_srcu);
+ init_srcu_struct(hctx->queue_rq_srcu);
blk_mq_debugfs_register_hctx(q, hctx);
}
EXPORT_SYMBOL(blk_mq_init_queue);
+static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
+{
+ int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
+
+ BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu),
+ __alignof__(struct blk_mq_hw_ctx)) !=
+ sizeof(struct blk_mq_hw_ctx));
+
+ if (tag_set->flags & BLK_MQ_F_BLOCKING)
+ hw_ctx_size += sizeof(struct srcu_struct);
+
+ return hw_ctx_size;
+}
+
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
struct request_queue *q)
{
continue;
node = blk_mq_hw_queue_to_node(q->mq_map, i);
- hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
+ hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set),
GFP_KERNEL, node);
if (!hctxs[i])
break;
struct list_head rqs[KYBER_NUM_DOMAINS];
unsigned int cur_domain;
unsigned int batching;
- wait_queue_t domain_wait[KYBER_NUM_DOMAINS];
+ wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
atomic_t wait_index[KYBER_NUM_DOMAINS];
};
for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
INIT_LIST_HEAD(&khd->rqs[i]);
- INIT_LIST_HEAD(&khd->domain_wait[i].task_list);
+ INIT_LIST_HEAD(&khd->domain_wait[i].entry);
atomic_set(&khd->wait_index[i], 0);
}
}
}
-static struct request *kyber_get_request(struct request_queue *q,
- unsigned int op,
- struct blk_mq_alloc_data *data)
+static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
{
- struct kyber_queue_data *kqd = q->elevator->elevator_data;
- struct request *rq;
-
/*
* We use the scheduler tags as per-hardware queue queueing tokens.
* Async requests can be limited at this stage.
*/
- if (!op_is_sync(op))
+ if (!op_is_sync(op)) {
+ struct kyber_queue_data *kqd = data->q->elevator->elevator_data;
+
data->shallow_depth = kqd->async_depth;
+ }
+}
- rq = __blk_mq_alloc_request(data, op);
- if (rq)
- rq_set_domain_token(rq, -1);
- return rq;
+static void kyber_prepare_request(struct request *rq, struct bio *bio)
+{
+ rq_set_domain_token(rq, -1);
}
-static void kyber_put_request(struct request *rq)
+static void kyber_finish_request(struct request *rq)
{
- struct request_queue *q = rq->q;
- struct kyber_queue_data *kqd = q->elevator->elevator_data;
+ struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
rq_clear_domain_token(kqd, rq);
- blk_mq_finish_request(rq);
}
static void kyber_completed_request(struct request *rq)
}
}
- static int kyber_domain_wake(wait_queue_t *wait, unsigned mode, int flags,
+ static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
void *key)
{
struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);
- list_del_init(&wait->task_list);
+ list_del_init(&wait->entry);
blk_mq_run_hw_queue(hctx, true);
return 1;
}
{
unsigned int sched_domain = khd->cur_domain;
struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
- wait_queue_t *wait = &khd->domain_wait[sched_domain];
+ wait_queue_entry_t *wait = &khd->domain_wait[sched_domain];
struct sbq_wait_state *ws;
int nr;
* run when one becomes available. Note that this is serialized on
* khd->lock, but we still need to be careful about the waker.
*/
- if (list_empty_careful(&wait->task_list)) {
+ if (list_empty_careful(&wait->entry)) {
init_waitqueue_func_entry(wait, kyber_domain_wake);
wait->private = hctx;
ws = sbq_wait_ptr(domain_tokens,
{ \
struct blk_mq_hw_ctx *hctx = data; \
struct kyber_hctx_data *khd = hctx->sched_data; \
- wait_queue_t *wait = &khd->domain_wait[domain]; \
+ wait_queue_entry_t *wait = &khd->domain_wait[domain]; \
\
- seq_printf(m, "%d\n", !list_empty_careful(&wait->task_list)); \
+ seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \
return 0; \
}
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
.exit_sched = kyber_exit_sched,
.init_hctx = kyber_init_hctx,
.exit_hctx = kyber_exit_hctx,
- .get_request = kyber_get_request,
- .put_request = kyber_put_request,
+ .limit_depth = kyber_limit_depth,
+ .prepare_request = kyber_prepare_request,
+ .finish_request = kyber_finish_request,
.completed_request = kyber_completed_request,
.dispatch_request = kyber_dispatch_request,
.has_work = kyber_has_work,
i_gid_write(inode, 0);
atomic_set(&inode->i_writecount, 0);
inode->i_size = 0;
+ inode->i_write_hint = WRITE_LIFE_NOT_SET;
inode->i_blocks = 0;
inode->i_bytes = 0;
inode->i_generation = 0;
wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
wq = bit_waitqueue(&inode->i_state, __I_NEW);
- prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_hash_lock);
schedule();
- finish_wait(wq, &wait.wait);
+ finish_wait(wq, &wait.wq_entry);
spin_lock(&inode_hash_lock);
}
DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
do {
- prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
if (atomic_read(&inode->i_dio_count))
schedule();
} while (atomic_read(&inode->i_dio_count));
- finish_wait(wq, &q.wait);
+ finish_wait(wq, &q.wq_entry);
}
/**
/* Except MODE, it seems harmless of setting twice. */
if (opendata->o_arg.createmode != NFS4_CREATE_EXCLUSIVE &&
- attrset[1] & FATTR4_WORD1_MODE)
+ (attrset[1] & FATTR4_WORD1_MODE ||
+ attrset[2] & FATTR4_WORD2_MODE_UMASK))
sattr->ia_valid &= ~ATTR_MODE;
if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL)
};
static int
- nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key)
+ nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, void *key)
{
int ret;
struct cb_notify_lock_args *cbnl = key;
.inode = state->inode,
.owner = &owner,
.notified = false };
- wait_queue_t wait;
+ wait_queue_entry_t wait;
/* Don't bother with waitqueue if we don't expect a callback */
if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags))
size_t max_pages = max_response_pages(server);
dprintk("--> %s\n", __func__);
+ nfs4_sequence_free_slot(&lgp->res.seq_res);
nfs4_free_pages(lgp->args.layout.pages, max_pages);
pnfs_put_layout_hdr(NFS_I(inode)->layout);
put_nfs_open_context(lgp->args.ctx);
/* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
if (status == 0 && lgp->res.layoutp->len)
lseg = pnfs_layout_process(lgp);
- nfs4_sequence_free_slot(&lgp->res.seq_res);
rpc_put_task(task);
dprintk("<-- %s status=%d\n", __func__, status);
if (status)
struct blk_mq_ctx **ctxs;
unsigned int nr_ctx;
- wait_queue_t dispatch_wait;
+ wait_queue_entry_t dispatch_wait;
atomic_t wait_index;
struct blk_mq_tags *tags;
struct blk_mq_tags *sched_tags;
- struct srcu_struct queue_rq_srcu;
-
unsigned long queued;
unsigned long run;
#define BLK_MQ_MAX_DISPATCH_ORDER 7
struct dentry *debugfs_dir;
struct dentry *sched_debugfs_dir;
#endif
+
+ /* Must be the last member - see also blk_mq_hw_ctx_size(). */
+ struct srcu_struct queue_rq_srcu[0];
};
struct blk_mq_tag_set {
bool last;
};
-typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *);
+typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *,
+ const struct blk_mq_queue_data *);
typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
init_request_fn *init_request;
exit_request_fn *exit_request;
reinit_request_fn *reinit_request;
+ /* Called from inside blk_get_request() */
+ void (*initialize_rq_fn)(struct request *rq);
map_queues_fn *map_queues;
};
enum {
- BLK_MQ_RQ_QUEUE_OK = 0, /* queued fine */
- BLK_MQ_RQ_QUEUE_BUSY = 1, /* requeue IO for later */
- BLK_MQ_RQ_QUEUE_ERROR = 2, /* end IO with error */
-
BLK_MQ_F_SHOULD_MERGE = 1 << 0,
BLK_MQ_F_TAG_SHARED = 1 << 1,
BLK_MQ_F_SG_MERGE = 1 << 2,
BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */
};
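
For illustration only (not part of the patch): with the BLK_MQ_RQ_QUEUE_* return codes removed above, a hedged sketch of what a converted driver ->queue_rq() looks like under the new blk_status_t convention; the my_dev_* helpers are invented names.

#include <linux/blk-mq.h>

static blk_status_t my_dev_queue_rq(struct blk_mq_hw_ctx *hctx,
				    const struct blk_mq_queue_data *bd)
{
	struct my_dev *dev = hctx->driver_data;
	struct request *rq = bd->rq;

	blk_mq_start_request(rq);

	if (!my_dev_has_resources(dev))
		return BLK_STS_RESOURCE;	/* was BLK_MQ_RQ_QUEUE_BUSY */

	if (my_dev_submit(dev, rq) < 0)
		return BLK_STS_IOERR;		/* was BLK_MQ_RQ_QUEUE_ERROR */

	return BLK_STS_OK;			/* was BLK_MQ_RQ_QUEUE_OK */
}
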
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
+struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
unsigned int flags);
-struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int op,
- unsigned int flags, unsigned int hctx_idx);
+struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
+ unsigned int op, unsigned int flags, unsigned int hctx_idx);
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
enum {
int blk_mq_request_started(struct request *rq);
void blk_mq_start_request(struct request *rq);
-void blk_mq_end_request(struct request *rq, int error);
-void __blk_mq_end_request(struct request *rq, int error);
+void blk_mq_end_request(struct request *rq, blk_status_t error);
+void __blk_mq_end_request(struct request *rq, blk_status_t error);
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
+void blk_mq_quiesce_queue(struct request_queue *q);
+void blk_mq_unquiesce_queue(struct request_queue *q);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
int blk_mq_map_queues(struct blk_mq_tag_set *set);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
+void blk_mq_quiesce_queue_nowait(struct request_queue *q);
+
/*
* Driver command data is immediately after the request. So subtract request
* size to get back to the original request, add request size to get the PDU.
#define _LINUX_FS_H
#include <linux/linkage.h>
- #include <linux/wait.h>
+ #include <linux/wait_bit.h>
#include <linux/kdev_t.h>
#include <linux/dcache.h>
#include <linux/path.h>
#include <linux/rwsem.h>
#include <linux/capability.h>
#include <linux/semaphore.h>
+#include <linux/fcntl.h>
#include <linux/fiemap.h>
#include <linux/rculist_bl.h>
#include <linux/atomic.h>
#include <linux/percpu-rwsem.h>
#include <linux/workqueue.h>
#include <linux/delayed_call.h>
+#include <linux/uuid.h>
#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
+/* File is capable of returning -EAGAIN if AIO will block */
+#define FMODE_AIO_NOWAIT ((__force fmode_t)0x8000000)
+
/*
* Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
* that indicates that they should check the contents of the iovec are
struct address_space;
struct writeback_control;
+/*
+ * Write lifetime hint values.
+ */
+enum rw_hint {
+ WRITE_LIFE_NOT_SET = 0,
+ WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE,
+ WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT,
+ WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM,
+ WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG,
+ WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME,
+};
+
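
For illustration only (not part of the patch): the RWH_* values above are the uapi counterparts of enum rw_hint. A hedged userspace sketch of setting a per-file hint, assuming the F_SET_RW_HINT fcntl from the same series is available (the constants come from the uapi <linux/fcntl.h>).

#include <stdint.h>
#include <stdio.h>
#include <fcntl.h>

/* Mark data written through fd as short-lived. */
static int set_short_lived(int fd)
{
	uint64_t hint = RWH_WRITE_LIFE_SHORT;

	if (fcntl(fd, F_SET_RW_HINT, &hint) < 0) {
		perror("F_SET_RW_HINT");
		return -1;
	}
	return 0;
}
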
#define IOCB_EVENTFD (1 << 0)
#define IOCB_APPEND (1 << 1)
#define IOCB_DIRECT (1 << 2)
#define IOCB_DSYNC (1 << 4)
#define IOCB_SYNC (1 << 5)
#define IOCB_WRITE (1 << 6)
+#define IOCB_NOWAIT (1 << 7)
struct kiocb {
struct file *ki_filp;
void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
void *private;
int ki_flags;
+ enum rw_hint ki_hint;
};
static inline bool is_sync_kiocb(struct kiocb *kiocb)
return kiocb->ki_complete == NULL;
}
-static inline int iocb_flags(struct file *file);
-
-static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
-{
- *kiocb = (struct kiocb) {
- .ki_filp = filp,
- .ki_flags = iocb_flags(filp),
- };
-}
-
/*
* "descriptor" for what we're up to with a read.
* This allows us to use the same read code yet
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
unsigned short i_bytes;
unsigned int i_blkbits;
+ enum rw_hint i_write_hint;
blkcnt_t i_blocks;
#ifdef __NEED_I_SIZE_ORDERED
* Must not be taken from IRQ context.
*/
spinlock_t f_lock;
+ enum rw_hint f_write_hint;
atomic_long_t f_count;
unsigned int f_flags;
fmode_t f_mode;
#define OFFT_OFFSET_MAX INT_LIMIT(off_t)
#endif
-#include <linux/fcntl.h>
-
extern void send_sigio(struct fown_struct *fown, int fd, int band);
/*
struct sb_writers s_writers;
- char s_id[32]; /* Informational name */
- u8 s_uuid[16]; /* UUID */
+ char s_id[32]; /* Informational name */
+ uuid_t s_uuid; /* UUID */
void *s_fs_info; /* Filesystem private info */
unsigned int s_max_links;
return !uid_valid(inode->i_uid) || !gid_valid(inode->i_gid);
}
+static inline enum rw_hint file_write_hint(struct file *file)
+{
+ if (file->f_write_hint != WRITE_LIFE_NOT_SET)
+ return file->f_write_hint;
+
+ return file_inode(file)->i_write_hint;
+}
+
+static inline int iocb_flags(struct file *file);
+
+static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
+{
+ *kiocb = (struct kiocb) {
+ .ki_filp = filp,
+ .ki_flags = iocb_flags(filp),
+ .ki_hint = file_write_hint(filp),
+ };
+}
+
/*
* Inode state bits. Protected by inode->i_lock
*
extern void filemap_fdatawait_keep_errors(struct address_space *);
extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
loff_t lend);
+extern bool filemap_range_has_page(struct address_space *, loff_t lstart,
+ loff_t lend);
extern int filemap_write_and_wait(struct address_space *mapping);
extern int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend);
DIO_SKIP_DIO_COUNT = 0x08,
};
-void dio_end_io(struct bio *bio, int error);
+void dio_end_io(struct bio *bio);
ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, struct iov_iter *iter,
return res;
}
+static inline int kiocb_set_rw_flags(struct kiocb *ki, int flags)
+{
+ if (unlikely(flags & ~RWF_SUPPORTED))
+ return -EOPNOTSUPP;
+
+ if (flags & RWF_NOWAIT) {
+ if (!(ki->ki_filp->f_mode & FMODE_AIO_NOWAIT))
+ return -EOPNOTSUPP;
+ ki->ki_flags |= IOCB_NOWAIT;
+ }
+ if (flags & RWF_HIPRI)
+ ki->ki_flags |= IOCB_HIPRI;
+ if (flags & RWF_DSYNC)
+ ki->ki_flags |= IOCB_DSYNC;
+ if (flags & RWF_SYNC)
+ ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
+ return 0;
+}
+
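
For illustration only (not part of the patch): a hypothetical caller showing where kiocb_set_rw_flags() slots into a synchronous read path; do_issue_read() is an invented helper.

/* Translate per-call RWF_* flags into kiocb flags before issuing I/O. */
static ssize_t my_read_iter(struct file *filp, struct iov_iter *iter,
			    loff_t pos, int rwf_flags)
{
	struct kiocb kiocb;
	int ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = pos;

	/* Fails with -EOPNOTSUPP for RWF_NOWAIT unless FMODE_AIO_NOWAIT is set. */
	ret = kiocb_set_rw_flags(&kiocb, rwf_flags);
	if (ret)
		return ret;

	return do_issue_read(&kiocb, iter);
}
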
static inline ino_t parent_ino(struct dentry *dentry)
{
ino_t res;
endmenu # "CPU/Task time and stats accounting"
-menu "RCU Subsystem"
-
-config TREE_RCU
- bool
- default y if !PREEMPT && SMP
- help
- This option selects the RCU implementation that is
- designed for very large SMP system with hundreds or
- thousands of CPUs. It also scales down nicely to
- smaller systems.
-
-config PREEMPT_RCU
- bool
- default y if PREEMPT
- help
- This option selects the RCU implementation that is
- designed for very large SMP systems with hundreds or
- thousands of CPUs, but for which real-time response
- is also required. It also scales down nicely to
- smaller systems.
-
- Select this option if you are unsure.
-
-config TINY_RCU
- bool
- default y if !PREEMPT && !SMP
- help
- This option selects the RCU implementation that is
- designed for UP systems from which real-time response
- is not required. This option greatly reduces the
- memory footprint of RCU.
-
-config RCU_EXPERT
- bool "Make expert-level adjustments to RCU configuration"
- default n
- help
- This option needs to be enabled if you wish to make
- expert-level adjustments to RCU configuration. By default,
- no such adjustments can be made, which has the often-beneficial
- side-effect of preventing "make oldconfig" from asking you all
- sorts of detailed questions about how you would like numerous
- obscure RCU options to be set up.
-
- Say Y if you need to make expert-level adjustments to RCU.
-
- Say N if you are unsure.
-
-config SRCU
- bool
- default y
- help
- This option selects the sleepable version of RCU. This version
- permits arbitrary sleeping or blocking within RCU read-side critical
- sections.
-
-config CLASSIC_SRCU
- bool "Use v4.11 classic SRCU implementation"
- default n
- depends on RCU_EXPERT && SRCU
- help
- This option selects the traditional well-tested classic SRCU
- implementation from v4.11, as might be desired for enterprise
- Linux distributions. Without this option, the shiny new
- Tiny SRCU and Tree SRCU implementations are used instead.
- At some point, it is hoped that Tiny SRCU and Tree SRCU
- will accumulate enough test time and confidence to allow
- Classic SRCU to be dropped entirely.
-
- Say Y if you need a rock-solid SRCU.
-
- Say N if you would like help test Tree SRCU.
-
-config TINY_SRCU
- bool
- default y if SRCU && TINY_RCU && !CLASSIC_SRCU
- help
- This option selects the single-CPU non-preemptible version of SRCU.
-
-config TREE_SRCU
- bool
- default y if SRCU && !TINY_RCU && !CLASSIC_SRCU
- help
- This option selects the full-fledged version of SRCU.
-
-config TASKS_RCU
- bool
- default n
- select SRCU
- help
- This option enables a task-based RCU implementation that uses
- only voluntary context switch (not preemption!), idle, and
- user-mode execution as quiescent states.
-
-config RCU_STALL_COMMON
- def_bool ( TREE_RCU || PREEMPT_RCU || RCU_TRACE )
- help
- This option enables RCU CPU stall code that is common between
- the TINY and TREE variants of RCU. The purpose is to allow
- the tiny variants to disable RCU CPU stall warnings, while
- making these warnings mandatory for the tree variants.
-
-config RCU_NEED_SEGCBLIST
- def_bool ( TREE_RCU || PREEMPT_RCU || TINY_SRCU || TREE_SRCU )
-
-config CONTEXT_TRACKING
- bool
-
-config CONTEXT_TRACKING_FORCE
- bool "Force context tracking"
- depends on CONTEXT_TRACKING
- default y if !NO_HZ_FULL
- help
- The major pre-requirement for full dynticks to work is to
- support the context tracking subsystem. But there are also
- other dependencies to provide in order to make the full
- dynticks working.
-
- This option stands for testing when an arch implements the
- context tracking backend but doesn't yet fullfill all the
- requirements to make the full dynticks feature working.
- Without the full dynticks, there is no way to test the support
- for context tracking and the subsystems that rely on it: RCU
- userspace extended quiescent state and tickless cputime
- accounting. This option copes with the absence of the full
- dynticks subsystem by forcing the context tracking on all
- CPUs in the system.
-
- Say Y only if you're working on the development of an
- architecture backend for the context tracking.
-
- Say N otherwise, this option brings an overhead that you
- don't want in production.
-
-
-config RCU_FANOUT
- int "Tree-based hierarchical RCU fanout value"
- range 2 64 if 64BIT
- range 2 32 if !64BIT
- depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
- default 64 if 64BIT
- default 32 if !64BIT
- help
- This option controls the fanout of hierarchical implementations
- of RCU, allowing RCU to work efficiently on machines with
- large numbers of CPUs. This value must be at least the fourth
- root of NR_CPUS, which allows NR_CPUS to be insanely large.
- The default value of RCU_FANOUT should be used for production
- systems, but if you are stress-testing the RCU implementation
- itself, small RCU_FANOUT values allow you to test large-system
- code paths on small(er) systems.
-
- Select a specific number if testing RCU itself.
- Take the default if unsure.
-
-config RCU_FANOUT_LEAF
- int "Tree-based hierarchical RCU leaf-level fanout value"
- range 2 64 if 64BIT
- range 2 32 if !64BIT
- depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
- default 16
- help
- This option controls the leaf-level fanout of hierarchical
- implementations of RCU, and allows trading off cache misses
- against lock contention. Systems that synchronize their
- scheduling-clock interrupts for energy-efficiency reasons will
- want the default because the smaller leaf-level fanout keeps
- lock contention levels acceptably low. Very large systems
- (hundreds or thousands of CPUs) will instead want to set this
- value to the maximum value possible in order to reduce the
- number of cache misses incurred during RCU's grace-period
- initialization. These systems tend to run CPU-bound, and thus
- are not helped by synchronized interrupts, and thus tend to
- skew them, which reduces lock contention enough that large
- leaf-level fanouts work well. That said, setting leaf-level
- fanout to a large number will likely cause problematic
- lock contention on the leaf-level rcu_node structures unless
- you boot with the skew_tick kernel parameter.
-
- Select a specific number if testing RCU itself.
-
- Select the maximum permissible value for large systems, but
- please understand that you may also need to set the skew_tick
- kernel boot parameter to avoid contention on the rcu_node
- structure's locks.
-
- Take the default if unsure.
-
-config RCU_FAST_NO_HZ
- bool "Accelerate last non-dyntick-idle CPU's grace periods"
- depends on NO_HZ_COMMON && SMP && RCU_EXPERT
- default n
- help
- This option permits CPUs to enter dynticks-idle state even if
- they have RCU callbacks queued, and prevents RCU from waking
- these CPUs up more than roughly once every four jiffies (by
- default, you can adjust this using the rcutree.rcu_idle_gp_delay
- parameter), thus improving energy efficiency. On the other
- hand, this option increases the duration of RCU grace periods,
- for example, slowing down synchronize_rcu().
-
- Say Y if energy efficiency is critically important, and you
- don't care about increased grace-period durations.
-
- Say N if you are unsure.
-
-config TREE_RCU_TRACE
- def_bool RCU_TRACE && ( TREE_RCU || PREEMPT_RCU )
- select DEBUG_FS
- help
- This option provides tracing for the TREE_RCU and
- PREEMPT_RCU implementations, permitting Makefile to
- trivially select kernel/rcutree_trace.c.
-
-config RCU_BOOST
- bool "Enable RCU priority boosting"
- depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
- default n
- help
- This option boosts the priority of preempted RCU readers that
- block the current preemptible RCU grace period for too long.
- This option also prevents heavy loads from blocking RCU
- callback invocation for all flavors of RCU.
-
- Say Y here if you are working with real-time apps or heavy loads
- Say N here if you are unsure.
-
-config RCU_KTHREAD_PRIO
- int "Real-time priority to use for RCU worker threads"
- range 1 99 if RCU_BOOST
- range 0 99 if !RCU_BOOST
- default 1 if RCU_BOOST
- default 0 if !RCU_BOOST
- depends on RCU_EXPERT
- help
- This option specifies the SCHED_FIFO priority value that will be
- assigned to the rcuc/n and rcub/n threads and is also the value
- used for RCU_BOOST (if enabled). If you are working with a
- real-time application that has one or more CPU-bound threads
- running at a real-time priority level, you should set
- RCU_KTHREAD_PRIO to a priority higher than the highest-priority
- real-time CPU-bound application thread. The default RCU_KTHREAD_PRIO
- value of 1 is appropriate in the common case, which is real-time
- applications that do not have any CPU-bound threads.
-
- Some real-time applications might not have a single real-time
- thread that saturates a given CPU, but instead might have
- multiple real-time threads that, taken together, fully utilize
- that CPU. In this case, you should set RCU_KTHREAD_PRIO to
- a priority higher than the lowest-priority thread that is
- conspiring to prevent the CPU from running any non-real-time
- tasks. For example, if one thread at priority 10 and another
- thread at priority 5 are between themselves fully consuming
- the CPU time on a given CPU, then RCU_KTHREAD_PRIO should be
- set to priority 6 or higher.
-
- Specify the real-time priority, or take the default if unsure.
-
-config RCU_BOOST_DELAY
- int "Milliseconds to delay boosting after RCU grace-period start"
- range 0 3000
- depends on RCU_BOOST
- default 500
- help
- This option specifies the time to wait after the beginning of
- a given grace period before priority-boosting preempted RCU
- readers blocking that grace period. Note that any RCU reader
- blocking an expedited RCU grace period is boosted immediately.
-
- Accept the default if unsure.
-
-config RCU_NOCB_CPU
- bool "Offload RCU callback processing from boot-selected CPUs"
- depends on TREE_RCU || PREEMPT_RCU
- depends on RCU_EXPERT || NO_HZ_FULL
- default n
- help
- Use this option to reduce OS jitter for aggressive HPC or
- real-time workloads. It can also be used to offload RCU
- callback invocation to energy-efficient CPUs in battery-powered
- asymmetric multiprocessors.
-
- This option offloads callback invocation from the set of
- CPUs specified at boot time by the rcu_nocbs parameter.
- For each such CPU, a kthread ("rcuox/N") will be created to
- invoke callbacks, where the "N" is the CPU being offloaded,
- and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and
- "s" for RCU-sched. Nothing prevents this kthread from running
- on the specified CPUs, but (1) the kthreads may be preempted
- between each callback, and (2) affinity or cgroups can be used
- to force the kthreads to run on whatever set of CPUs is desired.
-
- Say Y here if you want to help to debug reduced OS jitter.
- Say N here if you are unsure.
-
-choice
- prompt "Build-forced no-CBs CPUs"
- default RCU_NOCB_CPU_NONE
- depends on RCU_NOCB_CPU
- help
- This option allows no-CBs CPUs (whose RCU callbacks are invoked
- from kthreads rather than from softirq context) to be specified
- at build time. Additional no-CBs CPUs may be specified by
- the rcu_nocbs= boot parameter.
-
-config RCU_NOCB_CPU_NONE
- bool "No build_forced no-CBs CPUs"
- help
- This option does not force any of the CPUs to be no-CBs CPUs.
- Only CPUs designated by the rcu_nocbs= boot parameter will be
- no-CBs CPUs, whose RCU callbacks will be invoked by per-CPU
- kthreads whose names begin with "rcuo". All other CPUs will
- invoke their own RCU callbacks in softirq context.
-
- Select this option if you want to choose no-CBs CPUs at
- boot time, for example, to allow testing of different no-CBs
- configurations without having to rebuild the kernel each time.
-
-config RCU_NOCB_CPU_ZERO
- bool "CPU 0 is a build_forced no-CBs CPU"
- help
- This option forces CPU 0 to be a no-CBs CPU, so that its RCU
- callbacks are invoked by a per-CPU kthread whose name begins
- with "rcuo". Additional CPUs may be designated as no-CBs
- CPUs using the rcu_nocbs= boot parameter will be no-CBs CPUs.
- All other CPUs will invoke their own RCU callbacks in softirq
- context.
-
- Select this if CPU 0 needs to be a no-CBs CPU for real-time
- or energy-efficiency reasons, but the real reason it exists
- is to ensure that randconfig testing covers mixed systems.
-
-config RCU_NOCB_CPU_ALL
- bool "All CPUs are build_forced no-CBs CPUs"
- help
- This option forces all CPUs to be no-CBs CPUs. The rcu_nocbs=
- boot parameter will be ignored. All CPUs' RCU callbacks will
- be executed in the context of per-CPU rcuo kthreads created for
- this purpose. Assuming that the kthreads whose names start with
- "rcuo" are bound to "housekeeping" CPUs, this reduces OS jitter
- on the remaining CPUs, but might decrease memory locality during
- RCU-callback invocation, thus potentially degrading throughput.
-
- Select this if all CPUs need to be no-CBs CPUs for real-time
- or energy-efficiency reasons.
-
-endchoice
-
-endmenu # "RCU Subsystem"
+source "kernel/rcu/Kconfig"
config BUILD_BIN2C
bool
config CPUSETS
bool "Cpuset controller"
+ depends on SMP
help
This option will let you create and manage CPUSETs which
allow dynamically partitioning a system into sets of CPUs and
#include <uapi/linux/sched/types.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/hotplug.h>
+ #include <linux/wait_bit.h>
#include <linux/cpuset.h>
#include <linux/delayacct.h>
#include <linux/init_task.h>
dequeue_task(rq, p, flags);
}
- void sched_set_stop_task(int cpu, struct task_struct *stop)
- {
- struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
- struct task_struct *old_stop = cpu_rq(cpu)->stop;
-
- if (stop) {
- /*
- * Make it appear like a SCHED_FIFO task, its something
- * userspace knows about and won't get confused about.
- *
- * Also, it will make PI more or less work without too
- * much confusion -- but then, stop work should not
- * rely on PI working anyway.
- */
- sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m);
-
- stop->sched_class = &stop_sched_class;
- }
-
- cpu_rq(cpu)->stop = stop;
-
- if (old_stop) {
- /*
- * Reset it back to a normal scheduling class so that
- * it can die in pieces.
- */
- old_stop->sched_class = &rt_sched_class;
- }
- }
-
/*
* __normal_prio - return the priority that is based on the static prio
*/
*avg += diff >> 3;
}
+ void sched_set_stop_task(int cpu, struct task_struct *stop)
+ {
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+ struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+ if (stop) {
+ /*
+ * Make it appear like a SCHED_FIFO task, it's something
+ * userspace knows about and won't get confused about.
+ *
+ * Also, it will make PI more or less work without too
+ * much confusion -- but then, stop work should not
+ * rely on PI working anyway.
+ */
+ sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m);
+
+ stop->sched_class = &stop_sched_class;
+ }
+
+ cpu_rq(cpu)->stop = stop;
+
+ if (old_stop) {
+ /*
+ * Reset it back to a normal scheduling class so that
+ * it can die in pieces.
+ */
+ old_stop->sched_class = &rt_sched_class;
+ }
+ }
+
#else
static inline int __set_cpus_allowed_ptr(struct task_struct *p,
{
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
- struct task_struct *p;
+ struct task_struct *p, *t;
struct rq_flags rf;
if (!llist)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- while (llist) {
- int wake_flags = 0;
-
- p = llist_entry(llist, struct task_struct, wake_entry);
- llist = llist_next(llist);
-
- if (p->sched_remote_wakeup)
- wake_flags = WF_MIGRATED;
-
- ttwu_do_activate(rq, p, wake_flags, &rf);
- }
+ llist_for_each_entry_safe(p, t, llist, wake_entry)
+ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
rq_unlock_irqrestore(rq, &rf);
}
return try_to_wake_up(p, state, 0);
}
- /*
- * This function clears the sched_dl_entity static params.
- */
- void __dl_clear_params(struct task_struct *p)
- {
- struct sched_dl_entity *dl_se = &p->dl;
-
- dl_se->dl_runtime = 0;
- dl_se->dl_deadline = 0;
- dl_se->dl_period = 0;
- dl_se->flags = 0;
- dl_se->dl_bw = 0;
-
- dl_se->dl_throttled = 0;
- dl_se->dl_yielded = 0;
- }
-
/*
* Perform scheduler related setup for a newly forked process p.
* p is forked by current.
RB_CLEAR_NODE(&p->dl.rb_node);
init_dl_task_timer(&p->dl);
+ init_dl_inactive_task_timer(&p->dl);
__dl_clear_params(p);
INIT_LIST_HEAD(&p->rt.run_list);
unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
- return 1ULL << 20;
+ return BW_UNIT;
/*
* Doing this here saves a lot of checks in all
if (period == 0)
return 0;
- return div64_u64(runtime << 20, period);
- }
-
- #ifdef CONFIG_SMP
- inline struct dl_bw *dl_bw_of(int i)
- {
- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
- "sched RCU must be held");
- return &cpu_rq(i)->rd->dl_bw;
+ return div64_u64(runtime << BW_SHIFT, period);
}
- static inline int dl_bw_cpus(int i)
- {
- struct root_domain *rd = cpu_rq(i)->rd;
- int cpus = 0;
-
- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
- "sched RCU must be held");
- for_each_cpu_and(i, rd->span, cpu_active_mask)
- cpus++;
-
- return cpus;
- }
- #else
- inline struct dl_bw *dl_bw_of(int i)
- {
- return &cpu_rq(i)->dl.dl_bw;
- }
-
- static inline int dl_bw_cpus(int i)
- {
- return 1;
- }
- #endif
-
- /*
- * We must be sure that accepting a new task (or allowing changing the
- * parameters of an existing one) is consistent with the bandwidth
- * constraints. If yes, this function also accordingly updates the currently
- * allocated bandwidth to reflect the new situation.
- *
- * This function is called while holding p's rq->lock.
- *
- * XXX we should delay bw change until the task's 0-lag point, see
- * __setparam_dl().
- */
- static int dl_overflow(struct task_struct *p, int policy,
- const struct sched_attr *attr)
- {
-
- struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
- u64 period = attr->sched_period ?: attr->sched_deadline;
- u64 runtime = attr->sched_runtime;
- u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
- int cpus, err = -1;
-
- /* !deadline task may carry old deadline bandwidth */
- if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
- return 0;
-
- /*
- * Either if a task, enters, leave, or stays -deadline but changes
- * its parameters, we may need to update accordingly the total
- * allocated bandwidth of the container.
- */
- raw_spin_lock(&dl_b->lock);
- cpus = dl_bw_cpus(task_cpu(p));
- if (dl_policy(policy) && !task_has_dl_policy(p) &&
- !__dl_overflow(dl_b, cpus, 0, new_bw)) {
- __dl_add(dl_b, new_bw);
- err = 0;
- } else if (dl_policy(policy) && task_has_dl_policy(p) &&
- !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
- __dl_clear(dl_b, p->dl.dl_bw);
- __dl_add(dl_b, new_bw);
- err = 0;
- } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
- __dl_clear(dl_b, p->dl.dl_bw);
- err = 0;
- }
- raw_spin_unlock(&dl_b->lock);
-
- return err;
- }
-
- extern void init_dl_bw(struct dl_bw *dl_b);
-
/*
* wake_up_new_task - wake up a newly created task for the first time.
*
exception_exit(prev_state);
}
- int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
+ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
void *key)
{
return try_to_wake_up(curr->private, mode, wake_flags);
return pid ? find_task_by_vpid(pid) : current;
}
- /*
- * This function initializes the sched_dl_entity of a newly becoming
- * SCHED_DEADLINE task.
- *
- * Only the static values are considered here, the actual runtime and the
- * absolute deadline will be properly calculated when the task is enqueued
- * for the first time with its new policy.
- */
- static void
- __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
- {
- struct sched_dl_entity *dl_se = &p->dl;
-
- dl_se->dl_runtime = attr->sched_runtime;
- dl_se->dl_deadline = attr->sched_deadline;
- dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
- dl_se->flags = attr->sched_flags;
- dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
-
- /*
- * Changing the parameters of a task is 'tricky' and we're not doing
- * the correct thing -- also see task_dead_dl() and switched_from_dl().
- *
- * What we SHOULD do is delay the bandwidth release until the 0-lag
- * point. This would include retaining the task_struct until that time
- * and change dl_overflow() to not immediately decrement the current
- * amount.
- *
- * Instead we retain the current runtime/deadline and let the new
- * parameters take effect after the current reservation period lapses.
- * This is safe (albeit pessimistic) because the 0-lag point is always
- * before the current scheduling deadline.
- *
- * We can still have temporary overloads because we do not delay the
- * change in bandwidth until that time; so admission control is
- * not on the safe side. It does however guarantee tasks will never
- * consume more than promised.
- */
- }
-
/*
* sched_setparam() passes in -1 for its policy, to let the functions
* it calls know not to change it.
p->sched_class = &fair_sched_class;
}
- static void
- __getparam_dl(struct task_struct *p, struct sched_attr *attr)
- {
- struct sched_dl_entity *dl_se = &p->dl;
-
- attr->sched_priority = p->rt_priority;
- attr->sched_runtime = dl_se->dl_runtime;
- attr->sched_deadline = dl_se->dl_deadline;
- attr->sched_period = dl_se->dl_period;
- attr->sched_flags = dl_se->flags;
- }
-
- /*
- * This function validates the new parameters of a -deadline task.
- * We ask for the deadline not being zero, and greater or equal
- * than the runtime, as well as the period of being zero or
- * greater than deadline. Furthermore, we have to be sure that
- * user parameters are above the internal resolution of 1us (we
- * check sched_runtime only since it is always the smaller one) and
- * below 2^63 ns (we have to check both sched_deadline and
- * sched_period, as the latter can be zero).
- */
- static bool
- __checkparam_dl(const struct sched_attr *attr)
- {
- /* deadline != 0 */
- if (attr->sched_deadline == 0)
- return false;
-
- /*
- * Since we truncate DL_SCALE bits, make sure we're at least
- * that big.
- */
- if (attr->sched_runtime < (1ULL << DL_SCALE))
- return false;
-
- /*
- * Since we use the MSB for wrap-around and sign issues, make
- * sure it's not set (mind that period can be equal to zero).
- */
- if (attr->sched_deadline & (1ULL << 63) ||
- attr->sched_period & (1ULL << 63))
- return false;
-
- /* runtime <= deadline <= period (if period != 0) */
- if ((attr->sched_period != 0 &&
- attr->sched_period < attr->sched_deadline) ||
- attr->sched_deadline < attr->sched_runtime)
- return false;
-
- return true;
- }
-
/*
* Check the target process has a UID that matches the current process's:
*/
return match;
}
- static bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
- {
- struct sched_dl_entity *dl_se = &p->dl;
-
- if (dl_se->dl_runtime != attr->sched_runtime ||
- dl_se->dl_deadline != attr->sched_deadline ||
- dl_se->dl_period != attr->sched_period ||
- dl_se->flags != attr->sched_flags)
- return true;
-
- return false;
- }
-
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user, bool pi)
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq *rq;
- /* May grab non-irq protected spin_locks: */
- BUG_ON(in_interrupt());
+ /* The pi code expects interrupts enabled */
+ BUG_ON(pi && in_interrupt());
recheck:
/* Double check policy once rq lock held: */
if (policy < 0) {
return -EINVAL;
}
- if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
+ if (attr->sched_flags &
+ ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
return -EINVAL;
/*
* of a SCHED_DEADLINE task) we need to check if enough bandwidth
* is available.
*/
- if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
+ if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
task_rq_unlock(rq, p, &rf);
return -EBUSY;
}
#endif
}
+ #ifdef CONFIG_SMP
+
int cpuset_cpumask_can_shrink(const struct cpumask *cur,
const struct cpumask *trial)
{
- int ret = 1, trial_cpus;
- struct dl_bw *cur_dl_b;
- unsigned long flags;
+ int ret = 1;
if (!cpumask_weight(cur))
return ret;
- rcu_read_lock_sched();
- cur_dl_b = dl_bw_of(cpumask_any(cur));
- trial_cpus = cpumask_weight(trial);
-
- raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
- if (cur_dl_b->bw != -1 &&
- cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
- ret = 0;
- raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
- rcu_read_unlock_sched();
+ ret = dl_cpuset_cpumask_can_shrink(cur, trial);
return ret;
}
goto out;
}
- #ifdef CONFIG_SMP
if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
- cs_cpus_allowed)) {
- unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
- cs_cpus_allowed);
- struct dl_bw *dl_b;
- bool overflow;
- int cpus;
- unsigned long flags;
-
- rcu_read_lock_sched();
- dl_b = dl_bw_of(dest_cpu);
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(dest_cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
- if (overflow)
- ret = -EBUSY;
- else {
- /*
- * We reserve space for this task in the destination
- * root_domain, as we can't fail after this point.
- * We will free resources in the source root_domain
- * later on (see set_cpus_allowed_dl()).
- */
- __dl_add(dl_b, p->dl.dl_bw);
- }
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
- rcu_read_unlock_sched();
+ cs_cpus_allowed))
+ ret = dl_task_can_attach(p, cs_cpus_allowed);
- }
- #endif
out:
return ret;
}
- #ifdef CONFIG_SMP
-
bool sched_smp_initialized __read_mostly;
#ifdef CONFIG_NUMA_BALANCING
static int cpuset_cpu_inactive(unsigned int cpu)
{
- unsigned long flags;
- struct dl_bw *dl_b;
- bool overflow;
- int cpus;
-
if (!cpuhp_tasks_frozen) {
- rcu_read_lock_sched();
- dl_b = dl_bw_of(cpu);
-
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, 0);
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
- rcu_read_unlock_sched();
-
- if (overflow)
+ if (dl_cpu_busy(cpu))
return -EBUSY;
cpuset_update_active_cpus();
} else {
* users of this state to go away such that all new such users will
* observe it.
*
- * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
- * not imply sync_sched(), so wait for both.
- *
* Do sync before park smpboot threads to take care the rcu boost case.
*/
- if (IS_ENABLED(CONFIG_PREEMPT))
- synchronize_rcu_mult(call_rcu, call_rcu_sched);
- else
- synchronize_rcu();
+ synchronize_rcu_mult(call_rcu, call_rcu_sched);
if (!sched_smp_initialized)
return 0;
cpumask_var_t non_isolated_cpus;
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
- alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
sched_init_numa();
* happen.
*/
mutex_lock(&sched_domains_mutex);
- init_sched_domains(cpu_active_mask);
+ sched_init_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
init_sched_dl_class();
sched_init_smt();
- sched_clock_init_late();
sched_smp_initialized = true;
}
void __init sched_init_smp(void)
{
sched_init_granularity();
- sched_clock_init_late();
}
#endif /* CONFIG_SMP */
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
- #define WAIT_TABLE_BITS 8
- #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
- static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
-
- wait_queue_head_t *bit_waitqueue(void *word, int bit)
- {
- const int shift = BITS_PER_LONG == 32 ? 5 : 6;
- unsigned long val = (unsigned long)word << shift | bit;
-
- return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
- }
- EXPORT_SYMBOL(bit_waitqueue);
-
void __init sched_init(void)
{
int i, j;
unsigned long alloc_size = 0, ptr;
sched_clock_init();
-
- for (i = 0; i < WAIT_TABLE_SIZE; i++)
- init_waitqueue_head(bit_wait_table + i);
+ wait_bit_init();
#ifdef CONFIG_FAIR_GROUP_SCHED
alloc_size += 2 * nr_cpu_ids * sizeof(void **);
calc_load_update = jiffies + LOAD_FREQ;
#ifdef CONFIG_SMP
- zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
!is_idle_task(current)) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
+ oops_in_progress)
return;
+
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
prev_jiffy = jiffies;
task_rq_unlock(rq, tsk, &rf);
}
- #endif /* CONFIG_CGROUP_SCHED */
-
- #ifdef CONFIG_RT_GROUP_SCHED
- /*
- * Ensure that the real time constraints are schedulable.
- */
- static DEFINE_MUTEX(rt_constraints_mutex);
-
- /* Must be called with tasklist_lock held */
- static inline int tg_has_rt_tasks(struct task_group *tg)
- {
- struct task_struct *g, *p;
-
- /*
- * Autogroups do not have RT tasks; see autogroup_create().
- */
- if (task_group_is_autogroup(tg))
- return 0;
-
- for_each_process_thread(g, p) {
- if (rt_task(p) && task_group(p) == tg)
- return 1;
- }
-
- return 0;
- }
-
- struct rt_schedulable_data {
- struct task_group *tg;
- u64 rt_period;
- u64 rt_runtime;
- };
-
- static int tg_rt_schedulable(struct task_group *tg, void *data)
- {
- struct rt_schedulable_data *d = data;
- struct task_group *child;
- unsigned long total, sum = 0;
- u64 period, runtime;
-
- period = ktime_to_ns(tg->rt_bandwidth.rt_period);
- runtime = tg->rt_bandwidth.rt_runtime;
-
- if (tg == d->tg) {
- period = d->rt_period;
- runtime = d->rt_runtime;
- }
-
- /*
- * Cannot have more runtime than the period.
- */
- if (runtime > period && runtime != RUNTIME_INF)
- return -EINVAL;
-
- /*
- * Ensure we don't starve existing RT tasks.
- */
- if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
- return -EBUSY;
-
- total = to_ratio(period, runtime);
-
- /*
- * Nobody can have more than the global setting allows.
- */
- if (total > to_ratio(global_rt_period(), global_rt_runtime()))
- return -EINVAL;
-
- /*
- * The sum of our children's runtime should not exceed our own.
- */
- list_for_each_entry_rcu(child, &tg->children, siblings) {
- period = ktime_to_ns(child->rt_bandwidth.rt_period);
- runtime = child->rt_bandwidth.rt_runtime;
-
- if (child == d->tg) {
- period = d->rt_period;
- runtime = d->rt_runtime;
- }
-
- sum += to_ratio(period, runtime);
- }
-
- if (sum > total)
- return -EINVAL;
-
- return 0;
- }
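
tg_rt_schedulable() compares to_ratio(period, runtime) values, i.e. runtime/period expressed as a fixed-point fraction with 20 fractional bits (BW_SHIFT in the scheduler headers). A small standalone sketch with purely illustrative numbers, showing why a group granted 0.60 of the CPU cannot host children asking for 0.30 and 0.40 in total; the units only need to be consistent:

#include <stdint.h>
#include <stdio.h>

/* Same math as the kernel's to_ratio(): runtime/period scaled by 2^20. */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return (runtime << 20) / period;
}

int main(void)
{
	uint64_t global = to_ratio(1000000, 950000);  /* default 0.95 -> 996147 */
	uint64_t parent = to_ratio(1000000, 600000);  /* group wants 0.60 -> 629145 */
	uint64_t childs = to_ratio(1000000, 300000) + /* children want 0.30 + 0.40 */
			  to_ratio(1000000, 400000);

	printf("parent <= global:   %s\n", parent <= global ? "ok" : "-EINVAL");
	printf("children <= parent: %s\n", childs <= parent ? "ok" : "-EINVAL");
	return 0;
}
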
-
- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
- {
- int ret;
-
- struct rt_schedulable_data data = {
- .tg = tg,
- .rt_period = period,
- .rt_runtime = runtime,
- };
-
- rcu_read_lock();
- ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
- rcu_read_unlock();
-
- return ret;
- }
-
- static int tg_set_rt_bandwidth(struct task_group *tg,
- u64 rt_period, u64 rt_runtime)
- {
- int i, err = 0;
-
- /*
- * Disallowing the root group RT runtime is BAD, it would disallow the
- * kernel creating (and or operating) RT threads.
- */
- if (tg == &root_task_group && rt_runtime == 0)
- return -EINVAL;
-
- /* No period doesn't make any sense. */
- if (rt_period == 0)
- return -EINVAL;
-
- mutex_lock(&rt_constraints_mutex);
- read_lock(&tasklist_lock);
- err = __rt_schedulable(tg, rt_period, rt_runtime);
- if (err)
- goto unlock;
-
- raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
- tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
- tg->rt_bandwidth.rt_runtime = rt_runtime;
-
- for_each_possible_cpu(i) {
- struct rt_rq *rt_rq = tg->rt_rq[i];
-
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_runtime = rt_runtime;
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- }
- raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
- unlock:
- read_unlock(&tasklist_lock);
- mutex_unlock(&rt_constraints_mutex);
-
- return err;
- }
-
- static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
- {
- u64 rt_runtime, rt_period;
-
- rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
- rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
- if (rt_runtime_us < 0)
- rt_runtime = RUNTIME_INF;
-
- return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
- }
-
- static long sched_group_rt_runtime(struct task_group *tg)
- {
- u64 rt_runtime_us;
-
- if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
- return -1;
-
- rt_runtime_us = tg->rt_bandwidth.rt_runtime;
- do_div(rt_runtime_us, NSEC_PER_USEC);
- return rt_runtime_us;
- }
-
- static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
- {
- u64 rt_runtime, rt_period;
-
- rt_period = rt_period_us * NSEC_PER_USEC;
- rt_runtime = tg->rt_bandwidth.rt_runtime;
-
- return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
- }
-
- static long sched_group_rt_period(struct task_group *tg)
- {
- u64 rt_period_us;
-
- rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
- do_div(rt_period_us, NSEC_PER_USEC);
- return rt_period_us;
- }
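
These per-group setters and getters are reached through the cgroup-v1 cpu controller files cpu.rt_runtime_us and cpu.rt_period_us, which exist only with CONFIG_RT_GROUP_SCHED. A hedged sketch of driving them from userspace; the /sys/fs/cgroup/cpu mount point and the rtgroup directory are assumptions about the local setup, and runtime must stay within the period:

#include <stdio.h>

/* Write a decimal value to a cgroup-v1 cpu controller file. */
static int write_val(const char *path, long val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%ld\n", val);
	return fclose(f);
}

int main(void)
{
	write_val("/sys/fs/cgroup/cpu/rtgroup/cpu.rt_period_us", 1000000);
	write_val("/sys/fs/cgroup/cpu/rtgroup/cpu.rt_runtime_us", 200000); /* 0.2 s per 1 s period */
	return 0;
}
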
- #endif /* CONFIG_RT_GROUP_SCHED */
-
- #ifdef CONFIG_RT_GROUP_SCHED
- static int sched_rt_global_constraints(void)
- {
- int ret = 0;
-
- mutex_lock(&rt_constraints_mutex);
- read_lock(&tasklist_lock);
- ret = __rt_schedulable(NULL, 0, 0);
- read_unlock(&tasklist_lock);
- mutex_unlock(&rt_constraints_mutex);
-
- return ret;
- }
-
- static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
- {
- /* Don't accept realtime tasks when there is no way for them to run */
- if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
- return 0;
-
- return 1;
- }
-
- #else /* !CONFIG_RT_GROUP_SCHED */
- static int sched_rt_global_constraints(void)
- {
- unsigned long flags;
- int i;
-
- raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
- for_each_possible_cpu(i) {
- struct rt_rq *rt_rq = &cpu_rq(i)->rt;
-
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_runtime = global_rt_runtime();
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- }
- raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
-
- return 0;
- }
- #endif /* CONFIG_RT_GROUP_SCHED */
-
- static int sched_dl_global_validate(void)
- {
- u64 runtime = global_rt_runtime();
- u64 period = global_rt_period();
- u64 new_bw = to_ratio(period, runtime);
- struct dl_bw *dl_b;
- int cpu, ret = 0;
- unsigned long flags;
-
- /*
- * Here we want to check the bandwidth not being set to some
- * value smaller than the currently allocated bandwidth in
- * any of the root_domains.
- *
- * FIXME: Cycling on all the CPUs is overdoing, but simpler than
- * cycling on root_domains... Discussion on different/better
- * solutions is welcome!
- */
- for_each_possible_cpu(cpu) {
- rcu_read_lock_sched();
- dl_b = dl_bw_of(cpu);
-
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- if (new_bw < dl_b->total_bw)
- ret = -EBUSY;
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
- rcu_read_unlock_sched();
-
- if (ret)
- break;
- }
-
- return ret;
- }
-
- static void sched_dl_do_global(void)
- {
- u64 new_bw = -1;
- struct dl_bw *dl_b;
- int cpu;
- unsigned long flags;
-
- def_dl_bandwidth.dl_period = global_rt_period();
- def_dl_bandwidth.dl_runtime = global_rt_runtime();
-
- if (global_rt_runtime() != RUNTIME_INF)
- new_bw = to_ratio(global_rt_period(), global_rt_runtime());
-
- /*
- * FIXME: As above...
- */
- for_each_possible_cpu(cpu) {
- rcu_read_lock_sched();
- dl_b = dl_bw_of(cpu);
-
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- dl_b->bw = new_bw;
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
- rcu_read_unlock_sched();
- }
- }
-
- static int sched_rt_global_validate(void)
- {
- if (sysctl_sched_rt_period <= 0)
- return -EINVAL;
-
- if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
- (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
- return -EINVAL;
-
- return 0;
- }
-
- static void sched_rt_do_global(void)
- {
- def_rt_bandwidth.rt_runtime = global_rt_runtime();
- def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
- }
-
- int sched_rt_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
- {
- int old_period, old_runtime;
- static DEFINE_MUTEX(mutex);
- int ret;
-
- mutex_lock(&mutex);
- old_period = sysctl_sched_rt_period;
- old_runtime = sysctl_sched_rt_runtime;
-
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
-
- if (!ret && write) {
- ret = sched_rt_global_validate();
- if (ret)
- goto undo;
-
- ret = sched_dl_global_validate();
- if (ret)
- goto undo;
-
- ret = sched_rt_global_constraints();
- if (ret)
- goto undo;
-
- sched_rt_do_global();
- sched_dl_do_global();
- }
- if (0) {
- undo:
- sysctl_sched_rt_period = old_period;
- sysctl_sched_rt_runtime = old_runtime;
- }
- mutex_unlock(&mutex);
-
- return ret;
- }
-
- int sched_rr_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
- {
- int ret;
- static DEFINE_MUTEX(mutex);
-
- mutex_lock(&mutex);
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
- /*
- * Make sure that internally we keep jiffies.
- * Also, writing zero resets the timeslice to default:
- */
- if (!ret && write) {
- sched_rr_timeslice =
- sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
- msecs_to_jiffies(sysctl_sched_rr_timeslice);
- }
- mutex_unlock(&mutex);
- return ret;
- }
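
sched_rt_handler() and sched_rr_handler() sit behind the kernel.sched_rt_period_us, kernel.sched_rt_runtime_us and kernel.sched_rr_timeslice_ms sysctls. A minimal sketch of adjusting them through procfs as root; the values are only examples and must keep runtime <= period:

#include <stdio.h>

static void set_sysctl(const char *name, long val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/kernel/%s", name);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return;
	}
	fprintf(f, "%ld\n", val);
	fclose(f);
}

int main(void)
{
	set_sysctl("sched_rt_period_us", 1000000);   /* must be > 0 */
	set_sysctl("sched_rt_runtime_us", 950000);   /* must be <= period, or -1 for no limit */
	set_sysctl("sched_rr_timeslice_ms", 0);      /* 0 resets the RR timeslice to the default */
	return 0;
}
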
-
- #ifdef CONFIG_CGROUP_SCHED
static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
{
}
EXPORT_SYMBOL(filemap_flush);
+/**
+ * filemap_range_has_page - check if a page exists in range.
+ * @mapping: address space within which to check
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
+ *
+ * Check whether at least one page exists in the supplied range; typically
+ * used to decide whether a direct write in this range would trigger writeback.
+ */
+bool filemap_range_has_page(struct address_space *mapping,
+ loff_t start_byte, loff_t end_byte)
+{
+ pgoff_t index = start_byte >> PAGE_SHIFT;
+ pgoff_t end = end_byte >> PAGE_SHIFT;
+ struct pagevec pvec;
+ bool ret;
+
+ if (end_byte < start_byte)
+ return false;
+
+ if (mapping->nrpages == 0)
+ return false;
+
+ pagevec_init(&pvec, 0);
+ if (!pagevec_lookup(&pvec, mapping, index, 1))
+ return false;
+ ret = (pvec.pages[0]->index <= end);
+ pagevec_release(&pvec);
+ return ret;
+}
+EXPORT_SYMBOL(filemap_range_has_page);
+
static int __filemap_fdatawait_range(struct address_space *mapping,
loff_t start_byte, loff_t end_byte)
{
struct wait_page_queue {
struct page *page;
int bit_nr;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
};
- static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
+ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
struct wait_page_key *key = arg;
struct wait_page_queue *wait_page
struct page *page, int bit_nr, int state, bool lock)
{
struct wait_page_queue wait_page;
- wait_queue_t *wait = &wait_page.wait;
+ wait_queue_entry_t *wait = &wait_page.wait;
int ret = 0;
init_wait(wait);
for (;;) {
spin_lock_irq(&q->lock);
- if (likely(list_empty(&wait->task_list))) {
+ if (likely(list_empty(&wait->entry))) {
if (lock)
- __add_wait_queue_tail_exclusive(q, wait);
+ __add_wait_queue_entry_tail_exclusive(q, wait);
else
__add_wait_queue(q, wait);
SetPageWaiters(page);
*
* Add an arbitrary @waiter to the wait queue for the nominated @page.
*/
- void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
+ void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
{
wait_queue_head_t *q = page_waitqueue(page);
unsigned long flags;
loff_t size;
size = i_size_read(inode);
- retval = filemap_write_and_wait_range(mapping, iocb->ki_pos,
- iocb->ki_pos + count - 1);
- if (retval < 0)
- goto out;
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (filemap_range_has_page(mapping, iocb->ki_pos,
+ iocb->ki_pos + count - 1))
+ return -EAGAIN;
+ } else {
+ retval = filemap_write_and_wait_range(mapping,
+ iocb->ki_pos,
+ iocb->ki_pos + count - 1);
+ if (retval < 0)
+ goto out;
+ }
file_accessed(file);
pos = iocb->ki_pos;
+ if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+ return -EINVAL;
+
if (limit != RLIM_INFINITY) {
if (iocb->ki_pos >= limit) {
send_sig(SIGXFSZ, current, 0);
write_len = iov_iter_count(from);
end = (pos + write_len - 1) >> PAGE_SHIFT;
- written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
- if (written)
- goto out;
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ /* If any pages exist in the range, bail out rather than wait for writeback */
+ if (filemap_range_has_page(inode->i_mapping, pos,
+ pos + write_len - 1))
+ return -EAGAIN;
+ } else {
+ written = filemap_write_and_wait_range(mapping, pos,
+ pos + write_len - 1);
+ if (written)
+ goto out;
+ }
/*
* After a write we want buffered reads to be sure to go to disk to get
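
The IOCB_NOWAIT branches above back the RWF_NOWAIT flag of preadv2()/pwritev2(): instead of waiting for writeback of cached pages in the target range, the call fails with -EAGAIN, and nowait writes that are not direct I/O are rejected with -EINVAL. A minimal userspace sketch, assuming RWF_NOWAIT is available (it is defined by hand below otherwise) and that 4096-byte alignment satisfies O_DIRECT on the underlying device:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_NOWAIT
#define RWF_NOWAIT 0x00000008
#endif

int main(void)
{
	void *buf;
	struct iovec iov;
	ssize_t ret;
	int fd = open("testfile", O_RDWR | O_DIRECT);

	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;
	memset(buf, 0, 4096);
	iov.iov_base = buf;
	iov.iov_len = 4096;

	/* Fails with EAGAIN instead of blocking if the range has cached pages. */
	ret = preadv2(fd, &iov, 1, 0, RWF_NOWAIT);
	if (ret < 0 && errno == EAGAIN)
		fprintf(stderr, "read would block, try again later\n");

	close(fd);
	free(buf);
	return 0;
}
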
#include <uapi/linux/memfd.h>
#include <linux/userfaultfd_k.h>
#include <linux/rmap.h>
+#include <linux/uuid.h>
#include <linux/uaccess.h>
#include <asm/pgtable.h>
* entry unconditionally - even if something else had already woken the
* target.
*/
- static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+ static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
int ret = default_wake_function(wait, mode, sync, key);
- list_del_init(&wait->task_list);
+ list_del_init(&wait->entry);
return ret;
}
spin_lock(&inode->i_lock);
inode->i_private = NULL;
wake_up_all(&shmem_falloc_waitq);
- WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list));
+ WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
spin_unlock(&inode->i_lock);
error = 0;
goto out;
#ifdef CONFIG_TMPFS_POSIX_ACL
sb->s_flags |= MS_POSIXACL;
#endif
+ uuid_gen(&sb->s_uuid);
inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
if (!inode)