block/blk-mq-sched.c

   1 /*
   2  * blk-mq scheduling framework
   3  *
   4  * Copyright (C) 2016 Jens Axboe
   5  */
   6 #include <linux/kernel.h>
   7 #include <linux/module.h>
   8 #include <linux/blk-mq.h>
   9
  10 #include <trace/events/block.h>
  11
  12 #include "blk.h"
  13 #include "blk-mq.h"
  14 #include "blk-mq-sched.h"
  15 #include "blk-mq-tag.h"
  16 #include "blk-wbt.h"
  17
  18 void blk_mq_sched_free_hctx_data(struct request_queue *q,
  19                                  void (*exit)(struct blk_mq_hw_ctx *))
  20 {
  21         struct blk_mq_hw_ctx *hctx;
  22         int i;
  23
  24         queue_for_each_hw_ctx(q, hctx, i) {
  25                 if (exit && hctx->sched_data)
  26                         exit(hctx);
  27                 kfree(hctx->sched_data);
  28                 hctx->sched_data = NULL;
  29         }
  30 }
  31 EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
  32
  33 static void __blk_mq_sched_assign_ioc(struct request_queue *q,
  34                                       struct request *rq,
  35                                       struct bio *bio,
  36                                       struct io_context *ioc)
  37 {
  38         struct io_cq *icq;
  39
  40         spin_lock_irq(q->queue_lock);
  41         icq = ioc_lookup_icq(ioc, q);
  42         spin_unlock_irq(q->queue_lock);
  43
  44         if (!icq) {
  45                 icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
  46                 if (!icq)
  47                         return;
  48         }
  49
  50         rq->elv.icq = icq;
  51         if (!blk_mq_sched_get_rq_priv(q, rq, bio)) {
  52                 rq->rq_flags |= RQF_ELVPRIV;
  53                 get_io_context(icq->ioc);
  54                 return;
  55         }
  56
  57         rq->elv.icq = NULL;
  58 }
  59
  60 static void blk_mq_sched_assign_ioc(struct request_queue *q,
  61                                     struct request *rq, struct bio *bio)
  62 {
  63         struct io_context *ioc;
  64
  65         ioc = rq_ioc(bio);
  66         if (ioc)
  67                 __blk_mq_sched_assign_ioc(q, rq, bio, ioc);
  68 }
  69
  70 struct request *blk_mq_sched_get_request(struct request_queue *q,
  71                                          struct bio *bio,
  72                                          unsigned int op,
  73                                          struct blk_mq_alloc_data *data)
  74 {
  75         struct elevator_queue *e = q->elevator;
  76         struct request *rq;
  77
  78         blk_queue_enter_live(q);
  79         data->q = q;
  80         if (likely(!data->ctx))
  81                 data->ctx = blk_mq_get_ctx(q);
  82         if (likely(!data->hctx))
  83                 data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
  84
  85         if (e) {
  86                 data->flags |= BLK_MQ_REQ_INTERNAL;
  87
  88                 /*
  89                  * Flush requests are special and go directly to the
  90                  * dispatch list.
  91                  */
  92                 if (!op_is_flush(op) && e->type->ops.mq.get_request) {
  93                         rq = e->type->ops.mq.get_request(q, op, data);
  94                         if (rq)
  95                                 rq->rq_flags |= RQF_QUEUED;
  96                 } else
  97                         rq = __blk_mq_alloc_request(data, op);
  98         } else {
  99                 rq = __blk_mq_alloc_request(data, op);
 100         }
 101
 102         if (rq) {
 103                 if (!op_is_flush(op)) {
 104                         rq->elv.icq = NULL;
 105                         if (e && e->type->icq_cache)
 106                                 blk_mq_sched_assign_ioc(q, rq, bio);
 107                 }
 108                 data->hctx->queued++;
 109                 return rq;
 110         }
 111
 112         blk_queue_exit(q);
 113         return NULL;
 114 }
 115
 116 void blk_mq_sched_put_request(struct request *rq)
 117 {
 118         struct request_queue *q = rq->q;
 119         struct elevator_queue *e = q->elevator;
 120
 121         if (rq->rq_flags & RQF_ELVPRIV) {
 122                 blk_mq_sched_put_rq_priv(rq->q, rq);
 123                 if (rq->elv.icq) {
 124                         put_io_context(rq->elv.icq->ioc);
 125                         rq->elv.icq = NULL;
 126                 }
 127         }
 128
 129         if ((rq->rq_flags & RQF_QUEUED) && e && e->type->ops.mq.put_request)
 130                 e->type->ops.mq.put_request(rq);
 131         else
 132                 blk_mq_finish_request(rq);
 133 }
 134
 135 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 136 {
 137         struct request_queue *q = hctx->queue;
 138         struct elevator_queue *e = q->elevator;
 139         const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
 140         bool did_work = false;
 141         LIST_HEAD(rq_list);
 142
 143         if (unlikely(blk_mq_hctx_stopped(hctx)))
 144                 return;
 145
 146         hctx->run++;
 147
 148         /*
 149          * If we have previous entries on our dispatch list, grab them first for
 150          * more fair dispatch.
 151          */
 152         if (!list_empty_careful(&hctx->dispatch)) {
 153                 spin_lock(&hctx->lock);
 154                 if (!list_empty(&hctx->dispatch))
 155                         list_splice_init(&hctx->dispatch, &rq_list);
 156                 spin_unlock(&hctx->lock);
 157         }
 158
 159         /*
 160          * Only ask the scheduler for requests, if we didn't have residual
 161          * requests from the dispatch list. This is to avoid the case where
 162          * we only ever dispatch a fraction of the requests available because
 163          * of low device queue depth. Once we pull requests out of the IO
 164          * scheduler, we can no longer merge or sort them. So it's best to
 165          * leave them there for as long as we can. Mark the hw queue as
 166          * needing a restart in that case.
 167          */
 168         if (!list_empty(&rq_list)) {
 169                 blk_mq_sched_mark_restart_hctx(hctx);
 170                 did_work = blk_mq_dispatch_rq_list(q, &rq_list);
 171         } else if (!has_sched_dispatch) {
 172                 blk_mq_flush_busy_ctxs(hctx, &rq_list);
 173                 blk_mq_dispatch_rq_list(q, &rq_list);
 174         }
 175
 176         /*
 177          * We want to dispatch from the scheduler if we had no work left
 178          * on the dispatch list, OR if we did have work but weren't able
 179          * to make progress.
 180          */
 181         if (!did_work && has_sched_dispatch) {
 182                 do {
 183                         struct request *rq;
 184
 185                         rq = e->type->ops.mq.dispatch_request(hctx);
 186                         if (!rq)
 187                                 break;
 188                         list_add(&rq->queuelist, &rq_list);
 189                 } while (blk_mq_dispatch_rq_list(q, &rq_list));
 190         }
 191 }
 192
 193 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
 194                             struct request **merged_request)
 195 {
 196         struct request *rq;
 197
 198         switch (elv_merge(q, &rq, bio)) {
 199         case ELEVATOR_BACK_MERGE:
 200                 if (!blk_mq_sched_allow_merge(q, rq, bio))
 201                         return false;
 202                 if (!bio_attempt_back_merge(q, rq, bio))
 203                         return false;
 204                 *merged_request = attempt_back_merge(q, rq);
 205                 if (!*merged_request)
 206                         elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
 207                 return true;
 208         case ELEVATOR_FRONT_MERGE:
 209                 if (!blk_mq_sched_allow_merge(q, rq, bio))
 210                         return false;
 211                 if (!bio_attempt_front_merge(q, rq, bio))
 212                         return false;
 213                 *merged_request = attempt_front_merge(q, rq);
 214                 if (!*merged_request)
 215                         elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
 216                 return true;
 217         default:
 218                 return false;
 219         }
 220 }
 221 EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
 222
 223 bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 224 {
 225         struct elevator_queue *e = q->elevator;
 226
 227         if (e->type->ops.mq.bio_merge) {
 228                 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
 229                 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 230
 231                 blk_mq_put_ctx(ctx);
 232                 return e->type->ops.mq.bio_merge(hctx, bio);
 233         }
 234
 235         return false;
 236 }
 237
 238 bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
 239 {
 240         return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
 241 }
 242 EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
 243
 244 void blk_mq_sched_request_inserted(struct request *rq)
 245 {
 246         trace_block_rq_insert(rq->q, rq);
 247 }
 248 EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
 249
 250 static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
 251                                        struct request *rq)
 252 {
 253         if (rq->tag == -1) {
 254                 rq->rq_flags |= RQF_SORTED;
 255                 return false;
 256         }
 257
 258         /*
 259          * If we already have a real request tag, send directly to
 260          * the dispatch list.
 261          */
 262         spin_lock(&hctx->lock);
 263         list_add(&rq->queuelist, &hctx->dispatch);
 264         spin_unlock(&hctx->lock);
 265         return true;
 266 }
 267
 268 static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
 269 {
 270         if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
 271                 clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
 272                 if (blk_mq_hctx_has_pending(hctx)) {
 273                         blk_mq_run_hw_queue(hctx, true);
 274                         return true;
 275                 }
 276         }
 277         return false;
 278 }
 279
 280 /**
 281  * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
 282  * @pos:    loop cursor.
 283  * @skip:   the list element that will not be examined. Iteration starts at
 284  *          @skip->next.
 285  * @head:   head of the list to examine. This list must have at least one
 286  *          element, namely @skip.
 287  * @member: name of the list_head structure within typeof(*pos).
 288  */
 289 #define list_for_each_entry_rcu_rr(pos, skip, head, member)             \
 290         for ((pos) = (skip);                                            \
 291              (pos = (pos)->member.next != (head) ? list_entry_rcu(      \
 292                         (pos)->member.next, typeof(*pos), member) :     \
 293               list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \
 294              (pos) != (skip); )
 295
 296 /*
 297  * Called after a driver tag has been freed to check whether a hctx needs to
 298  * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware
 299  * queues in a round-robin fashion if the tag set of @hctx is shared with other
 300  * hardware queues.
 301  */
 302 void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
 303 {
 304         struct blk_mq_tags *const tags = hctx->tags;
 305         struct blk_mq_tag_set *const set = hctx->queue->tag_set;
 306         struct request_queue *const queue = hctx->queue, *q;
 307         struct blk_mq_hw_ctx *hctx2;
 308         unsigned int i, j;
 309
 310         if (set->flags & BLK_MQ_F_TAG_SHARED) {
 311                 rcu_read_lock();
 312                 list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
 313                                            tag_set_list) {
 314                         queue_for_each_hw_ctx(q, hctx2, i)
 315                                 if (hctx2->tags == tags &&
 316                                     blk_mq_sched_restart_hctx(hctx2))
 317                                         goto done;
 318                 }
 319                 j = hctx->queue_num + 1;
 320                 for (i = 0; i < queue->nr_hw_queues; i++, j++) {
 321                         if (j == queue->nr_hw_queues)
 322                                 j = 0;
 323                         hctx2 = queue->queue_hw_ctx[j];
 324                         if (hctx2->tags == tags &&
 325                             blk_mq_sched_restart_hctx(hctx2))
 326                                 break;
 327                 }
 328 done:
 329                 rcu_read_unlock();
 330         } else {
 331                 blk_mq_sched_restart_hctx(hctx);
 332         }
 333 }
 334
 335 /*
 336  * Add flush/fua to the queue. If we fail getting a driver tag, then
 337  * punt to the requeue list. Requeue will re-invoke us from a context
 338  * that's safe to block from.
 339  */
 340 static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx,
 341                                       struct request *rq, bool can_block)
 342 {
 343         if (blk_mq_get_driver_tag(rq, &hctx, can_block)) {
 344                 blk_insert_flush(rq);
 345                 blk_mq_run_hw_queue(hctx, true);
 346         } else
 347                 blk_mq_add_to_requeue_list(rq, false, true);
 348 }
 349
 350 void blk_mq_sched_insert_request(struct request *rq, bool at_head,
 351                                  bool run_queue, bool async, bool can_block)
 352 {
 353         struct request_queue *q = rq->q;
 354         struct elevator_queue *e = q->elevator;
 355         struct blk_mq_ctx *ctx = rq->mq_ctx;
 356         struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 357
 358         if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) {
 359                 blk_mq_sched_insert_flush(hctx, rq, can_block);
 360                 return;
 361         }
 362
 363         if (e && blk_mq_sched_bypass_insert(hctx, rq))
 364                 goto run;
 365
 366         if (e && e->type->ops.mq.insert_requests) {
 367                 LIST_HEAD(list);
 368
 369                 list_add(&rq->queuelist, &list);
 370                 e->type->ops.mq.insert_requests(hctx, &list, at_head);
 371         } else {
 372                 spin_lock(&ctx->lock);
 373                 __blk_mq_insert_request(hctx, rq, at_head);
 374                 spin_unlock(&ctx->lock);
 375         }
 376
 377 run:
 378         if (run_queue)
 379                 blk_mq_run_hw_queue(hctx, async);
 380 }
 381
 382 void blk_mq_sched_insert_requests(struct request_queue *q,
 383                                   struct blk_mq_ctx *ctx,
 384                                   struct list_head *list, bool run_queue_async)
 385 {
 386         struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 387         struct elevator_queue *e = hctx->queue->elevator;
 388
 389         if (e) {
 390                 struct request *rq, *next;
 391
 392                 /*
 393                  * We bypass requests that already have a driver tag assigned,
 394                  * which should only be flushes. Flushes are only ever inserted
 395                  * as single requests, so we shouldn't ever hit the
 396                  * WARN_ON_ONCE() below (but let's handle it just in case).
 397                  */
 398                 list_for_each_entry_safe(rq, next, list, queuelist) {
 399                         if (WARN_ON_ONCE(rq->tag != -1)) {
 400                                 list_del_init(&rq->queuelist);
 401                                 blk_mq_sched_bypass_insert(hctx, rq);
 402                         }
 403                 }
 404         }
 405
 406         if (e && e->type->ops.mq.insert_requests)
 407                 e->type->ops.mq.insert_requests(hctx, list, false);
 408         else
 409                 blk_mq_insert_requests(hctx, ctx, list);
 410
 411         blk_mq_run_hw_queue(hctx, run_queue_async);
 412 }
 413
 414 static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
 415                                    struct blk_mq_hw_ctx *hctx,
 416                                    unsigned int hctx_idx)
 417 {
 418         if (hctx->sched_tags) {
 419                 blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
 420                 blk_mq_free_rq_map(hctx->sched_tags);
 421                 hctx->sched_tags = NULL;
 422         }
 423 }
 424
 425 static int blk_mq_sched_alloc_tags(struct request_queue *q,
 426                                    struct blk_mq_hw_ctx *hctx,
 427                                    unsigned int hctx_idx)
 428 {
 429         struct blk_mq_tag_set *set = q->tag_set;
 430         int ret;
 431
 432         hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
 433                                                set->reserved_tags);
 434         if (!hctx->sched_tags)
 435                 return -ENOMEM;
 436
 437         ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
 438         if (ret)
 439                 blk_mq_sched_free_tags(set, hctx, hctx_idx);
 440
 441         return ret;
 442 }
 443
 444 static void blk_mq_sched_tags_teardown(struct request_queue *q)
 445 {
 446         struct blk_mq_tag_set *set = q->tag_set;
 447         struct blk_mq_hw_ctx *hctx;
 448         int i;
 449
 450         queue_for_each_hw_ctx(q, hctx, i)
 451                 blk_mq_sched_free_tags(set, hctx, i);
 452 }
 453
 454 int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
 455                            unsigned int hctx_idx)
 456 {
 457         struct elevator_queue *e = q->elevator;
 458         int ret;
 459
 460         if (!e)
 461                 return 0;
 462
 463         ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
 464         if (ret)
 465                 return ret;
 466
 467         if (e->type->ops.mq.init_hctx) {
 468                 ret = e->type->ops.mq.init_hctx(hctx, hctx_idx);
 469                 if (ret) {
 470                         blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
 471                         return ret;
 472                 }
 473         }
 474
 475         return 0;
 476 }
 477
 478 void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
 479                             unsigned int hctx_idx)
 480 {
 481         struct elevator_queue *e = q->elevator;
 482
 483         if (!e)
 484                 return;
 485
 486         if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
 487                 e->type->ops.mq.exit_hctx(hctx, hctx_idx);
 488                 hctx->sched_data = NULL;
 489         }
 490
 491         blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
 492 }
 493
 494 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 495 {
 496         struct blk_mq_hw_ctx *hctx;
 497         struct elevator_queue *eq;
 498         unsigned int i;
 499         int ret;
 500
 501         if (!e) {
 502                 q->elevator = NULL;
 503                 return 0;
 504         }
 505
 506         /*
 507          * Default to 256, since we don't split into sync/async like the
 508          * old code did. Additionally, this is a per-hw queue depth.
 509          */
 510         q->nr_requests = 2 * BLKDEV_MAX_RQ;
 511
 512         queue_for_each_hw_ctx(q, hctx, i) {
 513                 ret = blk_mq_sched_alloc_tags(q, hctx, i);
 514                 if (ret)
 515                         goto err;
 516         }
 517
 518         ret = e->ops.mq.init_sched(q, e);
 519         if (ret)
 520                 goto err;
 521
 522         if (e->ops.mq.init_hctx) {
 523                 queue_for_each_hw_ctx(q, hctx, i) {
 524                         ret = e->ops.mq.init_hctx(hctx, i);
 525                         if (ret) {
 526                                 eq = q->elevator;
 527                                 blk_mq_exit_sched(q, eq);
 528                                 kobject_put(&eq->kobj);
 529                                 return ret;
 530                         }
 531                 }
 532         }
 533
 534         return 0;
 535
 536 err:
 537         blk_mq_sched_tags_teardown(q);
 538         q->elevator = NULL;
 539         return ret;
 540 }
 541
 542 void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
 543 {
 544         struct blk_mq_hw_ctx *hctx;
 545         unsigned int i;
 546
 547         if (e->type->ops.mq.exit_hctx) {
 548                 queue_for_each_hw_ctx(q, hctx, i) {
 549                         if (hctx->sched_data) {
 550                                 e->type->ops.mq.exit_hctx(hctx, i);
 551                                 hctx->sched_data = NULL;
 552                         }
 553                 }
 554         }
 555         if (e->type->ops.mq.exit_sched)
 556                 e->type->ops.mq.exit_sched(e);
 557         blk_mq_sched_tags_teardown(q);
 558         q->elevator = NULL;
 559 }
 560
 561 int blk_mq_sched_init(struct request_queue *q)
 562 {
 563         int ret;
 564
 565         mutex_lock(&q->sysfs_lock);
 566         ret = elevator_init(q, NULL);
 567         mutex_unlock(&q->sysfs_lock);
 568
 569         return ret;
 570 }