/*
 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
 * scalable techniques.
 *
 * Copyright (C) 2017 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/module.h>
#include <linux/sbitmap.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-stat.h"

/* Scheduling domains. */
enum {
        KYBER_READ,
        KYBER_SYNC_WRITE,
        KYBER_OTHER, /* Async writes, discard, etc. */
        KYBER_NUM_DOMAINS,
};

enum {
        KYBER_MIN_DEPTH = 256,

        /*
         * In order to prevent starvation of synchronous requests by a flood of
         * asynchronous requests, we reserve 25% of requests for synchronous
         * operations.
         */
        KYBER_ASYNC_PERCENT = 75,
};

/*
 * Initial device-wide depths for each scheduling domain.
 *
 * Even for fast devices with lots of tags like NVMe, you can saturate
 * the device with only a fraction of the maximum possible queue depth.
 * So, we cap these to a reasonable value.
 */
static const unsigned int kyber_depth[] = {
        [KYBER_READ] = 256,
        [KYBER_SYNC_WRITE] = 128,
        [KYBER_OTHER] = 64,
};

/*
 * Scheduling domain batch sizes. We favor reads.
 */
static const unsigned int kyber_batch_size[] = {
        [KYBER_READ] = 16,
        [KYBER_SYNC_WRITE] = 8,
        [KYBER_OTHER] = 8,
};

struct kyber_queue_data {
        struct request_queue *q;

        struct blk_stat_callback *cb;

        /*
         * The device is divided into multiple scheduling domains based on the
         * request type. Each domain has a fixed number of in-flight requests of
         * that type device-wide, limited by these tokens.
         */
        struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];

        /*
         * Async request percentage, converted to per-word depth for
         * sbitmap_get_shallow().
         */
        unsigned int async_depth;

        /* Target latencies in nanoseconds. */
        u64 read_lat_nsec, write_lat_nsec;
};

struct kyber_hctx_data {
        spinlock_t lock;
        struct list_head rqs[KYBER_NUM_DOMAINS];
        unsigned int cur_domain;
        unsigned int batching;
        wait_queue_t domain_wait[KYBER_NUM_DOMAINS];
        atomic_t wait_index[KYBER_NUM_DOMAINS];
};

static int rq_sched_domain(const struct request *rq)
{
        unsigned int op = rq->cmd_flags;

        if ((op & REQ_OP_MASK) == REQ_OP_READ)
                return KYBER_READ;
        else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
                return KYBER_SYNC_WRITE;
        else
                return KYBER_OTHER;
}
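
/*
 * Informal example of the classification above: a write marked synchronous
 * (e.g. typically an O_DIRECT write carrying REQ_SYNC) lands in
 * KYBER_SYNC_WRITE, while ordinary buffered writeback, discards, and other
 * non-read, non-sync operations fall into KYBER_OTHER.
 */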

enum {
        NONE = 0,
        GOOD = 1,
        GREAT = 2,
        BAD = -1,
        AWFUL = -2,
};

#define IS_GOOD(status) ((status) > 0)
#define IS_BAD(status) ((status) < 0)

static int kyber_lat_status(struct blk_stat_callback *cb,
                            unsigned int sched_domain, u64 target)
{
        u64 latency;

        if (!cb->stat[sched_domain].nr_samples)
                return NONE;

        latency = cb->stat[sched_domain].mean;
        if (latency >= 2 * target)
                return AWFUL;
        else if (latency > target)
                return BAD;
        else if (latency <= target / 2)
                return GREAT;
        else /* (latency <= target) */
                return GOOD;
}
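
/*
 * Worked example (hypothetical samples): with the default 2,000,000 ns read
 * target, a mean read latency of 900,000 ns (<= target / 2) is GREAT,
 * 1,500,000 ns is GOOD, 2,500,000 ns is BAD, and 4,000,000 ns (>= 2 * target)
 * is AWFUL.
 */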

/*
 * Adjust the read or synchronous write depth given the status of reads and
 * writes. The goal is that the latencies of the two domains are fair (i.e., if
 * one is good, then the other is good).
 */
static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
                                  unsigned int sched_domain, int this_status,
                                  int other_status)
{
        unsigned int orig_depth, depth;

        /*
         * If this domain had no samples, or reads and writes are both good or
         * both bad, don't adjust the depth.
         */
        if (this_status == NONE ||
            (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
            (IS_BAD(this_status) && IS_BAD(other_status)))
                return;

        orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;

        if (other_status == NONE) {
                depth++;
        } else {
                switch (this_status) {
                case GOOD:
                        if (other_status == AWFUL)
                                depth -= max(depth / 4, 1U);
                        else
                                depth -= max(depth / 8, 1U);
                        break;
                case GREAT:
                        if (other_status == AWFUL)
                                depth /= 2;
                        else
                                depth -= max(depth / 4, 1U);
                        break;
                case BAD:
                        depth++;
                        break;
                case AWFUL:
                        if (other_status == GREAT)
                                depth += 2;
                        else
                                depth++;
                        break;
                }
        }

        depth = clamp(depth, 1U, kyber_depth[sched_domain]);
        if (depth != orig_depth)
                sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
}
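
/*
 * Illustration (hypothetical state): if reads are GREAT but sync writes are
 * AWFUL, the read depth is halved (e.g. 256 -> 128 from the default), and in
 * the matching call for the write domain the sync write depth grows by 2,
 * nudging the two domains toward comparable latencies.
 */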

/*
 * Adjust the depth of other requests given the status of reads and synchronous
 * writes. As long as either domain is doing fine, we don't throttle, but if
 * both domains are doing badly, we throttle heavily.
 */
static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
                                     int read_status, int write_status,
                                     bool have_samples)
{
        unsigned int orig_depth, depth;
        int status;

        orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;

        if (read_status == NONE && write_status == NONE) {
                depth += 2;
        } else if (have_samples) {
                if (read_status == NONE)
                        status = write_status;
                else if (write_status == NONE)
                        status = read_status;
                else
                        status = max(read_status, write_status);
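                /*
                 * max() picks the better of the two statuses, so the "other"
                 * domain is only throttled when both reads and sync writes
                 * are suffering; e.g. reads GOOD and writes AWFUL still
                 * yields GOOD, and the depth grows below.
                 */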
                switch (status) {
                case GREAT:
                        depth += 2;
                        break;
                case GOOD:
                        depth++;
                        break;
                case BAD:
                        depth -= max(depth / 4, 1U);
                        break;
                case AWFUL:
                        depth /= 2;
                        break;
                }
        }

        depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
        if (depth != orig_depth)
                sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
}

/*
 * Apply heuristics for limiting queue depths based on gathered latency
 * statistics.
 */
static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
{
        struct kyber_queue_data *kqd = cb->data;
        int read_status, write_status;

        read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
        write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);

        kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
        kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
        kyber_adjust_other_depth(kqd, read_status, write_status,
                                 cb->stat[KYBER_OTHER].nr_samples != 0);

        /*
         * Continue monitoring latencies if we aren't hitting the targets or
         * we're still throttling other requests.
         */
        if (!blk_stat_is_active(kqd->cb) &&
            ((IS_BAD(read_status) || IS_BAD(write_status) ||
              kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
                blk_stat_activate_msecs(kqd->cb, 100);
}

static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
{
        /*
         * All of the hardware queues have the same depth, so we can just grab
         * the shift of the first one.
         */
        return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
}

static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
{
        struct kyber_queue_data *kqd;
        unsigned int max_tokens;
        unsigned int shift;
        int ret = -ENOMEM;
        int i;

        kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
        if (!kqd)
                goto err;
        kqd->q = q;

        kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain,
                                          KYBER_NUM_DOMAINS, kqd);
        if (!kqd->cb)
                goto err_kqd;

        /*
         * The maximum number of tokens for any scheduling domain is at least
         * the queue depth of a single hardware queue. If the hardware doesn't
         * have many tags, still provide a reasonable number.
         */
        max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
                           KYBER_MIN_DEPTH);
        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                WARN_ON(!kyber_depth[i]);
                WARN_ON(!kyber_batch_size[i]);
                ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
                                              max_tokens, -1, false, GFP_KERNEL,
                                              q->node);
                if (ret) {
                        while (--i >= 0)
                                sbitmap_queue_free(&kqd->domain_tokens[i]);
                        goto err_cb;
                }
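                /*
                 * The bitmap was sized for max_tokens above; shrink it to the
                 * initial per-domain depth now. Later calls to
                 * sbitmap_queue_resize() can then grow it back up to
                 * kyber_depth[i] within the existing allocation.
                 */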
                sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
        }

        shift = kyber_sched_tags_shift(kqd);
        kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
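        /*
         * Worked example (hypothetical shift): if the sched_tags sbitmap uses
         * full 64-bit words (shift == 6), async_depth = 64 * 75 / 100 = 48,
         * so asynchronous requests may consume at most 48 of the 64 tags in
         * each word, leaving the rest for synchronous requests.
         */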

        kqd->read_lat_nsec = 2000000ULL;
        kqd->write_lat_nsec = 10000000ULL;
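        /*
         * These defaults (2 ms for reads, 10 ms for synchronous writes) can
         * be changed at runtime via the read_lat_nsec and write_lat_nsec
         * sysfs attributes defined below.
         */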

        return kqd;

err_cb:
        blk_stat_free_callback(kqd->cb);
err_kqd:
        kfree(kqd);
err:
        return ERR_PTR(ret);
}

static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
{
        struct kyber_queue_data *kqd;
        struct elevator_queue *eq;

        eq = elevator_alloc(q, e);
        if (!eq)
                return -ENOMEM;

        kqd = kyber_queue_data_alloc(q);
        if (IS_ERR(kqd)) {
                kobject_put(&eq->kobj);
                return PTR_ERR(kqd);
        }

        eq->elevator_data = kqd;
        q->elevator = eq;

        blk_stat_add_callback(q, kqd->cb);

        return 0;
}

static void kyber_exit_sched(struct elevator_queue *e)
{
        struct kyber_queue_data *kqd = e->elevator_data;
        struct request_queue *q = kqd->q;
        int i;

        blk_stat_remove_callback(q, kqd->cb);

        for (i = 0; i < KYBER_NUM_DOMAINS; i++)
                sbitmap_queue_free(&kqd->domain_tokens[i]);
        blk_stat_free_callback(kqd->cb);
        kfree(kqd);
}

static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
        struct kyber_hctx_data *khd;
        int i;

        khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
        if (!khd)
                return -ENOMEM;

        spin_lock_init(&khd->lock);

        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                INIT_LIST_HEAD(&khd->rqs[i]);
                INIT_LIST_HEAD(&khd->domain_wait[i].task_list);
                atomic_set(&khd->wait_index[i], 0);
        }

        khd->cur_domain = 0;
        khd->batching = 0;

        hctx->sched_data = khd;

        return 0;
}

static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
        kfree(hctx->sched_data);
}

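/*
 * The allocated domain token is stashed in rq->elv.priv[0]; -1 means the
 * request does not currently hold a token (see kyber_get_request()).
 */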
static int rq_get_domain_token(struct request *rq)
{
        return (long)rq->elv.priv[0];
}

static void rq_set_domain_token(struct request *rq, int token)
{
        rq->elv.priv[0] = (void *)(long)token;
}

static void rq_clear_domain_token(struct kyber_queue_data *kqd,
                                  struct request *rq)
{
        unsigned int sched_domain;
        int nr;

        nr = rq_get_domain_token(rq);
        if (nr != -1) {
                sched_domain = rq_sched_domain(rq);
                sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
                                    rq->mq_ctx->cpu);
        }
}

static struct request *kyber_get_request(struct request_queue *q,
                                         unsigned int op,
                                         struct blk_mq_alloc_data *data)
{
        struct kyber_queue_data *kqd = q->elevator->elevator_data;
        struct request *rq;

        /*
         * We use the scheduler tags as per-hardware queue queueing tokens.
         * Async requests can be limited at this stage.
         */
        if (!op_is_sync(op))
                data->shallow_depth = kqd->async_depth;

        rq = __blk_mq_alloc_request(data, op);
        if (rq)
                rq_set_domain_token(rq, -1);
        return rq;
}

static void kyber_put_request(struct request *rq)
{
        struct request_queue *q = rq->q;
        struct kyber_queue_data *kqd = q->elevator->elevator_data;

        rq_clear_domain_token(kqd, rq);
        blk_mq_finish_request(rq);
}

static void kyber_completed_request(struct request *rq)
{
        struct request_queue *q = rq->q;
        struct kyber_queue_data *kqd = q->elevator->elevator_data;
        unsigned int sched_domain;
        u64 now, latency, target;

        /*
         * Check if this request met our latency goal. If not, quickly gather
         * some statistics and start throttling.
         */
        sched_domain = rq_sched_domain(rq);
        switch (sched_domain) {
        case KYBER_READ:
                target = kqd->read_lat_nsec;
                break;
        case KYBER_SYNC_WRITE:
                target = kqd->write_lat_nsec;
                break;
        default:
                return;
        }

        /* If we are already monitoring latencies, don't check again. */
        if (blk_stat_is_active(kqd->cb))
                return;

        now = __blk_stat_time(ktime_to_ns(ktime_get()));
        if (now < blk_stat_time(&rq->issue_stat))
                return;

        latency = now - blk_stat_time(&rq->issue_stat);

        if (latency > target)
                blk_stat_activate_msecs(kqd->cb, 10);
}

static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd,
                                  struct blk_mq_hw_ctx *hctx)
{
        LIST_HEAD(rq_list);
        struct request *rq, *next;

        blk_mq_flush_busy_ctxs(hctx, &rq_list);
        list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
                unsigned int sched_domain;

                sched_domain = rq_sched_domain(rq);
                list_move_tail(&rq->queuelist, &khd->rqs[sched_domain]);
        }
}

static int kyber_domain_wake(wait_queue_t *wait, unsigned mode, int flags,
                             void *key)
{
        struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);

        list_del_init(&wait->task_list);
        blk_mq_run_hw_queue(hctx, true);
        return 1;
}

static int kyber_get_domain_token(struct kyber_queue_data *kqd,
                                  struct kyber_hctx_data *khd,
                                  struct blk_mq_hw_ctx *hctx)
{
        unsigned int sched_domain = khd->cur_domain;
        struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
        wait_queue_t *wait = &khd->domain_wait[sched_domain];
        struct sbq_wait_state *ws;
        int nr;

        nr = __sbitmap_queue_get(domain_tokens);
        if (nr >= 0)
                return nr;

        /*
         * If we failed to get a domain token, make sure the hardware queue is
         * run when one becomes available. Note that this is serialized on
         * khd->lock, but we still need to be careful about the waker.
         */
        if (list_empty_careful(&wait->task_list)) {
                init_waitqueue_func_entry(wait, kyber_domain_wake);
                wait->private = hctx;
                ws = sbq_wait_ptr(domain_tokens,
                                  &khd->wait_index[sched_domain]);
                add_wait_queue(&ws->wait, wait);

                /*
                 * Try again in case a token was freed before we got on the wait
                 * queue.
                 */
                nr = __sbitmap_queue_get(domain_tokens);
        }
        return nr;
}

static struct request *
kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
                          struct kyber_hctx_data *khd,
                          struct blk_mq_hw_ctx *hctx,
                          bool *flushed)
{
        struct list_head *rqs;
        struct request *rq;
        int nr;

        rqs = &khd->rqs[khd->cur_domain];
        rq = list_first_entry_or_null(rqs, struct request, queuelist);

        /*
         * If there wasn't already a pending request and we haven't flushed the
         * software queues yet, flush the software queues and check again.
         */
        if (!rq && !*flushed) {
                kyber_flush_busy_ctxs(khd, hctx);
                *flushed = true;
                rq = list_first_entry_or_null(rqs, struct request, queuelist);
        }

        if (rq) {
                nr = kyber_get_domain_token(kqd, khd, hctx);
                if (nr >= 0) {
                        khd->batching++;
                        rq_set_domain_token(rq, nr);
                        list_del_init(&rq->queuelist);
                        return rq;
                }
        }

        /* There were either no pending requests or no tokens. */
        return NULL;
}

static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
        struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
        struct kyber_hctx_data *khd = hctx->sched_data;
        bool flushed = false;
        struct request *rq;
        int i;

        spin_lock(&khd->lock);

        /*
         * First, if we are still entitled to batch, try to dispatch a request
         * from the batch.
         */
        if (khd->batching < kyber_batch_size[khd->cur_domain]) {
                rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
                if (rq)
                        goto out;
        }

        /*
         * Either,
         * 1. We were no longer entitled to a batch.
         * 2. The domain we were batching didn't have any requests.
         * 3. The domain we were batching was out of tokens.
         *
         * Start another batch. Note that this wraps back around to the original
         * domain if no other domains have requests or tokens.
         */
        khd->batching = 0;
        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
                        khd->cur_domain = 0;
                else
                        khd->cur_domain++;

                rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
                if (rq)
                        goto out;
        }

        rq = NULL;
out:
        spin_unlock(&khd->lock);
        return rq;
}

static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
{
        struct kyber_hctx_data *khd = hctx->sched_data;
        int i;

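        /*
         * Peek at the per-domain lists without taking khd->lock
         * (list_empty_careful() is used for the unlocked check).
         */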
        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                if (!list_empty_careful(&khd->rqs[i]))
                        return true;
        }
        return false;
}

#define KYBER_LAT_SHOW_STORE(op)                                        \
static ssize_t kyber_##op##_lat_show(struct elevator_queue *e,          \
                                     char *page)                        \
{                                                                       \
        struct kyber_queue_data *kqd = e->elevator_data;                \
                                                                        \
        return sprintf(page, "%llu\n", kqd->op##_lat_nsec);             \
}                                                                       \
                                                                        \
static ssize_t kyber_##op##_lat_store(struct elevator_queue *e,         \
                                      const char *page, size_t count)   \
{                                                                       \
        struct kyber_queue_data *kqd = e->elevator_data;                \
        unsigned long long nsec;                                        \
        int ret;                                                        \
                                                                        \
        ret = kstrtoull(page, 10, &nsec);                               \
        if (ret)                                                        \
                return ret;                                             \
                                                                        \
        kqd->op##_lat_nsec = nsec;                                      \
                                                                        \
        return count;                                                   \
}
KYBER_LAT_SHOW_STORE(read);
KYBER_LAT_SHOW_STORE(write);
#undef KYBER_LAT_SHOW_STORE

#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
static struct elv_fs_entry kyber_sched_attrs[] = {
        KYBER_LAT_ATTR(read),
        KYBER_LAT_ATTR(write),
        __ATTR_NULL
};
#undef KYBER_LAT_ATTR

static struct elevator_type kyber_sched = {
        .ops.mq = {
                .init_sched = kyber_init_sched,
                .exit_sched = kyber_exit_sched,
                .init_hctx = kyber_init_hctx,
                .exit_hctx = kyber_exit_hctx,
                .get_request = kyber_get_request,
                .put_request = kyber_put_request,
                .completed_request = kyber_completed_request,
                .dispatch_request = kyber_dispatch_request,
                .has_work = kyber_has_work,
        },
        .uses_mq = true,
        .elevator_attrs = kyber_sched_attrs,
        .elevator_name = "kyber",
        .elevator_owner = THIS_MODULE,
};

static int __init kyber_init(void)
{
        return elv_register(&kyber_sched);
}

static void __exit kyber_exit(void)
{
        elv_unregister(&kyber_sched);
}

module_init(kyber_init);
module_exit(kyber_exit);

MODULE_AUTHOR("Omar Sandoval");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Kyber I/O scheduler");