blk-throttle: fix unused variable warning with BLK_DEV_THROTTLING_LOW=n

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 6e1c29860eecf3c69ee9bd538563949beeb40db1..b78db2e5fdff1e158ea52c179313ff3eba282015 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -28,6 +28,8 @@ static int throtl_quantum = 32;
 /* default latency target is 0, eg, guarantee IO latency by default */
 #define DFL_LATENCY_TARGET (0)
 
+/* flag bit in a bio's issue stat: skip latency sampling for this bio */
+#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)
+
 static struct blkcg_policy blkcg_policy_throtl;
 
 /* A workqueue to queue throttle related work */
@@ -163,6 +165,23 @@ struct throtl_grp {
        unsigned long checked_last_finish_time; /* ns / 1024 */
        unsigned long avg_idletime; /* ns / 1024 */
        unsigned long idletime_threshold; /* us */
+
+       unsigned int bio_cnt; /* total bios */
+       unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
+       unsigned long bio_cnt_reset_time;
+};
+
+/* We measure latency for request size from <= 4k to >= 1M */
+#define LATENCY_BUCKET_SIZE 9
+
+struct latency_bucket {
+       unsigned long total_latency; /* ns / 1024 */
+       int samples;
+};
+
+struct avg_latency_bucket {
+       unsigned long latency; /* ns / 1024 */
+       bool valid;
 };
 
 struct throtl_data
@@ -188,6 +207,13 @@ struct throtl_data
        unsigned long low_downgrade_time;
 
        unsigned int scale;
+
+       struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
+       struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
+       struct latency_bucket __percpu *latency_buckets;
+       unsigned long last_calculate_time;
+
+       bool track_bio_latency;
 };
 
 static void throtl_pending_timer_fn(unsigned long arg);
@@ -306,6 +332,9 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
        return ret;
 }
 
+/* map IO size (in sectors) to a latency bucket: <= 4k -> 0, one per doubling, >= 1M -> 8 */
+#define request_bucket_index(sectors) \
+	clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1)
+
 /**
  * throtl_log - log debug message via blktrace
  * @sq: the service_queue being reported
@@ -1695,12 +1724,15 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg)
         * - single idle is too long, longer than a fixed value (in case user
         *   configure a too big threshold) or 4 times of slice
         * - average think time is more than threshold
+	 * - IO latency is largely below threshold (fewer than 20% of bios
+	 *   exceed their latency threshold)
         */
        unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice);
 
        time = min_t(unsigned long, MAX_IDLE_TIME, time);
        return (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
-              tg->avg_idletime > tg->idletime_threshold;
+              tg->avg_idletime > tg->idletime_threshold ||
+              (tg->latency_target && tg->bio_cnt &&
+               tg->bad_bio_cnt * 5 < tg->bio_cnt);
 }
 
 static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
@@ -1931,6 +1963,87 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
        tg->checked_last_finish_time = last_finish_time;
 }
 
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static void throtl_update_latency_buckets(struct throtl_data *td)
+{
+       struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
+       int i, cpu;
+       unsigned long last_latency = 0;
+       unsigned long latency;
+
+       if (!blk_queue_nonrot(td->queue))
+               return;
+       if (time_before(jiffies, td->last_calculate_time + HZ))
+               return;
+       td->last_calculate_time = jiffies;
+
+       memset(avg_latency, 0, sizeof(avg_latency));
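+	/*
+	 * Fold the per-cpu samples into tmp_buckets; a bucket only produces
+	 * a new average once it has accumulated at least 32 samples.
+	 */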
+       for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+               struct latency_bucket *tmp = &td->tmp_buckets[i];
+
+               for_each_possible_cpu(cpu) {
+                       struct latency_bucket *bucket;
+
+                       /* this isn't race free, but ok in practice */
+                       bucket = per_cpu_ptr(td->latency_buckets, cpu);
+                       tmp->total_latency += bucket[i].total_latency;
+                       tmp->samples += bucket[i].samples;
+                       bucket[i].total_latency = 0;
+                       bucket[i].samples = 0;
+               }
+
+               if (tmp->samples >= 32) {
+                       int samples = tmp->samples;
+
+                       latency = tmp->total_latency;
+
+                       tmp->total_latency = 0;
+                       tmp->samples = 0;
+                       latency /= samples;
+                       if (latency == 0)
+                               continue;
+                       avg_latency[i].latency = latency;
+               }
+       }
+
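+	/*
+	 * Update the running per-bucket averages: new samples are blended in
+	 * with an EWMA (7/8 old + 1/8 new) and the result is kept
+	 * monotonically non-decreasing with request size.
+	 */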
+       for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+               if (!avg_latency[i].latency) {
+                       if (td->avg_buckets[i].latency < last_latency)
+                               td->avg_buckets[i].latency = last_latency;
+                       continue;
+               }
+
+               if (!td->avg_buckets[i].valid)
+                       latency = avg_latency[i].latency;
+               else
+                       latency = (td->avg_buckets[i].latency * 7 +
+                               avg_latency[i].latency) >> 3;
+
+               td->avg_buckets[i].latency = max(latency, last_latency);
+               td->avg_buckets[i].valid = true;
+               last_latency = td->avg_buckets[i].latency;
+       }
+}
+#else
+static inline void throtl_update_latency_buckets(struct throtl_data *td)
+{
+}
+#endif
+
+static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
+{
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+       int ret;
+
+       ret = bio_associate_current(bio);
+       if (ret == 0 || ret == -EBUSY)
+               bio->bi_cg_private = tg;
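+	/* stamp issue time and size into the bio so completion can compute its latency */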
+       blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
+#else
+       bio_associate_current(bio);
+#endif
+}
+
 bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
                    struct bio *bio)
 {
@@ -1939,7 +2052,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
        struct throtl_service_queue *sq;
        bool rw = bio_data_dir(bio);
        bool throttled = false;
-       int ret;
+       struct throtl_data *td = tg->td;
 
        WARN_ON_ONCE(!rcu_read_lock_held());
 
@@ -1949,14 +2062,12 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 
        spin_lock_irq(q->queue_lock);
 
+       throtl_update_latency_buckets(td);
+
        if (unlikely(blk_queue_bypass(q)))
                goto out_unlock;
 
-       ret = bio_associate_current(bio);
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-       if (ret == 0 || ret == -EBUSY)
-               bio->bi_cg_private = tg;
-#endif
+       blk_throtl_assoc_bio(tg, bio);
        blk_throtl_update_idletime(tg);
 
        sq = &tg->service_queue;
@@ -1974,8 +2085,8 @@ again:
                /* if above limits, break to queue */
                if (!tg_may_dispatch(tg, bio, NULL)) {
                        tg->last_low_overflow_time[rw] = jiffies;
-                       if (throtl_can_upgrade(tg->td, tg)) {
-                               throtl_upgrade_state(tg->td);
+                       if (throtl_can_upgrade(td, tg)) {
+                               throtl_upgrade_state(td);
                                goto again;
                        }
                        break;
@@ -2019,7 +2130,7 @@ again:
 
        tg->last_low_overflow_time[rw] = jiffies;
 
-       tg->td->nr_queued[rw]++;
+       td->nr_queued[rw]++;
        throtl_add_bio_tg(bio, qn, tg);
        throttled = true;
 
@@ -2044,20 +2155,91 @@ out:
         */
        if (!throttled)
                bio_clear_flag(bio, BIO_THROTTLED);
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+       if (throttled || !td->track_bio_latency)
+               bio->bi_issue_stat.stat |= SKIP_LATENCY;
+#endif
        return throttled;
 }
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static void throtl_track_latency(struct throtl_data *td, sector_t size,
+       int op, unsigned long time)
+{
+       struct latency_bucket *latency;
+       int index;
+
+       if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
+           !blk_queue_nonrot(td->queue))
+               return;
+
+       index = request_bucket_index(size);
+
+       latency = get_cpu_ptr(td->latency_buckets);
+       latency[index].total_latency += time;
+       latency[index].samples++;
+       put_cpu_ptr(td->latency_buckets);
+}
+
+void blk_throtl_stat_add(struct request *rq, u64 time_ns)
+{
+       struct request_queue *q = rq->q;
+       struct throtl_data *td = q->td;
+
+       throtl_track_latency(td, blk_stat_size(&rq->issue_stat),
+               req_op(rq), time_ns >> 10);
+}
+
 void blk_throtl_bio_endio(struct bio *bio)
 {
        struct throtl_grp *tg;
+       u64 finish_time_ns;
+       unsigned long finish_time;
+       unsigned long start_time;
+       unsigned long lat;
 
        tg = bio->bi_cg_private;
        if (!tg)
                return;
        bio->bi_cg_private = NULL;
 
-       tg->last_finish_time = ktime_get_ns() >> 10;
+       finish_time_ns = ktime_get_ns();
+       tg->last_finish_time = finish_time_ns >> 10;
+
+       start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
+       finish_time = __blk_stat_time(finish_time_ns) >> 10;
+       if (!start_time || finish_time <= start_time)
+               return;
+
+       lat = finish_time - start_time;
+	/* latency is only sampled here for bio-based drivers */
+       if (!(bio->bi_issue_stat.stat & SKIP_LATENCY))
+               throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
+                       bio_op(bio), lat);
+
+       if (tg->latency_target) {
+               int bucket;
+               unsigned int threshold;
+
+               bucket = request_bucket_index(
+                       blk_stat_size(&bio->bi_issue_stat));
+               threshold = tg->td->avg_buckets[bucket].latency +
+                       tg->latency_target;
+               if (lat > threshold)
+                       tg->bad_bio_cnt++;
+               /*
+		 * Not race free: the counts can end up slightly off, which only
+		 * means the cgroup gets throttled a bit more or less aggressively.
+                */
+               tg->bio_cnt++;
+       }
+
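+	/* age the counters: halve them when the reset window (one throtl_slice) expires or bio_cnt tops 1024 */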
+       if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) {
+               tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies;
+               tg->bio_cnt /= 2;
+               tg->bad_bio_cnt /= 2;
+       }
 }
 #endif
 
@@ -2133,6 +2315,12 @@ int blk_throtl_init(struct request_queue *q)
        td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
        if (!td)
                return -ENOMEM;
+       td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
+               LATENCY_BUCKET_SIZE, __alignof__(u64));
+       if (!td->latency_buckets) {
+               kfree(td);
+               return -ENOMEM;
+       }
 
        INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
        throtl_service_queue_init(&td->service_queue);
@@ -2147,8 +2335,10 @@ int blk_throtl_init(struct request_queue *q)
 
        /* activate policy */
        ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
-       if (ret)
+       if (ret) {
+               free_percpu(td->latency_buckets);
                kfree(td);
+       }
        return ret;
 }
 
@@ -2157,6 +2347,7 @@ void blk_throtl_exit(struct request_queue *q)
        BUG_ON(!q->td);
        throtl_shutdown_wq(q);
        blkcg_deactivate_policy(q, &blkcg_policy_throtl);
+       free_percpu(q->td->latency_buckets);
        kfree(q->td);
 }
 
@@ -2181,6 +2372,10 @@ void blk_throtl_register_queue(struct request_queue *q)
        td->throtl_slice = DFL_THROTL_SLICE_HD;
 #endif
 
+       td->track_bio_latency = !q->mq_ops && !q->request_fn;
+       if (!td->track_bio_latency)
+               blk_stat_enable_accounting(q);
+
        /*
         * some tg are created before queue is fully initialized, eg, nonrot
         * isn't initialized yet