blk-throttle: fix unused variable warning with BLK_DEV_THROTTLING_LOW=n

[karo-tx-linux.git] / block / blk-throttle.c
diff --git a/block/blk-throttle.c b/block/blk-throttle.c

index 62984fc920159f979274d2d6195822dd83e59a06..b78db2e5fdff1e158ea52c179313ff3eba282015 100644 (file)
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -22,6 +22,13 @@ static int throtl_quantum = 32;
  #define DFL_THROTL_SLICE_HD (HZ / 10)
  #define DFL_THROTL_SLICE_SSD (HZ / 50)
  #define MAX_THROTL_SLICE (HZ)
+#define DFL_IDLE_THRESHOLD_SSD (1000L) /* 1 ms */
+#define DFL_IDLE_THRESHOLD_HD (100L * 1000) /* 100 ms */
+#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */
+/* default latency target is 0, i.e., guarantee IO latency by default */
+#define DFL_LATENCY_TARGET (0)
+
+#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)
  
  static struct blkcg_policy blkcg_policy_throtl;
  
@@ -149,11 +156,32 @@ struct throtl_grp {
  
         unsigned long last_check_time;
  
-       unsigned long last_dispatch_time[2];
-
+       unsigned long latency_target; /* us */
         /* When did we start a new slice */
         unsigned long slice_start[2];
         unsigned long slice_end[2];
+
+       unsigned long last_finish_time; /* ns / 1024 */
+       unsigned long checked_last_finish_time; /* ns / 1024 */
+       unsigned long avg_idletime; /* ns / 1024 */
+       unsigned long idletime_threshold; /* us */
+
+       unsigned int bio_cnt; /* total bios */
+       unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
+       unsigned long bio_cnt_reset_time;
+};
+
+/* We measure latency for request size from <= 4k to >= 1M */
+#define LATENCY_BUCKET_SIZE 9
+
+struct latency_bucket {
+       unsigned long total_latency; /* ns / 1024 */
+       int samples;
+};
+
+struct avg_latency_bucket {
+       unsigned long latency; /* ns / 1024 */
+       bool valid;
  };
  
  struct throtl_data
@@ -173,10 +201,19 @@ struct throtl_data
         unsigned int limit_index;
         bool limit_valid[LIMIT_CNT];
  
+       unsigned long dft_idletime_threshold; /* us */
+
         unsigned long low_upgrade_time;
         unsigned long low_downgrade_time;
  
         unsigned int scale;
+
+       struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
+       struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
+       struct latency_bucket __percpu *latency_buckets;
+       unsigned long last_calculate_time;
+
+       bool track_bio_latency;
  };
  
  static void throtl_pending_timer_fn(unsigned long arg);
@@ -295,6 +332,9 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
         return ret;
  }
  
+#define request_bucket_index(sectors) \
+       clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1)
+
  /**
   * throtl_log - log debug message via blktrace
   * @sq: the service_queue being reported
@@ -441,6 +481,8 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
         tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
         /* LIMIT_LOW will have default value 0 */
  
+       tg->latency_target = DFL_LATENCY_TARGET;
+
         return &tg->pd;
  }
  
@@ -468,6 +510,8 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
         if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
                 sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
         tg->td = td;
+
+       tg->idletime_threshold = td->dft_idletime_threshold;
  }
  
  /*
@@ -496,8 +540,6 @@ static void throtl_pd_online(struct blkg_policy_data *pd)
          * Update has_rules[] after a new group is brought online.
          */
         tg_update_has_rules(tg);
-       tg->last_dispatch_time[READ] = jiffies;
-       tg->last_dispatch_time[WRITE] = jiffies;
  }
  
  static void blk_throtl_update_limit_valid(struct throtl_data *td)
@@ -1436,6 +1478,8 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
         char bufs[4][21] = { "max", "max", "max", "max" };
         u64 bps_dft;
         unsigned int iops_dft;
+       char idle_time[26] = "";
+       char latency_time[26] = "";
  
         if (!dname)
                 return 0;
@@ -1451,7 +1495,10 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
         if (tg->bps_conf[READ][off] == bps_dft &&
             tg->bps_conf[WRITE][off] == bps_dft &&
             tg->iops_conf[READ][off] == iops_dft &&
-           tg->iops_conf[WRITE][off] == iops_dft)
+           tg->iops_conf[WRITE][off] == iops_dft &&
+           (off != LIMIT_LOW ||
+            (tg->idletime_threshold == tg->td->dft_idletime_threshold &&
+             tg->latency_target == DFL_LATENCY_TARGET)))
                 return 0;
  
         if (tg->bps_conf[READ][off] != bps_dft)
@@ -1466,9 +1513,23 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
         if (tg->iops_conf[WRITE][off] != iops_dft)
                 snprintf(bufs[3], sizeof(bufs[3]), "%u",
                         tg->iops_conf[WRITE][off]);
+       if (off == LIMIT_LOW) {
+               if (tg->idletime_threshold == ULONG_MAX)
+                       strcpy(idle_time, " idle=max");
+               else
+                       snprintf(idle_time, sizeof(idle_time), " idle=%lu",
+                               tg->idletime_threshold);
+
+               if (tg->latency_target == ULONG_MAX)
+                       strcpy(latency_time, " latency=max");
+               else
+                       snprintf(latency_time, sizeof(latency_time),
+                               " latency=%lu", tg->latency_target);
+       }
  
-       seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n",
-                  dname, bufs[0], bufs[1], bufs[2], bufs[3]);
+       seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n",
+                  dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time,
+                  latency_time);
         return 0;
  }
  
@@ -1486,6 +1547,8 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
         struct blkg_conf_ctx ctx;
         struct throtl_grp *tg;
         u64 v[4];
+       unsigned long idle_time;
+       unsigned long latency_time;
         int ret;
         int index = of_cft(of)->private;
  
@@ -1500,6 +1563,8 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
         v[2] = tg->iops_conf[READ][index];
         v[3] = tg->iops_conf[WRITE][index];
  
+       idle_time = tg->idletime_threshold;
+       latency_time = tg->latency_target;
         while (true) {
                 char tok[27];   /* wiops=18446744073709551616 */
                 char *p;
@@ -1531,6 +1596,10 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
                         v[2] = min_t(u64, val, UINT_MAX);
                 else if (!strcmp(tok, "wiops"))
                         v[3] = min_t(u64, val, UINT_MAX);
+               else if (off == LIMIT_LOW && !strcmp(tok, "idle"))
+                       idle_time = val;
+               else if (off == LIMIT_LOW && !strcmp(tok, "latency"))
+                       latency_time = val;
                 else
                         goto out_finish;
         }
@@ -1559,6 +1628,10 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
                 blk_throtl_update_limit_valid(tg->td);
                 if (tg->td->limit_valid[LIMIT_LOW])
                         tg->td->limit_index = LIMIT_LOW;
+               tg->idletime_threshold = (idle_time == ULONG_MAX) ?
+                       ULONG_MAX : idle_time;
+               tg->latency_target = (latency_time == ULONG_MAX) ?
+                       ULONG_MAX : latency_time;
         }
         tg_conf_updated(tg);
         ret = 0;
@@ -1644,6 +1717,24 @@ static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
         return ret;
  }
  
+static bool throtl_tg_is_idle(struct throtl_grp *tg)
+{
+       /*
+        * cgroup is idle if:
+        * - a single idle period is too long: longer than a fixed value (in
+        *   case the user configures too big a threshold) or 4 times the slice
+        * - average think time is more than threshold
+        * - IO latency is largely below threshold
+        */
+       unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice);
+
+       time = min_t(unsigned long, MAX_IDLE_TIME, time);
+       return (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
+              tg->avg_idletime > tg->idletime_threshold ||
+              (tg->latency_target && tg->bio_cnt &&
+               tg->bad_bio_cnt * 5 < tg->bio_cnt);
+}
+
  static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
  {
         struct throtl_service_queue *sq = &tg->service_queue;
@@ -1665,9 +1756,8 @@ static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
                 return true;
  
         if (time_after_eq(jiffies,
-            tg->last_dispatch_time[READ] + tg->td->throtl_slice) &&
-           time_after_eq(jiffies,
-            tg->last_dispatch_time[WRITE] + tg->td->throtl_slice))
+               tg_last_low_overflow_time(tg) + tg->td->throtl_slice) &&
+           throtl_tg_is_idle(tg))
                 return true;
         return false;
  }
@@ -1713,6 +1803,26 @@ static bool throtl_can_upgrade(struct throtl_data *td,
         return true;
  }
  
+static void throtl_upgrade_check(struct throtl_grp *tg)
+{
+       unsigned long now = jiffies;
+
+       if (tg->td->limit_index != LIMIT_LOW)
+               return;
+
+       if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
+               return;
+
+       tg->last_check_time = now;
+
+       if (!time_after_eq(now,
+            __tg_last_low_overflow_time(tg) + tg->td->throtl_slice))
+               return;
+
+       if (throtl_can_upgrade(tg->td, NULL))
+               throtl_upgrade_state(tg->td);
+}
+
  static void throtl_upgrade_state(struct throtl_data *td)
  {
         struct cgroup_subsys_state *pos_css;
@@ -1754,18 +1864,15 @@ static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
         struct throtl_data *td = tg->td;
         unsigned long now = jiffies;
  
-       if (time_after_eq(now, tg->last_dispatch_time[READ] +
-                                       td->throtl_slice) &&
-           time_after_eq(now, tg->last_dispatch_time[WRITE] +
-                                       td->throtl_slice))
-               return false;
         /*
          * If cgroup is below low limit, consider downgrade and throttle other
          * cgroups
          */
         if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) &&
             time_after_eq(now, tg_last_low_overflow_time(tg) +
-                                       td->throtl_slice))
+                                       td->throtl_slice) &&
+           (!throtl_tg_is_idle(tg) ||
+            !list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
                 return true;
         return false;
  }
@@ -1843,6 +1950,100 @@ static void throtl_downgrade_check(struct throtl_grp *tg)
         tg->last_io_disp[WRITE] = 0;
  }
  
+static void blk_throtl_update_idletime(struct throtl_grp *tg)
+{
+       unsigned long now = ktime_get_ns() >> 10;
+       unsigned long last_finish_time = tg->last_finish_time;
+
+       if (now <= last_finish_time || last_finish_time == 0 ||
+           last_finish_time == tg->checked_last_finish_time)
+               return;
+
+       tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3;
+       tg->checked_last_finish_time = last_finish_time;
+}
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static void throtl_update_latency_buckets(struct throtl_data *td)
+{
+       struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
+       int i, cpu;
+       unsigned long last_latency = 0;
+       unsigned long latency;
+
+       if (!blk_queue_nonrot(td->queue))
+               return;
+       if (time_before(jiffies, td->last_calculate_time + HZ))
+               return;
+       td->last_calculate_time = jiffies;
+
+       memset(avg_latency, 0, sizeof(avg_latency));
+       for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+               struct latency_bucket *tmp = &td->tmp_buckets[i];
+
+               for_each_possible_cpu(cpu) {
+                       struct latency_bucket *bucket;
+
+                       /* this isn't race free, but ok in practice */
+                       bucket = per_cpu_ptr(td->latency_buckets, cpu);
+                       tmp->total_latency += bucket[i].total_latency;
+                       tmp->samples += bucket[i].samples;
+                       bucket[i].total_latency = 0;
+                       bucket[i].samples = 0;
+               }
+
+               if (tmp->samples >= 32) {
+                       int samples = tmp->samples;
+
+                       latency = tmp->total_latency;
+
+                       tmp->total_latency = 0;
+                       tmp->samples = 0;
+                       latency /= samples;
+                       if (latency == 0)
+                               continue;
+                       avg_latency[i].latency = latency;
+               }
+       }
+
+       for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+               if (!avg_latency[i].latency) {
+                       if (td->avg_buckets[i].latency < last_latency)
+                               td->avg_buckets[i].latency = last_latency;
+                       continue;
+               }
+
+               if (!td->avg_buckets[i].valid)
+                       latency = avg_latency[i].latency;
+               else
+                       latency = (td->avg_buckets[i].latency * 7 +
+                               avg_latency[i].latency) >> 3;
+
+               td->avg_buckets[i].latency = max(latency, last_latency);
+               td->avg_buckets[i].valid = true;
+               last_latency = td->avg_buckets[i].latency;
+       }
+}
+#else
+static inline void throtl_update_latency_buckets(struct throtl_data *td)
+{
+}
+#endif
+
+static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
+{
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+       int ret;
+
+       ret = bio_associate_current(bio);
+       if (ret == 0 || ret == -EBUSY)
+               bio->bi_cg_private = tg;
+       blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
+#else
+       bio_associate_current(bio);
+#endif
+}
+
  bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
                     struct bio *bio)
  {
@@ -1851,6 +2052,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
         struct throtl_service_queue *sq;
         bool rw = bio_data_dir(bio);
         bool throttled = false;
+       struct throtl_data *td = tg->td;
  
         WARN_ON_ONCE(!rcu_read_lock_held());
  
@@ -1860,17 +2062,22 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
  
         spin_lock_irq(q->queue_lock);
  
+       throtl_update_latency_buckets(td);
+
         if (unlikely(blk_queue_bypass(q)))
                 goto out_unlock;
  
+       blk_throtl_assoc_bio(tg, bio);
+       blk_throtl_update_idletime(tg);
+
         sq = &tg->service_queue;
  
  again:
         while (true) {
-               tg->last_dispatch_time[rw] = jiffies;
                 if (tg->last_low_overflow_time[rw] == 0)
                         tg->last_low_overflow_time[rw] = jiffies;
                 throtl_downgrade_check(tg);
+               throtl_upgrade_check(tg);
                 /* throtl is FIFO - if bios are already queued, should queue */
                 if (sq->nr_queued[rw])
                         break;
@@ -1878,8 +2085,8 @@ again:
                 /* if above limits, break to queue */
                 if (!tg_may_dispatch(tg, bio, NULL)) {
                         tg->last_low_overflow_time[rw] = jiffies;
-                       if (throtl_can_upgrade(tg->td, tg)) {
-                               throtl_upgrade_state(tg->td);
+                       if (throtl_can_upgrade(td, tg)) {
+                               throtl_upgrade_state(td);
                                 goto again;
                         }
                         break;
@@ -1923,8 +2130,7 @@ again:
  
         tg->last_low_overflow_time[rw] = jiffies;
  
-       bio_associate_current(bio);
-       tg->td->nr_queued[rw]++;
+       td->nr_queued[rw]++;
         throtl_add_bio_tg(bio, qn, tg);
         throttled = true;
  
@@ -1949,9 +2155,94 @@ out:
          */
         if (!throttled)
                 bio_clear_flag(bio, BIO_THROTTLED);
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+       if (throttled || !td->track_bio_latency)
+               bio->bi_issue_stat.stat |= SKIP_LATENCY;
+#endif
         return throttled;
  }
  
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static void throtl_track_latency(struct throtl_data *td, sector_t size,
+       int op, unsigned long time)
+{
+       struct latency_bucket *latency;
+       int index;
+
+       if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
+           !blk_queue_nonrot(td->queue))
+               return;
+
+       index = request_bucket_index(size);
+
+       latency = get_cpu_ptr(td->latency_buckets);
+       latency[index].total_latency += time;
+       latency[index].samples++;
+       put_cpu_ptr(td->latency_buckets);
+}
+
+void blk_throtl_stat_add(struct request *rq, u64 time_ns)
+{
+       struct request_queue *q = rq->q;
+       struct throtl_data *td = q->td;
+
+       throtl_track_latency(td, blk_stat_size(&rq->issue_stat),
+               req_op(rq), time_ns >> 10);
+}
+
+void blk_throtl_bio_endio(struct bio *bio)
+{
+       struct throtl_grp *tg;
+       u64 finish_time_ns;
+       unsigned long finish_time;
+       unsigned long start_time;
+       unsigned long lat;
+
+       tg = bio->bi_cg_private;
+       if (!tg)
+               return;
+       bio->bi_cg_private = NULL;
+
+       finish_time_ns = ktime_get_ns();
+       tg->last_finish_time = finish_time_ns >> 10;
+
+       start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
+       finish_time = __blk_stat_time(finish_time_ns) >> 10;
+       if (!start_time || finish_time <= start_time)
+               return;
+
+       lat = finish_time - start_time;
+       /* this is only for bio-based drivers */
+       if (!(bio->bi_issue_stat.stat & SKIP_LATENCY))
+               throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
+                       bio_op(bio), lat);
+
+       if (tg->latency_target) {
+               int bucket;
+               unsigned int threshold;
+
+               bucket = request_bucket_index(
+                       blk_stat_size(&bio->bi_issue_stat));
+               threshold = tg->td->avg_buckets[bucket].latency +
+                       tg->latency_target;
+               if (lat > threshold)
+                       tg->bad_bio_cnt++;
+               /*
+                * Not race free, could get wrong count, which means cgroups
+                * will be throttled
+                */
+               tg->bio_cnt++;
+       }
+
+       if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) {
+               tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies;
+               tg->bio_cnt /= 2;
+               tg->bad_bio_cnt /= 2;
+       }
+}
+#endif
+
  /*
   * Dispatch all bios from all children tg's queued on @parent_sq.  On
   * return, @parent_sq is guaranteed to not have any active children tg's
@@ -2024,6 +2315,12 @@ int blk_throtl_init(struct request_queue *q)
         td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
         if (!td)
                 return -ENOMEM;
+       td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
+               LATENCY_BUCKET_SIZE, __alignof__(u64));
+       if (!td->latency_buckets) {
+               kfree(td);
+               return -ENOMEM;
+       }
  
         INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
         throtl_service_queue_init(&td->service_queue);
@@ -2035,10 +2332,13 @@ int blk_throtl_init(struct request_queue *q)
         td->limit_index = LIMIT_MAX;
         td->low_upgrade_time = jiffies;
         td->low_downgrade_time = jiffies;
+
         /* activate policy */
         ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
-       if (ret)
+       if (ret) {
+               free_percpu(td->latency_buckets);
                 kfree(td);
+       }
         return ret;
  }
  
@@ -2047,24 +2347,46 @@ void blk_throtl_exit(struct request_queue *q)
         BUG_ON(!q->td);
         throtl_shutdown_wq(q);
         blkcg_deactivate_policy(q, &blkcg_policy_throtl);
+       free_percpu(q->td->latency_buckets);
         kfree(q->td);
  }
  
  void blk_throtl_register_queue(struct request_queue *q)
  {
         struct throtl_data *td;
+       struct cgroup_subsys_state *pos_css;
+       struct blkcg_gq *blkg;
  
         td = q->td;
         BUG_ON(!td);
  
-       if (blk_queue_nonrot(q))
+       if (blk_queue_nonrot(q)) {
                 td->throtl_slice = DFL_THROTL_SLICE_SSD;
-       else
+               td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_SSD;
+       } else {
                 td->throtl_slice = DFL_THROTL_SLICE_HD;
+               td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_HD;
+       }
  #ifndef CONFIG_BLK_DEV_THROTTLING_LOW
         /* if no low limit, use previous default */
         td->throtl_slice = DFL_THROTL_SLICE_HD;
  #endif
+
+       td->track_bio_latency = !q->mq_ops && !q->request_fn;
+       if (!td->track_bio_latency)
+               blk_stat_enable_accounting(q);
+
+       /*
+        * some tgs are created before the queue is fully initialized, e.g.
+        * nonrot isn't initialized yet
+        */
+       rcu_read_lock();
+       blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
+               struct throtl_grp *tg = blkg_to_tg(blkg);
+
+               tg->idletime_threshold = td->dft_idletime_threshold;
+       }
+       rcu_read_unlock();
  }
  
  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW