+static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
+{
+ unsigned long rtime = jiffies, wtime = jiffies;
+
+ if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW])
+ rtime = tg->last_low_overflow_time[READ];
+ if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
+ wtime = tg->last_low_overflow_time[WRITE];
+ return min(rtime, wtime);
+}
+
+/* tg should not be an intermediate node */
+static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
+{
+ struct throtl_service_queue *parent_sq;
+ struct throtl_grp *parent = tg;
+ unsigned long ret = __tg_last_low_overflow_time(tg);
+
+ while (true) {
+ parent_sq = parent->service_queue.parent_sq;
+ parent = sq_to_tg(parent_sq);
+ if (!parent)
+ break;
+
+ /*
+ * The parent doesn't have low limit, it always reaches low
+ * limit. Its overflow time is useless for children
+ */
+ if (!parent->bps[READ][LIMIT_LOW] &&
+ !parent->iops[READ][LIMIT_LOW] &&
+ !parent->bps[WRITE][LIMIT_LOW] &&
+ !parent->iops[WRITE][LIMIT_LOW])
+ continue;
+ if (time_after(__tg_last_low_overflow_time(parent), ret))
+ ret = __tg_last_low_overflow_time(parent);
+ }
+ return ret;
+}
+
+static bool throtl_tg_is_idle(struct throtl_grp *tg)
+{
+ /*
+ * cgroup is idle if:
+ * - single idle is too long, longer than a fixed value (in case user
+ * configure a too big threshold) or 4 times of slice
+ * - average think time is more than threshold
+ * - IO latency is largely below threshold
+ */
+ unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice);
+
+ time = min_t(unsigned long, MAX_IDLE_TIME, time);
+ return (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
+ tg->avg_idletime > tg->idletime_threshold ||
+ (tg->latency_target && tg->bio_cnt &&
+ tg->bad_bio_cnt * 5 < tg->bio_cnt);
+}
+
+static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
+{
+ struct throtl_service_queue *sq = &tg->service_queue;
+ bool read_limit, write_limit;
+
+ /*
+ * if cgroup reaches low limit (if low limit is 0, the cgroup always
+ * reaches), it's ok to upgrade to next limit
+ */
+ read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW];
+ write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
+ if (!read_limit && !write_limit)
+ return true;
+ if (read_limit && sq->nr_queued[READ] &&
+ (!write_limit || sq->nr_queued[WRITE]))
+ return true;
+ if (write_limit && sq->nr_queued[WRITE] &&
+ (!read_limit || sq->nr_queued[READ]))
+ return true;
+
+ if (time_after_eq(jiffies,
+ tg_last_low_overflow_time(tg) + tg->td->throtl_slice) &&
+ throtl_tg_is_idle(tg))
+ return true;
+ return false;
+}
+
+static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
+{
+ while (true) {
+ if (throtl_tg_can_upgrade(tg))
+ return true;
+ tg = sq_to_tg(tg->service_queue.parent_sq);
+ if (!tg || !tg_to_blkg(tg)->parent)
+ return false;
+ }
+ return false;
+}
+
+static bool throtl_can_upgrade(struct throtl_data *td,
+ struct throtl_grp *this_tg)
+{
+ struct cgroup_subsys_state *pos_css;
+ struct blkcg_gq *blkg;
+
+ if (td->limit_index != LIMIT_LOW)
+ return false;
+
+ if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice))
+ return false;
+
+ rcu_read_lock();
+ blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
+ struct throtl_grp *tg = blkg_to_tg(blkg);
+
+ if (tg == this_tg)
+ continue;
+ if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
+ continue;
+ if (!throtl_hierarchy_can_upgrade(tg)) {
+ rcu_read_unlock();
+ return false;
+ }
+ }
+ rcu_read_unlock();
+ return true;
+}
+
+static void throtl_upgrade_check(struct throtl_grp *tg)
+{
+ unsigned long now = jiffies;
+
+ if (tg->td->limit_index != LIMIT_LOW)
+ return;
+
+ if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
+ return;
+
+ tg->last_check_time = now;
+
+ if (!time_after_eq(now,
+ __tg_last_low_overflow_time(tg) + tg->td->throtl_slice))
+ return;
+
+ if (throtl_can_upgrade(tg->td, NULL))
+ throtl_upgrade_state(tg->td);
+}
+
+static void throtl_upgrade_state(struct throtl_data *td)
+{
+ struct cgroup_subsys_state *pos_css;
+ struct blkcg_gq *blkg;
+
+ td->limit_index = LIMIT_MAX;
+ td->low_upgrade_time = jiffies;
+ td->scale = 0;
+ rcu_read_lock();
+ blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
+ struct throtl_grp *tg = blkg_to_tg(blkg);
+ struct throtl_service_queue *sq = &tg->service_queue;
+
+ tg->disptime = jiffies - 1;
+ throtl_select_dispatch(sq);
+ throtl_schedule_next_dispatch(sq, false);
+ }
+ rcu_read_unlock();
+ throtl_select_dispatch(&td->service_queue);
+ throtl_schedule_next_dispatch(&td->service_queue, false);
+ queue_work(kthrotld_workqueue, &td->dispatch_work);
+}
+
+static void throtl_downgrade_state(struct throtl_data *td, int new)
+{
+ td->scale /= 2;
+
+ if (td->scale) {
+ td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;
+ return;
+ }
+
+ td->limit_index = new;
+ td->low_downgrade_time = jiffies;
+}
+
+static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
+{
+ struct throtl_data *td = tg->td;
+ unsigned long now = jiffies;
+
+ /*
+ * If cgroup is below low limit, consider downgrade and throttle other
+ * cgroups
+ */
+ if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) &&
+ time_after_eq(now, tg_last_low_overflow_time(tg) +
+ td->throtl_slice) &&
+ (!throtl_tg_is_idle(tg) ||
+ !list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
+ return true;
+ return false;
+}
+
+static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
+{
+ while (true) {
+ if (!throtl_tg_can_downgrade(tg))
+ return false;
+ tg = sq_to_tg(tg->service_queue.parent_sq);
+ if (!tg || !tg_to_blkg(tg)->parent)
+ break;
+ }
+ return true;
+}
+
+static void throtl_downgrade_check(struct throtl_grp *tg)
+{
+ uint64_t bps;
+ unsigned int iops;
+ unsigned long elapsed_time;
+ unsigned long now = jiffies;
+
+ if (tg->td->limit_index != LIMIT_MAX ||
+ !tg->td->limit_valid[LIMIT_LOW])
+ return;
+ if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
+ return;
+ if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
+ return;
+
+ elapsed_time = now - tg->last_check_time;
+ tg->last_check_time = now;
+
+ if (time_before(now, tg_last_low_overflow_time(tg) +
+ tg->td->throtl_slice))
+ return;
+
+ if (tg->bps[READ][LIMIT_LOW]) {
+ bps = tg->last_bytes_disp[READ] * HZ;
+ do_div(bps, elapsed_time);
+ if (bps >= tg->bps[READ][LIMIT_LOW])
+ tg->last_low_overflow_time[READ] = now;
+ }
+
+ if (tg->bps[WRITE][LIMIT_LOW]) {
+ bps = tg->last_bytes_disp[WRITE] * HZ;
+ do_div(bps, elapsed_time);
+ if (bps >= tg->bps[WRITE][LIMIT_LOW])
+ tg->last_low_overflow_time[WRITE] = now;
+ }
+
+ if (tg->iops[READ][LIMIT_LOW]) {
+ iops = tg->last_io_disp[READ] * HZ / elapsed_time;
+ if (iops >= tg->iops[READ][LIMIT_LOW])
+ tg->last_low_overflow_time[READ] = now;
+ }
+
+ if (tg->iops[WRITE][LIMIT_LOW]) {
+ iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
+ if (iops >= tg->iops[WRITE][LIMIT_LOW])
+ tg->last_low_overflow_time[WRITE] = now;
+ }
+
+ /*
+ * If cgroup is below low limit, consider downgrade and throttle other
+ * cgroups
+ */
+ if (throtl_hierarchy_can_downgrade(tg))
+ throtl_downgrade_state(tg->td, LIMIT_LOW);
+
+ tg->last_bytes_disp[READ] = 0;
+ tg->last_bytes_disp[WRITE] = 0;
+ tg->last_io_disp[READ] = 0;
+ tg->last_io_disp[WRITE] = 0;
+}
+
+static void blk_throtl_update_idletime(struct throtl_grp *tg)
+{
+ unsigned long now = ktime_get_ns() >> 10;
+ unsigned long last_finish_time = tg->last_finish_time;
+
+ if (now <= last_finish_time || last_finish_time == 0 ||
+ last_finish_time == tg->checked_last_finish_time)
+ return;
+
+ tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3;
+ tg->checked_last_finish_time = last_finish_time;
+}
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static void throtl_update_latency_buckets(struct throtl_data *td)
+{
+ struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
+ int i, cpu;
+ unsigned long last_latency = 0;
+ unsigned long latency;
+
+ if (!blk_queue_nonrot(td->queue))
+ return;
+ if (time_before(jiffies, td->last_calculate_time + HZ))
+ return;
+ td->last_calculate_time = jiffies;
+
+ memset(avg_latency, 0, sizeof(avg_latency));
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+ struct latency_bucket *tmp = &td->tmp_buckets[i];
+
+ for_each_possible_cpu(cpu) {
+ struct latency_bucket *bucket;
+
+ /* this isn't race free, but ok in practice */
+ bucket = per_cpu_ptr(td->latency_buckets, cpu);
+ tmp->total_latency += bucket[i].total_latency;
+ tmp->samples += bucket[i].samples;
+ bucket[i].total_latency = 0;
+ bucket[i].samples = 0;
+ }
+
+ if (tmp->samples >= 32) {
+ int samples = tmp->samples;
+
+ latency = tmp->total_latency;
+
+ tmp->total_latency = 0;
+ tmp->samples = 0;
+ latency /= samples;
+ if (latency == 0)
+ continue;
+ avg_latency[i].latency = latency;
+ }
+ }
+
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+ if (!avg_latency[i].latency) {
+ if (td->avg_buckets[i].latency < last_latency)
+ td->avg_buckets[i].latency = last_latency;
+ continue;
+ }
+
+ if (!td->avg_buckets[i].valid)
+ latency = avg_latency[i].latency;
+ else
+ latency = (td->avg_buckets[i].latency * 7 +
+ avg_latency[i].latency) >> 3;
+
+ td->avg_buckets[i].latency = max(latency, last_latency);
+ td->avg_buckets[i].valid = true;
+ last_latency = td->avg_buckets[i].latency;
+ }
+}
+#else
+static inline void throtl_update_latency_buckets(struct throtl_data *td)
+{
+}
+#endif
+
+static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
+{
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ int ret;
+
+ ret = bio_associate_current(bio);
+ if (ret == 0 || ret == -EBUSY)
+ bio->bi_cg_private = tg;
+ blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
+#else
+ bio_associate_current(bio);
+#endif
+}
+