From 0c650cbcd424fb2c3cefd1684c4815a88732bd5c Mon Sep 17 00:00:00 2001
From: Jiangtian Feng <jiangtianf97@163.com>
Date: Sat, 30 May 2026 23:47:27 +0800
Subject: [PATCH] anolis: block: blk-throttle: add per-cgroup IO latency
 histogram

ANBZ: #36701

blk-throttle already accumulates per-cgroup service_time / wait_time /
completed and exposes them via io.extstat (commit 013b1c893728,
"anolis: blk-throttle: add extra statistics"). Those are sums, so only
a mean latency is derivable; the distribution and the tail (p99) that
operators actually need to answer "which cgroup sees tail latency under
its own io.max limit" are discarded.

Add a per-cgroup end-to-end IO completion-latency histogram, read and
write separately, exported read-only via the cgroup v2 file
io.lat_hist. The histogram uses 26 fixed log2 buckets: bucket 0 holds
latencies below ~2us (2^11 ns) and the last bucket saturates at ~34s
(2^35 ns). The counters are cumulative; userspace samples deltas between
two reads, as with /proc/vmstat-style counters, and derives percentiles
itself (no in-kernel float or percentile interpolation, mirroring
memory.lru_gen_idle_stats).

The histogram is bumped in throtl_stats_update_completion(), next to
the existing sum accounting, using the end-to-end latency (completion
minus throttle entry) that is already computed there. It is updated
lock-free from the completion path on arbitrary CPUs, so the buckets
are atomic64_t (a plain array would undercount under parallel IO; the
existing blkg_rwstat counters are safe only because they are
percpu_counter-backed). Like the rest of the blk-throttle stats, it
only accrues while a finite io.max rule is set (the blk_should_throtl
gate), i.e. the QoS-monitoring use case.

The buckets live at the tail of the internally-allocated struct
throtl_grp (zeroed by kzalloc_node in throtl_pd_alloc, no init or free
change needed), so there is no KABI impact. io.lat_hist is dfl/v2 only,
matching io.extstat, and is flagged CFTYPE_NOT_ON_ROOT, so it is not
created in the root cgroup.

Splitting the histogram into queue-wait vs device-service components is
left as a follow-up.

Signed-off-by: Jiangtian Feng <jiangtianf97@163.com>
---
 block/blk-throttle.c | 57 ++++++++++++++++++++++++++++++++++++++++++++
 block/blk-throttle.h | 13 ++++++++++
 2 files changed, 70 insertions(+)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index aa0956d1c866..c06a6eb34791 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1002,6 +1002,15 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	return false;
 }
 
+/* Map an IO latency in ns to a log2 histogram bucket (< ~2us .. >= ~34s). */
+static unsigned int throtl_lat_bucket(u64 lat_ns)
+{
+	if (lat_ns < 1024)	/* ilog2() - 10 would be negative here */
+		return 0;
+	return min_t(unsigned int, ilog2(lat_ns) - 10,
+		     THROTL_LAT_NR_BUCKETS - 1);
+}
+
 static void throtl_stats_update_completion(struct throtl_grp *tg,
 					   uint64_t start_time,
 					   uint64_t io_start_time,
@@ -1017,6 +1026,10 @@ static void throtl_stats_update_completion(struct throtl_grp *tg,
 		blkg_rwstat_add(&tg->wait_time, opf, io_start_time - start_time);
 	blkg_rwstat_add(&tg->completed, opf, 1);
 	local_irq_restore(flags);
+
+	if (time_after64(now, start_time))
+		atomic64_inc(&tg->lat_hist[op_is_write(opf)]
+				    [throtl_lat_bucket(now - start_time)]);
 }
 
 static void throtl_bio_end_io(struct bio *bio)
@@ -1759,6 +1772,45 @@ static int tg_print_extstat(struct seq_file *sf, void *v)
 	return 0;
 }
 
+/* Emit "<dname> <dir>" followed by every bucket count, then a newline. */
+static void tg_prfill_lat_hist_dir(struct seq_file *sf, const char *dname,
+				   const char *dir, atomic64_t *bucket)
+{
+	int idx;
+
+	seq_printf(sf, "%s %s", dname, dir);
+	for (idx = 0; idx < THROTL_LAT_NR_BUCKETS; idx++)
+		seq_printf(sf, " %llu", (unsigned long long)atomic64_read(bucket + idx));
+	seq_putc(sf, '\n');
+}
+
+/* Print one blkg's read and write histogram lines; @off is unused, returns 0. */
+static u64 tg_prfill_lat_hist(struct seq_file *sf, struct blkg_policy_data *pd,
+			      int off)
+{
+	struct throtl_grp *tg = pd_to_tg(pd);
+	const char *dname = blkg_dev_name(pd->blkg);
+
+	if (!dname)
+		return 0;
+
+	/*
+	 * Raw per-bucket counts; percentiles and deltas are left to userspace.
+	 */
+	tg_prfill_lat_hist_dir(sf, dname, "read", tg->lat_hist[READ]);
+	tg_prfill_lat_hist_dir(sf, dname, "write", tg->lat_hist[WRITE]);
+
+	return 0;
+}
+
+/* seq_show handler of the "lat_hist" cftype; see tg_prfill_lat_hist(). */
+static int tg_print_lat_hist(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_lat_hist,
+			  &blkcg_policy_throtl, 0, false);
+	return 0;
+}
+
 static ssize_t tg_set_limit(struct kernfs_open_file *of,
 			  char *buf, size_t nbytes, loff_t off)
 {
@@ -1899,6 +1951,11 @@ static struct cftype throtl_files[] = {
 		.name = "extstat",
 		.seq_show = tg_print_extstat,
 	},
+	{
+		.name = "lat_hist",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = tg_print_lat_hist,
+	},
 	{ }	/* terminate */
 };
 
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index 4b5ce538ca5b..79c77be33b6d 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -66,6 +66,13 @@ enum {
 	LIMIT_CNT,
 };
 
+/*
+ * Per-cgroup IO completion-latency histogram. Bucket i (1..NR-2) holds
+ * completions whose end-to-end latency falls in [2^(i+10), 2^(i+11)) ns;
+ * bucket 0 is "< 2^11 ns" (~2us) and the last bucket saturates (>= ~34s).
+ */
+#define THROTL_LAT_NR_BUCKETS	26
+
 struct throtl_grp {
 	/* must be the first member */
 	struct blkg_policy_data pd;
@@ -162,6 +169,12 @@ struct throtl_grp {
 	struct blkg_rwstat total_bytes_queued;
 	/* total IOs throttled */
 	struct blkg_rwstat total_io_queued;
+	/*
+	 * End-to-end IO completion-latency histogram, [READ/WRITE].
+	 * Updated lock-free from the completion path on arbitrary CPUs,
+	 * hence atomic64_t. Read-only export via io.lat_hist.
+	 */
+	atomic64_t lat_hist[2][THROTL_LAT_NR_BUCKETS];
 };
 
 extern struct blkcg_policy blkcg_policy_throtl;
-- 
Gitee