From 0c650cbcd424fb2c3cefd1684c4815a88732bd5c Mon Sep 17 00:00:00 2001 From: Jiangtian Feng Date: Sat, 30 May 2026 23:47:27 +0800 Subject: [PATCH] anolis: block: blk-throttle: add per-cgroup IO latency histogram ANBZ: #36701 blk-throttle already accumulates per-cgroup service_time / wait_time / completed and exposes them via io.extstat (commit 013b1c893728, "anolis: blk-throttle: add extra statistics"). Those are sums, so only a mean latency is derivable; the distribution and the tail (p99) that operators actually need to answer "which cgroup sees tail latency under its own io.max limit" are discarded. Add a per-cgroup end-to-end IO completion-latency histogram, read and write separately, exported read-only via the cgroup v2 file io.lat_hist. The histogram uses fixed log2 buckets (first bucket < ~2us, last bucket >= ~34s); userspace derives percentiles and samples deltas between two reads, exactly like /proc/vmstat-style counters (no in-kernel float or percentile interpolation, mirroring memory.lru_gen_idle_stats). The histogram is bumped in throtl_stats_update_completion(), next to the existing sum accounting, using the end-to-end latency (completion minus throttle entry) that is already computed there. It is updated lock-free from the completion path on arbitrary CPUs, so the buckets are atomic64_t (a plain array would undercount under parallel IO; the existing blkg_rwstat counters are safe only because they are percpu_counter-backed). Like the rest of the blk-throttle stats, it only accrues while a finite io.max rule is set (the blk_should_throtl gate), i.e. the QoS-monitoring use case. The buckets live at the tail of the internally-allocated struct throtl_grp (zeroed by kzalloc_node in throtl_pd_alloc, no init or free change needed), so there is no KABI impact. io.lat_hist is dfl/v2 only, matching io.extstat. Splitting the histogram into queue-wait vs device-service components is left as a follow-up. 
Signed-off-by: Jiangtian Feng --- block/blk-throttle.c | 57 ++++++++++++++++++++++++++++++++++++++++++++ block/blk-throttle.h | 13 ++++++++++ 2 files changed, 70 insertions(+) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index aa0956d1c866..c06a6eb34791 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1002,6 +1002,15 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, return false; } +/* Map a latency in ns to a log2 bucket: 0 is < 2^11 (~2us), last >= ~34s. */ +static unsigned int throtl_lat_bucket(u64 lat_ns) +{ + if (lat_ns < 1024) + return 0; + return min_t(unsigned int, ilog2(lat_ns) - 10, + THROTL_LAT_NR_BUCKETS - 1); +} + static void throtl_stats_update_completion(struct throtl_grp *tg, uint64_t start_time, uint64_t io_start_time, @@ -1017,6 +1026,10 @@ static void throtl_stats_update_completion(struct throtl_grp *tg, blkg_rwstat_add(&tg->wait_time, opf, io_start_time - start_time); blkg_rwstat_add(&tg->completed, opf, 1); local_irq_restore(flags); + + if (time_after64(now, start_time)) + atomic64_inc(&tg->lat_hist[op_is_write(opf)] + [throtl_lat_bucket(now - start_time)]); } static void throtl_bio_end_io(struct bio *bio) @@ -1759,6 +1772,45 @@ static int tg_print_extstat(struct seq_file *sf, void *v) return 0; } +static void tg_prfill_lat_hist_dir(struct seq_file *sf, const char *dname, + const char *dir, atomic64_t *bucket) +{ + int i; + + seq_printf(sf, "%s %s", dname, dir); + for (i = 0; i < THROTL_LAT_NR_BUCKETS; i++) + seq_printf(sf, " %llu", + (unsigned long long)atomic64_read(&bucket[i])); + seq_putc(sf, '\n'); +} + +static u64 tg_prfill_lat_hist(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct throtl_grp *tg = pd_to_tg(pd); + const char *dname = blkg_dev_name(pd->blkg); + + if (!dname) + return 0; + + /* + * Raw end-to-end latency histogram, read and write separately. 
+ * Bucket boundaries are fixed (see THROTL_LAT_NR_BUCKETS); userspace + * derives percentiles and samples deltas between two reads. + */ + tg_prfill_lat_hist_dir(sf, dname, "read", tg->lat_hist[READ]); + tg_prfill_lat_hist_dir(sf, dname, "write", tg->lat_hist[WRITE]); + + return 0; +} + +static int tg_print_lat_hist(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_lat_hist, + &blkcg_policy_throtl, 0, false); + return 0; +} + static ssize_t tg_set_limit(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -1899,6 +1951,11 @@ static struct cftype throtl_files[] = { .name = "extstat", .seq_show = tg_print_extstat, }, + { + .name = "lat_hist", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = tg_print_lat_hist, + }, { } /* terminate */ }; diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 4b5ce538ca5b..79c77be33b6d 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -66,6 +66,13 @@ enum { LIMIT_CNT, }; +/* + * Per-cgroup IO completion-latency histogram. Bucket i (1..NR-2) holds + * completions whose end-to-end latency falls in [2^(i+10), 2^(i+11)) ns; + * bucket 0 is "< 2^11 ns" (~2us); the last bucket saturates (>= ~34s). + */ +#define THROTL_LAT_NR_BUCKETS 26 + struct throtl_grp { /* must be the first member */ struct blkg_policy_data pd; @@ -162,6 +169,12 @@ struct throtl_grp { struct blkg_rwstat total_bytes_queued; /* total IOs throttled */ struct blkg_rwstat total_io_queued; + /* + * End-to-end IO completion-latency histogram, [READ/WRITE]. + * Updated lock-free from the completion path on arbitrary CPUs, + * hence atomic64_t. Read-only export via io.lat_hist. + */ + atomic64_t lat_hist[2][THROTL_LAT_NR_BUCKETS]; }; extern struct blkcg_policy blkcg_policy_throtl; -- Gitee