From 383bf6ed86d0b526a854365d960c44745cc70e8e Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 29 Aug 2025 16:11:16 +0800 Subject: [PATCH 1/9] sched/fair: Add related data structure for task based throttle ANBZ: #32935 commit 2cd571245b43492867bf1b4252485f3e6647b643 upstream. Add related data structures for this new throttle functionality. Tested-by: K Prateek Nayak Signed-off-by: Valentin Schneider Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chengming Zhou Tested-by: Valentin Schneider Tested-by: Matteo Martelli Link: https://lore.kernel.org/r/20250829081120.806-2-ziqianlu@bytedance.com Signed-off-by: Peng Wang --- include/linux/sched.h | 5 +++++ kernel/sched/core.c | 3 +++ kernel/sched/fair.c | 13 +++++++++++++ kernel/sched/sched.h | 2 ++ 4 files changed, 23 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 2b2945f053da..c7479be510ea 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -894,6 +894,11 @@ struct task_struct { #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; +#ifdef CONFIG_CFS_BANDWIDTH + struct callback_head sched_throttle_work; + struct list_head throttle_node; + bool throttled; +#endif #endif #ifdef CONFIG_UCLAMP_TASK diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7d435b1b6155..cd7112483eab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4475,6 +4475,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; +#ifdef CONFIG_CFS_BANDWIDTH + init_cfs_throttle_work(p); +#endif #endif #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8255faa8c9fb..24e264b94ee6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6441,6 +6441,18 @@ static inline int throttled_lb_pair(struct task_group *tg, throttled_hierarchy(dest_cfs_rq); } +static void throttle_cfs_rq_work(struct callback_head *work) +{ +} + +void
init_cfs_throttle_work(struct task_struct *p) +{ + init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work); + /* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */ + p->sched_throttle_work.next = &p->sched_throttle_work; + INIT_LIST_HEAD(&p->throttle_node); +} + static int tg_unthrottle_up(struct task_group *tg, void *data) { struct rq *rq = data; @@ -7222,6 +7234,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) #ifdef CONFIG_SMP INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); #endif + INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list); } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, int init) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5f74c737ad41..a9a8c6f8acc3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -962,6 +962,7 @@ struct cfs_rq { #ifdef CONFIG_SMP struct list_head throttled_csd_list; #endif + struct list_head throttled_limbo_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -3083,6 +3084,7 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); extern void init_dl_entity(struct sched_dl_entity *dl_se); +extern void init_cfs_throttle_work(struct task_struct *p); #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) -- Gitee From f0df3816ec06c3622610caf92012f5f6a85b66ea Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 29 Aug 2025 16:11:17 +0800 Subject: [PATCH 2/9] sched/fair: Implement throttle task work and related helpers ANBZ: #32935 commit 7fc2d14392475e368a2a7be458aba4eecdf2439b upstream. Implement throttle_cfs_rq_work() task work which gets executed on task's ret2user path where the task is dequeued and marked as throttled. 
Signed-off-by: Valentin Schneider Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chengming Zhou Tested-by: Valentin Schneider Tested-by: Matteo Martelli Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250829081120.806-3-ziqianlu@bytedance.com Signed-off-by: Peng Wang --- kernel/sched/fair.c | 65 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 24e264b94ee6..be91e9286eb3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6441,8 +6441,51 @@ static inline int throttled_lb_pair(struct task_group *tg, throttled_hierarchy(dest_cfs_rq); } +static inline bool task_is_throttled(struct task_struct *p) +{ + return cfs_bandwidth_used() && p->throttled; +} + +static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags); static void throttle_cfs_rq_work(struct callback_head *work) { + struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work); + struct sched_entity *se; + struct cfs_rq *cfs_rq; + struct rq *rq; + + WARN_ON_ONCE(p != current); + p->sched_throttle_work.next = &p->sched_throttle_work; + + /* + * If task is exiting, then there won't be a return to userspace, so we + * don't have to bother with any of this. + */ + if ((p->flags & PF_EXITING)) + return; + + scoped_guard(task_rq_lock, p) { + se = &p->se; + cfs_rq = cfs_rq_of(se); + + /* Raced, forget */ + if (p->sched_class != &fair_sched_class) + return; + + /* + * If not in limbo, then either replenish has happened or this + * task got migrated out of the throttled cfs_rq, move along. 
+ */ + if (!cfs_rq->throttle_count) + return; + rq = scope.rq; + update_rq_clock(rq); + WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node)); + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL); + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); + p->throttled = true; + resched_curr(rq); + } } void init_cfs_throttle_work(struct task_struct *p) @@ -6482,6 +6525,26 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) return 0; } +static inline bool task_has_throttle_work(struct task_struct *p) +{ + return p->sched_throttle_work.next != &p->sched_throttle_work; +} + +static inline void task_throttle_setup_work(struct task_struct *p) +{ + if (task_has_throttle_work(p)) + return; + + /* + * Kthreads and exiting tasks don't return to userspace, so adding the + * work is pointless + */ + if ((p->flags & (PF_EXITING | PF_KTHREAD))) + return; + + task_work_add(p, &p->sched_throttle_work, TWA_RESUME); +} + static int tg_throttle_down(struct task_group *tg, void *data) { struct rq *rq = data; @@ -7413,6 +7476,8 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static void task_throttle_setup_work(struct task_struct *p) {} +static bool task_is_throttled(struct task_struct *p) { return false; } static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { -- Gitee From 769e28745e081597619fba825fa9dcfbcc10b6a2 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 29 Aug 2025 16:11:18 +0800 Subject: [PATCH 3/9] sched/fair: Switch to task based throttle model ANBZ: #32935 commit e1fad12dcb66b7f35573c52b665830a1538f9886 upstream. In the current throttle model, when a cfs_rq is throttled, its entity will be dequeued from cpu's rq, making tasks attached to it not able to run, thus achieving the throttle target.
This has a drawback though: assume a task is a reader of percpu_rwsem and is waiting. When it gets woken, it cannot run until its task group's next period comes, which can be a relatively long time. A waiting writer will have to wait longer due to this, and it also causes further readers to build up and eventually triggers a hung task. To improve this situation, change the throttle model to task based, i.e. when a cfs_rq is throttled, record its throttled status but do not remove it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when they get picked, add a task work to them so that when they return to user, they can be dequeued there. In this way, throttled tasks will not hold any kernel resources. And on unthrottle, enqueue back those tasks so they can continue to run. Throttled cfs_rq's PELT clock is handled differently now: previously the cfs_rq's PELT clock was stopped once it entered throttled state, but since tasks (in kernel mode) can now continue to run, change the behaviour to stop the PELT clock when the throttled cfs_rq has no tasks left.
Suggested-by: Chengming Zhou # tag on pick Signed-off-by: Valentin Schneider Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Tested-by: Valentin Schneider Tested-by: Chen Yu Tested-by: Matteo Martelli Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250829081120.806-4-ziqianlu@bytedance.com Signed-off-by: Peng Wang --- kernel/sched/fair.c | 397 +++++++++++++++++++------------------------ kernel/sched/pelt.h | 5 +- kernel/sched/sched.h | 3 +- 3 files changed, 180 insertions(+), 225 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index be91e9286eb3..3291e64aa8ed 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5917,18 +5917,23 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (cfs_rq->nr_queued == 1) { check_enqueue_throttle(cfs_rq); - if (!throttled_hierarchy(cfs_rq)) { - list_add_leaf_cfs_rq(cfs_rq); - } else { + list_add_leaf_cfs_rq(cfs_rq); #ifdef CONFIG_CFS_BANDWIDTH + if (throttled_hierarchy(cfs_rq)) { struct rq *rq = rq_of(cfs_rq); if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) cfs_rq->throttled_clock = rq_clock(rq); if (!cfs_rq->throttled_clock_self) cfs_rq->throttled_clock_self = rq_clock(rq); -#endif + + if (cfs_rq->pelt_clock_throttled) { + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; + cfs_rq->pelt_clock_throttled = 0; + } } +#endif } } @@ -5988,8 +5993,6 @@ static void set_delayed(struct sched_entity *se) cfs_rq->h_nr_expellee -= expellee_delta; } #endif - if (cfs_rq_throttled(cfs_rq)) - break; } } @@ -6033,8 +6036,6 @@ static void __clear_delayed(struct sched_entity *se, bool keep_nr_tasks) cfs_rq->h_nr_expellee += expellee_delta; } #endif - if (cfs_rq_throttled(cfs_rq)) - break; } } @@ -6125,8 +6126,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & DEQUEUE_DELAYED) finish_delayed_dequeue_entity(se); - if (cfs_rq->nr_queued == 0) + if (cfs_rq->nr_queued == 0) { 
update_idle_cfs_rq_clock_pelt(cfs_rq); +#ifdef CONFIG_CFS_BANDWIDTH + if (throttled_hierarchy(cfs_rq)) { + struct rq *rq = rq_of(cfs_rq); + + list_del_leaf_cfs_rq(cfs_rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); + cfs_rq->pelt_clock_throttled = 1; + } +#endif + } return true; } @@ -6483,6 +6494,10 @@ static void throttle_cfs_rq_work(struct callback_head *work) WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node)); dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL); list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); + /* + * Must not set throttled before dequeue or dequeue will + * mistakenly regard this task as an already throttled one. + */ p->throttled = true; resched_curr(rq); } @@ -6496,32 +6511,124 @@ void init_cfs_throttle_work(struct task_struct *p) INIT_LIST_HEAD(&p->throttle_node); } +/* + * Task is throttled and someone wants to dequeue it again: + * it could be sched/core when core needs to do things like + * task affinity change, task group change, task sched class + * change etc. and in these cases, DEQUEUE_SLEEP is not set; + * or the task is blocked after throttled due to freezer etc. + * and in these cases, DEQUEUE_SLEEP is set. + */ +static void detach_task_cfs_rq(struct task_struct *p); +static void dequeue_throttled_task(struct task_struct *p, int flags) +{ + WARN_ON_ONCE(p->se.on_rq); + list_del_init(&p->throttle_node); + + /* task blocked after throttled */ + if (flags & DEQUEUE_SLEEP) { + p->throttled = false; + return; + } + + /* + * task is migrating off its old cfs_rq, detach + * the task's load from its old cfs_rq. 
+ */ + if (task_on_rq_migrating(p)) + detach_task_cfs_rq(p); +} + +static bool enqueue_throttled_task(struct task_struct *p) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); + + /* @p should have gone through dequeue_throttled_task() first */ + WARN_ON_ONCE(!list_empty(&p->throttle_node)); + + /* + * If the throttled task @p is enqueued to a throttled cfs_rq, + * take the fast path by directly putting the task on the + * target cfs_rq's limbo list. + * + * Do not do that when @p is current because the following race can + * cause @p's group_node to be incorrectly re-inserted in its rq's + * cfs_tasks list, despite being throttled: + * + * cpuX cpuY + * p ret2user + * throttle_cfs_rq_work() sched_move_task(p) + * LOCK task_rq_lock + * dequeue_task_fair(p) + * UNLOCK task_rq_lock + * LOCK task_rq_lock + * task_current(p) == true + * task_on_rq_queued(p) == true + * dequeue_task(p) + * put_prev_task(p) + * sched_change_group() + * enqueue_task(p) -> p's new cfs_rq + * is throttled, go + * fast path and skip + * actual enqueue + * set_next_task(p) + * list_move(&se->group_node, &rq->cfs_tasks); // bug + * schedule() + * + * In the above race case, @p current cfs_rq is in the same rq as + * its previous cfs_rq because sched_move_task() only moves a task + * to a different group from the same rq, so we can use its current + * cfs_rq to derive rq and test if the task is current. 
+ */ + if (throttled_hierarchy(cfs_rq) && + !task_current(rq_of(cfs_rq), p)) { + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); + return true; + } + + /* we can't take the fast path, do an actual enqueue */ + p->throttled = false; + return false; +} + +static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags); static int tg_unthrottle_up(struct task_group *tg, void *data) { struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct task_struct *p, *tmp; - cfs_rq->throttle_count--; - if (!cfs_rq->throttle_count) { + if (--cfs_rq->throttle_count) + return 0; + + if (cfs_rq->pelt_clock_throttled) { cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - cfs_rq->throttled_clock_pelt; + cfs_rq->pelt_clock_throttled = 0; + } - /* Add cfs_rq with load or one or more already running entities to the list */ - if (!cfs_rq_is_decayed(cfs_rq)) - list_add_leaf_cfs_rq(cfs_rq); + if (cfs_rq->throttled_clock_self) { + u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; - if (cfs_rq->throttled_clock_self) { - u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; + cfs_rq->throttled_clock_self = 0; - cfs_rq->throttled_clock_self = 0; + if (WARN_ON_ONCE((s64)delta < 0)) + delta = 0; - if (SCHED_WARN_ON((s64)delta < 0)) - delta = 0; + cfs_rq->throttled_clock_self_time += delta; + } - cfs_rq->throttled_clock_self_time += delta; - } + /* Re-enqueue the tasks that have been throttled at this level. 
*/ + list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) { + list_del_init(&p->throttle_node); + p->throttled = false; + enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP); } + /* Add cfs_rq with load or one or more already running entities to the list */ + if (!cfs_rq_is_decayed(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); + return 0; } @@ -6550,17 +6657,24 @@ static int tg_throttle_down(struct task_group *tg, void *data) struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + if (cfs_rq->throttle_count++) + return 0; + /* group is entering throttled state, stop time */ - if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); + SCHED_WARN_ON(cfs_rq->throttled_clock_self); + if (cfs_rq->nr_queued) + cfs_rq->throttled_clock_self = rq_clock(rq); + else { + /* + * For cfs_rqs that still have entities enqueued, PELT clock + * stop happens at dequeue time when all entities are dequeued. + */ list_del_leaf_cfs_rq(cfs_rq); - - SCHED_WARN_ON(cfs_rq->throttled_clock_self); - if (cfs_rq->nr_queued) - cfs_rq->throttled_clock_self = rq_clock(rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); + cfs_rq->pelt_clock_throttled = 1; } - cfs_rq->throttle_count++; + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list)); return 0; } @@ -6568,12 +6682,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct sched_entity *se; - long queued_delta, runnable_delta, idle_delta, dequeue = 1; -#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_GROUP_IDENTITY) - long expeller_delta; - long expellee_delta; -#endif + int dequeue = 1; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -6596,90 +6705,10 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) if (!dequeue) return false; /* Throttle no longer required. 
*/ - se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - /* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - - queued_delta = cfs_rq->h_nr_queued; - runnable_delta = cfs_rq->h_nr_runnable; - idle_delta = cfs_rq->h_nr_idle; -#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_GROUP_IDENTITY) - if (sched_core_enabled(rq)) { - expeller_delta = cfs_rq->h_nr_expeller; - expellee_delta = cfs_rq->h_nr_expellee; - } -#endif - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - int flags; - - /* throttled entity or throttle-on-deactivate */ - if (!se->on_rq) - goto done; - - if (se->my_q != cfs_rq) - cgroup_idle_start(se); - - /* - * Abuse SPECIAL to avoid delayed dequeue in this instance. - * This avoids teaching dequeue_entities() about throttled - * entities and keeps things relatively simple. - */ - flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL; - if (se->sched_delayed) - flags |= DEQUEUE_DELAYED; - dequeue_entity(qcfs_rq, se, flags); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued -= queued_delta; - qcfs_rq->h_nr_runnable -= runnable_delta; - qcfs_rq->h_nr_idle -= idle_delta; -#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_GROUP_IDENTITY) - if (sched_core_enabled(rq)) { - qcfs_rq->h_nr_expeller -= expeller_delta; - qcfs_rq->h_nr_expellee -= expellee_delta; - } -#endif - - if (qcfs_rq->load.weight) { - /* Avoid re-evaluating load for this entity: */ - se = parent_entity(se); - break; - } - } - - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - /* throttled entity or throttle-on-deactivate */ - if (!se->on_rq) - goto done; - - update_load_avg(qcfs_rq, se, 0); - se_update_runnable(se); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued -= queued_delta; - qcfs_rq->h_nr_runnable -= runnable_delta; - qcfs_rq->h_nr_idle -= 
idle_delta; -#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_GROUP_IDENTITY) - if (sched_core_enabled(rq)) { - qcfs_rq->h_nr_expeller -= expeller_delta; - qcfs_rq->h_nr_expellee -= expellee_delta; - } -#endif - } - - /* At this point se is NULL and we are at root level*/ - sub_nr_running(rq, queued_delta); - -done: /* * Note: distribution will already see us throttled via the * throttled-list. rq->lock protects completion. @@ -6693,15 +6722,22 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) { - struct cfs_rq *bottom_cfs_rq = cfs_rq; struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct sched_entity *se; - long queued_delta, runnable_delta, idle_delta; -#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_GROUP_IDENTITY) - long expeller_delta; - long expellee_delta; -#endif + struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; + + /* + * It's possible we are called with !runtime_remaining due to things + * like user changed quota setting(see tg_set_cfs_bandwidth()) or async + * unthrottled us with a positive runtime_remaining but others still + * running entities consumed that runtime before we reached here. + * + * Anyway, we can't unthrottle this cfs_rq without any runtime remaining + * because any enqueue in tg_unthrottle_up() will immediately trigger a + * throttle, which is not supposed to happen on unthrottle path. 
+ */ + if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0) + return; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -6734,75 +6770,6 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) goto unthrottle_throttle; } - queued_delta = cfs_rq->h_nr_queued; - runnable_delta = cfs_rq->h_nr_runnable; - idle_delta = cfs_rq->h_nr_idle; -#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_GROUP_IDENTITY) - if (sched_core_enabled(rq)) { - expeller_delta = cfs_rq->h_nr_expeller; - expellee_delta = cfs_rq->h_nr_expellee; - } -#endif - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - - /* Handle any unfinished DELAY_DEQUEUE business first. */ - if (se->sched_delayed) { - int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; - - dequeue_entity(qcfs_rq, se, flags); - } else if (se->on_rq) - break; - - if (se->my_q != bottom_cfs_rq) - cgroup_idle_end(se); - enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued += queued_delta; - qcfs_rq->h_nr_runnable += runnable_delta; - qcfs_rq->h_nr_idle += idle_delta; -#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_GROUP_IDENTITY) - if (sched_core_enabled(rq)) { - qcfs_rq->h_nr_expeller += expeller_delta; - qcfs_rq->h_nr_expellee += expellee_delta; - } -#endif - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(qcfs_rq)) - goto unthrottle_throttle; - } - - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - - update_load_avg(qcfs_rq, se, UPDATE_TG); - se_update_runnable(se); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued += queued_delta; - qcfs_rq->h_nr_runnable += runnable_delta; - qcfs_rq->h_nr_idle += idle_delta; -#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_GROUP_IDENTITY) - if (sched_core_enabled(rq)) { - qcfs_rq->h_nr_expeller += expeller_delta; - qcfs_rq->h_nr_expellee += expellee_delta; - } -#endif - - /* end evaluation on 
encountering a throttled cfs_rq */ - if (cfs_rq_throttled(qcfs_rq)) - goto unthrottle_throttle; - } - - /* At this point se is NULL and we are at root level*/ - add_nr_running(rq, queued_delta); - unthrottle_throttle: assert_list_leaf_cfs_rq(rq); @@ -7478,6 +7445,8 @@ static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void task_throttle_setup_work(struct task_struct *p) {} static bool task_is_throttled(struct task_struct *p) { return false; } +static void dequeue_throttled_task(struct task_struct *p, int flags) {} +static bool enqueue_throttled_task(struct task_struct *p) { return false; } static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { @@ -7687,6 +7656,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) int task_new = !(flags & ENQUEUE_WAKEUP); u64 slice = 0; + if (task_is_throttled(p) && enqueue_throttled_task(p)) + return; + /* * The code below (indirectly) updates schedutil which looks at * the cfs_rq utilization to select a frequency. 
@@ -7758,15 +7730,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = 1; - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) { -#ifdef CONFIG_FAIR_GROUP_SCHED - if (cfs_rq->nr_queued == 1) - cgroup_idle_end(se->parent); -#endif - goto enqueue_throttle; - } - flags = ENQUEUE_WAKEUP; } @@ -7794,10 +7757,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = 1; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - goto enqueue_throttle; } /* At this point se is NULL and we are at root level*/ @@ -7819,8 +7778,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) */ if (!task_new) check_update_overutilized_status(rq); - -enqueue_throttle: assert_list_leaf_cfs_rq(rq); hrtick_update(rq); @@ -7897,15 +7854,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = h_nr_queued; - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) { -#ifdef CONFIG_FAIR_GROUP_SCHED - if (!cfs_rq->nr_queued) - cgroup_idle_start(se->parent); -#endif - return 0; - } - /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { slice = cfs_rq_slice(cfs_rq); @@ -7948,10 +7896,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = h_nr_queued; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - return 0; } sub_nr_running(rq, h_nr_queued); @@ -7987,6 +7931,11 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct sched_entity *se = &p->se; + if (task_is_throttled(p)) { + dequeue_throttled_task(p, flags); + return true; + } + if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) { struct 
cfs_rq *cfs_rq; @@ -9892,6 +9841,8 @@ static struct task_struct *pick_task_fair(struct rq *rq) { struct sched_entity *se; struct cfs_rq *cfs_rq; + struct task_struct *p; + bool throttled; update_rq_on_expel(rq); push_expellee(rq); @@ -9903,13 +9854,14 @@ static struct task_struct *pick_task_fair(struct rq *rq) if (expellee_only_rq(rq)) return NULL; + throttled = false; + do { /* Might not have done put_prev_entity() */ if (cfs_rq->curr && cfs_rq->curr->on_rq) update_curr(cfs_rq); - if (unlikely(check_cfs_rq_runtime(cfs_rq))) - goto again; + throttled |= check_cfs_rq_runtime(cfs_rq); se = pick_next_entity(rq, cfs_rq); if (!se) @@ -9917,7 +9869,10 @@ static struct task_struct *pick_task_fair(struct rq *rq) cfs_rq = group_cfs_rq(se); } while (cfs_rq); - return task_of(se); + p = task_of(se); + if (unlikely(throttled)) + task_throttle_setup_work(p); + return p; } static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 3a0e0dc28721..ea24fdc9d128 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -157,7 +157,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { u64 throttled; - if (unlikely(cfs_rq->throttle_count)) + if (unlikely(cfs_rq->pelt_clock_throttled)) throttled = U64_MAX; else throttled = cfs_rq->throttled_clock_pelt_time; @@ -168,7 +168,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) /* rq->task_clock normalized against any time this cfs_rq has spent throttled */ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { - if (unlikely(cfs_rq->throttle_count)) + if (unlikely(cfs_rq->pelt_clock_throttled)) return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time; return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; @@ -232,4 +232,3 @@ update_idle_rq_clock_pelt(struct rq *rq) { } static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } #endif - diff --git 
a/kernel/sched/sched.h b/kernel/sched/sched.h index a9a8c6f8acc3..23339fcb866a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -956,7 +956,8 @@ struct cfs_rq { u64 throttled_clock_pelt_time; u64 throttled_clock_self; u64 throttled_clock_self_time; - int throttled; + bool throttled:1; + bool pelt_clock_throttled:1; int throttle_count; struct list_head throttled_list; #ifdef CONFIG_SMP -- Gitee From 292139a8dfafe9791771a132cd1cd7fcf1fda897 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Fri, 29 Aug 2025 16:11:19 +0800 Subject: [PATCH 4/9] sched/fair: Task based throttle time accounting ANBZ: #32935 commit eb962f251fbba251a0d34897d6170f7616d70c52 upstream. With task based throttle model, the previous way to check cfs_rq's nr_queued to decide if throttled time should be accounted doesn't work as expected, e.g. when a cfs_rq which has a single task is throttled, that task could later block in kernel mode instead of being dequeued on limbo list and accounting this as throttled time is not accurate. Rework throttle time accounting for a cfs_rq as follows: - start accounting when the first task gets throttled in its hierarchy; - stop accounting on unthrottle. Note that there will be a time gap between when a cfs_rq is throttled and when a task in its hierarchy is actually throttled. This accounting mechanism only starts accounting in the latter case. 
Suggested-by: Chengming Zhou # accounting mechanism Co-developed-by: K Prateek Nayak # simplify implementation Signed-off-by: K Prateek Nayak Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Tested-by: Valentin Schneider Tested-by: Matteo Martelli Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250829081120.806-5-ziqianlu@bytedance.com Signed-off-by: Peng Wang --- kernel/sched/fair.c | 55 +++++++++++++++++++++++++------------------- kernel/sched/sched.h | 1 + 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3291e64aa8ed..5b74ef8f727e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5919,19 +5919,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) check_enqueue_throttle(cfs_rq); list_add_leaf_cfs_rq(cfs_rq); #ifdef CONFIG_CFS_BANDWIDTH - if (throttled_hierarchy(cfs_rq)) { + if (cfs_rq->pelt_clock_throttled) { struct rq *rq = rq_of(cfs_rq); - if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) - cfs_rq->throttled_clock = rq_clock(rq); - if (!cfs_rq->throttled_clock_self) - cfs_rq->throttled_clock_self = rq_clock(rq); - - if (cfs_rq->pelt_clock_throttled) { - cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - - cfs_rq->throttled_clock_pelt; - cfs_rq->pelt_clock_throttled = 0; - } + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; + cfs_rq->pelt_clock_throttled = 0; } #endif } @@ -6068,7 +6061,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * DELAY_DEQUEUE relies on spurious wakeups, special task * states must not suffer spurious wakeups, excempt them. 
*/ - if (flags & DEQUEUE_SPECIAL) + if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE)) delay = false; SCHED_WARN_ON(delay && se->sched_delayed); @@ -6492,7 +6485,7 @@ static void throttle_cfs_rq_work(struct callback_head *work) rq = scope.rq; update_rq_clock(rq); WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node)); - dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL); + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE); list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); /* * Must not set throttled before dequeue or dequeue will @@ -6652,6 +6645,17 @@ static inline void task_throttle_setup_work(struct task_struct *p) task_work_add(p, &p->sched_throttle_work, TWA_RESUME); } +static void record_throttle_clock(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + + if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) + cfs_rq->throttled_clock = rq_clock(rq); + + if (!cfs_rq->throttled_clock_self) + cfs_rq->throttled_clock_self = rq_clock(rq); +} + static int tg_throttle_down(struct task_group *tg, void *data) { struct rq *rq = data; @@ -6660,20 +6664,17 @@ static int tg_throttle_down(struct task_group *tg, void *data) if (cfs_rq->throttle_count++) return 0; - /* group is entering throttled state, stop time */ - SCHED_WARN_ON(cfs_rq->throttled_clock_self); - if (cfs_rq->nr_queued) - cfs_rq->throttled_clock_self = rq_clock(rq); - else { - /* - * For cfs_rqs that still have entities enqueued, PELT clock - * stop happens at dequeue time when all entities are dequeued. - */ + /* + * For cfs_rqs that still have entities enqueued, PELT clock + * stop happens at dequeue time when all entities are dequeued. 
+ */ + if (!cfs_rq->nr_queued) { list_del_leaf_cfs_rq(cfs_rq); cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); cfs_rq->pelt_clock_throttled = 1; } + WARN_ON_ONCE(cfs_rq->throttled_clock_self); WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list)); return 0; } @@ -6715,8 +6716,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) */ cfs_rq->throttled = 1; SCHED_WARN_ON(cfs_rq->throttled_clock); - if (cfs_rq->nr_queued) - cfs_rq->throttled_clock = rq_clock(rq); return true; } @@ -7447,6 +7446,7 @@ static void task_throttle_setup_work(struct task_struct *p) {} static bool task_is_throttled(struct task_struct *p) { return false; } static void dequeue_throttled_task(struct task_struct *p, int flags) {} static bool enqueue_throttled_task(struct task_struct *p) { return false; } +static void record_throttle_clock(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { @@ -7802,6 +7802,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) bool was_sched_idle = sched_idle_rq(rq); bool task_sleep = flags & DEQUEUE_SLEEP; bool task_delayed = flags & DEQUEUE_DELAYED; + bool task_throttled = flags & DEQUEUE_THROTTLE; struct task_struct *p = NULL; int h_nr_idle = 0; int h_nr_queued = 0; @@ -7854,6 +7855,9 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = h_nr_queued; + if (throttled_hierarchy(cfs_rq) && task_throttled) + record_throttle_clock(cfs_rq); + /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { slice = cfs_rq_slice(cfs_rq); @@ -7896,6 +7900,9 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = h_nr_queued; + + if (throttled_hierarchy(cfs_rq) && task_throttled) + record_throttle_clock(cfs_rq); } sub_nr_running(rq, h_nr_queued); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 23339fcb866a..86c510424ccc 100644 --- 
a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2745,6 +2745,7 @@ extern const u32 sched_prio_to_wmult[40]; #define DEQUEUE_SPECIAL 0x10 #define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ #define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ +#define DEQUEUE_THROTTLE 0x800 #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 -- Gitee From fdf8ea8766cf38a29de7517907c2397ee2ee2e89 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Fri, 29 Aug 2025 16:11:20 +0800 Subject: [PATCH 5/9] sched/fair: Get rid of throttled_lb_pair() ANBZ: #32935 commit 5b726e9bf9544a349090879a513a5e00da486c14 upstream. Now that throttled tasks are dequeued and can not stay on rq's cfs_tasks list, there is no need to take special care of these throttled tasks anymore in load balance. Suggested-by: K Prateek Nayak Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Tested-by: Valentin Schneider Tested-by: Matteo Martelli Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250829081120.806-6-ziqianlu@bytedance.com Signed-off-by: Peng Wang --- kernel/sched/fair.c | 33 +++------------------------------ 1 file changed, 3 insertions(+), 30 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5b74ef8f727e..3d1a2b8d8c7b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6428,23 +6428,6 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) return cfs_bandwidth_used() && cfs_rq->throttle_count; } -/* - * Ensure that neither of the group entities corresponding to src_cpu or - * dest_cpu are members of a throttled hierarchy when performing group - * load-balance operations. 
- */ -static inline int throttled_lb_pair(struct task_group *tg, - int src_cpu, int dest_cpu) -{ - struct cfs_rq *src_cfs_rq, *dest_cfs_rq; - - src_cfs_rq = tg->cfs_rq[src_cpu]; - dest_cfs_rq = tg->cfs_rq[dest_cpu]; - - return throttled_hierarchy(src_cfs_rq) || - throttled_hierarchy(dest_cfs_rq); -} - static inline bool task_is_throttled(struct task_struct *p) { return cfs_bandwidth_used() && p->throttled; @@ -7458,12 +7441,6 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) return 0; } -static inline int throttled_lb_pair(struct task_group *tg, - int src_cpu, int dest_cpu) -{ - return 0; -} - #ifdef CONFIG_FAIR_GROUP_SCHED void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {} static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -10382,17 +10359,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * We do not migrate tasks that are: * 1) delayed dequeued unless we migrate load, or - * 2) throttled_lb_pair, or - * 3) cannot be migrated to this CPU due to cpus_ptr, or - * 4) running (obviously), or - * 5) are cache-hot on their current CPU. + * 2) cannot be migrated to this CPU due to cpus_ptr, or + * 3) running (obviously), or + * 4) are cache-hot on their current CPU. */ if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) return 0; - if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) - return 0; - /* * We want to prioritize the migration of eligible tasks. * For ineligible tasks we soft-limit them and only allow -- Gitee From 666e82f97f40126877989ae179ad72a819a201c2 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Wed, 10 Sep 2025 17:50:41 +0800 Subject: [PATCH 6/9] sched/fair: Propagate load for throttled cfs_rq ANBZ: #32935 commit fe8d238e646e16cc431b7a5899f8dda690258ee9 upstream. Before task based throttle model, propagating load will stop at a throttled cfs_rq and that propagate will happen on unthrottle time by update_load_avg(). 
Now that there is no update_load_avg() on unthrottle for throttled cfs_rq and all load tracking is done by task related operations, let the propagate happen immediately. While at it, add a comment to explain why cfs_rqs that are not affected by throttle have to be added to leaf cfs_rq list in propagate_entity_cfs_rq() per my understanding of commit 0258bdfaff5b ("sched/fair: Fix unfairness caused by missing load decay"). Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chengming Zhou Signed-off-by: Peng Wang --- kernel/sched/fair.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3d1a2b8d8c7b..7991fadcff6a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6422,6 +6422,11 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) return cfs_bandwidth_used() && cfs_rq->throttled; } +static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq) +{ + return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled; +} + /* check whether cfs_rq, or any parent, is throttled */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { @@ -7436,6 +7441,11 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) return 0; } +static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq) +{ + return false; +} + static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { return 0; @@ -14275,10 +14285,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq_throttled(cfs_rq)) - return; - - if (!throttled_hierarchy(cfs_rq)) + /* + * If a task gets attached to this cfs_rq and before being queued, + * it gets migrated to another CPU due to reasons like affinity + * change, make sure this cfs_rq stays on leaf cfs_rq list to have + * that removed load decayed or it can cause faireness problem. 
+ */ + if (!cfs_rq_pelt_clock_throttled(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); /* Start to propagate at parent */ @@ -14289,10 +14302,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) update_load_avg(cfs_rq, se, UPDATE_TG); - if (cfs_rq_throttled(cfs_rq)) - break; - - if (!throttled_hierarchy(cfs_rq)) + if (!cfs_rq_pelt_clock_throttled(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); } } -- Gitee From 479dbc18f75b80e66f39587f5c0dec81ed5ddda1 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Wed, 10 Sep 2025 17:50:42 +0800 Subject: [PATCH 7/9] sched/fair: update_cfs_group() for throttled cfs_rqs ANBZ: #32935 commit fcd394866e3db344cbe0bb485d7e3f741ac07245 upstream. With task based throttle model, tasks in a throttled hierarchy are allowed to continue to run if they are running in kernel mode. For this reason, PELT clock is not stopped for these cfs_rqs in throttled hierarchy when they still have tasks running or queued. Since PELT clock is not stopped, whether to allow update_cfs_group() doing its job for cfs_rqs which are in throttled hierarchy but still have tasks running/queued is a question. The good side is, continue to run update_cfs_group() can get these cfs_rq entities with an up2date weight and that up2date weight can be useful to derive an accurate load for the CPU as well as ensure fairness if multiple tasks of different cgroups are running on the same CPU. OTOH, as Benjamin Segall pointed out: when unthrottle comes around the most likely correct distribution is the distribution we had at the time of throttle. In reality, either way may not matter that much if tasks in throttled hierarchy don't run in kernel mode for too long. But in case that happens, let these cfs_rq entities have an up2date weight seems a good thing to do.
Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Peng Wang --- kernel/sched/fair.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7991fadcff6a..0c0bc464c940 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4555,9 +4555,6 @@ static void update_cfs_group(struct sched_entity *se) if (!gcfs_rq || !gcfs_rq->load.weight) return; - if (throttled_hierarchy(gcfs_rq)) - return; - #ifndef CONFIG_SMP shares = READ_ONCE(gcfs_rq->tg->shares); #else -- Gitee From 64514537209d2d67db2bab157426c2486a327c20 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Wed, 10 Sep 2025 17:50:43 +0800 Subject: [PATCH 8/9] sched/fair: Do not special case tasks in throttled hierarchy ANBZ: #32935 commit 253b3f587241967a97a971e23b1e2a7d74244fad upstream. With the introduction of task based throttle model, task in a throttled hierarchy is allowed to continue to run till it gets throttled on its ret2user path. For this reason, remove those throttled_hierarchy() checks in the following functions so that those tasks can get their turn as normal tasks: dequeue_entities(), check_preempt_wakeup_fair() and yield_to_task_fair(). The benefit of doing it this way is: if those tasks get the chance to run earlier and if they hold any kernel resources, they can release those resources earlier. The downside is, if they don't hold any kernel resources, all they can do is to throttle themselves on their way back to user space so the favor to let them run seems not that useful and for check_preempt_wakeup_fair(), that favor may be bad for curr. K Prateek Nayak pointed out prio_changed_fair() can send a throttled task to check_preempt_wakeup_fair(), further tests showed the affinity change path from move_queued_task() can also send a throttled task to check_preempt_wakeup_fair(), that's why the check of task_is_throttled() in that function.
Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Peng Wang --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0c0bc464c940..a7096584666c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7852,7 +7852,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. */ - if (task_sleep && se && !throttled_hierarchy(cfs_rq)) + if (task_sleep && se) set_next_buddy(se); break; } @@ -9504,7 +9504,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. */ - if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) + if (task_is_throttled(p)) return; if (sched_feat(NEXT_BUDDY) && @@ -10026,8 +10026,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; - /* throttled hierarchies are not runnable */ - if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) + /* !se->on_rq also covers throttled task */ + if (!se->on_rq) return false; /* Tell the scheduler that we'd really like pse to run next. */ -- Gitee From 4bc1952be74494208d6ae7118045000daf840bc4 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Fri, 12 Sep 2025 11:44:28 +0800 Subject: [PATCH 9/9] sched/fair: Do not balance task to a throttled cfs_rq ANBZ: #32935 commit 0d4eaf8caf8cd633b23e949e2996b420052c2d45 upstream. When doing load balance and the target cfs_rq is in throttled hierarchy, whether to allow balancing there is a question. The good side to allow balancing is: if the target CPU is idle or less loaded and the being balanced task is holding some kernel resources, then it seems a good idea to balance the task there and let the task get the CPU earlier and release kernel resources sooner. 
The bad part is, if the task is not holding any kernel resources, then the balance seems not that useful. While theoretically it's debatable, a performance test[0] which involves 200 cgroups and each cgroup runs hackbench(20 sender, 20 receiver) in pipe mode showed a performance degradation on AMD Genoa when allowing load balance to throttled cfs_rq. Analysis[1] showed hackbench doesn't like task migration across LLC boundary. For this reason, add a check in can_migrate_task() to forbid balancing to a cfs_rq that is in throttled hierarchy. This reduced task migration a lot and performance restored. [0]: https://lore.kernel.org/lkml/20250822110701.GB289@bytedance/ [1]: https://lore.kernel.org/lkml/20250903101102.GB42@bytedance/ Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Signed-off-by: Peng Wang --- kernel/sched/fair.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a7096584666c..ec8c15ec7a18 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6430,6 +6430,11 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) return cfs_bandwidth_used() && cfs_rq->throttle_count; } +static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) +{ + return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]); +} + static inline bool task_is_throttled(struct task_struct *p) { return cfs_bandwidth_used() && p->throttled; @@ -7448,6 +7453,11 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) return 0; } +static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) +{ + return 0; +} + #ifdef CONFIG_FAIR_GROUP_SCHED void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {} static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -10366,13 +10376,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * We do not migrate tasks that are: * 1) 
delayed dequeued unless we migrate load, or - * 2) cannot be migrated to this CPU due to cpus_ptr, or - * 3) running (obviously), or - * 4) are cache-hot on their current CPU. + * 2) target cfs_rq is in throttled hierarchy, or + * 3) cannot be migrated to this CPU due to cpus_ptr, or + * 4) running (obviously), or + * 5) are cache-hot on their current CPU. */ if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) return 0; + if (lb_throttled_hierarchy(p, env->dst_cpu)) + return 0; + /* * We want to prioritize the migration of eligible tasks. * For ineligible tasks we soft-limit them and only allow -- Gitee