* Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> wrote:
> Ingo,
> Here's an update of the group fairness patch I have been
> working on. It's against CFS v16 (sched-cfs-v2.6.22-rc4-mm2-v16.patch).
thanks!
> The core idea is to reuse much of the CFS logic to apply fairness at
> higher hierarchical levels (user, container etc). To this end, the CFS
> engine has been modified to deal with generic 'schedulable entities'.
> The patches introduce two essential structures in the CFS core:
>
> - struct sched_entity
>     - represents a schedulable entity in a hierarchy. A task
>       is the lowest element in this hierarchy; its ancestors
>       could be a user, a container etc. This structure stores the
>       essential attributes/execution history (wait_runtime etc)
>       required by the CFS engine to provide fairness between
>       'struct sched_entities' at the same level of the hierarchy.
>
> - struct lrq
>     - represents a (per-cpu) runqueue on which ready-to-run
>       'struct sched_entities' are queued. The fair clock
>       calculation is split to be per 'struct lrq'.
>
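A minimal sketch of what such a per-cpu 'struct lrq' could contain is
below; the field names and layout are assumptions for illustration, not
the actual contents of the patches:

  /*
   * Illustrative sketch only: a per-CPU "local runqueue" as described
   * above.  It owns the rbtree of ready-to-run sched_entities and keeps
   * its own fair-clock state, so fairness can be computed per hierarchy
   * level.  Field names are assumptions, not the patch's actual layout.
   */
  struct lrq {
          unsigned long   raw_weighted_load; /* sum of queued entities' load */
          unsigned long   nr_running;        /* number of queued entities    */

          u64             fair_clock;        /* per-lrq fair clock           */
          u64             exec_clock;        /* total exec time on this lrq  */

          struct rb_root  tasks_timeline;    /* rbtree of sched_entities,    */
          struct rb_node  *rb_leftmost;      /* keyed by se->fair_key        */
  };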
> Here's a brief description of the patches to follow:
>
> Patches 1-3 introduce the essential changes in CFS core to support
> this concept. They rework existing code w/o any (intended!) change in
> functionality.
i currently have these 3 patches applied to the CFS queue and it's
looking pretty good so far! If it continues to be problem-free i'll
release them as part of -v17, just to check that they truly have no bad
side-effects (they shouldn't). Then #4 can go into -v18.
i've attached my current -v17 tree - it should apply mostly cleanly
on top of the -mm queue (with a small number of fixups). Could you
refactor the remaining 3 patches on top of this base? There are some
rejects in the last 3 patches due to the update_load_fair() change.
> Patch 4 fixes some bad interaction between SCHED_RT and SCHED_NORMAL
> tasks in current CFS.
btw., the plan here is to turn off 'bit 0' in sched_features: i.e. to
use the precise statistics to calculate lrq->cpu_load[], not the
imprecise timer-irq-sampled statistics. Dmitry has fixed a couple of
bugs in that code which made it not work too well in previous CFS
versions, so we are now ready to enable the precise mode for -v17.
(indeed in my tree it's already enabled - i.e. sched_features defaults
to '14', where bit 0 is clear)
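A hypothetical sketch of such a feature-bit test, mirroring the
existing 'sysctl_sched_features & 128' checks visible in the patch
below; only the bit-0 meaning comes from the description above, the
rest is an assumption for illustration:

  /*
   * Illustration only (not the actual kernel code): sched_features is a
   * bitmask of feature flags.  With the -v17 default of 14 (binary 1110)
   * bit 0 is clear, which selects the precise-statistics path when
   * updating lrq->cpu_load[].
   */
  extern unsigned int sysctl_sched_features;

  static inline int cpu_load_uses_precise_stats(void)
  {
          /* bit 0 set:   timer-irq-sampled (imprecise) cpu_load[] */
          /* bit 0 clear: cpu_load[] from CFS's precise statistics */
          return !(sysctl_sched_features & 1);
  }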
> Patch 5 introduces basic changes in CFS core to support group
> fairness.
>
> Patch 6 hooks up scheduler with container patches in mm (as an
> interface for task-grouping functionality).
ok. Kirill, how do you like Srivatsa's current approach? Would be nice
to kill two birds with the same stone, if possible :-)
> Note: I have noticed that running lat_ctx in a loop 10 times
> doesn't give me good results. Basically I expected the loop to take the
> same time for both users (when run simultaneously), whereas it was
> taking different times for different users. I think this can be solved
> by increasing sysctl_sched_runtime_limit at the group level (to remember
> execution history over a longer period).
you'll get the best hackbench results by using SCHED_BATCH:
  chrt -b 0 ./hackbench 10
or indeed increasing the runtime_limit would work too.
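For a benchmark that wants to request SCHED_BATCH itself instead of
being wrapped by chrt, a minimal userspace sketch (roughly what
'chrt -b 0' arranges for its child command):

  /* Put the calling process into SCHED_BATCH; the priority must be 0
   * for non-realtime policies.  SCHED_BATCH needs _GNU_SOURCE with glibc. */
  #define _GNU_SOURCE
  #include <sched.h>
  #include <stdio.h>

  int main(void)
  {
          struct sched_param sp = { .sched_priority = 0 };

          if (sched_setscheduler(0, SCHED_BATCH, &sp) == -1) {
                  perror("sched_setscheduler");
                  return 1;
          }
          /* ... run the benchmark loop here ... */
          return 0;
  }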
Ingo
Index: linux/Makefile
===================================================================
--- linux.orig/Makefile
+++ linux/Makefile
@@ -1,7 +1,7 @@
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 21
-EXTRAVERSION = .4-cfs-v16
+EXTRAVERSION = .4-cfs-v17
NAME = Nocturnal Monster Puppy
# *DOCUMENTATION*
Index: linux/fs/proc/array.c
===================================================================
--- linux.orig/fs/proc/array.c
+++ linux/fs/proc/array.c
@@ -319,7 +319,7 @@ static clock_t task_utime(struct task_st
* Use CFS's precise accounting, if available:
*/
if (!(sysctl_sched_features & 128)) {
- u64 temp = (u64)nsec_to_clock_t(p->sum_exec_runtime);
+ u64 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
if (total) {
temp *= utime;
@@ -341,7 +341,7 @@ static clock_t task_stime(struct task_st
* by userspace grows monotonically - apps rely on that):
*/
if (!(sysctl_sched_features & 128))
- stime = nsec_to_clock_t(p->sum_exec_runtime) - task_utime(p);
+ stime = nsec_to_clock_t(p->se.sum_exec_runtime) - task_utime(p);
return stime;
}
Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -534,8 +534,7 @@ struct signal_struct {
#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
#define rt_task(p) rt_prio((p)->prio)
-#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
-#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH)
+#define is_rt_policy(p) ((p) == SCHED_FIFO || (p) == SCHED_RR)
#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy))
/*
@@ -819,6 +818,29 @@ struct sched_class {
void (*task_new) (struct rq *rq, struct task_struct *p);
};
+/* CFS stats for a schedulable entity (task, task-group etc) */
+struct sched_entity {
+ int load_weight; /* for niceness load balancing purposes */
+ int on_rq;
+ struct rb_node run_node;
+ u64 wait_start_fair;
+ u64 wait_start;
+ u64 exec_start;
+ u64 sleep_start, sleep_start_fair;
+ u64 block_start;
+ u64 sleep_max;
+ u64 block_max;
+ u64 exec_max;
+ u64 wait_max;
+ u64 last_ran;
+
+ s64 wait_runtime;
+ u64 sum_exec_runtime;
+ s64 fair_key;
+ s64 sum_wait_runtime, sum_sleep_runtime;
+ unsigned long wait_runtime_overruns, wait_runtime_underruns;
+};
+
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
struct thread_info *thread_info;
@@ -833,33 +855,15 @@ struct task_struct {
int oncpu;
#endif
#endif
- int load_weight; /* for niceness load balancing purposes */
int prio, static_prio, normal_prio;
- int on_rq;
struct list_head run_list;
- struct rb_node run_node;
+ struct sched_entity se;
unsigned short ioprio;
#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif
- /* CFS scheduling class statistics fields: */
- u64 wait_start_fair;
- u64 wait_start;
- u64 exec_start;
- u64 sleep_start, sleep_start_fair;
- u64 block_start;
- u64 sleep_max;
- u64 block_max;
- u64 exec_max;
- u64 wait_max;
-
- s64 wait_runtime;
- u64 sum_exec_runtime;
- s64 fair_key;
- s64 sum_wait_runtime, sum_sleep_runtime;
- unsigned long wait_runtime_overruns, wait_runtime_underruns;
unsigned long policy;
cpumask_t cpus_allowed;
Index: linux/kernel/exit.c
===================================================================
--- linux.orig/kernel/exit.c
+++ linux/kernel/exit.c
@@ -112,7 +112,7 @@ static void __exit_signal(struct task_st
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
sig->nivcsw += tsk->nivcsw;
- sig->sum_sched_runtime += tsk->sum_exec_runtime;
+ sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig = NULL; /* Marker for below. */
}
Index: linux/kernel/posix-cpu-timers.c
===================================================================
--- linux.orig/kernel/posix-cpu-timers.c
+++ linux/kernel/posix-cpu-timers.c
@@ -249,7 +249,7 @@ static int cpu_clock_sample_group_locked
cpu->sched = p->signal->sum_sched_runtime;
/* Add in each other live thread. */
while ((t = next_thread(t)) != p) {
- cpu->sched += t->sum_exec_runtime;
+ cpu->sched += t->se.sum_exec_runtime;
}
cpu->sched += sched_ns(p);
break;
@@ -467,7 +467,7 @@ static void cleanup_timers(struct list_h
void posix_cpu_timers_exit(struct task_struct *tsk)
{
cleanup_timers(tsk->cpu_timers,
- tsk->utime, tsk->stime, tsk->sum_exec_runtime);
+ tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
}
void posix_cpu_timers_exit_group(struct task_struct *tsk)
@@ -475,7 +475,7 @@ void posix_cpu_timers_exit_group(struct
cleanup_timers(tsk->signal->cpu_timers,
cputime_add(tsk->utime, tsk->signal->utime),
cputime_add(tsk->stime, tsk->signal->stime),
- tsk->sum_exec_runtime + tsk->signal->sum_sched_runtime);
+ tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime);
}
@@ -536,7 +536,7 @@ static void process_timer_rebalance(stru
nsleft = max_t(unsigned long long, nsleft, 1);
do {
if (likely(!(t->flags & PF_EXITING))) {
- ns = t->sum_exec_runtime + nsleft;
+ ns = t->se.sum_exec_runtime + nsleft;
if (t->it_sched_expires == 0 ||
t->it_sched_expires > ns) {
t->it_sched_expires = ns;
@@ -1004,7 +1004,7 @@ static void check_thread_timers(struct t
struct cpu_timer_list *t = list_entry(timers->next,
struct cpu_timer_list,
entry);
- if (!--maxfire || tsk->sum_exec_runtime < t->expires.sched) {
+ if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
tsk->it_sched_expires = t->expires.sched;
break;
}
@@ -1049,7 +1049,7 @@ static void check_process_timers(struct
do {
utime = cputime_add(utime, t->utime);
stime = cputime_add(stime, t->stime);
- sum_sched_runtime += t->sum_exec_runtime;
+ sum_sched_runtime += t->se.sum_exec_runtime;
t = next_thread(t);
} while (t != tsk);
ptime = cputime_add(utime, stime);
@@ -1208,7 +1208,7 @@ static void check_process_timers(struct
t->it_virt_expires = ticks;
}
- sched = t->sum_exec_runtime + sched_left;
+ sched = t->se.sum_exec_runtime + sched_left;
if (sched_expires && (t->it_sched_expires == 0 ||
t->it_sched_expires > sched)) {
t->it_sched_expires = sched;
@@ -1300,7 +1300,7 @@ void run_posix_cpu_timers(struct task_st
if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
(tsk->it_sc
...