This patch reuses the CFS core to provide inter task-group fairness. For
demonstration purposes, the patch also extends CFS to provide per-user
fairness. The patch is very experimental at the moment and, in particular,
SMP LOAD BALANCING IS DISABLED to keep the patch simple. I think group-based
SMP load balancing is trickier, and I intend to look at it next.
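To make the hierarchy concrete: the patch gives each user a per-CPU sched_entity
whose my_q points at that user's own lrq, and each task's entity gets a parent
pointer back to its user's entity (see the sched.h and sched.c hunks below).
Picking the next task then becomes a two-level walk: choose the fairest entity
on the CPU's top-level lrq and, if it is a group entity, descend into its lrq
and choose again. The user-space sketch below models just that walk; the
list-based lrq, pick_fairest() and the fair_key values are simplified stand-ins
for the kernel's rbtree and are not taken from the patch itself.

/*
 * Minimal user-space model of the two-level pick-next walk.
 * "lrq" and "sched_entity" mirror the roles of the structures in this
 * patch, but the fields shown and pick_fairest() are simplified
 * illustrations, not the kernel implementation.
 */
#include <stdio.h>
#include <stddef.h>

struct lrq;

struct sched_entity {
	const char *name;
	long long fair_key;		/* smaller key == more "owed" CPU time */
	struct lrq *my_q;		/* non-NULL for group (user) entities */
	struct sched_entity *next;	/* sibling on the same lrq */
};

struct lrq {
	struct sched_entity *head;	/* entities queued on this runqueue */
};

/* Stand-in for the rbtree leftmost lookup: entity with the smallest key. */
static struct sched_entity *pick_fairest(struct lrq *q)
{
	struct sched_entity *se, *best = NULL;

	for (se = q->head; se; se = se->next)
		if (!best || se->fair_key < best->fair_key)
			best = se;
	return best;
}

/* Two-level walk: top-level lrq -> user's group entity -> user's own lrq. */
static struct sched_entity *pick_next_entity(struct lrq *top)
{
	struct sched_entity *se = pick_fairest(top);

	while (se && se->my_q)		/* descend into group entities */
		se = pick_fairest(se->my_q);
	return se;
}

int main(void)
{
	struct sched_entity t1 = { "uid 500: task A", 10, NULL, NULL };
	struct sched_entity t2 = { "uid 500: task B", 30, NULL, &t1 };
	struct sched_entity t3 = { "uid 501: task C",  5, NULL, NULL };
	struct lrq q500 = { &t2 }, q501 = { &t3 };
	/* Group entities for the two users; uid 500 is currently "fairer". */
	struct sched_entity u500 = { "uid 500", 20, &q500, NULL };
	struct sched_entity u501 = { "uid 501", 40, &q501, &u500 };
	struct lrq top = { &u501 };

	/* uid 500 wins at the top level, then task A wins inside uid 500. */
	printf("next: %s\n", pick_next_entity(&top)->name);
	return 0;
}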
Although user id is chosen as the basis for grouping tasks, the patch can
be adapted to work with other task-grouping mechanisms (for example:
http://lkml.org/lkml/2007/4/27/146).
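The adaptation surface is the small hook API added to include/linux/sched.h
below: sched_alloc_user(), sched_free_user() and sched_move_task(). A different
grouping mechanism would only need to invoke the same three hooks at its own
group-creation, task-move and group-destruction points. The fragment below is a
hedged user-space illustration of that call pattern; the stub bodies and the
stand-in struct user_struct are hypothetical, only the three hook signatures
come from this patch.

/*
 * Sketch of the call pattern an alternative grouping mechanism would
 * follow.  The three sched_*() names match the declarations this patch
 * adds to include/linux/sched.h; everything else here is a hypothetical
 * stand-in, compiled as a user-space model.
 */
#include <stdio.h>

struct user_struct { unsigned int uid; };	/* stand-in for the kernel struct */

/* Stubbed hooks mirroring the patch's API: allocate/free a group's per-CPU
 * queues, and requeue the current task when its group changes. */
static int sched_alloc_user(struct user_struct *user)
{
	printf("alloc per-cpu lrq + sched_entity for group %u\n", user->uid);
	return 0;
}

static void sched_free_user(struct user_struct *user)
{
	printf("free per-cpu lrq + sched_entity of group %u\n", user->uid);
}

static void sched_move_task(struct user_struct *old)
{
	printf("requeue current task away from group %u\n", old->uid);
}

/* Hypothetical grouping mechanism: create a group, move a task into it,
 * tear the group down.  Only the three hooks above are dictated by the patch. */
int main(void)
{
	struct user_struct root = { 0 }, grp = { 1000 };

	if (sched_alloc_user(&grp))	/* group creation */
		return 1;
	sched_move_task(&root);		/* task leaves group 0 for grp */
	sched_free_user(&grp);		/* group destruction */
	return 0;
}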
Signed-off-by: Srivatsa Vaddagiri <vatsa@in.ibm.com>
---
arch/i386/Kconfig | 6
arch/x86_64/Kconfig | 6
include/linux/sched.h | 16
kernel/sched.c | 256 +++++++++++----
kernel/sched_debug.c | 4
kernel/sched_fair.c | 820 ++++++++++++++++++++++++++++++++------------------
kernel/sched_rt.c | 2
kernel/user.c | 5
8 files changed, 753 insertions(+), 362 deletions(-)
Index: linux-2.6.21-rc7/arch/i386/Kconfig
===================================================================
--- linux-2.6.21-rc7.orig/arch/i386/Kconfig 2007-05-23 20:46:38.000000000 +0530
+++ linux-2.6.21-rc7/arch/i386/Kconfig 2007-05-23 20:48:39.000000000 +0530
@@ -307,6 +307,12 @@
making when dealing with multi-core CPU chips at a cost of slightly
increased overhead in some places. If unsure say N here.
+config FAIR_USER_SCHED
+ bool "Fair user scheduler"
+ default n
+ help
+ Fair user scheduler
+
source "kernel/Kconfig.preempt"
config X86_UP_APIC
Index: linux-2.6.21-rc7/arch/x86_64/Kconfig
===================================================================
--- linux-2.6.21-rc7.orig/arch/x86_64/Kconfig 2007-05-23 20:46:38.000000000 +0530
+++ linux-2.6.21-rc7/arch/x86_64/Kconfig 2007-05-23 20:48:39.000000000 +0530
@@ -330,6 +330,12 @@
making when dealing with multi-core CPU chips at a cost of slightly
increased overhead in some places. If unsure say N here.
+config FAIR_USER_SCHED
+ bool "Fair user scheduler"
+ default n
+ help
+ Fair user scheduler
+
source "kernel/Kconfig.preempt"
config NUMA
Index: linux-2.6.21-rc7/include/linux/sched.h
===================================================================
--- linux-2.6.21-rc7.orig/include/linux/sched.h 2007-05-23 20:48:34.000000000 +0530
+++ linux-2.6.21-rc7/include/linux/sched.h 2007-05-23 20:48:39.000000000 +0530
@@ -551,6 +551,16 @@
#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH)
#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy))
+#ifdef CONFIG_FAIR_USER_SCHED
+int sched_alloc_user(struct user_struct *user);
+void sched_free_user(struct user_struct *user);
+void sched_move_task(struct user_struct *old);
+#else
+static inline int sched_alloc_user(struct user_struct *user) { return 0; }
+static inline void sched_free_user(struct user_struct *user) { }
+static inline void sched_move_task(struct user_struct *user) { }
+#endif
+
/*
* Some day this will be a full-fledged user tracking system..
*/
@@ -575,6 +585,10 @@
/* Hash table maintenance information */
struct list_head uidhash_list;
uid_t uid;
+#ifdef CONFIG_FAIR_USER_SCHED
+ struct sched_entity *se; /* per-cpu sched_entity */
+ struct lrq *lrq; /* per-cpu runqueue for this user */
+#endif
};
extern struct user_struct *find_user(uid_t);
@@ -859,6 +873,8 @@
s64 fair_key;
s64 sum_wait_runtime, sum_sleep_runtime;
unsigned long wait_runtime_overruns, wait_runtime_underruns;
+ struct sched_entity *parent;
+ struct lrq *my_q; /* The queue owned by this entity */
};
struct task_struct {
Index: linux-2.6.21-rc7/kernel/sched.c
===================================================================
--- linux-2.6.21-rc7.orig/kernel/sched.c 2007-05-23 20:48:34.000000000 +0530
+++ linux-2.6.21-rc7/kernel/sched.c 2007-05-23 20:48:39.000000000 +0530
@@ -129,6 +129,14 @@
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
struct rb_node *rb_load_balance_curr;
+ struct sched_entity *curr;
+ unsigned int *sched_granularity; /* &sysctl_sched_granularity */
+ struct rq *rq;
+ unsigned long nice_0_load;
+#ifdef CONFIG_FAIR_USER_SCHED
+ struct list_head lrq_list;
+ struct rcu_head rcu;
+#endif
};
/*
@@ -164,6 +172,7 @@
struct task_struct *curr, *idle;
unsigned long next_balance;
+ unsigned long rt_load;
struct mm_struct *prev_mm;
u64 clock, prev_clock_raw;
@@ -214,6 +223,32 @@
struct lock_class_key rq_lock_key;
};
+#define NICE_0_LOAD SCHED_LOAD_SCALE
+#define NICE_0_SHIFT SCHED_LOAD_SHIFT
+
+#ifdef CONFIG_FAIR_USER_SCHED
+static struct sched_entity root_user_se[NR_CPUS];
+static struct lrq root_user_lrq[NR_CPUS];
+
+static inline void init_se(struct sched_entity *se, struct lrq *lrq)
+{
+ se->my_q = lrq;
+ se->load_weight = NICE_0_LOAD;
+}
+
+static inline void init_lrq(struct lrq *lrq, struct rq *rq)
+{
+ lrq->rq = rq;
+ lrq->fair_clock = 1;
+ lrq->tasks_timeline = RB_ROOT;
+ lrq->nice_0_load = NICE_0_LOAD;
+ lrq->sched_granularity = &sysctl_sched_granularity;
+ INIT_LIST_HEAD(&lrq->lrq_list);
+ list_add_rcu(&lrq->lrq_list, &rq->lrq.lrq_list);
+}
+
+#endif
+
static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
static DEFINE_MUTEX(sched_hotcpu_mutex);
@@ -555,9 +590,6 @@
#define RTPRIO_TO_LOAD_WEIGHT(rp) \
(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
-#define NICE_0_LOAD SCHED_LOAD_SCALE
-#define NICE_0_SHIFT SCHED_LOAD_SHIFT
-
/*
* Nice levels are multiplicative, with a gentle 10% change for every
* nice level changed. I.e. when a CPU-bound task goes from nice 0 to
@@ -576,16 +608,22 @@
/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
};
+extern struct sched_class rt_sched_class;
+
static inline void
inc_raw_weighted_load(struct rq *rq, const struct task_struct *p)
{
- rq->lrq.raw_weighted_load += p->se.load_weight;
+ /* Hack - needs better handling */
+ if (p->sched_class == &rt_sched_class)
+ rq->rt_load += p->se.load_weight;
}
static inline void
dec_raw_weighted_load(struct rq *rq, const struct task_struct *p)
{
- rq->lrq.raw_weighted_load -= p->se.load_weight;
+ /* Hack - needs better handling */
+ if (p->sched_class == &rt_sched_class)
+ rq->rt_load -= p->se.load_weight;
}
static inline void inc_nr_running(struct task_struct *p, struct rq *rq)
@@ -728,10 +766,32 @@
return cpu_curr(task_cpu(p)) == p;
}
+#ifdef CONFIG_FAIR_USER_SCHED
+
+#define for_each_lrq(rq, lrq) \
+ for (lrq = container_of((rq)->lrq.lrq_list.next, struct lrq, lrq_list);\
+ prefetch(rcu_dereference(lrq->lrq_list.next)), lrq != &(rq)->lrq;\
+ lrq = container_of(lrq->lrq_list.next, struct lrq, lrq_list))
+
+#else
+
+#define for_each_lrq(rq, lrq) \
+ for (lrq = &rq->lrq; lrq != NULL; lrq = NULL)
+
+#endif
+
/* Used instead of source_load when we know the type == 0 */
unsigned long weighted_cpuload(const int cpu)
{
- return cpu_rq(cpu)->lrq.raw_weighted_load;
+ struct lrq *lrq;
+ unsigned long weight = 0;
+
+ for_each_lrq(cpu_rq(cpu), lrq)
+ weight += lrq->raw_weighted_load;
+
+ weight += cpu_rq(cpu)->rt_load;
+
+ return weight;
}
#ifdef CONFIG_SMP
@@ -761,6 +821,10 @@
if (p->se.sleep_start_fair)
p->se.sleep_start_fair -= fair_clock_offset;
+#ifdef CONFIG_FAIR_USER_SCHED
+ p->se.parent = &p->user->se[new_cpu];
+#endif
+
task_thread_info(p)->cpu = new_cpu;
}
@@ -863,12 +927,18 @@
*/
static inline unsigned long source_load(int cpu, int type)
{
- struct rq *rq = cpu_rq(cpu);
+ unsigned long rwl, cpl = 0;
+ struct lrq *lrq;
+
+ rwl = weighted_cpuload(cpu);
if (type == 0)
- return rq->lrq.raw_weighted_load;
+ return rwl;
+
+ for_each_lrq(cpu_rq(cpu), lrq)
+ cpl += lrq->cpu_load[type-1];
- return min(rq->lrq.cpu_load[type-1], rq->lrq.raw_weighted_load);
+ return min(cpl, rwl);
}
/*
@@ -877,12 +947,18 @@
*/
static inline unsigned long target_load(int cpu, int type)
{
- struct rq *rq = cpu_rq(cpu);
+ unsigned long rwl, cpl = 0;
+ struct lrq *lrq;
+
+ rwl = weighted_cpuload(cpu);
if (type == 0)
- return rq->lrq.raw_weighted_load;
+ return rwl;
+
+ for_each_lrq(cpu_rq(cpu), lrq)
+ cpl += lrq->cpu_load[type-1];
- return max(rq->lrq.cpu_load[type-1], rq->lrq.raw_weighted_load);
+ return max(cpl, rwl);
}
/*
@@ -893,7 +969,7 @@
struct rq *rq = cpu_rq(cpu);
unsigned long n = rq->nr_running;
- return n ? rq->lrq.raw_weighted_load / n : SCHED_LOAD_SCALE;
+ return n ? weighted_cpuload(cpu) / n : SCHED_LOAD_SCALE;
}
/*
@@ -1583,59 +1659,6 @@
return running + uninterruptible;
}
-static void update_load_fair(struct rq *this_rq)
-{
- unsigned long this_load, fair_delta, exec_delta, idle_delta;
- unsigned int i, scale;
- s64 fair_delta64, exec_delta64;
- unsigned long tmp;
- u64 tmp64;
-
- this_rq->lrq.nr_load_updates++;
- if (!(sysctl_sched_load_smoothing & 64)) {
- this_load = this_rq->lrq.raw_weighted_load;
- goto do_avg;
- }
-
- fair_delta64 = this_rq->lrq.fair_clock -
- this_rq->lrq.prev_fair_clock + 1;
- this_rq->lrq.prev_fair_clock = this_rq->lrq.fair_clock;
-
- exec_delta64 = this_rq->lrq.exec_clock -
- this_rq->lrq.prev_exec_clock + 1;
- this_rq->lrq.prev_exec_clock = this_rq->lrq.exec_clock;
-
- if (fair_delta64 > (s64)LONG_MAX)
- fair_delta64 = (s64)LONG_MAX;
- fair_delta = (unsigned long)fair_delta64;
-
- if (exec_delta64 > (s64)LONG_MAX)
- exec_delta64 = (s64)LONG_MAX;
- exec_delta = (unsigned long)exec_delta64;
- if (exec_delta > TICK_NSEC)
- exec_delta = TICK_NSEC;
-
- idle_delta = TICK_NSEC - exec_delta;
-
- tmp = (SCHED_LOAD_SCALE * exec_delta) / fair_delta;
- tmp64 = (u64)tmp * (u64)exec_delta;
- do_div(tmp64, TICK_NSEC);
- this_load = (unsigned long)tmp64;
-
-do_avg:
- /* Update our load: */
- for (i = 0, scale = 1; i < CPU_LOAD_IDX_MA
...