Subject: [RFC] [PATCH 3/3] Generalize CFS core and provide per-user fairness
From: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Date: Wed, 23 May 2007 16:56
In reply to: [RFC] [PATCH 0/3] Add group fairness to CFS

This patch reuses the CFS core to provide inter task-group fairness. For
demonstration purposes, the patch also extends CFS to provide per-user
fairness. The patch is very experimental at the moment and in particular,
SMP LOAD BALANCE IS DISABLED to keep the patch simple. I think group-based
SMP load balance is trickier and I intend to look at it next.

Although user id is chosen as the basis of grouping tasks, the patch can
be adapted to work with other task-grouping mechanisms (for example:
http://lkml.org/lkml/2007/4/27/146). A simplified sketch of the new
grouping structures follows the diffstat below.

Signed-off-by: Srivatsa Vaddagiri <vatsa@in.ibm.com>


---
 arch/i386/Kconfig     |    6 
 arch/x86_64/Kconfig   |    6 
 include/linux/sched.h |   16 
 kernel/sched.c        |  256 +++++++++++----
 kernel/sched_debug.c  |    4 
 kernel/sched_fair.c   |  820 ++++++++++++++++++++++++++++++++------------------
 kernel/sched_rt.c     |    2 
 kernel/user.c         |    5 
 8 files changed, 753 insertions(+), 362 deletions(-)
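
For orientation, here is a minimal, simplified sketch of the grouping data
structures this patch introduces. Field names follow the sched.h and sched.c
hunks below, but the types are heavily trimmed, and attach_task_to_user() is
a hypothetical helper added only for illustration; it is not code from the
patch.

struct lrq;

/* Trimmed-down model of the CFS scheduling entity. */
struct sched_entity {
	s64			fair_key;
	unsigned long		load_weight;
	struct sched_entity	*parent;	/* group entity this entity belongs to */
	struct lrq		*my_q;		/* queue owned by this entity (group entities only) */
};

/* Trimmed-down model of a local (per-group, per-cpu) runqueue. */
struct lrq {
	struct sched_entity	*curr;		/* entity currently running on this queue */
	/* rbtree of queued entities, fair/exec clocks, load, ... */
};

/* Trimmed-down model of user_struct with the per-cpu scheduling hooks. */
struct user_struct {
	struct sched_entity	*se;		/* per-cpu group entities, one per CPU */
	struct lrq		*lrq;		/* per-cpu runqueues owned by this user */
};

/*
 * Hypothetical helper: attach a task's entity to its user's group entity
 * on 'cpu', mirroring what set_task_cpu() does in the sched.c hunk below.
 */
static inline void attach_task_to_user(struct sched_entity *task_se,
					struct user_struct *user, int cpu)
{
	task_se->parent = &user->se[cpu];
}

The intent, as far as it is visible from the (truncated) diff, is a two-level
hierarchy: the CPU runqueue schedules one entity per user, and each user's
entity in turn owns an lrq from which that user's own tasks are picked.
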

Index: linux-2.6.21-rc7/arch/i386/Kconfig
===================================================================
--- linux-2.6.21-rc7.orig/arch/i386/Kconfig	2007-05-23 20:46:38.000000000 +0530
+++ linux-2.6.21-rc7/arch/i386/Kconfig	2007-05-23 20:48:39.000000000 +0530
@@ -307,6 +307,12 @@
 	  making when dealing with multi-core CPU chips at a cost of slightly
 	  increased overhead in some places. If unsure say N here.
 
+config FAIR_USER_SCHED
+	bool "Fair user scheduler"
+	default n
+	help
+		Fair user scheduler
+
 source "kernel/Kconfig.preempt"
 
 config X86_UP_APIC
Index: linux-2.6.21-rc7/arch/x86_64/Kconfig
===================================================================
--- linux-2.6.21-rc7.orig/arch/x86_64/Kconfig	2007-05-23 20:46:38.000000000 +0530
+++ linux-2.6.21-rc7/arch/x86_64/Kconfig	2007-05-23 20:48:39.000000000 +0530
@@ -330,6 +330,12 @@
 	  making when dealing with multi-core CPU chips at a cost of slightly
 	  increased overhead in some places. If unsure say N here.
 
+config FAIR_USER_SCHED
+	bool "Fair user scheduler"
+	default n
+	help
+		Fair user scheduler
+
 source "kernel/Kconfig.preempt"
 
 config NUMA
Index: linux-2.6.21-rc7/include/linux/sched.h
===================================================================
--- linux-2.6.21-rc7.orig/include/linux/sched.h	2007-05-23 20:48:34.000000000 +0530
+++ linux-2.6.21-rc7/include/linux/sched.h	2007-05-23 20:48:39.000000000 +0530
@@ -551,6 +551,16 @@
 #define is_rt_policy(p)		((p) != SCHED_NORMAL && (p) != SCHED_BATCH)
 #define has_rt_policy(p)	unlikely(is_rt_policy((p)->policy))
 
+#ifdef CONFIG_FAIR_USER_SCHED
+int sched_alloc_user(struct user_struct *user);
+void sched_free_user(struct user_struct *user);
+void sched_move_task(struct user_struct *old);
+#else
+static inline int sched_alloc_user(struct user_struct *user) { return 0; }
+static inline void sched_free_user(struct user_struct *user) { }
+static inline void sched_move_task(struct user_struct *user) { }
+#endif
+
 /*
  * Some day this will be a full-fledged user tracking system..
  */
@@ -575,6 +585,10 @@
 	/* Hash table maintenance information */
 	struct list_head uidhash_list;
 	uid_t uid;
+#ifdef CONFIG_FAIR_USER_SCHED
+	struct sched_entity *se;        /* per-cpu sched_entity */
+	struct lrq *lrq;        /* per-cpu runqueue for this user */
+#endif
 };
 
 extern struct user_struct *find_user(uid_t);
@@ -859,6 +873,8 @@
 	s64 fair_key;
 	s64 sum_wait_runtime, sum_sleep_runtime;
 	unsigned long wait_runtime_overruns, wait_runtime_underruns;
+	struct sched_entity *parent;
+	struct lrq *my_q;       /* The queue owned by this entity */
 };
 
 struct task_struct {
Index: linux-2.6.21-rc7/kernel/sched.c
===================================================================
--- linux-2.6.21-rc7.orig/kernel/sched.c	2007-05-23 20:48:34.000000000 +0530
+++ linux-2.6.21-rc7/kernel/sched.c	2007-05-23 20:48:39.000000000 +0530
@@ -129,6 +129,14 @@
 	struct rb_root tasks_timeline;
 	struct rb_node *rb_leftmost;
 	struct rb_node *rb_load_balance_curr;
+	struct sched_entity *curr;
+	unsigned int *sched_granularity;	/* &sysctl_sched_granularity */
+	struct rq *rq;
+	unsigned long nice_0_load;
+#ifdef CONFIG_FAIR_USER_SCHED
+	struct list_head lrq_list;
+	struct rcu_head rcu;
+#endif
 };
 
 /*
@@ -164,6 +172,7 @@
 
 	struct task_struct *curr, *idle;
 	unsigned long next_balance;
+	unsigned long rt_load;
 	struct mm_struct *prev_mm;
 
 	u64 clock, prev_clock_raw;
@@ -214,6 +223,32 @@
 	struct lock_class_key rq_lock_key;
 };
 
+#define NICE_0_LOAD	SCHED_LOAD_SCALE
+#define NICE_0_SHIFT	SCHED_LOAD_SHIFT
+
+#ifdef CONFIG_FAIR_USER_SCHED
+static struct sched_entity root_user_se[NR_CPUS];
+static struct lrq root_user_lrq[NR_CPUS];
+
+static inline void init_se(struct sched_entity *se, struct lrq *lrq)
+{
+	se->my_q = lrq;
+	se->load_weight = NICE_0_LOAD;
+}
+
+static inline void init_lrq(struct lrq *lrq, struct rq *rq)
+{
+	lrq->rq = rq;
+	lrq->fair_clock = 1;
+	lrq->tasks_timeline = RB_ROOT;
+	lrq->nice_0_load = NICE_0_LOAD;
+	lrq->sched_granularity = &sysctl_sched_granularity;
+	INIT_LIST_HEAD(&lrq->lrq_list);
+	list_add_rcu(&lrq->lrq_list, &rq->lrq.lrq_list);
+}
+
+#endif
+
 static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
 static DEFINE_MUTEX(sched_hotcpu_mutex);
 
@@ -555,9 +590,6 @@
 #define RTPRIO_TO_LOAD_WEIGHT(rp) \
 	(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
 
-#define NICE_0_LOAD	SCHED_LOAD_SCALE
-#define NICE_0_SHIFT	SCHED_LOAD_SHIFT
-
 /*
  * Nice levels are multiplicative, with a gentle 10% change for every
  * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
@@ -576,16 +608,22 @@
 /*  10 */   110,    87,    70,    56,    45,    36,    29,    23,    18,    15,
 };
 
+extern struct sched_class rt_sched_class;
+
 static inline void
 inc_raw_weighted_load(struct rq *rq, const struct task_struct *p)
 {
-	rq->lrq.raw_weighted_load += p->se.load_weight;
+	/* Hack - needs better handling */
+	if (p->sched_class == &rt_sched_class)
+		rq->rt_load += p->se.load_weight;
 }
 
 static inline void
 dec_raw_weighted_load(struct rq *rq, const struct task_struct *p)
 {
-	rq->lrq.raw_weighted_load -= p->se.load_weight;
+	/* Hack - needs better handling */
+	if (p->sched_class == &rt_sched_class)
+		rq->rt_load -= p->se.load_weight;
 }
 
 static inline void inc_nr_running(struct task_struct *p, struct rq *rq)
@@ -728,10 +766,32 @@
 	return cpu_curr(task_cpu(p)) == p;
 }
 
+#ifdef CONFIG_FAIR_USER_SCHED
+
+#define for_each_lrq(rq, lrq) \
+	for (lrq = container_of((rq)->lrq.lrq_list.next, struct lrq, lrq_list);\
+	     prefetch(rcu_dereference(lrq->lrq_list.next)), lrq != &(rq)->lrq;\
+	     lrq = container_of(lrq->lrq_list.next, struct lrq, lrq_list))
+
+#else
+
+#define for_each_lrq(rq, lrq) \
+		for (lrq = &rq->lrq; lrq != NULL; lrq = NULL)
+
+#endif
+
 /* Used instead of source_load when we know the type == 0 */
 unsigned long weighted_cpuload(const int cpu)
 {
-	return cpu_rq(cpu)->lrq.raw_weighted_load;
+	struct lrq *lrq;
+	unsigned long weight = 0;
+
+	for_each_lrq(cpu_rq(cpu), lrq)
+		weight += lrq->raw_weighted_load;
+
+	weight += cpu_rq(cpu)->rt_load;
+
+	return weight;
 }
 
 #ifdef CONFIG_SMP
@@ -761,6 +821,10 @@
 	if (p->se.sleep_start_fair)
 		p->se.sleep_start_fair -= fair_clock_offset;
 
+#ifdef CONFIG_FAIR_USER_SCHED
+	p->se.parent = &p->user->se[new_cpu];
+#endif
+
 	task_thread_info(p)->cpu = new_cpu;
 
 }
@@ -863,12 +927,18 @@
  */
 static inline unsigned long source_load(int cpu, int type)
 {
-	struct rq *rq = cpu_rq(cpu);
+	unsigned long rwl, cpl = 0;
+	struct lrq *lrq;
+
+	rwl = weighted_cpuload(cpu);
 
 	if (type == 0)
-		return rq->lrq.raw_weighted_load;
+		return rwl;
+
+	for_each_lrq(cpu_rq(cpu), lrq)
+		cpl += lrq->cpu_load[type-1];
 
-	return min(rq->lrq.cpu_load[type-1], rq->lrq.raw_weighted_load);
+	return min(cpl, rwl);
 }
 
 /*
@@ -877,12 +947,18 @@
  */
 static inline unsigned long target_load(int cpu, int type)
 {
-	struct rq *rq = cpu_rq(cpu);
+	unsigned long rwl, cpl = 0;
+	struct lrq *lrq;
+
+	rwl = weighted_cpuload(cpu);
 
 	if (type == 0)
-		return rq->lrq.raw_weighted_load;
+		return rwl;
+
+	for_each_lrq(cpu_rq(cpu), lrq)
+		cpl += lrq->cpu_load[type-1];
 
-	return max(rq->lrq.cpu_load[type-1], rq->lrq.raw_weighted_load);
+	return max(cpl, rwl);
 }
 
 /*
@@ -893,7 +969,7 @@
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long n = rq->nr_running;
 
-	return n ? rq->lrq.raw_weighted_load / n : SCHED_LOAD_SCALE;
+	return n ? weighted_cpuload(cpu) / n : SCHED_LOAD_SCALE;
 }
 
 /*
@@ -1583,59 +1659,6 @@
 	return running + uninterruptible;
 }
 
-static void update_load_fair(struct rq *this_rq)
-{
-	unsigned long this_load, fair_delta, exec_delta, idle_delta;
-	unsigned int i, scale;
-	s64 fair_delta64, exec_delta64;
-	unsigned long tmp;
-	u64 tmp64;
-
-	this_rq->lrq.nr_load_updates++;
-	if (!(sysctl_sched_load_smoothing & 64)) {
-		this_load = this_rq->lrq.raw_weighted_load;
-		goto do_avg;
-	}
-
-	fair_delta64 = this_rq->lrq.fair_clock -
-			 this_rq->lrq.prev_fair_clock + 1;
-	this_rq->lrq.prev_fair_clock = this_rq->lrq.fair_clock;
-
-	exec_delta64 = this_rq->lrq.exec_clock -
-			 this_rq->lrq.prev_exec_clock + 1;
-	this_rq->lrq.prev_exec_clock = this_rq->lrq.exec_clock;
-
-	if (fair_delta64 > (s64)LONG_MAX)
-		fair_delta64 = (s64)LONG_MAX;
-	fair_delta = (unsigned long)fair_delta64;
-
-	if (exec_delta64 > (s64)LONG_MAX)
-		exec_delta64 = (s64)LONG_MAX;
-	exec_delta = (unsigned long)exec_delta64;
-	if (exec_delta > TICK_NSEC)
-		exec_delta = TICK_NSEC;
-
-	idle_delta = TICK_NSEC - exec_delta;
-
-	tmp = (SCHED_LOAD_SCALE * exec_delta) / fair_delta;
-	tmp64 = (u64)tmp * (u64)exec_delta;
-	do_div(tmp64, TICK_NSEC);
-	this_load = (unsigned long)tmp64;
-
-do_avg:
-	/* Update our load: */
-	for (i = 0, scale = 1; i < CPU_LOAD_IDX_MA
...

 