This patch reuses the CFS core to provide inter task-group fairness. For
demonstration purposes, the patch also extends CFS to provide per-user
fairness. The patch is very experimental at the moment and, in particular,
SMP LOAD BALANCING IS DISABLED to keep the patch simple. I think group-based
SMP load balancing is trickier, and I intend to look at it next.
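To make the hierarchy concrete: the patch gives each user a per-CPU sched_entity
whose my_q points at that user's own lrq, and each task's entity gets a parent
pointer back to its user's entity (see the sched.h and sched.c hunks below).
Picking the next task then becomes a two-level walk: choose the fairest entity
on the CPU's top-level lrq and, if it is a group entity, descend into its lrq
and choose again. The user-space sketch below models just that walk; the
list-based lrq, pick_fairest() and the fair_key values are simplified stand-ins
for the kernel's rbtree and are not taken from the patch itself.

/*
 * Minimal user-space model of the two-level pick-next walk.
 * "lrq" and "sched_entity" mirror the roles of the structures in this
 * patch, but the fields shown and pick_fairest() are simplified
 * illustrations, not the kernel implementation.
 */
#include <stdio.h>
#include <stddef.h>

struct lrq;

struct sched_entity {
	const char *name;
	long long fair_key;		/* smaller key == more "owed" CPU time */
	struct lrq *my_q;		/* non-NULL for group (user) entities */
	struct sched_entity *next;	/* sibling on the same lrq */
};

struct lrq {
	struct sched_entity *head;	/* entities queued on this runqueue */
};

/* Stand-in for the rbtree leftmost lookup: entity with the smallest key. */
static struct sched_entity *pick_fairest(struct lrq *q)
{
	struct sched_entity *se, *best = NULL;

	for (se = q->head; se; se = se->next)
		if (!best || se->fair_key < best->fair_key)
			best = se;
	return best;
}

/* Two-level walk: top-level lrq -> user's group entity -> user's own lrq. */
static struct sched_entity *pick_next_entity(struct lrq *top)
{
	struct sched_entity *se = pick_fairest(top);

	while (se && se->my_q)		/* descend into group entities */
		se = pick_fairest(se->my_q);
	return se;
}

int main(void)
{
	struct sched_entity t1 = { "uid 500: task A", 10, NULL, NULL };
	struct sched_entity t2 = { "uid 500: task B", 30, NULL, &t1 };
	struct sched_entity t3 = { "uid 501: task C",  5, NULL, NULL };
	struct lrq q500 = { &t2 }, q501 = { &t3 };
	/* Group entities for the two users; uid 500 is currently "fairer". */
	struct sched_entity u500 = { "uid 500", 20, &q500, NULL };
	struct sched_entity u501 = { "uid 501", 40, &q501, &u500 };
	struct lrq top = { &u501 };

	/* uid 500 wins at the top level, then task A wins inside uid 500. */
	printf("next: %s\n", pick_next_entity(&top)->name);
	return 0;
}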
Although user id is chosen as the basis for grouping tasks, the patch can
be adapted to work with other task-grouping mechanisms (for example:
http://lkml.org/lkml/2007/4/27/146).
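The adaptation surface is the small hook API added to include/linux/sched.h
below: sched_alloc_user(), sched_free_user() and sched_move_task(). A different
grouping mechanism would only need to invoke the same three hooks at its own
group-creation, task-move and group-destruction points. The fragment below is a
hedged user-space illustration of that call pattern; the stub bodies and the
stand-in struct user_struct are hypothetical, only the three hook signatures
come from this patch.

/*
 * Sketch of the call pattern an alternative grouping mechanism would
 * follow.  The three sched_*() names match the declarations this patch
 * adds to include/linux/sched.h; everything else here is a hypothetical
 * stand-in, compiled as a user-space model.
 */
#include <stdio.h>

struct user_struct { unsigned int uid; };	/* stand-in for the kernel struct */

/* Stubbed hooks mirroring the patch's API: allocate/free a group's per-CPU
 * queues, and requeue the current task when its group changes. */
static int sched_alloc_user(struct user_struct *user)
{
	printf("alloc per-cpu lrq + sched_entity for group %u\n", user->uid);
	return 0;
}

static void sched_free_user(struct user_struct *user)
{
	printf("free per-cpu lrq + sched_entity of group %u\n", user->uid);
}

static void sched_move_task(struct user_struct *old)
{
	printf("requeue current task away from group %u\n", old->uid);
}

/* Hypothetical grouping mechanism: create a group, move a task into it,
 * tear the group down.  Only the three hooks above are dictated by the patch. */
int main(void)
{
	struct user_struct root = { 0 }, grp = { 1000 };

	if (sched_alloc_user(&grp))	/* group creation */
		return 1;
	sched_move_task(&root);		/* task leaves group 0 for grp */
	sched_free_user(&grp);		/* group destruction */
	return 0;
}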
Signed-off-by: Srivatsa Vaddagiri <vatsa@in.ibm.com>
---
arch/i386/Kconfig | 6
arch/x86_64/Kconfig | 6
include/linux/sched.h | 16
kernel/sched.c | 256 +++++++++++----
kernel/sched_debug.c | 4
kernel/sched_fair.c | 820 ++++++++++++++++++++++++++++++++------------------
kernel/sched_rt.c | 2
kernel/user.c | 5
8 files changed, 753 insertions(+), 362 deletions(-)
Index: linux-2.6.21-rc7/arch/i386/Kconfig
===================================================================
--- linux-2.6.21-rc7.orig/arch/i386/Kconfig 2007-05-23 20:46:38.000000000 +0530
+++ linux-2.6.21-rc7/arch/i386/Kconfig 2007-05-23 20:48:39.000000000 +0530
@@ -307,6 +307,12 @@
making when dealing with multi-core CPU chips at a cost of slightly
increased overhead in some places. If unsure say N here.
+config FAIR_USER_SCHED
+ bool "Fair user scheduler"
+ default n
+ help
+ Fair user scheduler
+
source "kernel/Kconfig.preempt"
config X86_UP_APIC
Index: linux-2.6.21-rc7/arch/x86_64/Kconfig
===================================================================
--- linux-2.6.21-rc7.orig/arch/x86_64/Kconfig 2007-05-23 20:46:38.000000000 +0530
+++ linux-2.6.21-rc7/arch/x86_64/Kconfig 2007-05-23 20:48:39.000000000 +0530
@@ -330,6 +330,12 @@
making when dealing with multi-core CPU chips at a cost of slightly
increased overhead in some places. If unsure say N here.
+config FAIR_USER_SCHED
+ bool "Fair user scheduler"
+ default n
+ help
+ Fair user scheduler
+
source "kernel/Kconfig.preempt"
config NUMA
Index: linux-2.6.21-rc7/include/linux/sched.h
===================================================================
--- linux-2.6.21-rc7.orig/include/linux/sched.h 2007-05-23 20:48:34.000000000 +0530
+++ linux-2.6.21-rc7/include/linux/sched.h 2007-05-23 20:48:39.000000000 +0530
@@ -551,6 +551,16 @@
#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH)
#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy))
+#ifdef CONFIG_FAIR_USER_SCHED
+int sched_alloc_user(struct user_struct *user);
+void sched_free_user(struct user_struct *user);
+void sched_move_task(struct user_struct *old);
+#else
+static inline int sched_alloc_user(struct user_struct *user) { return 0; }
+static inline void sched_free_user(struct user_struct *user) { }
+static inline void sched_move_task(struct user_struct *user) { }
+#endif
+
/*
* Some day this will be a full-fledged user tracking system..
*/
@@ -575,6 +585,10 @@
/* Hash table maintenance information */
struct list_head uidhash_list;
uid_t uid;
+#ifdef CONFIG_FAIR_USER_SCHED
+ struct sched_entity *se; /* per-cpu sched_entity */
+ struct lrq *lrq; /* per-cpu runqueue for this user */
+#endif
};
extern struct user_struct *find_user(uid_t);
@@ -859,6 +873,8 @@
s64 fair_key;
s64 sum_wait_runtime, sum_sleep_runtime;
unsigned long wait_runtime_overruns, wait_runtime_underruns;
+ struct sched_entity *parent;
+ struct lrq *my_q; /* The queue owned by this entity */
};
struct task_struct {
Index: linux-2.6.21-rc7/kernel/sched.c
===================================================================
--- linux-2.6.21-rc7.orig/kernel/sched.c 2007-05-23 20:48:34.000000000 +0530
+++ linux-2.6.21-rc7/kernel/sched.c 2007-05-23 20:48:39.000000000 +0530
@@ -129,6 +129,14 @@
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
struct rb_node *rb_load_balance_curr;
+ struct sched_entity *curr;
+ unsigned int *sched_granularity; /* &sysctl_sched_granularity */
+ struct rq *rq;
+ unsigned long nice_0_load;
+#ifdef CONFIG_FAIR_USER_SCHED
+ struct list_head lrq_list;
+ struct rcu_head rcu;
+#endif
};
/*
@@ -164,6 +172,7 @@
struct task_struct *curr, *idle;
unsigned long next_balance;
+ unsigned long rt_load;
struct mm_struct *prev_mm;
u64 clock, prev_clock_raw;
@@ -214,6 +223,32 @@
struct lock_class_key rq_lock_key;
};
+#define NICE_0_LOAD SCHED_LOAD_SCALE
+#define NICE_0_SHIFT SCHED_LOAD_SHIFT
+
+#ifdef CONFIG_FAIR_USER_SCHED
+static struct sched_entity root_user_se[NR_CPUS];
+static struct lrq root_user_lrq[NR_CPUS];
+
+static inline void init_se(struct sched_entity *se, struct lrq *lrq)
+{
+ se->my_q = lrq;
+ se->load_weight = NICE_0_LOAD;
+}
+
+static inline void init_lrq(struct lrq *lrq, struct rq *rq)
+{
+ lrq->rq = rq;
+ lrq->fair_clock = 1;
+ lrq->tasks_timeline = RB_ROOT;
+ lrq->nice_0_load = NICE_0_LOAD;
+ lrq->sched_granularity = &sysctl_sched_granularity;
+ INIT_LIST_HEAD(&lrq->lrq_list);
+ list_add_rcu(&lrq->lrq_list, &rq->lrq.lrq_list);
+}
+
+#endif
+
static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
static DEFINE_MUTEX(sched_hotcpu_mutex);
@@ -555,9 +590,6 @@
#define RTPRIO_TO_LOAD_WEIGHT(rp) \
(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
-#define NICE_0_LOAD SCHED_LOAD_SCALE
-#define NICE_0_SHIFT SCHED_LOAD_SHIFT
-
/*
* Nice levels are multiplicative, with a gentle 10% change for every
* nice level changed. I.e. when a CPU-bound task goes from nice 0 to
@@ -576,16 +608,22 @@
/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
};
+extern struct sched_class rt_sched_class;
+
static inline void
inc_raw_weighted_load(struct rq *rq, const struct task_struct *p)
{
- rq->lrq.raw_weighted_load += p->se.load_weight;
+ /* Hack - needs better handling */
+ if (p->sched_class == &rt_sched_class)
+ rq->rt_load += p->se.load_weight;
}
static inline void
dec_raw_weighted_load(struct rq *rq, const struct task_struct *p)
{
- rq->lrq.raw_weighted_load -= p->se.load_weight;
+ /* Hack - needs better handling */
+ if (p->sched_class == &rt_sched_class)
+ rq->rt_load -= p->se.load_weight;
}
static inline void inc_nr_running(struct task_struct *p, struct rq *rq)
@@ -728,10 +766,32 @@
return cpu_curr(task_cpu(p)) == p;
}
+#ifdef CONFIG_FAIR_USER_SCHED
+
+#define for_each_lrq(rq, lrq) \
+ for (lrq = container_of((rq)->lrq.lrq_list.next, struct lrq, lrq_list);\
+ prefetch(rcu_dereference(lrq->lrq_list.next)), lrq != &(rq)->lrq;\
+ lrq = container_of(lrq->lrq_list.next, struct lrq, lrq_list))
+
+#else
+
+#define for_each_lrq(rq, lrq) \
+ for (lrq = &rq->lrq; lrq != NULL; lrq = NULL)
+
+#endif
+
/* Used instead of source_load when we know the type == 0 */
unsigned long weighted_cpuload(const int cpu)
{
- return cpu_rq(cpu)->lrq.raw_weighted_load;
+ struct lrq *lrq;
+ unsigned long weight = 0;
+
+ for_each_lrq(cpu_rq(cpu), lrq)
+ weight += lrq->raw_weighted_load;
+
+ weight += cpu_rq(cpu)->rt_load;
+
+ return weight;
}
#ifdef CONFIG_SMP
@@ -761,6 +821,10 @@
if (p->se.sleep_start_fair)
p->se.sleep_start_fair -= fair_clock_offset;
+#ifdef CONFIG_FAIR_USER_SCHED
+ p->se.parent = &p->user->se[new_cpu];
+#endif
+
task_thread_info(p)->cpu = new_cpu;
}
@@ -863,12 +927,18 @@
*/
static inline unsigned long source_load(int cpu, int type)
{
- struct rq *rq = cpu_rq(cpu);
+ unsigned long rwl, cpl = 0;
+ struct lrq *lrq;
+
+ rwl = weighted_cpuload(cpu);
if (type == 0)
- return rq->lrq.raw_weighted_load;
+ return rwl;
+
+ for_each_lrq(cpu_rq(cpu), lrq)
+ cpl += lrq->cpu_load[type-1];
- return min(rq->lrq.cpu_load[type-1], rq->lrq.raw_weighted_load);
+ return min(cpl, rwl);
}
/*
@@ -877,12 +947,18 @@
*/
static inline unsigned long target_load(int cpu, int type)
{
- struct rq *rq = cpu_rq(cpu);
+ unsigned long rwl, cpl = 0;
+ struct lrq *lrq;
+
+ rwl = weighted_cpuload(cpu);
if (type == 0)
- return rq->lrq.raw_weighted_load;
+ return rwl;
+
+ for_each_lrq(cpu_rq(cpu), lrq)
+ cpl += lrq->cpu_load[type-1];
- return max(rq->lrq.cpu_load[type-1], rq->lrq.raw_weighted_load);
+ return max(cpl, rwl);
}
/*
@@ -893,7 +969,7 @@
struct rq *rq = cpu_rq(cpu);
unsigned long n = rq->nr_running;
- return n ? rq->lrq.raw_weighted_load / n : SCHED_LOAD_SCALE;
+ return n ? weighted_cpuload(cpu) / n : SCHED_LOAD_SCALE;
}
/*
@@ -1583,59 +1659,6 @@
return running + uninterruptible;
}
-static void update_load_fair(struct rq *this_rq)
-{
- unsigned long this_load, fair_delta, exec_delta, idle_delta;
- unsigned int i, scale;
- s64 fair_delta64, exec_delta64;
- unsigned long tmp;
- u64 tmp64;
-
- this_rq->lrq.nr_load_updates++;
- if (!(sysctl_sched_load_smoothing & 64)) {
- this_load = this_rq->lrq.raw_weighted_load;
- goto do_avg;
- }
-
- fair_delta64 = this_rq->lrq.fair_clock -
- this_rq->lrq.prev_fair_clock + 1;
- this_rq->lrq.prev_fair_clock = this_rq->lrq.fair_clock;
-
- exec_delta64 = this_rq->lrq.exec_clock -
- this_rq->lrq.prev_exec_clock + 1;
- this_rq->lrq.prev_exec_clock = this_rq->lrq.exec_clock;
-
- if (fair_delta64 > (s64)LONG_MAX)
- fair_delta64 = (s64)LONG_MAX;
- fair_delta = (unsigned long)fair_delta64;
-
- if (exec_delta64 > (s64)LONG_MAX)
- exec_delta64 = (s64)LONG_MAX;
- exec_delta = (unsigned long)exec_delta64;
- if (exec_delta > TICK_NSEC)
- exec_delta = TICK_NSEC;
-
- idle_delta = TICK_NSEC - exec_delta;
-
- tmp = (SCHED_LOAD_SCALE * exec_delta) / fair_delta;
- tmp64 = (u64)tmp * (u64)exec_delta;
- do_div(tmp64, TICK_NSEC);
- this_load = (unsigned long)tmp64;
-
-do_avg:
- /* Update our load: */
- for (i = 0, scale = 1; i < CPU_LOAD_IDX_MA
...