OpenVZ Forum


Home » Mailing lists » Devel » [PATCH 0/2] Add group awareness to CFS - v2
[PATCH 1/2] Introduce notion of scheduler hierarchy [message #19111 is a reply to message #19110] Sat, 23 June 2007 13:18 Go to previous messageGo to previous message
Srivatsa Vaddagiri is currently offline  Srivatsa Vaddagiri
Messages: 241
Registered: August 2006
Senior Member
This patch introduces the core changes in CFS required to accomplish
group fairness at higher levels. It also modifies load balance interface
between classes a bit, so that move_tasks (which is centric to load
balance) can be reused to balance between runqueues of various types
(struct rq in case of SCHED_RT tasks, struct lrq in case of
SCHED_NORMAL/BATCH tasks).

Signed-off-by : Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>

---
 include/linux/sched.h   |   17 ++
 kernel/sched.c          |  174 ++++++++++++++++------------
 kernel/sched_debug.c    |   31 +++--
 kernel/sched_fair.c     |  295 ++++++++++++++++++++++++++++++++++++++++++++----
 kernel/sched_idletask.c |   11 +
 kernel/sched_rt.c       |   47 ++++++-
 6 files changed, 464 insertions(+), 111 deletions(-)

Index: current/include/linux/sched.h
===================================================================
--- current.orig/include/linux/sched.h
+++ current/include/linux/sched.h
@@ -134,8 +134,11 @@ extern unsigned long nr_iowait(void);
 extern unsigned long weighted_cpuload(const int cpu);
 
 struct seq_file;
+struct cfs_rq;
 extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
 extern void proc_sched_set_task(struct task_struct *p);
+extern void
+print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now);
 
 /*
  * Task state bitmask. NOTE! These bits are also
@@ -865,8 +868,13 @@ struct sched_class {
 	struct task_struct * (*pick_next_task) (struct rq *rq, u64 now);
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p, u64 now);
 
-	struct task_struct * (*load_balance_start) (struct rq *rq);
-	struct task_struct * (*load_balance_next) (struct rq *rq);
+	int (*load_balance) (struct rq *this_rq, int this_cpu,
+			struct rq *busiest,
+			unsigned long max_nr_move, unsigned long max_load_move,
+			struct sched_domain *sd, enum cpu_idle_type idle,
+			int *all_pinned, unsigned long *total_load_moved);
+
+	void (*set_curr_task) (struct rq *rq);
 	void (*task_tick) (struct rq *rq, struct task_struct *p);
 	void (*task_new) (struct rq *rq, struct task_struct *p);
 };
@@ -899,6 +907,11 @@ struct sched_entity {
 	s64 fair_key;
 	s64 sum_wait_runtime, sum_sleep_runtime;
 	unsigned long wait_runtime_overruns, wait_runtime_underruns;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct sched_entity *parent;
+	struct cfs_rq *cfs_rq, 	/* rq on which this entity is (to be) queued */
+		      *my_q;  	/* rq "owned" by this entity/group */
+#endif
 };
 
 struct task_struct {
Index: current/kernel/sched.c
===================================================================
--- current.orig/kernel/sched.c
+++ current/kernel/sched.c
@@ -133,6 +133,22 @@ struct cfs_rq {
 	struct rb_root tasks_timeline;
 	struct rb_node *rb_leftmost;
 	struct rb_node *rb_load_balance_curr;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	/* 'curr' points to currently running entity on this cfs_rq.
+	 * It is set to NULL otherwise (i.e when none are currently running).
+	 */
+	struct sched_entity *curr;
+	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
+
+	/* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
+	 * (like users, containers etc.)
+	 *
+	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
+	 * list is used during load balance.
+	 */
+	struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
+#endif
 };
 
 /* Real-Time classes' related field in a runqueue: */
@@ -168,6 +184,9 @@ struct rq {
 	u64 nr_switches;
 
 	struct cfs_rq cfs;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
+#endif
 	struct rt_rq  rt;
 
 	/*
@@ -342,6 +361,16 @@ static inline unsigned long long rq_cloc
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/* Change a task's ->cfs_rq if it moves across CPUs */
+static inline void set_task_cfs_rq(struct task_struct *p)
+{
+	p->se.cfs_rq = &task_rq(p)->cfs;
+}
+#else
+static inline void set_task_cfs_rq(struct task_struct *p) { }
+#endif
+
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif
@@ -738,6 +767,21 @@ static inline void dec_nr_running(struct
 }
 
 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
+#ifdef CONFIG_SMP
+
+struct rq_iterator {
+	void *arg;
+	struct task_struct *(*start)(void *);
+	struct task_struct *(*next)(void *);
+};
+
+static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		      unsigned long max_nr_move, unsigned long max_load_move,
+		      struct sched_domain *sd, enum cpu_idle_type idle,
+		      int *all_pinned, unsigned long *load_moved,
+		      int this_best_prio, int best_prio, int best_prio_seen,
+		      struct rq_iterator *iterator);
+#endif
 
 #include "sched_stats.h"
 #include "sched_rt.c"
@@ -894,6 +938,7 @@ unsigned long weighted_cpuload(const int
 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
 	task_thread_info(p)->cpu = cpu;
+	set_task_cfs_rq(p);
 }
 
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@ -919,6 +964,7 @@ void set_task_cpu(struct task_struct *p,
 
 	task_thread_info(p)->cpu = new_cpu;
 
+	set_task_cfs_rq(p);
 }
 
 struct migration_req {
@@ -2003,89 +2049,26 @@ int can_migrate_task(struct task_struct 
 	return 1;
 }
 
-/*
- * Load-balancing iterator: iterate through the hieararchy of scheduling
- * classes, starting with the highest-prio one:
- */
-
-struct task_struct * load_balance_start(struct rq *rq)
-{
-	struct sched_class *class = sched_class_highest;
-	struct task_struct *p;
-
-	do {
-		p = class->load_balance_start(rq);
-		if (p) {
-			rq->load_balance_class = class;
-			return p;
-		}
-		class = class->next;
-	} while (class);
-
-	return NULL;
-}
-
-struct task_struct * load_balance_next(struct rq *rq)
-{
-	struct sched_class *class = rq->load_balance_class;
-	struct task_struct *p;
-
-	p = class->load_balance_next(rq);
-	if (p)
-		return p;
-	/*
-	 * Pick up the next class (if any) and attempt to start
-	 * the iterator there:
-	 */
-	while ((class = class->next)) {
-		p = class->load_balance_start(rq);
-		if (p) {
-			rq->load_balance_class = class;
-			return p;
-		}
-	}
-	return NULL;
-}
-
-#define rq_best_prio(rq) (rq)->curr->prio
-
-/*
- * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
- * load from busiest to this_rq, as part of a balancing operation within
- * "domain". Returns the number of tasks moved.
- *
- * Called with both runqueues locked.
- */
-static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		      unsigned long max_nr_move, unsigned long max_load_move,
 		      struct sched_domain *sd, enum cpu_idle_type idle,
-		      int *all_pinned)
+		      int *all_pinned, unsigned long *load_moved,
+		      int this_best_prio, int best_prio, int best_prio_seen,
+		      struct rq_iterator *iterator)
 {
-	int pulled = 0, pinned = 0, this_best_prio, best_prio,
-	    best_prio_seen, skip_for_load;
+	int pulled = 0, pinned = 0, skip_for_load;
 	struct task_struct *p;
-	long rem_load_move;
+	long rem_load_move = max_load_move;
 
 	if (max_nr_move == 0 || max_load_move == 0)
 		goto out;
 
-	rem_load_move = max_load_move;
 	pinned = 1;
-	this_best_prio = rq_best_prio(this_rq);
-	best_prio = rq_best_prio(busiest);
-	/*
-	 * Enable handling of the case where there is more than one task
-	 * with the best priority.   If the current running task is one
-	 * of those with prio==best_prio we know it won't be moved
-	 * and therefore it's safe to override the skip (based on load) of
-	 * any task we find with that prio.
-	 */
-	best_prio_seen = best_prio == busiest->curr->prio;
 
 	/*
 	 * Start the load-balancing iterator:
 	 */
-	p = load_balance_start(busiest);
+	p = iterator->start(iterator->arg);
 next:
 	if (!p)
 		goto out;
@@ -2102,7 +2085,7 @@ next:
 	    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
 
 		best_prio_seen |= p->prio == best_prio;
-		p = load_balance_next(busiest);
+		p = iterator->next(iterator->arg);
 		goto next;
 	}
 
@@ -2117,7 +2100,7 @@ next:
 	if (pulled < max_nr_move && rem_load_move > 0) {
 		if (p->prio < this_best_prio)
 			this_best_prio = p->prio;
-		p = load_balance_next(busiest);
+		p = iterator->next(iterator->arg);
 		goto next;
 	}
 out:
@@ -2130,10 +2113,40 @@ out:
 
 	if (all_pinned)
 		*all_pinned = pinned;
+	*load_moved = max_load_move - rem_load_move;
 	return pulled;
 }
 
 /*
+ * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
+ * load from busiest to this_rq, as part of a balancing operation within
+ * "domain". Returns the number of tasks moved.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		      unsigned long max_nr_move, unsigned long max_load_move,
+		      struct sched_domain *sd, enum cpu_idle_type idle,
+		      int *all_pinned)
+{
+	struct sched_class *class = sched_class_highest;
+	unsigned long load_moved, total_nr_moved = 0, nr_moved;
+	long rem_load_move = max_load_move;
+
+	do {
+		nr_moved = class->load_balance(this_rq, this_cpu, busiest,
+				max_nr_move, (unsigned long)rem_load_move,
+				sd, idle, all_pinned, &load_moved);
+		total_nr_moved += nr_moved;
+		max_nr_move -= nr_moved;
+		rem_load_move -= load_moved;
+		class = class->next;
+	} while (class && max_nr_move && rem_load_move > 0);
+
+	return total_nr_moved;
+}
+
+/*
  * find_busiest_group finds and returns the busiest CPU group within the
  * domain. It calculates and returns the amount of weighted load w
...

 
Read Message
Read Message
Read Message
Read Message
Previous Topic: Re: [RFC] mm-controller
Next Topic: Introduce new filesystem inside VE
Goto Forum:
  


Current Time: Sun Oct 13 04:02:50 GMT 2024

Total time taken to generate the page: 0.06465 seconds