This patch introduces the core changes in CFS required to accomplish
group fairness at higher levels. It also modifies load balance interface
between classes a bit, so that move_tasks (which is centric to load
balance) can be reused to balance between runqueues of various types
(struct rq in case of SCHED_RT tasks, struct lrq in case of
SCHED_NORMAL/BATCH tasks).
Signed-off-by : Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
---
include/linux/sched.h | 17 ++
kernel/sched.c | 174 ++++++++++++++++------------
kernel/sched_debug.c | 31 +++--
kernel/sched_fair.c | 295 ++++++++++++++++++++++++++++++++++++++++++++----
kernel/sched_idletask.c | 11 +
kernel/sched_rt.c | 47 ++++++-
6 files changed, 464 insertions(+), 111 deletions(-)
Index: current/include/linux/sched.h
===================================================================
--- current.orig/include/linux/sched.h
+++ current/include/linux/sched.h
@@ -134,8 +134,11 @@ extern unsigned long nr_iowait(void);
extern unsigned long weighted_cpuload(const int cpu);
struct seq_file;
+struct cfs_rq;
extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
extern void proc_sched_set_task(struct task_struct *p);
+extern void
+print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now);
/*
* Task state bitmask. NOTE! These bits are also
@@ -865,8 +868,13 @@ struct sched_class {
struct task_struct * (*pick_next_task) (struct rq *rq, u64 now);
void (*put_prev_task) (struct rq *rq, struct task_struct *p, u64 now);
- struct task_struct * (*load_balance_start) (struct rq *rq);
- struct task_struct * (*load_balance_next) (struct rq *rq);
+ int (*load_balance) (struct rq *this_rq, int this_cpu,
+ struct rq *busiest,
+ unsigned long max_nr_move, unsigned long max_load_move,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned, unsigned long *total_load_moved);
+
+ void (*set_curr_task) (struct rq *rq);
void (*task_tick) (struct rq *rq, struct task_struct *p);
void (*task_new) (struct rq *rq, struct task_struct *p);
};
@@ -899,6 +907,11 @@ struct sched_entity {
s64 fair_key;
s64 sum_wait_runtime, sum_sleep_runtime;
unsigned long wait_runtime_overruns, wait_runtime_underruns;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct sched_entity *parent;
+ struct cfs_rq *cfs_rq, /* rq on which this entity is (to be) queued */
+ *my_q; /* rq "owned" by this entity/group */
+#endif
};
struct task_struct {
Index: current/kernel/sched.c
===================================================================
--- current.orig/kernel/sched.c
+++ current/kernel/sched.c
@@ -133,6 +133,22 @@ struct cfs_rq {
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
struct rb_node *rb_load_balance_curr;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /* 'curr' points to currently running entity on this cfs_rq.
+ * It is set to NULL otherwise (i.e when none are currently running).
+ */
+ struct sched_entity *curr;
+ struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
+
+ /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+ * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
+ * (like users, containers etc.)
+ *
+ * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
+ * list is used during load balance.
+ */
+ struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
+#endif
};
/* Real-Time classes' related field in a runqueue: */
@@ -168,6 +184,9 @@ struct rq {
u64 nr_switches;
struct cfs_rq cfs;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
+#endif
struct rt_rq rt;
/*
@@ -342,6 +361,16 @@ static inline unsigned long long rq_cloc
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/* Change a task's ->cfs_rq if it moves across CPUs */
+static inline void set_task_cfs_rq(struct task_struct *p)
+{
+ p->se.cfs_rq = &task_rq(p)->cfs;
+}
+#else
+static inline void set_task_cfs_rq(struct task_struct *p) { }
+#endif
+
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
@@ -738,6 +767,21 @@ static inline void dec_nr_running(struct
}
static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
+#ifdef CONFIG_SMP
+
+struct rq_iterator {
+ void *arg;
+ struct task_struct *(*start)(void *);
+ struct task_struct *(*next)(void *);
+};
+
+static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_nr_move, unsigned long max_load_move,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned, unsigned long *load_moved,
+ int this_best_prio, int best_prio, int best_prio_seen,
+ struct rq_iterator *iterator);
+#endif
#include "sched_stats.h"
#include "sched_rt.c"
@@ -894,6 +938,7 @@ unsigned long weighted_cpuload(const int
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
task_thread_info(p)->cpu = cpu;
+ set_task_cfs_rq(p);
}
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@ -919,6 +964,7 @@ void set_task_cpu(struct task_struct *p,
task_thread_info(p)->cpu = new_cpu;
+ set_task_cfs_rq(p);
}
struct migration_req {
@@ -2003,89 +2049,26 @@ int can_migrate_task(struct task_struct
return 1;
}
-/*
- * Load-balancing iterator: iterate through the hieararchy of scheduling
- * classes, starting with the highest-prio one:
- */
-
-struct task_struct * load_balance_start(struct rq *rq)
-{
- struct sched_class *class = sched_class_highest;
- struct task_struct *p;
-
- do {
- p = class->load_balance_start(rq);
- if (p) {
- rq->load_balance_class = class;
- return p;
- }
- class = class->next;
- } while (class);
-
- return NULL;
-}
-
-struct task_struct * load_balance_next(struct rq *rq)
-{
- struct sched_class *class = rq->load_balance_class;
- struct task_struct *p;
-
- p = class->load_balance_next(rq);
- if (p)
- return p;
- /*
- * Pick up the next class (if any) and attempt to start
- * the iterator there:
- */
- while ((class = class->next)) {
- p = class->load_balance_start(rq);
- if (p) {
- rq->load_balance_class = class;
- return p;
- }
- }
- return NULL;
-}
-
-#define rq_best_prio(rq) (rq)->curr->prio
-
-/*
- * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
- * load from busiest to this_rq, as part of a balancing operation within
- * "domain". Returns the number of tasks moved.
- *
- * Called with both runqueues locked.
- */
-static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_nr_move, unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned)
+ int *all_pinned, unsigned long *load_moved,
+ int this_best_prio, int best_prio, int best_prio_seen,
+ struct rq_iterator *iterator)
{
- int pulled = 0, pinned = 0, this_best_prio, best_prio,
- best_prio_seen, skip_for_load;
+ int pulled = 0, pinned = 0, skip_for_load;
struct task_struct *p;
- long rem_load_move;
+ long rem_load_move = max_load_move;
if (max_nr_move == 0 || max_load_move == 0)
goto out;
- rem_load_move = max_load_move;
pinned = 1;
- this_best_prio = rq_best_prio(this_rq);
- best_prio = rq_best_prio(busiest);
- /*
- * Enable handling of the case where there is more than one task
- * with the best priority. If the current running task is one
- * of those with prio==best_prio we know it won't be moved
- * and therefore it's safe to override the skip (based on load) of
- * any task we find with that prio.
- */
- best_prio_seen = best_prio == busiest->curr->prio;
/*
* Start the load-balancing iterator:
*/
- p = load_balance_start(busiest);
+ p = iterator->start(iterator->arg);
next:
if (!p)
goto out;
@@ -2102,7 +2085,7 @@ next:
!can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
best_prio_seen |= p->prio == best_prio;
- p = load_balance_next(busiest);
+ p = iterator->next(iterator->arg);
goto next;
}
@@ -2117,7 +2100,7 @@ next:
if (pulled < max_nr_move && rem_load_move > 0) {
if (p->prio < this_best_prio)
this_best_prio = p->prio;
- p = load_balance_next(busiest);
+ p = iterator->next(iterator->arg);
goto next;
}
out:
@@ -2130,10 +2113,40 @@ out:
if (all_pinned)
*all_pinned = pinned;
+ *load_moved = max_load_move - rem_load_move;
return pulled;
}
/*
+ * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
+ * load from busiest to this_rq, as part of a balancing operation within
+ * "domain". Returns the number of tasks moved.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_nr_move, unsigned long max_load_move,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned)
+{
+ struct sched_class *class = sched_class_highest;
+ unsigned long load_moved, total_nr_moved = 0, nr_moved;
+ long rem_load_move = max_load_move;
+
+ do {
+ nr_moved = class->load_balance(this_rq, this_cpu, busiest,
+ max_nr_move, (unsigned long)rem_load_move,
+ sd, idle, all_pinned, &load_moved);
+ total_nr_moved += nr_moved;
+ max_nr_move -= nr_moved;
+ rem_load_move -= load_moved;
+ class = class->next;
+ } while (class && max_nr_move && rem_load_move > 0);
+
+ return total_nr_moved;
+}
+
+/*
* find_busiest_group finds and returns the busiest CPU group within the
* domain. It calculates and returns the amount of weighted load w
...