OpenVZ Forum: Devel » Re: [PATCH] Hookup group-scheduler with task container infrastructure

Home » Mailing lists » Devel » Re: [PATCH] Hookup group-scheduler with task container infrastructure

Show: Today's Messages :: Show Polls :: Message Navigator
E-mail to friend

Re: [PATCH] Hookup group-scheduler with task container infrastructure [message #20218 is a reply to message #20157]

Thu, 13 September 2007 12:27

Srivatsa Vaddagiri
Messages: 241
Registered: August 2006

Senior Member

On Wed, Sep 12, 2007 at 06:25:37PM +0200, Dmitry Adamushko wrote:
> > +       kfree(tg);
> > +}
> 
> kfree(tg->cfs_rq) && kfree(tg->se) ?

oops, yes!

> > +       if (tsk->sched_class != &fair_sched_class)
> > +               goto done;
> 
> this check should be redundant now with sched_can_attach() in place.

well, there is remote possibility that task could have changed policies
between sched_can_attach() and sched_move_task()! In the updated patch
below, I have put some hooks to deal with that scenario elsewhere also
(in rt_mutex_setprio and __setscheduler). Pls let me know if you see 
any issues with that.

> > +static void set_se_shares(struct sched_entity *se, unsigned long shares)
> > +{
> > +       struct cfs_rq *cfs_rq = se->cfs_rq;
> > +       struct rq *rq = cfs_rq->rq;
> > +       int on_rq;
> > +
> > +       spin_lock_irq(&rq->lock);
> > +
> > +       on_rq = se->on_rq;
> > +       if (on_rq)
> > +               __dequeue_entity(cfs_rq, se);
> > +
> > +       se->load.weight = shares;
> > +       se->load.inv_weight = div64_64((1ULL<<32), shares);
> 
> A bit of nit-picking... are you sure, there is no need in non '__'
> versions of dequeue/enqueu() here (at least, for the sake of

You are right, we need the dequeue/enqueue_entity here, as the sched
entity load is changing in between the two function calls.

How does the updated patch look?  (Thanks again for all your review so far!)


--


Add interface to control cpu bandwidth allocation to task-groups.

Signed-off-by : Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by : Dhaval Giani <dhaval@linux.vnet.ibm.com>


---
 include/linux/container_subsys.h |    6 
 init/Kconfig                     |    9 +
 kernel/sched.c                   |  344 +++++++++++++++++++++++++++++++++++++--
 kernel/sched_fair.c              |    3 
 kernel/sched_idletask.c          |    5 
 kernel/sched_rt.c                |    5 
 6 files changed, 355 insertions(+), 17 deletions(-)

Index: current/include/linux/container_subsys.h
===================================================================
--- current.orig/include/linux/container_subsys.h
+++ current/include/linux/container_subsys.h
@@ -36,3 +36,9 @@ SUBSYS(mem_container)
 #endif
 
 /* */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+SUBSYS(cpu)
+#endif
+
+/* */
Index: current/init/Kconfig
===================================================================
--- current.orig/init/Kconfig
+++ current/init/Kconfig
@@ -326,6 +326,15 @@ config RESOURCE_COUNTERS
           infrastructure that works with containers
 	depends on CONTAINERS
 
+config FAIR_GROUP_SCHED
+	bool "Fair group scheduler"
+	depends on EXPERIMENTAL && CONTAINERS
+	help
+	  This option enables you to group tasks and control CPU resource
+	  allocation to such groups.
+
+	  Say N if unsure.
+
 config SYSFS_DEPRECATED
 	bool "Create deprecated sysfs files"
 	default y
Index: current/kernel/sched.c
===================================================================
--- current.orig/kernel/sched.c
+++ current/kernel/sched.c
@@ -179,6 +179,58 @@ struct load_stat {
 	unsigned long delta_fair, delta_exec, delta_stat;
 };
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+#include <linux/container.h>
+
+struct cfs_rq;
+
+/* task group related information */
+struct task_grp {
+	struct container_subsys_state css;
+	/* schedulable entities of this group on each cpu */
+	struct sched_entity **se;
+	/* runqueue "owned" by this group on each cpu */
+	struct cfs_rq **cfs_rq;
+	unsigned long shares;
+};
+
+/* Default task group's sched entity on each cpu */
+static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
+/* Default task group's cfs_rq on each cpu */
+static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+
+static struct sched_entity *init_sched_entity_p[CONFIG_NR_CPUS];
+static struct cfs_rq *init_cfs_rq_p[CONFIG_NR_CPUS];
+
+/* Default task group.
+ * 	Every task in system belong to this group at bootup.
+ */
+static struct task_grp init_task_grp =  {
+					.se     = init_sched_entity_p,
+					.cfs_rq = init_cfs_rq_p,
+					};
+
+/* return group to which a task belongs */
+static inline struct task_grp *task_grp(struct task_struct *p)
+{
+	return container_of(task_subsys_state(p, cpu_subsys_id),
+				struct task_grp, css);
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_cfs_rq(struct task_struct *p)
+{
+	p->se.cfs_rq = task_grp(p)->cfs_rq[task_cpu(p)];
+	p->se.parent = task_grp(p)->se[task_cpu(p)];
+}
+
+#else
+
+static inline void set_task_cfs_rq(struct task_struct *p) { }
+
+#endif	/* CONFIG_FAIR_GROUP_SCHED */
+
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
 	struct load_weight load;
@@ -208,6 +260,7 @@ struct cfs_rq {
 	 * list is used during load balance.
 	 */
 	struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
+	struct task_grp *tg;    /* group that "owns" this runqueue */
 #endif
 };
 
@@ -405,18 +458,6 @@ unsigned long long cpu_clock(int cpu)
 
 EXPORT_SYMBOL_GPL(cpu_clock);
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/* Change a task's ->cfs_rq if it moves across CPUs */
-static inline void set_task_cfs_rq(struct task_struct *p)
-{
-	p->se.cfs_rq = &task_rq(p)->cfs;
-}
-#else
-static inline void set_task_cfs_rq(struct task_struct *p)
-{
-}
-#endif
-
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif
@@ -3987,8 +4028,11 @@ void rt_mutex_setprio(struct task_struct
 
 	oldprio = p->prio;
 	on_rq = p->se.on_rq;
-	if (on_rq)
+	if (on_rq) {
 		dequeue_task(rq, p, 0);
+		if (task_running(rq, p))
+			p->sched_class->put_prev_task(rq, p);
+	}
 
 	if (rt_prio(prio))
 		p->sched_class = &rt_sched_class;
@@ -4007,6 +4051,7 @@ void rt_mutex_setprio(struct task_struct
 		if (task_running(rq, p)) {
 			if (p->prio > oldprio)
 				resched_task(rq->curr);
+			p->sched_class->set_curr_task(rq);
 		} else {
 			check_preempt_curr(rq, p);
 		}
@@ -4293,8 +4338,11 @@ recheck:
 	}
 	update_rq_clock(rq);
 	on_rq = p->se.on_rq;
-	if (on_rq)
+	if (on_rq) {
 		deactivate_task(rq, p, 0);
+		if (task_running(rq, p))
+			p->sched_class->put_prev_task(rq, p);
+	}
 	oldprio = p->prio;
 	__setscheduler(rq, p, policy, param->sched_priority);
 	if (on_rq) {
@@ -4307,6 +4355,7 @@ recheck:
 		if (task_running(rq, p)) {
 			if (p->prio > oldprio)
 				resched_task(rq->curr);
+			p->sched_class->set_curr_task(rq);
 		} else {
 			check_preempt_curr(rq, p);
 		}
@@ -6567,7 +6616,25 @@ void __init sched_init(void)
 		init_cfs_rq(&rq->cfs, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-		list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+	 	{
+ 			struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
+ 			struct sched_entity *se =
+ 					 &per_cpu(init_sched_entity, i);
+
+ 			init_cfs_rq_p[i] = cfs_rq;
+ 			init_cfs_rq(cfs_rq, rq);
+ 			cfs_rq->tg = &init_task_grp;
+ 			list_add(&cfs_rq->leaf_cfs_rq_list,
+							 &rq->leaf_cfs_rq_list);
+
+ 			init_sched_entity_p[i] = se;
+ 			se->cfs_rq = &rq->cfs;
+ 			se->my_q = cfs_rq;
+ 			se->load.weight = NICE_0_LOAD;
+			se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
+ 			se->parent = NULL;
+ 		}
+		init_task_grp.shares = NICE_0_LOAD;
 #endif
 		rq->ls.load_update_last = now;
 		rq->ls.load_update_start = now;
@@ -6764,3 +6831,250 @@ void set_curr_task(int cpu, struct task_
 }
 
 #endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* return corresponding task_grp object of a container */
+static inline struct task_grp *container_tg(struct container *cont)
+{
+	return container_of(container_subsys_state(cont, cpu_subsys_id),
+					 struct task_grp, css);
+}
+
+/* allocate runqueue etc for a new task group */
+static struct container_subsys_state *
+sched_create_group(struct container_subsys *ss, struct container *cont)
+{
+	struct task_grp *tg;
+	struct cfs_rq *cfs_rq;
+	struct sched_entity *se;
+	int i;
+
+	if (!cont->parent) {
+		/* This is early initialization for the top container */
+		init_task_grp.css.container = cont;
+		return &init_task_grp.css;
+	}
+
+	/* we support only 1-level deep hierarchical scheduler atm */
+	if (cont->parent->parent)
+		return ERR_PTR(-EINVAL);
+
+	tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+	if (!tg)
+		return ERR_PTR(-ENOMEM);
+
+	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * num_possible_cpus(), GFP_KERNEL);
+	if (!tg->cfs_rq)
+		goto err;
+	tg->se = kzalloc(sizeof(se) * num_possible_cpus(), GFP_KERNEL);
+	if (!tg->se)
+		goto err;
+
+	for_each_possible_cpu(i) {
+		struct rq *rq = cpu_rq(i);
+
+		cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL,
+							 cpu_to_node(i));
+		if (!cfs_rq)
+			goto err;
+
+		se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL,
+							cpu_to_node(i));
+		if (!se)
+			goto err;
+
+		memset(cfs_rq, 0, sizeof(struct cfs_rq));
+		memset(se, 0, sizeof(struct sched_entity));
+
+		tg->cfs_rq[i] = cfs_rq;
+		init_cfs_rq(cfs_rq, rq);
+		cfs_rq->tg = tg;
+		list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+
+		tg->se[i] = se;
+		se->cfs_rq = &rq->cfs;
+		se->my_q = cfs_rq;
+		se->load.weight = NICE_0_LOAD;
+		se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
+		se->parent = NULL;
+	}
+
+	tg->shares = NICE_0_LOAD;
+
+	/* Bind the container to task_grp object we just created */
+	tg->css.container = cont;
+
+	return &tg->css;
+
+err:
+	for_each_possible_cpu(i) {
+		if (tg->cfs_rq && tg->cfs_rq[i])
+			kfree(tg->cfs_rq[i]);
+		if (tg->se && tg->se[i])
+

...

[ Show the rest of the message ]

Report message to a moderator

[Message index]

		Re: [PATCH] Hookup group-scheduler with task container infrastructure By: akpm on Mon, 10 September 2007 17:22
		Re: Re: [PATCH] Hookup group-scheduler with task container infrastructure By: Paul Menage on Mon, 10 September 2007 18:38
		Re: Re: [PATCH] Hookup group-scheduler with task container infrastructure By: Srivatsa Vaddagiri on Tue, 11 September 2007 04:44
		Re: [PATCH] Hookup group-scheduler with task container infrastructure By: Jan Engelhardt on Mon, 10 September 2007 17:41
		Re: [PATCH] Hookup group-scheduler with task container infrastructure By: Srivatsa Vaddagiri on Mon, 10 September 2007 17:46
		Re: [PATCH] Hookup group-scheduler with task container infrastructure By: Dmitry Adamushko on Mon, 10 September 2007 22:28
		Re: [PATCH] Hookup group-scheduler with task container infrastructure By: Srivatsa Vaddagiri on Tue, 11 September 2007 04:41
		Re: [PATCH] Hookup group-scheduler with task container infrastructure By: Dmitry Adamushko on Tue, 11 September 2007 09:53
		Re: [PATCH] Hookup group-scheduler with task container infrastructure By: Srivatsa Vaddagiri on Wed, 12 September 2007 11:33
		Re: [PATCH] Hookup group-scheduler with task container infrastructure By: Srivatsa Vaddagiri on Wed, 12 September 2007 12:05
		Re: [PATCH] Hookup group-scheduler with task container infrastructure By: Dmitry Adamushko on Wed, 12 September 2007 16:25
		Re: [PATCH] Hookup group-scheduler with task container infrastructure By: Srivatsa Vaddagiri on Thu, 13 September 2007 12:27
		Re: [PATCH] Hookup group-scheduler with task container infrastructure By: Ingo Molnar on Fri, 14 September 2007 09:41
		Re: [PATCH] Hookup group-scheduler with task container infrastructure By: Srivatsa Vaddagiri on Fri, 14 September 2007 15:51
		Re: [PATCH] Hookup group-scheduler with task container infrastructure By: Srivatsa Vaddagiri on Fri, 14 September 2007 16:13
		Re: [PATCH] Hookup group-scheduler with task container infrastructure By: Dmitry Adamushko on Mon, 10 September 2007 22:37

Previous Topic:	[NETNS45][patch 0/1] remove udp rover
Next Topic:	[PATCH 1/2] Convert uid hash to hlist

Goto Forum:

-=] Back to Top [=-

[ Syndicate this forum (XML) ] [

] [

]

Current Time: Mon Feb 16 22:45:20 GMT 2026

Total time taken to generate the page: 0.43965 seconds