| Home » Mailing lists » Devel » [PATCH 0/7] Generic Process Containers (+ ResGroups/BeanCounters) Goto Forum:
	| 
		
			| [PATCH 0/7] Generic Process Containers (+ ResGroups/BeanCounters) [message #8540] | Thu, 23 November 2006 12:08  |  
			| 
				
				
					|  Paul Menage Messages: 642
 Registered: September 2006
 | Senior Member |  |  |  
	| This is an update to my multi-hierarchy generic containers patch (against 2.6.19-rc6). Changes include:
 
 - an example patch implementing the BeanCounters core and numfiles
 counters over generic containers. The addition of the
 BeanCounters code unifies the three main process grouping
 abstractions (Cpusets, ResGroups and BeanCounters).
 
 - a patch splitting Cpusets into two independently groupable
 subsystems, Cpusets and Memsets.
 
 - support for a subsystem to keep a container alive via refcounts
 (e.g. the BeanCounters numfiles counter has a reference to the
 beancounter object from each file charged to that beancounter, so
 needs to be able to keep the beancounter alive until the file is
 destroyed)
 
 -------------------------------------
 
 There have recently been various proposals floating around for
 resource management/accounting subsystems in the kernel, including
 Res Groups, User BeanCounters and others.  These all need the basic
 abstraction of being able to group together multiple processes in an
 aggregate, in order to track/limit the resources permitted to those
 processes, and all implement this grouping in different ways.
 
 Already existing in the kernel is the cpuset subsystem; this has a
 process grouping mechanism that is mature, tested, and well documented
 (particularly with regard to synchronization rules).
 
 This patchset extracts the process grouping code from cpusets into a
 generic container system, and makes the cpusets code a client of
 the container system.
 
 It also provides several example clients of the container system,
 including ResGroups and BeanCounters.
 
 The change is implemented in five stages plus two additional example patches:
 
 1) extract the process grouping code from cpusets into a standalone system
 
 2) remove the process grouping code from cpusets and hook into the
 container system
 
 3) convert the container system to present a generic multi-hierarchy
 API, and make cpusets a client of that API
 
 4) add a simple CPU accounting container subsystem as an example
 
 5) example of implementing ResGroups and its numtasks controller over
 generic containers - not intended to be applied with this patch set
 
 6) split cpusets into two subsystems, cpusets and memsets
 
 7) example of implementing BeanCounters and its numfiles counter over
 generic containers - not intended to be applied with this patch set
 
 
 The intention is that the various resource management efforts can also
 become container clients, with the result that:
 
 - the userspace APIs are (somewhat) normalised
 
 - it's easier to test out e.g. the ResGroups CPU controller in
 conjunction with the BeanCounters memory controller
 
 - the additional kernel footprint of any of the competing resource
 management systems is substantially reduced, since it doesn't need
 to provide process grouping/containment, hence improving their
 chances of getting into the kernel
 
 Signed-off-by: Paul Menage <menage@google.com>
 
 --
 |  
	|  |  |  
	| 
		
			| [PATCH 3/7] Add generic multi-subsystem API to containers [message #8541 is a reply to message #8540] | Thu, 23 November 2006 12:08   |  
			| 
				
				
					|  Paul Menage Messages: 642
 Registered: September 2006
 | Senior Member |  |  |  
	| This patch removes all cpuset-specific knowledge from the container system, replacing it with a generic API that can be used by multiple
 subsystems. Cpusets is adapted to be a container subsystem.
 
 Signed-off-by: Paul Menage <menage@google.com>
 
 ---
 Documentation/containers.txt |  233 ++++++++++++-
 include/linux/container.h    |  132 +++++++
 include/linux/cpuset.h       |   16
 include/linux/mempolicy.h    |   12
 include/linux/sched.h        |    4
 kernel/container.c           |  716 +++++++++++++++++++++++++++++++++++--------
 kernel/cpuset.c              |  165 ++++++---
 mm/mempolicy.c               |    2
 8 files changed, 1048 insertions(+), 232 deletions(-)
 
 Index: container-2.6.19-rc6/include/linux/container.h
 ===================================================================
 --- container-2.6.19-rc6.orig/include/linux/container.h
 +++ container-2.6.19-rc6/include/linux/container.h
 @@ -14,8 +14,6 @@
 
 #ifdef CONFIG_CONTAINERS
 
 -extern int number_of_containers;	/* How many containers are defined in system? */
 -
 extern int container_init_early(void);
 extern int container_init(void);
 extern void container_init_smp(void);
 @@ -30,6 +28,68 @@ extern void container_unlock(void);
 extern void container_manage_lock(void);
 extern void container_manage_unlock(void);
 
 +struct containerfs_root;
 +
 +/* Per-subsystem/per-container state maintained by the system. */
 +struct container_subsys_state {
 +	/* The container that this subsystem is attached to. Useful
 +	 * for subsystems that want to know about the container
 +	 * hierarchy structure */
 +	struct container *container;
 +
 +	/* State maintained by the container system to allow
 +	 * subsystems to be "busy". Should be accessed via css_get()
 +	 * and css_put() */
 +	spinlock_t refcnt_lock;
 +	atomic_t refcnt;
 +};
 +
 +/*
 + * Call css_get() to hold a reference on the container; following a
 + * return of 0, this container subsystem state object is guaranteed
 + * not to be destroyed until css_put() is called on it.  A non-zero
 + * return code indicates that a reference could not be taken.
 + *
 + */
 +
 +static inline int css_get(struct container_subsys_state *css)
 +{
 +	int retval = 0;
 +	unsigned long flags;
 +	/* Synchronize with container_rmdir() */
 +	spin_lock_irqsave(&css->refcnt_lock, flags);
 +	if (atomic_read(&css->refcnt) >= 0) {
 +		/* Container is still alive */
 +		atomic_inc(&css->refcnt);
 +	} else {
 +		/* Container removal is in progress */
 +		retval = -EINVAL;
 +	}
 +	spin_unlock_irqrestore(&css->refcnt_lock, flags);
 +	return retval;
 +}
 +
 +/*
 + * If you are holding current->alloc_lock then it's impossible for you
 + * to be moved out of your container, and hence it's impossible for
 + * your container to be destroyed. Therefore doing a simple
 + * atomic_inc() on a css is safe.
 + */
 +
 +static inline void css_get_current(struct container_subsys_state *css)
 +{
 +	atomic_inc(&css->refcnt);
 +}
 +
 +/*
 + * css_put() should be called to release a reference taken by
 + * css_get() or css_get_current()
 + */
 +
 +static inline void css_put(struct container_subsys_state *css) {
 +	atomic_dec(&css->refcnt);
 +}
 +
 struct container {
 unsigned long flags;		/* "unsigned long" so bitops work */
 
 @@ -46,11 +106,15 @@ struct container {
 struct list_head children;	/* my children */
 
 struct container *parent;	/* my parent */
 -	struct dentry *dentry;		/* container fs entry */
 +	struct dentry *dentry;	  	/* container fs entry */
 
 -#ifdef CONFIG_CPUSETS
 -	struct cpuset *cpuset;
 -#endif
 +	/* Private pointers for each registered subsystem */
 +	struct container_subsys_state *subsys[CONFIG_MAX_CONTAINER_SUBSYS];
 +
 +	int hierarchy;
 +
 +	struct containerfs_root *root;
 +	struct container *top_container;
 };
 
 /* struct cftype:
 @@ -67,8 +131,9 @@ struct container {
 */
 
 struct inode;
 +#define MAX_CFTYPE_NAME 64
 struct cftype {
 -	char *name;
 +	char name[MAX_CFTYPE_NAME];
 int private;
 int (*open) (struct inode *inode, struct file *file);
 ssize_t (*read) (struct container *cont, struct cftype *cft,
 @@ -87,6 +152,59 @@ void container_set_release_agent_path(co
 
 int container_path(const struct container *cont, char *buf, int buflen);
 
 +/* Container subsystem type. See Documentation/containers.txt for details */
 +
 +struct container_subsys {
 +	int (*create)(struct container_subsys *ss,
 +		      struct container *cont);
 +	void (*destroy)(struct container_subsys *ss, struct container *cont);
 +	int (*can_attach)(struct container_subsys *ss,
 +			  struct container *cont, struct task_struct *tsk);
 +	void (*attach)(struct container_subsys *ss, struct container *cont,
 +			struct container *old_cont, struct task_struct *tsk);
 +	void (*post_attach)(struct container_subsys *ss,
 +			    struct container *cont,
 +			    struct container *old_cont,
 +			    struct task_struct *tsk);
 +	void (*fork)(struct container_subsys *ss, struct task_struct *task);
 +	void (*exit)(struct container_subsys *ss, struct task_struct *task);
 +	int (*populate)(struct container_subsys *ss,
 +			struct container *cont);
 +
 +	int subsys_id;
 +#define MAX_CONTAINER_TYPE_NAMELEN 32
 +	const char *name;
 +
 +	/* Protected by RCU */
 +	int hierarchy;
 +
 +	struct list_head sibling;
 +};
 +
 +int container_register_subsys(struct container_subsys *subsys);
 +
 +static inline struct container_subsys_state *container_subsys_state(
 +	struct container *cont,
 +	struct container_subsys *ss)
 +{
 +	return cont->subsys[ss->subsys_id];
 +}
 +
 +static inline struct container* task_container(struct task_struct *task,
 +					       struct container_subsys *ss)
 +{
 +	return rcu_dereference(task->container[ss->hierarchy]);
 +}
 +
 +static inline struct container_subsys_state *task_subsys_state(
 +	struct task_struct *task,
 +	struct container_subsys *ss)
 +{
 +	return container_subsys_state(task_container(task, ss), ss);
 +}
 +
 +int container_path(const struct container *cont, char *buf, int buflen);
 +
 #else /* !CONFIG_CONTAINERS */
 
 static inline int container_init_early(void) { return 0; }
 Index: container-2.6.19-rc6/include/linux/cpuset.h
 ===================================================================
 --- container-2.6.19-rc6.orig/include/linux/cpuset.h
 +++ container-2.6.19-rc6/include/linux/cpuset.h
 @@ -60,16 +60,7 @@ static inline int cpuset_do_slab_mem_spr
 
 extern void cpuset_track_online_nodes(void);
 
 -extern int cpuset_can_attach_task(struct container *cont,
 -				  struct task_struct *tsk);
 -extern void cpuset_attach_task(struct container *cont,
 -				struct task_struct *tsk);
 -extern void cpuset_post_attach_task(struct container *cont,
 -				    struct container *oldcont,
 -				    struct task_struct *tsk);
 -extern int cpuset_populate_dir(struct container *cont);
 -extern int cpuset_create(struct container *cont);
 -extern void cpuset_destroy(struct container *cont);
 +extern int current_cpuset_is_being_rebound(void);
 
 #else /* !CONFIG_CPUSETS */
 
 @@ -131,6 +122,11 @@ static inline int cpuset_do_slab_mem_spr
 
 static inline void cpuset_track_online_nodes(void) {}
 
 +static inline int current_cpuset_is_being_rebound(void)
 +{
 +	return 0;
 +}
 +
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
 Index: container-2.6.19-rc6/kernel/container.c
 ===================================================================
 --- container-2.6.19-rc6.orig/kernel/container.c
 +++ container-2.6.19-rc6/kernel/container.c
 @@ -55,7 +55,6 @@
 #include <linux/time.h>
 #include <linux/backing-dev.h>
 #include <linux/sort.h>
 -#include <linux/cpuset.h>
 
 #include <asm/uaccess.h>
 #include <asm/atomic.h>
 @@ -63,12 +62,47 @@
 
 #define CONTAINER_SUPER_MAGIC		0x27e0eb
 
 -/*
 - * Tracks how many containers are currently defined in system.
 - * When there is only one container (the root container) we can
 - * short circuit some hooks.
 +static struct container_subsys *subsys[CONFIG_MAX_CONTAINER_SUBSYS];
 +static int subsys_count = 0;
 +
 +struct containerfs_root {
 +	struct super_block *sb;
 +	unsigned long subsys_bits;
 +	struct list_head subsys_list;
 +	struct container top_container;
 +	/*
 +	 * Tracks how many containers are currently defined in system.
 +	 * When there is only one container (the root container) we can
 +	 * short circuit some hooks.
 +	 */
 +	int number_of_containers;
 +	struct vfsmount *pin_mount;
 +};
 +
 +/* The set of hierarchies in use. Hierarchy 0 is the "dummy
 + * container", reserved for the subsystems that are otherwise
 + * unattached - it never has more than a single container, and all
 + * tasks are part of that container. */
 +
 +static struct containerfs_root rootnode[CONFIG_MAX_CONTAINER_HIERARCHIES];
 +
 +/* dummytop is a shorthand for the dummy hierarchy's top container */
 +#define dummytop (&rootnode[0].top_container)
 +
 +/* This flag indicates whether tasks in the fork and exit paths should
 + * take callback_mutex and check for fork/exit handlers to call. This
 + * avoids us having to take locks in the fork/exit path if none of the
 + * subsystems need to be called.
 + *
 + * It is protected via RCU, with the invariant that a process in an
 + * rcu_read_lock() section will never see this as 0 if there are
 + * actually registered subsystems with a fork or exit
 + * handler. (Sometimes it may be 1 without there being any registered
 + * subsystems with such a handler, but such periods are safe and of
 + * short duration).
 */
 -int number_of_containers __read_mostly;
 +
 +static int need_forkexit_callback = 0;
 
 /* bits in struct container flags field */
 typedef enum {
 @@ -87,11 +121,8 @@ static inline int notify_on_release(cons
 return test_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
 }
 
 -static struct container top_container = {
 -	.count = ATOMIC_INIT(0),
 -	.sibling = LIST_HEAD_INIT(top_container.sibling),
 -	.children = LIST_HEAD_INIT(top_container.children),
 -};
 +#define for_each_subsys(_hierarchy, _ss) list_for_each_entry(_ss, &rootnode[_hierarchy].subsys_list, sibling)
 +
 
 /* The path to use for release notifications. No locking between
 * setting and use - so if userspace updates this while subcontainers
 @@ -105,9 +136,6 @@ void container_set_release_agent_path(co
 container_manage_unlock();
 }
 
 -static struct vfsmount *container_mount;
 -static struct super_block *container_sb;
 -
 /*
 * We have two global container mutexes below.  They can nest.
 * It is ok to first take manage_mutex, then nest callback_mutex.  We also
 @@ -202,15 +230,18 @@ static DEFINE_MUTEX(callback_mutex);
 
 static int container_mkdir(struct inode *dir, struct dentry *dentry, int mode);
 static int container_rmdir(struct inode *unused_dir, struct dentry *dentry);
 +static int container_populate_dir(struct container *cont);
 +static struct inode_operations container_dir_inode_operations;
 +struct file_operations proc_containerstats_operations;
 
 static struct backing_dev_info container_backing_dev_info = {
 .ra_pages = 0,		/* No readahead */
 .capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
 };
 
 -static struct inode *container_new_inode(mode_t mode)
 +static struct inode *container_new_inode(mode_t mode, struct super_block *sb)
 {
 -	struct inode *inode = new_inode(container_sb);
 +	struct inode *inode = new_inode(sb);
 
 if (inode) {
 inode->i_mode = mode;
 @@ -282,32 +313,102 @@ static void container_d_remove_dir(struc
 remove_dir(dentry);
 }
 
 +/*
 + * Release the last use of a hierarchy.  Will never be called when
 + * there are active subcontainers since each subcontainer bumps the
 + * value of sb->s_active.
 + */
 +
 +static void container_put_super(struct super_block *sb) {
 +
 +	struct containerfs_root *root = sb->s_fs_info;
 +	int hierarchy = root->top_container.hierarchy;
 +	int i;
 +	struct container *cont = &root->top_container;
 +	struct task_struct *g, *p;
 +
 +	root->sb = NULL;
 +	sb->s_fs_info = NULL;
 +
 +	mutex_lock(&manage_mutex);
 +
 +	BUG_ON(root->number_of_containers != 1);
 +	BUG_ON(!list_empty(&cont->children));
 +	BUG_ON(!list_empty(&cont->sibling));
 +	BUG_ON(!root->subsys_bits);
 +
 +	mutex_lock(&callback_mutex);
 +
 +	/* Remove all tasks from this container hierarchy */
 +	read_lock(&tasklist_lock);
 +	do_each_thread(g, p) {
 +		task_lock(p);
 +		BUG_ON(!p->container[hierarchy]);
 +		BUG_ON(p->container[hierarchy] != cont);
 +		rcu_assign_pointer(p->container[hierarchy], NULL);
 +		task_unlock(p);
 +	} while_each_thread(g, p);
 +	read_unlock(&tasklist_lock);
 +	atomic_set(&cont->count, 1);
 +
 +	/* Remove all subsystems from this hierarchy */
 +	for (i = 0; i < subsys_count; i++) {
 +		if (root->subsys_bits & (1 << i)) {
 +			struct container_subsys *ss = subsys[i];
 +			BUG_ON(cont->subsys[i] != dummytop->subsys[i]);
 +			BUG_ON(cont->subsys[i]->container != cont);
 +			dummytop->subsys[i]->container = dummytop;
 +			cont->subsys[i] = NULL;
 +			rcu_assign_pointer(subsys[i]->hierarchy, 0);
 +			list_del(&ss->sibling);
 +		} else {
 +			BUG_ON(cont->subsys[i]);
 +		}
 +	}
 +	root->subsys_bits = 0;
 +	mutex_unlock(&callback_mutex);
 +	synchronize_rcu();
 +
 +	mutex_unlock(&manage_mutex);
 +}
 +
 +static int container_show_options(struct seq_file *seq, struct vfsmount *vfs) {
 +	struct containerfs_root *root = vfs->mnt_sb->s_fs_info;
 +	struct container_subsys *ss;
 +	for_each_subsys(root->top_container.hierarchy, ss) {
 +		seq_printf(seq, ",%s", ss->name);
 +	}
 +	return 0;
 +}
 +
 static struct super_operations container_ops = {
 .statfs = simple_statfs,
 .drop_inode = generic_delete_inode,
 +	.put_super = container_put_super,
 +	.show_options = container_show_options,
 };
 
 -static int container_fill_super(struct super_block *sb, void *unused_data,
 -							int unused_silent)
 +static int container_fill_super(struct super_block *sb, void *options,
 +				int unused_silent)
 {
 struct inode *inode;
 struct dentry *root;
 +	struct containerfs_root *hroot = options;
 
 sb->s_blocksize = PAGE_CACHE_SIZE;
 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
 sb->s_magic = CONTAINER_SUPER_MAGIC;
 sb->s_op = &container_ops;
 -	container_sb = sb;
 
 -	inode = container_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR);
 -	if (inode) {
 -		inode->i_op = &simple_dir_inode_operations;
 -		inode->i_fop = &simple_dir_operations;
 -		/* directories start off with i_nlink == 2 (for "." entry) */
 -		inode->i_nlink++;
 -	} else {
 +	inode = container_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
 +	if (!inode)
 return -ENOMEM;
 -	}
 +
 +	inode->i_op = &simple_dir_inode_operations;
 +	inode->i_fop = &simple_dir_operations;
 +	inode->i_op = &container_dir_inode_operations;
 +	/* directories start off with i_nlink == 2 (for "." entry) */
 +	inc_nlink(inode);
 
 root = d_alloc_root(inode);
 if (!root) {
 @@ -315,6 +416,12 @@ static int container_fill_super(struct s
 return -ENOMEM;
 }
 sb->s_root = root;
 +	root->d_fsdata = &hroot->top_container;
 +	hroot->top_container.dentry = root;
 +
 +	sb->s_fs_info = hroot;
 +	hroot->sb = sb;
 +
 return 0;
 }
 
 @@ -322,7 +429,130 @@ static int container_get_sb(struct file_
 int flags, const char *unused_dev_name,
 void *data, struct vfsmount *mnt)
 {
 -	return get_sb_single(fs_type, flags, data, container_fill_super, mnt);
 +	int i;
 +	struct container_subsys *ss;
 +	char *token, *o = data ?: "all";
 +	unsigned long subsys_bits = 0;
 +	int ret = 0;
 +	struct containerfs_root *root = NULL;
 +	int hierarchy;
 +
 +	mutex_lock(&manage_mutex);
 +
 +	/* First find the desired set of resource controllers */
 +	while ((token = strsep(&o, ",")) != NULL) {
 +		if (!*token) {
 +			ret = -EINVAL;
 +			goto out_unlock;
 +		}
 +		if (!strcmp(token, "all")) {
 +			subsys_bits = (1 << subsys_count) - 1;
 +		} else {
 +			for (i = 0; i < subsys_count; i++) {
 +				ss = subsys[i];
 +				if (!strcmp(token, ss->name)) {
 +					subsys_bits |= 1 << i;
 +					break;
 +				}
 +			}
 +			if (i == subsys_count) {
 +				ret = -ENOENT;
 +				goto out_unlock;
 +			}
 +		}
 +	}
 +
 +	/* See if we already have a hierarchy containing this set */
 +
 +	for (i = 1; i < CONFIG_MAX_CONTAINER_HIERARCHIES; i++) {
 +		root = &rootnode[i];
 +		/* We match - use this hierarchy */
 +		if (root->subsys_bits == subsys_bits) break;
 +		/* We clash - fail */
 +		if (root->subsys_bits & subsys_bits) {
 +			ret = -EBUSY;
 +			goto out_unlock;
 +		}
 +	}
 +
 +	if (i == CONFIG_MAX_CONTAINER_HIERARCHIES) {
 +		/* No existing hierarchy matched this set - but we
 +		 * know that all the subsystems are free */
 +		for (i = 1; i < CONFIG_MAX_CONTAINER_HIERARCHIES; i++) {
 +			root = &rootnode[i];
 +			if (!root->sb && !root->subsys_bits) break;
 +		}
 +	}
 +
 +	if (i == CONFIG_MAX_CONTAINER_HIERARCHIES) {
 +		ret = -ENOSPC;
 +		goto out_unlock;
 +	}
 +
 +	hierarchy = i;
 +
 +	if (!root->sb) {
 +		/* We need a new superblock for this container combination */
 +		struct container *cont = &root->top_container;
 +		struct container_subsys *ss;
 +		struct task_struct *p, *g;
 +
 +		BUG_ON(root->subsys_bits);
 +		root->subsys_bits = subsys_bits;
 +		ret = get_sb_nodev(fs_type, flags, root,
 +				   container_fill_super, mnt);
 +		if (ret)
 +			goto out_unlock;
 +
 +		BUG_ON(!list_empty(&cont->sibling));
 +		BUG_ON(!list_empty(&cont->children));
 +		BUG_ON(root->number_of_containers != 1);
 +
 +		mutex_lock(&callback_mutex);
 +
 +		/* Add all tasks into this container hierarchy */
 +		atomic_set(&cont->count, 1);
 +		read_lock(&tasklist_lock);
 +		do_each_thread(g, p) {
 +			task_lock(p);
 +			BUG_ON(p->container[hierarchy]);
 +			rcu_assign_pointer(p->container[hierarchy], cont);
 +			if (!(p->flags & PF_EXITING)) {
 +				atomic_inc(&cont->count);
 +			}
 +			task_unlock(p);
 +		} while_each_thread(g, p);
 +		read_unlock(&tasklist_lock);
 +
 +		/* Move all the relevant subsystems into the hierarchy. */
 +		for (i = 0; i < subsys_count; i++) {
 +			if (!(subsys_bits & (1 << i))) continue;
 +
 +			ss = subsys[i];
 +
 +			BUG_ON(cont->subsys[i]);
 +			BUG_ON(dummytop->subsys[i]->container != dummytop);
 +			cont->subsys[i] = dummytop->subsys[i];
 +			cont->subsys[i]->container = cont;
 +			list_add(&ss->sibling, &root->subsys_list);
 +			rcu_assign_pointer(subsys[i]->hierarchy,
 +					   hierarchy);
 +		}
 +		mutex_unlock(&callback_mutex);
 +		synchronize_rcu();
 +
 +		container_populate_dir(cont);
 +
 +	} else {
 +		/* Reuse the existing superblock */
 +		ret = simple_set_mnt(mnt, root->sb);
 +		if (!ret)
 +			atomic_inc(&root->sb->s_active);
 +	}
 +
 + out_unlock:
 +	mutex_unlock(&manage_mutex);
 +	return ret;
 }
 
 static struct file_system_type container_fs_type = {
 @@ -501,6 +731,8 @@ static int attach_task(struct container
 struct task_struct *tsk;
 struct container *oldcont;
 int retval = 0;
 +	struct container_subsys *ss;
 +	int h = cont->hierarchy;
 
 if (sscanf(pidbuf, "%d", &pid) != 1)
 return -EIO;
 @@ -527,37 +759,45 @@ static int attach_task(struct container
 get_task_struct(tsk);
 }
 
 -#ifdef CONFIG_CPUSETS
 -	retval = cpuset_can_attach_task(cont, tsk);
 -#endif
 -	if (retval) {
 -		put_task_struct(tsk);
 -		return retval;
 +	for_each_subsys(h, ss) {
 +		if (ss->can_attach) {
 +			retval = ss->can_attach(ss, cont, tsk);
 +			if (retval) {
 +				put_task_struct(tsk);
 +				return retval;
 +			}
 +		}
 }
 
 mutex_lock(&callback_mutex);
 
 task_lock(tsk);
 -	oldcont = tsk->container;
 +	oldcont = tsk->container[h];
 if (!oldcont) {
 task_unlock(tsk);
 mutex_unlock(&callback_mutex);
 put_task_struct(tsk);
 return -ESRCH;
 }
 +        BUG_ON(oldcont == dummytop);
 +
 atomic_inc(&cont->count);
 -	rcu_assign_pointer(tsk->container, cont);
 +	rcu_assign_pointer(tsk->container[h], cont);
 task_unlock(tsk);
 
 -#ifdef CONFIG_CPUSETS
 -	cpuset_attach_task(cont, tsk);
 -#endif
 +	for_each_subsys(h, ss) {
 +		if (ss->attach) {
 +			ss->attach(ss, cont, oldcont, tsk);
 +		}
 +	}
 
 mutex_unlock(&callback_mutex);
 
 -#ifdef CONFIG_CPUSETS
 -	cpuset_post_attach_task(cont, oldcont, tsk);
 -#endif
 +	for_each_subsys(h, ss) {
 +		if (ss->post_attach) {
 +			ss->post_attach(ss, cont, oldcont, tsk);
 +		}
 +	}
 
 put_task_struct(tsk);
 synchronize_rcu();
 @@ -780,7 +1020,7 @@ static struct inode_operations container
 .rename = container_rename,
 };
 
 -static int container_create_file(struct dentry *dentry, int mode)
 +static int container_create_file(struct dentry *dentry, int mode, struct super_block *sb)
 {
 struct inode *inode;
 
 @@ -789,7 +1029,7 @@ static int container_create_file(struct
 if (dentry->d_inode)
 return -EEXIST;
 
 -	inode = container_new_inode(mode);
 +	inode = container_new_inode(mode, sb);
 if (!inode)
 return -ENOMEM;
 
 @@ -798,7 +1038,7 @@ static int container_create_file(struct
 inode->i_fop = &simple_dir_operations;
 
 /* start off with i_nlink == 2 (for "." entry) */
 -		inode->i_nlink++;
 +		inc_nlink(inode);
 } else if (S_ISREG(mode)) {
 inode->i_size = 0;
 inode->i_fop = &container_file_operations;
 @@ -828,10 +1068,10 @@ static int container_create_dir(struct c
 dentry = container_get_dentry(parent, name);
 if (IS_ERR(dentry))
 return PTR_ERR(dentry);
 -	error = container_create_file(dentry, S_IFDIR | mode);
 +	error = container_create_file(dentry, S_IFDIR | mode, cont->root->sb);
 if (!error) {
 dentry->d_fsdata = cont;
 -		parent->d_inode->i_nlink++;
 +		inc_nlink(parent->d_inode);
 cont->dentry = dentry;
 }
 dput(dentry);
 @@ -848,7 +1088,7 @@ int container_add_file(struct container
 mutex_lock(&dir->d_inode->i_mutex);
 dentry = container_get_dentry(dir, cft->name);
 if (!IS_ERR(dentry)) {
 -		error = container_create_file(dentry, 0644 | S_IFREG);
 +		error = container_create_file(dentry, 0644 | S_IFREG, cont->root->sb);
 if (!error)
 dentry->d_fsdata = (void *)cft;
 dput(dentry);
 @@ -894,7 +1134,7 @@ static int pid_array_load(pid_t *pidarra
 read_lock(&tasklist_lock);
 
 do_each_thread(g, p) {
 -		if (p->container == cont) {
 +		if (p->container[cont->hierarchy] == cont) {
 pidarray[n++] = p->pid;
 if (unlikely(n == npids))
 goto array_full;
 @@ -1037,21 +1277,33 @@ static struct cftype cft_release_agent =
 static int container_populate_dir(struct container *cont)
 {
 int err;
 +	struct container_subsys *ss;
 
 if ((err = container_add_file(cont, &cft_notify_on_release)) < 0)
 return err;
 if ((err = container_add_file(cont, &cft_tasks)) < 0)
 return err;
 -	if ((cont == &top_container) &&
 +	if ((cont == cont->top_container) &&
 (err = container_add_file(cont, &cft_release_agent)) < 0)
 return err;
 -#ifdef CONFIG_CPUSETS
 -	if ((err = cpuset_populate_dir(cont)) < 0)
 -		return err;
 -#endif
 +
 +	for_each_subsys(cont->hierarchy, ss) {
 +		if (ss->populate && (err = ss->populate(ss, cont)) < 0)
 +			return err;
 +	}
 +
 return 0;
 }
 
 +static void init_container_css(struct container_subsys *ss,
 +			       struct container *cont)
 +{
 +	struct container_subsys_state *css = cont->subsys[ss->subsys_id];
 +	css->container = cont;
 +	spin_lock_init(&css->refcnt_lock);
 +	atomic_set(&css->refcnt, 0);
 +}
 +
 /*
 *	container_create - create a container
 *	parent:	container that will be parent of the new container.
 @@ -1064,13 +1316,24 @@ static int container_populate_dir(struct
 static long container_create(struct container *parent, const char *name, int mode)
 {
 struct container *cont;
 -	int err;
 +	struct containerfs_root *root = parent->root;
 +	int err = 0;
 +	struct container_subsys *ss;
 +	struct super_block *sb = root->sb;
 
 cont = kmalloc(sizeof(*cont), GFP_KERNEL);
 if (!cont)
 return -ENOMEM;
 
 +	/* Grab a reference on the superblock so the hierarchy doesn't
 +	 * get deleted on unmount if there are child containers.  This
 +	 * can be done outside manage_mutex, since the sb can't
 +	 * disappear while someone has an open control file on the
 +	 * fs */
 +	atomic_inc(&sb->s_active);
 +
 mutex_lock(&manage_mutex);
 +
 cont->flags = 0;
 if (notify_on_release(parent))
 set_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
 @@ -1079,16 +1342,18 @@ static long container_create(struct cont
 INIT_LIST_HEAD(&cont->children);
 
 cont->parent = parent;
 +	cont->root = parent->root;
 +	cont->hierarchy = parent->hierarchy;
 
 -#ifdef CONFIG_CPUSETS
 -	err = cpuset_create(cont);
 -	if (err)
 -		goto err_unlock_free;
 -#endif
 +	for_each_subsys(cont->hierarchy, ss) {
 +		err = ss->create(ss, cont);
 +		if (err) goto err_destroy;
 +		init_container_css(ss, cont);
 +	}
 
 mutex_lock(&callback_mutex);
 list_add(&cont->sibling, &cont->parent->children);
 -	number_of_containers++;
 +	root->number_of_containers++;
 mutex_unlock(&callback_mutex);
 
 err = container_create_dir(cont, name, mode);
 @@ -1107,15 +1372,23 @@ static long container_create(struct cont
 return 0;
 
 err_remove:
 -#ifdef CONFIG_CPUSETS
 -	cpuset_destroy(cont);
 -#endif
 +
 mutex_lock(&callback_mutex);
 list_del(&cont->sibling);
 -	number_of_containers--;
 +	root->number_of_containers--;
 mutex_unlock(&callback_mutex);
 - err_unlock_free:
 +
 + err_destroy:
 +
 +	for_each_subsys(cont->hierarchy, ss) {
 +		if (cont->subsys[ss->subsys_id])
 +			ss->destroy(ss, cont);
 +	}
 +
 mutex_unlock(&manage_mutex);
 +
 +	deactivate_super(sb);
 +
 kfree(cont);
 return err;
 }
 @@ -1145,6 +1418,11 @@ static int container_rmdir(struct inode
 struct dentry *d;
 struct container *parent;
 char *pathbuf = NULL;
 +	struct container_subsys *ss;
 +	struct super_block *sb;
 +	struct containerfs_root *root;
 +	unsigned long flags;
 +	int css_busy = 0;
 
 /* the vfs holds both inode->i_mutex already */
 
 @@ -1157,7 +1435,32 @@ static int container_rmdir(struct inode
 mutex_unlock(&manage_mutex);
 return -EBUSY;
 }
 +
 parent = cont->parent;
 +	root = cont->root;
 +	sb = root->sb;
 +
 +	local_irq_save(flags);
 +	/* Check each container, locking the refcnt lock and testing
 +	 * the refcnt. This will lock out any calls to css_get() */
 +	for_each_subsys(root->top_container.hierarchy, ss) {
 +		struct container_subsys_state *css;
 +		css = cont->subsys[ss->subsys_id];
 +		spin_lock(&css->refcnt_lock);
 +		css_busy += atomic_read(&css->refcnt);
 +	}
 +	/* Go through and release all the locks; if we weren't busy,
 +	 * then set the refcount to -1 to prevent css_get() from adding
 +	 * a refcount */
 +	for_each_subsys(root->top_container.hierarchy, ss) {
 +		struct container_subsys_state *css;
 +		css = cont->subsys[ss->subsys_id];
 +		if (!css_busy) atomic_dec(&css->refcnt);
 +		spin_unlock(&css->refcnt_lock);
 +	}
 +	local_irq_restore(flags);
 +	if (css_busy) return -EBUSY;
 +
 mutex_lock(&callback_mutex);
 set_bit(CONT_REMOVED, &cont->flags);
 list_del(&cont->sibling);	/* delete my sibling from parent->children */
 @@ -1165,67 +1468,142 @@ static int container_rmdir(struct inode
 d = dget(cont->dentry);
 cont->dentry = NULL;
 spin_unlock(&d->d_lock);
 +
 +	for_each_subsys(root->top_container.hierarchy, ss) {
 +		ss->destroy(ss, cont);
 +	}
 container_d_remove_dir(d);
 dput(d);
 -	number_of_containers--;
 +	root->number_of_containers--;
 mutex_unlock(&callback_mutex);
 -#ifdef CONFIG_CPUSETS
 -	cpuset_destroy(cont);
 -#endif
 +
 if (list_empty(&parent->children))
 check_for_release(parent, &pathbuf);
 +
 mutex_unlock(&manage_mutex);
 +	/* Drop the active superblock reference that we took when we
 +	 * created the container */
 +	deactivate_super(sb);
 container_release_agent(pathbuf);
 return 0;
 }
 
 -/*
 - * container_init_early - probably not needed yet, but will be needed
 - * once cpusets are hooked into this code
 - */
 +
 +/**
 + * container_init_early - initialize containers at system boot
 + *
 + * Description: Initialize the container housekeeping structures
 + **/
 
 int __init container_init_early(void)
 {
 -	struct task_struct *tsk = current;
 +	int i;
 +
 +	for (i = 0; i < CONFIG_MAX_CONTAINER_HIERARCHIES; i++) {
 +		struct containerfs_root *root = &rootnode[i];
 +		struct container *cont = &root->top_container;
 +		INIT_LIST_HEAD(&root->subsys_list);
 +		root->number_of_containers = 1;
 +
 +		cont->root = root;
 +		cont->hierarchy = i;
 +		INIT_LIST_HEAD(&cont->sibling);
 +		INIT_LIST_HEAD(&cont->children);
 +		cont->top_container = cont;
 +		atomic_set(&cont->count, 1);
 +	}
 +	init_task.container[0] = &rootnode[0].top_container;
 
 -	tsk->container = &top_container;
 return 0;
 }
 
 /**
 - * container_init - initialize containers at system boot
 - *
 - * Description: Initialize top_container and the container internal file system,
 + * container_init - register container filesystem and /proc file
 **/
 
 int __init container_init(void)
 {
 -	struct dentry *root;
 int err;
 -
 -	init_task.container = &top_container;
 +	struct proc_dir_entry *entry;
 
 err = register_filesystem(&container_fs_type);
 if (err < 0)
 goto out;
 -	container_mount = kern_mount(&container_fs_type);
 -	if (IS_ERR(container_mount)) {
 -		printk(KERN_ERR "container: could not mount!\n");
 -		err = PTR_ERR(container_mount);
 -		container_mount = NULL;
 -		goto out;
 -	}
 -	root = container_mount->mnt_sb->s_root;
 -	root->d_fsdata = &top_container;
 -	root->d_inode->i_nlink++;
 -	top_container.dentry = root;
 -	root->d_inode->i_op = &container_dir_inode_operations;
 -	number_of_containers = 1;
 -	err = container_populate_dir(&top_container);
 +
 +	entry = create_proc_entry("containers", 0, NULL);
 +	if (entry)
 +		entry->proc_fops = &proc_containerstats_operations;
 +
 out:
 return err;
 }
 
 +int container_register_subsys(struct container_subsys *new_subsys) {
 +	int retval = 0;
 +	int i;
 +
 +	BUG_ON(new_subsys->hierarchy);
 +	mutex_lock(&manage_mutex);
 +	if (subsys_count == CONFIG_MAX_CONTAINER_SUBSYS) {
 +		retval = -ENOSPC;
 +		goto out;
 +	}
 +	if (!new_subsys->name ||
 +	    (strlen(new_subsys->name) > MAX_CONTAINER_TYPE_NAMELEN) ||
 +	    !new_subsys->create || !new_subsys->destroy) {
 +		retval = -EINVAL;
 +		goto out;
 +	}
 +	for (i = 0; i < subsys_count; i++) {
 +		if (!strcmp(subsys[i]->name, new_subsys->name)) {
 +			retval = -EEXIST;
 +			goto out;
 +		}
 +	}
 +
 +	new_subsys->subsys_id = subsys_count;
 +	retval = new_subsys->create(new_subsys, dummytop);
 +	if (retval) {
 +		new_subsys->subsys_id = -1;
 +		goto out;
 +	}
 +	init_container_css(new_subsys, dummytop);
 + 	mutex_lock(&callback_mutex);
 + 	/* If this is the first subsystem that requested a fork or
 + 	 * exit callback, tell our fork/exit hooks that they need to
 + 	 * grab callback_mutex on every invocation. If they are
 + 	 * running concurrently with this code, they will either not
 + 	 * see the change now and go straight on, or they will see it
 + 	 * and grab callback_mutex, which will deschedule them. Either
 + 	 * way once synchronize_rcu() returns we know that all current
 + 	 * and future forks will make the callbacks. */
 + 	if (!need_forkexit_callback &&
 + 	    (new_subsys->fork || new_subsys->exit)) {
 + 		need_forkexit_callback = 1;
 + 		synchronize_rcu();
 + 	}
 +
 + 	/* If this subsystem requested that it be notified with fork
 + 	 * events, we should send it one now for every process in the
 + 	 * system */
 + 	if (new_subsys->fork) {
 + 		struct task_struct *g, *p;
 +
 + 		read_lock(&tasklist_lock);
 + 		do_each_thread(g, p) {
 + 			new_subsys->fork(new_subsys, p);
 + 		} while_each_thread(g, p);
 + 		read_unlock(&tasklist_lock);
 + 	}
 +
 +	subsys[subsys_count++] = new_subsys;
 + 	mutex_unlock(&callback_mutex);
 + out:
 + 	mutex_unlock(&manage_mutex);
 + 	return retval;
 +
 +}
 +
 /**
 * container_fork - attach newly forked task to its parents container.
 * @tsk: pointer to task_struct of forking parent process.
 @@ -1246,10 +1624,38 @@ out:
 
 void container_fork(struct task_struct *child)
 {
 +	int i, need_callback;
 +
 +	rcu_read_lock();
 +	/* need_forkexit_callback will be true if we might need to do
 + 	 * a callback */
 +	need_callback = rcu_dereference(need_forkexit_callback);
 +	if (need_callback) {
 +		rcu_read_unlock();
 +		mutex_lock(&callback_mutex);
 +	}
 task_lock(current);
 -	child->container = current->container;
 -	atomic_inc(&child->container->count);
 +        /* Skip hierarchy 0 since it's permanent */
 +	for (i = 1; i < CONFIG_MAX_CONTAINER_HIERARCHIES; i++) {
 +		struct container *cont = current->container[i];
 +		if (!cont) continue;
 +		child->container[i] = cont;
 +		atomic_inc(&cont->count);
 +	}
 +	if (need_callback) {
 +		for (i = 0; i < subsys_count; i++) {
 +			struct container_subsys *ss = subsys[i];
 +			if (ss->fork) {
 +				ss->fork(ss, child);
 +			}
 +		}
 +	}
 task_unlock(current);
 +	if (need_callback) {
 +		mutex_unlock(&callback_mutex);
 +	} else {
 +		rcu_read_unlock();
 +	}
 }
 
 /**
 @@ -1314,20 +1720,38 @@ void container_fork(struct task_struct *
 void container_exit(struct task_struct *tsk)
 {
 struct container *cont;
 -
 -	cont = tsk->container;
 -	tsk->container = &top_container;	/* the_top_container_hack - see above */
 -
 -	if (notify_on_release(cont)) {
 -		char *pathbuf = NULL;
 -
 -		mutex_lock(&manage_mutex);
 -		if (atomic_dec_and_test(&cont->count))
 -			check_for_release(cont, &pathbuf);
 -		mutex_unlock(&manage_mutex);
 -		container_release_agent(pathbuf);
 +	int i;
 +	rcu_read_lock();
 +	if (rcu_dereference(need_forkexit_callback)) {
 +		rcu_read_unlock();
 +		mutex_lock(&callback_mutex);
 +		for (i = 0; i < subsys_count; i++) {
 +			struct container_subsys *ss = subsys[i];
 +			if (ss->exit) {
 +				ss->exit(ss, tsk);
 +			}
 +		}
 +		mutex_unlock(&callback_mutex);
 } else {
 -		atomic_dec(&cont->count);
 +		rcu_read_unlock();
 +	}
 +
 +	/* Skip hierarchy 0 since it's permanent */
 +	for (i = 1; i < CONFIG_MAX_CONTAINER_HIERARCHIES; i++) {
 +		cont = tsk->container[i];
 +		if (!cont) continue;
 +		/* the_top_container_hack - see above */
 +		tsk->container[i] = cont->top_container;
 +		if (notify_on_release(cont)) {
 +			char *pathbuf = NULL;
 +			mutex_lock(&manage_mutex);
 +			if (atomic_dec_and_test(&cont->count))
 +				check_for_release(cont, &pathbuf);
 +			mutex_unlock(&manage_mutex);
 +			container_release_agent(pathbuf);
 +		} else {
 +			atomic_dec(&cont->count);
 +		}
 }
 }
 
 @@ -1387,12 +1811,15 @@ void container_manage_unlock(void)
 *    the_top_container_hack in container_exit(), which sets an exiting tasks
 *    container to top_container.
 */
 +
 +/* TODO: Use a proper seq_file iterator */
 static int proc_container_show(struct seq_file *m, void *v)
 {
 struct pid *pid;
 struct task_struct *tsk;
 char *buf;
 int retval;
 +	int i;
 
 retval = -ENOMEM;
 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 @@ -1405,14 +1832,25 @@ static int proc_container_show(struct se
 if (!tsk)
 goto out_free;
 
 -	retval = -EINVAL;
 +	retval = 0;
 +
 mutex_lock(&manage_mutex);
 
 -	retval = container_path(tsk->container, buf, PAGE_SIZE);
 -	if (retval < 0)
 -		goto out_unlock;
 -	seq_puts(m, buf);
 -	seq_putc(m, '\n');
 +	for (i = 1; i < CONFIG_MAX_CONTAINER_HIERARCHIES; i++) {
 +		struct containerfs_root *root = &rootnode[i];
 +		struct container_subsys *ss;
 +		int count = 0;
 +		if (!root->subsys_bits) continue;
 +		for_each_subsys(i, ss) {
 +			seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
 +		}
 +		seq_putc(m, ':');
 +		retval = container_path(tsk->container[i], buf, PAGE_SIZE);
 +		if (retval < 0)
 +			goto out_unlock;
 +		seq_puts(m, buf);
 +		seq_putc(m, '\n');
 +	}
 out_unlock:
 mutex_unlock(&manage_mutex);
 put_task_struct(tsk);
 @@ -1434,3 +1872,47 @@ struct file_operations proc_container_op
 .llseek		= seq_lseek,
 .release	= single_release,
 };
 +
 +static int proc_containerstats_show(struct seq_file *m, void *v)
 +{
 +	int i;
 +	mutex_lock(&manage_mutex);
 +	seq_puts(m, "Hierarchies:\n");
 +	for (i = 0; i < CONFIG_MAX_CONTAINER_HIERARCHIES; i++) {
 +		struct containerfs_root *root = &rootnode[i];
 +		struct container_subsys *ss;
 +		int first = 1;
 +		seq_printf(m, "%d: topcount=%d bits=%lx containers=%d (",
 +			   i, atomic_read(&root->top_container.count),
 +			   root->subsys_bits, root->number_of_containers);
 +		for_each_subsys(i, ss) {
 +			seq_printf(m, "%s%s", first ? "" : ", ", ss->name);
 +			first = false;
 +		}
 +		seq_putc(m, ')');
 +		if (root->sb) {
 +			seq_printf(m, " s_active=%d", atomic_read(&root->sb->s_active));
 +		}
 +		seq_putc(m, '\n');
 +	}
 +	seq_puts(m, "Subsystems:\n");
 +	for (i = 0; i < subsys_count; i++) {
 +		struct container_subsys *ss = subsys[i];
 +		seq_printf(m, "%d: name=%s hierarchy=%d\n",
 +			   i, ss->name, ss->hierarchy);
 +	}
 +	mutex_unlock(&manage_mutex);
 +	return 0;
 +}
 +
 +static int containerstats_open(struct inode *inode, struct file *file)
 +{
 +	return single_open(file, proc_containerstats_show, 0);
 +}
 +
 +struct file_operations proc_containerstats_operations = {
 +	.open = containerstats_open,
 +	.read = seq_read,
 +	.llseek = seq_lseek,
 +	.release = single_release,
 +};
 Index: container-2.6.19-rc6/kernel/cpuset.c
 ============================================================ =======
 --- container-2.6.19-rc6.orig/kernel/cpuset.c
 +++ container-2.6.19-rc6/kernel/cpuset.c
 @@ -5,6 +5,7 @@
 *
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 + *  Copyright (C) 2006 Google, Inc
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 @@ -12,6 +13,7 @@
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 + *  2006 Rework by Paul Menage to use generic containers
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 @@ -61,6 +63,10 @@
 */
 int number_of_cpusets __read_mostly;
 
 +/* Forward declarations: the cpuset subsystem object and struct cpuset */
 +static struct container_subsys cpuset_subsys;
 +struct cpuset;
 +
 /* See "Frequency meter" comments, below. */
 
 struct fmeter {
 @@ -71,11 +77,12 @@ struct fmeter {
 };
 
 struct cpuset {
 +	struct container_subsys_state css;
 +
 unsigned long flags;		/* "unsigned long" so bitops work */
 cpumask_t cpus_allowed;		/* CPUs allowed to tasks in cpuset */
 nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */
 
 -	struct container *container;    /* Task container */
 struct cpuset *parent;		/* my parent */
 
 /*
 @@ -87,6 +94,26 @@ struct cpuset {
 struct fmeter fmeter;		/* memory_pressure filter */
 };
 
 +/* Update the cpuset for a container */
 +static inline void set_container_cs(struct container *cont, struct cpuset *cs)
 +{
 +	cont->subsys[cpuset_subsys.subsys_id] = &cs->css;
 +}
 +
 +/* Retrieve the cpuset for a container */
 +static inline struct cpuset *container_cs(struct container *cont)
 +{
 +	return container_of(container_subsys_state(cont, &cpuset_subsys),
 +			    struct cpuset, css);
 +}
 +
 +/* Retrieve the cpuset for a task */
 +static inline struct cpuset *task_cs(struct task_struct *task)
 +{
 +	return container_cs(task_container(task, &cpuset_subsys));
 +}
 +
 +
 /* bits in struct cpuset flags field */
 typedef enum {
 CS_CPU_EXCLUSIVE,
 @@ -162,7 +189,7 @@ static int cpuset_get_sb(struct file_sys
 if (container_fs) {
 ret = container_fs->get_sb(container_fs, flags,
 unused_dev_name,
 -					   data, mnt);
 +					   "cpuset", mnt);
 put_filesystem(container_fs);
 }
 return ret;
 @@ -270,20 +297,19 @@ void cpuset_update_task_memory_state(voi
 struct task_struct *tsk = current;
 struct cpuset *cs;
 
 -	if (tsk->container->cpuset == &top_cpuset) {
 +	if (task_cs(tsk) == &top_cpuset) {
 /* Don't need rcu for top_cpuset.  It's never freed. */
 my_cpusets_mem_gen = top_cpuset.mems_generation;
 } else {
 rcu_read_lock();
 -		cs = rcu_dereference(tsk->container->cpuset);
 -		my_cpusets_mem_gen = cs->mems_generation;
 +		my_cpusets_mem_gen = task_cs(current)->mems_generation;
 rcu_read_unlock();
 }
 
 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
 container_lock();
 task_lock(tsk);
 -		cs = tsk->container->cpuset; /* Maybe changed when task not locked */
 +		cs = task_cs(tsk); /* Maybe changed when task not locked */
 guarantee_online_mems(cs, &tsk->mems_allowed);
 tsk->cpuset_mems_generation = cs->mems_generation;
 if (is_spread_page(cs))
 @@ -342,9 +368,8 @@ static int validate_change(const struct
 struct cpuset *c, *par;
 
 /* Each of our child cpusets must be a subset of us */
 -	list_for_each_entry(cont, &cur->container->children, sibling) {
 -		c = cont->cpuset;
 -		if (!is_cpuset_subset(c, trial))
 +	list_for_each_entry(cont, &cur->css.container->children, sibling) {
 +		if (!is_cpuset_subset(container_cs(cont), trial))
 return -EBUSY;
 }
 
 @@ -357,8 +382,8 @@ static int validate_change(const struct
 return -EACCES;
 
 /* If either I or some sibling (!= me) is exclusive, we can't overlap */
 -	list_for_each_entry(cont, &par->container->children, sibling) {
 -		c = cont->cpuset;
 +	list_for_each_entry(cont, &par->css.container->children, sibling) {
 +		c = container_cs(cont);
 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
 c != cur &&
 cpus_intersects(trial->cpus_allowed, c->cpus_allowed))
 @@ -400,8 +425,8 @@ static void update_cpu_domains(struct cp
 * children
 */
 pspan = par->cpus_allowed;
 -	list_for_each_entry(cont, &par->container->children, sibling) {
 -		c = cont->cpuset;
 +	list_for_each_entry(cont, &par->css.container->children, sibling) {
 +		c = container_cs(cont);
 if (is_cpu_exclusive(c))
 cpus_andnot(pspan, pspan, c->cpus_allowed);
 }
 @@ -418,8 +443,8 @@ static void update_cpu_domains(struct cp
 * Get all cpus from current cpuset's cpus_allowed not part
 * of exclusive children
 */
 -		list_for_each_entry(cont, &cur->container->children, sibling) {
 -			c = cont->cpuset;
 +		list_for_each_entry(cont, &cur->css.container->children, sibling) {
 +			c = container_cs(cont);
 if (is_cpu_exclusive(c))
 cpus_andnot(cspan, cspan, c->cpus_allowed);
 }
 @@ -507,7 +532,7 @@ static void cpuset_migrate_mm(struct mm_
 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
 
 container_lock();
 -	guarantee_online_mems(tsk->container->cpuset, &tsk->mems_allowed);
 +	guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
 container_unlock();
 }
 
 @@ -525,6 +550,8 @@ static void cpuset_migrate_mm(struct mm_
 * their mempolicies to the cpusets new mems_allowed.
 */
 
 +static void *cpuset_being_rebound;
 +
 static int update_nodemask(struct cpuset *cs, char *buf)
 {
 struct cpuset trialcs;
 @@ -542,7 +569,7 @@ static int update_nodemask(struct cpuset
 return -EACCES;
 
 trialcs = *cs;
 -	cont = cs->container;
 +	cont = cs->css.container;
 retval = nodelist_parse(buf, trialcs.mems_allowed);
 if (retval < 0)
 goto done;
 @@ -565,7 +592,7 @@ static int update_nodemask(struct cpuset
 cs->mems_generation = cpuset_mems_generation++;
 container_unlock();
 
 -	set_cpuset_being_rebound(cs);		/* causes mpol_copy() rebind */
 +	cpuset_being_rebound = cs;		/* causes mpol_copy() rebind */
 
 fudge = 10;				/* spare mmarray[] slots */
 fudge += cpus_weight(cs->cpus_allowed);	/* imagine one fork-bomb/cpu */
 @@ -579,13 +606,13 @@ static int update_nodemask(struct cpuset
 * enough mmarray[] w/o using GFP_ATOMIC.
 */
 while (1) {
 -		ntasks = atomic_read(&cs->container->count);  /* guess */
 +		ntasks = atomic_read(&cs->css.container->count);  /* guess */
 ntasks += fudge;
 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
 if (!mmarray)
 goto done;
 write_lock_irq(&tasklist_lock);		/* block fork */
 -		if (atomic_read(&cs->container->count) <= ntasks)
 +		if (atomic_read(&cs->css.container->count) <= ntasks)
 break;				/* got enough */
 write_unlock_irq(&tasklist_lock);	/* try again */
 kfree(mmarray);
 @@ -602,7 +629,7 @@ static int update_nodemask(struct cpuset
 "Cpuset mempolicy rebind incomplete.\n");
 continue;
 }
 -		if (p->container != cont)
 +		if (task_cs(p) != cs)
 continue;
 mm = get_task_mm(p);
 if (!mm)
 @@ -636,12 +663,17 @@ static int update_nodemask(struct cpuset
 
 /* We're done rebinding vma's to this cpusets new mems_allowed. */
 kfree(mmarray);
 -	set_cpuset_being_rebound(NULL);
 +	cpuset_being_rebound = NULL;
 retval = 0;
 done:
 return retval;
 }
 
 +int current_cpuset_is_being_rebound(void)
 +{
 +	return task_cs(current) == cpuset_being_rebound;
 +}
 +
 /*
 * Call with manage_mutex held.
 */
 @@ -795,9 +827,10 @@ static int fmeter_getrate(struct fmeter
 return val;
 }
 
 -int cpuset_can_attach_task(struct container *cont, struct task_struct *tsk)
 +int cpuset_can_attach(struct container_subsys *ss,
 +		      struct container *cont, struct task_struct *tsk)
 {
 -	struct cpuset *cs = cont->cpuset;
 +	struct cpuset *cs = container_cs(cont);
 
 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
 return -ENOSPC;
 @@ -805,22 +838,23 @@ int cpuset_can_attach_task(struct contai
 return security_task_setscheduler(tsk, 0, NULL);
 }
 
 -void cpuset_attach_task(struct container *cont, struct task_struct *tsk)
 +void cpuset_attach(struct container_subsys *ss, struct container *cont,
 +		   struct container *old_cont, struct task_struct *tsk)
 {
 cpumask_t cpus;
 -	struct cpuset *cs = cont->cpuset;
 -	guarantee_online_cpus(cs, &cpus);
 +	guarantee_online_cpus(container_cs(cont), &cpus);
 set_cpus_allowed(tsk, cpus);
 }
 
 -void cpuset_post_attach_task(struct container *cont,
 -			     struct container *oldcont,
 -			     struct task_struct *tsk)
 +void cpuset_post_attach(struct container_subsys *ss,
 +			struct container *cont,
 +			struct container *oldcont,
 +			struct task_struct *tsk)
 {
 nodemask_t from, to;
 struct mm_struct *mm;
 -	struct cpuset *cs = cont->cpuset;
 -	struct cpuset *oldcs = oldcont->cpuset;
 +	struct cpuset *cs = container_cs(cont);
 +	struct cpuset *oldcs = container_cs(oldcont);
 
 from = oldcs->mems_allowed;
 to = cs->mems_allowed;
 @@ -854,7 +888,7 @@ static ssize_t cpuset_common_file_write(
 const char __user *userbuf,
 size_t nbytes, loff_t *unused_ppos)
 {
 -	struct cpuset *cs = cont->cpuset;
 +	struct cpuset *cs = container_cs(cont);
 cpuset_filetype_t type = cft->private;
 char *buffer;
 int retval = 0;
 @@ -964,7 +998,7 @@ static ssize_t cpuset_common_file_read(s
 char __user *buf,
 size_t nbytes, loff_t *ppos)
 {
 -	struct cpuset *cs = cont->cpuset;
 +	struct cpuset *cs = container_cs(cont);
 cpuset_filetype_t type = cft->private;
 char *page;
 ssize_t retval = 0;
 @@ -1083,7 +1117,7 @@ static struct cftype cft_spread_slab = {
 .private = FILE_SPREAD_SLAB,
 };
 
 -int cpuset_populate_dir(struct container *cont)
 +int cpuset_populate(struct container_subsys *ss, struct container *cont)
 {
 int err;
 
 @@ -1118,11 +1152,19 @@ int cpuset_populate_dir(struct container
 *	Must be called with the mutex on the parent inode held
 */
 
 -int cpuset_create(struct container *cont)
 +int cpuset_create(struct container_subsys *ss, struct container *cont)
 {
 struct cpuset *cs;
 -	struct cpuset *parent = cont->parent->cpuset;
 +	struct cpuset *parent;
 
 +	if (!cont->parent) {
 +		/* This is early initialization for the top container */
 +		set_container_cs(cont, &top_cpuset);
 +		top_cpuset.css.container = cont;
 +		top_cpuset.mems_generation = cpuset_mems_generation++;
 +		return 0;
 +	}
 +	parent = container_cs(cont->parent);
 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
 if (!cs)
 return -ENOMEM;
 @@ -1139,8 +1181,8 @@ int cpuset_create(struct container *cont
 fmeter_init(&cs->fmeter);
 
 cs->parent = parent;
 -	cont->cpuset = cs;
 -	cs->container = cont;
 +	set_container_cs(cont, cs);
 +	cs->css.container = cont;
 number_of_cpusets++;
 return 0;
 }
 @@ -1156,9 +1198,9 @@ int cpuset_create(struct container *cont
 * nesting would risk an ABBA deadlock.
 */
 
 -void cpuset_destroy(struct container *cont)
 +void cpuset_destroy(struct container_subsys *ss, struct container *cont)
 {
 -	struct cpuset *cs = cont->cpuset;
 +	struct cpuset *cs = container_cs(cont);
 
 cpuset_update_task_memory_state();
 if (is_cpu_exclusive(cs)) {
 @@ -1166,8 +1208,20 @@ void cpuset_destroy(struct container *co
 BUG_ON(retval);
 }
 number_of_cpusets--;
 +	kfree(cs);
 }
 
 +static struct container_subsys cpuset_subsys = {
 +	.name = "cpuset",
 +	.create = cpuset_create,
 +	.destroy  = cpuset_destroy,
 +	.can_attach = cpuset_can_attach,
 +	.attach = cpuset_attach,
 +	.post_attach = cpuset_post_attach,
 +	.populate = cpuset_populate,
 +	.subsys_id = -1,
 +};
 +
 /*
 * cpuset_init_early - just enough so that the calls to
 * cpuset_update_task_memory_state() in early init code
 @@ -1176,13 +1230,13 @@ void cpuset_destroy(struct container *co
 
 int __init cpuset_init_early(void)
 {
 -	struct container *cont = current->container;
 -	cont->cpuset = &top_cpuset;
 -	top_cpuset.container = cont;
 -	cont->cpuset->mems_generation = cpuset_mems_generation++;
 +	if (container_register_subsys(&cpuset_subsys) < 0)
 +		panic("Couldn't register cpuset subsystem");
 +	top_cpuset.mems_generation = cpuset_mems_generation++;
 return 0;
 }
 
 +
 /**
 * cpuset_init - initialize cpusets at system boot
 *
 @@ -1192,6 +1246,7 @@ int __init cpuset_init_early(void)
 int __init cpuset_init(void)
 {
 int err = 0;
 +
 top_cpuset.cpus_allowed = CPU_MASK_ALL;
 top_cpuset.mems_allowed = NODE_MASK_ALL;
 
 @@ -1234,7 +1289,7 @@ static void guarantee_online_cpus_mems_i
 struct cpuset *c;
 
 /* Each of our child cpusets mems must be online */
 -	list_for_each_entry(c, &cur->container->children, sibling) {
 +	list_for_each_entry(cont, &cur->css.container->children, sibling) {
 c = container_cs(cont);
 guarantee_online_cpus_mems_in_subtree(c);
 if (!cpus_empty(c->cpus_allowed))
 @@ -1336,7 +1391,7 @@ cpumask_t cpuset_cpus_allowed(struct tas
 
 container_lock();
 task_lock(tsk);
 -	guarantee_online_cpus(tsk->container->cpuset, &mask);
 +	guarantee_online_cpus(task_cs(tsk), &mask);
 task_unlock(tsk);
 container_unlock();
 
 @@ -1364,7 +1419,7 @@ nodemask_t cpuset_mems_allowed(struct ta
 
 container_lock();
 task_lock(tsk);
 -	guarantee_online_mems(tsk->container->cpuset, &mask);
 +	guarantee_online_mems(task_cs(tsk), &mask);
 task_unlock(tsk);
 container_unlock();
 
 @@ -1469,7 +1524,7 @@ int __cpuset_zone_allowed(struct zone *z
 container_lock();
 
 task_lock(current);
 -	cs = nearest_exclusive_ancestor(current->container->cpuset);
 +	cs = nearest_exclusive_ancestor(task_cs(current));
 task_unlock(current);
 
 allowed = node_isset(node, cs->mems_allowed);
 @@ -1537,7 +1592,7 @@ int cpuset_excl_nodes_overlap(const stru
 task_unlock(current);
 goto done;
 }
 -	cs1 = nearest_exclusive_ancestor(current->container->cpuset);
 +	cs1 = nearest_exclusive_ancestor(task_cs(current));
 task_unlock(current);
 
 task_lock((struct task_struct *)p);
 @@ -1545,7 +1600,7 @@ int cpuset_excl_nodes_overlap(const stru
 task_unlock((struct task_struct *)p);
 goto done;
 }
 -	cs2 = nearest_exclusive_ancestor(p->container->cpuset);
 +	cs2 = nearest_exclusive_ancestor(task_cs((struct task_struct *)p));
 task_unlock((struct task_struct *)p);
 
 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
 @@ -1581,11 +1636,8 @@ int cpuset_memory_pressure_enabled __rea
 
 void __cpuset_memory_pressure_bump(void)
 {
 -	struct cpuset *cs;
 -
 task_lock(current);
 -	cs = current->container->cpuset;
 -	fmeter_markevent(&cs->fmeter);
 +	fmeter_markevent(&task_cs(current)->fmeter);
 task_unlock(current);
 }
 
 @@ -1622,7 +1674,8 @@ static int proc_cpuset_show(struct seq_f
 retval = -EINVAL;
 container_manage_lock();
 
 -	retval = container_path(tsk->container, buf, PAGE_SIZE);
 +	retval = container_path(tsk->container[cpuset_subsys.hierarchy],
 +				buf, PAGE_SIZE);
 if (retval < 0)
 goto out_unlock;
 seq_puts(m, buf);
 Index: container-2.6.19-rc6/Documentation/containers.txt
 ============================================================ =======
 --- container-2.6.19-rc6.orig/Documentation/containers.txt
 +++ container-2.6.19-rc6/Documentation/containers.txt
 @@ -17,12 +17,16 @@ CONTENTS:
 1.2 Why are containers needed ?
 1.3 How are containers implemented ?
 1.4 What does notify_on_release do ?
 -  1.5 How do I use containers ?
 +  1.5 What do the xxx_enabled files do ?
 +  1.6 How do I use containers ?
 2. Usage Examples and Syntax
 2.1 Basic Usage
 2.2 Attaching processes
 -3. Questions
 -4. Contact
 +3. Kernel API
 +  3.1 Overview
 +  3.2 Synchronization
 +  3.3 Subsystem API
 +4. Questions
 
 1. Containers
 ==========
 @@ -31,13 +35,17 @@ CONTENTS:
 ----------------------
 
 Containers provide a mechanism for aggregating sets of tasks, and all
 -their children, into hierarchical groups.
 -
 -Each task has a pointer to a container.  Multiple tasks may reference
 -the same container. User level code may create and destroy containers
 -by name in the container virtual file system, specify and query to
 -which container a task is assigned, and list the task pids assigned to
 -a container.
 +their children, into hierarchical groups. A container associates a set
 +of tasks with a set of parameters for one or more "subsystems"
 +(typically resource controllers).
 +
 +At any one time there may be up to CONFIG_MAX_CONTAINER_HIERARCHIES
 +active hierarchies of tasks.  Each task has a pointer to a container in
 +each active hierarchy.  Multiple tasks may reference the same
 +container. User level code may create and destroy containers by name
 +in an instance of the container virtual file system, specify and query
 +to which container a task is assigned, and list the task pids assigned
 +to a container.
 
 On their own, the only use for containers is for simple job
 tracking. The intention is that other subsystems, such as cpusets (see
 @@ -67,27 +75,43 @@ desired.
 
 Containers extends the kernel as follows:
 
 - - Each task in the system is attached to a container, via a pointer
 -   in the task structure to a reference counted container structure.
 - - The hierarchy of containers can be mounted at /dev/container (or
 -   elsewhere), for browsing and manipulation from user space.
 + - Each task in the system has a set of reference-counted container
 +   pointers, one for each active hierarchy.
 + - A container hierarchy filesystem can be mounted for browsing and
 +   manipulation from user space.
 - You can list all the tasks (by pid) attached to any container.
 
 The implementation of containers requires a few, simple hooks
 into the rest of the kernel, none in performance critical paths:
 
 - - in init/main.c, to initialize the root container at system boot.
 - - in fork and exit, to attach and detach a task from its container.
 + - in init/main.c, to initialize the root containers at system boot.
 + - in fork and exit, to attach and detach a task from its containers.
 
 In addition a new file system, of type "container" may be mounted,
 -typically at /dev/container, to enable browsing and modifying the containers
 -presently known to the kernel.  No new system calls are added for
 -containers - all support for querying and modifying containers is via
 -this container file system.
 -
 -Each task under /proc has an added file named 'container', displaying
 -the container name, as the path relative to the root of the container file
 -system.
 +typically at /dev/container, to enable browsing and modifying the
 +containers presently known to the kernel.  When mounting a container
 +hierarchy, you may specify a comma-separated list of subsystems to
 +mount as the filesystem mount options.  By default, mounting the
 +container filesystem attempts to mount a hierarchy containing all
 +registered subsystems.
 +
 +If an active hierarchy with exactly the same set of subsystems already
 +exists, it will be reused for the new mount. If no existing hierarchy
 +matches, and any of the requested subsystems are in use in an existing
 +hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
 +is created, associated with the requested subsystems.
 +
 +When a container filesystem is unmounted, if there are any
 +subcontainers created below the top-level container, that hierarchy
 +will remain active even though unmounted; if there are no
 +subcontainers then the hierarchy will be deactivated.
 +
 +No new system calls are added for containers - all support for
 +querying and modifying containers is via this container file system.
 +
 +Each task under /proc has an added file named 'container' displaying,
 +for each active hierarchy, the subsystem names and the container name
 +as the path relative to the root of the container file system.
 
 Each container is represented by a directory in the container file system
 containing the following files describing that container:
 @@ -129,7 +153,18 @@ The default value of notify_on_release i
 boot is disabled (0).  The default value of other containers at creation
 is the current value of their parents notify_on_release setting.
 
 -1.5 How do I use containers ?
 +1.5 What do the xxx_enabled files do ?
 +--------------------------------------
 +
 +In the top-level container directory there are a series of
 +<subsys>_enabled files, one for each registered subsystem.  Each of
 +these files contains 0 or 1 to indicate whether the named container
 +subsystem is enabled, and can only be modified when there are no
 +subcontainers.  Disabled container subsystems don't get new instances
 +created when a subcontainer is created; the subsystem-specific state
 +is simply inherited from the parent container.
 +
 +1.6 How do I use containers ?
 --------------------------
 
 To start a new job that is to be contained within a container, the steps are:
 @@ -214,8 +249,154 @@ If you have several tasks to attach, you
 ...
 # /bin/echo PIDn > tasks
 
 +3. Kernel API
 +=============
 +
 +3.1 Overview
 +------------
 +
 +Each kernel subsystem that wants to hook into the generic container
 +system needs to create a container_subsys object. This contains
 +various methods, which are callbacks from the container system, along
 +with a subsystem id which will be assigned by the container system.
 +
 +Other fields in the container_subsys object include:
 +
 +- subsys_id: a unique array index for the subsystem, indicating which
 +  entry in container->subsys[] this subsystem should be
 +  managing. Initialized by container_register_subsys(); prior to this
 +  it should be initialized to -1
 +
 +- top_container: the subsystem-specific state representing the root
 +  container in the system. This should always exist, even when the
 +  subsystem isn't attached to a hierarchy.
 +
 +- hierarchy: an index indicating which hierarchy this subsystem is
 +  currently attached to. Hierarchy 0 is the permanent default; while
 +  a subsystem is in hierarchy 0 it is not attached to any mounted
 +  hierarchy and all tasks should be considered members of its
 +  top_container. Must be 0 on entry to container_register_subsys().
 +
 +- name: should be initialized to a unique subsystem name prior to
 +  calling container_register_subsys(). Should be no longer than
 +  MAX_CONTAINER_TYPE_NAMELEN.
 +
 +Each container object created by the system has an array of pointers,
 +indexed by subsystem id; this pointer is entirely managed by the
 +subsystem; the generic container code will never touch this pointer.
 +
 +3.2 Synchronization
 +-------------------
 +
 +There are two global mutexes used by the container system. The first
 +is the manage_mutex, which should be taken by anything that wants to
 +modify a container. The second is the callback_mutex, which should be
 +taken by holders of the manage_mutex at the point when they actually
 +make changes, and by callbacks from lower-level subsystems that want
 +to ensure that no container changes occur.  Note that memory
 +allocations cannot be made while holding callback_mutex.
 +
 +The callback_mutex nests inside the manage_mutex.
 +
 +In general, the pattern of use is:
 +
 +1) take manage_mutex
 +2) verify that the change is valid and do any necessary allocations
 +3) take callback_mutex
 +4) make changes
 +5) release callback_mutex
 +6) release manage_mutex
 +
 +See kernel/container.c for more details.
 +
 +Subsystems can take/release the manage_mutex via the functions
 +container_manage_lock()/container_manage_unlock(), and can
 +take/release the callback_mutex via the functions
 +container_lock()/container_unlock().
 +
 +Accessing a task's container pointer may be done in the following ways:
 +- while holding manage_mutex
 +- while holding callback_mutex
 +- while holding the task's alloc_lock (via task_lock())
 +- inside an rcu_read_lock() section via rcu_dereference()
 +
 +3.3 Subsystem API
 +--------------------------
 +
 +Each subsystem should call container_register_subsys() with a pointer
 +to its subsystem object. This will store the new subsystem id in the
 +subsystem subsys_id field and return 0, or a negative error.  There's
 +currently no facility for deregistering a subsystem nor for
 +registering a subsystem after any containers (other than the default
 +"top_container") have been created.
 +
 +Each subsystem may export the following methods. The only mandatory
 +methods are create/destroy. Any others that are null are presumed to
 +be successful no-ops.
 +
 +int create(struct container *cont)
 +LL=manage_mutex
 +
 +The subsystem should set its subsystem pointer for the passed
 +container, returning 0 on success or a negative error code. On
 +success, the subsystem pointer should point to a structure of type
 +container_subsys_state (typically embedded in a larger
 +subsystem-specific object), which will be initialized by the container
 +system.
 +
 +void destroy(struct container *cont)
 +LL=manage_mutex
 +
 +The container system is about to destroy the passed container; the
 +subsystem should do any necessary cleanup.
 +
 +int can_attach(struct container_subsys *ss, struct container *cont,
 +	       struct task_struct *task)
 +LL=manage_mutex
 +
 +Called prior to moving a task into a container; if the subsystem
 +returns an error, this will abort the attach operation.  Note that
 +this isn't called on a fork.
 +
 +void attach(struct container_subsys *ss, struct container *cont,
 +	    struct container *old_cont, struct task_struct *task)
 +LL=manage_mutex & callback_mutex
 +
 +Called during the attach operation.  The subsystem should do any
 +necessary work that can be accomplished without memory allocations or
 +sleeping.
 +
 +void post_attach(struct container_subsys *ss, struct container *cont,
 +		 struct container *old_cont, struct task_struct *task)
 +LL=manage_mutex
 +
 +Called after the task has been attached to the container, to allow any
 +post-attachment activity that requires memory allocations or blocking.
 +
 +void fork(struct container_subsys *ss, struct task_struct *task)
 +LL=callback_mutex, maybe read_lock(tasklist_lock)
 +
 +Called when a task is forked into a container. Also called during
 +registration for all existing tasks.
 +
 +void exit(struct container_subsys *ss, struct task_struct *task)
 +LL=callback_mutex
 +
 +Called during task exit.
 +
 +int populate(struct container_subsys *ss, struct container *cont)
 +LL=none
 +
 +Called after creation of a container to allow a subsystem to populate
 +the container directory with file entries.  The subsystem should make
 +calls to container_add_file() with objects of type cftype (see
 +include/linux/container.h for details).  Called during
 +container_register_subsys() to populate the root container.  Note that
 +although this method can return an error code, the error code is
 +currently not always handled well.
 +
 
 -3. Questions
 +4. Questions
 ============
 
 Q: what's up with this '/bin/echo' ?
 Index: container-2.6.19-rc6/include/linux/mempolicy.h
 ============================================================ =======
 --- container-2.6.19-rc6.orig/include/linux/mempolicy.h
 +++ container-2.6.19-rc6/include/linux/mempolicy.h
 @@ -148,14 +148,6 @@ extern void mpol_rebind_task(struct task
 const nodemask_t *new);
 extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
 extern void mpol_fix_fork_child_flag(struct task_struct *p);
 -#define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x))
 -
 -#ifdef CONFIG_CPUSETS
 -#define current_cpuset_is_being_rebound() \
 - (cpuset_being_rebound == current->container->cpuset)
 -#else
 -#define current_cpuset_is_being_rebound() 0
 -#endif
 
 extern struct mempolicy default_policy;
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 @@ -173,8 +165,6 @@ static inline void check_highest_zone(en
 int do_migrate_pages(struct mm_struct *mm,
 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
 
 -extern void *cpuset_being_rebound;	/* Trigger mpol_copy vma rebind */
 -
 #else
 
 struct mempolicy {};
 @@ -253,8 +243,6 @@ static inline void mpol_fix_fork_child_f
 {
 }
 
 -#define set_cpuset_being_rebound(x) do {} while (0)
 -
 static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 unsigned long addr)
 {
 Index: container-2.6.19-rc6/include/linux/sched.h
 ============================================================ =======
 --- container-2.6.19-rc6.orig/include/linux/sched.h
 +++ container-2.6.19-rc6/include/linux/sched.h
 @@ -1005,7 +1005,7 @@ struct task_struct {
 int cpuset_mem_spread_rotor;
 #endif
 #ifdef CONFIG_CONTAINERS
 -	struct container *container;
 +	struct container *container[CONFIG_MAX_CONTAINER_HIERARCHIES];
 #endif
 struct robust_list_head __user *robust_list;
 #ifdef CONFIG_COMPAT
 @@ -1430,7 +1430,7 @@ static inline int thread_group_empty(str
 /*
 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
 * subscriptions and synchronises with wait4().  Also used in procfs.  Also
 - * pins the final release of task.io_context.  Also protects ->container.
 + * pins the final release of task.io_context.  Also protects ->container[].
 *
 * Nests both inside and outside of read_lock(&tasklist_lock).
 * It must not be nested with write_lock_irq(&tasklist_lock),
 Index: container-2.6.19-rc6/mm/mempolicy.c
 ============================================================ =======
 --- container-2.6.19-rc6.orig/mm/mempolicy.c
 +++ container-2.6.19-rc6/mm/mempolicy.c
 @@ -1307,7 +1307,6 @@ EXPORT_SYMBOL(alloc_pages_current);
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 */
 -void *cpuset_being_rebound;
 
 /* Slow path of a mempolicy copy */
 struct mempolicy *__mpol_copy(struct mempolicy *old)
 @@ -1906,4 +1905,3 @@ out:
 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
 return 0;
 }
 -
 
 --
 |  
	|  |  |  
	| 
		
			| [PATCH 7/7] BeanCounters over generic process containers [message #8542 is a reply to message #8540] | Thu, 23 November 2006 12:08   |  
			| 
				
				
					|  Paul Menage Messages: 642
 Registered: September 2006
 | Senior Member |  |  |  
	| This patch implements the BeanCounter resource control abstraction over generic process containers. It contains the beancounter core
 code, plus the numfiles resource counter. It doesn't currently contain
 any of the memory tracking code or the code for switching beancounter
 context in interrupts.
 
 Currently all the beancounters resource counters are lumped into a
 single hierarchy; ideally it would be possible for each resource
 counter to be a separate container subsystem, allowing them to be
 connected to different hierarchies.
 
 ---
 fs/file_table.c          |   11 +
 include/bc/beancounter.h |  192 ++++++++++++++++++++++++
 include/bc/misc.h        |   27 +++
 include/linux/fs.h       |    3
 init/Kconfig             |    4
 init/main.c              |    3
 kernel/Makefile          |    1
 kernel/bc/Kconfig        |   17 ++
 kernel/bc/Makefile       |    7
 kernel/bc/beancounter.c  |  371 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bc/misc.c         |   56 +++++++
 11 files changed, 691 insertions(+), 1 deletion(-)
 
 Index: container-2.6.19-rc6/init/Kconfig
 ============================================================ =======
 --- container-2.6.19-rc6.orig/init/Kconfig
 +++ container-2.6.19-rc6/init/Kconfig
 @@ -601,6 +601,10 @@ config STOP_MACHINE
 Need stop_machine() primitive.
 endmenu
 
 +menu "Beancounters"
 +source "kernel/bc/Kconfig"
 +endmenu
 +
 menu "Block layer"
 source "block/Kconfig"
 endmenu
 Index: container-2.6.19-rc6/kernel/Makefile
 ============================================================ =======
 --- container-2.6.19-rc6.orig/kernel/Makefile
 +++ container-2.6.19-rc6/kernel/Makefile
 @@ -12,6 +12,7 @@ obj-y     = sched.o fork.o exec_domain.o
 
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
 +obj-$(CONFIG_BEANCOUNTERS) += bc/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 obj-$(CONFIG_LOCKDEP) += lockdep.o
 ifeq ($(CONFIG_PROC_FS),y)
 Index: container-2.6.19-rc6/kernel/bc/Kconfig
 ============================================================ =======
 --- /dev/null
 +++ container-2.6.19-rc6/kernel/bc/Kconfig
 @@ -0,0 +1,17 @@
 +config BEANCOUNTERS
 +	bool "Enable resource accounting/control"
 +	default n
 +	select CONTAINERS
 +	help
 +	  When Y this option provides accounting and allows configuring
 +	  limits for user's consumption of exhaustible system resources.
 +	  The most important resource controlled by this patch is unswappable
 +	  memory (either mlock'ed or used by internal kernel structures and
 +	  buffers). The main goal of this patch is to protect processes
 +	  from running short of important resources because of accidental
 +	  misbehavior of processes or malicious activity aiming to ``kill''
 +	  the system. It's worth mentioning that resource limits configured
 +	  by setrlimit(2) do not give an acceptable level of protection
 +	  because they cover only a small fraction of resources and work on a
 +	  per-process basis.  Per-process accounting doesn't prevent malicious
 +	  users from spawning a lot of resource-consuming processes.
 Index: container-2.6.19-rc6/kernel/bc/Makefile
 ============================================================ =======
 --- /dev/null
 +++ container-2.6.19-rc6/kernel/bc/Makefile
 @@ -0,0 +1,7 @@
 +#
 +# kernel/bc/Makefile
 +#
 +# Copyright (C) 2006 OpenVZ SWsoft Inc.
 +#
 +
 +obj-y = beancounter.o misc.o
 Index: container-2.6.19-rc6/include/bc/beancounter.h
 ============================================================ =======
 --- /dev/null
 +++ container-2.6.19-rc6/include/bc/beancounter.h
 @@ -0,0 +1,192 @@
 +/*
 + * include/bc/beancounter.h
 + *
 + * Copyright (C) 2006 OpenVZ SWsoft Inc
 + *
 + */
 +
 +#ifndef __BEANCOUNTER_H__
 +#define __BEANCOUNTER_H__
 +
 +#include <linux/container.h>
 +
 +enum {
 +	BC_KMEMSIZE,
 +	BC_PRIVVMPAGES,
 +	BC_PHYSPAGES,
 +	BC_NUMTASKS,
 +	BC_NUMFILES,
 +
 +	BC_RESOURCES
 +};
 +
 +struct bc_resource_parm {
 +	unsigned long	barrier;
 +	unsigned long	limit;
 +	unsigned long	held;
 +	unsigned long	minheld;
 +	unsigned long	maxheld;
 +	unsigned long	failcnt;
 +
 +};
 +
 +#ifdef __KERNEL__
 +
 +#include <linux/list.h>
 +#include <linux/spinlock.h>
 +#include <linux/init.h>
 +#include <linux/configfs.h>
 +#include <asm/atomic.h>
 +
 +#define BC_MAXVALUE	((unsigned long)LONG_MAX)
 +
 +enum bc_severity {
 +	BC_BARRIER,
 +	BC_LIMIT,
 +	BC_FORCE,
 +};
 +
 +struct beancounter;
 +
 +#ifdef CONFIG_BEANCOUNTERS
 +
 +enum bc_attr_index {
 +	BC_RES_HELD,
 +	BC_RES_MAXHELD,
 +	BC_RES_MINHELD,
 +	BC_RES_BARRIER,
 +	BC_RES_LIMIT,
 +	BC_RES_FAILCNT,
 +
 +	BC_ATTRS
 +};
 +
 +struct bc_resource {
 +	char	*bcr_name;
 +	int      res_id;
 +
 +	int	(*bcr_init)(struct beancounter *bc, int res);
 +	int	(*bcr_change)(struct beancounter *bc,
 +			unsigned long new_bar, unsigned long new_lim);
 +	void	(*bcr_barrier_hit)(struct beancounter *bc);
 +	int	(*bcr_limit_hit)(struct beancounter *bc, unsigned long val,
 +			unsigned long flags);
 +	void	(*bcr_fini)(struct beancounter *bc);
 +
 +	/* container file handlers */
 +	struct cftype cft_attrs[BC_ATTRS];
 +};
 +
 +extern struct bc_resource *bc_resources[];
 +extern struct container_subsys bc_subsys;
 +
 +struct beancounter {
 +	struct container_subsys_state css;
 +	spinlock_t		bc_lock;
 +
 +	struct bc_resource_parm bc_parms[BC_RESOURCES];
 +};
 +
 +/* Update the beancounter for a container */
 +static inline void set_container_bc(struct container *cont,
 +				    struct beancounter *bc)
 +{
 +	cont->subsys[bc_subsys.subsys_id] = &bc->css;
 +}
 +
 +/* Retrieve the beancounter for a container */
 +static inline struct beancounter *container_bc(struct container *cont)
 +{
 +	return container_of(container_subsys_state(cont, &bc_subsys),
 +			    struct beancounter, css);
 +}
 +
 +/* Retrieve the beancounter for a task */
 +static inline struct beancounter *task_bc(struct task_struct *task)
 +{
 +	return container_bc(task_container(task, &bc_subsys));
 +}
 +
 +static inline void bc_adjust_maxheld(struct bc_resource_parm *parm)
 +{
 +	if (parm->maxheld < parm->held)
 +		parm->maxheld = parm->held;
 +}
 +
 +static inline void bc_adjust_minheld(struct bc_resource_parm *parm)
 +{
 +	if (parm->minheld > parm->held)
 +		parm->minheld = parm->held;
 +}
 +
 +static inline void bc_init_resource(struct bc_resource_parm *parm,
 +		unsigned long bar, unsigned long lim)
 +{
 +	parm->barrier = bar;
 +	parm->limit = lim;
 +	parm->held = 0;
 +	parm->minheld = 0;
 +	parm->maxheld = 0;
 +	parm->failcnt = 0;
 +}
 +
 +int bc_change_param(struct beancounter *bc, int res,
 +		unsigned long bar, unsigned long lim);
 +
 +int __must_check bc_charge_locked(struct beancounter *bc, int res_id,
 +		unsigned long val, int strict, unsigned long flags);
 +static inline int __must_check bc_charge(struct beancounter *bc, int res_id,
 +		unsigned long val, int strict)
 +{
 +	int ret;
 +	unsigned long flags;
 +
 +	spin_lock_irqsave(&bc->bc_lock, flags);
 +	ret = bc_charge_locked(bc, res_id, val, strict, flags);
 +	spin_unlock_irqrestore(&bc->bc_lock, flags);
 +	return ret;
 +}
 +
 +void bc_uncharge_locked(struct beancounter *bc, int res_id,
 +		unsigned long val);	/* called with bc->bc_lock held */
 +static inline void bc_uncharge(struct beancounter *bc, int res_id,
 +		unsigned long val)
 +{
 +	unsigned long flags;
 +
 +	spin_lock_irqsave(&bc->bc_lock, flags);
 +	bc_uncharge_locked(bc, res_id, val);
 +	spin_unlock_irqrestore(&bc->bc_lock, flags);
 +}
 +
 +void __init bc_register_resource(int res_id, struct bc_resource *br);
 +void __init bc_init_early(void);
 +#else /* CONFIG_BEANCOUNTERS */
 +static inline int __must_check bc_charge_locked(struct beancounter *bc, int res,
 +		unsigned long val, int strict, unsigned long flags)
 +{
 +	return 0;
 +}
 +
 +static inline int __must_check bc_charge(struct beancounter *bc, int res,
 +		unsigned long val, int strict)
 +{
 +	return 0;
 +}
 +
 +static inline void bc_uncharge_locked(struct beancounter *bc, int res,
 +		unsigned long val)
 +{
 +}
 +
 +static inline void bc_uncharge(struct beancounter *bc, int res,
 +		unsigned long val)
 +{
 +}
 +
 +static inline void bc_init_early(void)
 +{
 +}
 +#endif /* CONFIG_BEANCOUNTERS */
 +#endif /* __KERNEL__ */
 +#endif
 Index: container-2.6.19-rc6/init/main.c
 ============================================================ =======
 --- container-2.6.19-rc6.orig/init/main.c
 +++ container-2.6.19-rc6/init/main.c
 @@ -52,6 +52,8 @@
 #include <linux/debug_locks.h>
 #include <linux/lockdep.h>
 
 +#include <bc/beancounter.h>
 +
 #include <asm/io.h>
 #include <asm/bugs.h>
 #include <asm/setup.h>
 @@ -482,6 +484,7 @@ asmlinkage void __init start_kernel(void
 char * command_line;
 extern struct kernel_param __start___param[], __stop___param[];
 
 +	bc_init_early();
 smp_setup_processor_id();
 
 /*
 Index: container-2.6.19-rc6/kernel/bc/beancounter.c
 ============================================================ =======
 --- /dev/null
 +++ container-2.6.19-rc6/kernel/bc/beancounter.c
 @@ -0,0 +1,371 @@
 +/*
 + * kernel/bc/beancounter.c
 + *
 + * Copyright (C) 2006 OpenVZ SWsoft Inc
 + *
 + */
 +
 +#include <linux/sched.h>
 +#include <linux/list.h>
 +#include <linux/hash.h>
 +#include <linux/gfp.h>
 +#include <linux/slab.h>
 +#include <linux/module.h>
 +#include <linux/fs.h>
 +#include <linux/uaccess.h>
 +
 +#include <bc/beancounter.h>
 +
 +#define BC_HASH_BITS	(8)
 +#define BC_HASH_SIZE	(1 << BC_HASH_BITS)
 +
 +static int bc_dummy_init(struct beancounter *bc, int i)
 +{
 +	bc_init_resource(&bc->bc_parms[i], BC_MAXVALUE, BC_MAXVALUE);
 +	return 0;
 +}
 +
 +static struct bc_resource bc_dummy_res = {
 +	.bcr_name = "dummy",
 +	.bcr_init = bc_dummy_init,
 +};
 +
 +struct bc_resource *bc_resources[BC_RESOURCES] = {
 +	[0 ... BC_RESOURCES - 1] = &bc_dummy_res,
 +};
 +
 +struct beancounter init_bc;
 +static kmem_cache_t *bc_cache;
 +
 +static int bc_create(struct container_subsys *ss,
 +		     struct container *cont)
 +{
 +	int i;
 +	struct beancounter *new_bc;
 +
 +	if (!cont->parent) {
 +		/* Early initialization for top container */
 +		set_container_b
...
 
 
 |  
	|  |  |  
	| 
		
			| [PATCH 5/7] Resource Groups over generic containers [message #8543 is a reply to message #8540] | Thu, 23 November 2006 12:08   |  
			| 
				
				
					|  Paul Menage Messages: 642
 Registered: September 2006
 | Senior Member |  |  |  
	| This patch provides the RG core and numtasks controller as container subsystems, intended as an example of how to implement a more complex
 resource control system over generic process containers. The changes
 to the core involve primarily removing the group management, task
 membership and configfs support and adding interface layers to talk to
 the generic container layer instead.
 
 Each resource controller becomes an independent container subsystem;
 the RG core is essentially a library that the resource controllers can
 use to provide the RG API to userspace. Rather than a single shares
 and stats file in each group, there's a <controller>_shares and
 a <controller>_stats file, each linked to the appropriate resource
 controller.
 
 include/linux/moduleparam.h  |   12 -
 include/linux/numtasks.h     |   28 ++
 include/linux/res_group.h    |   87 ++++++++
 include/linux/res_group_rc.h |   97 ++++++++
 init/Kconfig                 |   22 ++
 kernel/Makefile              |    1
 kernel/fork.c                |    7
 kernel/res_group/Makefile    |    2
 kernel/res_group/local.h     |   38 +++
 kernel/res_group/numtasks.c  |  467 +++++++++++++++++++++++++++++++++++++++++++
 kernel/res_group/res_group.c |  160 ++++++++++++++
 kernel/res_group/rgcs.c      |  302 +++++++++++++++++++++++++++
 kernel/res_group/shares.c    |  228 ++++++++++++++++++++
 13 files changed, 1447 insertions(+), 4 deletions(-)
 
 Index: container-2.6.19-rc6/include/linux/moduleparam.h
 ============================================================ =======
 --- container-2.6.19-rc6.orig/include/linux/moduleparam.h
 +++ container-2.6.19-rc6/include/linux/moduleparam.h
 @@ -75,11 +75,17 @@ struct kparam_array
 /* Helper functions: type is byte, short, ushort, int, uint, long,
 ulong, charp, bool or invbool, or XXX if you define param_get_XXX,
 param_set_XXX and param_check_XXX. */
 -#define module_param_named(name, value, type, perm)			   \
 -	param_check_##type(name, &(value));				   \
 -	module_param_call(name, param_set_##type, param_get_##type, &value, perm); \
 +#define module_param_named_call(name, value, type, set, perm)		\
 +	param_check_##type(name, &(value));				\
 +	module_param_call(name, set, param_get_##type, &(value), perm); \
 __MODULE_PARM_TYPE(name, #type)
 
 +#define module_param_named(name, value, type, perm)			   \
 +	module_param_named_call(name, value, type, param_set_##type, perm)
 +
 +#define module_param_set_call(name, type, setfn, perm) \
 +	module_param_named_call(name, name, type, setfn, perm)
 +
 #define module_param(name, type, perm)				\
 module_param_named(name, name, type, perm)
 
 Index: container-2.6.19-rc6/include/linux/numtasks.h
 ============================================================ =======
 --- /dev/null
 +++ container-2.6.19-rc6/include/linux/numtasks.h
 @@ -0,0 +1,28 @@
 +/* numtasks.h - No. of tasks resource controller for Resource Groups
 + *
 + * Copyright (C) Chandra Seetharaman, IBM Corp. 2003, 2004, 2005
 + *
 + * Provides No. of tasks resource controller for Resource Groups
 + *
 + * Latest version, more details at http://ckrm.sf.net
 + *
 + * This program is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License as published by
 + * the Free Software Foundation; either version 2 of the License, or
 + * (at your option) any later version.
 + *
 + */
 +#ifndef _LINUX_NUMTASKS_H
 +#define _LINUX_NUMTASKS_H
 +
 +#ifdef CONFIG_RES_GROUPS_NUMTASKS
 +#include <linux/res_group_rc.h>
 +
 +extern int numtasks_allow_fork(struct task_struct *);
 +
 +#else /* CONFIG_RES_GROUPS_NUMTASKS */
 +
 +#define numtasks_allow_fork(task) (0)
 +
 +#endif /* CONFIG_RES_GROUPS_NUMTASKS */
 +#endif /* _LINUX_NUMTASKS_H */
 Index: container-2.6.19-rc6/include/linux/res_group.h
 ============================================================ =======
 --- /dev/null
 +++ container-2.6.19-rc6/include/linux/res_group.h
 @@ -0,0 +1,87 @@
 +/*
 + *  res_group.h - Header file to be used by Resource Groups
 + *
 + * Copyright (C) Hubertus Franke, IBM Corp. 2003, 2004
 + *		(C) Shailabh Nagar,  IBM Corp. 2003, 2004
 + *		(C) Chandra Seetharaman, IBM Corp. 2003, 2004, 2005
 + *
 + * Provides data structures, macros and kernel APIs
 + *
 + * More details at http://ckrm.sf.net
 + *
 + * This program is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License as published by
 + * the Free Software Foundation; either version 2 of the License, or
 + * (at your option) any later version.
 + *
 + */
 +
 +#ifndef _LINUX_RES_GROUP_H
 +#define _LINUX_RES_GROUP_H
 +
 +#ifdef CONFIG_RES_GROUPS
 +#include <linux/spinlock.h>
 +#include <linux/list.h>
 +#include <linux/kref.h>
 +#include <linux/container.h>
 +
 +#define SHARE_UNCHANGED	(-1)	/* implicitly specified by userspace,
 +					 * never stored in a resource group's
 +					 * shares struct; never displayed */
 +#define SHARE_UNSUPPORTED	(-2)	/* If the resource controller doesn't
 +					 * support user changing a shares value
 +					 * it sets the corresponding share
 +					 * value to UNSUPPORTED when it returns
 +					 * the newly allocated shares data
 +					 * structure */
 +#define SHARE_DONT_CARE	(-3)
 +
 +#define SHARE_DEFAULT_DIVISOR 	(100)
 +
 +#define MAX_RES_CTLRS	CONFIG_MAX_CONTAINER_SUBSYS /* max # of resource controllers */
 +#define MAX_DEPTH	5	/* max depth of hierarchy supported */
 +
 +#define NO_RES_GROUP		NULL
 +#define NO_SHARE		NULL
 +#define NO_RES_ID		MAX_RES_CTLRS /* Invalid ID */
 +
 +/*
 + * Share quantities are a child's fraction of the parent's resource
 + * specified by a divisor in the parent and a dividend in the child.
 + *
 + * Shares are represented as a relative quantity between parent and child
 + * to simplify locking when propagating modifications to the shares of a
 + * resource group. Only the parent and the children of the modified
 + * resource group need to be locked.
 +*/
 +struct res_shares {
 +	/* shares only set by userspace */
 +	int min_shares; /* minimum fraction of parent's resources allowed */
 +	int max_shares; /* maximum fraction of parent's resources allowed */
 +	int child_shares_divisor; /* >= 1, may not be DONT_CARE */
 +
 +	/*
 +	 * share values invisible to userspace.  adjusted when userspace
 +	 * sets shares
 +	 */
 +	int unused_min_shares;
 +		/* 0 <= unused_min_shares <= (child_shares_divisor -
 +		 * 			Sum of min_shares of children)
 +		 */
 +	int cur_max_shares; /* max(children's max_shares). need better name */
 +
 +	/* State maintained by container system - only relevant when
 +	 * this shares struct is the actual shares struct for a
 +	 * container */
 +	struct container_subsys_state css;
 +};
 +
 +/*
 + * Class is the grouping of tasks with shares of each resource that has
 + * registered a resource controller (see include/linux/res_group_rc.h).
 + */
 +
 +#define resource_group container
 +
 +#endif /* CONFIG_RES_GROUPS */
 +#endif /* _LINUX_RES_GROUP_H */
 Index: container-2.6.19-rc6/include/linux/res_group_rc.h
 ============================================================ =======
 --- /dev/null
 +++ container-2.6.19-rc6/include/linux/res_group_rc.h
 @@ -0,0 +1,97 @@
 +/*
 + *  res_group_rc.h - Header file to be used by Resource controllers of
 + *		      Resource Groups
 + *
 + * Copyright (C) Hubertus Franke, IBM Corp. 2003
 + *		(C) Shailabh Nagar,  IBM Corp. 2003
 + *		(C) Chandra Seetharaman, IBM Corp. 2003, 2004, 2005
 + *		(C) Vivek Kashyap , IBM Corp. 2004
 + *
 + * Provides data structures, macros and kernel API of Resource Groups for
 + * resource controllers.
 + *
 + * More details at http://ckrm.sf.net
 + *
 + * This program is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License as published by
 + * the Free Software Foundation; either version 2 of the License, or
 + * (at your option) any later version.
 + *
 + */
 +
 +#ifndef _LINUX_RES_GROUP_RC_H
 +#define _LINUX_RES_GROUP_RC_H
 +
 +#include <linux/res_group.h>
 +#include <linux/container.h>
 +
 +struct res_group_cft {
 +	struct cftype cft;
 +	struct res_controller *ctlr;
 +};
 +
 +struct res_controller {
 +	struct container_subsys subsys;
 +	struct res_group_cft shares_cft;
 +	struct res_group_cft stats_cft;
 +
 +	const char *name;
 +	unsigned int ctlr_id;
 +
 +	/*
 +	 * Keeps number of references to this controller structure. kref
 +	 * does not work as we want to be able to allow removal of a
 +	 * controller even when some resource group are still defined.
 +	 */
 +	atomic_t count;
 +
 +	/*
 +	 * Allocate a new shares struct for this resource controller.
 +	 * Called when registering a resource controller with pre-existing
 +	 * resource groups and when new resource group is created by the user.
 +	 */
 +	struct res_shares *(*alloc_shares_struct)(struct container *);
 +	/* Corresponding free of shares struct for this resource controller */
 +	void (*free_shares_struct)(struct res_shares *);
 +
 +	/* Notifies the controller when the shares are changed */
 +	void (*shares_changed)(struct res_shares *);
 +
 +	/* resource statistics */
 +	ssize_t (*show_stats)(struct res_shares *, char *, size_t);
 +	int (*reset_stats)(struct res_shares *, const char *);
 +
 +	/*
 +	 * move_task is called when a task moves from one resource group to
 +	 * another. First parameter is the task that is moving, the second
 +	 * is the resource specific shares of the resource group the task
 +	 * was in, and the third is the shares of the resource group the
 +	 * task has moved to.
 +	 */
 +	void (*move_task)(struct task_struct *, struct res_shares *,
 +				struct res_shares *);
 +};
 +
 +extern int register_controller(struct res_controller *);
 +extern int unregister_controller(struct res_controller *);
 +extern struct resource_group default_res_group;
 +static inline int is_res_group_root(const struct resource_group *rgroup)
 +{
 +	return (rgroup->parent == NULL);
 +}
 +
 +#define for_each_child(child, parent)	\
 +	list_for_each_entry(child, &(parent)->children, sibling)
 +
 +/* Get controller specific shares structure for the given resource group *
...
 
 
 |  
	|  |  |  
	| 
		
			| [PATCH 2/7] Cpusets hooked into containers [message #8544 is a reply to message #8540] | Thu, 23 November 2006 12:08   |  
			| 
				
				
					|  Paul Menage Messages: 642
 Registered: September 2006
 | Senior Member |  |  |  
	| This patch removes the process grouping code from the cpusets code, instead hooking it into the generic container system. This temporarily
 adds cpuset-specific code in kernel/container.c, which is removed by
 the next patch in the series.
 
 Signed-off-by: Paul Menage <menage@google.com>
 
 ---
 Documentation/cpusets.txt |   81 +-
 fs/super.c                |    5
 include/linux/container.h |    7
 include/linux/cpuset.h    |   25
 include/linux/fs.h        |    2
 include/linux/mempolicy.h |    2
 include/linux/sched.h     |    4
 init/Kconfig              |   23
 kernel/container.c        |  107 +++
 kernel/cpuset.c           | 1273 +++++-----------------------------------------
 kernel/exit.c             |    2
 kernel/fork.c             |    7
 mm/oom_kill.c             |    6
 13 files changed, 330 insertions(+), 1214 deletions(-)
 
 Index: container-2.6.19-rc5/include/linux/container.h
 ============================================================ =======
 --- container-2.6.19-rc5.orig/include/linux/container.h
 +++ container-2.6.19-rc5/include/linux/container.h
 @@ -47,6 +47,10 @@ struct container {
 
 struct container *parent;	/* my parent */
 struct dentry *dentry;		/* container fs entry */
 +
 +#ifdef CONFIG_CPUSETS
 +	struct cpuset *cpuset;
 +#endif
 };
 
 /* struct cftype:
 @@ -79,6 +83,9 @@ struct cftype {
 int container_add_file(struct container *cont, const struct cftype *cft);
 
 int container_is_removed(const struct container *cont);
 +void container_set_release_agent_path(const char *path);
 +
 +int container_path(const struct container *cont, char *buf, int buflen);
 
 #else /* !CONFIG_CONTAINERS */
 
 Index: container-2.6.19-rc5/include/linux/cpuset.h
 ============================================================ =======
 --- container-2.6.19-rc5.orig/include/linux/cpuset.h
 +++ container-2.6.19-rc5/include/linux/cpuset.h
 @@ -11,16 +11,15 @@
 #include <linux/sched.h>
 #include <linux/cpumask.h>
 #include <linux/nodemask.h>
 +#include <linux/container.h>
 
 #ifdef CONFIG_CPUSETS
 
 -extern int number_of_cpusets;	/* How many cpusets are defined in system? */
 +extern int number_of_cpusets;  /* How many cpusets are defined in system? */
 
 extern int cpuset_init_early(void);
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 -extern void cpuset_fork(struct task_struct *p);
 -extern void cpuset_exit(struct task_struct *p);
 extern cpumask_t cpuset_cpus_allowed(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 void cpuset_init_current_mems_allowed(void);
 @@ -47,10 +46,6 @@ extern void __cpuset_memory_pressure_bum
 
 extern struct file_operations proc_cpuset_operations;
 extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
 -
 -extern void cpuset_lock(void);
 -extern void cpuset_unlock(void);
 -
 extern int cpuset_mem_spread_node(void);
 
 static inline int cpuset_do_page_mem_spread(void)
 @@ -65,13 +60,22 @@ static inline int cpuset_do_slab_mem_spr
 
 extern void cpuset_track_online_nodes(void);
 
 +extern int cpuset_can_attach_task(struct container *cont,
 +				  struct task_struct *tsk);
 +extern void cpuset_attach_task(struct container *cont,
 +				struct task_struct *tsk);
 +extern void cpuset_post_attach_task(struct container *cont,
 +				    struct container *oldcont,
 +				    struct task_struct *tsk);
 +extern int cpuset_populate_dir(struct container *cont);
 +extern int cpuset_create(struct container *cont);
 +extern void cpuset_destroy(struct container *cont);
 +
 #else /* !CONFIG_CPUSETS */
 
 static inline int cpuset_init_early(void) { return 0; }
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 -static inline void cpuset_fork(struct task_struct *p) {}
 -static inline void cpuset_exit(struct task_struct *p) {}
 
 static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p)
 {
 @@ -110,9 +114,6 @@ static inline char *cpuset_task_status_a
 return buffer;
 }
 
 -static inline void cpuset_lock(void) {}
 -static inline void cpuset_unlock(void) {}
 -
 static inline int cpuset_mem_spread_node(void)
 {
 return 0;
 Index: container-2.6.19-rc5/kernel/exit.c
 ============================================================ =======
 --- container-2.6.19-rc5.orig/kernel/exit.c
 +++ container-2.6.19-rc5/kernel/exit.c
 @@ -29,7 +29,6 @@
 #include <linux/mempolicy.h>
 #include <linux/taskstats_kern.h>
 #include <linux/delayacct.h>
 -#include <linux/cpuset.h>
 #include <linux/container.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
 @@ -923,7 +922,6 @@ fastcall NORET_TYPE void do_exit(long co
 __exit_files(tsk);
 __exit_fs(tsk);
 exit_thread();
 -	cpuset_exit(tsk);
 container_exit(tsk);
 exit_keys(tsk);
 
 Index: container-2.6.19-rc5/kernel/fork.c
 ============================================================ =======
 --- container-2.6.19-rc5.orig/kernel/fork.c
 +++ container-2.6.19-rc5/kernel/fork.c
 @@ -30,7 +30,6 @@
 #include <linux/nsproxy.h>
 #include <linux/capability.h>
 #include <linux/cpu.h>
 -#include <linux/cpuset.h>
 #include <linux/container.h>
 #include <linux/security.h>
 #include <linux/swap.h>
 @@ -1056,13 +1055,12 @@ static struct task_struct *copy_process(
 p->io_wait = NULL;
 p->audit_context = NULL;
 container_fork(p);
 -	cpuset_fork(p);
 #ifdef CONFIG_NUMA
 p->mempolicy = mpol_copy(p->mempolicy);
 if (IS_ERR(p->mempolicy)) {
 retval = PTR_ERR(p->mempolicy);
 p->mempolicy = NULL;
 - 		goto bad_fork_cleanup_cpuset;
 + 		goto bad_fork_cleanup_container;
 }
 mpol_fix_fork_child_flag(p);
 #endif
 @@ -1286,9 +1284,8 @@ bad_fork_cleanup_security:
 bad_fork_cleanup_policy:
 #ifdef CONFIG_NUMA
 mpol_free(p->mempolicy);
 -bad_fork_cleanup_cpuset:
 +bad_fork_cleanup_container:
 #endif
 -	cpuset_exit(p);
 container_exit(p);
 bad_fork_cleanup_delays_binfmt:
 delayacct_tsk_free(p);
 Index: container-2.6.19-rc5/kernel/container.c
 ============================================================ =======
 --- container-2.6.19-rc5.orig/kernel/container.c
 +++ container-2.6.19-rc5/kernel/container.c
 @@ -55,6 +55,7 @@
 #include <linux/time.h>
 #include <linux/backing-dev.h>
 #include <linux/sort.h>
 +#include <linux/cpuset.h>
 
 #include <asm/uaccess.h>
 #include <asm/atomic.h>
 @@ -92,6 +93,18 @@ static struct container top_container =
 .children = LIST_HEAD_INIT(top_container.children),
 };
 
 +/* The path to use for release notifications. No locking between
 + * setting and use - so if userspace updates this while subcontainers
 + * exist, you could miss a notification */
 +static char release_agent_path[PATH_MAX] = "/sbin/container_release_agent";
 +
 +void container_set_release_agent_path(const char *path)
 +{
 +	container_manage_lock();
 +	strcpy(release_agent_path, path);
 +	container_manage_unlock();
 +}
 +
 static struct vfsmount *container_mount;
 static struct super_block *container_sb;
 
 @@ -333,7 +346,7 @@ static inline struct cftype *__d_cft(str
 * Returns 0 on success, -errno on error.
 */
 
 -static int container_path(const struct container *cont, char *buf, int buflen)
 +int container_path(const struct container *cont, char *buf, int buflen)
 {
 char *start;
 
 @@ -397,7 +410,7 @@ static void container_release_agent(cons
 return;
 
 i = 0;
 -	argv[i++] = "/sbin/container_release_agent";
 +	argv[i++] = release_agent_path;
 argv[i++] = (char *)pathbuf;
 argv[i] = NULL;
 
 @@ -438,6 +451,7 @@ static void check_for_release(struct con
 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 if (!buf)
 return;
 +
 if (container_path(cont, buf, PAGE_SIZE) < 0)
 kfree(buf);
 else
 @@ -486,7 +500,7 @@ static int attach_task(struct container
 pid_t pid;
 struct task_struct *tsk;
 struct container *oldcont;
 -	int retval;
 +	int retval = 0;
 
 if (sscanf(pidbuf, "%d", &pid) != 1)
 return -EIO;
 @@ -513,7 +527,9 @@ static int attach_task(struct container
 get_task_struct(tsk);
 }
 
 -	retval = security_task_setscheduler(tsk, 0, NULL);
 +#ifdef CONFIG_CPUSETS
 +	retval = cpuset_can_attach_task(cont, tsk);
 +#endif
 if (retval) {
 put_task_struct(tsk);
 return retval;
 @@ -533,8 +549,16 @@ static int attach_task(struct container
 rcu_assign_pointer(tsk->container, cont);
 task_unlock(tsk);
 
 +#ifdef CONFIG_CPUSETS
 +	cpuset_attach_task(cont, tsk);
 +#endif
 +
 mutex_unlock(&callback_mutex);
 
 +#ifdef CONFIG_CPUSETS
 +	cpuset_post_attach_task(cont, oldcont, tsk);
 +#endif
 +
 put_task_struct(tsk);
 synchronize_rcu();
 if (atomic_dec_and_test(&oldcont->count))
 @@ -549,6 +573,7 @@ typedef enum {
 FILE_DIR,
 FILE_NOTIFY_ON_RELEASE,
 FILE_TASKLIST,
 +	FILE_RELEASE_AGENT,
 } container_filetype_t;
 
 static ssize_t container_common_file_write(struct container *cont,
 @@ -562,8 +587,7 @@ static ssize_t container_common_file_wri
 char *pathbuf = NULL;
 int retval = 0;
 
 -	/* Crude upper limit on largest legitimate cpulist user might write. */
 -	if (nbytes > 100 + 6 * NR_CPUS)
 +	if (nbytes >= PATH_MAX)
 return -E2BIG;
 
 /* +1 for nul-terminator */
 @@ -590,6 +614,20 @@ static ssize_t container_common_file_wri
 case FILE_TASKLIST:
 retval = attach_task(cont, buffer, &pathbuf);
 break;
 +	case FILE_RELEASE_AGENT:
 +	{
 +		if (nbytes < sizeof(release_agent_path)) {
 +			/* We never write anything other than '\0'
 +			 * into the last char of release_agent_path,
 +			 * so it always remains a NUL-terminated
 +			 * string */
 +			strncpy(release_agent_path, buffer, nbytes);
 +			release_agent_path[nbytes] = 0;
 +		} else {
 +			retval = -ENOSPC;
 +		}
 +		break;
 +	}
 default:
 retval = -EINVAL;
 goto out2;
 @@ -643,6 +681,17 @@ static ssize_t container_common_file_rea
 case FILE_NOTIFY_ON_RELEASE:
 *s++ = notify_on_release(cont) ? '1' : '0';
 break;
 +	case FILE_RELEASE_AGENT:
 +	{
 +		size_t n;
 +		container_manage_lock();
 +		n = strnlen(release_agent_path, sizeof(relea
...
 
 
 |  
	|  |  |  
	| 
		
			| [PATCH 1/7] Generic container system abstracted from cpusets code [message #8545 is a reply to message #8540] | Thu, 23 November 2006 12:08   |  
			| 
				
				
					|  Paul Menage Messages: 642
 Registered: September 2006
 | Senior Member |  |  |  
	| This patch creates a generic process container system based on (and parallel to) the cpusets code.  At a coarse level it was created by
 copying kernel/cpuset.c, doing s/cpuset/container/g, and stripping out any
 code that was cpuset-specific rather than applicable to any process
 container subsystem.
 
 Signed-off-by: Paul Menage <menage@google.com>
 
 ---
 Documentation/containers.txt |  229 +++++++
 fs/proc/base.c               |   11
 include/linux/container.h    |   96 +++
 include/linux/sched.h        |    5
 init/Kconfig                 |    9
 init/main.c                  |    3
 kernel/Makefile              |    1
 kernel/container.c           | 1343 +++++++++++++++++++++++++++++++++++++++++++
 kernel/exit.c                |    2
 kernel/fork.c                |    3
 10 files changed, 1699 insertions(+), 3 deletions(-)
 
 Index: container-2.6.19-rc5/fs/proc/base.c
 ============================================================ =======
 --- container-2.6.19-rc5.orig/fs/proc/base.c
 +++ container-2.6.19-rc5/fs/proc/base.c
 @@ -68,6 +68,7 @@
 #include <linux/security.h>
 #include <linux/ptrace.h>
 #include <linux/seccomp.h>
 +#include <linux/container.h>
 #include <linux/cpuset.h>
 #include <linux/audit.h>
 #include <linux/poll.h>
 @@ -1782,7 +1783,10 @@ static struct pid_entry tgid_base_stuff[
 #ifdef CONFIG_SCHEDSTATS
 INF("schedstat",  S_IRUGO, pid_schedstat),
 #endif
 -#ifdef CONFIG_CPUSETS
 +#ifdef CONFIG_CONTAINERS
 +	REG("container",  S_IRUGO, container),
 +#endif
 +#ifdef CONFIG_PROC_PID_CPUSET
 REG("cpuset",     S_IRUGO, cpuset),
 #endif
 INF("oom_score",  S_IRUGO, oom_score),
 @@ -2056,7 +2060,10 @@ static struct pid_entry tid_base_stuff[]
 #ifdef CONFIG_SCHEDSTATS
 INF("schedstat", S_IRUGO, pid_schedstat),
 #endif
 -#ifdef CONFIG_CPUSETS
 +#ifdef CONFIG_CONTAINERS
 +	REG("container",  S_IRUGO, container),
 +#endif
 +#ifdef CONFIG_PROC_PID_CPUSET
 REG("cpuset",    S_IRUGO, cpuset),
 #endif
 INF("oom_score", S_IRUGO, oom_score),
 Index: container-2.6.19-rc5/include/linux/container.h
 ============================================================ =======
 --- /dev/null
 +++ container-2.6.19-rc5/include/linux/container.h
 @@ -0,0 +1,96 @@
 +#ifndef _LINUX_CONTAINER_H
 +#define _LINUX_CONTAINER_H
 +/*
 + *  container interface
 + *
 + *  Copyright (C) 2003 BULL SA
 + *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 + *
 + */
 +
 +#include <linux/sched.h>
 +#include <linux/cpumask.h>
 +#include <linux/nodemask.h>
 +
 +#ifdef CONFIG_CONTAINERS
 +
 +extern int number_of_containers;	/* How many containers are defined in system? */
 +
 +extern int container_init_early(void);
 +extern int container_init(void);
 +extern void container_init_smp(void);
 +extern void container_fork(struct task_struct *p);
 +extern void container_exit(struct task_struct *p);
 +
 +extern struct file_operations proc_container_operations;
 +
 +extern void container_lock(void);
 +extern void container_unlock(void);
 +
 +extern void container_manage_lock(void);
 +extern void container_manage_unlock(void);
 +
 +struct container {
 +	unsigned long flags;		/* "unsigned long" so bitops work */
 +
 +	/*
 +	 * Count is atomic so can incr (fork) or decr (exit) without a lock.
 +	 */
 +	atomic_t count;			/* count tasks using this container */
 +
 +	/*
 +	 * We link our 'sibling' struct into our parent's 'children'.
 +	 * Our children link their 'sibling' into our 'children'.
 +	 */
 +	struct list_head sibling;	/* my parent's children */
 +	struct list_head children;	/* my children */
 +
 +	struct container *parent;	/* my parent */
 +	struct dentry *dentry;		/* container fs entry */
 +};
 +
 +/* struct cftype:
 + *
 + * The files in the container filesystem mostly have a very simple read/write
 + * handling, some common function will take care of it. Nevertheless some cases
 + * (read tasks) are special and therefore I define this structure for every
 + * kind of file.
 + *
 + *
 + * When reading/writing to a file:
 + *	- the container to use in file->f_dentry->d_parent->d_fsdata
 + *	- the 'cftype' of the file is file->f_dentry->d_fsdata
 + */
 +
 +struct inode;
 +struct cftype {
 +	char *name;
 +	int private;
 +	int (*open) (struct inode *inode, struct file *file);
 +	ssize_t (*read) (struct container *cont, struct cftype *cft,
 +			 struct file *file,
 +			 char __user *buf, size_t nbytes, loff_t *ppos);
 +	ssize_t (*write) (struct container *cont, struct cftype *cft,
 +			  struct file *file,
 +			  const char __user *buf, size_t nbytes, loff_t *ppos);
 +	int (*release) (struct inode *inode, struct file *file);
 +};
 +
 +int container_add_file(struct container *cont, const struct cftype *cft);
 +
 +int container_is_removed(const struct container *cont);
 +
 +#else /* !CONFIG_CONTAINERS */
 +
 +static inline int container_init_early(void) { return 0; }
 +static inline int container_init(void) { return 0; }
 +static inline void container_init_smp(void) {}
 +static inline void container_fork(struct task_struct *p) {}
 +static inline void container_exit(struct task_struct *p) {}
 +
 +static inline void container_lock(void) {}
 +static inline void container_unlock(void) {}
 +
 +#endif /* !CONFIG_CONTAINERS */
 +
 +#endif /* _LINUX_CONTAINER_H */
 Index: container-2.6.19-rc5/include/linux/sched.h
 ============================================================ =======
 --- container-2.6.19-rc5.orig/include/linux/sched.h
 +++ container-2.6.19-rc5/include/linux/sched.h
 @@ -719,8 +719,8 @@ extern unsigned int max_cache_size;
 
 
 struct io_context;			/* See blkdev.h */
 +struct container;
 struct cpuset;
 -
 #define NGROUPS_SMALL		32
 #define NGROUPS_PER_BLOCK	((int)(PAGE_SIZE / sizeof(gid_t)))
 struct group_info {
 @@ -1006,6 +1006,9 @@ struct task_struct {
 int cpuset_mems_generation;
 int cpuset_mem_spread_rotor;
 #endif
 +#ifdef CONFIG_CONTAINERS
 +	struct container *container;
 +#endif
 struct robust_list_head __user *robust_list;
 #ifdef CONFIG_COMPAT
 struct compat_robust_list_head __user *compat_robust_list;
 Index: container-2.6.19-rc5/init/Kconfig
 ============================================================ =======
 --- container-2.6.19-rc5.orig/init/Kconfig
 +++ container-2.6.19-rc5/init/Kconfig
 @@ -238,6 +238,15 @@ config IKCONFIG_PROC
 This option enables access to the kernel configuration file
 through /proc/config.gz.
 
 +config CONTAINERS
 +	bool "Container support"
 +	help
 +	  This option will let you create and manage process containers,
 +	  which can be used to aggregate multiple processes, e.g. for
 +	  the purposes of resource tracking.
 +
 +	  Say N if unsure
 +
 config CPUSETS
 bool "Cpuset support"
 depends on SMP
 Index: container-2.6.19-rc5/init/main.c
 ============================================================ =======
 --- container-2.6.19-rc5.orig/init/main.c
 +++ container-2.6.19-rc5/init/main.c
 @@ -38,6 +38,7 @@
 #include <linux/writeback.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 +#include <linux/container.h>
 #include <linux/efi.h>
 #include <linux/taskstats_kern.h>
 #include <linux/delayacct.h>
 @@ -568,6 +569,7 @@ asmlinkage void __init start_kernel(void
 }
 #endif
 vfs_caches_init_early();
 +	container_init_early();
 cpuset_init_early();
 mem_init();
 kmem_cache_init();
 @@ -598,6 +600,7 @@ asmlinkage void __init start_kernel(void
 #ifdef CONFIG_PROC_FS
 proc_root_init();
 #endif
 +	container_init();
 cpuset_init();
 taskstats_init_early();
 delayacct_init();
 Index: container-2.6.19-rc5/kernel/container.c
 ============================================================ =======
 --- /dev/null
 +++ container-2.6.19-rc5/kernel/container.c
 @@ -0,0 +1,1343 @@
 +/*
 + *  kernel/container.c
 + *
 + *  Generic process-grouping system.
 + *
 + *  Based originally on the cpuset system, extracted by Paul Menage
 + *  Copyright (C) 2006 Google, Inc
 + *
 + *  Copyright notices from the original cpuset code:
 + *  --------------------------------------------------
 + *  Copyright (C) 2003 BULL SA.
 + *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 + *
 + *  Portions derived from Patrick Mochel's sysfs code.
 + *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 + *
 + *  2003-10-10 Written by Simon Derr.
 + *  2003-10-22 Updates by Stephen Hemminger.
 + *  2004 May-July Rework by Paul Jackson.
 + *  ---------------------------------------------------
 + *
 + *  This file is subject to the terms and conditions of the GNU General Public
 + *  License.  See the file COPYING in the main directory of the Linux
 + *  distribution for more details.
 + */
 +
 +#include <linux/cpu.h>
 +#include <linux/cpumask.h>
 +#include <linux/container.h>
 +#include <linux/err.h>
 +#include <linux/errno.h>
 +#include <linux/file.h>
 +#include <linux/fs.h>
 +#include <linux/init.h>
 +#include <linux/interrupt.h>
 +#include <linux/kernel.h>
 +#include <linux/kmod.h>
 +#include <linux/list.h>
 +#include <linux/mempolicy.h>
 +#include <linux/mm.h>
 +#include <linux/module.h>
 +#include <linux/mount.h>
 +#include <linux/namei.h>
 +#include <linux/pagemap.h>
 +#include <linux/proc_fs.h>
 +#include <linux/rcupdate.h>
 +#include <linux/sched.h>
 +#include <linux/seq_file.h>
 +#include <linux/security.h>
 +#include <linux/slab.h>
 +#include <linux/smp_lock.h>
 +#include <linux/spinlock.h>
 +#include <linux/stat.h>
 +#include <linux/string.h>
 +#include <linux/time.h>
 +#include <linux/backing-dev.h>
 +#include <linux/sort.h>
 +
 +#include <asm/uaccess.h>
 +#include <asm/atomic.h>
 +#include <linux/mutex.h>
 +
 +#define CONTAINER_SUPER_MAGIC		0x27e0eb
 +
 +/*
 + * Tracks how many containers are currently defined in system.
 + * When there is only one container (the root container) we can
 + * short circuit some hooks.
 + */
 +int number_of_containers __read_mostly;
 +
 +/* bits in struct container fla
...
 
 
 |  
	|  |  |  
	| 
		
			| Re: [PATCH 0/7] Generic Process Containers (+ ResGroups/BeanCounters) [message #8646 is a reply to message #8540] | Thu, 30 November 2006 07:32   |  
			| 
				
				
					|  Paul Jackson Messages: 157
 Registered: February 2006
 | Senior Member |  |  |  
	| I got a chance to build and test this patch set, to see if it behaved like I expected cpusets to behave, on an ia64 SN2 Altix system.
 
 Two details - otherwise looked good.  I continue to like this
 approach.
 
 The two details are (1) /proc/<pid>/cpuset not configured by
 default if CPUSETS configured, and (2) a locking bug wedging
 tasks trying to rmdir a cpuset off the notify_on_release hook.
 
 
 1) I had to enable CONFIG_PROC_PID_CPUSET.  I used the following
 one line change to do this.  I am willing to consider, in due
 time, phasing out such legacy cpuset support.  But so long as it
 is small stuff that is not getting in anyone's way, I think we
 should take our sweet time about doing so -- as in a year or two
 after marking it deprecated or some such.  No sense deciding that
 matter now; keep the current cpuset API working throughout any
 transition to container based cpusets, then revisit the question
 of whether to deprecate and eventually remove these kernel API
 details, later on, after the major reconstruction dust settles.
 In general, we try to avoid removing kernel API's, especially if
 they are happily being used and working and not causing anyone
 grief.
 
 ============================ begin ============================
 --- 2.6.19-rc5.orig/init/Kconfig	2006-11-29 21:14:48.071114833 -0800
 +++ 2.6.19-rc5/init/Kconfig	2006-11-29 22:19:02.015166048 -0800
 @@ -268,6 +268,7 @@ config CPUSETS
 config PROC_PID_CPUSET
 bool "Include legacy /proc/<pid>/cpuset file"
 depends on CPUSETS
 +	default y if CPUSETS
 
 config CONTAINER_CPUACCT
 bool "Simple CPU accounting container subsystem"
 ============================= end =============================
 
 
 2) I wedged the kernel on the container_lock, doing a removal of a cpuset
 using notify_on_release.
 
 Right now, that test system has the following two tasks, wedged:
 
 ============================ begin ============================
 F S UID   PID PPID C PRI NI ADDR SZ  WCHAN  STIME TTY  TIME     CMD
 0 S root 4992   34 0  71 -5 -   380   wait   22:51 ?   00:00:00 /bin/sh /sbin/cpuset_release_agent /cpuset_test_tree
 0 D root 4994 4992 0  72 -5 -   200 contai   22:51 ?   00:00:00 rmdir /dev/cpuset//cpuset_test_tree
 ============================= end =============================
 
 I had a cpuset called /cpuset_test_tree, and some sub-cpusets
 below it.  I marked it 'notify_on_release' and then removed all
 tasks from it, and then removed the child cpusets that it had.
 Removing that last child cpuset presumably triggered the above
 callout to /sbin/cpuset_release_agent, which called rmdir.
 
 That wait address (from /proc/4994/stat) in hex is a0000001000f1060,
 and my System.map has the two lines:
 
 a0000001000f1040 T container_lock
 a0000001000f1360 T container_manage_unlock
 
 So it is wedged in container_lock.
 
 I have subsequently also wedged an 'ls' command trying to scan this
 /dev/cpuset directory, waiting in the kernel routine vfs_readdir
 (not surprising, given that I'm in the middle of doing a rmdir on
 that directory.)
 
 If you don't immediately see the problem, I can go back and get a
 kernel stack trace or whatever else you need.
 
 This lockup occurred the first, and thus far only, time that I tried
 to use notify_on_release to rmdir a cpuset.  So I presume it is an
 easy failure for me to reproduce.
 
 --
 I won't rest till it's the best ...
 Programmer, Linux Scalability
 Paul Jackson <pj@sgi.com> 1.925.600.0401
 |  
	|  |  |  
	|  |  
	|  |  
	| 
		
			| [Patch 1/3] Miscellaneous container fixes [message #8682 is a reply to message #8544] | Fri, 01 December 2006 16:46   |  
			| 
				
				
					|  Srivatsa Vaddagiri Messages: 241
 Registered: August 2006
 | Senior Member |  |  |  
	| This patch fixes various bugs I hit in the recently posted container patches.
 
 1. If a subsystem registers with a fork/exit hook during bootup (much
 before rcu is initialized), then the resulting synchronize_rcu() in
 container_register_subsys() hangs. Avoid this by not calling
 synchronize_rcu() if we aren't fully booted yet.
 
 2. If cpuset_create() fails for some reason, then the resulting
 call to cpuset_destroy() can trip. Avoid this by initializing
 container->...->cpuset pointer to NULL in cpuset_create().
 
 3. container_rmdir->cpuset_destroy->update_flag can deadlock on
 container_lock(). Avoid this by introducing __update_flag, which
 doesn't take container_lock().
 
 (I have also hit some lockdep warnings. Will post them after some
 review, to make sure that they are not introduced by my patches).
 
 Signed-off-by : Srivatsa Vaddagiri <vatsa@in.ibm.com>
 
 
 ---
 
 linux-2.6.19-rc6-vatsa/kernel/container.c |    4 ++-
 linux-2.6.19-rc6-vatsa/kernel/cpuset.c    |   35 ++++++++++++++++++++++++------
 2 files changed, 31 insertions(+), 8 deletions(-)
 
 diff -puN kernel/container.c~container_fixes kernel/container.c
 --- linux-2.6.19-rc6/kernel/container.c~container_fixes	2006-12-01 16:19:41.000000000 +0530
 +++ linux-2.6.19-rc6-vatsa/kernel/container.c	2006-12-01 16:20:20.000000000 +0530
 @@ -1344,6 +1344,7 @@ static long container_create(struct cont
 cont->parent = parent;
 cont->root = parent->root;
 cont->hierarchy = parent->hierarchy;
 +	cont->top_container = parent->top_container;
 
 for_each_subsys(cont->hierarchy, ss) {
 err = ss->create(ss, cont);
 @@ -1580,7 +1581,8 @@ int container_register_subsys(struct con
 if (!need_forkexit_callback &&
 (new_subsys->fork || new_subsys->exit)) {
 need_forkexit_callback = 1;
 - 		synchronize_rcu();
 +		if (system_state == SYSTEM_RUNNING)
 + 			synchronize_rcu();
 }
 
 /* If this subsystem requested that it be notified with fork
 diff -puN kernel/cpuset.c~container_fixes kernel/cpuset.c
 --- linux-2.6.19-rc6/kernel/cpuset.c~container_fixes	2006-12-01 19:02:35.000000000 +0530
 +++ linux-2.6.19-rc6-vatsa/kernel/cpuset.c	2006-12-01 20:43:52.000000000 +0530
 @@ -97,14 +97,22 @@ struct cpuset {
 /* Update the cpuset for a container */
 static inline void set_container_cs(struct container *cont, struct cpuset *cs)
 {
 -	cont->subsys[cpuset_subsys.subsys_id] = &cs->css;
 +	if (cs)
 +		cont->subsys[cpuset_subsys.subsys_id] = &cs->css;
 +	else
 +		cont->subsys[cpuset_subsys.subsys_id] = NULL;
 }
 
 /* Retrieve the cpuset for a container */
 static inline struct cpuset *container_cs(struct container *cont)
 {
 -	return container_of(container_subsys_state(cont, &cpuset_subsys),
 -			    struct cpuset, css);
 +	struct container_subsys_state *css;
 +
 +	css = container_subsys_state(cont, &cpuset_subsys);
 +	if (css)
 +		return container_of(css, struct cpuset, css);
 +	else
 +		return NULL;
 }
 
 /* Retrieve the cpuset for a task */
 @@ -698,7 +706,7 @@ static int update_memory_pressure_enable
 * Call with manage_mutex held.
 */
 
 -static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 +static int __update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 {
 int turning_on;
 struct cpuset trialcs;
 @@ -717,18 +725,27 @@ static int update_flag(cpuset_flagbits_t
 return err;
 cpu_exclusive_changed =
 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
 -	container_lock();
 if (turning_on)
 set_bit(bit, &cs->flags);
 else
 clear_bit(bit, &cs->flags);
 -	container_unlock();
 
 if (cpu_exclusive_changed)
 update_cpu_domains(cs);
 return 0;
 }
 
 +static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 +{
 +	int err;
 +
 +	container_lock();
 +	err = __update_flag(bit, cs, buf);
 +	container_unlock();
 +
 +	return err;
 +}
 +
 /*
 * Frequency meter - How fast is some event occurring?
 *
 @@ -1165,6 +1182,7 @@ int cpuset_create(struct container_subsy
 return 0;
 }
 parent = container_cs(cont->parent);
 +	set_container_cs(cont, NULL);
 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
 if (!cs)
 return -ENOMEM;
 @@ -1202,9 +1220,12 @@ void cpuset_destroy(struct container_sub
 {
 struct cpuset *cs = container_cs(cont);
 
 +	if (!cs)
 +		return;
 +
 cpuset_update_task_memory_state();
 if (is_cpu_exclusive(cs)) {
 -		int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0");
 +		int retval = __update_flag(CS_CPU_EXCLUSIVE, cs, "0");
 BUG_ON(retval);
 }
 number_of_cpusets--;
 _
 
 
 --
 Regards,
 vatsa
 |  
	|  |  |  
	| 
		
			| [Patch 2/3] cpu controller (v3) - based on RTLIMIT_RT_CPU patch [message #8683 is a reply to message #8682] | Fri, 01 December 2006 16:51  |  
			| 
				
				
					|  Srivatsa Vaddagiri Messages: 241
 Registered: August 2006
 | Senior Member |  |  |  
	| Nick/Ingo, 
 Here's another approach for a minimal cpu controller. Would greatly
 appreciate any feedback as before.
 
 This version is inspired by Ingo's RTLIMIT_RT_CPU patches found here:
 
 http://kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.11-rc2/2.6.11-rc2-mm2/broken-out/rlimit_rt_cpu.patch
 
 This patch is also about 80% smaller than the patches I had posted earlier:
 
 http://lkml.org/lkml/2006/9/28/236
 
 Primary differences between Ingo's RT_LIMIT_CPU patch and this one:
 
 - This patch handles starvation of lower priority tasks in a group.
 - This patch uses tokens for accounting cpu consumption. I didn't
 get good results with the decaying avg approach used in the rtlimit patches.
 - Task grouping based on cpuset/containers and not on rtlimit.
 
 Other features:
 
 - Retains one-runqueue-per-cpu, as is prevalent today
 - scheduling no longer of O(1) complexity, similar to rtlimit patches.
 This can be avoided if we use separate runqueues for different groups
 (as was done in http://lkml.org/lkml/2006/9/28/236)
 - Only limit supported (no guarantee)
 
 Unsupported feature:
 
 - SMP load balance aware of group limits. This can be handled using
 smpnice later if required (http://lkml.org/lkml/2006/9/28/244)
 
 
 Signed-off-by : Srivatsa Vaddagiri <vatsa@in.ibm.com>
 
 ---
 
 linux-2.6.19-rc6-vatsa/include/linux/sched.h |    3
 linux-2.6.19-rc6-vatsa/kernel/sched.c        |  195 ++++++++++++++++++++++++++-
 2 files changed, 195 insertions(+), 3 deletions(-)
 
 diff -puN include/linux/sched.h~cpu_ctlr include/linux/sched.h
 --- linux-2.6.19-rc6/include/linux/sched.h~cpu_ctlr	2006-12-01 20:45:08.000000000 +0530
 +++ linux-2.6.19-rc6-vatsa/include/linux/sched.h	2006-12-01 20:45:08.000000000 +0530
 @@ -1095,6 +1095,9 @@ static inline void put_task_struct(struc
 /* Not implemented yet, only for 486*/
 #define PF_STARTING	0x00000002	/* being created */
 #define PF_EXITING	0x00000004	/* getting shut down */
 +#ifdef CONFIG_CPUMETER
 +#define PF_STARVING	0x00000010      /* Task starving for CPU */
 +#endif
 #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
 #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
 #define PF_DUMPCORE	0x00000200	/* dumped core */
 diff -puN kernel/sched.c~cpu_ctlr kernel/sched.c
 --- linux-2.6.19-rc6/kernel/sched.c~cpu_ctlr	2006-12-01 20:45:08.000000000 +0530
 +++ linux-2.6.19-rc6-vatsa/kernel/sched.c	2006-12-01 20:45:08.000000000 +0530
 @@ -264,10 +264,25 @@ struct rq {
 unsigned long ttwu_local;
 #endif
 struct lock_class_key rq_lock_key;
 +#ifdef CONFIG_CPUMETER
 +	unsigned long last_update;
 +	int need_recheck;
 +#endif
 };
 
 static DEFINE_PER_CPU(struct rq, runqueues);
 
 +struct cpu_usage {
 +	long tokens;
 +	unsigned long last_update;
 +	int starve_count;
 +};
 +
 +struct task_grp {
 +	unsigned long limit;
 +	struct cpu_usage *cpu_usage;	/* per-cpu ptr */
 +};
 +
 static inline int cpu_of(struct rq *rq)
 {
 #ifdef CONFIG_SMP
 @@ -705,6 +720,137 @@ enqueue_task_head(struct task_struct *p,
 p->array = array;
 }
 
 +#ifdef CONFIG_CPUMETER
 +
 +#define task_starving(p)	(p->flags & PF_STARVING)
 +
 +static inline struct task_grp *task_grp(struct task_struct *p)
 +{
 +	return NULL;
 +}
 +
 +/* Mark a task starving - either we shortcircuited its timeslice or we didnt
 + * pick it to run (because its group ran out of bandwidth limit).
 + */
 +static inline void set_tsk_starving(struct task_struct *p, struct task_grp *grp)
 +{
 +	struct cpu_usage *grp_usage;
 +
 +	if (task_starving(p))
 +		return;
 +
 +	BUG_ON(!grp);
 +	grp_usage = per_cpu_ptr(grp->cpu_usage, task_cpu(p));
 +	grp_usage->starve_count++;
 +	p->flags |= PF_STARVING;
 +}
 +
 +/* Clear a task's starving flag */
 +static inline void clear_tsk_starving(struct task_struct *p,
 +						 struct task_grp *grp)
 +{
 +	struct cpu_usage *grp_usage;
 +
 +	if (!task_starving(p))
 +		return;
 +
 +	BUG_ON(!grp);
 +	grp_usage = per_cpu_ptr(grp->cpu_usage, task_cpu(p));
 +	grp_usage->starve_count--;
 +	p->flags &= ~PF_STARVING;
 +}
 +
 +/* Does the task's group have starving tasks? */
 +static inline int is_grp_starving(struct task_struct *p)
 +{
 +	struct task_grp *grp = task_grp(p);
 +	struct cpu_usage *grp_usage;
 +
 +	if (!grp)
 +		return 0;
 +
 +	grp_usage = per_cpu_ptr(grp->cpu_usage, task_cpu(p));
 +	if (grp_usage->starve_count)
 +		return 1;
 +
 +	return 0;
 +}
 +
 +/* Are we past the 1-sec control window? If so, all groups get to renew their
 + * expired tokens.
 + */
 +static inline void adjust_control_window(struct task_struct *p)
 +{
 +	struct rq *rq = task_rq(p);
 +	unsigned long delta;
 +
 +	delta = jiffies - rq->last_update;
 +	if (delta >= HZ) {
 +		rq->last_update += (delta/HZ) * HZ;
 +		rq->need_recheck = 1;
 +	}
 +}
 +
 +/* Account group's cpu usage */
 +static inline void inc_cpu_usage(struct task_struct *p)
 +{
 +	struct task_grp *grp = task_grp(p);
 +	struct cpu_usage *grp_usage;
 +
 +	adjust_control_window(p);
 +
 +	if (!grp || !grp->limit || rt_task(p))
 +		return;
 +
 +	grp_usage = per_cpu_ptr(grp->cpu_usage, task_cpu(p));
 +	grp_usage->tokens--;
 +}
 +
 +static inline int task_over_cpu_limit(struct task_struct *p)
 +{
 +	struct rq *rq = task_rq(p);
 +	struct task_grp *grp = task_grp(p);
 +	struct cpu_usage *grp_usage;
 +
 +	adjust_control_window(p);
 +
 +	if (!grp || !grp->limit || !rq->need_recheck)
 +	 	return 0;
 +
 +	grp_usage = per_cpu_ptr(grp->cpu_usage, task_cpu(p));
 +	if (grp_usage->last_update != rq->last_update) {
 +		/* Replenish tokens */
 +		grp_usage->tokens = grp->limit * HZ / 100;
 +		grp_usage->last_update = rq->last_update;
 +	}
 +
 +	if (grp_usage->tokens <= 0)
 +		return 1;
 +
 +	return 0;
 +}
 +
 +static inline void rq_set_recheck(struct rq *rq, int check)
 +{
 +	rq->need_recheck = check;
 +}
 +
 +#else
 +
 +#define task_starving(p)	0
 +
 +struct task_grp;
 +
 +static struct task_grp *task_grp(struct task_struct *p) { return NULL;}
 +static void inc_cpu_usage(struct task_struct *p) { }
 +static int task_over_cpu_limit(struct task_struct *p) { return 0; }
 +static void set_tsk_starving(struct task_struct *p, struct task_grp *grp) { }
 +static void clear_tsk_starving(struct task_struct *p, struct task_grp *grp) { }
 +static int is_grp_starving(struct task_struct *p) { return 0;}
 +static inline void rq_set_recheck(struct rq *rq, int check) { }
 +
 +#endif		/* CONFIG_CPUMETER */
 +
 /*
 * __normal_prio - return the priority that is based on the static
 * priority but is modified by bonuses/penalties.
 @@ -847,6 +993,7 @@ static void __activate_task(struct task_
 target = rq->expired;
 enqueue_task(p, target);
 inc_nr_running(p, rq);
 +	rq_set_recheck(rq, 1);
 }
 
 /*
 @@ -1586,6 +1733,9 @@ void fastcall sched_fork(struct task_str
 /* Want to start with kernel preemption disabled. */
 task_thread_info(p)->preempt_count = 1;
 #endif
 +#ifdef CONFIG_CPUMETER
 +	p->flags &= ~PF_STARVING;
 +#endif
 /*
 * Share the timeslice between parent and child, thus the
 * total amount of pending timeslices in the system doesn't change,
 @@ -2047,6 +2197,8 @@ static void pull_task(struct rq *src_rq,
 {
 dequeue_task(p, src_array);
 dec_nr_running(p, src_rq);
 +	clear_tsk_starving(p, task_grp(p));
 +	rq_set_recheck(this_rq, 1);
 set_task_cpu(p, this_cpu);
 inc_nr_running(p, this_rq);
 enqueue_task(p, this_array);
 @@ -3068,6 +3220,9 @@ void scheduler_tick(void)
 goto out;
 }
 spin_lock(&rq->lock);
 +
 +	inc_cpu_usage(p);
 +
 /*
 * The task was running during this tick - update the
 * time slice counter. Note: we do not update a thread's
 @@ -3094,17 +3249,18 @@ void scheduler_tick(void)
 dequeue_task(p, rq->active);
 set_tsk_need_resched(p);
 p->prio = effective_prio(p);
 -		p->time_slice = task_timeslice(p);
 p->first_time_slice = 0;
 
 if (!rq->expired_timestamp)
 rq->expired_timestamp = jiffies;
 -		if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
 +		if (!TASK_INTERACTIVE(p) || expired_starving(rq)
 +						|| task_over_cpu_limit(p)) {
 enqueue_task(p, rq->expired);
 if (p->static_prio < rq->best_expired_prio)
 rq->best_expired_prio = p->static_prio;
 } else
 enqueue_task(p, rq->active);
 +		goto out_unlock;
 } else {
 /*
 * Prevent a too long timeslice allowing a task to monopolize
 @@ -3131,6 +3287,14 @@ void scheduler_tick(void)
 set_tsk_need_resched(p);
 }
 }
 +
 +	if (task_over_cpu_limit(p)) {
 +		dequeue_task(p, rq->active);
 +		set_tsk_need_resched(p);
 +		enqueue_task(p, rq->expired);
 +		set_tsk_starving(p, task_grp(p));
 +	}
 +
 out_unlock:
 spin_unlock(&rq->lock);
 out:
 @@ -3320,7 +3484,7 @@ asmlinkage void __sched schedule(void)
 struct list_head *queue;
 unsigned long long now;
 unsigned long run_time;
 -	int cpu, idx, new_prio;
 +	int cpu, idx, new_prio, array_switch;
 long *switch_count;
 struct rq *rq;
 
 @@ -3379,6 +3543,7 @@ need_resched_nonpreemptible:
 else {
 if (prev->state == TASK_UNINTERRUPTIBLE)
 rq->nr_uninterruptible++;
 +			clear_tsk_starving(prev, task_grp(prev));
 deactivate_task(prev, rq);
 }
 }
 @@ -3394,11 +3559,15 @@ need_resched_nonpreemptible:
 }
 }
 
 +	array_switch = 0;
 +
 +pick_next_task:
 array = rq->active;
 if (unlikely(!array->nr_active)) {
 /*
 * Switch the active and expired arrays.
 */
 +		array_switch++;
 schedstat_inc(rq, sched_switch);
 rq->active = rq->expired;
 rq->expired = array;
 @@ -3411,6 +3580,25 @@ need_resched_nonpreemptible:
 queue = array->queue + idx;
 next = list_entry(queue->next, struct task_struct, run_list);
 
 +	/* If we have done an array switch twice, it means we cant find any
 +	 * task which isn't above its limit and hence we just run the
 +	 * first task on the active array.
 +	 */
 +	if (array_switch < 2 && (task_over_cpu_limit(next) ||
 +			(!task_starving(next) && is_grp_starving(next)))) {
 +		dequeue_task(next, rq->active);
 +		enqueue_task(next, rq->expired);
 +		if (next->time_slice)
 +			set_tsk_starving(next, task_grp(next));
 +		goto pick_next_task;
 +	}
 +
 +	if (task_over_c
...
 
 
 |  
	|  |  |  
	| 
		
			| [PATCH 6/7] Split Cpusets into Cpusets and Memsets [message #16760 is a reply to message #8540] | Thu, 23 November 2006 12:08  |  
			| 
				
				
					|  Paul Menage Messages: 642
 Registered: September 2006
 | Senior Member |  |  |  
	| This patch splits the Cpusets container subsystem into two independent
subsystems, Cpusets and Memsets. Currently the cpu and memory node control
functionality in Cpusets are pretty much disjoint and unrelated; now
that the common process container abstraction has been moved out,
there's no particular reason to keep them together in the same
subsystem.
Signed-off-by: Paul Menage <menage@google.com>
---
 fs/proc/array.c           |    2 
 include/linux/cpuset.h    |   75 --
 include/linux/mempolicy.h |    2 
 include/linux/memset.h    |  125 ++++
 include/linux/sched.h     |   10 
 init/Kconfig              |   14 
 init/main.c               |    3 
 kernel/Makefile           |    1 
 kernel/cpuset.c           |  994 +--------------------------------
 kernel/memset.c           | 1352 ++++++++++++++++++++++++++++++++++++++++++++++
 mm/filemap.c              |    6 
 mm/hugetlb.c              |    4 
 mm/memory_hotplug.c       |    4 
 mm/mempolicy.c            |   36 -
 mm/migrate.c              |    4 
 mm/oom_kill.c             |   14 
 mm/page_alloc.c           |   26 
 mm/slab.c                 |   12 
 mm/vmscan.c               |   10 
 19 files changed, 1593 insertions(+), 1101 deletions(-)
Index: container-2.6.19-rc6/include/linux/cpuset.h
===================================================================
--- container-2.6.19-rc6.orig/include/linux/cpuset.h
+++ container-2.6.19-rc6/include/linux/cpuset.h
@@ -10,58 +10,22 @@
 
 #include <linux/sched.h>
 #include <linux/cpumask.h>
-#include <linux/nodemask.h>
 #include <linux/container.h>
 
 #ifdef CONFIG_CPUSETS
 
-extern int number_of_cpusets;  /* How many cpusets are defined in system? */
-
 extern int cpuset_init_early(void);
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern cpumask_t cpuset_cpus_allowed(struct task_struct *p);
-extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
-void cpuset_init_current_mems_allowed(void);
-void cpuset_update_task_memory_state(void);
-#define cpuset_nodes_subset_current_mems_allowed(nodes) \
-		nodes_subset((nodes), current->mems_allowed)
-int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
-
-extern int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask);
-static int inline cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
-{
-	return number_of_cpusets <= 1 || __cpuset_zone_allowed(z, gfp_mask);
-}
 
 extern int cpuset_excl_nodes_overlap(const struct task_struct *p);
 
-#define cpuset_memory_pressure_bump() 				\
-	do {							\
-		if (cpuset_memory_pressure_enabled)		\
-			__cpuset_memory_pressure_bump();	\
-	} while (0)
-extern int cpuset_memory_pressure_enabled;
-extern void __cpuset_memory_pressure_bump(void);
-
 extern struct file_operations proc_cpuset_operations;
 extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
-extern int cpuset_mem_spread_node(void);
-
-static inline int cpuset_do_page_mem_spread(void)
-{
-	return current->flags & PF_SPREAD_PAGE;
-}
-
-static inline int cpuset_do_slab_mem_spread(void)
-{
-	return current->flags & PF_SPREAD_SLAB;
-}
 
 extern void cpuset_track_online_nodes(void);
 
-extern int current_cpuset_is_being_rebound(void);
-
 #else /* !CONFIG_CPUSETS */
 
 static inline int cpuset_init_early(void) { return 0; }
@@ -73,60 +37,21 @@ static inline cpumask_t cpuset_cpus_allo
 	return cpu_possible_map;
 }
 
-static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
-{
-	return node_possible_map;
-}
-
-static inline void cpuset_init_current_mems_allowed(void) {}
-static inline void cpuset_update_task_memory_state(void) {}
-#define cpuset_nodes_subset_current_mems_allowed(nodes) (1)
-
-static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
-{
-	return 1;
-}
-
-static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
-{
-	return 1;
-}
 
 static inline int cpuset_excl_nodes_overlap(const struct task_struct *p)
 {
 	return 1;
 }
 
-static inline void cpuset_memory_pressure_bump(void) {}
-
 static inline char *cpuset_task_status_allowed(struct task_struct *task,
 							char *buffer)
 {
 	return buffer;
 }
 
-static inline int cpuset_mem_spread_node(void)
-{
-	return 0;
-}
-
-static inline int cpuset_do_page_mem_spread(void)
-{
-	return 0;
-}
-
-static inline int cpuset_do_slab_mem_spread(void)
-{
-	return 0;
-}
 
 static inline void cpuset_track_online_nodes(void) {}
 
-static inline int current_cpuset_is_being_rebound(void)
-{
-	return 0;
-}
-
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
Index: container-2.6.19-rc6/include/linux/memset.h
===================================================================
--- /dev/null
+++ container-2.6.19-rc6/include/linux/memset.h
@@ -0,0 +1,125 @@
+#ifndef _LINUX_MEMSET_H
+#define _LINUX_MEMSET_H
+/*
+ *  memset interface
+ *
+ *  Copyright (C) 2003 BULL SA
+ *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/nodemask.h>
+#include <linux/container.h>
+
+#ifdef CONFIG_MEMSETS
+
+extern int number_of_memsets;  /* How many memsets are defined in system? */
+
+extern int memset_init_early(void);
+extern int memset_init(void);
+extern void memset_init_smp(void);
+extern nodemask_t memset_mems_allowed(struct task_struct *p);
+void memset_init_current_mems_allowed(void);
+void memset_update_task_memory_state(void);
+#define memset_nodes_subset_current_mems_allowed(nodes) \
+		nodes_subset((nodes), current->mems_allowed)
+int memset_zonelist_valid_mems_allowed(struct zonelist *zl);
+
+extern int __memset_zone_allowed(struct zone *z, gfp_t gfp_mask);
+static int inline memset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+{
+	return number_of_memsets <= 1 || __memset_zone_allowed(z, gfp_mask);
+}
+
+extern int memset_excl_nodes_overlap(const struct task_struct *p);
+
+#define memset_memory_pressure_bump() 				\
+	do {							\
+		if (memset_memory_pressure_enabled)		\
+			__memset_memory_pressure_bump();	\
+	} while (0)
+extern int memset_memory_pressure_enabled;
+extern void __memset_memory_pressure_bump(void);
+
+extern struct file_operations proc_memset_operations;
+extern char *memset_task_status_allowed(struct task_struct *task, char *buffer);
+extern int memset_mem_spread_node(void);
+
+static inline int memset_do_page_mem_spread(void)
+{
+	return current->flags & PF_SPREAD_PAGE;
+}
+
+static inline int memset_do_slab_mem_spread(void)
+{
+	return current->flags & PF_SPREAD_SLAB;
+}
+
+extern void memset_track_online_nodes(void);
+
+extern int current_memset_is_being_rebound(void);
+
+#else /* !CONFIG_MEMSETS */
+
+static inline int memset_init_early(void) { return 0; }
+static inline int memset_init(void) { return 0; }
+static inline void memset_init_smp(void) {}
+
+static inline nodemask_t memset_mems_allowed(struct task_struct *p)
+{
+	return node_possible_map;
+}
+
+static inline void memset_init_current_mems_allowed(void) {}
+static inline void memset_update_task_memory_state(void) {}
+#define memset_nodes_subset_current_mems_allowed(nodes) (1)
+
+static inline int memset_zonelist_valid_mems_allowed(struct zonelist *zl)
+{
+	return 1;
+}
+
+static inline int memset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+{
+	return 1;
+}
+
+static inline int memset_excl_nodes_overlap(const struct task_struct *p)
+{
+	return 1;
+}
+
+static inline void memset_memory_pressure_bump(void) {}
+
+static inline char *memset_task_status_allowed(struct task_struct *task,
+							char *buffer)
+{
+	return buffer;
+}
+
+static inline int memset_mem_spread_node(void)
+{
+	return 0;
+}
+
+static inline int memset_do_page_mem_spread(void)
+{
+	return 0;
+}
+
+static inline int memset_do_slab_mem_spread(void)
+{
+	return 0;
+}
+
+static inline void memset_track_online_nodes(void) {}
+
+static inline int current_memset_is_being_rebound(void)
+{
+	return 0;
+}
+
+#endif /* !CONFIG_MEMSETS */
+
+#endif /* _LINUX_MEMSET_H */
Index: container-2.6.19-rc6/kernel/cpuset.c
===================================================================
--- container-2.6.19-rc6.orig/kernel/cpuset.c
+++ container-2.6.19-rc6/kernel/cpuset.c
@@ -32,7 +32,6 @@
 #include <linux/kernel.h>
 #include <linux/kmod.h>
 #include <linux/list.h>
-#include <linux/mempolicy.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/mount.h>
@@ -56,42 +55,18 @@
 #include <asm/atomic.h>
 #include <linux/mutex.h>
 
-/*
- * Tracks how many cpusets are currently defined in system.
- * When there is only one cpuset (the root cpuset) we can
- * short circuit some hooks.
- */
-int number_of_cpusets __read_mostly;
-
 /* Retrieve the cpuset from a container */
 static struct container_subsys cpuset_subsys;
 struct cpuset;
 
-/* See "Frequency meter" comments, below. */
-
-struct fmeter {
-	int cnt;		/* unprocessed events count */
-	int val;		/* most recent output value */
-	time_t time;		/* clock (secs) when val computed */
-	spinlock_t lock;	/* guards read or write of above */
-};
-
 struct cpuset {
 	struct container_subsys_state css;
 
 	unsigned long flags;		/* "unsigned long" so bitops work */
 	cpumask_t cpus_allowed;		/* CPUs allowed to tasks in cpuset */
-	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */
 
 	struct cpuset *parent;		/* my parent */
 
-	/*
-	 * Copy of global cpuset_mems_generation as of the most
-	 * recent time this cpuset changed its mems_allowed.
-	 */
-	int mems_generation;
-
-	struct fmeter fmeter;		/* memory_pressure filter */
 };
 
 /* Update the cpuset for a container */
@@ -117,10 +92,6 @@ static inline struct cpuset *task_cs(str
 /* bits in struct cpuset flags field */
 typedef enum {
 	CS_CPU_EXCLUSIVE,
-	CS_MEM_EXCLUSIVE,
-	CS_MEMORY_MIGRATE,
-	CS_SPREAD_PAGE,
-	CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
 
 /* convenient tests for these bits */
@@ -129,51 +100,10 @@ static inline int is_cpu_exclusive(const
 	return test_bit(CS_CPU_EXC...
 
 |  
	|  |  | 
 
 
 Current Time: Sun Oct 26 14:48:34 GMT 2025 
 Total time taken to generate the page: 0.08606 seconds |