Home » Mailing lists » Devel » [PATCH 01/10] Containers(V10): Basic container framework
Re: [PATCH 01/10] Containers(V10): Basic container framework [message #13569 is a reply to message #13542] |
Wed, 30 May 2007 07:15 |
Andrew Morton
Messages: 127 Registered: December 2005
|
Senior Member |
|
|
On Tue, 29 May 2007 06:01:05 -0700 menage@google.com wrote:
> This patch adds the main containers framework - the container
> filesystem, and the basic structures for tracking membership and
> associating subsystem state objects to tasks.
>
> ...
>
> --- /dev/null
> +++ container-2.6.22-rc2-mm1/include/linux/container_subsys.h
> @@ -0,0 +1,10 @@
> +/* Add subsystem definitions of the form SUBSYS(<name>) in this
> + * file. Surround each one by a line of comment markers so that
> + * patches don't collide
> + */
> +
> +/* */
> +
> +/* */
> +
> +/* */
> Index: container-2.6.22-rc2-mm1/include/linux/sched.h
> ============================================================ =======
> --- container-2.6.22-rc2-mm1.orig/include/linux/sched.h
> +++ container-2.6.22-rc2-mm1/include/linux/sched.h
> @@ -851,6 +851,34 @@ struct sched_class {
> void (*task_new) (struct rq *rq, struct task_struct *p);
> };
>
> +#ifdef CONFIG_CONTAINERS
> +
> +#define SUBSYS(_x) _x ## _subsys_id,
> +enum container_subsys_id {
> +#include <linux/container_subsys.h>
> + CONTAINER_SUBSYS_COUNT
> +};
> +#undef SUBSYS
> +
> +/* A css_group is a structure holding pointers to a set of
> + * container_subsys_state objects.
> + */
> +
> +struct css_group {
> +
> + /* Set of subsystem states, one for each subsystem. NULL for
> + * subsystems that aren't part of this hierarchy. These
> + * pointers reduce the number of dereferences required to get
> + * from a task to its state for a given container, but result
> + * in increased space usage if tasks are in wildly different
> + * groupings across different hierarchies. This array is
> + * immutable after creation */
> + struct container_subsys_state *subsys[CONTAINER_SUBSYS_COUNT];
> +
> +};
hm, missing forward declaration of struct container_subsys_state, but it all
seems to work out (via nested include) once the patches are applied and the
config option is enableable.
>
> ...
>
> --- /dev/null
> +++ container-2.6.22-rc2-mm1/kernel/container.c
>
> ...
>
> +#include <linux/cpu.h>
> +#include <linux/cpumask.h>
> +#include <linux/container.h>
> +#include <linux/err.h>
> +#include <linux/errno.h>
> +#include <linux/file.h>
> +#include <linux/fs.h>
> +#include <linux/init.h>
> +#include <linux/interrupt.h>
> +#include <linux/kernel.h>
> +#include <linux/kmod.h>
> +#include <linux/list.h>
> +#include <linux/mempolicy.h>
> +#include <linux/mm.h>
> +#include <linux/module.h>
> +#include <linux/mount.h>
> +#include <linux/namei.h>
> +#include <linux/pagemap.h>
> +#include <linux/proc_fs.h>
> +#include <linux/rcupdate.h>
> +#include <linux/sched.h>
> +#include <linux/seq_file.h>
> +#include <linux/security.h>
> +#include <linux/slab.h>
> +#include <linux/smp_lock.h>
> +#include <linux/spinlock.h>
> +#include <linux/stat.h>
> +#include <linux/string.h>
> +#include <linux/time.h>
> +#include <linux/backing-dev.h>
> +#include <linux/sort.h>
> +
> +#include <asm/uaccess.h>
> +#include <asm/atomic.h>
> +#include <linux/mutex.h>
Holy cow, do we need all those?
> +typedef enum {
> + CONT_REMOVED,
> +} container_flagbits_t;
typedefs are verboten. Fortunately this one is never referred to - only
the values are used, so we can delete it.
>
> ...
>
> +static void container_clear_directory(struct dentry *dentry)
> +{
> + struct list_head *node;
> + BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
> + spin_lock(&dcache_lock);
> + node = dentry->d_subdirs.next;
> + while (node != &dentry->d_subdirs) {
> + struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
> + list_del_init(node);
> + if (d->d_inode) {
> + /* This should never be called on a container
> + * directory with child containers */
> + BUG_ON(d->d_inode->i_mode & S_IFDIR);
> + d = dget_locked(d);
> + spin_unlock(&dcache_lock);
> + d_delete(d);
> + simple_unlink(dentry->d_inode, d);
> + dput(d);
> + spin_lock(&dcache_lock);
> + }
> + node = dentry->d_subdirs.next;
> + }
> + spin_unlock(&dcache_lock);
> +}
> +
> +/*
> + * NOTE : the dentry must have been dget()'ed
> + */
> +static void container_d_remove_dir(struct dentry *dentry)
> +{
> + container_clear_directory(dentry);
> +
> + spin_lock(&dcache_lock);
> + list_del_init(&dentry->d_u.d_child);
> + spin_unlock(&dcache_lock);
> + remove_dir(dentry);
> +}
Taking dcache_lock in here is unfortunate. A filesystem really shouldn't
be playing with that lock.
But about 20 filesystems do so. Ho hum.
> +static int rebind_subsystems(struct containerfs_root *root,
> + unsigned long final_bits)
The code's a bit short on comments.
> +{
> + unsigned long added_bits, removed_bits;
> + struct container *cont = &root->top_container;
> + int i;
> +
> + removed_bits = root->subsys_bits & ~final_bits;
> + added_bits = final_bits & ~root->subsys_bits;
> + /* Check that any added subsystems are currently free */
> + for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++) {
> + unsigned long long bit = 1ull << i;
> + struct container_subsys *ss = subsys[i];
> + if (!(bit & added_bits))
> + continue;
> + if (ss->root != &rootnode) {
> + /* Subsystem isn't free */
> + return -EBUSY;
> + }
> + }
> +
> + /* Currently we don't handle adding/removing subsystems when
> + * any subcontainers exist. This is theoretically supportable
> + * but involves complex erro r handling, so it's being left until
> + * later */
> + if (!list_empty(&cont->children)) {
> + return -EBUSY;
> + }
> +
> + /* Process each subsystem */
> + for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++) {
> + struct container_subsys *ss = subsys[i];
> + unsigned long bit = 1UL << i;
> + if (bit & added_bits) {
> + /* We're binding this subsystem to this hierarchy */
> + BUG_ON(cont->subsys[i]);
> + BUG_ON(!dummytop->subsys[i]);
> + BUG_ON(dummytop->subsys[i]->container != dummytop);
> + cont->subsys[i] = dummytop->subsys[i];
> + cont->subsys[i]->container = cont;
> + list_add(&ss->sibling, &root->subsys_list);
> + rcu_assign_pointer(ss->root, root);
> + if (ss->bind)
> + ss->bind(ss, cont);
> +
> + } else if (bit & removed_bits) {
> + /* We're removing this subsystem */
> + BUG_ON(cont->subsys[i] != dummytop->subsys[i]);
> + BUG_ON(cont->subsys[i]->container != cont);
> + if (ss->bind)
> + ss->bind(ss, dummytop);
> + dummytop->subsys[i]->container = dummytop;
> + cont->subsys[i] = NULL;
> + rcu_assign_pointer(subsys[i]->root, &rootnode);
> + list_del(&ss->sibling);
> + } else if (bit & final_bits) {
> + /* Subsystem state should already exist */
> + BUG_ON(!cont->subsys[i]);
> + } else {
> + /* Subsystem state shouldn't exist */
> + BUG_ON(cont->subsys[i]);
> + }
> + }
> + root->subsys_bits = final_bits;
> + synchronize_rcu();
> +
> + return 0;
> +}
>
> ...
>
> +static int container_remount(struct super_block *sb, int *flags, char *data)
> +{
> + int ret = 0;
> + unsigned long subsys_bits;
> + struct containerfs_root *root = sb->s_fs_info;
> + struct container *cont = &root->top_container;
> +
> + mutex_lock(&cont->dentry->d_inode->i_mutex);
> + mutex_lock(&container_mutex);
So container_mutex nests inside i_mutex. That mean that we'll get lockdep
moaning if anyone does a __GFP_FS allocation inside container_mutex (some
filesystems can take i_mutex on the ->writepage path, iirc).
Probably a false positive, we can cross that bridege if/when we come to it.
> + /* See what subsystems are wanted */
> + ret = parse_containerfs_options(data, &subsys_bits);
> + if (ret)
> + goto out_unlock;
> +
> + ret = rebind_subsystems(root, subsys_bits);
> +
> + /* (re)populate subsystem files */
> + if (!ret)
> + container_populate_dir(cont);
> +
> + out_unlock:
> + mutex_unlock(&container_mutex);
> + mutex_unlock(&cont->dentry->d_inode->i_mutex);
> + return ret;
> +}
> +
>
> ...
>
> +
> +static int container_fill_super(struct super_block *sb, void *options,
> + int unused_silent)
> +{
> + struct inode *inode;
> + struct dentry *root;
> + struct containerfs_root *hroot = options;
> +
> + sb->s_blocksize = PAGE_CACHE_SIZE;
> + sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
> + sb->s_magic = CONTAINER_SUPER_MAGIC;
> + sb->s_op = &container_ops;
> +
> + inode = container_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
> + if (!inode)
> + return -ENOMEM;
> +
> + inode->i_op = &simple_dir_inode_operations;
> + inode->i_fop = &simple_dir_operations;
> + inode->i_op = &container_dir_inode_operations;
> + /* directories start off with i_nlink == 2 (for "." entry) */
> + inc_nlink(inode);
> +
> + root = d_alloc_root(inode);
> + if (!root) {
> + iput(inode);
> + return -ENOMEM;
I bet that iput() hasn't been tested ;)
People have hit unpleasant prob
...
|
|
|
Goto Forum:
Current Time: Tue Oct 08 13:18:42 GMT 2024
Total time taken to generate the page: 0.05417 seconds
|