OpenVZ Forum



Members   Search      Help    Register    Login    Home
Today's Messages (off)  | Unanswered Messages (on)

Forum: Devel
 Topic: [PATCH 10/33] task containersv11 automatic userspace notification of idle containers
[PATCH 10/33] task containersv11 automatic userspace notification of idle containers [message #20415] Mon, 17 September 2007 17:03
Paul Menage is currently offline Paul Menage
Messages: 642
Registered: September 2006
Senior Member
From: openvz.org
Add the following files to the cgroup filesystem:

notify_on_release - configures/reports whether the cgroup subsystem should
attempt to run a release script when this cgroup becomes unused

release_agent - configures/reports the release agent to be used for this
hierarchy (top level in each hierarchy only)

releasable - reports whether this cgroup would have been auto-released if
notify_on_release was true and a release agent was configured (mainly useful
for debugging)

To avoid locking issues, invoking the userspace release agent is done via a
workqueue task; cgroups that need to have their release agents invoked by
the workqueue task are linked on to a list.

Signed-off-by: Paul Menage <menage@google.com>
---

 include/linux/cgroup.h |   11 
 kernel/cgroup.c        |  425 ++++++++++++++++++++++++++++++++----
 2 files changed, 393 insertions(+), 43 deletions(-)

diff -puN include/linux/cgroup.h~task-cgroupsv11-automatic-userspace-notification-of-idle-cgroups include/linux/cgroup.h
--- a/include/linux/cgroup.h~task-cgroupsv11-automatic-userspace-notification-of-idle-cgroups
+++ a/include/linux/cgroup.h
@@ -77,10 +77,11 @@ static inline void css_get(struct contai
  * css_get()
  */
 
+extern void __css_put(struct cgroup_subsys_state *css);
 static inline void css_put(struct cgroup_subsys_state *css)
 {
 	if (!test_bit(CSS_ROOT, &css->flags))
-		atomic_dec(&css->refcnt);
+		__css_put(css);
 }
 
 struct cgroup {
@@ -112,6 +113,13 @@ struct cgroup {
 	 * tasks in this cgroup. Protected by css_set_lock
 	 */
 	struct list_head css_sets;
+
+	/*
+	 * Linked list running through all cgroups that can
+	 * potentially be reaped by the release agent. Protected by
+	 * release_list_lock
+	 */
+	struct list_head release_list;
 };
 
 /* A css_set is a structure holding pointers to a set of
@@ -285,7 +293,6 @@ struct task_struct *cgroup_iter_next(
 					struct cgroup_iter *it);
 void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);
 
-
 #else /* !CONFIG_CGROUPS */
 
 static inline int cgroup_init_early(void) { return 0; }
diff -puN kernel/cgroup.c~task-cgroupsv11-automatic-userspace-notification-of-idle-cgroups kernel/cgroup.c
--- a/kernel/cgroup.c~task-cgroupsv11-automatic-userspace-notification-of-idle-cgroups
+++ a/kernel/cgroup.c
@@ -44,6 +44,8 @@
 #include <linux/sort.h>
 #include <asm/atomic.h>
 
+static DEFINE_MUTEX(cgroup_mutex);
+
 /* Generate an array of cgroup subsystem pointers */
 #define SUBSYS(_x) &_x ## _subsys,
 
@@ -82,6 +84,13 @@ struct cgroupfs_root {
 
 	/* Hierarchy-specific flags */
 	unsigned long flags;
+
+	/* The path to use for release notifications. No locking
+	 * between setting and use - so if userspace updates this
+	 * while child cgroups exist, you could miss a
+	 * notification. We ensure that it's always a valid
+	 * NUL-terminated string */
+	char release_agent_path[PATH_MAX];
 };
 
 
@@ -109,7 +118,13 @@ static int need_forkexit_callback;
 
 /* bits in struct cgroup flags field */
 enum {
+	/* Control Group is dead */
 	CONT_REMOVED,
+	/* Control Group has previously had a child cgroup or a task,
+	 * but no longer (only if CONT_NOTIFY_ON_RELEASE is set) */
+	CONT_RELEASABLE,
+	/* Control Group requires release notifications to userspace */
+	CONT_NOTIFY_ON_RELEASE,
 };
 
 /* convenient tests for these bits */
@@ -123,6 +138,19 @@ enum {
 	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
 };
 
+inline int cgroup_is_releasable(const struct cgroup *cont)
+{
+	const int bits =
+		(1 << CONT_RELEASABLE) |
+		(1 << CONT_NOTIFY_ON_RELEASE);
+	return (cont->flags & bits) == bits;
+}
+
+inline int notify_on_release(const struct cgroup *cont)
+{
+	return test_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
+}
+
 /*
  * for_each_subsys() allows you to iterate on each subsystem attached to
  * an active hierarchy
@@ -134,6 +162,14 @@ list_for_each_entry(_ss, &_root->subsys_
 #define for_each_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
+/* the list of cgroups eligible for automatic release. Protected by
+ * release_list_lock */
+static LIST_HEAD(release_list);
+static DEFINE_SPINLOCK(release_list_lock);
+static void cgroup_release_agent(struct work_struct *work);
+static DECLARE_WORK(release_agent_work, cgroup_release_agent);
+static void check_for_release(struct cgroup *cont);
+
 /* Link structure for associating css_set objects with cgroups */
 struct cg_cgroup_link {
 	/*
@@ -188,11 +224,8 @@ static int use_task_css_set_links;
 /*
  * unlink a css_set from the list and free it
  */
-static void release_css_set(struct kref *k)
+static void unlink_css_set(struct css_set *cg)
 {
-	struct css_set *cg = container_of(k, struct css_set, ref);
-	int i;
-
 	write_lock(&css_set_lock);
 	list_del(&cg->list);
 	css_set_count--;
@@ -205,11 +238,39 @@ static void release_css_set(struct kre
 		kfree(link);
 	}
 	write_unlock(&css_set_lock);
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
-		atomic_dec(&cg->subsys[i]->cgroup->count);
+}
+
+static void __release_css_set(struct kref *k, int taskexit)
+{
+	int i;
+	struct css_set *cg = container_of(k, struct css_set, ref);
+
+	unlink_css_set(cg);
+
+	rcu_read_lock();
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		struct cgroup *cont = cg->subsys[i]->cgroup;
+		if (atomic_dec_and_test(&cont->count) &&
+		    notify_on_release(cont)) {
+			if (taskexit)
+				set_bit(CONT_RELEASABLE, &cont->flags);
+			check_for_release(cont);
+		}
+	}
+	rcu_read_unlock();
 	kfree(cg);
 }
 
+static void release_css_set(struct kref *k)
+{
+	__release_css_set(k, 0);
+}
+
+static void release_css_set_taskexit(struct kref *k)
+{
+	__release_css_set(k, 1);
+}
+
 /*
  * refcounted get/put for css_set objects
  */
@@ -223,6 +284,11 @@ static inline void put_css_set(struct 
 	kref_put(&cg->ref, release_css_set);
 }
 
+static inline void put_css_set_taskexit(struct css_set *cg)
+{
+	kref_put(&cg->ref, release_css_set_taskexit);
+}
+
 /*
  * find_existing_css_set() is a helper for
  * find_css_set(), and checks to see whether an existing
@@ -464,8 +530,6 @@ static struct css_set *find_css_set(
  * update of a tasks cgroup pointer by attach_task()
  */
 
-static DEFINE_MUTEX(cgroup_mutex);
-
 /**
  * cgroup_lock - lock out any changes to cgroup structures
  *
@@ -524,6 +588,13 @@ static void cgroup_diput(struct dentr
 	if (S_ISDIR(inode->i_mode)) {
 		struct cgroup *cont = dentry->d_fsdata;
 		BUG_ON(!(cgroup_is_removed(cont)));
+		/* It's possible for external users to be holding css
+		 * reference counts on a cgroup; css_put() needs to
+		 * be able to access the cgroup after decrementing
+		 * the reference count in order to know if it needs to
+		 * queue the cgroup to be handled by the release
+		 * agent */
+		synchronize_rcu();
 		kfree(cont);
 	}
 	iput(inode);
@@ -668,6 +739,8 @@ static int cgroup_show_options(struct
 		seq_printf(seq, ",%s", ss->name);
 	if (test_bit(ROOT_NOPREFIX, &root->flags))
 		seq_puts(seq, ",noprefix");
+	if (strlen(root->release_agent_path))
+		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
 	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
@@ -675,6 +748,7 @@ static int cgroup_show_options(struct
 struct cgroup_sb_opts {
 	unsigned long subsys_bits;
 	unsigned long flags;
+	char *release_agent;
 };
 
 /* Convert a hierarchy specifier into a bitmask of subsystems and
@@ -686,6 +760,7 @@ static int parse_cgroupfs_options(cha
 
 	opts->subsys_bits = 0;
 	opts->flags = 0;
+	opts->release_agent = NULL;
 
 	while ((token = strsep(&o, ",")) != NULL) {
 		if (!*token)
@@ -694,6 +769,15 @@ static int parse_cgroupfs_options(cha
 			opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1;
 		} else if (!strcmp(token, "noprefix")) {
 			set_bit(ROOT_NOPREFIX, &opts->flags);
+		} else if (!strncmp(token, "release_agent=", 14)) {
+			/* Specifying two release agents is forbidden */
+			if (opts->release_agent)
+				return -EINVAL;
+			opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
+			if (!opts->release_agent)
+				return -ENOMEM;
+			strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
+			opts->release_agent[PATH_MAX - 1] = 0;
 		} else {
 			struct cgroup_subsys *ss;
 			int i;
@@ -743,7 +827,11 @@ static int cgroup_remount(struct supe
 	if (!ret)
 		cgroup_populate_dir(cont);
 
+	if (opts.release_agent)
+		strcpy(root->release_agent_path, opts.release_agent);
  out_unlock:
+	if (opts.release_agent)
+		kfree(opts.release_agent);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cont->dentry->d_inode->i_mutex);
 	return ret;
@@ -767,6 +855,7 @@ static void init_cgroup_root(struct c
 	INIT_LIST_HEAD(&cont->sibling);
 	INIT_LIST_HEAD(&cont->children);
 	INIT_LIST_HEAD(&cont->css_sets);
+	INIT_LIST_HEAD(&cont->release_list);
 }
 
 static int cgroup_test_super(struct super_block *sb, void *data)
@@ -841,8 +930,11 @@ static int cgroup_get_sb(struct file_
 
 	/* First find the desired set of subsystems */
 	ret = parse_cgroupfs_options(data, &opts);
-	if (ret)
+	if (ret) {
+		if (opts.release_agent)
+			kfree(opts.release_agent);
 		return ret;
+	}
 
 	root = kzalloc(sizeof(*root), GFP_KERNEL);
 	if (!root)
@@ -851,6 +943,10 @@ static int cgroup_get_sb(struct file_
 	init_cgroup_root(root);
 	root->subsys_bits = opts.subsys_bits;
 	root->flags = opts.flags;
+	if (opts.release_agent) {
+		strcpy(root->release_agent_path, opts.release_agent);
+		kfree(opts.release_agent);
+	}
 
 	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
 
@@ -1131,7 +1227,7 @@ static int attach_task(struct cgroup 
 			ss->attach(ss, cont, oldcont, tsk);
 		}
 	}
-
+	set_bit(CONT_RELEASABLE, &oldcont->flags);
 	synchronize_rcu();
 	put_css_set(cg);
 	return 0;
@@ -1181,6 +1277,9 @@ enum cgroup_fi
...

 Topic: [PATCH 09/33] task containersv11 shared container subsystem group arrays include fix
[PATCH 09/33] task containersv11 shared container subsystem group arrays include fix [message #20408] Mon, 17 September 2007 17:03
Paul Menage is currently offline Paul Menage
Messages: 642
Registered: September 2006
Senior Member
From: openvz.org
kernel/cgroup.c: In function 'cgroup_new_inode':
kernel/cgroup.c:573: error: variable 'cgroup_backing_dev_info' has initializer but incomplete type
kernel/cgroup.c:574: error: unknown field 'capabilities' specified in initializer
kernel/cgroup.c:574: error: 'BDI_CAP_NO_ACCT_DIRTY' undeclared (first use in this function)
kernel/cgroup.c:574: error: (Each undeclared identifier is reported only once)


Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Paul Menage <menage@google.com>

---

 kernel/cgroup.c |    1 +
 1 file changed, 1 insertion(+)

diff -puN kernel/cgroup.c~task-cgroupsv11-shared-cgroup-subsystem-group-arrays-include-fix kernel/cgroup.c
--- a/kernel/cgroup.c~task-cgroupsv11-shared-cgroup-subsystem-group-arrays-include-fix
+++ a/kernel/cgroup.c
@@ -36,6 +36,7 @@
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
+#include <linux/backing-dev.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/magic.h>
_

--
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH 08/33] task containersv11 shared container subsystem group arrays avoid lockdep warning
[PATCH 08/33] task containersv11 shared container subsystem group arrays avoid lockdep warning [message #20430] Mon, 17 September 2007 17:03
Paul Menage is currently offline Paul Menage
Messages: 642
Registered: September 2006
Senior Member
From: openvz.org
I think this is the right way to handle the lockdep false-positive in the
current cgroups patches, but I'm not that familiar with lockdep so any
suggestions for a better approach are welcomed.

In order to avoid a false-positive lockdep warning, we lock the root inode
of a new filesystem mount prior to taking cgroup_mutex, to preserve the
invariant that cgroup_mutex nests inside inode->i_mutex.  In order to
prevent a lockdep false positive when locking i_mutex on a newly-created
cgroup directory inode we use mutex_lock_nested(), with a nesting level
of I_MUTEX_CHILD since the new inode will ultimately be a child directory
of the parent whose i_mutex is nested outside of cgroup_mutex.

Signed-off-by: Paul Menage <menage@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---

 kernel/cgroup.c |   17 +++++++----------
 1 files changed, 7 insertions(+), 10 deletions(-)

diff -puN kernel/cgroup.c~task-cgroupsv11-shared-cgroup-subsystem-group-arrays-avoid-lockdep-warning kernel/cgroup.c
--- a/kernel/cgroup.c~task-cgroupsv11-shared-cgroup-subsystem-group-arrays-avoid-lockdep-warning
+++ a/kernel/cgroup.c
@@ -867,13 +867,16 @@ static int cgroup_get_sb(struct file_
 	} else {
 		/* New superblock */
 		struct cgroup *cont = &root->top_cgroup;
+		struct inode *inode;
 
 		BUG_ON(sb->s_root != NULL);
 
 		ret = cgroup_get_rootdir(sb);
 		if (ret)
 			goto drop_new_super;
+		inode = sb->s_root->d_inode;
 
+		mutex_lock(&inode->i_mutex);
 		mutex_lock(&cgroup_mutex);
 
 		/*
@@ -886,12 +889,14 @@ static int cgroup_get_sb(struct file_
 		ret = allocate_cg_links(css_set_count, &tmp_cg_links);
 		if (ret) {
 			mutex_unlock(&cgroup_mutex);
+			mutex_unlock(&inode->i_mutex);
 			goto drop_new_super;
 		}
 
 		ret = rebind_subsystems(root, root->subsys_bits);
 		if (ret == -EBUSY) {
 			mutex_unlock(&cgroup_mutex);
+			mutex_unlock(&inode->i_mutex);
 			goto drop_new_super;
 		}
 
@@ -931,16 +936,8 @@ static int cgroup_get_sb(struct file_
 		BUG_ON(!list_empty(&cont->children));
 		BUG_ON(root->number_of_cgroups != 1);
 
-		/*
-		 * I believe that it's safe to nest i_mutex inside
-		 * cgroup_mutex in this case, since no-one else can
-		 * be accessing this directory yet. But we still need
-		 * to teach lockdep that this is the case - currently
-		 * a cgroupfs remount triggers a lockdep warning
-		 */
-		mutex_lock(&cont->dentry->d_inode->i_mutex);
 		cgroup_populate_dir(cont);
-		mutex_unlock(&cont->dentry->d_inode->i_mutex);
+		mutex_unlock(&inode->i_mutex);
 		mutex_unlock(&cgroup_mutex);
 	}
 
@@ -1358,7 +1355,7 @@ static int cgroup_create_file(struct 
 
 		/* start with the directory inode held, so that we can
 		 * populate it without racing with another mkdir */
-		mutex_lock(&inode->i_mutex);
+		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
 	} else if (S_ISREG(mode)) {
 		inode->i_size = 0;
 		inode->i_fop = &cgroup_file_operations;
_

--
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH 07/33] task containersv11 shared container subsystem group arrays
[PATCH 07/33] task containersv11 shared container subsystem group arrays [message #20432] Mon, 17 September 2007 17:03
Paul Menage is currently offline Paul Menage
Messages: 642
Registered: September 2006
Senior Member
From: openvz.org
Replace the struct css_set embedded in task_struct with a pointer; all tasks
that have the same set of memberships across all hierarchies will share a
css_set object, and will be linked via their cg_list field to the "tasks"
list_head in the css_set.

Assuming that many tasks share the same cgroup assignments, this reduces
overall space usage and keeps the size of the task_struct down (three pointers
added to task_struct compared to a non-cgroups kernel, no matter how many
subsystems are registered).

Signed-off-by: Paul Menage <menage@google.com>
---

 Documentation/cgroups.txt |   14 
 include/linux/cgroup.h    |   89 ++++
 include/linux/sched.h        |   33 -
 kernel/cgroup.c           |  606 ++++++++++++++++++++++++++++-----
 kernel/fork.c                |    1 
 5 files changed, 620 insertions(+), 123 deletions(-)

diff -puN Documentation/cgroups.txt~task-cgroupsv11-shared-cgroup-subsystem-group-arrays Documentation/cgroups.txt
--- a/Documentation/cgroups.txt~task-cgroupsv11-shared-cgroup-subsystem-group-arrays
+++ a/Documentation/cgroups.txt
@@ -176,7 +176,9 @@ Control Groups extends the kernel as follows
    subsystem state is something that's expected to happen frequently
    and in performance-critical code, whereas operations that require a
    task's actual cgroup assignments (in particular, moving between
-   cgroups) are less common.
+   cgroups) are less common. A linked list runs through the cg_list
+   field of each task_struct using the css_set, anchored at
+   css_set->tasks.
 
  - A cgroup hierarchy filesystem can be mounted  for browsing and
    manipulation from user space.
@@ -252,6 +254,16 @@ linear search to locate an appropriate e
 very efficient. A future version will use a hash table for better
 performance.
 
+To allow access from a cgroup to the css_sets (and hence tasks)
+that comprise it, a set of cg_cgroup_link objects form a lattice;
+each cg_cgroup_link is linked into a list of cg_cgroup_links for
+a single cgroup on its cont_link_list field, and a list of
+cg_cgroup_links for a single css_set on its cg_link_list.
+
+Thus the set of tasks in a cgroup can be listed by iterating over
+each css_set that references the cgroup, and sub-iterating over
+each css_set's task set.
+
 The use of a Linux virtual file system (vfs) to represent the
 cgroup hierarchy provides for a familiar permission and name space
 for cgroups, with a minimum of additional kernel code.
diff -puN include/linux/cgroup.h~task-cgroupsv11-shared-cgroup-subsystem-group-arrays include/linux/cgroup.h
--- a/include/linux/cgroup.h~task-cgroupsv11-shared-cgroup-subsystem-group-arrays
+++ a/include/linux/cgroup.h
@@ -27,10 +27,19 @@ extern void cgroup_lock(void);
 extern void cgroup_unlock(void);
 extern void cgroup_fork(struct task_struct *p);
 extern void cgroup_fork_callbacks(struct task_struct *p);
+extern void cgroup_post_fork(struct task_struct *p);
 extern void cgroup_exit(struct task_struct *p, int run_callbacks);
 
 extern struct file_operations proc_cgroup_operations;
 
+/* Define the enumeration of all cgroup subsystems */
+#define SUBSYS(_x) _x ## _subsys_id,
+enum cgroup_subsys_id {
+#include <linux/cgroup_subsys.h>
+	CGROUP_SUBSYS_COUNT
+};
+#undef SUBSYS
+
 /* Per-subsystem/per-cgroup state maintained by the system. */
 struct cgroup_subsys_state {
 	/* The cgroup that this subsystem is attached to. Useful
@@ -97,6 +106,52 @@ struct cgroup {
 
 	struct cgroupfs_root *root;
 	struct cgroup *top_cgroup;
+
+	/*
+	 * List of cg_cgroup_links pointing at css_sets with
+	 * tasks in this cgroup. Protected by css_set_lock
+	 */
+	struct list_head css_sets;
+};
+
+/* A css_set is a structure holding pointers to a set of
+ * cgroup_subsys_state objects. This saves space in the task struct
+ * object and speeds up fork()/exit(), since a single inc/dec and a
+ * list_add()/del() can bump the reference count on the entire
+ * cgroup set for a task.
+ */
+
+struct css_set {
+
+	/* Reference count */
+	struct kref ref;
+
+	/*
+	 * List running through all cgroup groups. Protected by
+	 * css_set_lock
+	 */
+	struct list_head list;
+
+	/*
+	 * List running through all tasks using this cgroup
+	 * group. Protected by css_set_lock
+	 */
+	struct list_head tasks;
+
+	/*
+	 * List of cg_cgroup_link objects on link chains from
+	 * cgroups referenced from this css_set. Protected by
+	 * css_set_lock
+	 */
+	struct list_head cg_links;
+
+	/*
+	 * Set of subsystem states, one for each subsystem. This array
+	 * is immutable after creation apart from the init_css_set
+	 * during subsystem registration (at boot time).
+	 */
+	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+
 };
 
 /* struct cftype:
@@ -149,15 +204,7 @@ int cgroup_is_removed(const struct co
 
 int cgroup_path(const struct cgroup *cont, char *buf, int buflen);
 
-int __cgroup_task_count(const struct cgroup *cont);
-static inline int cgroup_task_count(const struct cgroup *cont)
-{
-	int task_count;
-	rcu_read_lock();
-	task_count = __cgroup_task_count(cont);
-	rcu_read_unlock();
-	return task_count;
-}
+int cgroup_task_count(const struct cgroup *cont);
 
 /* Return true if the cgroup is a descendant of the current cgroup */
 int cgroup_is_descendant(const struct cgroup *cont);
@@ -205,7 +252,7 @@ static inline struct cgroup_subsys_st
 static inline struct cgroup_subsys_state *task_subsys_state(
 	struct task_struct *task, int subsys_id)
 {
-	return rcu_dereference(task->cgroups.subsys[subsys_id]);
+	return rcu_dereference(task->cgroups->subsys[subsys_id]);
 }
 
 static inline struct cgroup* task_cgroup(struct task_struct *task,
@@ -218,6 +265,27 @@ int cgroup_path(const struct containe
 
 int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *ss);
 
+/* A cgroup_iter should be treated as an opaque object */
+struct cgroup_iter {
+	struct list_head *cg_link;
+	struct list_head *task;
+};
+
+/* To iterate across the tasks in a cgroup:
+ *
+ * 1) call cgroup_iter_start() to initialize an iterator
+ *
+ * 2) call cgroup_iter_next() to retrieve member tasks until it
+ *    returns NULL or until you want to end the iteration
+ *
+ * 3) call cgroup_iter_end() to destroy the iterator.
+ */
+void cgroup_iter_start(struct cgroup *cont, struct cgroup_iter *it);
+struct task_struct *cgroup_iter_next(struct cgroup *cont,
+					struct cgroup_iter *it);
+void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);
+
+
 #else /* !CONFIG_CGROUPS */
 
 static inline int cgroup_init_early(void) { return 0; }
@@ -225,6 +293,7 @@ static inline int cgroup_init(void) {
 static inline void cgroup_init_smp(void) {}
 static inline void cgroup_fork(struct task_struct *p) {}
 static inline void cgroup_fork_callbacks(struct task_struct *p) {}
+static inline void cgroup_post_fork(struct task_struct *p) {}
 static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
 
 static inline void cgroup_lock(void) {}
diff -puN include/linux/sched.h~task-cgroupsv11-shared-cgroup-subsystem-group-arrays include/linux/sched.h
--- a/include/linux/sched.h~task-cgroupsv11-shared-cgroup-subsystem-group-arrays
+++ a/include/linux/sched.h
@@ -861,34 +861,6 @@ struct sched_entity {
 #endif
 };
 
-#ifdef CONFIG_CGROUPS
-
-#define SUBSYS(_x) _x ## _subsys_id,
-enum cgroup_subsys_id {
-#include <linux/cgroup_subsys.h>
-	CGROUP_SUBSYS_COUNT
-};
-#undef SUBSYS
-
-/* A css_set is a structure holding pointers to a set of
- * cgroup_subsys_state objects.
- */
-
-struct css_set {
-
-	/* Set of subsystem states, one for each subsystem. NULL for
-	 * subsystems that aren't part of this hierarchy. These
-	 * pointers reduce the number of dereferences required to get
-	 * from a task to its state for a given cgroup, but result
-	 * in increased space usage if tasks are in wildly different
-	 * groupings across different hierarchies. This array is
-	 * immutable after creation */
-	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
-
-};
-
-#endif /* CONFIG_CGROUPS */
-
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
@@ -1125,7 +1097,10 @@ struct task_struct {
 	int cpuset_mem_spread_rotor;
 #endif
 #ifdef CONFIG_CGROUPS
-	struct css_set cgroups;
+	/* Control Group info protected by css_set_lock */
+	struct css_set *cgroups;
+	/* cg_list protected by css_set_lock and tsk->alloc_lock */
+	struct list_head cg_list;
 #endif
 #ifdef CONFIG_FUTEX
 	struct robust_list_head __user *robust_list;
diff -puN kernel/cgroup.c~task-cgroupsv11-shared-cgroup-subsystem-group-arrays kernel/cgroup.c
--- a/kernel/cgroup.c~task-cgroupsv11-shared-cgroup-subsystem-group-arrays
+++ a/kernel/cgroup.c
@@ -95,6 +95,7 @@ static struct cgroupfs_root rootnode;
 /* The list of hierarchy roots */
 
 static LIST_HEAD(roots);
+static int root_count;
 
 /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
 #define dummytop (&rootnode.top_cgroup)
@@ -133,12 +134,49 @@ list_for_each_entry(_ss, &_root->subsys_
 #define for_each_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
-/* Each task_struct has an embedded css_set, so the get/put
- * operation simply takes a reference count on all the cgroups
- * referenced by subsystems in this css_set. This can end up
- * multiple-counting some cgroups, but that's OK - the ref-count is
- * just a busy/not-busy indicator; ensuring that we only count each
- * cgroup once would require taking a global lock to ensure that no
+/* Link structure for associating css_set objects with cgroups */
+struct cg_cgroup_link {
+	/*
+	 * List running through cg_cgroup_links associated with a
+	 * cgroup, anchored on cgroup->css_sets
+	 */
+	struct list_head cont_link_list;
+	/*
+	 * List running through cg_cgroup_links pointing at a
+	 * single css_set object, anchored on css_set->cg_links
+	 */
+	struct list_head cg_link_list;
+	struct css_set *cg;
+};
+
+/* The default css_set - used by init and its children prior to any
+ * hierarchies being mount
...

 Topic: [PATCH 06/33] task containersv11 add procfs interface
[PATCH 06/33] task containersv11 add procfs interface [message #20407] Mon, 17 September 2007 17:03
Paul Menage is currently offline Paul Menage
Messages: 642
Registered: September 2006
Senior Member
From: openvz.org
Add:

/proc/cgroups - general system info

/proc/*/cgroup - per-task cgroup membership info

Signed-off-by: Paul Menage <menage@google.com>

---

 fs/proc/base.c            |    7 +
 include/linux/cgroup.h |    2 
 kernel/cgroup.c        |  132 ++++++++++++++++++++++++++++++++++++
 3 files changed, 141 insertions(+)

diff -puN fs/proc/base.c~task-cgroupsv11-add-procfs-interface fs/proc/base.c
--- a/fs/proc/base.c~task-cgroupsv11-add-procfs-interface
+++ a/fs/proc/base.c
@@ -67,6 +67,7 @@
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/ptrace.h>
+#include <linux/cgroup.h>
 #include <linux/cpuset.h>
 #include <linux/audit.h>
 #include <linux/poll.h>
@@ -2051,6 +2052,9 @@ static const struct pid_entry tgid_base_
 #ifdef CONFIG_CPUSETS
 	REG("cpuset",     S_IRUGO, cpuset),
 #endif
+#ifdef CONFIG_CGROUPS
+	REG("cgroup",  S_IRUGO, cgroup),
+#endif
 	INF("oom_score",  S_IRUGO, oom_score),
 	REG("oom_adj",    S_IRUGO|S_IWUSR, oom_adjust),
 #ifdef CONFIG_AUDITSYSCALL
@@ -2340,6 +2344,9 @@ static const struct pid_entry tid_base_s
 #ifdef CONFIG_CPUSETS
 	REG("cpuset",    S_IRUGO, cpuset),
 #endif
+#ifdef CONFIG_CGROUPS
+	REG("cgroup",  S_IRUGO, cgroup),
+#endif
 	INF("oom_score", S_IRUGO, oom_score),
 	REG("oom_adj",   S_IRUGO|S_IWUSR, oom_adjust),
 #ifdef CONFIG_AUDITSYSCALL
diff -puN include/linux/cgroup.h~task-cgroupsv11-add-procfs-interface include/linux/cgroup.h
--- a/include/linux/cgroup.h~task-cgroupsv11-add-procfs-interface
+++ a/include/linux/cgroup.h
@@ -29,6 +29,8 @@ extern void cgroup_fork(struct task_s
 extern void cgroup_fork_callbacks(struct task_struct *p);
 extern void cgroup_exit(struct task_struct *p, int run_callbacks);
 
+extern struct file_operations proc_cgroup_operations;
+
 /* Per-subsystem/per-cgroup state maintained by the system. */
 struct cgroup_subsys_state {
 	/* The cgroup that this subsystem is attached to. Useful
diff -puN kernel/cgroup.c~task-cgroupsv11-add-procfs-interface kernel/cgroup.c
--- a/kernel/cgroup.c~task-cgroupsv11-add-procfs-interface
+++ a/kernel/cgroup.c
@@ -33,6 +33,7 @@
 #include <linux/mutex.h>
 #include <linux/mount.h>
 #include <linux/pagemap.h>
+#include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
@@ -247,6 +248,7 @@ static int cgroup_mkdir(struct inode 
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int cgroup_populate_dir(struct cgroup *cont);
 static struct inode_operations cgroup_dir_inode_operations;
+static struct file_operations proc_cgroupstats_operations;
 
 static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
 {
@@ -1576,6 +1578,7 @@ int __init cgroup_init(void)
 {
 	int err;
 	int i;
+	struct proc_dir_entry *entry;
 
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
@@ -1587,10 +1590,139 @@ int __init cgroup_init(void)
 	if (err < 0)
 		goto out;
 
+	entry = create_proc_entry("cgroups", 0, NULL);
+	if (entry)
+		entry->proc_fops = &proc_cgroupstats_operations;
+
 out:
 	return err;
 }
 
+/*
+ * proc_cgroup_show()
+ *  - Print task's cgroup paths into seq_file, one line for each hierarchy
+ *  - Used for /proc/<pid>/cgroup.
+ *  - No need to task_lock(tsk) on this tsk->cgroup reference, as it
+ *    doesn't really matter if tsk->cgroup changes after we read it,
+ *    and we take cgroup_mutex, keeping attach_task() from changing it
+ *    anyway.  No need to check that tsk->cgroup != NULL, thanks to
+ *    the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's
+ *    cgroup to top_cgroup.
+ */
+
+/* TODO: Use a proper seq_file iterator */
+static int proc_cgroup_show(struct seq_file *m, void *v)
+{
+	struct pid *pid;
+	struct task_struct *tsk;
+	char *buf;
+	int retval;
+	struct cgroupfs_root *root;
+
+	retval = -ENOMEM;
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buf)
+		goto out;
+
+	retval = -ESRCH;
+	pid = m->private;
+	tsk = get_pid_task(pid, PIDTYPE_PID);
+	if (!tsk)
+		goto out_free;
+
+	retval = 0;
+
+	mutex_lock(&cgroup_mutex);
+
+	for_each_root(root) {
+		struct cgroup_subsys *ss;
+		struct cgroup *cont;
+		int subsys_id;
+		int count = 0;
+
+		/* Skip this hierarchy if it has no active subsystems */
+		if (!root->actual_subsys_bits)
+			continue;
+		for_each_subsys(root, ss)
+			seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+		seq_putc(m, ':');
+		get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
+		cont = task_cgroup(tsk, subsys_id);
+		retval = cgroup_path(cont, buf, PAGE_SIZE);
+		if (retval < 0)
+			goto out_unlock;
+		seq_puts(m, buf);
+		seq_putc(m, '\n');
+	}
+
+out_unlock:
+	mutex_unlock(&cgroup_mutex);
+	put_task_struct(tsk);
+out_free:
+	kfree(buf);
+out:
+	return retval;
+}
+
+static int cgroup_open(struct inode *inode, struct file *file)
+{
+	struct pid *pid = PROC_I(inode)->pid;
+	return single_open(file, proc_cgroup_show, pid);
+}
+
+struct file_operations proc_cgroup_operations = {
+	.open		= cgroup_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+/* Display information about each subsystem and each hierarchy */
+static int proc_cgroupstats_show(struct seq_file *m, void *v)
+{
+	int i;
+	struct cgroupfs_root *root;
+
+	mutex_lock(&cgroup_mutex);
+	seq_puts(m, "Hierarchies:\n");
+	for_each_root(root) {
+		struct cgroup_subsys *ss;
+		int first = 1;
+		seq_printf(m, "%p: bits=%lx cgroups=%d (", root,
+			   root->subsys_bits, root->number_of_cgroups);
+		for_each_subsys(root, ss) {
+			seq_printf(m, "%s%s", first ? "" : ", ", ss->name);
+			first = false;
+		}
+		seq_putc(m, ')');
+		if (root->sb) {
+			seq_printf(m, " s_active=%d",
+				   atomic_read(&root->sb->s_active));
+		}
+		seq_putc(m, '\n');
+	}
+	seq_puts(m, "Subsystems:\n");
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		struct cgroup_subsys *ss = subsys[i];
+		seq_printf(m, "%d: name=%s hierarchy=%p\n",
+			   i, ss->name, ss->root);
+	}
+	mutex_unlock(&cgroup_mutex);
+	return 0;
+}
+
+static int cgroupstats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, proc_cgroupstats_show, 0);
+}
+
+static struct file_operations proc_cgroupstats_operations = {
+	.open = cgroupstats_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
 /**
  * cgroup_fork - attach newly forked task to its parents cgroup.
  * @tsk: pointer to task_struct of forking parent process.
_

--
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH 05/33] task containersv11 add container_clone interface
[PATCH 05/33] task containersv11 add container_clone interface [message #20426] Mon, 17 September 2007 17:03
Paul Menage is currently offline Paul Menage
Messages: 642
Registered: September 2006
Senior Member
From: openvz.org
Add support for cgroup_clone(), a way to create new cgroups intended to
be used for systems such as namespace unsharing.  A new subsystem callback,
post_clone(), is added to allow subsystems to automatically configure cloned
cgroups.

Signed-off-by: Paul Menage <menage@google.com>
---

 Documentation/cgroups.txt |    7 +
 include/linux/cgroup.h    |    3 
 kernel/cgroup.c           |  135 +++++++++++++++++++++++++++++++++
 3 files changed, 145 insertions(+)

diff -puN Documentation/cgroups.txt~task-cgroupsv11-add-cgroup_clone-interface Documentation/cgroups.txt
--- a/Documentation/cgroups.txt~task-cgroupsv11-add-cgroup_clone-interface
+++ a/Documentation/cgroups.txt
@@ -504,6 +504,13 @@ include/linux/cgroup.h for details). 
 method can return an error code, the error code is currently not
 always handled well.
 
+void post_clone(struct cgroup_subsys *ss, struct cgroup *cont)
+
+Called at the end of cgroup_clone() to do any parameter
+initialization which might be required before a task could attach.  For
+example in cpusets, no task may attach before 'cpus' and 'mems' are set
+up.
+
 void bind(struct cgroup_subsys *ss, struct cgroup *root)
 LL=callback_mutex
 
diff -puN include/linux/cgroup.h~task-cgroupsv11-add-cgroup_clone-interface include/linux/cgroup.h
--- a/include/linux/cgroup.h~task-cgroupsv11-add-cgroup_clone-interface
+++ a/include/linux/cgroup.h
@@ -174,6 +174,7 @@ struct cgroup_subsys {
 	void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
 	int (*populate)(struct cgroup_subsys *ss,
 			struct cgroup *cont);
+	void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cont);
 	void (*bind)(struct cgroup_subsys *ss, struct cgroup *root);
 	int subsys_id;
 	int active;
@@ -213,6 +214,8 @@ static inline struct cgroup* task_con
 
 int cgroup_path(const struct cgroup *cont, char *buf, int buflen);
 
+int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *ss);
+
 #else /* !CONFIG_CGROUPS */
 
 static inline int cgroup_init_early(void) { return 0; }
diff -puN kernel/cgroup.c~task-cgroupsv11-add-cgroup_clone-interface kernel/cgroup.c
--- a/kernel/cgroup.c~task-cgroupsv11-add-cgroup_clone-interface
+++ a/kernel/cgroup.c
@@ -1684,3 +1684,138 @@ void cgroup_exit(struct task_struct *
 	tsk->cgroups = init_task.cgroups;
 	task_unlock(tsk);
 }
+
+/**
+ * cgroup_clone - duplicate the current cgroup in the hierarchy
+ * that the given subsystem is attached to, and move this task into
+ * the new child
+ */
+int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
+{
+	struct dentry *dentry;
+	int ret = 0;
+	char nodename[MAX_CGROUP_TYPE_NAMELEN];
+	struct cgroup *parent, *child;
+	struct inode *inode;
+	struct css_set *cg;
+	struct cgroupfs_root *root;
+	struct cgroup_subsys *ss;
+
+	/* We shouldn't be called by an unregistered subsystem */
+	BUG_ON(!subsys->active);
+
+	/* First figure out what hierarchy and cgroup we're dealing
+	 * with, and pin them so we can drop cgroup_mutex */
+	mutex_lock(&cgroup_mutex);
+ again:
+	root = subsys->root;
+	if (root == &rootnode) {
+		printk(KERN_INFO
+		       "Not cloning cgroup for unused subsystem %s\n",
+		       subsys->name);
+		mutex_unlock(&cgroup_mutex);
+		return 0;
+	}
+	cg = &tsk->cgroups;
+	parent = task_cgroup(tsk, subsys->subsys_id);
+
+	snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "node_%d", tsk->pid);
+
+	/* Pin the hierarchy */
+	atomic_inc(&parent->root->sb->s_active);
+
+	mutex_unlock(&cgroup_mutex);
+
+	/* Now do the VFS work to create a cgroup */
+	inode = parent->dentry->d_inode;
+
+	/* Hold the parent directory mutex across this operation to
+	 * stop anyone else deleting the new cgroup */
+	mutex_lock(&inode->i_mutex);
+	dentry = cgroup_get_dentry(parent->dentry, nodename);
+	if (IS_ERR(dentry)) {
+		printk(KERN_INFO
+		       "Couldn't allocate dentry for %s: %ld\n", nodename,
+		       PTR_ERR(dentry));
+		ret = PTR_ERR(dentry);
+		goto out_release;
+	}
+
+	/* Create the cgroup directory, which also creates the cgroup */
+	ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755);
+	child = __d_cont(dentry);
+	dput(dentry);
+	if (ret) {
+		printk(KERN_INFO
+		       "Failed to create cgroup %s: %d\n", nodename,
+		       ret);
+		goto out_release;
+	}
+
+	if (!child) {
+		printk(KERN_INFO
+		       "Couldn't find new cgroup %s\n", nodename);
+		ret = -ENOMEM;
+		goto out_release;
+	}
+
+	/* The cgroup now exists. Retake cgroup_mutex and check
+	 * that we're still in the same state that we thought we
+	 * were. */
+	mutex_lock(&cgroup_mutex);
+	if ((root != subsys->root) ||
+	    (parent != task_cgroup(tsk, subsys->subsys_id))) {
+		/* Aargh, we raced ... */
+		mutex_unlock(&inode->i_mutex);
+
+		deactivate_super(parent->root->sb);
+		/* The cgroup is still accessible in the VFS, but
+		 * we're not going to try to rmdir() it at this
+		 * point. */
+		printk(KERN_INFO
+		       "Race in cgroup_clone() - leaking cgroup %s\n",
+		       nodename);
+		goto again;
+	}
+
+	/* do any required auto-setup */
+	for_each_subsys(root, ss) {
+		if (ss->post_clone)
+			ss->post_clone(ss, child);
+	}
+
+	/* All seems fine. Finish by moving the task into the new cgroup */
+	ret = attach_task(child, tsk);
+	mutex_unlock(&cgroup_mutex);
+
+ out_release:
+	mutex_unlock(&inode->i_mutex);
+	deactivate_super(parent->root->sb);
+	return ret;
+}
+
+/*
+ * See if "cont" is a descendant of the current task's cgroup in
+ * the appropriate hierarchy
+ *
+ * If we are sending in dummytop, then presumably we are creating
+ * the top cgroup in the subsystem.
+ *
+ * Called only by the ns (nsproxy) cgroup.
+ */
+int cgroup_is_descendant(const struct cgroup *cont)
+{
+	int ret;
+	struct cgroup *target;
+	int subsys_id;
+
+	if (cont == dummytop)
+		return 1;
+
+	get_first_subsys(cont, NULL, &subsys_id);
+	target = task_cgroup(current, subsys_id);
+	while (cont != target && cont!= cont->top_cgroup)
+		cont = cont->parent;
+	ret = (cont == target);
+	return ret;
+}
_

--
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH 04/33] task containersv11 add fork exit hooks
[PATCH 04/33] task containersv11 add fork exit hooks [message #20402] Mon, 17 September 2007 17:03
Paul Menage is currently offline Paul Menage
Messages: 642
Registered: September 2006
Senior Member
From: openvz.org
Adds the necessary hooks to the fork() and exit() paths to ensure that
new children inherit their parent's cgroup assignments, and that exiting
processes release reference counts on their cgroups.

Signed-off-by: Paul Menage <menage@google.com>
---

 include/linux/cgroup.h |    6 +
 kernel/cgroup.c        |  121 ++++++++++++++++++++++++++++++++++++
 kernel/exit.c             |    2 
 kernel/fork.c             |   14 +++-
 4 files changed, 141 insertions(+), 2 deletions(-)

diff -puN include/linux/cgroup.h~task-cgroupsv11-add-fork-exit-hooks include/linux/cgroup.h
--- a/include/linux/cgroup.h~task-cgroupsv11-add-fork-exit-hooks
+++ a/include/linux/cgroup.h
@@ -25,6 +25,9 @@ extern int cgroup_init(void);
 extern void cgroup_init_smp(void);
 extern void cgroup_lock(void);
 extern void cgroup_unlock(void);
+extern void cgroup_fork(struct task_struct *p);
+extern void cgroup_fork_callbacks(struct task_struct *p);
+extern void cgroup_exit(struct task_struct *p, int run_callbacks);
 
 /* Per-subsystem/per-cgroup state maintained by the system. */
 struct cgroup_subsys_state {
@@ -215,6 +218,9 @@ int cgroup_path(const struct containe
 static inline int cgroup_init_early(void) { return 0; }
 static inline int cgroup_init(void) { return 0; }
 static inline void cgroup_init_smp(void) {}
+static inline void cgroup_fork(struct task_struct *p) {}
+static inline void cgroup_fork_callbacks(struct task_struct *p) {}
+static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
 
 static inline void cgroup_lock(void) {}
 static inline void cgroup_unlock(void) {}
diff -puN kernel/cgroup.c~task-cgroupsv11-add-fork-exit-hooks kernel/cgroup.c
--- a/kernel/cgroup.c~task-cgroupsv11-add-fork-exit-hooks
+++ a/kernel/cgroup.c
@@ -132,6 +132,33 @@ list_for_each_entry(_ss, &_root->subsys_
 #define for_each_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
+/* Each task_struct has an embedded css_set, so the get/put
+ * operation simply takes a reference count on all the cgroups
+ * referenced by subsystems in this css_set. This can end up
+ * multiple-counting some cgroups, but that's OK - the ref-count is
+ * just a busy/not-busy indicator; ensuring that we only count each
+ * cgroup once would require taking a global lock to ensure that no
+ * subsystems moved between hierarchies while we were doing so.
+ *
+ * Possible TODO: decide at boot time based on the number of
+ * registered subsystems and the number of CPUs or NUMA nodes whether
+ * it's better for performance to ref-count every subsystem, or to
+ * take a global lock and only add one ref count to each hierarchy.
+ */
+static void get_css_set(struct css_set *cg)
+{
+	int i;
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+		atomic_inc(&cg->subsys[i]->cgroup->count);
+}
+
+static void put_css_set(struct css_set *cg)
+{
+	int i;
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+		atomic_dec(&cg->subsys[i]->cgroup->count);
+}
+
 /*
  * There is one global cgroup mutex. We also require taking
  * task_lock() when dereferencing a task's cgroup subsys pointers.
@@ -1563,3 +1590,97 @@ int __init cgroup_init(void)
 out:
 	return err;
 }
+
+/**
+ * cgroup_fork - attach newly forked task to its parents cgroup.
+ * @tsk: pointer to task_struct of forking parent process.
+ *
+ * Description: A task inherits its parent's cgroup at fork().
+ *
+ * A pointer to the shared css_set was automatically copied in
+ * fork.c by dup_task_struct().  However, we ignore that copy, since
+ * it was not made under the protection of RCU or cgroup_mutex, so
+ * might no longer be a valid cgroup pointer.  attach_task() might
+ * have already changed current->cgroup, allowing the previously
+ * referenced cgroup to be removed and freed.
+ *
+ * At the point that cgroup_fork() is called, 'current' is the parent
+ * task, and the passed argument 'child' points to the child task.
+ */
+void cgroup_fork(struct task_struct *child)
+{
+	rcu_read_lock();
+	child->cgroups = rcu_dereference(current->cgroups);
+	get_css_set(&child->cgroups);
+	rcu_read_unlock();
+}
+
+/**
+ * cgroup_fork_callbacks - called on a new task very soon before
+ * adding it to the tasklist. No need to take any locks since no-one
+ * can be operating on this task
+ */
+void cgroup_fork_callbacks(struct task_struct *child)
+{
+	if (need_forkexit_callback) {
+		int i;
+		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+			if (ss->fork)
+				ss->fork(ss, child);
+		}
+	}
+}
+
+/**
+ * cgroup_exit - detach cgroup from exiting task
+ * @tsk: pointer to task_struct of exiting process
+ *
+ * Description: Detach cgroup from @tsk and release it.
+ *
+ * Note that cgroups marked notify_on_release force every task in
+ * them to take the global cgroup_mutex mutex when exiting.
+ * This could impact scaling on very large systems.  Be reluctant to
+ * use notify_on_release cgroups where very high task exit scaling
+ * is required on large systems.
+ *
+ * the_top_cgroup_hack:
+ *
+ *    Set the exiting tasks cgroup to the root cgroup (top_cgroup).
+ *
+ *    We call cgroup_exit() while the task is still competent to
+ *    handle notify_on_release(), then leave the task attached to the
+ *    root cgroup in each hierarchy for the remainder of its exit.
+ *
+ *    To do this properly, we would increment the reference count on
+ *    top_cgroup, and near the very end of the kernel/exit.c do_exit()
+ *    code we would add a second cgroup function call, to drop that
+ *    reference.  This would just create an unnecessary hot spot on
+ *    the top_cgroup reference count, to no avail.
+ *
+ *    Normally, holding a reference to a cgroup without bumping its
+ *    count is unsafe.   The cgroup could go away, or someone could
+ *    attach us to a different cgroup, decrementing the count on
+ *    the first cgroup that we never incremented.  But in this case,
+ *    top_cgroup isn't going away, and either task has PF_EXITING set,
+ *    which wards off any attach_task() attempts, or task is a failed
+ *    fork, never visible to attach_task.
+ *
+ */
+void cgroup_exit(struct task_struct *tsk, int run_callbacks)
+{
+	int i;
+
+	if (run_callbacks && need_forkexit_callback) {
+		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+			if (ss->exit)
+				ss->exit(ss, tsk);
+		}
+	}
+	/* Reassign the task to the init_css_set. */
+	task_lock(tsk);
+	put_css_set(&tsk->cgroups);
+	tsk->cgroups = init_task.cgroups;
+	task_unlock(tsk);
+}
diff -puN kernel/exit.c~task-cgroupsv11-add-fork-exit-hooks kernel/exit.c
--- a/kernel/exit.c~task-cgroupsv11-add-fork-exit-hooks
+++ a/kernel/exit.c
@@ -33,6 +33,7 @@
 #include <linux/delayacct.h>
 #include <linux/freezer.h>
 #include <linux/cpuset.h>
+#include <linux/cgroup.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
 #include <linux/posix-timers.h>
@@ -981,6 +982,7 @@ fastcall NORET_TYPE void do_exit(long co
 	check_stack_usage();
 	exit_thread();
 	cpuset_exit(tsk);
+	cgroup_exit(tsk, 1);
 	exit_keys(tsk);
 
 	if (group_dead && tsk->signal->leader)
diff -puN kernel/fork.c~task-cgroupsv11-add-fork-exit-hooks kernel/fork.c
--- a/kernel/fork.c~task-cgroupsv11-add-fork-exit-hooks
+++ a/kernel/fork.c
@@ -30,6 +30,7 @@
 #include <linux/capability.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/cgroup.h>
 #include <linux/security.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
@@ -967,6 +968,7 @@ static struct task_struct *copy_process(
 {
 	int retval;
 	struct task_struct *p = NULL;
+	int cgroup_callbacks_done = 0;
 
 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
 		return ERR_PTR(-EINVAL);
@@ -1068,12 +1070,13 @@ static struct task_struct *copy_process(
 	p->io_context = NULL;
 	p->audit_context = NULL;
 	cpuset_fork(p);
+	cgroup_fork(p);
 #ifdef CONFIG_NUMA
  	p->mempolicy = mpol_copy(p->mempolicy);
  	if (IS_ERR(p->mempolicy)) {
  		retval = PTR_ERR(p->mempolicy);
  		p->mempolicy = NULL;
- 		goto bad_fork_cleanup_cpuset;
+ 		goto bad_fork_cleanup_cgroup;
  	}
 	mpol_fix_fork_child_flag(p);
 #endif
@@ -1184,6 +1187,12 @@ static struct task_struct *copy_process(
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p, clone_flags);
 
+	/* Now that the task is set up, run cgroup callbacks if
+	 * necessary. We need to run them before the task is visible
+	 * on the tasklist. */
+	cgroup_fork_callbacks(p);
+	cgroup_callbacks_done = 1;
+
 	/* Need tasklist lock for parent etc handling! */
 	write_lock_irq(&tasklist_lock);
 
@@ -1306,9 +1315,10 @@ bad_fork_cleanup_security:
 bad_fork_cleanup_policy:
 #ifdef CONFIG_NUMA
 	mpol_free(p->mempolicy);
-bad_fork_cleanup_cpuset:
+bad_fork_cleanup_cgroup:
 #endif
 	cpuset_exit(p);
+	cgroup_exit(p, cgroup_callbacks_done);
 	delayacct_tsk_free(p);
 	if (p->binfmt)
 		module_put(p->binfmt->module);
_

--
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH 02/33] task containersv11 basic task container framework fix
[PATCH 02/33] task containersv11 basic task container framework fix [message #20425] Mon, 17 September 2007 17:03
Paul Menage is currently offline Paul Menage
Messages: 642
Registered: September 2006
Senior Member
From: openvz.org
Handle reading /proc/self/cpuset when cpusets isn't mounted.

Signed-off-by: Paul Menage <menage@google.com>

---

 kernel/cgroup.c |    9 +++++++++
 1 file changed, 9 insertions(+)

diff -puN kernel/cgroup.c~task-cgroupsv11-basic-task-cgroup-framework-fix kernel/cgroup.c
--- a/kernel/cgroup.c~task-cgroupsv11-basic-task-cgroup-framework-fix
+++ a/kernel/cgroup.c
@@ -683,6 +683,15 @@ int cgroup_path(const struct containe
 {
 	char *start;
 
+	if (cont == dummytop) {
+		/*
+		 * Inactive subsystems have no dentry for their root
+		 * cgroup
+		 */
+		strcpy(buf, "/");
+		return 0;
+	}
+
 	start = buf + buflen;
 
 	*--start = '\0';
_

--
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH 01/33] task containersv11 basic task container framework
[PATCH 01/33] task containersv11 basic task container framework [message #20428] Mon, 17 September 2007 17:03
Paul Menage is currently offline Paul Menage
Messages: 642
Registered: September 2006
Senior Member
From: openvz.org
Generic Process Control Groups
--------------------------

There have recently been various proposals floating around for
resource management/accounting and other task grouping subsystems in
the kernel, including ResGroups, User BeanCounters, NSProxy
cgroups, and others.  These all need the basic abstraction of being
able to group together multiple processes in an aggregate, in order to
track/limit the resources permitted to those processes, or control
other behaviour of the processes, and all implement this grouping in
different ways.

This patchset provides a framework for tracking and grouping processes
into arbitrary "cgroups" and assigning arbitrary state to those
groupings, in order to control the behaviour of the cgroup as an
aggregate.

The intention is that the various resource management and
virtualization/cgroup efforts can also become task cgroup
clients, with the result that:

- the userspace APIs are (somewhat) normalised

- it's easier to test e.g. the ResGroups CPU controller in
 conjunction with the BeanCounters memory controller, or use either of
them as the resource-control portion of a virtual server system.

- the additional kernel footprint of any of the competing resource
 management systems is substantially reduced, since it doesn't need
 to provide process grouping/containment, hence improving their
 chances of getting into the kernel



This patch:

Add the main task cgroups framework - the cgroup filesystem, and the
basic structures for tracking membership and associating subsystem state
objects to tasks.

Signed-off-by: Paul Menage <menage@google.com>
---

 Documentation/cgroups.txt     |  526 ++++++++++++
 include/linux/cgroup.h        |  214 +++++
 include/linux/cgroup_subsys.h |   10 
 include/linux/magic.h            |    1 
 include/linux/sched.h            |   34 
 init/Kconfig                     |    8 
 init/main.c                      |    3 
 kernel/Makefile                  |    1 
 kernel/cgroup.c               | 1199 +++++++++++++++++++++++++++++
 9 files changed, 1995 insertions(+), 1 deletion(-)

diff -puN /dev/null Documentation/cgroups.txt
--- /dev/null
+++ a/Documentation/cgroups.txt
@@ -0,0 +1,526 @@
+				CGROUPS
+				-------
+
+Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt
+
+Original copyright statements from cpusets.txt:
+Portions Copyright (C) 2004 BULL SA.
+Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+Modified by Paul Jackson <pj@sgi.com>
+Modified by Christoph Lameter <clameter@sgi.com>
+
+CONTENTS:
+=========
+
+1. Control Groups
+  1.1 What are cgroups ?
+  1.2 Why are cgroups needed ?
+  1.3 How are cgroups implemented ?
+  1.4 What does notify_on_release do ?
+  1.5 How do I use cgroups ?
+2. Usage Examples and Syntax
+  2.1 Basic Usage
+  2.2 Attaching processes
+3. Kernel API
+  3.1 Overview
+  3.2 Synchronization
+  3.3 Subsystem API
+4. Questions
+
+1. Control Groups
+==========
+
+1.1 What are cgroups ?
+----------------------
+
+Control Groups provide a mechanism for aggregating/partitioning sets of
+tasks, and all their future children, into hierarchical groups with
+specialized behaviour.
+
+Definitions:
+
+A *cgroup* associates a set of tasks with a set of parameters for one
+or more subsystems.
+
+A *subsystem* is a module that makes use of the task grouping
+facilities provided by cgroups to treat groups of tasks in
+particular ways. A subsystem is typically a "resource controller" that
+schedules a resource or applies per-cgroup limits, but it may be
+anything that wants to act on a group of processes, e.g. a
+virtualization subsystem.
+
+A *hierarchy* is a set of cgroups arranged in a tree, such that
+every task in the system is in exactly one of the cgroups in the
+hierarchy, and a set of subsystems; each subsystem has system-specific
+state attached to each cgroup in the hierarchy.  Each hierarchy has
+an instance of the cgroup virtual filesystem associated with it.
+
+At any one time there may be multiple active hierarchies of task
+cgroups. Each hierarchy is a partition of all tasks in the system.
+
+User level code may create and destroy cgroups by name in an
+instance of the cgroup virtual file system, specify and query to
+which cgroup a task is assigned, and list the task pids assigned to
+a cgroup. Those creations and assignments only affect the hierarchy
+associated with that instance of the cgroup file system.
+
+On their own, the only use for cgroups is for simple job
+tracking. The intention is that other subsystems hook into the generic
+cgroup support to provide new attributes for cgroups, such as
+accounting/limiting the resources which processes in a cgroup can
+access. For example, cpusets (see Documentation/cpusets.txt) allows
+you to associate a set of CPUs and a set of memory nodes with the
+tasks in each cgroup.
+
+1.2 Why are cgroups needed ?
+----------------------------
+
+There are multiple efforts to provide process aggregations in the
+Linux kernel, mainly for resource tracking purposes. Such efforts
+include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
+namespaces. These all require the basic notion of a
+grouping/partitioning of processes, with newly forked processes ending
+in the same group (cgroup) as their parent process.
+
+The kernel cgroup patch provides the minimum essential kernel
+mechanisms required to efficiently implement such groups. It has
+minimal impact on the system fast paths, and provides hooks for
+specific subsystems such as cpusets to provide additional behaviour as
+desired.
+
+Multiple hierarchy support is provided to allow for situations where
+the division of tasks into cgroups is distinctly different for
+different subsystems - having parallel hierarchies allows each
+hierarchy to be a natural division of tasks, without having to handle
+complex combinations of tasks that would be present if several
+unrelated subsystems needed to be forced into the same tree of
+cgroups.
+
+At one extreme, each resource controller or subsystem could be in a
+separate hierarchy; at the other extreme, all subsystems
+would be attached to the same hierarchy.
+
+As an example of a scenario (originally proposed by vatsa@in.ibm.com)
+that can benefit from multiple hierarchies, consider a large
+university server with various users - students, professors, system
+tasks etc. The resource planning for this server could be along the
+following lines:
+
+       CPU :           Top cpuset
+                       /       \
+               CPUSet1         CPUSet2
+                  |              |
+               (Profs)         (Students)
+
+               In addition (system tasks) are attached to topcpuset (so
+               that they can run anywhere) with a limit of 20%
+
+       Memory : Professors (50%), students (30%), system (20%)
+
+       Disk : Prof (50%), students (30%), system (20%)
+
+       Network : WWW browsing (20%), Network File System (60%), others (20%)
+                               / \
+                       Prof (15%) students (5%)
+
+Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go
+into NFS network class.
+
+At the same time firefox/lynx will share an appropriate CPU/Memory class
+depending on who launched it (prof/student).
+
+With the ability to classify tasks differently for different resources
+(by putting those resource subsystems in different hierarchies) then
+the admin can easily set up a script which receives exec notifications
+and depending on who is launching the browser he can
+
+       # echo browser_pid > /mnt/<restype>/<userclass>/tasks
+
+With only a single hierarchy, he now would potentially have to create
+a separate cgroup for every browser launched and associate it with
+the appropriate network and other resource classes.  This may lead to
+proliferation of such cgroups.
+
+Also let's say that the administrator would like to give enhanced network
+access temporarily to a student's browser (since it is night and the user
+wants to do online gaming :)  OR give one of the student's simulation
+apps enhanced CPU power.
+
+With the ability to write pids directly to resource classes, it's just a
+matter of:
+
+       # echo pid > /mnt/network/<new_class>/tasks
+       (after some time)
+       # echo pid > /mnt/network/<orig_class>/tasks
+
+Without this ability, he would have to split the cgroup into
+multiple separate ones and then associate the new cgroups with the
+new resource classes.
+
+
+
+1.3 How are cgroups implemented ?
+---------------------------------
+
+Control Groups extends the kernel as follows:
+
+ - Each task in the system has a reference-counted pointer to a
+   css_set.
+
+ - A css_set contains a set of reference-counted pointers to
+   cgroup_subsys_state objects, one for each cgroup subsystem
+   registered in the system. There is no direct link from a task to
+   the cgroup of which it's a member in each hierarchy, but this
+   can be determined by following pointers through the
+   cgroup_subsys_state objects. This is because accessing the
+   subsystem state is something that's expected to happen frequently
+   and in performance-critical code, whereas operations that require a
+   task's actual cgroup assignments (in particular, moving between
+   cgroups) are less common.
+
+ - A cgroup hierarchy filesystem can be mounted for browsing and
+   manipulation from user space.
+
+ - You can list all the tasks (by pid) attached to any cgroup.
+
+The implementation of cgroups requires a few, simple hooks
+into the rest of the kernel, none in performance critical paths:
+
+ - in init/main.c, to initialize the root cgroups and initial
+   css_set at system boot.
+
+ - in fork and exit, to attach and detach a task from its css_set.
+
+In addition a new file system, of type "cgroup" may be mounted, to
+enable browsing and modifying the cgroups presently known to the
+kernel.  When mounting a cgro
...

 Topic: [PATCH 4/5][AFS] Cleanup explicit check for mandatory locks
[PATCH 4/5][AFS] Cleanup explicit check for mandatory locks [message #20351] Mon, 17 September 2007 03:56
Pavel Emelianov is currently offline Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
From: openvz.org
The __mandatory_lock(inode) helper makes the same check, but
makes the code more readable.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: David Howells <dhowells@redhat.com>

---

 fs/afs/flock.c |    3 +--
 1 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index af6952e..210acaf 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -524,8 +524,7 @@ int afs_lock(struct file *file, int cmd,
 	       (long long) fl->fl_start, (long long) fl->fl_end);
 
 	/* AFS doesn't support mandatory locks */
-	if ((vnode->vfs_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
-	    fl->fl_type != F_UNLCK)
+	if (__mandatory_lock(&vnode->vfs_inode) && fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
 	if (IS_GETLK(cmd))
 Topic: [PATCH 1/5] Cleanup macros for distinguishing mandatory locks
[PATCH 1/5] Cleanup macros for distinguishing mandatory locks [message #20348] Mon, 17 September 2007 03:50
Pavel Emelianov is currently offline Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
From: openvz.org
The combination of S_ISGID bit set and S_IXGRP bit unset is 
used to mark the inode as "mandatory lockable" and there's a 
macro for this check called MANDATORY_LOCK(inode). However, 
fs/locks.c and some filesystems still perform the explicit 
i_mode checking. Besides, Andrew pointed out that this macro
is itself buggy, as it dereferences the inode arg twice.

Convert this macro into static inline function and switch 
its users to it, making the code shorter and more readable.

The __mandatory_lock() helper is to be used in places where
the IS_MANDLOCK() for superblock is already known to be true.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>

---

 fs/locks.c          |   14 ++++----------
 fs/nfsd/nfs4state.c |    2 +-
 fs/nfsd/vfs.c       |    2 +-
 fs/read_write.c     |    2 +-
 include/linux/fs.h  |   21 +++++++++++++++++----
 5 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 291d40b..9c519e6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1488,12 +1488,25 @@ extern int locks_mandatory_area(int, str
  * Candidates for mandatory locking have the setgid bit set
  * but no group execute bit -  an otherwise meaningless combination.
  */
-#define MANDATORY_LOCK(inode) \
-	(IS_MANDLOCK(inode) && ((inode)->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+
+static inline int __mandatory_lock(struct inode *ino)
+{
+	return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID;
+}
+
+/*
+ * ... and these candidates should be on MS_MANDLOCK mounted fs,
+ * otherwise these will be advisory locks
+ */
+
+static inline int mandatory_lock(struct inode *ino)
+{
+	return IS_MANDLOCK(ino) && __mandatory_lock(ino);
+}
 
 static inline int locks_verify_locked(struct inode *inode)
 {
-	if (MANDATORY_LOCK(inode))
+	if (mandatory_lock(inode))
 		return locks_mandatory_locked(inode);
 	return 0;
 }
@@ -1504,7 +1517,7 @@ static inline int locks_verify_truncate(
 				    struct file *filp,
 				    loff_t size)
 {
-	if (inode->i_flock && MANDATORY_LOCK(inode))
+	if (inode->i_flock && mandatory_lock(inode))
 		return locks_mandatory_area(
 			FLOCK_VERIFY_WRITE, inode, filp,
 			size < inode->i_size ? size : inode->i_size,
diff --git a/fs/locks.c b/fs/locks.c
index f59d066..a71c589 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1116,7 +1116,7 @@ int locks_mandatory_area(int read_write,
 			 * If we've been sleeping someone might have
 			 * changed the permissions behind our back.
 			 */
-			if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+			if (__mandatory_lock(inode))
 				continue;
 		}
 
@@ -1755,9 +1755,7 @@ int fcntl_setlk(unsigned int fd, struct 
 	/* Don't allow mandatory locks on files that may be memory mapped
 	 * and shared.
 	 */
-	if (IS_MANDLOCK(inode) &&
-	    (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
-	    mapping_writably_mapped(filp->f_mapping)) {
+	if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) {
 		error = -EAGAIN;
 		goto out;
 	}
@@ -1881,9 +1879,7 @@ int fcntl_setlk64(unsigned int fd, struc
 	/* Don't allow mandatory locks on files that may be memory mapped
 	 * and shared.
 	 */
-	if (IS_MANDLOCK(inode) &&
-	    (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
-	    mapping_writably_mapped(filp->f_mapping)) {
+	if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) {
 		error = -EAGAIN;
 		goto out;
 	}
@@ -2077,9 +2073,7 @@ static void lock_get_status(char* out, s
 		out += sprintf(out, "%6s %s ",
 			     (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ",
 			     (inode == NULL) ? "*NOINODE*" :
-			     (IS_MANDLOCK(inode) &&
-			      (inode->i_mode & (S_IXGRP | S_ISGID)) == S_ISGID) ?
-			     "MANDATORY" : "ADVISORY ");
+			     mandatory_lock(inode) ? "MANDATORY" : "ADVISORY ");
 	} else if (IS_FLOCK(fl)) {
 		if (fl->fl_type & LOCK_MAND) {
 			out += sprintf(out, "FLOCK  MSNFS     ");
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 6256492..a0635d7 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2030,7 +2030,7 @@ static inline int
 io_during_grace_disallowed(struct inode *inode, int flags)
 {
 	return nfs4_in_grace() && (flags & (RD_STATE | WR_STATE))
-		&& MANDATORY_LOCK(inode);
+		&& mandatory_lock(inode);
 }
 
 /*
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 70f2c86..3c703a7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -65,7 +65,7 @@
  * locks on them because there is no way to know if the accesser has
  * the lock.
  */
-#define IS_ISMNDLK(i)	(S_ISREG((i)->i_mode) && MANDATORY_LOCK(i))
+#define IS_ISMNDLK(i)	(S_ISREG((i)->i_mode) && mandatory_lock(i))
 
 /*
  * This is a cache of readahead params that help us choose the proper
diff --git a/fs/read_write.c b/fs/read_write.c
index 507ddff..124693e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -205,7 +205,7 @@ int rw_verify_area(int read_write, struc
 	if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
 		goto Einval;
 
-	if (unlikely(inode->i_flock && MANDATORY_LOCK(inode))) {
+	if (unlikely(inode->i_flock && mandatory_lock(inode))) {
 		int retval = locks_mandatory_area(
 			read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
 			inode, file, pos, count);
 Topic: [PATCH] shrink_dcache_sb speedup
[PATCH] shrink_dcache_sb speedup [message #20255] Fri, 14 September 2007 04:37
den is currently offline den
Messages: 493
Registered: December 2005
Senior Member
From: openvz.org
From: Denis V. Lunev <den@openvz.org>

This patch makes shrink_dcache_sb consistent with dentry pruning policy.

On the first pass we iterate over dentry unused list and prepare some
dentries for removal.
However, since the existing code moves evicted dentries
to the beginning of the LRU it can happen that fresh dentries from
other superblocks will be inserted *before* our dentries.

This can result in significant slowdown of shrink_dcache_sb().
Moreover, for virtual filesystems like unionfs, which can call dput()
while dentries are being killed, the existing code results in O(n^2) complexity.

We observed shrink_dcache_sb() taking 2 minutes with only 35000 dentries.

To avoid these effects, we propose to isolate the sb dentries at the end
of the LRU list.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Kirill Korotaev <dev@openvz.org>
Signed-off-by: Andrey Mirkin <amirkin@openvz.org>

-------

--- ./fs/dcache.c.shrink	2007-09-14 10:25:21.000000000 +0400
+++ ./fs/dcache.c	2007-09-14 10:26:08.000000000 +0400
@@ -553,18 +553,18 @@ void shrink_dcache_sb(struct super_block
 	 * superblock to the most recent end of the unused list.
 	 */
 	spin_lock(&dcache_lock);
-	list_for_each_safe(tmp, next, &dentry_unused) {
+	list_for_each_prev_safe(tmp, next, &dentry_unused) {
 		dentry = list_entry(tmp, struct dentry, d_lru);
 		if (dentry->d_sb != sb)
 			continue;
-		list_move(tmp, &dentry_unused);
+		list_move_tail(tmp, &dentry_unused);
 	}
 
 	/*
 	 * Pass two ... free the dentries for this superblock.
 	 */
 repeat:
-	list_for_each_safe(tmp, next, &dentry_unused) {
+	list_for_each_prev_safe(tmp, next, &dentry_unused) {
 		dentry = list_entry(tmp, struct dentry, d_lru);
 		if (dentry->d_sb != sb)
 			continue;
--- ./include/linux/list.h.shrink	2007-08-10 16:58:49.000000000 +0400
+++ ./include/linux/list.h	2007-09-14 10:26:08.000000000 +0400
@@ -478,6 +478,18 @@ static inline void list_splice_init_rcu(
 		pos = n, n = pos->next)
 
 /**
+ * list_for_each_prev_safe - iterate over a list backwards safe against removal
+			of list entry
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @n:		another &struct list_head to use as temporary storage
+ * @head:	the head for your list.
+ */
+#define list_for_each_prev_safe(pos, n, head) \
+	for (pos = (head)->prev, n = pos->prev; \
+	     prefetch(pos->prev), pos != (head); \
+	     pos = n, n = pos->prev)
+
+/**
  * list_for_each_entry	-	iterate over list of given type
  * @pos:	the type * to use as a loop cursor.
  * @head:	the head for your list.
 Topic: [NETNS][patch 0/1] fix allnoconfig compilation error
[NETNS][patch 0/1] fix allnoconfig compilation error [message #20172] Wed, 12 September 2007 16:48
Daniel Lezcano is currently offline Daniel Lezcano
Messages: 417
Registered: June 2006
Senior Member
From: openvz.org
fixes a compilation issue when allnoconfig is used.
 - init_net is unresolved.

If it is OK, I will send it to Dave Miller right away.

-- 
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [BUG] ULOG problem on stable 2.6.18
[BUG] ULOG problem on stable 2.6.18 [message #20147] Wed, 12 September 2007 09:39
Enrico Weigelt is currently offline Enrico Weigelt
Messages: 31
Registered: July 2006
Member
From: openvz.org
Hi folks,


I'm just trying the current stable kernel on an amd-k8
and ran into trouble w/ netfilter's ulog target:

    ip_tables: ULOG target: invalid size 44 != 76
    
I had a look at the web and found out it might be a 32/64-bit
issue (userland's and kernel's int sizes differ). So I tried
building for generic i586, but this didn't help either.

A netfilter changelog on the web says that the issue has
been solved, but probably that's in a newer kernel and not
backported to the ovz kernel yet.

My big problem is that I need the ulog (even if it's silent ;O)
to get the kernel running together with a given firewall script,
since I'd like to add openvz support to a production-grade
distro which is used by inexperienced people. Rewriting the
firewall script would probably break the server management
system :(

Maybe someone has an idea that might help ?


thx
-- 
---------------------------------------------------------------------
 Enrico Weigelt    ==   metux IT service - http://www.metux.de/
---------------------------------------------------------------------
 Please visit the OpenSource QM Taskforce:
 	http://wiki.metux.de/public/OpenSource_QM_Taskforce
 Patches / Fixes for a lot dozens of packages in dozens of versions:
	http://patches.metux.de/
---------------------------------------------------------------------
 Topic: [-mm PATCH] Memory controller make charging gfp mask aware
[-mm PATCH] Memory controller make charging gfp mask aware [message #20132] Wed, 12 September 2007 08:14
Balbir Singh is currently offline Balbir Singh
Messages: 491
Registered: August 2006
Senior Member
From: openvz.org
Nick Piggin pointed out that swap cache and page cache addition routines
could be called from non GFP_KERNEL contexts. This patch makes the charging
routine aware of the gfp context. Charging might fail if the container is
over its limit, in which case a suitable error is returned.

This patch was tested on a Powerpc box. I am still looking for a way
to test the path through which allocations happen in non-GFP_KERNEL contexts.

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---

 include/linux/memcontrol.h |   12 ++++++++----
 include/linux/swap.h       |    3 ++-
 mm/filemap.c               |    2 +-
 mm/memcontrol.c            |   24 +++++++++++++++++-------
 mm/memory.c                |   10 +++++-----
 mm/migrate.c               |    2 +-
 mm/swap_state.c            |    2 +-
 mm/swapfile.c              |    2 +-
 mm/vmscan.c                |    5 +++--
 9 files changed, 39 insertions(+), 23 deletions(-)

diff -puN include/linux/memcontrol.h~memory-controller-make-charging-gfpmask-aware include/linux/memcontrol.h
--- linux-2.6.23-rc4/include/linux/memcontrol.h~memory-controller-make-charging-gfpmask-aware	2007-09-11 22:20:50.000000000 +0530
+++ linux-2.6.23-rc4-balbir/include/linux/memcontrol.h	2007-09-11 22:23:32.000000000 +0530
@@ -32,7 +32,8 @@ extern void mm_free_container(struct mm_
 extern void page_assign_page_container(struct page *page,
 					struct page_container *pc);
 extern struct page_container *page_get_page_container(struct page *page);
-extern int mem_container_charge(struct page *page, struct mm_struct *mm);
+extern int mem_container_charge(struct page *page, struct mm_struct *mm,
+				gfp_t gfp_mask);
 extern void mem_container_uncharge(struct page_container *pc);
 extern void mem_container_move_lists(struct page_container *pc, bool active);
 extern unsigned long mem_container_isolate_pages(unsigned long nr_to_scan,
@@ -42,7 +43,8 @@ extern unsigned long mem_container_isola
 					struct mem_container *mem_cont,
 					int active);
 extern void mem_container_out_of_memory(struct mem_container *mem);
-extern int mem_container_cache_charge(struct page *page, struct mm_struct *mm);
+extern int mem_container_cache_charge(struct page *page, struct mm_struct *mm,
+					gfp_t gfp_mask);
 extern struct mem_container *mm_container(struct mm_struct *mm);
 
 static inline void mem_container_uncharge_page(struct page *page)
@@ -70,7 +72,8 @@ static inline struct page_container *pag
 	return NULL;
 }
 
-static inline int mem_container_charge(struct page *page, struct mm_struct *mm)
+static inline int mem_container_charge(struct page *page, struct mm_struct *mm,
+					gfp_t gfp_mask)
 {
 	return 0;
 }
@@ -89,7 +92,8 @@ static inline void mem_container_move_li
 }
 
 static inline int mem_container_cache_charge(struct page *page,
-						struct mm_struct *mm)
+						struct mm_struct *mm,
+						gfp_t gfp_mask)
 {
 	return 0;
 }
diff -puN mm/memcontrol.c~memory-controller-make-charging-gfpmask-aware mm/memcontrol.c
--- linux-2.6.23-rc4/mm/memcontrol.c~memory-controller-make-charging-gfpmask-aware	2007-09-11 22:20:50.000000000 +0530
+++ linux-2.6.23-rc4-balbir/mm/memcontrol.c	2007-09-12 00:25:12.000000000 +0530
@@ -261,7 +261,8 @@ unsigned long mem_container_isolate_page
  * 0 if the charge was successful
  * < 0 if the container is over its limit
  */
-int mem_container_charge(struct page *page, struct mm_struct *mm)
+int mem_container_charge(struct page *page, struct mm_struct *mm,
+				gfp_t gfp_mask)
 {
 	struct mem_container *mem;
 	struct page_container *pc, *race_pc;
@@ -287,7 +288,7 @@ int mem_container_charge(struct page *pa
 
 	unlock_page_container(page);
 
-	pc = kzalloc(sizeof(struct page_container), GFP_KERNEL);
+	pc = kzalloc(sizeof(struct page_container), gfp_mask);
 	if (pc == NULL)
 		goto err;
 
@@ -314,7 +315,14 @@ int mem_container_charge(struct page *pa
 	 * the container limit.
 	 */
 	while (res_counter_charge(&mem->res, 1)) {
-		if (try_to_free_mem_container_pages(mem))
+		bool is_atomic = gfp_mask & GFP_ATOMIC;
+		/*
+		 * We cannot reclaim under GFP_ATOMIC, fail the charge
+		 */
+		if (is_atomic)
+			goto noreclaim;
+
+		if (try_to_free_mem_container_pages(mem, gfp_mask))
 			continue;
 
 		/*
@@ -338,9 +346,10 @@ int mem_container_charge(struct page *pa
 			congestion_wait(WRITE, HZ/10);
 			continue;
 		}
-
+noreclaim:
 		css_put(&mem->css);
-		mem_container_out_of_memory(mem);
+		if (!is_atomic)
+			mem_container_out_of_memory(mem);
 		goto free_pc;
 	}
 
@@ -381,7 +390,8 @@ err:
 /*
  * See if the cached pages should be charged at all?
  */
-int mem_container_cache_charge(struct page *page, struct mm_struct *mm)
+int mem_container_cache_charge(struct page *page, struct mm_struct *mm,
+				gfp_t gfp_mask)
 {
 	struct mem_container *mem;
 	if (!mm)
@@ -389,7 +399,7 @@ int mem_container_cache_charge(struct pa
 
 	mem = rcu_dereference(mm->mem_container);
 	if (mem->control_type == MEM_CONTAINER_TYPE_ALL)
-		return mem_container_charge(page, mm);
+		return mem_container_charge(page, mm, gfp_mask);
 	else
 		return 0;
 }
diff -puN mm/memory.c~memory-controller-make-charging-gfpmask-aware mm/memory.c
--- linux-2.6.23-rc4/mm/memory.c~memory-controller-make-charging-gfpmask-aware	2007-09-11 22:20:50.000000000 +0530
+++ linux-2.6.23-rc4-balbir/mm/memory.c	2007-09-11 22:54:09.000000000 +0530
@@ -1137,7 +1137,7 @@ static int insert_page(struct mm_struct 
 	pte_t *pte;
 	spinlock_t *ptl;
 
-	retval = mem_container_charge(page, mm);
+	retval = mem_container_charge(page, mm, GFP_KERNEL);
 	if (retval)
 		goto out;
 
@@ -1638,7 +1638,7 @@ gotten:
 		goto oom;
 	cow_user_page(new_page, old_page, address, vma);
 
-	if (mem_container_charge(new_page, mm))
+	if (mem_container_charge(new_page, mm, GFP_KERNEL))
 		goto oom_free_new;
 
 	/*
@@ -2101,7 +2101,7 @@ static int do_swap_page(struct mm_struct
 	}
 
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
-	if (mem_container_charge(page, mm)) {
+	if (mem_container_charge(page, mm, GFP_KERNEL)) {
 		ret = VM_FAULT_OOM;
 		goto out;
 	}
@@ -2185,7 +2185,7 @@ static int do_anonymous_page(struct mm_s
 	if (!page)
 		goto oom;
 
-	if (mem_container_charge(page, mm))
+	if (mem_container_charge(page, mm, GFP_KERNEL))
 		goto oom_free_page;
 
 	entry = mk_pte(page, vma->vm_page_prot);
@@ -2320,7 +2320,7 @@ static int __do_fault(struct mm_struct *
 
 	}
 
-	if (mem_container_charge(page, mm)) {
+	if (mem_container_charge(page, mm, GFP_KERNEL)) {
 		ret = VM_FAULT_OOM;
 		goto out;
 	}
diff -puN mm/filemap.c~memory-controller-make-charging-gfpmask-aware mm/filemap.c
--- linux-2.6.23-rc4/mm/filemap.c~memory-controller-make-charging-gfpmask-aware	2007-09-11 22:20:50.000000000 +0530
+++ linux-2.6.23-rc4-balbir/mm/filemap.c	2007-09-11 22:54:19.000000000 +0530
@@ -445,7 +445,7 @@ int add_to_page_cache(struct page *page,
 
 	if (error == 0) {
 
-		error = mem_container_cache_charge(page, current->mm);
+		error = mem_container_cache_charge(page, current->mm, gfp_mask);
 		if (error)
 			goto out;
 
diff -puN mm/migrate.c~memory-controller-make-charging-gfpmask-aware mm/migrate.c
--- linux-2.6.23-rc4/mm/migrate.c~memory-controller-make-charging-gfpmask-aware	2007-09-11 22:20:50.000000000 +0530
+++ linux-2.6.23-rc4-balbir/mm/migrate.c	2007-09-11 22:54:29.000000000 +0530
@@ -158,7 +158,7 @@ static void remove_migration_pte(struct 
  		return;
  	}
 
-	if (mem_container_charge(new, mm)) {
+	if (mem_container_charge(new, mm, GFP_KERNEL)) {
 		pte_unmap(ptep);
 		return;
 	}
diff -puN mm/page_alloc.c~memory-controller-make-charging-gfpmask-aware mm/page_alloc.c
diff -puN mm/rmap.c~memory-controller-make-charging-gfpmask-aware mm/rmap.c
diff -puN mm/swapfile.c~memory-controller-make-charging-gfpmask-aware mm/swapfile.c
--- linux-2.6.23-rc4/mm/swapfile.c~memory-controller-make-charging-gfpmask-aware	2007-09-11 22:20:50.000000000 +0530
+++ linux-2.6.23-rc4-balbir/mm/swapfile.c	2007-09-11 22:54:52.000000000 +0530
@@ -510,7 +510,7 @@ unsigned int count_swap_pages(int type, 
 static int unuse_pte(struct vm_area_struct *vma, pte_t *pte,
 		unsigned long addr, swp_entry_t entry, struct page *page)
 {
-	if (mem_container_charge(page, vma->vm_mm))
+	if (mem_container_charge(page, vma->vm_mm, GFP_KERNEL))
 		return -ENOMEM;
 
 	inc_mm_counter(vma->vm_mm, anon_rss);
diff -puN mm/swap_state.c~memory-controller-make-charging-gfpmask-aware mm/swap_state.c
--- linux-2.6.23-rc4/mm/swap_state.c~memory-controller-make-charging-gfpmask-aware	2007-09-11 22:20:50.000000000 +0530
+++ linux-2.6.23-rc4-balbir/mm/swap_state.c	2007-09-11 22:55:12.000000000 +0530
@@ -81,7 +81,7 @@ static int __add_to_swap_cache(struct pa
 	error = radix_tree_preload(gfp_mask);
 	if (!error) {
 
-		error = mem_container_cache_charge(page, current->mm);
+		error = mem_container_cache_charge(page, current->mm, gfp_mask);
 		if (error)
 			goto out;
 
diff -puN mm/vmscan.c~memory-controller-make-charging-gfpmask-aware mm/vmscan.c
--- linux-2.6.23-rc4/mm/vmscan.c~memory-controller-make-charging-gfpmask-aware	2007-09-11 22:20:50.000000000 +0530
+++ linux-2.6.23-rc4-balbir/mm/vmscan.c	2007-09-11 23:05:40.000000000 +0530
@@ -1357,10 +1357,11 @@ unsigned long try_to_free_pages(struct z
 #define ZONE_USERPAGES ZONE_NORMAL
 #endif
 
-unsigned long try_to_free_mem_container_pages(struct mem_container *mem_cont)
+unsigned long try_to_free_mem_container_pages(struct mem_container *mem_cont,
+						gfp_t gfp_mask)
 {
 	struct scan_control sc = {
-		.gfp_mask = GFP_KERNEL,
+		.gfp_mask = gfp_mask,
 		.may_writepage = !laptop_mode,
 		.may_swap = 1,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
diff -puN include/linux/swap.h~memory-controller-make-charging-gfpmask-aware include/linux/swap.h
--- linux-2.6.23-rc4/include/linux/swap.h~memory-controller-make-charging-gfpmask-aware	2007-09-12 00:11:37.000000000 +0530
+++ linux-2.6.23-rc4-balbir/include/linux/swap.h	2007-09-11 23:05:59.000000000 +0530
@@ -191,7 +191,8 @@ extern void swap_setup(void);
 /* linux/mm/vmscan.c *
...

 Topic: [PATCH 5/5][NFS] Use macro instead of explicit check for mandatory locks
[PATCH 5/5][NFS] Use macro instead of explicit check for mandatory locks [message #20120] Wed, 12 September 2007 07:28
Pavel Emelianov is currently offline Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
From: openvz.org
The __MANDATORY_LOCK(inode) macro makes the same check, but
makes the code more readable.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>

---

 fs/nfs/file.c |    3 +--
 1 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 73ddd2e..8dc2cde 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -605,8 +605,7 @@ static int nfs_lock(struct file *filp, i
 	nfs_inc_stats(inode, NFSIOS_VFSLOCK);
 
 	/* No mandatory locks over NFS */
-	if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
-	    fl->fl_type != F_UNLCK)
+	if (__MANDATORY_LOCK(inode) && fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
 	if (IS_GETLK(cmd))
 Topic: [PATCH 4/5][AFS] Use macro instead of explicit check for mandatory locks
[PATCH 4/5][AFS] Use macro instead of explicit check for mandatory locks [message #20119] Wed, 12 September 2007 07:27
Pavel Emelianov is currently offline Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
From: openvz.org
The __MANDATORY_LOCK(inode) macro makes the same check, but
makes the code more readable.

Unfortunately, I haven't found the maintainer for this FS in 
the MAINTAINERS file.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>

---

 fs/afs/flock.c |    3 +--
 1 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index af6952e..9ddac05 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -524,8 +524,7 @@ int afs_lock(struct file *file, int cmd,
 	       (long long) fl->fl_start, (long long) fl->fl_end);
 
 	/* AFS doesn't support mandatory locks */
-	if ((vnode->vfs_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
-	    fl->fl_type != F_UNLCK)
+	if (__MANDATORY_LOCK(vnode->vfs_inode) && fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
 	if (IS_GETLK(cmd))
 Topic: [PATCH 3/5][9PFS] Use macro instead of explicit check for mandatory locks
[PATCH 3/5][9PFS] Use macro instead of explicit check for mandatory locks [message #20118] Wed, 12 September 2007 07:23
Pavel Emelianov is currently offline Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
From: openvz.org
The __MANDATORY_LOCK(inode) macro makes the same check, but
makes the code more readable.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Eric Van Hensbergen <ericvh@gmail.com>
Cc: Ron Minnich <rminnich@sandia.gov>
Cc: Latchesar Ionkov <lucho@ionkov.net>

---

 fs/9p/vfs_file.c |    2 +-
 1 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 2a40c29..7e75309 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -105,7 +105,7 @@ static int v9fs_file_lock(struct file *f
 	P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
 
 	/* No mandatory locks */
-	if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+	if (__MANDATORY_LOCK(inode))
 		return -ENOLCK;
 
 	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
 Topic: [PATCH 2/5][GFS2] Use macro instead of explicit check for mandatory locks
[PATCH 2/5][GFS2] Use macro instead of explicit check for mandatory locks [message #20117] Wed, 12 September 2007 07:20
Pavel Emelianov is currently offline Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
From: openvz.org
The __MANDATORY_LOCK(inode) macro makes the same check, but
makes the code more readable.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Steven Whitehouse <swhiteho@redhat.com>

---

 fs/gfs2/ops_file.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 94d76ac..7e814f4 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -535,7 +535,7 @@ static int gfs2_lock(struct file *file, 
 
 	if (!(fl->fl_flags & FL_POSIX))
 		return -ENOLCK;
-	if ((ip->i_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+	if (__MANDATORY_LOCK(ip->i_inode))
 		return -ENOLCK;
 
 	if (sdp->sd_args.ar_localflocks) {
@@ -637,7 +637,7 @@ static int gfs2_flock(struct file *file,
 
 	if (!(fl->fl_flags & FL_FLOCK))
 		return -ENOLCK;
-	if ((ip->i_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+	if (__MANDATORY_LOCK(ip->i_inode))
 		return -ENOLCK;
 
 	if (sdp->sd_args.ar_localflocks)
 Topic: Couple of rpm requests, 2.6.22
Couple of rpm requests, 2.6.22 [message #20055] Tue, 11 September 2007 11:07
mhw is currently offline mhw
Messages: 12
Registered: March 2007
Junior Member
From: openvz.org
Hey all!

	Not sure if I should post this for discussion or just file these
requests in bugzilla.  I'll do both...  :-)

	I've got a couple of requests for some changes in the rpm builds, now
that development is on 2.6.22.

	2.6.22 includes MD5 tcp signatures which are necessary for IPv4
passwords on BGP sessions (quagga w/ password patch).  The default
builds for Fedora 2.6.22 builds include this option but it has not been
enabled in the OpenVZ builds.  Any chance of getting that slipped in for
the next release (which I understand should also fix the netfilter
problems?).

	I also noticed that SIT tunnels for IPv6 are not enabled in the build,
which is a standard default option for all RedHat / Fedora / CentOS
builds.

	The RedHat and Fedora builds include development rpm's.  I maintain a
couple of kernel modules and often build outside of the kernel, so I
need the development bits and pieces.  I'm curious why a "devel" package
isn't similarly provided with the 2.6.22 packages.

	These are sort of show stoppers for me which require I rebuild the rpms
from srpm for some of my systems, where the 2.6.22 kernel is required.

	Regards,
	Mike
-- 
Michael H. Warfield (AI4NB) | (770) 985-6132 |  mhw@WittsEnd.com
   /\/\|=mhw=|\/\/          | (678) 463-0932 |  http://www.wittsend.com/mhw/
   NIC whois: MHW9          | An optimist believes we live in the best of all
 PGP Key: 0xDF1DD471        | possible worlds.  A pessimist is sure of it!
 Topic: [PATCH 3/3] Signal semantics for pid namespaces
[PATCH 3/3] Signal semantics for pid namespaces [message #20026] Tue, 11 September 2007 00:12
Sukadev Bhattiprolu is currently offline Sukadev Bhattiprolu
Messages: 413
Registered: August 2006
Senior Member
From: openvz.org
From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Subject: [PATCH 3/3] Signal semantics for pid namespaces

With support for multiple pid namespaces, each pid namespace has a
separate child reaper and this process needs some special handling
of signals. 

	- The child reaper should appear like a normal process to other
	  processes in its ancestor namespaces and so should be killable
	  (or not) in the usual way.

	- The child reaper should receive, from processes in its active
	  and descendant namespaces, only those signals for which it has
	  installed a signal handler.

	- System-wide signals (eg: kill signum -1) from within a child namespace
	  should only affect processes within that namespace and descendant
	  namespaces. They should not be posted to processes in ancestor or
	  sibling namespaces.

	- If the sender of a signal does not have a pid_t in the receiver's
	  namespace (eg: a process in init_pid_ns sends a signal to a process
	  in a descendant namespace), the sender's pid and uid should appear
	  as 0 in the signal's 'siginfo' structure.

	- Existing rules for SIGIO delivery still apply and a process can
	  choose any other process in its namespace and descendant namespaces
	  to receive the SIGIO signal.
	  
	  The following appears to be incorrect in the fcntl() man page for
	  F_SETOWN.

              Sending a signal to  the  owner  process  (group)  specified  by
              F_SETOWN  is  subject  to  the  same  permissions  checks as are
              described for kill(2), where the sending process is the one that
              employs F_SETOWN (but see BUGS below).

	Current behavior is that the SIGIO signal is delivered on behalf of
	the process that caused the event (eg: made data available on the
	file) and not the process that called fcntl().

Changelog:
	- [Oleg Nesterov]: Used the interfaces, is_current_in_ancestor_pid_ns()
	  and is_current_in_same_or_ancestor_pid_ns().
	- [Oleg Nesterov]: Clear info.si_uid also when masquerading sender.

Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
---
 kernel/signal.c |   28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

Index: 2.6.23-rc4-mm1/kernel/signal.c
===================================================================
--- 2.6.23-rc4-mm1.orig/kernel/signal.c	2007-09-10 18:42:16.000000000 -0700
+++ 2.6.23-rc4-mm1/kernel/signal.c	2007-09-10 18:42:16.000000000 -0700
@@ -25,6 +25,7 @@
 #include <linux/capability.h>
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
+#include <linux/pid.h>
 #include <linux/nsproxy.h>
 #include <linux/hardirq.h>
 
@@ -45,7 +46,10 @@ static int sig_init_ignore(struct task_s
 
 	// Currently this check is a bit racy with exec(),
 	// we can _simplify_ de_thread and close the race.
-	if (likely(!is_global_init(tsk->group_leader)))
+	if (likely(!is_container_init(tsk->group_leader)))
+		return 0;
+
+	if (is_current_in_ancestor_pid_ns(tsk) && !in_interrupt())
 		return 0;
 
 	return 1;
@@ -681,6 +685,20 @@ static void handle_stop_signal(int sig, 
 	}
 }
 
+static void masquerade_sender(struct task_struct *t, struct sigqueue *q)
+{
+	/*
+	 * If the sender does not have a pid_t in the receiver's active
+	 * pid namespace, set si_pid to 0 and pretend signal originated
+	 * from the kernel.
+	 */
+	if (!pid_ns_equal(t)) {
+		q->info.si_pid = 0;
+		q->info.si_uid = 0;
+		q->info.si_code = SI_KERNEL;
+	}
+}
+
 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			struct sigpending *signals)
 {
@@ -732,6 +750,7 @@ static int send_signal(int sig, struct s
 			copy_siginfo(&q->info, info);
 			break;
 		}
+		masquerade_sender(t, q);
 	} else if (!is_si_special(info)) {
 		if (sig >= SIGRTMIN && info->si_code != SI_USER)
 		/*
@@ -1165,6 +1184,7 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
 static int kill_something_info(int sig, struct siginfo *info, int pid)
 {
 	int ret;
+
 	rcu_read_lock();
 	if (!pid) {
 		ret = kill_pgrp_info(sig, info, task_pgrp(current));
@@ -1174,6 +1194,12 @@ static int kill_something_info(int sig, 
 
 		read_lock(&tasklist_lock);
 		for_each_process(p) {
+			/*
+			 * System-wide signals only apply to pid namespace
+			 * of sender.
+			 */
+			if (!is_current_in_same_or_ancestor_pid_ns(p))
+				continue;
 			if (p->pid > 1 && !same_thread_group(p, current)) {
 				int err = group_send_sig_info(sig, info, p);
 				++count;
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH 2/3] Pid ns helpers for signals
[PATCH 2/3] Pid ns helpers for signals [message #20025] Tue, 11 September 2007 00:11
Sukadev Bhattiprolu is currently offline Sukadev Bhattiprolu
Messages: 413
Registered: August 2006
Senior Member
From: openvz.org
Define some helper functions that will be used to implement signal semantics
with multiple pid namespaces. 

	is_current_in_ancestor_pid_ns(task)

		TRUE iff active pid namespace of 'current' is an ancestor of
		active pid namespace of @task.

	is_current_in_same_or_ancestor_pid_ns(task)

		TRUE iff active pid namespace of 'current' is either same as 
		or an ancestor of active pid namespace of @task.

	pid_ns_equal(tsk)
		TRUE if active pid ns of @tsk is same as active pid ns of
		'current'.

Changelog: [Oleg Nesterov]: Renamed helpers. Dropped reference to pid and
			    pid-namespace since they are stable for current
			    callers.

---
 include/linux/pid.h |   12 +++++++++
 kernel/pid.c        |   63 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+)

Index: 2.6.23-rc4-mm1/include/linux/pid.h
===================================================================
--- 2.6.23-rc4-mm1.orig/include/linux/pid.h	2007-09-07 19:18:34.000000000 -0700
+++ 2.6.23-rc4-mm1/include/linux/pid.h	2007-09-07 19:18:42.000000000 -0700
@@ -124,6 +124,18 @@ extern struct pid *alloc_pid(struct pid_
 extern void FASTCALL(free_pid(struct pid *pid));
 extern void zap_pid_ns_processes(struct pid_namespace *pid_ns);
 
+static inline struct pid_namespace *pid_active_ns(struct pid *pid)
+{
+	if (!pid)
+		return NULL;
+
+	return pid->numbers[pid->level].ns;
+}
+
+extern int pid_ns_equal(struct task_struct *tsk);
+extern int is_current_in_ancestor_pid_ns(struct task_struct *tsk);
+extern int is_current_in_same_or_ancestor_pid_ns(struct task_struct *tsk);
+
 /*
  * the helpers to get the pid's id seen from different namespaces
  *
Index: 2.6.23-rc4-mm1/kernel/pid.c
===================================================================
--- 2.6.23-rc4-mm1.orig/kernel/pid.c	2007-09-07 19:18:34.000000000 -0700
+++ 2.6.23-rc4-mm1/kernel/pid.c	2007-09-10 18:35:51.000000000 -0700
@@ -199,6 +199,69 @@ static int next_pidmap(struct pid_namesp
 	return -1;
 }
 
+/*
+ * Return TRUE if the active pid namespace of @tsk is same as active
+ * pid namespace of 'current'.
+ */
+int pid_ns_equal(struct task_struct *tsk)
+{
+	struct pid_namespace *my_ns = pid_active_ns(task_pid(current));
+	struct pid_namespace *tsk_ns = pid_active_ns(task_pid(tsk));
+
+	return my_ns == tsk_ns;
+}
+
+/*
+ * Return TRUE if pid namespace @ns1 is an ancestor of pid namespace @ns2.
+ * Return FALSE otherwise.
+ *
+ * Note: Callers must ensure @ns1 and @ns2 are stable.
+ */
+static int ancestor_pid_ns(struct pid_namespace *ns1, struct pid_namespace *ns2)
+{
+	int i;
+	struct pid_namespace *tmp;
+
+	if (ns1 == NULL || ns2 == NULL)
+		return 0;
+
+	if (ns1->level >= ns2->level)
+		return 0;
+
+	tmp = ns2->parent;
+	for (i = tmp->level; i >= ns1->level; i--) {
+		if (tmp == ns1)
+			return 1;
+		tmp = tmp->parent;
+	}
+
+	return 0;
+}
+
+/*
+ * Return TRUE if active pid namespace of 'current' is an ancestor of
+ * pid namespace of @tsk. Return FALSE otherwise.
+ */
+int is_current_in_ancestor_pid_ns(struct task_struct *tsk)
+{
+	struct pid_namespace *my_ns = pid_active_ns(task_pid(current));
+	struct pid_namespace *tsk_ns = pid_active_ns(task_pid(tsk));
+
+	return ancestor_pid_ns(my_ns, tsk_ns);
+}
+
+/*
+ * Return TRUE if active pid namespace of 'current' is either same as
+ * or an ancestor of active pid namespace of @tsk.
+ */
+int is_current_in_same_or_ancestor_pid_ns(struct task_struct *tsk)
+{
+	struct pid_namespace *my_ns = pid_active_ns(task_pid(current));
+	struct pid_namespace *tsk_ns = pid_active_ns(task_pid(tsk));
+
+	return my_ns == tsk_ns || ancestor_pid_ns(my_ns, tsk_ns);
+}
+
 fastcall void put_pid(struct pid *pid)
 {
 	struct pid_namespace *ns;
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH] Containers: Fix refcount bug
[PATCH] Containers: Fix refcount bug [message #20010] Mon, 10 September 2007 17:51
menage is currently offline menage
Messages: 5
Registered: August 2007
Junior Member
From: openvz.org
Fix a reference counting bug in containerfs

As part of the extraction of cpusetfs to containerfs, a call to
cpuset_get_dentry() was lost (justified by the fact that the dentry in
question was now being passed down by the caller). Since
cpuset_get_dentry() called lookup_one_len(), this resulted in a
reference count being missed from the directory dentry.

This patch removes container_get_dentry() and replaces it with direct
calls to lookup_one_len(); the initialization of containerfs dentry
ops is done now in container_create_file() at dentry creation time.

Signed-off-by: Paul Menage <menage@google.com>

---
 kernel/container.c |   26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

Index: container-2.6.23-rc3-mm1/kernel/container.c
===================================================================
--- container-2.6.23-rc3-mm1.orig/kernel/container.c
+++ container-2.6.23-rc3-mm1/kernel/container.c
@@ -603,19 +603,6 @@ static void container_diput(struct dentr
 	iput(inode);
 }
 
-static struct dentry *container_get_dentry(struct dentry *parent,
-					   const char *name)
-{
-	struct dentry *d = lookup_one_len(name, parent, strlen(name));
-	static struct dentry_operations container_dops = {
-		.d_iput = container_diput,
-	};
-
-	if (!IS_ERR(d))
-		d->d_op = &container_dops;
-	return d;
-}
-
 static void remove_dir(struct dentry *d)
 {
 	struct dentry *parent = dget(d->d_parent);
@@ -1506,6 +1493,10 @@ static struct inode_operations container
 static int container_create_file(struct dentry *dentry, int mode,
 				struct super_block *sb)
 {
+	static struct dentry_operations container_dops = {
+		.d_iput = container_diput,
+	};
+
 	struct inode *inode;
 
 	if (!dentry)
@@ -1531,7 +1522,7 @@ static int container_create_file(struct 
 		inode->i_size = 0;
 		inode->i_fop = &container_file_operations;
 	}
-
+	dentry->d_op = &container_dops;
 	d_instantiate(dentry, inode);
 	dget(dentry);	/* Extra count - pin the dentry in core */
 	return 0;
@@ -1552,13 +1543,12 @@ static int container_create_dir(struct c
 	int error = 0;
 
 	parent = cont->parent->dentry;
-	if (IS_ERR(dentry))
-		return PTR_ERR(dentry);
 	error = container_create_file(dentry, S_IFDIR | mode, cont->root->sb);
 	if (!error) {
 		dentry->d_fsdata = cont;
 		inc_nlink(parent->d_inode);
 		cont->dentry = dentry;
+		dget(dentry);
 	}
 	dput(dentry);
 
@@ -1580,7 +1570,7 @@ int container_add_file(struct container 
 	}
 	strcat(name, cft->name);
 	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
-	dentry = container_get_dentry(dir, name);
+	dentry = lookup_one_len(name, dir, strlen(name));
 	if (!IS_ERR(dentry)) {
 		error = container_create_file(dentry, 0644 | S_IFREG,
 						cont->root->sb);
@@ -2586,7 +2576,7 @@ int container_clone(struct task_struct *
 	/* Hold the parent directory mutex across this operation to
 	 * stop anyone else deleting the new container */
 	mutex_lock(&inode->i_mutex);
-	dentry = container_get_dentry(parent->dentry, nodename);
+	dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
 	if (IS_ERR(dentry)) {
 		printk(KERN_INFO
 		       "Couldn't allocate dentry for %s: %ld\n", nodename,


_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: pid namespace .text overhead
pid namespace .text overhead [message #20027] Mon, 10 September 2007 15:38
Cedric Le Goater is currently offline Cedric Le Goater
Messages: 443
Registered: February 2006
Senior Member
From: openvz.org
FYI,

I just did a compile test on a 2.6.23-rc4-mm1 kernel with and without 
the following patches on a x86_64 defconfig  (I also had to remove 
CONFIG_IPV6 for some compile reason) :

+ pid-namespaces-rework-forget_original_parent.patch
+ pid-namespaces-move-exit_task_namespaces.patch
+ pid-namespaces-introduce-ms_kernmount-flag.patch
+ pid-namespaces-prepare-proc_flust_task-to-flush-entries-from-multiple-proc-trees.patch
+ pid-namespaces-introduce-struct-upid.patch
+ pid-namespaces-add-support-for-pid-namespaces-hierarchy.patch
+ pid-namespaces-make-alloc_pid-free_pid-and-put_pid-work-with-struct-upid.patch
+ pid-namespaces-helpers-to-obtain-pid-numbers.patch
+ pid-namespaces-helpers-to-find-the-task-by-its-numerical-ids.patch
+ pid-namespaces-helpers-to-find-the-task-by-its-numerical-ids-fix.patch
+ pid-namespaces-move-alloc_pid-lower-in-copy_process.patch
+ pid-namespaces-make-proc-have-multiple-superblocks-one-for-each-namespace.patch
+ pid-namespaces-miscelaneous-preparations-for-pid-namespaces.patch
+ pid-namespaces-allow-cloning-of-new-namespace.patch
+ pid-namespaces-allow-cloning-of-new-namespace-fix-check-for-return-value-of-create_pid_namespace.patch
+ pid-namespaces-make-proc_flush_task-actually-from-entries-from-multiple-namespaces.patch
+ pid-namespaces-initialize-the-namespaces-proc_mnt.patch
+ pid-namespaces-create-a-slab-cache-for-struct-pid_namespace.patch
+ pid-namespaces-allow-signalling-container-init.patch
+ pid-namespaces-destroy-pid-namespace-on-inits-death.patch
+ pid-namespaces-changes-to-show-virtual-ids-to-user.patch
+ pid-namespaces-changes-to-show-virtual-ids-to-user-fix-the-return-value-of-sys_set_tid_address.patch
+ pid-namespaces-changes-to-show-virtual-ids-to-user-use-find_task_by_pid_ns-in-places-that-operate-with-virtual.patch
+ pid-namespaces-changes-to-show-virtual-ids-to-user-use-find_task_by_pid_ns-in-places-that-operate-with-virtual-fix.patch
+ pid-namespaces-changes-to-show-virtual-ids-to-user-use-find_task_by_pid_ns-in-places-that-operate-with-virtual-fix-2.patch
+ pid-namespaces-changes-to-show-virtual-ids-to-user-use-find_task_by_pid_ns-in-places-that-operate-with-virtual-fix-3.patch
+ pid-namespaces-changes-to-show-virtual-ids-to-user-sys_getsid-sys_getpgid-return-wrong-id-for-task-from-another.patch
+ pid-namespaces-changes-to-show-virtual-ids-to-user-fix-the-sys_setpgrp-to-work-between-namespaces.patch
+ pid-namespaces-changes-to-show-virtual-ids-to-user-fix.patch
+ pid-namespaces-remove-the-struct-pid-unneeded-fields.patch
+ isolate-some-explicit-usage-of-task-tgid.patch
+ isolate-some-explicit-usage-of-task-tgid-fix.patch
+ isolate-some-explicit-usage-of-task-tgid-fix-fix.patch

I got a less than *6k* difference in .text. Here are the detailed size 
results.

size without:
   text    data     bss     dec     hex filename
5067718  892082  725544 6685344  6602a0 vmlinux

size with:
   text    data     bss     dec     hex filename
5073314  892210  725544 6691068  6618fc vmlinux


section                   without      with
.text                     3383305   3388569
__ex_table                  16704     16704
__bug_table                 41136     41208
.rodata                   1276129   1276469
.pci_fixup                   4032      4032
__ksymtab                   43984     43984
__ksymtab_gpl               13024     13056
__ksymtab_gpl_future           48        48
__ksymtab_strings           83388     83420
__param                      7800      7800
.data                      482928    483056
.data.cacheline_aligned    187008    187008
.data.read_mostly           41600     41600
.vsyscall_0                   227       227
.vsyscall_fn                   54        54
.vsyscall_gtod_data            80        80
.vsyscall_1                    52        52
.vsyscall_2                    91        91
.vgetcpu_mode                   4         4
.jiffies                        8         8
.vsyscall_3                     8         8
.data.init_task              8192      8192
.data.page_aligned           4096      4096
.smp_locks                  33584     33624
.init.text                 153713    153529
.init.data                 137714    137714
.init.setup                  3168      3168
.initcall.init               2200      2200
.con_initcall.init             16        16
.altinstructions             1243      1243
.altinstr_replacement         244       244
.exit.text                   4739      4739
.note                          24        24
.vdso                        4056      4056
.init.ramfs                   133       133
.data.percpu                25064     25064
.data_nosave                    4         4
.bss                       725544    725544
.comment                    44352     44352
Total                     6729696   6735420


kernel/pid.o .text increases a lot (by more than 1K), but some inlines
would also need to be checked (which I haven't done yet).

Cheers,

C.
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: Re: [PATCH] Hookup group-scheduler with task container infrastructure
Re: [PATCH] Hookup group-scheduler with task container infrastructure [message #20022] Mon, 10 September 2007 13:42
Jan Engelhardt is currently offline Jan Engelhardt
Messages: 18
Registered: August 2006
Junior Member
From: openvz.org
On Sep 10 2007 22:58, Srivatsa Vaddagiri wrote:
>On Mon, Sep 10, 2007 at 10:53:34PM +0530, Srivatsa Vaddagiri wrote:
>> > cpuctl, cpuctrl, cpu_controller?
>> 
>> *shrug* .. I used "cpuctlr" to mean "CPU Controller". Any other short names
>> would do. From your list, cpuctl or cpuctrl both qualifies IMO!
>> 
>> Unless folks have strong objection to it, I prefer "cptctlr", the way it is.
>
>s/cptctlr/cpuctlr !

Captain Controller to the rescue!
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH] Leases can be hidden by flocks
[PATCH] Leases can be hidden by flocks [message #16517] Mon, 10 September 2007 10:16
Pavel Emelianov is currently offline Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
From: openvz.org
The inode->i_flock list contains the leases, flocks and posix
locks in the specified order. However, the flocks are added in
the head of this list thus hiding the leases from F_GETLEASE
command, from time_out_leases() and other code that expects
the leases to come first.

The following example will demonstrate this:

#define _GNU_SOURCE

#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/file.h>

static void show_lease(int fd)
{
        int res;

        res = fcntl(fd, F_GETLEASE);
        switch (res) {
                case F_RDLCK:
                        printf("Read lease\n");
                        break;
                case F_WRLCK:
                        printf("Write lease\n");
                        break;
                case F_UNLCK:
                        printf("No leases\n");
                        break;
                default:
                        printf("Some shit\n");
                        break;
        }
}

int main(int argc, char **argv)
{
        int fd, res;

        fd = open(argv[1], O_RDONLY);
        if (fd == -1) {
                perror("Can't open file");
                return 1;
        }

        res = fcntl(fd, F_SETLEASE, F_WRLCK);
        if (res == -1) {
                perror("Can't set lease");
                return 1;
        }

        show_lease(fd);

        if (flock(fd, LOCK_SH) == -1) {
                perror("Can't flock shared");
                return 1;
        }

        show_lease(fd);

        return 0;
}

The first call to show_lease() will show the write lease set, but
the second will show no leases.

Fix the flock adding so that the leases always stay in the head
of this list.

Found during making the flocks pid-namespaces aware.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>

---

diff --git a/fs/locks.c b/fs/locks.c
index 6068f82..0db1a14 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -781,7 +781,7 @@ find_conflict:
 	if (request->fl_flags & FL_ACCESS)
 		goto out;
 	locks_copy_lock(new_fl, request);
-	locks_insert_lock(&inode->i_flock, new_fl);
+	locks_insert_lock(before, new_fl);
 	new_fl = NULL;
 	error = 0;
 Topic: ks and mini-summit documents
ks and mini-summit documents [message #19956] Thu, 06 September 2007 10:14
Cedric Le Goater is currently offline Cedric Le Goater
Messages: 443
Registered: February 2006
Senior Member
From: openvz.org
All,

I've gathered the ks and mini-summit documents here :

	http://lxc.sourceforge.net/doc/

Cheers,

C.
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: Re: [PATCH] Send quota messages via netlink
Re: [PATCH] Send quota messages via netlink [message #19931] Wed, 05 September 2007 10:28
serue is currently offline serue
Messages: 750
Registered: February 2006
Senior Member
From: openvz.org
Quoting Jan Kara (jack@suse.cz):
> On Tue 04-09-07 18:48:52, Serge E. Hallyn wrote:
> > Quoting Jan Kara (jack@suse.cz):
> > > On Tue 04-09-07 16:32:10, Serge E. Hallyn wrote:
> > > > Quoting Jan Kara (jack@suse.cz):
> > > > > On Thu 30-08-07 17:14:47, Serge E. Hallyn wrote:
> > > > > > Quoting Jan Kara (jack@suse.cz):
> > > > > > >   I imagine it so that you have a machine and on it several virtual
> > > > > > > machines which are sharing a filesystem (or it could be a cluster). Now you
> > > > > > > want UIDs to be independent between these virtual machines. That's it,
> > > > > > > right?
> > > > > > >   Now to continue the example: Alice has UID 100 on machineA, Bob has
> > > > > > >  UID 100 on machineB. These translate to UIDs 1000 and 1001 on the common
> > > > > > > filesystem. Process of Alice writes to a file and Bob becomes to be over
> > > > > > > quota. In this situation, there would be probably two processes (from
> > > > > > > machineA and machineB) listening on the netlink socket. We want to send a
> > > > > > > message so that on Alice's desktop we can show a message: "You caused
> > > > > > > Bob to exceed his quotas" and of Bob's desktop: "Alice has caused that you
> > > > > > > are over quota.".
> > > > > > 
> > > > > > Since this is over NFS, you handle it the way you would any other time
> > > > > > that user Alice on some other machine managed to do this.
> > > > >   I meant this would actually happen over a local filesystem (imagine
> > > > > something like "hostfs" from UML).
> > > > 
> > > > Ok, then that is where I was previously suggesting that we use an api to
> > > > report a uid meaningful in bob's context, where we currently (in the
> > > > absence of meaningful mount uids and uid equivalence) tell Bob that root
> > > > was the one who brought him over quota.  From a user pov 'nobody' would
> > > > make more sense, but I don't think we want the kernel to know about user
> > > > nobody, right?
> > >   But what is the problem with using the filesystem ids? All virtual
> > > machines in my example should have a notion of those...
> > 
> > I don't know what you mean by filesystem ids.  Do you mean the uid
> > stored on the fs?  I imagine a network fs could get fancy and store
> > something more detailed than the unix uid, based on the user's keys.
> > 
> > Do you mean the inode->i_uid?  Nothing wrong with that.  Then we just
> > assume that either you are in the superblock or mount's user namespace
> > (depending on how we implement it, probably superblock), or can figure
> > out what that is.
>   I meant the identity the process uses to access the filesystem (to
> identify the user who caused the limit excess) and also the identity stored
> in the quota file (to identify whose quota was exceeded).
>   Anyway, any identity more complicated than just a number needs changes in
> both quota file format and filesystems so at that moment, we can also
> change the netlink interface...
> 
> > Sure, and in many ways.  But if working with NFS, as far as I know the
> > most common way to solve it is to enforce a common /etc/passwd across
> > all the valid NFS clients  :)
>   Then one wonders whether user namespaces are really what users want ;).

Absolutely.

You use nfs to share filesystems among separate machines that you want
to have look similar.

You use user namespaces to pretend one machine is a bunch of separate
machines.  So if you're just going to split up your machine into 5
vms and then have them all share disk over nfs, you may just want to
keep it as one machine :)

Ideally each vm would have completely separate disk space, so file
access across user namespaces wouldn't happen.  More realistically,
file trees will be shared read-only - i.e. /lib, /usr, etc.  Some of
that can be handled simply using read-only bind mounts.  We'd like
to allow users to create vm's as well, so then we want uid 500 in
the initial user namespace to be uid 0 in a newly created user
namespace.

So what Eric and I are worried about are corner cases and admin
mistakes, not regular function.

(And again I really do think we'll want to tie netlink sockets to a user
namespace, not a network namespace, so there may be no issue at all
so long as proper filesystem access checks are implemented so that every
action on some filesystem is done with credentials valid in that
filesystem's user namespace)

-serge
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: Containers status update
Containers status update [message #19901] Mon, 03 September 2007 04:36
Pavel Emelianov is currently offline Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
From: openvz.org
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: containers - bug
containers - bug [message #19874] Fri, 31 August 2007 12:29
Daniel Lezcano is currently offline Daniel Lezcano
Messages: 417
Registered: June 2006
Senior Member
From: openvz.org
Hi Paul,

I was playing with the container filesystem (very nice) and I ran
into a kernel bug.

I did the following:

	mkdir /dev/container

	mount -t container -o cpuset cpuset /dev/container

	cd /dev/container/

	mkdir Charlie

	cd Charlie

	echo $$ > tasks

	bash

	cd ..

	rmdir Charlie

	exit

	ls => bang !

I run a 2.6.23-rc3-mm1 kernel with qemu.

Here is the message:
--------------------
	
BUG: unable to handle kernel paging request at virtual address 6b6b6c23
printing eip: c016719d *pde = 00000000
Oops: 0000 [#1] SMP
Modules linked in:

Pid: 960, comm: bash Not tainted (2.6.23-rc3-mm1 #522)
EIP: 0060:[<c016719d>] EFLAGS: 00000246 CPU: 0
EIP is at __link_path_walk+0x41/0xb54
EAX: c1873320 EBX: c141c608 ECX: c167be38 EDX: c15de000
ESI: c167bef4 EDI: 6b6b6b6b EBP: c167be3c ESP: c167bde8
  DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
Process bash (pid: 960, ti=c167a000 task=c1466b50 task.ti=c167a000)
Stack: c15de000 00000000 00000400 00000001 000000bf 00008a31 7f1c0300 
01000415
        1a131100 170f1200 00000000 00009600 00009600 00000500 00000003 
c03ab570
        00000082 c167be38 c141c608 c167bef4 c1873320 c167bea8 c0167cfb 
c15de000
Call Trace:
  [<c010398d>] show_trace_log_lvl+0x1a/0x2f
  [<c0103a3f>] show_stack_log_lvl+0x9d/0xa5
  [<c0103aee>] show_registers+0xa7/0x178
  [<c0103cc6>] die+0x107/0x227
  [<c011447c>] do_page_fault+0x47f/0x567
  [<c030800a>] error_code+0x72/0x78
  [<c0167cfb>] link_path_walk+0x4b/0xc0
  [<c0167d89>] path_walk+0x19/0x1b
  [<c0168057>] do_path_lookup+0x179/0x193
  [<c0168823>] __user_walk_fd+0x32/0x49
  [<c0162a09>] vfs_stat_fd+0x1b/0x41
  [<c0162ade>] vfs_stat+0x11/0x13
  [<c0162af4>] sys_stat64+0x14/0x28
  [<c0102a5a>] syscall_call+0x7/0xb
  =======================
Code: ff 45 ac 8b 55 ac 8a 02 3c 2f 74 f4 84 c0 0f 84 99 0a 00 00 8b 06 
83 7e 1c 00 8b 78 18 74 08 83 65 b8 04 83 4d b8 01 83 4e 14 04 <8b> 87 
b8 00 00 00 0f b7 5f 6e 85 c0 74 0a 83 78 34 00 0f 85 b7
EIP: [<c016719d>] __link_path_walk+0x41/0xb54 SS:ESP 0068:c167bde8
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [ANNOUNCE] The Linux Test Project has been Released for AUGUST 2007
[ANNOUNCE] The Linux Test Project has been Released for AUGUST 2007 [message #19862] Fri, 31 August 2007 01:42
Subrata Modak is currently offline Subrata Modak
Messages: 16
Registered: August 2007
Junior Member
From: openvz.org
*Dear All*,

The Linux Test Project test suite has been released for the month of 
AUGUST 2007. The latest version of the test-suite contains 3000+ tests 
for the Linux OS and can be found at http://ltp.sourceforge.net/. Latest 
happenings in LTP can also be found at:
http://ltp.sourceforge.net/wiki/, and
http://ltp.sourceforge.net/wikiArchives.php.

Our web site also contains other information such as:
- A Linux test tools matrix
- Technical papers
- How To's on Linux testing
- Code coverage analysis tool.

*Release Highlights:*
 * Integration of NUMA testcases
 * Releases of RHEL5 LSPP Certification Test suite
 * Release of LTP-KDUMP Test-Case Plan
 * Release of Containers Testcases Plan
 * Release of GCOV-Kernel & LCOV Packages
 * Merging of OPEN_HPI_TESTSUITE-2.9.3 to LTP
 * Addition of a new Test Case 'swapon03'
 * Many more Bug Fixes and Patches

*Note(s) from the Maintainer:*
You will see couple of new test-cases in LTP in coming months and also 
broken testcases getting fixed slowly. We want to revisit all testcases 
and hence the time taken is long. From every release onwards you will 
also find results of LTP-RUNALL on different Architectures (_*ia64, 
x86_64, i386, PPC64, s390x*_) on varied Kernels & Distros. This will 
give you more insight into the behaviour of LTP across different platforms.

We encourage the community to post results to ltp-results@lists.sf.net, 
and patches, new tests, bugs or comments/questions to ltp-list@lists.sf.net,
http://sourceforge.net/tracker/?func=add&group_id=3382&atid=103382 (for 
New Bug(s)),
http://sourceforge.net/tracker/?func=add&group_id=3382&atid=303382 (for 
New Patch(s)),
http://sourceforge.net/tracker/?func=add&group_id=3382&atid=353382 (for 
New Feature Request(s))

Please also see the ChangeLog Attached (AUGUST 2007):

Happy testing,
Regards--
Subrata Modak,


1)  Log Message: "rsalveti@br.ibm.com" fixed write05 that failed with LTP 20070331 on Fedora 7 GA
File(s) Affected:
ltp/testcases/kernel/syscalls/write/write05.c

2)  Log Message: "dmarlin@redhat.com" corrected fail message in data_space testcase
File(s) Affected:
ltp/testcases/kernel/mem/vmtests/data_space.c

3)  Log Message: "liudeyan@cn.ibm.com" made mmap1 to be terminated by Ctrl-C
File(s) Affected:
ltp/testcases/kernel/mem/mtest06/mmap1.c

4)  Log Message: "suzuki@in.ibm.com" fixed times03, where it failed to generate report on user time on RHEl5.1 early build(2.6.18-32.el5)
File(s) Affected: ltp/testcases/kernel/syscalls/times/times03.c

5)  Log Message: gcov-kernel: added eabi-compatibility patch, renamed .diff to .patch by "oberpapr@users.sourceforge.net"
File(s) Added:
ltp/utils/analysis/gcov-kernel/linux-2.6.21-gcov-arm-eabi.patch 
ltp/utils/analysis/gcov-kernel/linux-2.6.22-gcov-arm-eabi.patch
File(s) Deleted:
ltp/utils/analysis/gcov-kernel/linux-2.6.21-gcov-arm-eabi.diff

6)  Log Message: lcov: fixed spec file
File(s) Affected:
ltp/utils/analysis/lcov/rpm/lcov.spec

7)  Log Message: gcov-kernel: removed outdated FAQ entry
File(s) Affected:
ltp/utils/analysis/gcov-kernel/FAQ

8)  Log Message: gcov-kernel: added Makefile
File(s) Added: ltp/utils/analysis/gcov-kernel/Makefile

9)  Log Message: lcov: Makefile for release 1.6
File(s) Affected:
/cvsroot/ltp/utils/analysis/lcov/Makefile

10) Log Message: lcov: Makefile for post-release
File(s) Affected:
/cvsroot/ltp/utils/analysis/lcov/Makefile

11) Log Message: lcov: add experimental option "--norecursion"
File(s) Affected:
/cvsroot/ltp/utils/analysis/lcov/bin/geninfo
/cvsroot/ltp/utils/analysis/lcov/bin/lcov

12) Log Message: Changes to make testcases/kernel/numa/numa01.sh executable
File(s) Affected:
ltp/testcases/kernel/numa/Makefile

13) Log Message: "carmelo.amoroso@st.com" changed the Default values for MAXSIZE and csize
File(s) Affected:
ltp/testcases/kernel/mem/vmtests/stack_space.c

14) Log Message: "brenohl@br.ibm.com" wanted to handle file descriptors properly 
File(s) Affected:
ltp/testcases/kernel/syscalls/mkdir/mkdir03.c
ltp/testcases/kernel/syscalls/mmap/mmap09.c
ltp/testcases/kernel/syscalls/open/open07.c
ltp/testcases/kernel/syscalls/sendfile/sendfile03.c

15) Log Message: "rsalvetidev@linux.vnet.ibm.com" says that this version can handle when the distro has MAX_SWAPFILES as 30 or 32
File(s) Modified:
ltp/runtest/ltplite ltp/runtest/stress.part3
ltp/runtest/syscalls
ltp/testcases/kernel/syscalls/swapon/swapon02.c
File(s) Added:
ltp/testcases/kernel/syscalls/swapon/swapon03.c

16) Log Message: Containers Testcases Plan
File(s) Added:
ltp/testcases/kernel/containers/TEST_PLAN.txt

17) Log Message: LTP-KDUMP Test-Case Plan
File(s) Added:
ltp/testcases/kdump/TEST_PLAN.txt

18) Log Message: Update to OpenHPI 2.9.3 (www.openhpi.org for more info)
File(s) Affected:
ltp/testcases/open_hpi_testsuite/
 


_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: Re: [PATCH] Use task_pid_nr() in ip_vs_sync.c
Re: [PATCH] Use task_pid_nr() in ip_vs_sync.c [message #19808] Wed, 29 August 2007 17:50
Sukadev Bhattiprolu is currently offline Sukadev Bhattiprolu
Messages: 413
Registered: August 2006
Senior Member
From: openvz.org
Pavel Emelianov [xemul@openvz.org] wrote:
| The sync_master_pid and sync_backup_pid are set in set_sync_pid()
| and are used later for set/not-set checks and in printk. So it
| is safe to use the global pid value in this case.
| 
| Signed-off-by: Pavel Emelyanov <xemul@openvz.org>

Acked-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
| 
| ---
| 
| diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
| index 959c08d..d0798a5 100644
| --- a/net/ipv4/ipvs/ip_vs_sync.c
| +++ b/net/ipv4/ipvs/ip_vs_sync.c
| @@ -794,7 +794,7 @@ static int sync_thread(void *startup)
| 
| 	add_wait_queue(&sync_wait, &wait);
| 
| -	set_sync_pid(state, current->pid);
| +	set_sync_pid(state, task_pid_nr(current));
| 	complete(tinfo->startup);
| 
| 	/*
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: Re: [PATCH] Use same_thread_group() in signalfd.c
Re: [PATCH] Use same_thread_group() in signalfd.c [message #19804] Wed, 29 August 2007 17:18
Sukadev Bhattiprolu is currently offline Sukadev Bhattiprolu
Messages: 413
Registered: August 2006
Senior Member
From: openvz.org
Pavel Emelianov [xemul@openvz.org] wrote:
| This is a lost hunk of previous patch that isolated the
| explicit usage of task->tgid in some places. The signalfd
| code uses the tsk->tgid comparison.
| 
| Signed-off-by: Pavel Emelyanov <xemul@openvz.org>

Acked-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>

| 
| ---
| 
| diff --git a/fs/signalfd.c b/fs/signalfd.c
| index a8e293d..5bfd2c5 100644
| --- a/fs/signalfd.c
| +++ b/fs/signalfd.c
| @@ -64,7 +64,7 @@ static int signalfd_lock(struct signalfd
| 		return 0;
| 	}
| 
| -	if (lk->tsk->tgid == current->tgid)
| +	if (same_thread_group(lk->tsk, current))
| 		lk->tsk = current;
| 
| 	return 1;
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: Re: [-mm PATCH] Memory controller improve user interface
Re: [-mm PATCH] Memory controller improve user interface [message #19799] Wed, 29 August 2007 12:17
Paul Menage is currently offline Paul Menage
Messages: 642
Registered: September 2006
Senior Member
From: openvz.org
On 8/29/07, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> >
> > This seems a bit inconsistent - if you write a value to a limit file,
> > then the value that you read back is reduced by a factor of 1024?
> > Having the "(kB)" suffix isn't really a big help to automated
> > middleware.
> >
>
> Why is that? Is it because you could write 4M and see it show up
> as 4096 kilobytes? Well, that can be fixed with another variant
> of the memparse() utility.

I was thinking the other way around - you can write 1048576 (i.e. 1MB)
to the file and read back 1024. It just seems to me that it's clearer
if you write X to the file to get X back.

>
> 64 bit might be an overkill for 32 bit machines. 32 bit machines with
> PAE cannot use 32 bit values, they need 64 bits.

How is using a 64-bit value for consistency overkill?

As someone pointed out, 4TB machines probably aren't that far around
the corner (if they're not here already) so even if you use KB rather
than bytes, userspace needs to be using an int64 for this value in
case it ends up running as a 32-bit-compiled app on a 64-bit kernel
with lots of memory.

Paul
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: Re: [PATCH] Send quota messages via netlink
Re: [PATCH] Send quota messages via netlink [message #19798] Wed, 29 August 2007 11:57
Randy Dunlap is currently offline Randy Dunlap
Messages: 25
Registered: April 2007
Junior Member
From: openvz.org
On Wed, 29 Aug 2007 14:26:47 +0200 Jan Kara wrote:

> On Tue 28-08-07 21:13:35, Andrew Morton wrote:
> > On Tue, 28 Aug 2007 16:13:18 +0200 Jan Kara <jack@suse.cz> wrote:
> > 
> > >   Hello,
> > > 
> > >   I'm sending rediffed patch implementing sending of quota messages via netlink
> > > interface (some rationale in patch description). I've already posted it to
> > > LKML some time ago and there were no objections, so I guess it's fine to put
> > > it to -mm. Andrew, would you be so kind? Thanks.
> > >   Userspace daemon reading the messages from the kernel and sending them to
> > > dbus and/or user console is also written (it's part of quota-tools). The
> > > only remaining problem is there are a few changes needed to libnl needed for
> > > the userspace daemon. They were basically acked by the maintainer but it
> > > seems he has not merged the patches yet. So this will take a bit more time.
> > > 
> > 
> > So it's a new kernel->userspace interface.
> > 
> > But we have no description of the interface :(
>   Oops, forgotten about it. I'll write one. Do we have some standard place
> where to document such interfaces? I could create some file in
> Documentation/filesystems/ but that seems a bit superfluous...

It looks like other quota documentation is in Documentation/filesystems/,
and that seems reasonable to me for the other quota docs & this one.

---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: Re: [PATCH] Switch nfs/callback.c to using struct pid, not pid_t
Re: [PATCH] Switch nfs/callback.c to using struct pid, not pid_t [message #19809] Wed, 29 August 2007 09:52
Christoph Hellwig is currently offline Christoph Hellwig
Messages: 59
Registered: April 2006
Member
From: openvz.org
On Wed, Aug 29, 2007 at 05:36:24PM +0400, Pavel Emelyanov wrote:
> Pid namespaces make it dangerous to use pid and tgid values
> when run in some namespace. The struct pid itself is going
> to be the only way for working with task pids, so make the
> nfs callback thread use it.
> 
> Since nfs_callback_info.pid is set to current's one and reset
> on the thread exit, it is safe not to get the struct pid. 
> 
> Since this pid is used later under lock_kernel() w/o sleeping 
> operations, checking for it to be not NULL and killing the 
> thread with kill_pid() is safe.

NACK.  This just makes the code even more obscure.  Please get rid
of the pid references entirely and convert the code to the kthread
API.

_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH] Switch nfs/callback.c to using struct pid, not pid_t
[PATCH] Switch nfs/callback.c to using struct pid, not pid_t [message #19824] Wed, 29 August 2007 09:36
Pavel Emelianov is currently offline Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
From: openvz.org
Pid namespaces make it dangerous to use pid and tgid values
when run in some namespace. The struct pid itself is going
to be the only way for working with task pids, so make the
nfs callback thread use it.

Since nfs_callback_info.pid is set to current's one and reset
on the thread exit, it is safe not to get the struct pid. 

Since this pid is used later under lock_kernel() w/o sleeping 
operations, checking for it to be not NULL and killing the 
thread with kill_pid() is safe.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>

---

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index a796be5..5b8e5fc 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -27,7 +27,7 @@
 struct nfs_callback_data {
 	unsigned int users;
 	struct svc_serv *serv;
-	pid_t pid;
+	struct pid *pid;
 	struct completion started;
 	struct completion stopped;
 };
@@ -64,7 +64,7 @@ static void nfs_callback_svc(struct svc_
 	__module_get(THIS_MODULE);
 	lock_kernel();
 
-	nfs_callback_info.pid = current->pid;
+	nfs_callback_info.pid = task_pid(current);
 	daemonize("nfsv4-svc");
 	/* Process request with signals blocked, but allow SIGKILL.  */
 	allow_signal(SIGKILL);
@@ -98,7 +98,7 @@ static void nfs_callback_svc(struct svc_
 	}
 
 	svc_exit_thread(rqstp);
-	nfs_callback_info.pid = 0;
+	nfs_callback_info.pid = NULL;
 	complete(&nfs_callback_info.stopped);
 	unlock_kernel();
 	module_put_and_exit(0);
@@ -114,7 +114,7 @@ int nfs_callback_up(void)
 
 	lock_kernel();
 	mutex_lock(&nfs_callback_mutex);
-	if (nfs_callback_info.users++ || nfs_callback_info.pid != 0)
+	if (nfs_callback_info.users++ || nfs_callback_info.pid != NULL)
 		goto out;
 	init_completion(&nfs_callback_info.started);
 	init_completion(&nfs_callback_info.stopped);
@@ -157,9 +157,9 @@ void nfs_callback_down(void)
 	mutex_lock(&nfs_callback_mutex);
 	nfs_callback_info.users--;
 	do {
-		if (nfs_callback_info.users != 0 || nfs_callback_info.pid == 0)
+		if (nfs_callback_info.users != 0 || nfs_callback_info.pid == NULL)
 			break;
-		if (kill_proc(nfs_callback_info.pid, SIGKILL, 1) < 0)
+		if (kill_pid(nfs_callback_info.pid, SIGKILL, 1) < 0)
 			break;
 	} while (wait_for_completion_timeout(&nfs_callback_info.stopped, 5*HZ) == 0);
 	mutex_unlock(&nfs_callback_mutex);
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH] Use task_pid_nr() in ip_vs_sync.c
[PATCH] Use task_pid_nr() in ip_vs_sync.c [message #19823] Wed, 29 August 2007 09:30
Pavel Emelianov is currently offline Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
From: openvz.org
The sync_master_pid and sync_backup_pid are set in set_sync_pid()
and are used later for set/not-set checks and in printk. So it
is safe to use the global pid value in this case.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>

---

diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 959c08d..d0798a5 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -794,7 +794,7 @@ static int sync_thread(void *startup)
 
 	add_wait_queue(&sync_wait, &wait);
 
-	set_sync_pid(state, current->pid);
+	set_sync_pid(state, task_pid_nr(current));
 	complete(tinfo->startup);
 
 	/*
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH] Remove write-only variable from pktgen_thread
[PATCH] Remove write-only variable from pktgen_thread [message #19822] Wed, 29 August 2007 09:22
Pavel Emelianov is currently offline Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
From: openvz.org
The pktgen_thread.pid is set to current->pid and is never used
after this, so remove it altogether.

Found while isolating the explicit pid/tgid usage.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>

---

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 3a3154e..93695c2 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -380,7 +380,6 @@ struct pktgen_thread {
 	/* Field for thread to receive "posted" events terminate, stop ifs etc. */
 
 	u32 control;
-	int pid;
 	int cpu;
 
 	wait_queue_head_t queue;
@@ -3462,8 +3461,6 @@ static int pktgen_thread_worker(void *ar
 
 	init_waitqueue_head(&t->queue);
 
-	t->pid = current->pid;
-
 	pr_debug("pktgen: starting pktgen/%d:  pid=%d\n", cpu, task_pid_nr(current));
 
 	max_before_softirq = t->max_before_softirq;
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH] Use same_thread_group() in signalfd.c
[PATCH] Use same_thread_group() in signalfd.c [message #19821] Wed, 29 August 2007 09:19
Pavel Emelianov is currently offline Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
From: openvz.org
This is a lost hunk of the previous patch that isolated the
explicit usage of task->tgid in some places. The signalfd
code uses the tsk->tgid comparison.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>

---

diff --git a/fs/signalfd.c b/fs/signalfd.c
index a8e293d..5bfd2c5 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -64,7 +64,7 @@ static int signalfd_lock(struct signalfd
 		return 0;
 	}
 
-	if (lk->tsk->tgid == current->tgid)
+	if (same_thread_group(lk->tsk, current->tgid))
 		lk->tsk = current;
 
 	return 1;
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
Pages (31): [ «    8  9  10  11  12  13  14  15  16  17  18  19  20  21  22  23    »]


Current Time: Fri May 24 13:54:32 EDT 2013
Powered by FUDforum Powered by Parallels Virtuozzo Containers