Today's Messages (off)
| Unanswered Messages (on)
| Forum: Devel |
|---|
| Topic: [PATCH 10/33] task containersv11 automatic userspace notification of idle containers |
|---|
| [PATCH 10/33] task containersv11 automatic userspace notification of idle containers [message #20415] |
Mon, 17 September 2007 17:03 |
Paul Menage Messages: 642 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
Add the following files to the cgroup filesystem:
notify_on_release - configures/reports whether the cgroup subsystem should
attempt to run a release script when this cgroup becomes unused
release_agent - configures/reports the release agent to be used for this
hierarchy (top level in each hierarchy only)
releasable - reports whether this cgroup would have been auto-released if
notify_on_release was true and a release agent was configured (mainly useful
for debugging)
To avoid locking issues, invoking the userspace release agent is done via a
workqueue task; cgroups that need to have their release agents invoked by
the workqueue task are linked on to a list.
Signed-off-by: Paul Menage <menage@google.com>
---
include/linux/cgroup.h | 11
kernel/cgroup.c | 425 ++++++++++++++++++++++++++++++++----
2 files changed, 393 insertions(+), 43 deletions(-)
diff -puN include/linux/cgroup.h~task-cgroupsv11-automatic-userspace-notification-of-idle-cgroups include/linux/cgroup.h
--- a/include/linux/cgroup.h~task-cgroupsv11-automatic-userspace-notification-of-idle-cgroups
+++ a/include/linux/cgroup.h
@@ -77,10 +77,11 @@ static inline void css_get(struct contai
* css_get()
*/
+extern void __css_put(struct cgroup_subsys_state *css);
static inline void css_put(struct cgroup_subsys_state *css)
{
if (!test_bit(CSS_ROOT, &css->flags))
- atomic_dec(&css->refcnt);
+ __css_put(css);
}
struct cgroup {
@@ -112,6 +113,13 @@ struct cgroup {
* tasks in this cgroup. Protected by css_set_lock
*/
struct list_head css_sets;
+
+ /*
+ * Linked list running through all cgroups that can
+ * potentially be reaped by the release agent. Protected by
+ * release_list_lock
+ */
+ struct list_head release_list;
};
/* A css_set is a structure holding pointers to a set of
@@ -285,7 +293,6 @@ struct task_struct *cgroup_iter_next(
struct cgroup_iter *it);
void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);
-
#else /* !CONFIG_CGROUPS */
static inline int cgroup_init_early(void) { return 0; }
diff -puN kernel/cgroup.c~task-cgroupsv11-automatic-userspace-notification-of-idle-cgroups kernel/cgroup.c
--- a/kernel/cgroup.c~task-cgroupsv11-automatic-userspace-notification-of-idle-cgroups
+++ a/kernel/cgroup.c
@@ -44,6 +44,8 @@
#include <linux/sort.h>
#include <asm/atomic.h>
+static DEFINE_MUTEX(cgroup_mutex);
+
/* Generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) &_x ## _subsys,
@@ -82,6 +84,13 @@ struct cgroupfs_root {
/* Hierarchy-specific flags */
unsigned long flags;
+
+ /* The path to use for release notifications. No locking
+ * between setting and use - so if userspace updates this
+ * while child cgroups exist, you could miss a
+ * notification. We ensure that it's always a valid
+ * NUL-terminated string */
+ char release_agent_path[PATH_MAX];
};
@@ -109,7 +118,13 @@ static int need_forkexit_callback;
/* bits in struct cgroup flags field */
enum {
+ /* Control Group is dead */
CONT_REMOVED,
+ /* Control Group has previously had a child cgroup or a task,
+ * but no longer (only if CONT_NOTIFY_ON_RELEASE is set) */
+ CONT_RELEASABLE,
+ /* Control Group requires release notifications to userspace */
+ CONT_NOTIFY_ON_RELEASE,
};
/* convenient tests for these bits */
@@ -123,6 +138,19 @@ enum {
ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
};
+inline int cgroup_is_releasable(const struct cgroup *cont)
+{
+ const int bits =
+ (1 << CONT_RELEASABLE) |
+ (1 << CONT_NOTIFY_ON_RELEASE);
+ return (cont->flags & bits) == bits;
+}
+
+inline int notify_on_release(const struct cgroup *cont)
+{
+ return test_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
+}
+
/*
* for_each_subsys() allows you to iterate on each subsystem attached to
* an active hierarchy
@@ -134,6 +162,14 @@ list_for_each_entry(_ss, &_root->subsys_
#define for_each_root(_root) \
list_for_each_entry(_root, &roots, root_list)
+/* the list of cgroups eligible for automatic release. Protected by
+ * release_list_lock */
+static LIST_HEAD(release_list);
+static DEFINE_SPINLOCK(release_list_lock);
+static void cgroup_release_agent(struct work_struct *work);
+static DECLARE_WORK(release_agent_work, cgroup_release_agent);
+static void check_for_release(struct cgroup *cont);
+
/* Link structure for associating css_set objects with cgroups */
struct cg_cgroup_link {
/*
@@ -188,11 +224,8 @@ static int use_task_css_set_links;
/*
* unlink a css_set from the list and free it
*/
-static void release_css_set(struct kref *k)
+static void unlink_css_set(struct css_set *cg)
{
- struct css_set *cg = container_of(k, struct css_set, ref);
- int i;
-
write_lock(&css_set_lock);
list_del(&cg->list);
css_set_count--;
@@ -205,11 +238,39 @@ static void release_css_set(struct kre
kfree(link);
}
write_unlock(&css_set_lock);
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
- atomic_dec(&cg->subsys[i]->cgroup->count);
+}
+
+static void __release_css_set(struct kref *k, int taskexit)
+{
+ int i;
+ struct css_set *cg = container_of(k, struct css_set, ref);
+
+ unlink_css_set(cg);
+
+ rcu_read_lock();
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup *cont = cg->subsys[i]->cgroup;
+ if (atomic_dec_and_test(&cont->count) &&
+ notify_on_release(cont)) {
+ if (taskexit)
+ set_bit(CONT_RELEASABLE, &cont->flags);
+ check_for_release(cont);
+ }
+ }
+ rcu_read_unlock();
kfree(cg);
}
+static void release_css_set(struct kref *k)
+{
+ __release_css_set(k, 0);
+}
+
+static void release_css_set_taskexit(struct kref *k)
+{
+ __release_css_set(k, 1);
+}
+
/*
* refcounted get/put for css_set objects
*/
@@ -223,6 +284,11 @@ static inline void put_css_set(struct
kref_put(&cg->ref, release_css_set);
}
+static inline void put_css_set_taskexit(struct css_set *cg)
+{
+ kref_put(&cg->ref, release_css_set_taskexit);
+}
+
/*
* find_existing_css_set() is a helper for
* find_css_set(), and checks to see whether an existing
@@ -464,8 +530,6 @@ static struct css_set *find_css_set(
* update of a tasks cgroup pointer by attach_task()
*/
-static DEFINE_MUTEX(cgroup_mutex);
-
/**
* cgroup_lock - lock out any changes to cgroup structures
*
@@ -524,6 +588,13 @@ static void cgroup_diput(struct dentr
if (S_ISDIR(inode->i_mode)) {
struct cgroup *cont = dentry->d_fsdata;
BUG_ON(!(cgroup_is_removed(cont)));
+ /* It's possible for external users to be holding css
+ * reference counts on a cgroup; css_put() needs to
+ * be able to access the cgroup after decrementing
+ * the reference count in order to know if it needs to
+ * queue the cgroup to be handled by the release
+ * agent */
+ synchronize_rcu();
kfree(cont);
}
iput(inode);
@@ -668,6 +739,8 @@ static int cgroup_show_options(struct
seq_printf(seq, ",%s", ss->name);
if (test_bit(ROOT_NOPREFIX, &root->flags))
seq_puts(seq, ",noprefix");
+ if (strlen(root->release_agent_path))
+ seq_printf(seq, ",release_agent=%s", root->release_agent_path);
mutex_unlock(&cgroup_mutex);
return 0;
}
@@ -675,6 +748,7 @@ static int cgroup_show_options(struct
struct cgroup_sb_opts {
unsigned long subsys_bits;
unsigned long flags;
+ char *release_agent;
};
/* Convert a hierarchy specifier into a bitmask of subsystems and
@@ -686,6 +760,7 @@ static int parse_cgroupfs_options(cha
opts->subsys_bits = 0;
opts->flags = 0;
+ opts->release_agent = NULL;
while ((token = strsep(&o, ",")) != NULL) {
if (!*token)
@@ -694,6 +769,15 @@ static int parse_cgroupfs_options(cha
opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1;
} else if (!strcmp(token, "noprefix")) {
set_bit(ROOT_NOPREFIX, &opts->flags);
+ } else if (!strncmp(token, "release_agent=", 14)) {
+ /* Specifying two release agents is forbidden */
+ if (opts->release_agent)
+ return -EINVAL;
+ opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
+ if (!opts->release_agent)
+ return -ENOMEM;
+ strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
+ opts->release_agent[PATH_MAX - 1] = 0;
} else {
struct cgroup_subsys *ss;
int i;
@@ -743,7 +827,11 @@ static int cgroup_remount(struct supe
if (!ret)
cgroup_populate_dir(cont);
+ if (opts.release_agent)
+ strcpy(root->release_agent_path, opts.release_agent);
out_unlock:
+ if (opts.release_agent)
+ kfree(opts.release_agent);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cont->dentry->d_inode->i_mutex);
return ret;
@@ -767,6 +855,7 @@ static void init_cgroup_root(struct c
INIT_LIST_HEAD(&cont->sibling);
INIT_LIST_HEAD(&cont->children);
INIT_LIST_HEAD(&cont->css_sets);
+ INIT_LIST_HEAD(&cont->release_list);
}
static int cgroup_test_super(struct super_block *sb, void *data)
@@ -841,8 +930,11 @@ static int cgroup_get_sb(struct file_
/* First find the desired set of subsystems */
ret = parse_cgroupfs_options(data, &opts);
- if (ret)
+ if (ret) {
+ if (opts.release_agent)
+ kfree(opts.release_agent);
return ret;
+ }
root = kzalloc(sizeof(*root), GFP_KERNEL);
if (!root)
@@ -851,6 +943,10 @@ static int cgroup_get_sb(struct file_
init_cgroup_root(root);
root->subsys_bits = opts.subsys_bits;
root->flags = opts.flags;
+ if (opts.release_agent) {
+ strcpy(root->release_agent_path, opts.release_agent);
+ kfree(opts.release_agent);
+ }
sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
@@ -1131,7 +1227,7 @@ static int attach_task(struct cgroup
ss->attach(ss, cont, oldcont, tsk);
}
}
-
+ set_bit(CONT_RELEASABLE, &oldcont->flags);
synchronize_rcu();
put_css_set(cg);
return 0;
@@ -1181,6 +1277,9 @@ enum cgroup_fi
...
|
|
| | Topic: [PATCH 09/33] task containersv11 shared container subsystem group arrays include fix |
|---|
| [PATCH 09/33] task containersv11 shared container subsystem group arrays include fix [message #20408] |
Mon, 17 September 2007 17:03 |
Paul Menage Messages: 642 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
kernel/cgroup.c: In function 'cgroup_new_inode':
kernel/cgroup.c:573: error: variable 'cgroup_backing_dev_info' has initializer but incomplete type
kernel/cgroup.c:574: error: unknown field 'capabilities' specified in initializer
kernel/cgroup.c:574: error: 'BDI_CAP_NO_ACCT_DIRTY' undeclared (first use in this function)
kernel/cgroup.c:574: error: (Each undeclared identifier is reported only once)
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Paul Menage <menage@google.com>
---
kernel/cgroup.c | 1 +
1 file changed, 1 insertion(+)
diff -puN kernel/cgroup.c~task-cgroupsv11-shared-cgroup-subsystem-group-arrays-include-fix kernel/cgroup.c
--- a/kernel/cgroup.c~task-cgroupsv11-shared-cgroup-subsystem-group-arrays-include-fix
+++ a/kernel/cgroup.c
@@ -36,6 +36,7 @@
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
+#include <linux/backing-dev.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/magic.h>
_
--
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: [PATCH 08/33] task containersv11 shared container subsystem group arrays avoid lockdep warning |
|---|
| [PATCH 08/33] task containersv11 shared container subsystem group arrays avoid lockdep warning [message #20430] |
Mon, 17 September 2007 17:03 |
Paul Menage Messages: 642 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
I think this is the right way to handle the lockdep false-positive in the
current cgroups patches, but I'm not that familiar with lockdep so any
suggestions for a better approach are welcomed.
In order to avoid a false-positive lockdep warning, we lock the root inode
of a new filesystem mount prior to taking cgroup_mutex, to preserve the
invariant that cgroup_mutex nests inside inode->i_mutex. In order to
prevent a lockdep false positive when locking i_mutex on a newly-created
cgroup directory inode we use mutex_lock_nested(), with a nesting level
of I_MUTEX_CHILD since the new inode will ultimately be a child directory
of the parent whose i_mutex is nested outside of cgroup_mutex.
Signed-off-by: Paul Menage <menage@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/cgroup.c | 17 +++++++----------
1 files changed, 7 insertions(+), 10 deletions(-)
diff -puN kernel/cgroup.c~task-cgroupsv11-shared-cgroup-subsystem-group-arrays-avoid-lockdep-warning kernel/cgroup.c
--- a/kernel/cgroup.c~task-cgroupsv11-shared-cgroup-subsystem-group-arrays-avoid-lockdep-warning
+++ a/kernel/cgroup.c
@@ -867,13 +867,16 @@ static int cgroup_get_sb(struct file_
} else {
/* New superblock */
struct cgroup *cont = &root->top_cgroup;
+ struct inode *inode;
BUG_ON(sb->s_root != NULL);
ret = cgroup_get_rootdir(sb);
if (ret)
goto drop_new_super;
+ inode = sb->s_root->d_inode;
+ mutex_lock(&inode->i_mutex);
mutex_lock(&cgroup_mutex);
/*
@@ -886,12 +889,14 @@ static int cgroup_get_sb(struct file_
ret = allocate_cg_links(css_set_count, &tmp_cg_links);
if (ret) {
mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&inode->i_mutex);
goto drop_new_super;
}
ret = rebind_subsystems(root, root->subsys_bits);
if (ret == -EBUSY) {
mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&inode->i_mutex);
goto drop_new_super;
}
@@ -931,16 +936,8 @@ static int cgroup_get_sb(struct file_
BUG_ON(!list_empty(&cont->children));
BUG_ON(root->number_of_cgroups != 1);
- /*
- * I believe that it's safe to nest i_mutex inside
- * cgroup_mutex in this case, since no-one else can
- * be accessing this directory yet. But we still need
- * to teach lockdep that this is the case - currently
- * a cgroupfs remount triggers a lockdep warning
- */
- mutex_lock(&cont->dentry->d_inode->i_mutex);
cgroup_populate_dir(cont);
- mutex_unlock(&cont->dentry->d_inode->i_mutex);
+ mutex_unlock(&inode->i_mutex);
mutex_unlock(&cgroup_mutex);
}
@@ -1358,7 +1355,7 @@ static int cgroup_create_file(struct
/* start with the directory inode held, so that we can
* populate it without racing with another mkdir */
- mutex_lock(&inode->i_mutex);
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
} else if (S_ISREG(mode)) {
inode->i_size = 0;
inode->i_fop = &cgroup_file_operations;
_
--
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: [PATCH 07/33] task containersv11 shared container subsystem group arrays |
|---|
| [PATCH 07/33] task containersv11 shared container subsystem group arrays [message #20432] |
Mon, 17 September 2007 17:03 |
Paul Menage Messages: 642 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
Replace the struct css_set embedded in task_struct with a pointer; all tasks
that have the same set of memberships across all hierarchies will share a
css_set object, and will be linked via their css_sets field to the "tasks"
list_head in the css_set.
Assuming that many tasks share the same cgroup assignments, this reduces
overall space usage and keeps the size of the task_struct down (three pointers
added to task_struct compared to a non-cgroups kernel, no matter how many
subsystems are registered).
Signed-off-by: Paul Menage <menage@google.com>
---
Documentation/cgroups.txt | 14
include/linux/cgroup.h | 89 ++++
include/linux/sched.h | 33 -
kernel/cgroup.c | 606 ++++++++++++++++++++++++++++-----
kernel/fork.c | 1
5 files changed, 620 insertions(+), 123 deletions(-)
diff -puN Documentation/cgroups.txt~task-cgroupsv11-shared-cgroup-subsystem-group-arrays Documentation/cgroups.txt
--- a/Documentation/cgroups.txt~task-cgroupsv11-shared-cgroup-subsystem-group-arrays
+++ a/Documentation/cgroups.txt
@@ -176,7 +176,9 @@ Control Groups extends the kernel as follows
subsystem state is something that's expected to happen frequently
and in performance-critical code, whereas operations that require a
task's actual cgroup assignments (in particular, moving between
- cgroups) are less common.
+ cgroups) are less common. A linked list runs through the cg_list
+ field of each task_struct using the css_set, anchored at
+ css_set->tasks.
- A cgroup hierarchy filesystem can be mounted for browsing and
manipulation from user space.
@@ -252,6 +254,16 @@ linear search to locate an appropriate e
very efficient. A future version will use a hash table for better
performance.
+To allow access from a cgroup to the css_sets (and hence tasks)
+that comprise it, a set of cg_cgroup_link objects form a lattice;
+each cg_cgroup_link is linked into a list of cg_cgroup_links for
+a single cgroup on its cont_link_list field, and a list of
+cg_cgroup_links for a single css_set on its cg_link_list.
+
+Thus the set of tasks in a cgroup can be listed by iterating over
+each css_set that references the cgroup, and sub-iterating over
+each css_set's task set.
+
The use of a Linux virtual file system (vfs) to represent the
cgroup hierarchy provides for a familiar permission and name space
for cgroups, with a minimum of additional kernel code.
diff -puN include/linux/cgroup.h~task-cgroupsv11-shared-cgroup-subsystem-group-arrays include/linux/cgroup.h
--- a/include/linux/cgroup.h~task-cgroupsv11-shared-cgroup-subsystem-group-arrays
+++ a/include/linux/cgroup.h
@@ -27,10 +27,19 @@ extern void cgroup_lock(void);
extern void cgroup_unlock(void);
extern void cgroup_fork(struct task_struct *p);
extern void cgroup_fork_callbacks(struct task_struct *p);
+extern void cgroup_post_fork(struct task_struct *p);
extern void cgroup_exit(struct task_struct *p, int run_callbacks);
extern struct file_operations proc_cgroup_operations;
+/* Define the enumeration of all cgroup subsystems */
+#define SUBSYS(_x) _x ## _subsys_id,
+enum cgroup_subsys_id {
+#include <linux/cgroup_subsys.h>
+ CGROUP_SUBSYS_COUNT
+};
+#undef SUBSYS
+
/* Per-subsystem/per-cgroup state maintained by the system. */
struct cgroup_subsys_state {
/* The cgroup that this subsystem is attached to. Useful
@@ -97,6 +106,52 @@ struct cgroup {
struct cgroupfs_root *root;
struct cgroup *top_cgroup;
+
+ /*
+ * List of cg_cgroup_links pointing at css_sets with
+ * tasks in this cgroup. Protected by css_set_lock
+ */
+ struct list_head css_sets;
+};
+
+/* A css_set is a structure holding pointers to a set of
+ * cgroup_subsys_state objects. This saves space in the task struct
+ * object and speeds up fork()/exit(), since a single inc/dec and a
+ * list_add()/del() can bump the reference count on the entire
+ * cgroup set for a task.
+ */
+
+struct css_set {
+
+ /* Reference count */
+ struct kref ref;
+
+ /*
+ * List running through all cgroup groups. Protected by
+ * css_set_lock
+ */
+ struct list_head list;
+
+ /*
+ * List running through all tasks using this cgroup
+ * group. Protected by css_set_lock
+ */
+ struct list_head tasks;
+
+ /*
+ * List of cg_cgroup_link objects on link chains from
+ * cgroups referenced from this css_set. Protected by
+ * css_set_lock
+ */
+ struct list_head cg_links;
+
+ /*
+ * Set of subsystem states, one for each subsystem. This array
+ * is immutable after creation apart from the init_css_set
+ * during subsystem registration (at boot time).
+ */
+ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+
};
/* struct cftype:
@@ -149,15 +204,7 @@ int cgroup_is_removed(const struct co
int cgroup_path(const struct cgroup *cont, char *buf, int buflen);
-int __cgroup_task_count(const struct cgroup *cont);
-static inline int cgroup_task_count(const struct cgroup *cont)
-{
- int task_count;
- rcu_read_lock();
- task_count = __cgroup_task_count(cont);
- rcu_read_unlock();
- return task_count;
-}
+int cgroup_task_count(const struct cgroup *cont);
/* Return true if the cgroup is a descendant of the current cgroup */
int cgroup_is_descendant(const struct cgroup *cont);
@@ -205,7 +252,7 @@ static inline struct cgroup_subsys_st
static inline struct cgroup_subsys_state *task_subsys_state(
struct task_struct *task, int subsys_id)
{
- return rcu_dereference(task->cgroups.subsys[subsys_id]);
+ return rcu_dereference(task->cgroups->subsys[subsys_id]);
}
static inline struct cgroup* task_cgroup(struct task_struct *task,
@@ -218,6 +265,27 @@ int cgroup_path(const struct containe
int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *ss);
+/* A cgroup_iter should be treated as an opaque object */
+struct cgroup_iter {
+ struct list_head *cg_link;
+ struct list_head *task;
+};
+
+/* To iterate across the tasks in a cgroup:
+ *
+ * 1) call cgroup_iter_start to initialize an iterator
+ *
+ * 2) call cgroup_iter_next() to retrieve member tasks until it
+ * returns NULL or until you want to end the iteration
+ *
+ * 3) call cgroup_iter_end() to destroy the iterator.
+ */
+void cgroup_iter_start(struct cgroup *cont, struct cgroup_iter *it);
+struct task_struct *cgroup_iter_next(struct cgroup *cont,
+ struct cgroup_iter *it);
+void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);
+
+
#else /* !CONFIG_CGROUPS */
static inline int cgroup_init_early(void) { return 0; }
@@ -225,6 +293,7 @@ static inline int cgroup_init(void) {
static inline void cgroup_init_smp(void) {}
static inline void cgroup_fork(struct task_struct *p) {}
static inline void cgroup_fork_callbacks(struct task_struct *p) {}
+static inline void cgroup_post_fork(struct task_struct *p) {}
static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
static inline void cgroup_lock(void) {}
diff -puN include/linux/sched.h~task-cgroupsv11-shared-cgroup-subsystem-group-arrays include/linux/sched.h
--- a/include/linux/sched.h~task-cgroupsv11-shared-cgroup-subsystem-group-arrays
+++ a/include/linux/sched.h
@@ -861,34 +861,6 @@ struct sched_entity {
#endif
};
-#ifdef CONFIG_CGROUPS
-
-#define SUBSYS(_x) _x ## _subsys_id,
-enum cgroup_subsys_id {
-#include <linux/cgroup_subsys.h>
- CGROUP_SUBSYS_COUNT
-};
-#undef SUBSYS
-
-/* A css_set is a structure holding pointers to a set of
- * cgroup_subsys_state objects.
- */
-
-struct css_set {
-
- /* Set of subsystem states, one for each subsystem. NULL for
- * subsystems that aren't part of this hierarchy. These
- * pointers reduce the number of dereferences required to get
- * from a task to its state for a given cgroup, but result
- * in increased space usage if tasks are in wildly different
- * groupings across different hierarchies. This array is
- * immutable after creation */
- struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
-
-};
-
-#endif /* CONFIG_CGROUPS */
-
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
@@ -1125,7 +1097,10 @@ struct task_struct {
int cpuset_mem_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
- struct css_set cgroups;
+ /* Control Group info protected by css_set_lock */
+ struct css_set *cgroups;
+ /* cg_list protected by css_set_lock and tsk->alloc_lock */
+ struct list_head cg_list;
#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user *robust_list;
diff -puN kernel/cgroup.c~task-cgroupsv11-shared-cgroup-subsystem-group-arrays kernel/cgroup.c
--- a/kernel/cgroup.c~task-cgroupsv11-shared-cgroup-subsystem-group-arrays
+++ a/kernel/cgroup.c
@@ -95,6 +95,7 @@ static struct cgroupfs_root rootnode;
/* The list of hierarchy roots */
static LIST_HEAD(roots);
+static int root_count;
/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
#define dummytop (&rootnode.top_cgroup)
@@ -133,12 +134,49 @@ list_for_each_entry(_ss, &_root->subsys_
#define for_each_root(_root) \
list_for_each_entry(_root, &roots, root_list)
-/* Each task_struct has an embedded css_set, so the get/put
- * operation simply takes a reference count on all the cgroups
- * referenced by subsystems in this css_set. This can end up
- * multiple-counting some cgroups, but that's OK - the ref-count is
- * just a busy/not-busy indicator; ensuring that we only count each
- * cgroup once would require taking a global lock to ensure that no
+/* Link structure for associating css_set objects with cgroups */
+struct cg_cgroup_link {
+ /*
+ * List running through cg_cgroup_links associated with a
+ * cgroup, anchored on cgroup->css_sets
+ */
+ struct list_head cont_link_list;
+ /*
+ * List running through cg_cgroup_links pointing at a
+ * single css_set object, anchored on css_set->cg_links
+ */
+ struct list_head cg_link_list;
+ struct css_set *cg;
+};
+
+/* The default css_set - used by init and its children prior to any
+ * hierarchies being mount
...
|
|
| | Topic: [PATCH 06/33] task containersv11 add procfs interface |
|---|
| [PATCH 06/33] task containersv11 add procfs interface [message #20407] |
Mon, 17 September 2007 17:03 |
Paul Menage Messages: 642 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
Add:
/proc/cgroups - general system info
/proc/*/cgroup - per-task cgroup membership info
Signed-off-by: Paul Menage <menage@google.com>
---
fs/proc/base.c | 7 +
include/linux/cgroup.h | 2
kernel/cgroup.c | 132 ++++++++++++++++++++++++++++++++++++
3 files changed, 141 insertions(+)
diff -puN fs/proc/base.c~task-cgroupsv11-add-procfs-interface fs/proc/base.c
--- a/fs/proc/base.c~task-cgroupsv11-add-procfs-interface
+++ a/fs/proc/base.c
@@ -67,6 +67,7 @@
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/ptrace.h>
+#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/audit.h>
#include <linux/poll.h>
@@ -2051,6 +2052,9 @@ static const struct pid_entry tgid_base_
#ifdef CONFIG_CPUSETS
REG("cpuset", S_IRUGO, cpuset),
#endif
+#ifdef CONFIG_CGROUPS
+ REG("cgroup", S_IRUGO, cgroup),
+#endif
INF("oom_score", S_IRUGO, oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust),
#ifdef CONFIG_AUDITSYSCALL
@@ -2340,6 +2344,9 @@ static const struct pid_entry tid_base_s
#ifdef CONFIG_CPUSETS
REG("cpuset", S_IRUGO, cpuset),
#endif
+#ifdef CONFIG_CGROUPS
+ REG("cgroup", S_IRUGO, cgroup),
+#endif
INF("oom_score", S_IRUGO, oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust),
#ifdef CONFIG_AUDITSYSCALL
diff -puN include/linux/cgroup.h~task-cgroupsv11-add-procfs-interface include/linux/cgroup.h
--- a/include/linux/cgroup.h~task-cgroupsv11-add-procfs-interface
+++ a/include/linux/cgroup.h
@@ -29,6 +29,8 @@ extern void cgroup_fork(struct task_s
extern void cgroup_fork_callbacks(struct task_struct *p);
extern void cgroup_exit(struct task_struct *p, int run_callbacks);
+extern struct file_operations proc_cgroup_operations;
+
/* Per-subsystem/per-cgroup state maintained by the system. */
struct cgroup_subsys_state {
/* The cgroup that this subsystem is attached to. Useful
diff -puN kernel/cgroup.c~task-cgroupsv11-add-procfs-interface kernel/cgroup.c
--- a/kernel/cgroup.c~task-cgroupsv11-add-procfs-interface
+++ a/kernel/cgroup.c
@@ -33,6 +33,7 @@
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
@@ -247,6 +248,7 @@ static int cgroup_mkdir(struct inode
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
static int cgroup_populate_dir(struct cgroup *cont);
static struct inode_operations cgroup_dir_inode_operations;
+static struct file_operations proc_cgroupstats_operations;
static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
{
@@ -1576,6 +1578,7 @@ int __init cgroup_init(void)
{
int err;
int i;
+ struct proc_dir_entry *entry;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
@@ -1587,10 +1590,139 @@ int __init cgroup_init(void)
if (err < 0)
goto out;
+ entry = create_proc_entry("cgroups", 0, NULL);
+ if (entry)
+ entry->proc_fops = &proc_cgroupstats_operations;
+
out:
return err;
}
+/*
+ * proc_cgroup_show()
+ * - Print task's cgroup paths into seq_file, one line for each hierarchy
+ * - Used for /proc/<pid>/cgroup.
+ * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
+ * doesn't really matter if tsk->cgroup changes after we read it,
+ * and we take cgroup_mutex, keeping attach_task() from changing it
+ * anyway. No need to check that tsk->cgroup != NULL, thanks to
+ * the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's
+ * cgroup to top_cgroup.
+ */
+
+/* TODO: Use a proper seq_file iterator */
+static int proc_cgroup_show(struct seq_file *m, void *v)
+{
+ struct pid *pid;
+ struct task_struct *tsk;
+ char *buf;
+ int retval;
+ struct cgroupfs_root *root;
+
+ retval = -ENOMEM;
+ buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buf)
+ goto out;
+
+ retval = -ESRCH;
+ pid = m->private;
+ tsk = get_pid_task(pid, PIDTYPE_PID);
+ if (!tsk)
+ goto out_free;
+
+ retval = 0;
+
+ mutex_lock(&cgroup_mutex);
+
+ for_each_root(root) {
+ struct cgroup_subsys *ss;
+ struct cgroup *cont;
+ int subsys_id;
+ int count = 0;
+
+ /* Skip this hierarchy if it has no active subsystems */
+ if (!root->actual_subsys_bits)
+ continue;
+ for_each_subsys(root, ss)
+ seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+ seq_putc(m, ':');
+ get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
+ cont = task_cgroup(tsk, subsys_id);
+ retval = cgroup_path(cont, buf, PAGE_SIZE);
+ if (retval < 0)
+ goto out_unlock;
+ seq_puts(m, buf);
+ seq_putc(m, '\n');
+ }
+
+out_unlock:
+ mutex_unlock(&cgroup_mutex);
+ put_task_struct(tsk);
+out_free:
+ kfree(buf);
+out:
+ return retval;
+}
+
+static int cgroup_open(struct inode *inode, struct file *file)
+{
+ struct pid *pid = PROC_I(inode)->pid;
+ return single_open(file, proc_cgroup_show, pid);
+}
+
+struct file_operations proc_cgroup_operations = {
+ .open = cgroup_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/* Display information about each subsystem and each hierarchy */
+static int proc_cgroupstats_show(struct seq_file *m, void *v)
+{
+ int i;
+ struct cgroupfs_root *root;
+
+ mutex_lock(&cgroup_mutex);
+ seq_puts(m, "Hierarchies:\n");
+ for_each_root(root) {
+ struct cgroup_subsys *ss;
+ int first = 1;
+ seq_printf(m, "%p: bits=%lx cgroups=%d (", root,
+ root->subsys_bits, root->number_of_cgroups);
+ for_each_subsys(root, ss) {
+ seq_printf(m, "%s%s", first ? "" : ", ", ss->name);
+ first = false;
+ }
+ seq_putc(m, ')');
+ if (root->sb) {
+ seq_printf(m, " s_active=%d",
+ atomic_read(&root->sb->s_active));
+ }
+ seq_putc(m, '\n');
+ }
+ seq_puts(m, "Subsystems:\n");
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ seq_printf(m, "%d: name=%s hierarchy=%p\n",
+ i, ss->name, ss->root);
+ }
+ mutex_unlock(&cgroup_mutex);
+ return 0;
+}
+
+static int cgroupstats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, proc_cgroupstats_show, 0);
+}
+
+static struct file_operations proc_cgroupstats_operations = {
+ .open = cgroupstats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
/**
* cgroup_fork - attach newly forked task to its parents cgroup.
* @tsk: pointer to task_struct of forking parent process.
_
--
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: [PATCH 05/33] task containersv11 add container_clone interface |
|---|
| [PATCH 05/33] task containersv11 add container_clone interface [message #20426] |
Mon, 17 September 2007 17:03 |
Paul Menage Messages: 642 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
Add support for cgroup_clone(), a way to create new cgroups intended to
be used for systems such as namespace unsharing. A new subsystem callback,
post_clone(), is added to allow subsystems to automatically configure cloned
cgroups.
Signed-off-by: Paul Menage <menage@google.com>
---
Documentation/cgroups.txt | 7 +
include/linux/cgroup.h | 3
kernel/cgroup.c | 135 +++++++++++++++++++++++++++++++++
3 files changed, 145 insertions(+)
diff -puN Documentation/cgroups.txt~task-cgroupsv11-add-cgroup_clone-interface Documentation/cgroups.txt
--- a/Documentation/cgroups.txt~task-cgroupsv11-add-cgroup_clone-interface
+++ a/Documentation/cgroups.txt
@@ -504,6 +504,13 @@ include/linux/cgroup.h for details).
method can return an error code, the error code is currently not
always handled well.
+void post_clone(struct cgroup_subsys *ss, struct cgroup *cont)
+
+Called at the end of cgroup_clone() to do any parameter
+initialization which might be required before a task could attach. For
+example in cpusets, no task may attach before 'cpus' and 'mems' are set
+up.
+
void bind(struct cgroup_subsys *ss, struct cgroup *root)
LL=callback_mutex
diff -puN include/linux/cgroup.h~task-cgroupsv11-add-cgroup_clone-interface include/linux/cgroup.h
--- a/include/linux/cgroup.h~task-cgroupsv11-add-cgroup_clone-interface
+++ a/include/linux/cgroup.h
@@ -174,6 +174,7 @@ struct cgroup_subsys {
void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
int (*populate)(struct cgroup_subsys *ss,
struct cgroup *cont);
+ void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cont);
void (*bind)(struct cgroup_subsys *ss, struct cgroup *root);
int subsys_id;
int active;
@@ -213,6 +214,8 @@ static inline struct cgroup* task_con
int cgroup_path(const struct cgroup *cont, char *buf, int buflen);
+int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *ss);
+
#else /* !CONFIG_CGROUPS */
static inline int cgroup_init_early(void) { return 0; }
diff -puN kernel/cgroup.c~task-cgroupsv11-add-cgroup_clone-interface kernel/cgroup.c
--- a/kernel/cgroup.c~task-cgroupsv11-add-cgroup_clone-interface
+++ a/kernel/cgroup.c
@@ -1684,3 +1684,138 @@ void cgroup_exit(struct task_struct *
tsk->cgroups = init_task.cgroups;
task_unlock(tsk);
}
+
+/**
+ * cgroup_clone - duplicate the current cgroup in the hierarchy
+ * that the given subsystem is attached to, and move this task into
+ * the new child
+ */
+int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
+{
+ struct dentry *dentry;
+ int ret = 0;
+ char nodename[MAX_CGROUP_TYPE_NAMELEN];
+ struct cgroup *parent, *child;
+ struct inode *inode;
+ struct css_set *cg;
+ struct cgroupfs_root *root;
+ struct cgroup_subsys *ss;
+
+ /* We shouldn't be called by an unregistered subsystem */
+ BUG_ON(!subsys->active);
+
+ /* First figure out what hierarchy and cgroup we're dealing
+ * with, and pin them so we can drop cgroup_mutex */
+ mutex_lock(&cgroup_mutex);
+ again:
+ root = subsys->root;
+ if (root == &rootnode) {
+ printk(KERN_INFO
+ "Not cloning cgroup for unused subsystem %s\n",
+ subsys->name);
+ mutex_unlock(&cgroup_mutex);
+ return 0;
+ }
+ cg = &tsk->cgroups;
+ parent = task_cgroup(tsk, subsys->subsys_id);
+
+ snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "node_%d", tsk->pid);
+
+ /* Pin the hierarchy */
+ atomic_inc(&parent->root->sb->s_active);
+
+ mutex_unlock(&cgroup_mutex);
+
+ /* Now do the VFS work to create a cgroup */
+ inode = parent->dentry->d_inode;
+
+ /* Hold the parent directory mutex across this operation to
+ * stop anyone else deleting the new cgroup */
+ mutex_lock(&inode->i_mutex);
+ dentry = cgroup_get_dentry(parent->dentry, nodename);
+ if (IS_ERR(dentry)) {
+ printk(KERN_INFO
+ "Couldn't allocate dentry for %s: %ld\n", nodename,
+ PTR_ERR(dentry));
+ ret = PTR_ERR(dentry);
+ goto out_release;
+ }
+
+ /* Create the cgroup directory, which also creates the cgroup */
+ ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755);
+ child = __d_cont(dentry);
+ dput(dentry);
+ if (ret) {
+ printk(KERN_INFO
+ "Failed to create cgroup %s: %d\n", nodename,
+ ret);
+ goto out_release;
+ }
+
+ if (!child) {
+ printk(KERN_INFO
+ "Couldn't find new cgroup %s\n", nodename);
+ ret = -ENOMEM;
+ goto out_release;
+ }
+
+ /* The cgroup now exists. Retake cgroup_mutex and check
+ * that we're still in the same state that we thought we
+ * were. */
+ mutex_lock(&cgroup_mutex);
+ if ((root != subsys->root) ||
+ (parent != task_cgroup(tsk, subsys->subsys_id))) {
+ /* Aargh, we raced ... */
+ mutex_unlock(&inode->i_mutex);
+
+ deactivate_super(parent->root->sb);
+ /* The cgroup is still accessible in the VFS, but
+ * we're not going to try to rmdir() it at this
+ * point. */
+ printk(KERN_INFO
+ "Race in cgroup_clone() - leaking cgroup %s\n",
+ nodename);
+ goto again;
+ }
+
+ /* do any required auto-setup */
+ for_each_subsys(root, ss) {
+ if (ss->post_clone)
+ ss->post_clone(ss, child);
+ }
+
+ /* All seems fine. Finish by moving the task into the new cgroup */
+ ret = attach_task(child, tsk);
+ mutex_unlock(&cgroup_mutex);
+
+ out_release:
+ mutex_unlock(&inode->i_mutex);
+ deactivate_super(parent->root->sb);
+ return ret;
+}
+
+/*
+ * See if "cont" is a descendant of the current task's cgroup in
+ * the appropriate hierarchy
+ *
+ * If we are sending in dummytop, then presumably we are creating
+ * the top cgroup in the subsystem.
+ *
+ * Called only by the ns (nsproxy) cgroup.
+ */
+int cgroup_is_descendant(const struct cgroup *cont)
+{
+ int ret;
+ struct cgroup *target;
+ int subsys_id;
+
+ if (cont == dummytop)
+ return 1;
+
+ get_first_subsys(cont, NULL, &subsys_id);
+ target = task_cgroup(current, subsys_id);
+ while (cont != target && cont!= cont->top_cgroup)
+ cont = cont->parent;
+ ret = (cont == target);
+ return ret;
+}
_
--
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: [PATCH 04/33] task containersv11 add fork exit hooks |
|---|
| [PATCH 04/33] task containersv11 add fork exit hooks [message #20402] |
Mon, 17 September 2007 17:03 |
Paul Menage Messages: 642 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
Adds the necessary hooks to the fork() and exit() paths to ensure that
new children inherit their parent's cgroup assignments, and that exiting
processes release reference counts on their cgroups.
Signed-off-by: Paul Menage <menage@google.com>
---
include/linux/cgroup.h | 6 +
kernel/cgroup.c | 121 ++++++++++++++++++++++++++++++++++++
kernel/exit.c | 2
kernel/fork.c | 14 +++-
4 files changed, 141 insertions(+), 2 deletions(-)
diff -puN include/linux/cgroup.h~task-cgroupsv11-add-fork-exit-hooks include/linux/cgroup.h
--- a/include/linux/cgroup.h~task-cgroupsv11-add-fork-exit-hooks
+++ a/include/linux/cgroup.h
@@ -25,6 +25,9 @@ extern int cgroup_init(void);
extern void cgroup_init_smp(void);
extern void cgroup_lock(void);
extern void cgroup_unlock(void);
+extern void cgroup_fork(struct task_struct *p);
+extern void cgroup_fork_callbacks(struct task_struct *p);
+extern void cgroup_exit(struct task_struct *p, int run_callbacks);
/* Per-subsystem/per-cgroup state maintained by the system. */
struct cgroup_subsys_state {
@@ -215,6 +218,9 @@ int cgroup_path(const struct containe
static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
static inline void cgroup_init_smp(void) {}
+static inline void cgroup_fork(struct task_struct *p) {}
+static inline void cgroup_fork_callbacks(struct task_struct *p) {}
+static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
static inline void cgroup_lock(void) {}
static inline void cgroup_unlock(void) {}
diff -puN kernel/cgroup.c~task-cgroupsv11-add-fork-exit-hooks kernel/cgroup.c
--- a/kernel/cgroup.c~task-cgroupsv11-add-fork-exit-hooks
+++ a/kernel/cgroup.c
@@ -132,6 +132,33 @@ list_for_each_entry(_ss, &_root->subsys_
#define for_each_root(_root) \
list_for_each_entry(_root, &roots, root_list)
+/* Each task_struct has an embedded css_set, so the get/put
+ * operation simply takes a reference count on all the cgroups
+ * referenced by subsystems in this css_set. This can end up
+ * multiple-counting some cgroups, but that's OK - the ref-count is
+ * just a busy/not-busy indicator; ensuring that we only count each
+ * cgroup once would require taking a global lock to ensure that no
+ * subsystems moved between hierarchies while we were doing so.
+ *
+ * Possible TODO: decide at boot time based on the number of
+ * registered subsystems and the number of CPUs or NUMA nodes whether
+ * it's better for performance to ref-count every subsystem, or to
+ * take a global lock and only add one ref count to each hierarchy.
+ */
+static void get_css_set(struct css_set *cg)
+{
+ int i;
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+ atomic_inc(&cg->subsys[i]->cgroup->count);
+}
+
+static void put_css_set(struct css_set *cg)
+{
+ int i;
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+ atomic_dec(&cg->subsys[i]->cgroup->count);
+}
+
/*
* There is one global cgroup mutex. We also require taking
* task_lock() when dereferencing a task's cgroup subsys pointers.
@@ -1563,3 +1590,97 @@ int __init cgroup_init(void)
out:
return err;
}
+
+/**
+ * cgroup_fork - attach newly forked task to its parents cgroup.
+ * @tsk: pointer to task_struct of forking parent process.
+ *
+ * Description: A task inherits its parent's cgroup at fork().
+ *
+ * A pointer to the shared css_set was automatically copied in
+ * fork.c by dup_task_struct(). However, we ignore that copy, since
+ * it was not made under the protection of RCU or cgroup_mutex, so
+ * might no longer be a valid cgroup pointer. attach_task() might
+ * have already changed current->cgroup, allowing the previously
+ * referenced cgroup to be removed and freed.
+ *
+ * At the point that cgroup_fork() is called, 'current' is the parent
+ * task, and the passed argument 'child' points to the child task.
+ */
+void cgroup_fork(struct task_struct *child)
+{
+ rcu_read_lock();
+ child->cgroups = rcu_dereference(current->cgroups);
+ get_css_set(&child->cgroups);
+ rcu_read_unlock();
+}
+
+/**
+ * cgroup_fork_callbacks - called on a new task very soon before
+ * adding it to the tasklist. No need to take any locks since no-one
+ * can be operating on this task
+ */
+void cgroup_fork_callbacks(struct task_struct *child)
+{
+ if (need_forkexit_callback) {
+ int i;
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (ss->fork)
+ ss->fork(ss, child);
+ }
+ }
+}
+
+/**
+ * cgroup_exit - detach cgroup from exiting task
+ * @tsk: pointer to task_struct of exiting process
+ *
+ * Description: Detach cgroup from @tsk and release it.
+ *
+ * Note that cgroups marked notify_on_release force every task in
+ * them to take the global cgroup_mutex mutex when exiting.
+ * This could impact scaling on very large systems. Be reluctant to
+ * use notify_on_release cgroups where very high task exit scaling
+ * is required on large systems.
+ *
+ * the_top_cgroup_hack:
+ *
+ * Set the exiting tasks cgroup to the root cgroup (top_cgroup).
+ *
+ * We call cgroup_exit() while the task is still competent to
+ * handle notify_on_release(), then leave the task attached to the
+ * root cgroup in each hierarchy for the remainder of its exit.
+ *
+ * To do this properly, we would increment the reference count on
+ * top_cgroup, and near the very end of the kernel/exit.c do_exit()
+ * code we would add a second cgroup function call, to drop that
+ * reference. This would just create an unnecessary hot spot on
+ * the top_cgroup reference count, to no avail.
+ *
+ * Normally, holding a reference to a cgroup without bumping its
+ * count is unsafe. The cgroup could go away, or someone could
+ * attach us to a different cgroup, decrementing the count on
+ * the first cgroup that we never incremented. But in this case,
+ * top_cgroup isn't going away, and either task has PF_EXITING set,
+ * which wards off any attach_task() attempts, or task is a failed
+ * fork, never visible to attach_task.
+ *
+ */
+void cgroup_exit(struct task_struct *tsk, int run_callbacks)
+{
+ int i;
+
+ if (run_callbacks && need_forkexit_callback) {
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (ss->exit)
+ ss->exit(ss, tsk);
+ }
+ }
+ /* Reassign the task to the init_css_set. */
+ task_lock(tsk);
+ put_css_set(&tsk->cgroups);
+ tsk->cgroups = init_task.cgroups;
+ task_unlock(tsk);
+}
diff -puN kernel/exit.c~task-cgroupsv11-add-fork-exit-hooks kernel/exit.c
--- a/kernel/exit.c~task-cgroupsv11-add-fork-exit-hooks
+++ a/kernel/exit.c
@@ -33,6 +33,7 @@
#include <linux/delayacct.h>
#include <linux/freezer.h>
#include <linux/cpuset.h>
+#include <linux/cgroup.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/posix-timers.h>
@@ -981,6 +982,7 @@ fastcall NORET_TYPE void do_exit(long co
check_stack_usage();
exit_thread();
cpuset_exit(tsk);
+ cgroup_exit(tsk, 1);
exit_keys(tsk);
if (group_dead && tsk->signal->leader)
diff -puN kernel/fork.c~task-cgroupsv11-add-fork-exit-hooks kernel/fork.c
--- a/kernel/fork.c~task-cgroupsv11-add-fork-exit-hooks
+++ a/kernel/fork.c
@@ -30,6 +30,7 @@
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
+#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
@@ -967,6 +968,7 @@ static struct task_struct *copy_process(
{
int retval;
struct task_struct *p = NULL;
+ int cgroup_callbacks_done = 0;
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1068,12 +1070,13 @@ static struct task_struct *copy_process(
p->io_context = NULL;
p->audit_context = NULL;
cpuset_fork(p);
+ cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_copy(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
- goto bad_fork_cleanup_cpuset;
+ goto bad_fork_cleanup_cgroup;
}
mpol_fix_fork_child_flag(p);
#endif
@@ -1184,6 +1187,12 @@ static struct task_struct *copy_process(
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p, clone_flags);
+ /* Now that the task is set up, run cgroup callbacks if
+ * necessary. We need to run them before the task is visible
+ * on the tasklist. */
+ cgroup_fork_callbacks(p);
+ cgroup_callbacks_done = 1;
+
/* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);
@@ -1306,9 +1315,10 @@ bad_fork_cleanup_security:
bad_fork_cleanup_policy:
#ifdef CONFIG_NUMA
mpol_free(p->mempolicy);
-bad_fork_cleanup_cpuset:
+bad_fork_cleanup_cgroup:
#endif
cpuset_exit(p);
+ cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
if (p->binfmt)
module_put(p->binfmt->module);
_
--
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: [PATCH 02/33] task containersv11 basic task container framework fix |
|---|
| [PATCH 02/33] task containersv11 basic task container framework fix [message #20425] |
Mon, 17 September 2007 17:03 |
Paul Menage Messages: 642 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
Handle reading /proc/self/cpuset when cpusets isn't mounted.
Signed-off-by: Paul Menage <menage@google.com>
---
kernel/cgroup.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff -puN kernel/cgroup.c~task-cgroupsv11-basic-task-cgroup-framework-fix kernel/cgroup.c
--- a/kernel/cgroup.c~task-cgroupsv11-basic-task-cgroup-framework-fix
+++ a/kernel/cgroup.c
@@ -683,6 +683,15 @@ int cgroup_path(const struct containe
{
char *start;
+ if (cont == dummytop) {
+ /*
+ * Inactive subsystems have no dentry for their root
+ * cgroup
+ */
+ strcpy(buf, "/");
+ return 0;
+ }
+
start = buf + buflen;
*--start = '\0';
_
--
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: [PATCH 01/33] task containersv11 basic task container framework |
|---|
| [PATCH 01/33] task containersv11 basic task container framework [message #20428] |
Mon, 17 September 2007 17:03 |
Paul Menage Messages: 642 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
Generic Process Control Groups
--------------------------
There have recently been various proposals floating around for
resource management/accounting and other task grouping subsystems in
the kernel, including ResGroups, User BeanCounters, NSProxy
cgroups, and others. These all need the basic abstraction of being
able to group together multiple processes in an aggregate, in order to
track/limit the resources permitted to those processes, or control
other behaviour of the processes, and all implement this grouping in
different ways.
This patchset provides a framework for tracking and grouping processes
into arbitrary "cgroups" and assigning arbitrary state to those
groupings, in order to control the behaviour of the cgroup as an
aggregate.
The intention is that the various resource management and
virtualization/cgroup efforts can also become task cgroup
clients, with the result that:
- the userspace APIs are (somewhat) normalised
- it's easier to test e.g. the ResGroups CPU controller in
conjunction with the BeanCounters memory controller, or use either of
them as the resource-control portion of a virtual server system.
- the additional kernel footprint of any of the competing resource
management systems is substantially reduced, since it doesn't need
to provide process grouping/containment, hence improving their
chances of getting into the kernel
This patch:
Add the main task cgroups framework - the cgroup filesystem, and the
basic structures for tracking membership and associating subsystem state
objects to tasks.
Signed-off-by: Paul Menage <menage@google.com>
---
Documentation/cgroups.txt | 526 ++++++++++++
include/linux/cgroup.h | 214 +++++
include/linux/cgroup_subsys.h | 10
include/linux/magic.h | 1
include/linux/sched.h | 34
init/Kconfig | 8
init/main.c | 3
kernel/Makefile | 1
kernel/cgroup.c | 1199 +++++++++++++++++++++++++++++
9 files changed, 1995 insertions(+), 1 deletion(-)
diff -puN /dev/null Documentation/cgroups.txt
--- /dev/null
+++ a/Documentation/cgroups.txt
@@ -0,0 +1,526 @@
+ CGROUPS
+ -------
+
+Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt
+
+Original copyright statements from cpusets.txt:
+Portions Copyright (C) 2004 BULL SA.
+Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+Modified by Paul Jackson <pj@sgi.com>
+Modified by Christoph Lameter <clameter@sgi.com>
+
+CONTENTS:
+=========
+
+1. Control Groups
+ 1.1 What are cgroups ?
+ 1.2 Why are cgroups needed ?
+ 1.3 How are cgroups implemented ?
+ 1.4 What does notify_on_release do ?
+ 1.5 How do I use cgroups ?
+2. Usage Examples and Syntax
+ 2.1 Basic Usage
+ 2.2 Attaching processes
+3. Kernel API
+ 3.1 Overview
+ 3.2 Synchronization
+ 3.3 Subsystem API
+4. Questions
+
+1. Control Groups
+==========
+
+1.1 What are cgroups ?
+----------------------
+
+Control Groups provide a mechanism for aggregating/partitioning sets of
+tasks, and all their future children, into hierarchical groups with
+specialized behaviour.
+
+Definitions:
+
+A *cgroup* associates a set of tasks with a set of parameters for one
+or more subsystems.
+
+A *subsystem* is a module that makes use of the task grouping
+facilities provided by cgroups to treat groups of tasks in
+particular ways. A subsystem is typically a "resource controller" that
+schedules a resource or applies per-cgroup limits, but it may be
+anything that wants to act on a group of processes, e.g. a
+virtualization subsystem.
+
+A *hierarchy* is a set of cgroups arranged in a tree, such that
+every task in the system is in exactly one of the cgroups in the
+hierarchy, and a set of subsystems; each subsystem has system-specific
+state attached to each cgroup in the hierarchy. Each hierarchy has
+an instance of the cgroup virtual filesystem associated with it.
+
+At any one time there may be multiple active hierarchies of task
+cgroups. Each hierarchy is a partition of all tasks in the system.
+
+User level code may create and destroy cgroups by name in an
+instance of the cgroup virtual file system, specify and query to
+which cgroup a task is assigned, and list the task pids assigned to
+a cgroup. Those creations and assignments only affect the hierarchy
+associated with that instance of the cgroup file system.
+
+On their own, the only use for cgroups is for simple job
+tracking. The intention is that other subsystems hook into the generic
+cgroup support to provide new attributes for cgroups, such as
+accounting/limiting the resources which processes in a cgroup can
+access. For example, cpusets (see Documentation/cpusets.txt) allows
+you to associate a set of CPUs and a set of memory nodes with the
+tasks in each cgroup.
+
+1.2 Why are cgroups needed ?
+----------------------------
+
+There are multiple efforts to provide process aggregations in the
+Linux kernel, mainly for resource tracking purposes. Such efforts
+include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
+namespaces. These all require the basic notion of a
+grouping/partitioning of processes, with newly forked processes ending
+in the same group (cgroup) as their parent process.
+
+The kernel cgroup patch provides the minimum essential kernel
+mechanisms required to efficiently implement such groups. It has
+minimal impact on the system fast paths, and provides hooks for
+specific subsystems such as cpusets to provide additional behaviour as
+desired.
+
+Multiple hierarchy support is provided to allow for situations where
+the division of tasks into cgroups is distinctly different for
+different subsystems - having parallel hierarchies allows each
+hierarchy to be a natural division of tasks, without having to handle
+complex combinations of tasks that would be present if several
+unrelated subsystems needed to be forced into the same tree of
+cgroups.
+
+At one extreme, each resource controller or subsystem could be in a
+separate hierarchy; at the other extreme, all subsystems
+would be attached to the same hierarchy.
+
+As an example of a scenario (originally proposed by vatsa@in.ibm.com)
+that can benefit from multiple hierarchies, consider a large
+university server with various users - students, professors, system
+tasks etc. The resource planning for this server could be along the
+following lines:
+
+ CPU : Top cpuset
+ / \
+ CPUSet1 CPUSet2
+ | |
+ (Profs) (Students)
+
+ In addition (system tasks) are attached to topcpuset (so
+ that they can run anywhere) with a limit of 20%
+
+ Memory : Professors (50%), students (30%), system (20%)
+
+ Disk : Prof (50%), students (30%), system (20%)
+
+ Network : WWW browsing (20%), Network File System (60%), others (20%)
+ / \
+ Prof (15%) students (5%)
+
+Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go
+into NFS network class.
+
+At the same time firefox/lynx will share an appropriate CPU/Memory class
+depending on who launched it (prof/student).
+
+With the ability to classify tasks differently for different resources
+(by putting those resource subsystems in different hierarchies) then
+the admin can easily set up a script which receives exec notifications
+and depending on who is launching the browser he can
+
+ # echo browser_pid > /mnt/<restype>/<userclass>/tasks
+
+With only a single hierarchy, he now would potentially have to create
+a separate cgroup for every browser launched and associate it with
+the appropriate network and other resource class. This may lead to
+proliferation of such cgroups.
+
+Also let's say that the administrator would like to give enhanced network
+access temporarily to a student's browser (since it is night and the user
+wants to do online gaming :) OR give one of the students simulation
+apps enhanced CPU power,
+
+With the ability to write pids directly to resource classes, it's just a
+matter of :
+
+ # echo pid > /mnt/network/<new_class>/tasks
+ (after some time)
+ # echo pid > /mnt/network/<orig_class>/tasks
+
+Without this ability, he would have to split the cgroup into
+multiple separate ones and then associate the new cgroups with the
+new resource classes.
+
+
+
+1.3 How are cgroups implemented ?
+---------------------------------
+
+Control Groups extends the kernel as follows:
+
+ - Each task in the system has a reference-counted pointer to a
+ css_set.
+
+ - A css_set contains a set of reference-counted pointers to
+ cgroup_subsys_state objects, one for each cgroup subsystem
+ registered in the system. There is no direct link from a task to
+ the cgroup of which it's a member in each hierarchy, but this
+ can be determined by following pointers through the
+ cgroup_subsys_state objects. This is because accessing the
+ subsystem state is something that's expected to happen frequently
+ and in performance-critical code, whereas operations that require a
+ task's actual cgroup assignments (in particular, moving between
+ cgroups) are less common.
+
+ - A cgroup hierarchy filesystem can be mounted for browsing and
+ manipulation from user space.
+
+ - You can list all the tasks (by pid) attached to any cgroup.
+
+The implementation of cgroups requires a few, simple hooks
+into the rest of the kernel, none in performance critical paths:
+
+ - in init/main.c, to initialize the root cgroups and initial
+ css_set at system boot.
+
+ - in fork and exit, to attach and detach a task from its css_set.
+
+In addition a new file system, of type "cgroup" may be mounted, to
+enable browsing and modifying the cgroups presently known to the
+kernel. When mounting a cgro
...
|
|
| | Topic: [PATCH 4/5][AFS] Cleanup explicit check for mandatory locks |
|---|
| [PATCH 4/5][AFS] Cleanup explicit check for mandatory locks [message #20351] |
Mon, 17 September 2007 03:56 |
Pavel Emelianov Messages: 1149 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
The __mandatory_lock(inode) helper performs the same check, but
makes the code more readable.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: David Howells <dhowells@redhat.com>
---
fs/afs/flock.c | 3 +--
1 files changed, 1 insertion(+), 2 deletions(-)
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index af6952e..210acaf 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -524,8 +524,7 @@ int afs_lock(struct file *file, int cmd,
(long long) fl->fl_start, (long long) fl->fl_end);
/* AFS doesn't support mandatory locks */
- if ((vnode->vfs_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
- fl->fl_type != F_UNLCK)
+ if (__mandatory_lock(&vnode->vfs_inode) && fl->fl_type != F_UNLCK)
return -ENOLCK;
if (IS_GETLK(cmd))
|
|
| | Topic: [PATCH 1/5] Cleanup macros for distinguishing mandatory locks |
|---|
| [PATCH 1/5] Cleanup macros for distinguishing mandatory locks [message #20348] |
Mon, 17 September 2007 03:50 |
Pavel Emelianov Messages: 1149 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
The combination of S_ISGID bit set and S_IXGRP bit unset is
used to mark the inode as "mandatory lockable" and there's a
macro for this check called MANDATORY_LOCK(inode). However,
fs/locks.c and some filesystems still perform the explicit
i_mode checking. Besides, Andrew pointed out, that this macro
is buggy itself, as it dereferences the inode arg twice.
Convert this macro into static inline function and switch
its users to it, making the code shorter and more readable.
The __mandatory_lock() helper is to be used in places where
the IS_MANDLOCK() for superblock is already known to be true.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
---
fs/locks.c | 14 ++++----------
fs/nfsd/nfs4state.c | 2 +-
fs/nfsd/vfs.c | 2 +-
fs/read_write.c | 2 +-
include/linux/fs.h | 21 +++++++++++++++++----
5 files changed, 24 insertions(+), 17 deletions(-)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 291d40b..9c519e6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1488,12 +1488,25 @@ extern int locks_mandatory_area(int, str
* Candidates for mandatory locking have the setgid bit set
* but no group execute bit - an otherwise meaningless combination.
*/
-#define MANDATORY_LOCK(inode) \
- (IS_MANDLOCK(inode) && ((inode)->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+
+static inline int __mandatory_lock(struct inode *ino)
+{
+ return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID;
+}
+
+/*
+ * ... and these candidates should be on MS_MANDLOCK mounted fs,
+ * otherwise these will be advisory locks
+ */
+
+static inline int mandatory_lock(struct inode *ino)
+{
+ return IS_MANDLOCK(ino) && __mandatory_lock(ino);
+}
static inline int locks_verify_locked(struct inode *inode)
{
- if (MANDATORY_LOCK(inode))
+ if (mandatory_lock(inode))
return locks_mandatory_locked(inode);
return 0;
}
@@ -1504,7 +1517,7 @@ static inline int locks_verify_truncate(
struct file *filp,
loff_t size)
{
- if (inode->i_flock && MANDATORY_LOCK(inode))
+ if (inode->i_flock && mandatory_lock(inode))
return locks_mandatory_area(
FLOCK_VERIFY_WRITE, inode, filp,
size < inode->i_size ? size : inode->i_size,
diff --git a/fs/locks.c b/fs/locks.c
index f59d066..a71c589 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1116,7 +1116,7 @@ int locks_mandatory_area(int read_write,
* If we've been sleeping someone might have
* changed the permissions behind our back.
*/
- if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+ if (__mandatory_lock(inode))
continue;
}
@@ -1755,9 +1755,7 @@ int fcntl_setlk(unsigned int fd, struct
/* Don't allow mandatory locks on files that may be memory mapped
* and shared.
*/
- if (IS_MANDLOCK(inode) &&
- (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
- mapping_writably_mapped(filp->f_mapping)) {
+ if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) {
error = -EAGAIN;
goto out;
}
@@ -1881,9 +1879,7 @@ int fcntl_setlk64(unsigned int fd, struc
/* Don't allow mandatory locks on files that may be memory mapped
* and shared.
*/
- if (IS_MANDLOCK(inode) &&
- (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
- mapping_writably_mapped(filp->f_mapping)) {
+ if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) {
error = -EAGAIN;
goto out;
}
@@ -2077,9 +2073,7 @@ static void lock_get_status(char* out, s
out += sprintf(out, "%6s %s ",
(fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ",
(inode == NULL) ? "*NOINODE*" :
- (IS_MANDLOCK(inode) &&
- (inode->i_mode & (S_IXGRP | S_ISGID)) == S_ISGID) ?
- "MANDATORY" : "ADVISORY ");
+ mandatory_lock(inode) ? "MANDATORY" : "ADVISORY ");
} else if (IS_FLOCK(fl)) {
if (fl->fl_type & LOCK_MAND) {
out += sprintf(out, "FLOCK MSNFS ");
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 6256492..a0635d7 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2030,7 +2030,7 @@ static inline int
io_during_grace_disallowed(struct inode *inode, int flags)
{
return nfs4_in_grace() && (flags & (RD_STATE | WR_STATE))
- && MANDATORY_LOCK(inode);
+ && mandatory_lock(inode);
}
/*
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 70f2c86..3c703a7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -65,7 +65,7 @@
* locks on them because there is no way to know if the accesser has
* the lock.
*/
-#define IS_ISMNDLK(i) (S_ISREG((i)->i_mode) && MANDATORY_LOCK(i))
+#define IS_ISMNDLK(i) (S_ISREG((i)->i_mode) && mandatory_lock(i))
/*
* This is a cache of readahead params that help us choose the proper
diff --git a/fs/read_write.c b/fs/read_write.c
index 507ddff..124693e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -205,7 +205,7 @@ int rw_verify_area(int read_write, struc
if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
goto Einval;
- if (unlikely(inode->i_flock && MANDATORY_LOCK(inode))) {
+ if (unlikely(inode->i_flock && mandatory_lock(inode))) {
int retval = locks_mandatory_area(
read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
inode, file, pos, count);
|
|
| | Topic: [PATCH] shrink_dcache_sb speedup |
|---|
| [PATCH] shrink_dcache_sb speedup [message #20255] |
Fri, 14 September 2007 04:37 |
den Messages: 493 Registered: December 2005 |
Senior Member |
From: openvz.org
|
|
From: Denis V. Lunev <den@openvz.org>
This patch makes shrink_dcache_sb consistent with dentry pruning policy.
On the first pass we iterate over dentry unused list and prepare some
dentries for removal.
However, since the existing code moves evicted dentries
to the beginning of the LRU it can happen that fresh dentries from
other superblocks will be inserted *before* our dentries.
This can result in significant slowdown of shrink_dcache_sb().
Moreover, for virtual filesystems like unionfs, which can call dput()
while killing dentries, the existing code results in O(n^2) complexity.
We observed shrink_dcache_sb() taking 2 minutes with only 35000 dentries.
To avoid these effects we propose to isolate sb dentries at the end
of the LRU list.
Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Kirill Korotaev <dev@openvz.org>
Signed-off-by: Andrey Mirkin <amirkin@openvz.org>
-------
--- ./fs/dcache.c.shrink 2007-09-14 10:25:21.000000000 +0400
+++ ./fs/dcache.c 2007-09-14 10:26:08.000000000 +0400
@@ -553,18 +553,18 @@ void shrink_dcache_sb(struct super_block
* superblock to the most recent end of the unused list.
*/
spin_lock(&dcache_lock);
- list_for_each_safe(tmp, next, &dentry_unused) {
+ list_for_each_prev_safe(tmp, next, &dentry_unused) {
dentry = list_entry(tmp, struct dentry, d_lru);
if (dentry->d_sb != sb)
continue;
- list_move(tmp, &dentry_unused);
+ list_move_tail(tmp, &dentry_unused);
}
/*
* Pass two ... free the dentries for this superblock.
*/
repeat:
- list_for_each_safe(tmp, next, &dentry_unused) {
+ list_for_each_prev_safe(tmp, next, &dentry_unused) {
dentry = list_entry(tmp, struct dentry, d_lru);
if (dentry->d_sb != sb)
continue;
--- ./include/linux/list.h.shrink 2007-08-10 16:58:49.000000000 +0400
+++ ./include/linux/list.h 2007-09-14 10:26:08.000000000 +0400
@@ -478,6 +478,18 @@ static inline void list_splice_init_rcu(
pos = n, n = pos->next)
/**
+ * list_for_each_prev_safe - iterate over a list backwards safe against removal
+ of list entry
+ * @pos: the &struct list_head to use as a loop cursor.
+ * @n: another &struct list_head to use as temporary storage
+ * @head: the head for your list.
+ */
+#define list_for_each_prev_safe(pos, n, head) \
+ for (pos = (head)->prev, n = pos->prev; \
+ prefetch(pos->prev), pos != (head); \
+ pos = n, n = pos->prev)
+
+/**
* list_for_each_entry - iterate over list of given type
* @pos: the type * to use as a loop cursor.
* @head: the head for your list.
|
|
| | Topic: [NETNS][patch 0/1] fix allnoconfig compilation error |
|---|
| [NETNS][patch 0/1] fix allnoconfig compilation error [message #20172] |
Wed, 12 September 2007 16:48 |
Daniel Lezcano Messages: 417 Registered: June 2006 |
Senior Member |
From: openvz.org
|
|
fixes a compilation issue when allnoconfig is used.
- init_net is unresolved.
If ok, I send it right now to Dave Miller.
--
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: [BUG] ULOG problem on stable 2.6.18 |
|---|
| [BUG] ULOG problem on stable 2.6.18 [message #20147] |
Wed, 12 September 2007 09:39 |
Enrico Weigelt Messages: 31 Registered: July 2006 |
Member |
From: openvz.org
|
|
Hi folks,
I'm just trying the current stable kernel on an amd-k8
and ran into trouble w/ netfilter's ulog target:
ip_tables: ULOG target: invalid size 44 != 76
I had a look at the web and found out it might be a 32/64 bit
issue (userland's and kernel's int size differs). So I tried
building for generic i586. But this also didn't help.
A netfilter changelog on the web says that the issue has
been solved, but probably that's in a newer kernel and not
backported to the ovz kernel yet.
My big problem is that I need the ulog (even if it's silent ;O)
to get the kernel running together with some given firewall script,
since I'd like to add openvz support to a production-grade
distro which is used by non-experienced people. Rewriting the
firewall script would probably break the server management
system :(
Maybe someone has an idea that might help ?
thx
--
---------------------------------------------------------------------
Enrico Weigelt == metux IT service - http://www.metux.de/
---------------------------------------------------------------------
Please visit the OpenSource QM Taskforce:
http://wiki.metux.de/public/OpenSource_QM_Taskforce
Patches / Fixes for a lot dozens of packages in dozens of versions:
http://patches.metux.de/
---------------------------------------------------------------------
|
|
| | Topic: [-mm PATCH] Memory controller make charging gfp mask aware |
|---|
| [-mm PATCH] Memory controller make charging gfp mask aware [message #20132] |
Wed, 12 September 2007 08:14 |
Balbir Singh Messages: 491 Registered: August 2006 |
Senior Member |
From: openvz.org
|
|
Nick Piggin pointed out that swap cache and page cache addition routines
could be called from non GFP_KERNEL contexts. This patch makes the charging
routine aware of the gfp context. Charging might fail if the container is
over its limit, in which case a suitable error is returned.
This patch was tested on a Powerpc box. I am still looking at being able
to test the path, through which allocations happen in non GFP_KERNEL contexts.
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---
include/linux/memcontrol.h | 12 ++++++++----
include/linux/swap.h | 3 ++-
mm/filemap.c | 2 +-
mm/memcontrol.c | 24 +++++++++++++++++-------
mm/memory.c | 10 +++++-----
mm/migrate.c | 2 +-
mm/swap_state.c | 2 +-
mm/swapfile.c | 2 +-
mm/vmscan.c | 5 +++--
9 files changed, 39 insertions(+), 23 deletions(-)
diff -puN include/linux/memcontrol.h~memory-controller-make-charging-gfpmask-aware include/linux/memcontrol.h
--- linux-2.6.23-rc4/include/linux/memcontrol.h~memory-controller-make-charging-gfpmask-aware 2007-09-11 22:20:50.000000000 +0530
+++ linux-2.6.23-rc4-balbir/include/linux/memcontrol.h 2007-09-11 22:23:32.000000000 +0530
@@ -32,7 +32,8 @@ extern void mm_free_container(struct mm_
extern void page_assign_page_container(struct page *page,
struct page_container *pc);
extern struct page_container *page_get_page_container(struct page *page);
-extern int mem_container_charge(struct page *page, struct mm_struct *mm);
+extern int mem_container_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask);
extern void mem_container_uncharge(struct page_container *pc);
extern void mem_container_move_lists(struct page_container *pc, bool active);
extern unsigned long mem_container_isolate_pages(unsigned long nr_to_scan,
@@ -42,7 +43,8 @@ extern unsigned long mem_container_isola
struct mem_container *mem_cont,
int active);
extern void mem_container_out_of_memory(struct mem_container *mem);
-extern int mem_container_cache_charge(struct page *page, struct mm_struct *mm);
+extern int mem_container_cache_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask);
extern struct mem_container *mm_container(struct mm_struct *mm);
static inline void mem_container_uncharge_page(struct page *page)
@@ -70,7 +72,8 @@ static inline struct page_container *pag
return NULL;
}
-static inline int mem_container_charge(struct page *page, struct mm_struct *mm)
+static inline int mem_container_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask)
{
return 0;
}
@@ -89,7 +92,8 @@ static inline void mem_container_move_li
}
static inline int mem_container_cache_charge(struct page *page,
- struct mm_struct *mm)
+ struct mm_struct *mm,
+ gfp_t gfp_mask)
{
return 0;
}
diff -puN mm/memcontrol.c~memory-controller-make-charging-gfpmask-aware mm/memcontrol.c
--- linux-2.6.23-rc4/mm/memcontrol.c~memory-controller-make-charging-gfpmask-aware 2007-09-11 22:20:50.000000000 +0530
+++ linux-2.6.23-rc4-balbir/mm/memcontrol.c 2007-09-12 00:25:12.000000000 +0530
@@ -261,7 +261,8 @@ unsigned long mem_container_isolate_page
* 0 if the charge was successful
* < 0 if the container is over its limit
*/
-int mem_container_charge(struct page *page, struct mm_struct *mm)
+int mem_container_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask)
{
struct mem_container *mem;
struct page_container *pc, *race_pc;
@@ -287,7 +288,7 @@ int mem_container_charge(struct page *pa
unlock_page_container(page);
- pc = kzalloc(sizeof(struct page_container), GFP_KERNEL);
+ pc = kzalloc(sizeof(struct page_container), gfp_mask);
if (pc == NULL)
goto err;
@@ -314,7 +315,14 @@ int mem_container_charge(struct page *pa
* the container limit.
*/
while (res_counter_charge(&mem->res, 1)) {
- if (try_to_free_mem_container_pages(mem))
+ bool is_atomic = gfp_mask & GFP_ATOMIC;
+ /*
+ * We cannot reclaim under GFP_ATOMIC, fail the charge
+ */
+ if (is_atomic)
+ goto noreclaim;
+
+ if (try_to_free_mem_container_pages(mem, gfp_mask))
continue;
/*
@@ -338,9 +346,10 @@ int mem_container_charge(struct page *pa
congestion_wait(WRITE, HZ/10);
continue;
}
-
+noreclaim:
css_put(&mem->css);
- mem_container_out_of_memory(mem);
+ if (!is_atomic)
+ mem_container_out_of_memory(mem);
goto free_pc;
}
@@ -381,7 +390,8 @@ err:
/*
* See if the cached pages should be charged at all?
*/
-int mem_container_cache_charge(struct page *page, struct mm_struct *mm)
+int mem_container_cache_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask)
{
struct mem_container *mem;
if (!mm)
@@ -389,7 +399,7 @@ int mem_container_cache_charge(struct pa
mem = rcu_dereference(mm->mem_container);
if (mem->control_type == MEM_CONTAINER_TYPE_ALL)
- return mem_container_charge(page, mm);
+ return mem_container_charge(page, mm, gfp_mask);
else
return 0;
}
diff -puN mm/memory.c~memory-controller-make-charging-gfpmask-aware mm/memory.c
--- linux-2.6.23-rc4/mm/memory.c~memory-controller-make-charging-gfpmask-aware 2007-09-11 22:20:50.000000000 +0530
+++ linux-2.6.23-rc4-balbir/mm/memory.c 2007-09-11 22:54:09.000000000 +0530
@@ -1137,7 +1137,7 @@ static int insert_page(struct mm_struct
pte_t *pte;
spinlock_t *ptl;
- retval = mem_container_charge(page, mm);
+ retval = mem_container_charge(page, mm, GFP_KERNEL);
if (retval)
goto out;
@@ -1638,7 +1638,7 @@ gotten:
goto oom;
cow_user_page(new_page, old_page, address, vma);
- if (mem_container_charge(new_page, mm))
+ if (mem_container_charge(new_page, mm, GFP_KERNEL))
goto oom_free_new;
/*
@@ -2101,7 +2101,7 @@ static int do_swap_page(struct mm_struct
}
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
- if (mem_container_charge(page, mm)) {
+ if (mem_container_charge(page, mm, GFP_KERNEL)) {
ret = VM_FAULT_OOM;
goto out;
}
@@ -2185,7 +2185,7 @@ static int do_anonymous_page(struct mm_s
if (!page)
goto oom;
- if (mem_container_charge(page, mm))
+ if (mem_container_charge(page, mm, GFP_KERNEL))
goto oom_free_page;
entry = mk_pte(page, vma->vm_page_prot);
@@ -2320,7 +2320,7 @@ static int __do_fault(struct mm_struct *
}
- if (mem_container_charge(page, mm)) {
+ if (mem_container_charge(page, mm, GFP_KERNEL)) {
ret = VM_FAULT_OOM;
goto out;
}
diff -puN mm/filemap.c~memory-controller-make-charging-gfpmask-aware mm/filemap.c
--- linux-2.6.23-rc4/mm/filemap.c~memory-controller-make-charging-gfpmask-aware 2007-09-11 22:20:50.000000000 +0530
+++ linux-2.6.23-rc4-balbir/mm/filemap.c 2007-09-11 22:54:19.000000000 +0530
@@ -445,7 +445,7 @@ int add_to_page_cache(struct page *page,
if (error == 0) {
- error = mem_container_cache_charge(page, current->mm);
+ error = mem_container_cache_charge(page, current->mm, gfp_mask);
if (error)
goto out;
diff -puN mm/migrate.c~memory-controller-make-charging-gfpmask-aware mm/migrate.c
--- linux-2.6.23-rc4/mm/migrate.c~memory-controller-make-charging-gfpmask-aware 2007-09-11 22:20:50.000000000 +0530
+++ linux-2.6.23-rc4-balbir/mm/migrate.c 2007-09-11 22:54:29.000000000 +0530
@@ -158,7 +158,7 @@ static void remove_migration_pte(struct
return;
}
- if (mem_container_charge(new, mm)) {
+ if (mem_container_charge(new, mm, GFP_KERNEL)) {
pte_unmap(ptep);
return;
}
diff -puN mm/page_alloc.c~memory-controller-make-charging-gfpmask-aware mm/page_alloc.c
diff -puN mm/rmap.c~memory-controller-make-charging-gfpmask-aware mm/rmap.c
diff -puN mm/swapfile.c~memory-controller-make-charging-gfpmask-aware mm/swapfile.c
--- linux-2.6.23-rc4/mm/swapfile.c~memory-controller-make-charging-gfpmask-aware 2007-09-11 22:20:50.000000000 +0530
+++ linux-2.6.23-rc4-balbir/mm/swapfile.c 2007-09-11 22:54:52.000000000 +0530
@@ -510,7 +510,7 @@ unsigned int count_swap_pages(int type,
static int unuse_pte(struct vm_area_struct *vma, pte_t *pte,
unsigned long addr, swp_entry_t entry, struct page *page)
{
- if (mem_container_charge(page, vma->vm_mm))
+ if (mem_container_charge(page, vma->vm_mm, GFP_KERNEL))
return -ENOMEM;
inc_mm_counter(vma->vm_mm, anon_rss);
diff -puN mm/swap_state.c~memory-controller-make-charging-gfpmask-aware mm/swap_state.c
--- linux-2.6.23-rc4/mm/swap_state.c~memory-controller-make-charging-gfpmask-aware 2007-09-11 22:20:50.000000000 +0530
+++ linux-2.6.23-rc4-balbir/mm/swap_state.c 2007-09-11 22:55:12.000000000 +0530
@@ -81,7 +81,7 @@ static int __add_to_swap_cache(struct pa
error = radix_tree_preload(gfp_mask);
if (!error) {
- error = mem_container_cache_charge(page, current->mm);
+ error = mem_container_cache_charge(page, current->mm, gfp_mask);
if (error)
goto out;
diff -puN mm/vmscan.c~memory-controller-make-charging-gfpmask-aware mm/vmscan.c
--- linux-2.6.23-rc4/mm/vmscan.c~memory-controller-make-charging-gfpmask-aware 2007-09-11 22:20:50.000000000 +0530
+++ linux-2.6.23-rc4-balbir/mm/vmscan.c 2007-09-11 23:05:40.000000000 +0530
@@ -1357,10 +1357,11 @@ unsigned long try_to_free_pages(struct z
#define ZONE_USERPAGES ZONE_NORMAL
#endif
-unsigned long try_to_free_mem_container_pages(struct mem_container *mem_cont)
+unsigned long try_to_free_mem_container_pages(struct mem_container *mem_cont,
+ gfp_t gfp_mask)
{
struct scan_control sc = {
- .gfp_mask = GFP_KERNEL,
+ .gfp_mask = gfp_mask,
.may_writepage = !laptop_mode,
.may_swap = 1,
.swap_cluster_max = SWAP_CLUSTER_MAX,
diff -puN include/linux/swap.h~memory-controller-make-charging-gfpmask-aware include/linux/swap.h
--- linux-2.6.23-rc4/include/linux/swap.h~memory-controller-make-charging-gfpmask-aware 2007-09-12 00:11:37.000000000 +0530
+++ linux-2.6.23-rc4-balbir/include/linux/swap.h 2007-09-11 23:05:59.000000000 +0530
@@ -191,7 +191,8 @@ extern void swap_setup(void);
/* linux/mm/vmscan.c *
...
|
|
| | Topic: [PATCH 5/5][NFS] Use macro instead of explicit check for mandatory locks |
|---|
| [PATCH 5/5][NFS] Use macro instead of explicit check for mandatory locks [message #20120] |
Wed, 12 September 2007 07:28 |
Pavel Emelianov Messages: 1149 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
The __MANDATORY_LOCK(inode) macro makes the same check, but
makes the code more readable.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
---
fs/nfs/file.c | 3 +--
1 files changed, 1 insertion(+), 2 deletions(-)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 73ddd2e..8dc2cde 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -605,8 +605,7 @@ static int nfs_lock(struct file *filp, i
nfs_inc_stats(inode, NFSIOS_VFSLOCK);
/* No mandatory locks over NFS */
- if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
- fl->fl_type != F_UNLCK)
+ if (__MANDATORY_LOCK(inode) && fl->fl_type != F_UNLCK)
return -ENOLCK;
if (IS_GETLK(cmd))
|
|
| | Topic: [PATCH 4/5][AFS] Use macro instead of explicit check for mandatory locks |
|---|
| [PATCH 4/5][AFS] Use macro instead of explicit check for mandatory locks [message #20119] |
Wed, 12 September 2007 07:27 |
Pavel Emelianov Messages: 1149 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
The __MANDATORY_LOCK(inode) macro makes the same check, but
makes the code more readable.
Unfortunately, I haven't found the maintainer for this FS in
the MAINTAINERS file.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
---
fs/afs/flock.c | 3 +--
1 files changed, 1 insertion(+), 2 deletions(-)
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index af6952e..9ddac05 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -524,8 +524,7 @@ int afs_lock(struct file *file, int cmd,
(long long) fl->fl_start, (long long) fl->fl_end);
/* AFS doesn't support mandatory locks */
- if ((vnode->vfs_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
- fl->fl_type != F_UNLCK)
+ if (__MANDATORY_LOCK(vnode->vfs_inode) && fl->fl_type != F_UNLCK)
return -ENOLCK;
if (IS_GETLK(cmd))
|
|
| | Topic: [PATCH 3/5][9PFS] Use macro instead of explicit check for mandatory locks |
|---|
| [PATCH 3/5][9PFS] Use macro instead of explicit check for mandatory locks [message #20118] |
Wed, 12 September 2007 07:23 |
Pavel Emelianov Messages: 1149 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
The __MANDATORY_LOCK(inode) macro makes the same check, but
makes the code more readable.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Eric Van Hensbergen <ericvh@gmail.com>
Cc: Ron Minnich <rminnich@sandia.gov>
Cc: Latchesar Ionkov <lucho@ionkov.net>
---
fs/9p/vfs_file.c | 2 +-
1 files changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 2a40c29..7e75309 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -105,7 +105,7 @@ static int v9fs_file_lock(struct file *f
P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
/* No mandatory locks */
- if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+ if (__MANDATORY_LOCK(inode))
return -ENOLCK;
if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
|
|
| | Topic: [PATCH 2/5][GFS2] Use macro instead of explicit check for mandatory locks |
|---|
| [PATCH 2/5][GFS2] Use macro instead of explicit check for mandatory locks [message #20117] |
Wed, 12 September 2007 07:20 |
Pavel Emelianov Messages: 1149 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
The __MANDATORY_LOCK(inode) macro makes the same check, but
makes the code more readable.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Steven Whitehouse <swhiteho@redhat.com>
---
fs/gfs2/ops_file.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 94d76ac..7e814f4 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -535,7 +535,7 @@ static int gfs2_lock(struct file *file,
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
- if ((ip->i_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+ if (__MANDATORY_LOCK(ip->i_inode))
return -ENOLCK;
if (sdp->sd_args.ar_localflocks) {
@@ -637,7 +637,7 @@ static int gfs2_flock(struct file *file,
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
- if ((ip->i_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+ if (__MANDATORY_LOCK(ip->i_inode))
return -ENOLCK;
if (sdp->sd_args.ar_localflocks)
|
|
| | Topic: Couple of rpm requests, 2.6.22 |
|---|
| Couple of rpm requests, 2.6.22 [message #20055] |
Tue, 11 September 2007 11:07 |
mhw Messages: 12 Registered: March 2007 |
Junior Member |
From: openvz.org
|
|
Hey all!
Not sure if I should post this for discussion or just file these
requests in bugzilla. I'll do both... :-)
I've got a couple of requests for some changes in the rpm builds, now
that development is on 2.6.22.
2.6.22 includes MD5 tcp signatures which are necessary for IPv4
passwords on BGP sessions (quagga w/ password patch). The default
builds for Fedora 2.6.22 builds include this option but it has not been
enabled in the OpenVZ builds. Any chance of getting that slipped in for
the next release (which I understand should also fix the netfilter
problems?).
I also noticed that SIT tunnels for IPv6 are not enabled in the build,
which is a standard default option for all RedHat / Fedora / CentOS
builds.
The RedHat and Fedora builds include development rpm's. I maintain a
couple of kernel modules and often build outside of the kernel, so I
need the development bits and pieces. I'm curious why a "devel" package
isn't similarly provided with the 2.6.22 packages.
These are sort of show stoppers for me which require I rebuild the rpms
from srpm for some of my systems, where the 2.6.22 kernel is required.
Regards,
Mike
--
Michael H. Warfield (AI4NB) | (770) 985-6132 | mhw@WittsEnd.com
/\/\|=mhw=|\/\/ | (678) 463-0932 | http://www.wittsend.com/mhw/
NIC whois: MHW9 | An optimist believes we live in the best of all
PGP Key: 0xDF1DD471 | possible worlds. A pessimist is sure of it!
|
|
| | Topic: [PATCH 3/3] Signal semantics for pid namespaces |
|---|
| [PATCH 3/3] Signal semantics for pid namespaces [message #20026] |
Tue, 11 September 2007 00:12 |
Sukadev Bhattiprolu Messages: 413 Registered: August 2006 |
Senior Member |
From: openvz.org
|
|
From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Subject: [PATCH 3/3] Signal semantics for pid namespaces
With support for multiple pid namespaces, each pid namespace has a
separate child reaper and this process needs some special handling
of signals.
- The child reaper should appear like a normal process to other
processes in its ancestor namespaces and so should be killable
(or not) in the usual way.
- The child reaper should receive, from processes in its active
and descendant namespaces, only those signals for which it has
installed a signal handler.
- System-wide signals (eg: kill signum -1) from within a child namespace
should only affect processes within that namespace and descendant
namespaces. They should not be posted to processes in ancestor or
sibling namespaces.
- If the sender of a signal does not have a pid_t in the receiver's
namespace (eg: a process in init_pid_ns sends a signal to a process
in a descendant namespace), the sender's pid and uid should appear
as 0 in the signal's 'siginfo' structure.
- Existing rules for SIGIO delivery still apply and a process can
choose any other process in its namespace and descendant namespaces
to receive the SIGIO signal.
The following appears to be incorrect in the fcntl() man page for
F_SETOWN.
Sending a signal to the owner process (group) specified by
F_SETOWN is subject to the same permissions checks as are
described for kill(2), where the sending process is the one that
employs F_SETOWN (but see BUGS below).
Current behavior is that the SIGIO signal is delivered on behalf of
the process that caused the event (eg: made data available on the
file) and not the process that called fcntl().
Changelog:
- [Oleg Nesterov]: Used the interfaces, is_current_in_ancestor_pid_ns()
and is_current_in_same_or_ancestor_pid_ns().
- [Oleg Nesterov]: Clear info.si_uid also when masquerading sender.
Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
---
kernel/signal.c | 28 +++++++++++++++++++++++++++-
1 file changed, 27 insertions(+), 1 deletion(-)
Index: 2.6.23-rc4-mm1/kernel/signal.c
===================================================================
--- 2.6.23-rc4-mm1.orig/kernel/signal.c 2007-09-10 18:42:16.000000000 -0700
+++ 2.6.23-rc4-mm1/kernel/signal.c 2007-09-10 18:42:16.000000000 -0700
@@ -25,6 +25,7 @@
#include <linux/capability.h>
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
+#include <linux/pid.h>
#include <linux/nsproxy.h>
#include <linux/hardirq.h>
@@ -45,7 +46,10 @@ static int sig_init_ignore(struct task_s
// Currently this check is a bit racy with exec(),
// we can _simplify_ de_thread and close the race.
- if (likely(!is_global_init(tsk->group_leader)))
+ if (likely(!is_container_init(tsk->group_leader)))
+ return 0;
+
+ if (is_current_in_ancestor_pid_ns(tsk) && !in_interrupt())
return 0;
return 1;
@@ -681,6 +685,20 @@ static void handle_stop_signal(int sig,
}
}
+static void masquerade_sender(struct task_struct *t, struct sigqueue *q)
+{
+ /*
+ * If the sender does not have a pid_t in the receiver's active
+ * pid namespace, set si_pid to 0 and pretend signal originated
+ * from the kernel.
+ */
+ if (!pid_ns_equal(t)) {
+ q->info.si_pid = 0;
+ q->info.si_uid = 0;
+ q->info.si_code = SI_KERNEL;
+ }
+}
+
static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
struct sigpending *signals)
{
@@ -732,6 +750,7 @@ static int send_signal(int sig, struct s
copy_siginfo(&q->info, info);
break;
}
+ masquerade_sender(t, q);
} else if (!is_si_special(info)) {
if (sig >= SIGRTMIN && info->si_code != SI_USER)
/*
@@ -1165,6 +1184,7 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
static int kill_something_info(int sig, struct siginfo *info, int pid)
{
int ret;
+
rcu_read_lock();
if (!pid) {
ret = kill_pgrp_info(sig, info, task_pgrp(current));
@@ -1174,6 +1194,12 @@ static int kill_something_info(int sig,
read_lock(&tasklist_lock);
for_each_process(p) {
+ /*
+ * System-wide signals only apply to pid namespace
+ * of sender.
+ */
+ if (!is_current_in_same_or_ancestor_pid_ns(p))
+ continue;
if (p->pid > 1 && !same_thread_group(p, current)) {
int err = group_send_sig_info(sig, info, p);
++count;
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: [PATCH 2/3] Pid ns helpers for signals |
|---|
| [PATCH 2/3] Pid ns helpers for signals [message #20025] |
Tue, 11 September 2007 00:11 |
Sukadev Bhattiprolu Messages: 413 Registered: August 2006 |
Senior Member |
From: openvz.org
|
|
Define some helper functions that will be used to implement signal semantics
with multiple pid namespaces.
is_current_in_ancestor_pid_ns(task)
TRUE iff active pid namespace of 'current' is an ancestor of
active pid namespace of @task.
is_current_in_same_or_ancestor_pid_ns(task)
TRUE iff active pid namespace of 'current' is either same as
or an ancestor of active pid namespace of @task.
pid_ns_equal(tsk)
TRUE if active pid ns of @tsk is same as active pid ns of
'current'.
Changelog: [Oleg Nesterov]: Renamed helpers. Dropped reference to pid and
pid-namespace since they are stable for current
callers.
---
include/linux/pid.h | 12 +++++++++
kernel/pid.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 75 insertions(+)
Index: 2.6.23-rc4-mm1/include/linux/pid.h
===================================================================
--- 2.6.23-rc4-mm1.orig/include/linux/pid.h 2007-09-07 19:18:34.000000000 -0700
+++ 2.6.23-rc4-mm1/include/linux/pid.h 2007-09-07 19:18:42.000000000 -0700
@@ -124,6 +124,18 @@ extern struct pid *alloc_pid(struct pid_
extern void FASTCALL(free_pid(struct pid *pid));
extern void zap_pid_ns_processes(struct pid_namespace *pid_ns);
+static inline struct pid_namespace *pid_active_ns(struct pid *pid)
+{
+ if (!pid)
+ return NULL;
+
+ return pid->numbers[pid->level].ns;
+}
+
+extern int pid_ns_equal(struct task_struct *tsk);
+extern int is_current_in_ancestor_pid_ns(struct task_struct *tsk);
+extern int is_current_in_same_or_ancestor_pid_ns(struct task_struct *tsk);
+
/*
* the helpers to get the pid's id seen from different namespaces
*
Index: 2.6.23-rc4-mm1/kernel/pid.c
===================================================================
--- 2.6.23-rc4-mm1.orig/kernel/pid.c 2007-09-07 19:18:34.000000000 -0700
+++ 2.6.23-rc4-mm1/kernel/pid.c 2007-09-10 18:35:51.000000000 -0700
@@ -199,6 +199,69 @@ static int next_pidmap(struct pid_namesp
return -1;
}
+/*
+ * Return TRUE if the active pid namespace of @tsk is same as active
+ * pid namespace of 'current'.
+ */
+int pid_ns_equal(struct task_struct *tsk)
+{
+ struct pid_namespace *my_ns = pid_active_ns(task_pid(current));
+ struct pid_namespace *tsk_ns = pid_active_ns(task_pid(tsk));
+
+ return my_ns == tsk_ns;
+}
+
+/*
+ * Return TRUE if pid namespace @ns1 is an ancestor of pid namespace @ns2.
+ * Return FALSE otherwise.
+ *
+ * Note: Callers must ensure @ns1 and @ns2 are stable.
+ */
+static int ancestor_pid_ns(struct pid_namespace *ns1, struct pid_namespace *ns2)
+{
+ int i;
+ struct pid_namespace *tmp;
+
+ if (ns1 == NULL || ns2 == NULL)
+ return 0;
+
+ if (ns1->level >= ns2->level)
+ return 0;
+
+ tmp = ns2->parent;
+ for (i = tmp->level; i >= ns1->level; i--) {
+ if (tmp == ns1)
+ return 1;
+ tmp = tmp->parent;
+ }
+
+ return 0;
+}
+
+/*
+ * Return TRUE if active pid namespace of 'current' is an ancestor of
+ * pid namespace of @tsk. Return FALSE otherwise.
+ */
+int is_current_in_ancestor_pid_ns(struct task_struct *tsk)
+{
+ struct pid_namespace *my_ns = pid_active_ns(task_pid(current));
+ struct pid_namespace *tsk_ns = pid_active_ns(task_pid(tsk));
+
+ return ancestor_pid_ns(my_ns, tsk_ns);
+}
+
+/*
+ * Return TRUE if active pid namespace of 'current' is either same as
+ * or an ancestor of active pid namespace of @tsk.
+ */
+int is_current_in_same_or_ancestor_pid_ns(struct task_struct *tsk)
+{
+ struct pid_namespace *my_ns = pid_active_ns(task_pid(current));
+ struct pid_namespace *tsk_ns = pid_active_ns(task_pid(tsk));
+
+ return my_ns == tsk_ns || ancestor_pid_ns(my_ns, tsk_ns);
+}
+
fastcall void put_pid(struct pid *pid)
{
struct pid_namespace *ns;
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: [PATCH] Containers: Fix refcount bug |
|---|
| [PATCH] Containers: Fix refcount bug [message #20010] |
Mon, 10 September 2007 17:51 |
menage Messages: 5 Registered: August 2007 |
Junior Member |
From: openvz.org
|
|
Fix a reference counting bug in containerfs
As part of the extraction of cpusetfs to containerfs, a call to
cpuset_get_dentry() was lost (justified by the fact that the dentry in
question was now being passed down by the caller). Since
cpuset_get_dentry() called lookup_one_len(), this resulted in a
reference count being missed from the directory dentry.
This patch removes container_get_dentry() and replaces it with direct
calls to lookup_one_len(); the initialization of containerfs dentry
ops is done now in container_create_file() at dentry creation time.
Signed-off-by: Paul Menage <menage@google.com>
---
kernel/container.c | 26 ++++++++------------------
1 file changed, 8 insertions(+), 18 deletions(-)
Index: container-2.6.23-rc3-mm1/kernel/container.c
===================================================================
--- container-2.6.23-rc3-mm1.orig/kernel/container.c
+++ container-2.6.23-rc3-mm1/kernel/container.c
@@ -603,19 +603,6 @@ static void container_diput(struct dentr
iput(inode);
}
-static struct dentry *container_get_dentry(struct dentry *parent,
- const char *name)
-{
- struct dentry *d = lookup_one_len(name, parent, strlen(name));
- static struct dentry_operations container_dops = {
- .d_iput = container_diput,
- };
-
- if (!IS_ERR(d))
- d->d_op = &container_dops;
- return d;
-}
-
static void remove_dir(struct dentry *d)
{
struct dentry *parent = dget(d->d_parent);
@@ -1506,6 +1493,10 @@ static struct inode_operations container
static int container_create_file(struct dentry *dentry, int mode,
struct super_block *sb)
{
+ static struct dentry_operations container_dops = {
+ .d_iput = container_diput,
+ };
+
struct inode *inode;
if (!dentry)
@@ -1531,7 +1522,7 @@ static int container_create_file(struct
inode->i_size = 0;
inode->i_fop = &container_file_operations;
}
-
+ dentry->d_op = &container_dops;
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
return 0;
@@ -1552,13 +1543,12 @@ static int container_create_dir(struct c
int error = 0;
parent = cont->parent->dentry;
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
error = container_create_file(dentry, S_IFDIR | mode, cont->root->sb);
if (!error) {
dentry->d_fsdata = cont;
inc_nlink(parent->d_inode);
cont->dentry = dentry;
+ dget(dentry);
}
dput(dentry);
@@ -1580,7 +1570,7 @@ int container_add_file(struct container
}
strcat(name, cft->name);
BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
- dentry = container_get_dentry(dir, name);
+ dentry = lookup_one_len(name, dir, strlen(name));
if (!IS_ERR(dentry)) {
error = container_create_file(dentry, 0644 | S_IFREG,
cont->root->sb);
@@ -2586,7 +2576,7 @@ int container_clone(struct task_struct *
/* Hold the parent directory mutex across this operation to
* stop anyone else deleting the new container */
mutex_lock(&inode->i_mutex);
- dentry = container_get_dentry(parent->dentry, nodename);
+ dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
if (IS_ERR(dentry)) {
printk(KERN_INFO
"Couldn't allocate dentry for %s: %ld\n", nodename,
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: pid namespace .text overhead |
|---|
| pid namespace .text overhead [message #20027] |
Mon, 10 September 2007 15:38 |
Cedric Le Goater Messages: 443 Registered: February 2006 |
Senior Member |
From: openvz.org
|
|
FYI,
I just did a compile test on a 2.6.23-rc4-mm1 kernel with and without
the following patches on a x86_64 defconfig (I also had to remove
CONFIG_IPV6 for some compile reason) :
+ pid-namespaces-rework-forget_original_parent.patch
+ pid-namespaces-move-exit_task_namespaces.patch
+ pid-namespaces-introduce-ms_kernmount-flag.patch
+ pid-namespaces-prepare-proc_flust_task-to-flush-entries-from-multiple-proc-trees.patch
+ pid-namespaces-introduce-struct-upid.patch
+ pid-namespaces-add-support-for-pid-namespaces-hierarchy.patch
+ pid-namespaces-make-alloc_pid-free_pid-and-put_pid-work-with-struct-upid.patch
+ pid-namespaces-helpers-to-obtain-pid-numbers.patch
+ pid-namespaces-helpers-to-find-the-task-by-its-numerical-ids.patch
+ pid-namespaces-helpers-to-find-the-task-by-its-numerical-ids-fix.patch
+ pid-namespaces-move-alloc_pid-lower-in-copy_process.patch
+ pid-namespaces-make-proc-have-multiple-superblocks-one-for-each-namespace.patch
+ pid-namespaces-miscelaneous-preparations-for-pid-namespaces.patch
+ pid-namespaces-allow-cloning-of-new-namespace.patch
+ pid-namespaces-allow-cloning-of-new-namespace-fix-check-for-return-value-of-create_pid_namespace.patch
+ pid-namespaces-make-proc_flush_task-actually-from-entries-from-multiple-namespaces.patch
+ pid-namespaces-initialize-the-namespaces-proc_mnt.patch
+ pid-namespaces-create-a-slab-cache-for-struct-pid_namespace.patch
+ pid-namespaces-allow-signalling-container-init.patch
+ pid-namespaces-destroy-pid-namespace-on-inits-death.patch
+ pid-namespaces-changes-to-show-virtual-ids-to-user.patch
+ pid-namespaces-changes-to-show-virtual-ids-to-user-fix-the-return-value-of-sys_set_tid_address.patch
+ pid-namespaces-changes-to-show-virtual-ids-to-user-use-find_task_by_pid_ns-in-places-that-operate-with-virtual.patch
+ pid-namespaces-changes-to-show-virtual-ids-to-user-use-find_task_by_pid_ns-in-places-that-operate-with-virtual-fix.patch
+ pid-namespaces-changes-to-show-virtual-ids-to-user-use-find_task_by_pid_ns-in-places-that-operate-with-virtual-fix-2.patch
+ pid-namespaces-changes-to-show-virtual-ids-to-user-use-find_task_by_pid_ns-in-places-that-operate-with-virtual-fix-3.patch
+ pid-namespaces-changes-to-show-virtual-ids-to-user-sys_getsid-sys_getpgid-return-wrong-id-for-task-from-another.patch
+ pid-namespaces-changes-to-show-virtual-ids-to-user-fix-the-sys_setpgrp-to-work-between-namespaces.patch
+ pid-namespaces-changes-to-show-virtual-ids-to-user-fix.patch
+ pid-namespaces-remove-the-struct-pid-unneeded-fields.patch
+ isolate-some-explicit-usage-of-task-tgid.patch
+ isolate-some-explicit-usage-of-task-tgid-fix.patch
+ isolate-some-explicit-usage-of-task-tgid-fix-fix.patch
I got a less than *6k* difference in .text. Here are the detailed size
results.
size without:
text data bss dec hex filename
5067718 892082 725544 6685344 6602a0 vmlinux
size with:
text data bss dec hex filename
5073314 892210 725544 6691068 6618fc vmlinux
section without with
.text 3383305 3388569
__ex_table 16704 16704
__bug_table 41136 41208
.rodata 1276129 1276469
.pci_fixup 4032 4032
__ksymtab 43984 43984
__ksymtab_gpl 13024 13056
__ksymtab_gpl_future 48 48
__ksymtab_strings 83388 83420
__param 7800 7800
.data 482928 483056
.data.cacheline_aligned 187008 187008
.data.read_mostly 41600 41600
.vsyscall_0 227 227
.vsyscall_fn 54 54
.vsyscall_gtod_data 80 80
.vsyscall_1 52 52
.vsyscall_2 91 91
.vgetcpu_mode 4 4
.jiffies 8 8
.vsyscall_3 8 8
.data.init_task 8192 8192
.data.page_aligned 4096 4096
.smp_locks 33584 33624
.init.text 153713 153529
.init.data 137714 137714
.init.setup 3168 3168
.initcall.init 2200 2200
.con_initcall.init 16 16
.altinstructions 1243 1243
.altinstr_replacement 244 244
.exit.text 4739 4739
.note 24 24
.vdso 4056 4056
.init.ramfs 133 133
.data.percpu 25064 25064
.data_nosave 4 4
.bss 725544 725544
.comment 44352 44352
Total 6729696 6735420
kernel/pid.o .text increases a lot (more than 1K) but some inlines
would also need to be checked (which I haven't done yet)
Cheers,
C.
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: Re: [PATCH] Hookup group-scheduler with task container infrastructure |
|---|
| Re: [PATCH] Hookup group-scheduler with task container infrastructure [message #20022] |
Mon, 10 September 2007 13:42 |
Jan Engelhardt Messages: 18 Registered: August 2006 |
Junior Member |
From: openvz.org
|
|
On Sep 10 2007 22:58, Srivatsa Vaddagiri wrote:
>On Mon, Sep 10, 2007 at 10:53:34PM +0530, Srivatsa Vaddagiri wrote:
>> > cpuctl, cpuctrl, cpu_controller?
>>
>> *shrug* .. I used "cpuctlr" to mean "CPU Controller". Any other short names
>> would do. From your list, cpuctl or cpuctrl both qualifies IMO!
>>
>> Unless folks have strong objection to it, I prefer "cptctlr", the way it is.
>
>s/cptctlr/cpuctlr !
Captain Controller to the rescue!
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: [PATCH] Leases can be hidden by flocks |
|---|
| [PATCH] Leases can be hidden by flocks [message #16517] |
Mon, 10 September 2007 10:16 |
Pavel Emelianov Messages: 1149 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
The inode->i_flock list contains the leases, flocks and posix
locks in the specified order. However, the flocks are added in
the head of this list thus hiding the leases from F_GETLEASE
command, from time_out_leases() and other code that expects
the leases to come first.
The following example will demonstrate this:
#define _GNU_SOURCE
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/file.h>
/*
 * Ask the kernel which lease is currently reported for fd (F_GETLEASE)
 * and print a one-line description of the answer to stdout.
 */
static void show_lease(int fd)
{
	const char *msg;
	int lease = fcntl(fd, F_GETLEASE);

	if (lease == F_RDLCK)
		msg = "Read lease\n";
	else if (lease == F_WRLCK)
		msg = "Write lease\n";
	else if (lease == F_UNLCK)
		msg = "No leases\n";
	else
		msg = "Some shit\n";	/* any other value, e.g. fcntl() failing */

	printf("%s", msg);
}
/*
 * Reproducer: take a write lease on the file named by argv[1] and print
 * the lease state, then take a shared flock on the same descriptor and
 * print the lease state again.
 *
 * Returns 0 when every step succeeds, 1 when the file argument is missing
 * or when open(), F_SETLEASE or flock() fails.
 */
int main(int argc, char **argv)
{
	int fd, res;

	/* Without a file argument argv[1] is NULL; fail early with a usage hint. */
	if (argc < 2) {
		fprintf(stderr, "Usage: %s <file>\n",
			argc > 0 ? argv[0] : "show_lease");
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd == -1) {
		perror("Can't open file");
		return 1;
	}

	res = fcntl(fd, F_SETLEASE, F_WRLCK);
	if (res == -1) {
		perror("Can't set lease");
		return 1;
	}

	/* Lease state before the flock is taken. */
	show_lease(fd);

	if (flock(fd, LOCK_SH) == -1) {
		perror("Can't flock shared");
		return 1;
	}

	/* Lease state after the flock has been added to the same file. */
	show_lease(fd);
	return 0;
}
The first call to show_lease() will show the write lease set, but
the second will show no leases.
Fix the flock adding so that the leases always stay in the head
of this list.
Found during making the flocks pid-namespaces aware.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
---
diff --git a/fs/locks.c b/fs/locks.c
index 6068f82..0db1a14 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -781,7 +781,7 @@ find_conflict:
if (request->fl_flags & FL_ACCESS)
goto out;
locks_copy_lock(new_fl, request);
- locks_insert_lock(&inode->i_flock, new_fl);
+ locks_insert_lock(before, new_fl);
new_fl = NULL;
error = 0;
|
|
| | Topic: ks and mini-summit documents |
|---|
| ks and mini-summit documents [message #19956] |
Thu, 06 September 2007 10:14 |
Cedric Le Goater Messages: 443 Registered: February 2006 |
Senior Member |
From: openvz.org
|
|
All,
I've gathered the ks and mini-summit documents here :
http://lxc.sourceforge.net/doc/
Cheers,
C.
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: Re: [PATCH] Send quota messages via netlink |
|---|
| Re: [PATCH] Send quota messages via netlink [message #19931] |
Wed, 05 September 2007 10:28 |
serue Messages: 750 Registered: February 2006 |
Senior Member |
From: openvz.org
|
|
Quoting Jan Kara (jack@suse.cz):
> On Tue 04-09-07 18:48:52, Serge E. Hallyn wrote:
> > Quoting Jan Kara (jack@suse.cz):
> > > On Tue 04-09-07 16:32:10, Serge E. Hallyn wrote:
> > > > Quoting Jan Kara (jack@suse.cz):
> > > > > On Thu 30-08-07 17:14:47, Serge E. Hallyn wrote:
> > > > > > Quoting Jan Kara (jack@suse.cz):
> > > > > > > I imagine it so that you have a machine and on it several virtual
> > > > > > > machines which are sharing a filesystem (or it could be a cluster). Now you
> > > > > > > want UIDs to be independent between these virtual machines. That's it,
> > > > > > > right?
> > > > > > > Now to continue the example: Alice has UID 100 on machineA, Bob has
> > > > > > > UID 100 on machineB. These translate to UIDs 1000 and 1001 on the common
> > > > > > > filesystem. Process of Alice writes to a file and Bob becomes to be over
> > > > > > > quota. In this situation, there would be probably two processes (from
> > > > > > > machineA and machineB) listening on the netlink socket. We want to send a
> > > > > > > message so that on Alice's desktop we can show a message: "You caused
> > > > > > > Bob to exceed his quotas" and of Bob's desktop: "Alice has caused that you
> > > > > > > are over quota.".
> > > > > >
> > > > > > Since this is over NFS, you handle it the way you would any other time
> > > > > > that user Alice on some other machine managed to do this.
> > > > > I meant this would actually happen over a local filesystem (imagine
> > > > > something like "hostfs" from UML).
> > > >
> > > > Ok, then that is where I was previously suggesting that we use an api to
> > > > report a uid meaningful in bob's context, where we currently (in the
> > > > absence of meaningful mount uids and uid equivalence) tell Bob that root
> > > > was the one who brought him over quota. From a user pov 'nobody' would
> > > > make more sense, but I don't think we want the kernel to know about user
> > > > nobody, right?
> > > But what is the problem with using the filesystem ids? All virtual
> > > machines in my example should have a notion of those...
> >
> > I don't know what you mean by filesystem ids. Do you mean the uid
> > stored on the fs? I imagine a network fs could get fancy and store
> > something more detailed than the unix uid, based on the user's keys.
> >
> > Do you mean the inode->i_uid? Nothing wrong with that. Then we just
> > assume that either you are in the superblock or mount's user namespace
> > (depending on how we implement it, probably superblock), or can figure
> > out what that is.
> I meant the identity the process uses to access the filesystem (to
> identify the user who caused the limit excess) and also the identity stored
> in the quota file (to identify whose quota was exceeded).
> Anyway, any identity more complicated than just a number needs changes in
> both quota file format and filesystems so at that moment, we can also
> change the netlink interface...
>
> > Sure, and in many ways. But if working with NFS, as far as I know the
> > most common way to solve it is to enforce a common /etc/passwd across
> > all the valid NFS clients :)
> Then one wonders whether user namespaces are really what users want ;).
Absolutely.
You use nfs to share filesystems among separate machines that you want
to have look similar.
You use user namespaces to pretend one machine is a bunch of separate
machines. So if you're just going to split up your machine into 5
vms and then have them all share disk over nfs, you may just want to
keep it as one machine :)
Ideally each vm would have completely separate disk space, so file
access across user namespaces wouldn't happen. More realistically,
file trees will be shared read-only - i.e. /lib, /usr, etc. Some of
that can be handled simply using read-only bind mounts. We'd like
to allow users to create vm's as well, so then we want uid 500 in
the initial user namespace to be uid 0 in a newly created user
namespace.
So what Eric and I are worried about are corner cases and admin
mistakes, not regular function.
(And again I really do think we'll want to tie netlink sockets to a user
namespace, not a network namespace, so there may be no issue at all
so long as proper filesystem access checks are implemented so that every
action on some filesystem is done with credentials valid in that
filesystems' user namespace)
-serge
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: Containers status update |
|---|
| Containers status update [message #19901] |
Mon, 03 September 2007 04:36 |
Pavel Emelianov Messages: 1149 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: containers - bug |
|---|
| containers - bug [message #19874] |
Fri, 31 August 2007 12:29 |
Daniel Lezcano Messages: 417 Registered: June 2006 |
Senior Member |
From: openvz.org
|
|
Hi Paul,
I was playing with the container filesystem (very nice) and I ran
into a kernel bug.
I did the following:
mkdir /dev/container
mount -t container -o cpuset cpuset /dev/container
cd /dev/container/
mkdir Charlie
cd Charlie
echo $$ > tasks
bash
cd ..
rmdir Charlie
exit
ls => bang !
I run a 2.6.23-rc3-mm1 kernel with qemu.
Here is the message:
--------------------
BUG: unable to handle kernel paging request at virtual address 6b6b6c23
printing eip: c016719d *pde = 00000000
Oops: 0000 [#1] SMP
Modules linked in:
Pid: 960, comm: bash Not tainted (2.6.23-rc3-mm1 #522)
EIP: 0060:[<c016719d>] EFLAGS: 00000246 CPU: 0
EIP is at __link_path_walk+0x41/0xb54
EAX: c1873320 EBX: c141c608 ECX: c167be38 EDX: c15de000
ESI: c167bef4 EDI: 6b6b6b6b EBP: c167be3c ESP: c167bde8
DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
Process bash (pid: 960, ti=c167a000 task=c1466b50 task.ti=c167a000)
Stack: c15de000 00000000 00000400 00000001 000000bf 00008a31 7f1c0300
01000415
1a131100 170f1200 00000000 00009600 00009600 00000500 00000003
c03ab570
00000082 c167be38 c141c608 c167bef4 c1873320 c167bea8 c0167cfb
c15de000
Call Trace:
[<c010398d>] show_trace_log_lvl+0x1a/0x2f
[<c0103a3f>] show_stack_log_lvl+0x9d/0xa5
[<c0103aee>] show_registers+0xa7/0x178
[<c0103cc6>] die+0x107/0x227
[<c011447c>] do_page_fault+0x47f/0x567
[<c030800a>] error_code+0x72/0x78
[<c0167cfb>] link_path_walk+0x4b/0xc0
[<c0167d89>] path_walk+0x19/0x1b
[<c0168057>] do_path_lookup+0x179/0x193
[<c0168823>] __user_walk_fd+0x32/0x49
[<c0162a09>] vfs_stat_fd+0x1b/0x41
[<c0162ade>] vfs_stat+0x11/0x13
[<c0162af4>] sys_stat64+0x14/0x28
[<c0102a5a>] syscall_call+0x7/0xb
=======================
Code: ff 45 ac 8b 55 ac 8a 02 3c 2f 74 f4 84 c0 0f 84 99 0a 00 00 8b 06
83 7e 1c 00 8b 78 18 74 08 83 65 b8 04 83 4d b8 01 83 4e 14 04 <8b> 87
b8 00 00 00 0f b7 5f 6e 85 c0 74 0a 83 78 34 00 0f 85 b7
EIP: [<c016719d>] __link_path_walk+0x41/0xb54 SS:ESP 0068:c167bde8
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: [ANNOUNCE] The Linux Test Project has been Released for AUGUST 2007 |
|---|
| [ANNOUNCE] The Linux Test Project has been Released for AUGUST 2007 [message #19862] |
Fri, 31 August 2007 01:42 |
Subrata Modak Messages: 16 Registered: August 2007 |
Junior Member |
From: openvz.org
|
|
*Dear All*,
The Linux Test Project test suite has been released for the month of
AUGUST 2007. The latest version of the test-suite contains 3000+ tests
for the Linux OS and can be found at http://ltp.sourceforge.net/. Latest
happenings in LTP can also be found at:
http://ltp.sourceforge.net/wiki/, and
http://ltp.sourceforge.net/wikiArchives.php.
Our web site also contains other information such as:
- A Linux test tools matrix
- Technical papers
- How To's on Linux testing
- Code coverage analysis tool.
*Release Highlights:*
* Integration of NUMA testcases
* Release of RHEL5 LSPP Certification Test suite
* Release of LTP-KDUMP Test-Case Plan
* Release of Containers Testcases Plan
* Release of GCOV-Kernel & LCOV Packages
* Merging of OPEN_HPI_TESTSUITE-2.9.3 to LTP
* Addition of a new Test Case 'swapon03'
* Many more Bug Fixes and Patches
*Note(s) from the Maintainer:*
You will see a couple of new test-cases in LTP in coming months and also
broken testcases getting fixed slowly. We want to revisit all testcases
and hence the time taken is long. From every release onwards you will
also find results of LTP-RUNALL on different Architectures (ia64,
x86_64, i386, PPC64, s390x) on varied Kernels & Distros. This will
give you more insight into the behaviour of LTP across different platforms.
We encourage the community to post results to ltp-results@lists.sf.net,
and patches, new tests, bugs or comments/questions to ltp-list@lists.sf.net,
http://sourceforge.net/tracker/?func=add&group_id=3382&atid=103382 (for
New Bug(s)),
http://sourceforge.net/tracker/?func=add&group_id=3382&atid=303382 (for
New Patch(s)),
http://sourceforge.net/tracker/?func=add&group_id=3382&atid=353382 (for
New Feature Request(s))
Please also see the ChangeLog Attached (AUGUST 2007):
Happy testing,
Regards--
Subrata Modak,
1) Log Message: "rsalveti@br.ibm.com" fixed write05 that failed with LTP 20070331 on Fedora 7 GA
File(s) Affected:
ltp/testcases/kernel/syscalls/write/write05.c
2) Log Message: "dmarlin@redhat.com" corrected fail message in data_space testcase
File(s) Affected:
ltp/testcases/kernel/mem/vmtests/data_space.c
3) Log Message: "liudeyan@cn.ibm.com" made mmap1 to be terminated by Ctrl-C
File(s) Affected:
ltp/testcases/kernel/mem/mtest06/mmap1.c
4) Log Message: "suzuki@in.ibm.com" fixed times03, where it failed to generate report on user time on RHEL5.1 early build(2.6.18-32.el5)
File(s) Affected: ltp/testcases/kernel/syscalls/times/times03.c
5) Log Message: gcov-kernel: added eabi-compatibility patch, renamed .diff to .patch by "oberpapr@users.sourceforge.net"
File(s) Added:
ltp/utils/analysis/gcov-kernel/linux-2.6.21-gcov-arm-eabi.patch
ltp/utils/analysis/gcov-kernel/linux-2.6.22-gcov-arm-eabi.patch
File(s) Deleted:
ltp/utils/analysis/gcov-kernel/linux-2.6.21-gcov-arm-eabi.diff
6) Log Message: lcov: fixed spec file
File(s) Affected:
ltp/utils/analysis/lcov/rpm/lcov.spec
7) Log Message: gcov-kernel: removed outdated FAQ entry
File(s) Affected:
ltp/utils/analysis/gcov-kernel/FAQ
8) Log Message: gcov-kernel: added Makefile
File(s) Added: ltp/utils/analysis/gcov-kernel/Makefile
9) Log Message: lcov: Makefile for release 1.6
File(s) Affected:
/cvsroot/ltp/utils/analysis/lcov/Makefile
10) Log Message: lcov: Makefile for post-release
File(s) Affected:
/cvsroot/ltp/utils/analysis/lcov/Makefile
11) Log Message: lcov: add experimental option "--norecursion"
File(s) Affected:
/cvsroot/ltp/utils/analysis/lcov/bin/geninfo
/cvsroot/ltp/utils/analysis/lcov/bin/lcov
12) Log Message: Changes to make testcases/kernel/numa/numa01.sh executable
File(s) Affected:
ltp/testcases/kernel/numa/Makefile
13) Log Message: "carmelo.amoroso@st.com" changed the Default values for MAXSIZE and csize
File(s) Affected:
ltp/testcases/kernel/mem/vmtests/stack_space.c
14) Log Message: "brenohl@br.ibm.com" wanted to handle file descriptors properly
File(s) Affected:
ltp/testcases/kernel/syscalls/mkdir/mkdir03.c
ltp/testcases/kernel/syscalls/mmap/mmap09.c
ltp/testcases/kernel/syscalls/open/open07.c
ltp/testcases/kernel/syscalls/sendfile/sendfile03.c
15) Log Message: "rsalvetidev@linux.vnet.ibm.com" says that this version can handle when the distro has MAX_SWAPFILES as 30 or 32
File(s) Modified:
ltp/runtest/ltplite ltp/runtest/stress.part3
ltp/runtest/syscalls
ltp/testcases/kernel/syscalls/swapon/swapon02.c
File(s) Added:
ltp/testcases/kernel/syscalls/swapon/swapon03.c
16) Log Message: Containers Testcases Plan
File(s) Added:
ltp/testcases/kernel/containers/TEST_PLAN.txt
17) Log Message: LTP-KDUMP Test-Case Plan
File(s) Added:
ltp/testcases/kdump/TEST_PLAN.txt
18) Log Message: Update to OpenHPI 2.9.3 (www.openhpi.org for more info)
File(s) Affected:
ltp/testcases/open_hpi_testsuite/
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: Re: [PATCH] Use task_pid_nr() in ip_vs_sync.c |
|---|
| Re: [PATCH] Use task_pid_nr() in ip_vs_sync.c [message #19808] |
Wed, 29 August 2007 17:50 |
Sukadev Bhattiprolu Messages: 413 Registered: August 2006 |
Senior Member |
From: openvz.org
|
|
Pavel Emelianov [xemul@openvz.org] wrote:
| The sync_master_pid and sync_backup_pid are set in set_sync_pid()
| and are used later for set/not-set checks and in printk. So it
| is safe to use the global pid value in this case.
|
| Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
|
| ---
|
| diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
| index 959c08d..d0798a5 100644
| --- a/net/ipv4/ipvs/ip_vs_sync.c
| +++ b/net/ipv4/ipvs/ip_vs_sync.c
| @@ -794,7 +794,7 @@ static int sync_thread(void *startup)
|
| add_wait_queue(&sync_wait, &wait);
|
| - set_sync_pid(state, current->pid);
| + set_sync_pid(state, task_pid_nr(current));
| complete(tinfo->startup);
|
| /*
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: Re: [PATCH] Use same_thread_group() in signalfd.c |
|---|
| Re: [PATCH] Use same_thread_group() in signalfd.c [message #19804] |
Wed, 29 August 2007 17:18 |
Sukadev Bhattiprolu Messages: 413 Registered: August 2006 |
Senior Member |
From: openvz.org
|
|
Pavel Emelianov [xemul@openvz.org] wrote:
| This is a lost hunk of previous patch that isolated the
| explicit usage of task->tgid in some places. The signalfd
| code uses the tsk->tgid comparison.
|
| Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
|
| ---
|
| diff --git a/fs/signalfd.c b/fs/signalfd.c
| index a8e293d..5bfd2c5 100644
| --- a/fs/signalfd.c
| +++ b/fs/signalfd.c
| @@ -64,7 +64,7 @@ static int signalfd_lock(struct signalfd
| return 0;
| }
|
| - if (lk->tsk->tgid == current->tgid)
| + if (same_thread_group(lk->tsk, current->tgid))
| lk->tsk = current;
|
| return 1;
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: Re: [-mm PATCH] Memory controller improve user interface |
|---|
| Re: [-mm PATCH] Memory controller improve user interface [message #19799] |
Wed, 29 August 2007 12:17 |
Paul Menage Messages: 642 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
On 8/29/07, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> >
> > This seems a bit inconsistent - if you write a value to a limit file,
> > then the value that you read back is reduced by a factor of 1024?
> > Having the "(kB)" suffix isn't really a big help to automated
> > middleware.
> >
>
> Why is that? Is it because you could write 4M and see it show up
> as 4096 kilobytes? Well, that can be fixed with another variant
> of the memparse() utility.
I was thinking the other way around - you can write 1048576 (i.e. 1MB)
to the file and read back 1024. It just seems to me that it's clearer
if you write X to the file to get X back.
>
> 64 bit might be an overkill for 32 bit machines. 32 bit machines with
> PAE cannot use 32 bit values, they need 64 bits.
How is using a 64-bit value for consistency overkill?
As someone pointed out, 4TB machines probably aren't that far around
the corner (if they're not here already) so even if you use KB rather
than bytes, userspace needs to be using an int64 for this value in
case it ends up running as a 32-bit-compiled app on a 64-bit kernel
with lots of memory.
Paul
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: Re: [PATCH] Send quota messages via netlink |
|---|
| Re: [PATCH] Send quota messages via netlink [message #19798] |
Wed, 29 August 2007 11:57 |
Randy Dunlap Messages: 25 Registered: April 2007 |
Junior Member |
From: openvz.org
|
|
On Wed, 29 Aug 2007 14:26:47 +0200 Jan Kara wrote:
> On Tue 28-08-07 21:13:35, Andrew Morton wrote:
> > On Tue, 28 Aug 2007 16:13:18 +0200 Jan Kara <jack@suse.cz> wrote:
> >
> > > Hello,
> > >
> > > I'm sending rediffed patch implementing sending of quota messages via netlink
> > > interface (some rationale in patch description). I've already posted it to
> > > LKML some time ago and there were no objections, so I guess it's fine to put
> > > it to -mm. Andrew, would you be so kind? Thanks.
> > > Userspace daemon reading the messages from the kernel and sending them to
> > > dbus and/or user console is also written (it's part of quota-tools). The
> > > only remaining problem is there are a few changes needed to libnl needed for
> > > the userspace daemon. They were basically acked by the maintainer but it
> > > seems he has not merged the patches yet. So this will take a bit more time.
> > >
> >
> > So it's a new kernel->userspace interface.
> >
> > But we have no description of the interface :(
> Oops, forgotten about it. I'll write one. Do we have some standard place
> where to document such interfaces? I could create some file in
> Documentation/filesystems/ but that seems a bit superfluous...
It looks like other quota documentation is in Documentation/filesystems/,
and that seems reasonable to me for the other quota docs & this one.
---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: Re: [PATCH] Switch nfs/callback.c to using struct pid, not pid_t |
|---|
| Re: [PATCH] Switch nfs/callback.c to using struct pid, not pid_t [message #19809] |
Wed, 29 August 2007 09:52 |
Christoph Hellwig Messages: 59 Registered: April 2006 |
Member |
From: openvz.org
|
|
On Wed, Aug 29, 2007 at 05:36:24PM +0400, Pavel Emelyanov wrote:
> Pid namespaces make it dangerous to use pid and tgid values
> when run in some namespace. The struct pid itself is going
> to be the only way for working with task pids, so make the
> nfs callback thread use it.
>
> Since nfs_callback_info.pid is set to current's one and reset
> on the thread exit, it is safe not to get the struct pid.
>
> Since this pid is used later under lock_kernel() w/o sleeping
> operations, checking for it to be not NULL and killing the
> thread with kill_pid() is safe.
NACK. This just makes the code even more obscure. Please get rid
of the pid references entirely and convert the code to the kthread
API.
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: [PATCH] Switch nfs/callback.c to using struct pid, not pid_t |
|---|
| [PATCH] Switch nfs/callback.c to using struct pid, not pid_t [message #19824] |
Wed, 29 August 2007 09:36 |
Pavel Emelianov Messages: 1149 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
Pid namespaces make it dangerous to use pid and tgid values
when run in some namespace. The struct pid itself is going
to be the only way for working with task pids, so make the
nfs callback thread use it.
Since nfs_callback_info.pid is set to current's one and reset
on the thread exit, it is safe not to get the struct pid.
Since this pid is used later under lock_kernel() w/o sleeping
operations, checking for it to be not NULL and killing the
thread with kill_pid() is safe.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
---
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index a796be5..5b8e5fc 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -27,7 +27,7 @@
struct nfs_callback_data {
unsigned int users;
struct svc_serv *serv;
- pid_t pid;
+ struct pid *pid;
struct completion started;
struct completion stopped;
};
@@ -64,7 +64,7 @@ static void nfs_callback_svc(struct svc_
__module_get(THIS_MODULE);
lock_kernel();
- nfs_callback_info.pid = current->pid;
+ nfs_callback_info.pid = task_pid(current);
daemonize("nfsv4-svc");
/* Process request with signals blocked, but allow SIGKILL. */
allow_signal(SIGKILL);
@@ -98,7 +98,7 @@ static void nfs_callback_svc(struct svc_
}
svc_exit_thread(rqstp);
- nfs_callback_info.pid = 0;
+ nfs_callback_info.pid = NULL;
complete(&nfs_callback_info.stopped);
unlock_kernel();
module_put_and_exit(0);
@@ -114,7 +114,7 @@ int nfs_callback_up(void)
lock_kernel();
mutex_lock(&nfs_callback_mutex);
- if (nfs_callback_info.users++ || nfs_callback_info.pid != 0)
+ if (nfs_callback_info.users++ || nfs_callback_info.pid != NULL)
goto out;
init_completion(&nfs_callback_info.started);
init_completion(&nfs_callback_info.stopped);
@@ -157,9 +157,9 @@ void nfs_callback_down(void)
mutex_lock(&nfs_callback_mutex);
nfs_callback_info.users--;
do {
- if (nfs_callback_info.users != 0 || nfs_callback_info.pid == 0)
+ if (nfs_callback_info.users != 0 || nfs_callback_info.pid == NULL)
break;
- if (kill_proc(nfs_callback_info.pid, SIGKILL, 1) < 0)
+ if (kill_pid(nfs_callback_info.pid, SIGKILL, 1) < 0)
break;
} while (wait_for_completion_timeout(&nfs_callback_info.stopped, 5*HZ) == 0);
mutex_unlock(&nfs_callback_mutex);
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: [PATCH] Use task_pid_nr() in ip_vs_sync.c |
|---|
| [PATCH] Use task_pid_nr() in ip_vs_sync.c [message #19823] |
Wed, 29 August 2007 09:30 |
Pavel Emelianov Messages: 1149 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
The sync_master_pid and sync_backup_pid are set in set_sync_pid()
and are used later for set/not-set checks and in printk. So it
is safe to use the global pid value in this case.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
---
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 959c08d..d0798a5 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -794,7 +794,7 @@ static int sync_thread(void *startup)
add_wait_queue(&sync_wait, &wait);
- set_sync_pid(state, current->pid);
+ set_sync_pid(state, task_pid_nr(current));
complete(tinfo->startup);
/*
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: [PATCH] Remove write-only variable from pktgen_thread |
|---|
| [PATCH] Remove write-only variable from pktgen_thread [message #19822] |
Wed, 29 August 2007 09:22 |
Pavel Emelianov Messages: 1149 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
The pktgen_thread.pid is set to current->pid and is never used
after this. So remove it altogether.
Found during isolating the explicit pid/tgid usage.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
---
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 3a3154e..93695c2 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -380,7 +380,6 @@ struct pktgen_thread {
/* Field for thread to receive "posted" events terminate, stop ifs etc. */
u32 control;
- int pid;
int cpu;
wait_queue_head_t queue;
@@ -3462,8 +3461,6 @@ static int pktgen_thread_worker(void *ar
init_waitqueue_head(&t->queue);
- t->pid = current->pid;
-
pr_debug("pktgen: starting pktgen/%d: pid=%d\n", cpu, task_pid_nr(current));
max_before_softirq = t->max_before_softirq;
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
| | Topic: [PATCH] Use same_thread_group() in signalfd.c |
|---|
| [PATCH] Use same_thread_group() in signalfd.c [message #19821] |
Wed, 29 August 2007 09:19 |
Pavel Emelianov Messages: 1149 Registered: September 2006 |
Senior Member |
From: openvz.org
|
|
This is a lost hunk of previous patch that isolated the
explicit usage of task->tgid in some places. The signalfd
code uses the tsk->tgid comparison.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
---
diff --git a/fs/signalfd.c b/fs/signalfd.c
index a8e293d..5bfd2c5 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -64,7 +64,7 @@ static int signalfd_lock(struct signalfd
return 0;
}
- if (lk->tsk->tgid == current->tgid)
+ if (same_thread_group(lk->tsk, current->tgid))
lk->tsk = current;
return 1;
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
|
Pages (31): [ 16 ]
Current Time: Fri May 24 13:54:32 EDT 2013
|