From: Paul Menage <menage@google.com>
Add the following files to the cgroup filesystem:
notify_on_release - configures/reports whether the cgroup subsystem should
attempt to run a release script when this cgroup becomes unused
release_agent - configures/reports the release agent to be used for this
hierarchy (top level in each hierarchy only)
releasable - reports whether this cgroup would have been auto-released if
notify_on_release was true and a release agent was configured (mainly useful
for debugging)
To avoid locking issues, invoking the userspace release agent is done via a
workqueue task; cgroups that need to have their release agents invoked by
the workqueue task are linked on to a list.
Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/cgroup.h | 11
kernel/cgroup.c | 425 ++++++++++++++++++++++++++++++++----
2 files changed, 393 insertions(+), 43 deletions(-)
diff -puN include/linux/cgroup.h~task-cgroupsv11-automatic-userspace-notification-of-idle-cgroups include/linux/cgroup.h
--- a/include/linux/cgroup.h~task-cgroupsv11-automatic-userspace-notification-of-idle-cgroups
+++ a/include/linux/cgroup.h
@@ -77,10 +77,11 @@ static inline void css_get(struct contai
* css_get()
*/
+extern void __css_put(struct cgroup_subsys_state *css);
static inline void css_put(struct cgroup_subsys_state *css)
{
if (!test_bit(CSS_ROOT, &css->flags))
- atomic_dec(&css->refcnt);
+ __css_put(css);
}
struct cgroup {
@@ -112,6 +113,13 @@ struct cgroup {
* tasks in this cgroup. Protected by css_set_lock
*/
struct list_head css_sets;
+
+ /*
+ * Linked list running through all cgroups that can
+ * potentially be reaped by the release agent. Protected by
+ * release_list_lock
+ */
+ struct list_head release_list;
};
/* A css_set is a structure holding pointers to a set of
@@ -285,7 +293,6 @@ struct task_struct *cgroup_iter_next(
struct cgroup_iter *it);
void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);
-
#else /* !CONFIG_CGROUPS */
static inline int cgroup_init_early(void) { return 0; }
diff -puN kernel/cgroup.c~task-cgroupsv11-automatic-userspace-notification-of-idle-cgroups kernel/cgroup.c
--- a/kernel/cgroup.c~task-cgroupsv11-automatic-userspace-notification-of-idle-cgroups
+++ a/kernel/cgroup.c
@@ -44,6 +44,8 @@
#include <linux/sort.h>
#include <asm/atomic.h>
+static DEFINE_MUTEX(cgroup_mutex);
+
/* Generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) &_x ## _subsys,
@@ -82,6 +84,13 @@ struct cgroupfs_root {
/* Hierarchy-specific flags */
unsigned long flags;
+
+ /* The path to use for release notifications. No locking
+ * between setting and use - so if userspace updates this
+ * while child cgroups exist, you could miss a
+ * notification. We ensure that it's always a valid
+ * NUL-terminated string */
+ char release_agent_path[PATH_MAX];
};
@@ -109,7 +118,13 @@ static int need_forkexit_callback;
/* bits in struct cgroup flags field */
enum {
+ /* Control Group is dead */
CONT_REMOVED,
+ /* Control Group has previously had a child cgroup or a task,
+ * but no longer (only if CONT_NOTIFY_ON_RELEASE is set) */
+ CONT_RELEASABLE,
+ /* Control Group requires release notifications to userspace */
+ CONT_NOTIFY_ON_RELEASE,
};
/* convenient tests for these bits */
@@ -123,6 +138,19 @@ enum {
ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
};
+inline int cgroup_is_releasable(const struct cgroup *cont)
+{
+ const int bits =
+ (1 << CONT_RELEASABLE) |
+ (1 << CONT_NOTIFY_ON_RELEASE);
+ return (cont->flags & bits) == bits;
+}
+
+inline int notify_on_release(const struct cgroup *cont)
+{
+ return test_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
+}
+
/*
* for_each_subsys() allows you to iterate on each subsystem attached to
* an active hierarchy
@@ -134,6 +162,14 @@ list_for_each_entry(_ss, &_root->subsys_
#define for_each_root(_root) \
list_for_each_entry(_root, &roots, root_list)
+/* the list of cgroups eligible for automatic release. Protected by
+ * release_list_lock */
+static LIST_HEAD(release_list);
+static DEFINE_SPINLOCK(release_list_lock);
+static void cgroup_release_agent(struct work_struct *work);
+static DECLARE_WORK(release_agent_work, cgroup_release_agent);
+static void check_for_release(struct cgroup *cont);
+
/* Link structure for associating css_set objects with cgroups */
struct cg_cgroup_link {
/*
@@ -188,11 +224,8 @@ static int use_task_css_set_links;
/*
* unlink a css_set from the list and free it
*/
-static void release_css_set(struct kref *k)
+static void unlink_css_set(struct css_set *cg)
{
- struct css_set *cg = container_of(k, struct css_set, ref);
- int i;
-
write_lock(&css_set_lock);
list_del(&cg->list);
css_set_count--;
@@ -205,11 +238,39 @@ static void release_css_set(struct kre
kfree(link);
}
write_unlock(&css_set_lock);
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
- atomic_dec(&cg->subsys[i]->cgroup->count);
+}
+
+static void __release_css_set(struct kref *k, int taskexit)
+{
+ int i;
+ struct css_set *cg = container_of(k, struct css_set, ref);
+
+ unlink_css_set(cg);
+
+ rcu_read_lock();
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup *cont = cg->subsys[i]->cgroup;
+ if (atomic_dec_and_test(&cont->count) &&
+ notify_on_release(cont)) {
+ if (taskexit)
+ set_bit(CONT_RELEASABLE, &cont->flags);
+ check_for_release(cont);
+ }
+ }
+ rcu_read_unlock();
kfree(cg);
}
+static void release_css_set(struct kref *k)
+{
+ __release_css_set(k, 0);
+}
+
+static void release_css_set_taskexit(struct kref *k)
+{
+ __release_css_set(k, 1);
+}
+
/*
* refcounted get/put for css_set objects
*/
@@ -223,6 +284,11 @@ static inline void put_css_set(struct
kref_put(&cg->ref, release_css_set);
}
+static inline void put_css_set_taskexit(struct css_set *cg)
+{
+ kref_put(&cg->ref, release_css_set_taskexit);
+}
+
/*
* find_existing_css_set() is a helper for
* find_css_set(), and checks to see whether an existing
@@ -464,8 +530,6 @@ static struct css_set *find_css_set(
* update of a tasks cgroup pointer by attach_task()
*/
-static DEFINE_MUTEX(cgroup_mutex);
-
/**
* cgroup_lock - lock out any changes to cgroup structures
*
@@ -524,6 +588,13 @@ static void cgroup_diput(struct dentr
if (S_ISDIR(inode->i_mode)) {
struct cgroup *cont = dentry->d_fsdata;
BUG_ON(!(cgroup_is_removed(cont)));
+ /* It's possible for external users to be holding css
+ * reference counts on a cgroup; css_put() needs to
+ * be able to access the cgroup after decrementing
+ * the reference count in order to know if it needs to
+ * queue the cgroup to be handled by the release
+ * agent */
+ synchronize_rcu();
kfree(cont);
}
iput(inode);
@@ -668,6 +739,8 @@ static int cgroup_show_options(struct
seq_printf(seq, ",%s", ss->name);
if (test_bit(ROOT_NOPREFIX, &root->flags))
seq_puts(seq, ",noprefix");
+ if (strlen(root->release_agent_path))
+ seq_printf(seq, ",release_agent=%s", root->release_agent_path);
mutex_unlock(&cgroup_mutex);
return 0;
}
@@ -675,6 +748,7 @@ static int cgroup_show_options(struct
struct cgroup_sb_opts {
unsigned long subsys_bits;
unsigned long flags;
+ char *release_agent;
};
/* Convert a hierarchy specifier into a bitmask of subsystems and
@@ -686,6 +760,7 @@ static int parse_cgroupfs_options(cha
opts->subsys_bits = 0;
opts->flags = 0;
+ opts->release_agent = NULL;
while ((token = strsep(&o, ",")) != NULL) {
if (!*token)
@@ -694,6 +769,15 @@ static int parse_cgroupfs_options(cha
opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1;
} else if (!strcmp(token, "noprefix")) {
set_bit(ROOT_NOPREFIX, &opts->flags);
+ } else if (!strncmp(token, "release_agent=", 14)) {
+ /* Specifying two release agents is forbidden */
+ if (opts->release_agent)
+ return -EINVAL;
+ opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
+ if (!opts->release_agent)
+ return -ENOMEM;
+ strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
+ opts->release_agent[PATH_MAX - 1] = 0;
} else {
struct cgroup_subsys *ss;
int i;
@@ -743,7 +827,11 @@ static int cgroup_remount(struct supe
if (!ret)
cgroup_populate_dir(cont);
+ if (opts.release_agent)
+ strcpy(root->release_agent_path, opts.release_agent);
out_unlock:
+ if (opts.release_agent)
+ kfree(opts.release_agent);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cont->dentry->d_inode->i_mutex);
return ret;
@@ -767,6 +855,7 @@ static void init_cgroup_root(struct c
INIT_LIST_HEAD(&cont->sibling);
INIT_LIST_HEAD(&cont->children);
INIT_LIST_HEAD(&cont->css_sets);
+ INIT_LIST_HEAD(&cont->release_list);
}
static int cgroup_test_super(struct super_block *sb, void *data)
@@ -841,8 +930,11 @@ static int cgroup_get_sb(struct file_
/* First find the desired set of subsystems */
ret = parse_cgroupfs_options(data, &opts);
- if (ret)
+ if (ret) {
+ if (opts.release_agent)
+ kfree(opts.release_agent);
return ret;
+ }
root = kzalloc(sizeof(*root), GFP_KERNEL);
if (!root)
@@ -851,6 +943,10 @@ static int cgroup_get_sb(struct file_
init_cgroup_root(roo
...