OpenVZ Forum


Home » Mailing lists » Devel » [RFC PATCH 1/2] capabilities: define CONFIG_COMMONCAP
[PATCH RFC] cgroups: implement device whitelist cgroup+lsm [message #23366 is a reply to message #23364] Thu, 15 November 2007 23:23 Go to previous message
serue is currently offline  serue
Messages: 750
Registered: February 2006
Senior Member
(This patch is based against the same CONFIG_COMMONCAP patch as
the capability bounding set patch I just sent out)

>From fcbd0bd0a8ee1e37a68f5381b47ec0746cb9b1cc Mon Sep 17 00:00:00 2001
From: Serge E. Hallyn <serue@us.ibm.com>
Date: Thu, 11 Oct 2007 15:27:48 -0400
Subject: [PATCH 2/2] cgroups: implement device whitelist cgroup+lsm

Implement a cgroup using the LSM interface to enforce open and mknod
on device files.  Not a line of this code is expected to be used in a
final version; this is just a proof of concept to explore whether we
can or should use an LSM for this purpose until device namespaces are
really needed.  The alternative is to simply set up a static /dev for
each container and remove CAP_MKNOD from the container's bounding
set.  Several people feel that that approach is insufficient.

This patch implements a simple device access whitelist.  A whitelist entry
has 4 fields.  'type' is a (all), c (char), or b (block).  'all' means it
applies to all types, all major numbers, and all minor numbers.  Major and
minor are obvious.  Access is a composition of r (read), w (write), and
m (mknod).

The root devcgroup starts with rwm to 'all'.  A child devcg gets a copy
of the parent.  Admins can then add and remove devices to the whitelist.
Once CAP_HOST_ADMIN is introduced, it will be required to add entries or
to remove entries from another cgroup, though CAP_SYS_ADMIN alone will
suffice to remove entries from your own cgroup.

An entry is added by doing "echo <type> <maj> <min> <access> > devcg.allow",
for instance:

	echo b 7 0 mrw > /cgroups/1/devcg.allow

An entry is removed by doing likewise into devcg.deny.  Since this is a
pure whitelist, not acls, you can only remove entries which exist in the
whitelist.  You must explicitly

	echo a 0 0 mrw > /cgroups/1/devcg.deny

to remove the "allow all" entry which is automatically inherited from
the root cgroup.

While composing this with the ns_cgroup may seem logical, it may not
be the right thing to do.  Note that each newly created devcg gets
a copy of the parent whitelist.  So if you had done

	mount -t cgroup -o ns,devcg none /cgroups

then once a process in /cgroups/1 had done an unshare(CLONE_NEWNS)
it would be under /cgroups/1/node_<pid>.  If an admin then did

	echo b 7 0 m > /cgroups/1/devcg.deny

then the entry would still be in the whitelist for /cgroups/1/node_<pid>.
Something to discuss if we get that far before nixing this whole idea.

The devcg module calls all the capability security hooks, so
it does not need to (cannot) be stacked with capability.ko.
The security hooks are defined in a separate file from the
cgroup code so that the security/Makefile can force its hooks
to be loaded after the selinux hooks.  Otherwise selinux would
refuse to load if CONFIG_CGROUP_DEV=y.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
---
 include/linux/cgroup_subsys.h |    6 +
 init/Kconfig                  |    7 +
 kernel/Makefile               |    1 +
 kernel/dev_cgroup.c           |  410 +++++++++++++++++++++++++++++++++++++++++
 security/Kconfig              |    4 +-
 security/Makefile             |    1 +
 6 files changed, 427 insertions(+), 2 deletions(-)
 create mode 100644 kernel/dev_cgroup.c

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index d3ec2ed..9e2f5f7 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -36,3 +36,9 @@ SUBSYS(mem_cgroup)
 #endif
 
 /* */
+
+#ifdef CONFIG_CGROUP_DEV
+SUBSYS(devcg)
+#endif
+
+/* */
diff --git a/init/Kconfig b/init/Kconfig
index 96fba82..2907248 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -324,6 +324,13 @@ config CPUSETS
 
 	  Say N if unsure.
 
+config CGROUP_DEV
+	bool "Device controller for cgroups"
+	depends on CGROUPS && SECURITY && EXPERIMENTAL
+	help
+	  Provides a cgroup implementing whitelists for devices which
+	  a process in the cgroup can mknod or open.
+
 config FAIR_GROUP_SCHED
 	bool "Fair group CPU scheduler"
 	default y
diff --git a/kernel/Makefile b/kernel/Makefile
index 876dbcd..1da0b66 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
+obj-$(CONFIG_CGROUP_DEV) += dev_cgroup.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
diff --git a/kernel/dev_cgroup.c b/kernel/dev_cgroup.c
new file mode 100644
index 0000000..365877d
--- /dev/null
+++ b/kernel/dev_cgroup.c
@@ -0,0 +1,410 @@
+/*
+ * dev_cgroup.c - device cgroup subsystem
+ *
+ * Copyright 2007 IBM Corp
+ */
+
+#include <linux/devcg.h>
+
+/*
+ * Once 64-bit caps and CAP_HOST_ADMIN exist, we will be
+ * requiring (CAP_HOST_ADMIN|CAP_MKNOD) to create a device
+ * not in the whitelist, and (CAP_HOST_ADMIN|CAP_SYS_ADMIN)
+ * to edit the whitelist.
+ */
+
+/*
+ * devcg_can_attach - decide whether @task may be attached to @new_cgroup
+ *
+ * Returns -EPERM when any of the following holds, 0 otherwise:
+ *   - @task is not the caller and cgroup_is_descendant() rejects
+ *     @new_cgroup;
+ *   - @new_cgroup->count is non-zero;
+ *   - @task already has a devcg cgroup and it is not the parent of
+ *     @new_cgroup.
+ */
+static int devcg_can_attach(struct cgroup_subsys *ss,
+		struct cgroup *new_cgroup, struct task_struct *task)
+{
+	struct cgroup *cur_cg;
+
+	/* Moving somebody else: the target must pass the descendant check. */
+	if (task != current && !cgroup_is_descendant(new_cgroup))
+		return -EPERM;
+
+	if (atomic_read(&new_cgroup->count))
+		return -EPERM;
+
+	cur_cg = task_cgroup(task, devcg_subsys_id);
+	if (cur_cg != NULL && new_cgroup->parent != cur_cg)
+		return -EPERM;
+
+	return 0;
+}
+
+/*
+ * dev_whitelist_copy - append a copy of every entry on @orig to @dest
+ *
+ * Returns 0 on success.  If an allocation fails, every entry on @dest is
+ * unlinked and freed and -ENOMEM is returned.
+ *
+ * called under cgroup_lock()
+ */
+int dev_whitelist_copy(struct list_head *dest, struct list_head *orig)
+{
+	struct dev_whitelist_item *src, *copy, *next;
+
+	list_for_each_entry(src, orig, list) {
+		copy = kmalloc(sizeof(*src), GFP_KERNEL);
+		if (copy == NULL)
+			goto undo;
+		copy->type = src->type;
+		copy->major = src->major;
+		copy->minor = src->minor;
+		copy->access = src->access;
+		list_add_tail(&copy->list, dest);
+	}
+	return 0;
+
+undo:
+	/* Leave @dest empty rather than holding a partial copy. */
+	list_for_each_entry_safe(copy, next, dest, list) {
+		list_del(&copy->list);
+		kfree(copy);
+	}
+	return -ENOMEM;
+}
+
+/* Stupid prototype - don't bother combining existing entries */
+/*
+ * dev_whitelist_add - append @wh to the tail of @dev_cgroup's whitelist
+ *
+ * @wh is linked into the list as-is: no copy is made and no check is
+ * done for an existing entry describing the same device.
+ *
+ * called under cgroup_lock()
+ * since the list is visible to other tasks, we need the spinlock also
+ */
+void dev_whitelist_add(struct dev_cgroup *dev_cgroup,
+			struct dev_whitelist_item *wh)
+{
+	spin_lock(&dev_cgroup->lock);
+	list_add_tail(&wh->list, &dev_cgroup->whitelist);
+	spin_unlock(&dev_cgroup->lock);
+}
+
+/*
+ * dev_whitelist_rm - revoke the access bits in @wh from matching entries
+ *
+ * An entry matches when it is an 'all' entry, or when its type, major and
+ * minor all equal those of @wh.  The bits set in @wh->access are cleared
+ * from every matching entry; an entry left with no access bits is
+ * unlinked and freed.  @wh itself is only read.
+ *
+ * An 'all' entry only loses the requested bits instead of being dropped
+ * outright, so a narrow deny such as "b 7 0 m" does not silently discard
+ * the remaining access it grants.  It goes away once all of its bits have
+ * been denied, e.g. with "a 0 0 mrw".
+ *
+ * called under cgroup_lock()
+ * since the list is visible to other tasks, we need the spinlock also
+ */
+void dev_whitelist_rm(struct dev_cgroup *dev_cgroup,
+			struct dev_whitelist_item *wh)
+{
+	struct dev_whitelist_item *walk, *tmp;
+
+	spin_lock(&dev_cgroup->lock);
+	list_for_each_entry_safe(walk, tmp, &dev_cgroup->whitelist, list) {
+		/* 'all' entries match any request; others must match exactly */
+		if (!(walk->type & DEV_ALL)) {
+			if (walk->type != wh->type)
+				continue;
+			if (walk->major != wh->major ||
+			    walk->minor != wh->minor)
+				continue;
+		}
+		walk->access &= ~wh->access;
+		if (!walk->access) {
+			list_del(&walk->list);
+			kfree(walk);
+		}
+	}
+	spin_unlock(&dev_cgroup->lock);
+}
+
+/*
+ * devcg_create - allocate the devcg state for a new cgroup
+ *
+ * Rules: you can only create a cgroup if
+ *     1. you are capable(CAP_SYS_ADMIN)
+ *     2. the target cgroup is a descendant of your own cgroup
+ *
+ * A cgroup without a parent starts with a single 'all' entry granting
+ * mknod, read and write.  Any other cgroup starts with a copy of its
+ * parent's whitelist.
+ *
+ * Returns the new cgroup_subsys_state, or ERR_PTR(-EPERM) if the rules
+ * above are not met, or ERR_PTR(-ENOMEM) if an allocation fails.
+ *
+ * Note: called from kernel/cgroup.c with cgroup_lock() held.
+ */
+static struct cgroup_subsys_state *devcg_create(struct cgroup_subsys *ss,
+						struct cgroup *cgroup)
+{
+	struct dev_cgroup *dev_cgroup, *parent_dev_cgroup;
+	struct cgroup *parent_cgroup;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+	if (!cgroup_is_descendant(cgroup))
+		return ERR_PTR(-EPERM);
+
+	dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL);
+	if (!dev_cgroup)
+		return ERR_PTR(-ENOMEM);
+	INIT_LIST_HEAD(&dev_cgroup->whitelist);
+	parent_cgroup = cgroup->parent;
+
+	if (parent_cgroup == NULL) {
+		struct dev_whitelist_item *wh;
+
+		wh = kmalloc(sizeof(*wh), GFP_KERNEL);
+		/* Do not dereference a failed allocation; undo and bail out. */
+		if (!wh) {
+			kfree(dev_cgroup);
+			return ERR_PTR(-ENOMEM);
+		}
+		wh->minor = wh->major = 0;
+		wh->type = DEV_ALL;
+		wh->access = ACC_MKNOD | ACC_READ | ACC_WRITE;
+		list_add(&wh->list, &dev_cgroup->whitelist);
+	} else {
+		parent_dev_cgroup = cgroup_to_devcg(parent_cgroup);
+		ret = dev_whitelist_copy(&dev_cgroup->whitelist,
+				&parent_dev_cgroup->whitelist);
+		if (ret) {
+			/* dev_whitelist_copy() already freed partial copies */
+			kfree(dev_cgroup);
+			return ERR_PTR(ret);
+		}
+	}
+
+	spin_lock_init(&dev_cgroup->lock);
+	return &dev_cgroup->css;
+}
+
+/*
+ * devcg_destroy - release the devcg state attached to @cgroup
+ *
+ * Unlinks and frees every whitelist entry, then frees the dev_cgroup
+ * itself.
+ */
+static void devcg_destroy(struct cgroup_subsys *ss,
+			struct cgroup *cgroup)
+{
+	struct dev_cgroup *devcg = cgroup_to_devcg(cgroup);
+	struct dev_whitelist_item *item, *next;
+
+	list_for_each_entry_safe(item, next, &devcg->whitelist, list) {
+		list_del(&item->list);
+		kfree(item);
+	}
+	kfree(devcg);
+}
+
+#define DEVCG_ALLOW 1
+#define DEVCG_DENY 2
+
+/*
+ * set_access - render an access mask as a string of 'r', 'w' and 'm'
+ *
+ * The first 4 bytes of @acc are zeroed before at most three letters are
+ * written, so the result is always NUL-terminated.  The caller must
+ * supply a buffer of at least 4 bytes.
+ */
+void set_access(char *acc, short access)
+{
+	char *out = acc;
+
+	memset(acc, 0, 4);
+	if (access & ACC_READ)
+		*out++ = 'r';
+	if (access & ACC_WRITE)
+		*out++ = 'w';
+	if (access & ACC_MKNOD)
+		*out++ = 'm';
+}
+
+/*
+ * type_to_char - map a whitelist entry type to its one-letter name
+ *
+ * Returns 'a', 'c' or 'b' for DEV_ALL, DEV_CHAR and DEV_BLOCK, and 'X'
+ * for any other value.
+ */
+char type_to_char(short type)
+{
+	char c = 'X';
+
+	if (type == DEV_ALL)
+		c = 'a';
+	else if (type == DEV_CHAR)
+		c = 'c';
+	else if (type == DEV_BLOCK)
+		c = 'b';
+	return c;
+}
+
+char *print_whitelist(struct dev_cgroup *devcgroup, int *len)
+{
+	char *buf, *s, acc[4];
+	struct dev_whitelist_item *wh;
+	int ret;
+	int count = 0;
+
+	buf = kmalloc(4096, GFP_KERNEL);
+	if (!buf)
+		return ERR_PTR(-ENOMEM);
+	s = buf;
+	*s = '\0';
+	*len = 0;
+
+	spin_lock(&devcgroup->lock);
+	list_for_each_entry(wh, &devcgroup->whitelist, list) {
+		set_access(acc, wh->access);
+		printk(KERN_NOTICE
+			"%s (count%d): whtype %hd maj %u min %u acc %hd\n",
+			__FUNCTION__, count, wh->type, wh-&g
...

 
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Previous Topic: netns refcounting
Next Topic: cleanup in workq and dst_destroy
Goto Forum:
  


Current Time: Thu Oct 09 21:53:36 GMT 2025

Total time taken to generate the page: 0.09923 seconds