Serge E. Hallyn wrote:
> Quoting Pavel Emelyanov (xemul@openvz.org):
>> Each new group will have its own maps for char and block
>> layers. The devices access list is tuned via the
>> devices.permissions file. One may read from the file to get
>> the configured state.
>>
>> The top container isn't initialized, so that the char
>> and block layers will use the global maps to lookup
>> their devices. I did that not to export the static maps
>> to the outer world.
>>
>> Good news is that this patch now contains more comments
>> and Documentation file :)
>
> Seems to work as advertised :) I can't reproduce Suka's null/zero
> bug.
>
> You're relying fully on uid-0 to stop writes into the
> devices.permissions files. Would you mind adding a check for
> CAP_SYS_ADMIN (or CAP_NS_OVERRIDE+CAP_MKNOD)? Or were you really
> counting on using the filesystem visibility cgroup to stop a container
> from making changes to its device access whitelist?
Yup. I strongly believe that a controller itself should not bring
any security policy of its own, but the infrastructure should
take care of this.
However, I'm open to change my mind if I see good explanation of
why it is wrong.
> thanks,
> -serge
>
>> Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
>>
>> ---
>>
>> diff --git a/Documentation/controllers/devices.txt b/Documentation/controllers/devices.txt
>> new file mode 100644
>> index 0000000..dbd0c7a
>> --- /dev/null
>> +++ b/Documentation/controllers/devices.txt
>> @@ -0,0 +1,61 @@
>> +
>> + Devices visibility controller
>> +
>> +This controller allows to tune the devices accessibility by tasks,
>> +i.e. grant full access for /dev/null, /dev/zero etc, grant read-only
>> +access to IDE devices and completely hide SCSI disks.
>> +
>> +Tasks still can call mknod to create device files, regardless of
>> +whether the particular device is visible or accessible, but they
>> +may not be able to open it later.
>> +
>> +This one hides under CONFIG_CGROUP_DEVS option.
>> +
>> +
>> +Configuring
>> +
>> +The controller provides a single file to configure itself -- the
>> +devices.permissions one. To change the accessibility level for some
>> +device write the following string into it:
>> +
>> +[cb] <major>:(<minor>|*) [r-][w-]
>> + ^ ^ ^
>> + | | |
>> + | | +--- access rights (1)
>> + | |
>> + | +-- device major and minor numbers (2)
>> + |
>> + +-- device type (character / block)
>> +
>> +1) The access rights set to '--' remove the device from the group's
>> +access list, so that it will not even be shown in this file later.
>> +
>> +2) Setting the minor to '*' grants access to all the minors for
>> +particular major.
>> +
>> +When reading from it, one may see something like
>> +
>> + c 1:5 rw
>> + b 8:* r-
>> +
>> +Security issues, concerning who may grant access to what are governed
>> +at the cgroup infrastructure level.
>> +
>> +
>> +Examples:
>> +
>> +1. Grand full access to /dev/null
>> + # echo c 1:3 rw > /cgroups/<id>/devices.permissions
>> +
>> +2. Grant the read-only access to /dev/sda and partitions
>> + # echo b 8:* r- > ...
>> +
>> +3. Change the /dev/null access to write-only
>> + # echo c 1:3 -w > ...
>> +
>> +4. Revoke access to /dev/sda
>> + # echo b 8:* -- > ...
>> +
>> +
>> + Written by Pavel Emelyanov <xemul@openvz.org>
>> +
>> diff --git a/fs/Makefile b/fs/Makefile
>> index 7996220..5ad03be 100644
>> --- a/fs/Makefile
>> +++ b/fs/Makefile
>> @@ -64,6 +64,8 @@ obj-y += devpts/
>>
>> obj-$(CONFIG_PROFILING) += dcookies.o
>> obj-$(CONFIG_DLM) += dlm/
>> +
>> +obj-$(CONFIG_CGROUP_DEVS) += devscontrol.o
>>
>> # Do not add any filesystems before this line
>> obj-$(CONFIG_REISERFS_FS) += reiserfs/
>> diff --git a/fs/devscontrol.c b/fs/devscontrol.c
>> new file mode 100644
>> index 0000000..48c5f69
>> --- /dev/null
>> +++ b/fs/devscontrol.c
>> @@ -0,0 +1,314 @@
>> +/*
>> + * devscontrol.c - Device Controller
>> + *
>> + * Copyright 2007 OpenVZ SWsoft Inc
>> + * Author: Pavel Emelyanov <xemul at openvz dot org>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
>> + * GNU General Public License for more details.
>> + */
>> +
>> +#include <linux/cgroup.h>
>> +#include <linux/cdev.h>
>> +#include <linux/err.h>
>> +#include <linux/devscontrol.h>
>> +#include <linux/uaccess.h>
>> +#include <linux/fs.h>
>> +#include <linux/genhd.h>
>> +
>> +struct devs_cgroup {
>> + /*
>> + * The subsys state to build into cgrous infrastructure
>> + */
>> + struct cgroup_subsys_state css;
>> +
>> + /*
>> + * The maps of character and block devices. They provide a
>> + * map from dev_t-s to struct cdev/gendisk. See fs/char_dev.c
>> + * and block/genhd.c to find out how the ->open() callbacks
>> + * work when opening a device.
>> + *
>> + * Each group will have its onw maps, and at the open()
>> + * time code will lookup in this map to get the device
>> + * and permissions by its dev_t.
>> + */
>> + struct kobj_map *cdev_map;
>> + struct kobj_map *bdev_map;
>> +};
>> +
>> +static inline
>> +struct devs_cgroup *css_to_devs(struct cgroup_subsys_state *css)
>> +{
>> + return container_of(css, struct devs_cgroup, css);
>> +}
>> +
>> +static inline
>> +struct devs_cgroup *cgroup_to_devs(struct cgroup *cont)
>> +{
>> + return css_to_devs(cgroup_subsys_state(cont, devs_subsys_id));
>> +}
>> +
>> +struct kobj_map *task_cdev_map(struct task_struct *tsk)
>> +{
>> + struct cgroup_subsys_state *css;
>> +
>> + css = task_subsys_state(tsk, devs_subsys_id);
>> + if (css->cgroup->parent == NULL)
>> + return NULL;
>> + else
>> + return css_to_devs(css)->cdev_map;
>> +}
>> +
>> +struct kobj_map *task_bdev_map(struct task_struct *tsk)
>> +{
>> + struct cgroup_subsys_state *css;
>> +
>> + css = task_subsys_state(tsk, devs_subsys_id);
>> + if (css->cgroup->parent == NULL)
>> + return NULL;
>> + else
>> + return css_to_devs(css)->bdev_map;
>> +}
>> +
>> +static struct cgroup_subsys_state *
>> +devs_create(struct cgroup_subsys *ss, struct cgroup *cont)
>> +{
>> + struct devs_cgroup *devs;
>> +
>> + devs = kzalloc(sizeof(struct devs_cgroup), GFP_KERNEL);
>> + if (devs == NULL)
>> + goto out;
>> +
>> + devs->cdev_map = cdev_map_init();
>> + if (devs->cdev_map == NULL)
>> + goto out_free;
>> +
>> + devs->bdev_map = bdev_map_init();
>> + if (devs->bdev_map == NULL)
>> + goto out_free_cdev;
>> +
>> + return &devs->css;
>> +
>> +out_free_cdev:
>> + cdev_map_fini(devs->cdev_map);
>> +out_free:
>> + kfree(devs);
>> +out:
>> + return ERR_PTR(-ENOMEM);
>> +}
>> +
>> +static void devs_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
>> +{
>> + struct devs_cgroup *devs;
>> +
>> + devs = cgroup_to_devs(cont);
>> + bdev_map_fini(devs->bdev_map);
>> + cdev_map_fini(devs->cdev_map);
>> + kfree(devs);
>> +}
>> +
>> +/*
>> + * The devices.permissions file read/write functionality
>> + *
>> + * The following two routines parse and print the strings like
>> + * [cb] <major>:(<minor>|*) [r-][w-]
>> + */
>> +
>> +static int decode_perms_str(char *buf, int *chrdev, dev_t *dev,
>> + int *all, mode_t *mode)
>> +{
>> + unsigned int major, minor;
>> + char *end;
>> + mode_t tmp;
>> +
>> + if (buf[0] == 'c')
>> + *chrdev = 1;
>> + else if (buf[0] == 'b')
>> + *chrdev = 0;
>> + else
>> + return -EINVAL;
>> +
>> + if (buf[1] != ' ')
>> + return -EINVAL;
>> +
>> + major = simple_strtoul(buf + 2, &end, 10);
>> + if (*end != ':')
>> + return -EINVAL;
>> +
>> + if (end[1] == '*') {
>> + if (end[2] != ' ')
>> + return -EINVAL;
>> +
>> + *all = 1;
>> + minor = 0;
>> + end += 2;
>> + } else {
>> + minor = simple_strtoul(end + 1, &end, 10);
>> + if (*end != ' ')
>> + return -EINVAL;
>> +
>> + *all = 0;
>> + }
>> +
>> + tmp = 0;
>> +
>> + if (end[1] == 'r')
>> + tmp
...