Serge,
I've been thinking a lot about this one. As an alternative
implementation, have you considered changing one bounding capability bit
per system call? Something like this:
prctl(PR_CAPBSET_READ, CAPVERSION, CAP_NET_RAW);
returns -> 1(allowed) or 0(blocked)
prctl(PR_CAPBSET_DROP, CAPVERSION, CAP_NET_RAW)
returns -> 0(success) or -EPERM;
I also think we should use CAP_SETPCAP for the privilege of manipulating
the bounding set. In many ways irrevocably removing a permission
requires the same level of due care as adding one (to pI).
This has scalability designed in, at the expense of more system calls to
get the same (rare) work done.
Cheers
Andrew
Serge E. Hallyn wrote:
>>From 9ba95f1dbf88a512ffd423f6ccd627dc0460b052 Mon Sep 17 00:00:00 2001
> From: Serge E. Hallyn <serue@us.ibm.com>
> Date: Mon, 12 Nov 2007 16:50:04 -0500
> Subject: [PATCH 2/2] capabilities: introduce per-process capability bounding set (v7)
>
> The capability bounding set is a set beyond which capabilities
> cannot grow. Currently cap_bset is per-system. It can be
> manipulated through sysctl, but only init can add capabilities.
> Root can remove capabilities. By default it includes all caps
> except CAP_SETPCAP.
>
> This patch makes the bounding set per-process. It is inherited
> at fork from parent. No one can add elements, CAP_SYS_ADMIN is
> required to remove them. Perhaps a new capability should be
> introduced to control the ability to remove capabilities, in
> order to help prevent running a privileged app with enough
> privs to be dangerous but not enough to be successful.
>
> One example use of this is to start a safer container. For
> instance, until device namespaces or per-container device
> whitelists are introduced, it is best to take CAP_MKNOD away
> from a container.
>
> Two questions:
>
> 1. I set CAP_FULL_SET and CAP_INIT_EFF_SET to contain
> only valid capabilities. Does that seem like a future maintenance
> headache? We only want the capability bounding set returned from kernel
> to contain valid capabilities, so having CAP_FULL_SET contain all
> capabilities would mean that on every cap_prctl_getbset() we'd have to
> either manually clear invalid bits or let userspace sort it out.
>
> 2. Would getting and setting the bounding sets be
> better done through syscall? That better mirrors the capset+capget,
> but using prctl better mirrors the keep_capabilities setting.
>
> The following test program will get and set the bounding
> set. For instance
>
> ./bset get
> (lists capabilities in bset)
> ./bset set cap_sys_admin
> (starts shell with new bset)
> (use capset, setuid binary, or binary with
> file capabilities to try to increase caps)
>
> ===========================================================
> bset.c:
> ===========================================================
> #include <sys/prctl.h>
> #include <linux/capability.h>
> #include <sys/types.h>
> #include <unistd.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
>
> #ifndef PR_GET_CAPBSET
> #define PR_GET_CAPBSET 23
> #endif
>
> #ifndef PR_SET_CAPBSET
> #define PR_SET_CAPBSET 24
> #endif
>
> #define _LINUX_CAPABILITY_VERSION_1 0x19980330
> #define _LINUX_CAPABILITY_VERSION_2 0x20071026
> #define CAPVERSION _LINUX_CAPABILITY_VERSION_2
>
> #define NUMCAPS 31
>
> int usage(char *me)
> {
> printf("Usage: %s get\n", me);
> printf(" %s set capability_string\n", me);
> printf(" capability_string is for instance:\n");
> printf(" cap_sys_admin,cap_mknod,cap_dac_override\n");
> return 1;
> }
>
> char *captable[] = {
> "cap_dac_override",
> "cap_dac_read_search",
> "cap_fowner",
> "cap_fsetid",
> "cap_kill",
> "cap_setgid",
> "cap_setuid",
> "cap_setpcap",
> "cap_linux_immutable",
> "cap_net_bind_service",
> "cap_net_broadcast",
> "cap_net_admin",
> "cap_net_raw",
> "cap_ipc_lock",
> "cap_ipc_owner",
> "cap_sys_module",
> "cap_sys_rawio",
> "cap_sys_chroot",
> "cap_sys_ptrace",
> "cap_sys_pacct",
> "cap_sys_admin",
> "cap_sys_boot",
> "cap_sys_nice",
> "cap_sys_resource",
> "cap_sys_time",
> "cap_sys_tty_config",
> "cap_mknod",
> "cap_lease",
> "cap_audit_write",
> "cap_audit_control",
> "cap_setfcap"
> };
>
> char *bittostr(unsigned int i, unsigned int j)
> {
> if (i!=0 || j>31)
> return "invalid";
> return captable[j];
> }
>
> void print_capset(unsigned int *bset)
> {
> unsigned int i, j, comma=0;
> printf("Capability bounding set: ");
> for (i=0; i<2; i++) {
> for (j=0; j<31; j++)
> if (bset[i] & (1 << (j+1)))
> printf("%s%s", comma++?",":"",bittostr(i, j));
> }
> printf("\n");
> }
>
> int getbcap(void)
> {
> unsigned int bset[2];
> if (prctl(PR_GET_CAPBSET, CAPVERSION, &bset)) {
> perror("prctl");
> return 1;
> }
> print_capset(bset);
> return 0;
> }
>
> int captoint(char *cap)
> {
> int i;
> for (i=0; i<NUMCAPS; i++)
> if (strcmp(captable[i], cap) == 0)
> return i+1;
> return -1;
> }
>
> int setbcap(char *str)
> {
> int ret;
> unsigned int bset[2];
> char *token = strtok(str, ",");
>
> bset[0] = bset[1] = 0;
> while (token) {
> int bit = captoint(token);
> if (bit < 0) {
> printf("invalid cap: %s\n", token);
> return 1;
> }
> bset[bit/32] |= 1 << (bit%32);
> token = strtok(NULL, ",");
>
> }
> if (prctl(PR_SET_CAPBSET, CAPVERSION, &bset)) {
> perror("prctl");
> return 1;
> }
> return 0;
> }
>
> int main(int argc, char *argv[])
> {
> if (argc<2)
> return usage(argv[0]);
> if (strcmp(argv[1], "get")==0)
> return getbcap();
> if (strcmp(argv[1], "set")!=0 || argc<3)
> return usage(argv[0]);
> if (setbcap(argv[2]))
> return 1;
> return execl("/bin/bash", "/bin/bash", NULL);
> }
> ===========================================================
>
> Changelog:
> Enforce current-> capabilities are subsets of the
> new bounding set.
>
> As suggested by Andrew Morgan, send the capability
> version along with the bset for prctl(PR_SET_CAPBSET)
> and prctl(PR_GET_CAPBSET)
>
> Adapt to 64-bit capabilities.
>
> Update CAP_FULL_SET and CAP_INIT_EFF_SET to only
> contain valid capabilities.
>
> Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
> ---
> include/linux/capability.h | 34 +++++++++++++++++++++++++--
> include/linux/init_task.h | 1 +
> include/linux/prctl.h | 4 +++
> include/linux/sched.h | 2 +-
> include/linux/security.h | 5 ----
> include/linux/sysctl.h | 3 --
> kernel/fork.c | 1 +
> kernel/sys.c | 53 ++++++++++++++++++++++++++++++++++++++++++++
> kernel/sysctl.c | 35 -----------------------------
> kernel/sysctl_check.c | 7 -----
> security/commoncap.c | 37 +++++++++++++++++++++++++++---
> 11 files changed, 124 insertions(+), 58 deletions(-)
>
> diff --git a/include/linux/capability.h b/include/linux/capability.h
> index a1d93da..64e668a 100644
> --- a/include/linux/capability.h
> +++ b/include/linux/capability.h
> @@ -202,7 +202,6 @@ typedef struct kernel_cap_struct {
> #define CAP_IPC_OWNER 15
>
> /* Insert and remove kernel modules - modify kernel without limit */
> -/* Modify cap_bset */
> #define CAP_SYS_MODULE 16
>
> /* Allow ioperm/iopl access */
> @@ -259,6 +258,7 @@ typedef struct kernel_cap_struct {
> arbitrary SCSI commands */
> /* Allow setting encryption key on loopback filesystem */
> /* Allow setting zone reclaim policy */
> +/* Allow taking bits out of capability bounding set */
>
> #define CAP_SYS_ADMIN 21
>
> @@ -315,6 +315,12 @@ typedef struct kernel_cap_struct {
> #define CAP_SETFCAP 31
>
> /*
> + * XXX
> + * When adding a capability, please update the definitions of
> + * CAP_FULL_SET and CAP_INIT_EFF_SET below
> + */
> +
> +/*
> * Bit location of each capability (used by user-space library and kernel)
> */
>
> @@ -341,8 +347,8 @@ typedef struct kernel_cap_struct {
> #else /* HAND-CODED capability initializers */
>
> # define CAP_EMPTY_SET {{ 0, 0 }}
> -# define CAP_FULL_SET {{ ~0, ~0 }}
> -# define CAP_INIT_EFF_SET {{ ~CAP_TO_MASK(CAP_SETPCAP), ~0 }}
> +# define CAP_FULL_SET {{ ~0, 0 }}
> +# define CAP_INIT_EFF_SET {{ ~CAP_TO_MASK(CAP_SETPCAP), 0 }}
> # define CAP_FS_SET {{ CAP_FS_MASK_B0, 0 }}
> # define CAP_NFSD_SET {{ CAP_FS_MASK_B0|CAP_TO_MASK(CAP_SYS_RESOURCE), 0 }}
>
> @@ -350,6 +356,17 @@ typedef struct kernel_cap_struct {
>
> #define CAP_INIT_INH_SET CAP_EMPTY_SET
>
> +#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
...