>From 9ba95f1dbf88a512ffd423f6ccd627dc0460b052 Mon Sep 17 00:00:00 2001
From: Serge E. Hallyn <serue@us.ibm.com>
Date: Mon, 12 Nov 2007 16:50:04 -0500
Subject: [PATCH 2/2] capabilities: introduce per-process capability bounding set (v7)
The capability bounding set is a set beyond which capabilities
cannot grow. Currently cap_bset is per-system. It can be
manipulated through sysctl, but only init can add capabilities.
Root can remove capabilities. By default it includes all caps
except CAP_SETPCAP.
This patch makes the bounding set per-process. It is inherited
at fork from parent. No one can add elements, CAP_SYS_ADMIN is
required to remove them. Perhaps a new capability should be
introduced to control the ability to remove capabilities, in
order to help prevent running a privileged app with enough
privs to be dangerous but not enough to be successful.
One example use of this is to start a safer container. For
instance, until device namespaces or per-container device
whitelists are introduced, it is best to take CAP_MKNOD away
from a container.
Two questions:
1. I set CAP_FULL_SET and CAP_INIT_EFF_SET to contain
only valid capabilities. Does that seem like a future maintenance
headache? We only want the capability bounding set returned from kernel
to contain valid capabilities, so having CAP_FULL_SET contain all
capabilities would mean that on every cap_prctl_getbset() we'd have to
either manually clear invalid bits or let userspace sort it out.
2. Would getting and setting the bounding sets be
better done through syscall? That better mirrors the capset+capget,
but using prctl better mirrors the keep_capabilities setting.
The following test program will get and set the bounding
set. For instance
./bset get
(lists capabilities in bset)
./bset set cap_sys_admin
(starts shell with new bset)
(use capset, setuid binary, or binary with
file capabilities to try to increase caps)
===========================================================
bset.c:
===========================================================
#include <sys/prctl.h>
#include <linux/capability.h>
#include <sys/types.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifndef PR_GET_CAPBSET
#define PR_GET_CAPBSET 23
#endif
#ifndef PR_SET_CAPBSET
#define PR_SET_CAPBSET 24
#endif
#define _LINUX_CAPABILITY_VERSION_1 0x19980330
#define _LINUX_CAPABILITY_VERSION_2 0x20071026
#define CAPVERSION _LINUX_CAPABILITY_VERSION_2
#define NUMCAPS 31
/*
 * Print the command-line synopsis to stdout.
 *
 * me: program name shown in the two synopsis lines (main passes argv[0]).
 *
 * Always returns 1, so a caller can "return usage(...)" to report a
 * usage error as its own failure status.
 */
int usage(char *me)
{
	/* Fixed trailer lines; they contain no conversions, so fputs suffices. */
	static const char *const example[] = {
		" capability_string is for instance:\n",
		" cap_sys_admin,cap_mknod,cap_dac_override\n",
	};
	size_t k;

	printf("Usage: %s get\n", me);
	printf(" %s set capability_string\n", me);
	for (k = 0; k < sizeof(example) / sizeof(example[0]); k++)
		fputs(example[k], stdout);

	return 1;
}
/*
 * Capability names known to this tool.
 *
 * The table is offset by one: slot k holds the name of capability
 * number k + 1 (slot 0 is cap_dac_override, slot 30 is cap_setfcap).
 * captoint() and print_capset() both apply that +1 when converting
 * between table slots and bit positions.  Capability number 0 has no
 * slot, so it can be neither named on the command line nor printed.
 * NOTE(review): confirm that leaving out capability 0 is intentional.
 */
char *captable[] = {
	"cap_dac_override",
	"cap_dac_read_search",
	"cap_fowner",
	"cap_fsetid",
	"cap_kill",
	"cap_setgid",
	"cap_setuid",
	"cap_setpcap",
	"cap_linux_immutable",
	"cap_net_bind_service",
	"cap_net_broadcast",
	"cap_net_admin",
	"cap_net_raw",
	"cap_ipc_lock",
	"cap_ipc_owner",
	"cap_sys_module",
	"cap_sys_rawio",
	"cap_sys_chroot",
	"cap_sys_ptrace",
	"cap_sys_pacct",
	"cap_sys_admin",
	"cap_sys_boot",
	"cap_sys_nice",
	"cap_sys_resource",
	"cap_sys_time",
	"cap_sys_tty_config",
	"cap_mknod",
	"cap_lease",
	"cap_audit_write",
	"cap_audit_control",
	"cap_setfcap"
};

/*
 * Map a (word, slot) pair to a capability name.
 *
 * i: index of the 32-bit word in the bounding set; only word 0 has names.
 * j: slot in captable[] (i.e. capability number minus one).
 *
 * Returns the name from captable[], or the literal "invalid" when the
 * word is not 0 or the slot lies outside the table.  The bound is taken
 * from the table itself: the previous "j > 31" test let j == 31 through
 * and read one element past the end of the 31-entry table.
 */
char *bittostr(unsigned int i, unsigned int j)
{
	if (i != 0 || j >= sizeof(captable) / sizeof(captable[0]))
		return "invalid";
	return captable[j];
}
/*
 * Print the names of the capabilities set in a two-word bounding set
 * as a single comma-separated line on stdout.
 *
 * bset: array of two 32-bit words; both are read.
 *
 * For each word, bits 1..31 are examined and bit (j + 1) is reported
 * through bittostr(word, j).  Bit 0 of each word is never examined.
 * Any bit set in the second word is printed as whatever bittostr()
 * returns for a non-zero word index.
 */
void print_capset(unsigned int *bset)
{
	unsigned int i, j, comma = 0;

	printf("Capability bounding set: ");
	for (i = 0; i < 2; i++) {
		for (j = 0; j < 31; j++) {
			/*
			 * Unsigned constant: for j == 30 the shift count is 31,
			 * and shifting a signed 1 into the sign bit is undefined.
			 */
			if (bset[i] & (1u << (j + 1)))
				printf("%s%s", comma++ ? "," : "", bittostr(i, j));
		}
	}
	printf("\n");
}
/*
 * Fetch the bounding set with prctl(PR_GET_CAPBSET) and print it.
 *
 * Returns 0 on success.  If prctl() returns non-zero, the error is
 * reported with perror() and 1 is returned.
 */
int getbcap(void)
{
	unsigned int caps[2];
	int rc = prctl(PR_GET_CAPBSET, CAPVERSION, &caps);

	if (rc != 0) {
		perror("prctl");
		return 1;
	}

	print_capset(caps);
	return 0;
}
/*
 * Look up a capability name in captable[].
 *
 * cap: NUL-terminated name; must match a table entry exactly.
 *
 * Returns the table slot plus one for a match, or -1 when the name is
 * not among the first NUMCAPS entries.
 */
int captoint(char *cap)
{
	int idx = 0;

	while (idx < NUMCAPS) {
		if (!strcmp(captable[idx], cap))
			return idx + 1;
		idx++;
	}

	return -1;
}
/*
 * Build a bounding set from a comma-separated list of capability names
 * and install it with prctl(PR_SET_CAPBSET).
 *
 * str: list such as "cap_sys_admin,cap_mknod".  It is split in place by
 *      strtok(), so the buffer is modified.
 *
 * Returns 0 on success.  Returns 1 if a name is not recognised by
 * captoint() (a message is printed to stdout and prctl() is not called)
 * or if prctl() returns non-zero (reported with perror()).
 */
int setbcap(char *str)
{
	unsigned int bset[2] = { 0, 0 };
	char *token;

	for (token = strtok(str, ","); token; token = strtok(NULL, ",")) {
		int bit = captoint(token);

		if (bit < 0) {
			printf("invalid cap: %s\n", token);
			return 1;
		}
		/*
		 * Unsigned constant: captoint() can return 31, and shifting a
		 * signed 1 left by 31 is undefined behaviour.
		 */
		bset[bit / 32] |= 1u << (bit % 32);
	}

	if (prctl(PR_SET_CAPBSET, CAPVERSION, &bset)) {
		perror("prctl");
		return 1;
	}
	return 0;
}
/*
 * Entry point.
 *
 *   <prog> get          print the current bounding set
 *   <prog> set <caps>   install a new bounding set, then exec /bin/bash
 *
 * Returns the result of usage() for a missing or unknown sub-command,
 * the result of getbcap() for "get", and 1 if setbcap() or execl()
 * fails.  A successful "set" does not return because the process image
 * is replaced by the shell.
 */
int main(int argc, char *argv[])
{
	if (argc < 2)
		return usage(argv[0]);
	if (strcmp(argv[1], "get") == 0)
		return getbcap();
	if (strcmp(argv[1], "set") != 0 || argc < 3)
		return usage(argv[0]);
	if (setbcap(argv[2]))
		return 1;

	/*
	 * The argument list of a variadic exec must end with a null
	 * pointer of type char *; a bare NULL need not have that type.
	 */
	execl("/bin/bash", "/bin/bash", (char *)NULL);

	/* execl() only returns on failure; say why instead of exiting silently. */
	perror("execl");
	return 1;
}
===========================================================
Changelog:
Enforce that current's capability sets are subsets of the
new bounding set.
As suggested by Andrew Morgan, send the capability
version along with the bset for prctl(PR_SET_CAPBSET)
and prctl(PR_GET_CAPBSET).
Adapt to 64-bit capabilities.
Update CAP_FULL_SET and CAP_INIT_EFF_SET to only
contain valid capabilities.
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
---
include/linux/capability.h | 34 +++++++++++++++++++++++++--
include/linux/init_task.h | 1 +
include/linux/prctl.h | 4 +++
include/linux/sched.h | 2 +-
include/linux/security.h | 5 ----
include/linux/sysctl.h | 3 --
kernel/fork.c | 1 +
kernel/sys.c | 53 ++++++++++++++++++++++++++++++++++++++++++++
kernel/sysctl.c | 35 -----------------------------
kernel/sysctl_check.c | 7 -----
security/commoncap.c | 37 +++++++++++++++++++++++++++---
11 files changed, 124 insertions(+), 58 deletions(-)
diff --git a/include/linux/capability.h b/include/linux/capability.h
index a1d93da..64e668a 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -202,7 +202,6 @@ typedef struct kernel_cap_struct {
#define CAP_IPC_OWNER 15
/* Insert and remove kernel modules - modify kernel without limit */
-/* Modify cap_bset */
#define CAP_SYS_MODULE 16
/* Allow ioperm/iopl access */
@@ -259,6 +258,7 @@ typedef struct kernel_cap_struct {
arbitrary SCSI commands */
/* Allow setting encryption key on loopback filesystem */
/* Allow setting zone reclaim policy */
+/* Allow taking bits out of capability bounding set */
#define CAP_SYS_ADMIN 21
@@ -315,6 +315,12 @@ typedef struct kernel_cap_struct {
#define CAP_SETFCAP 31
/*
+ * XXX
+ * When adding a capability, please update the definitions of
+ * CAP_FULL_SET and CAP_INIT_EFF_SET below
+ */
+
+/*
* Bit location of each capability (used by user-space library and kernel)
*/
@@ -341,8 +347,8 @@ typedef struct kernel_cap_struct {
#else /* HAND-CODED capability initializers */
# define CAP_EMPTY_SET {{ 0, 0 }}
-# define CAP_FULL_SET {{ ~0, ~0 }}
-# define CAP_INIT_EFF_SET {{ ~CAP_TO_MASK(CAP_SETPCAP), ~0 }}
+# define CAP_FULL_SET {{ ~0, 0 }}
+# define CAP_INIT_EFF_SET {{ ~CAP_TO_MASK(CAP_SETPCAP), 0 }}
# define CAP_FS_SET {{ CAP_FS_MASK_B0, 0 }}
# define CAP_NFSD_SET {{ CAP_FS_MASK_B0|CAP_TO_MASK(CAP_SYS_RESOURCE), 0 }}
@@ -350,6 +356,17 @@ typedef struct kernel_cap_struct {
#define CAP_INIT_INH_SET CAP_EMPTY_SET
+#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
+/*
+ * Because of the reduced scope of CAP_SETPCAP when filesystem
+ * capabilities are in effect, it is safe to allow this capability to
+ * be available in the default configuration.
+ */
+# define CAP_INIT_BSET CAP_FULL_SET
+#else
+# define CAP_INIT_BSET CAP_INIT_EFF_SET
+#endif
+
# define cap_clear(c) do { (c) = __cap_empty_set; } while (0)
# define cap_set_full(c) do { (c) = __cap_full_set; } while (0)
# define cap_set_init_eff(c) do { (c) = __cap_init_eff_set; } while (0)
@@ -465,6 +482,17 @@ extern const kernel_cap_t __cap_init_eff_set;
int capable(int cap);
int __capable(struct task_struct *t, int cap);
+#ifdef CONFIG_COMMONCAP
+extern int cap_prctl_setbset(kernel_cap_t new_bset);
+extern int cap_prctl_getbset(kernel_cap_t *bset);
+#else
+#include <linux/errno.h>
+static inline int cap_prctl_setbset(kernel_cap_t new_bset)
+{ return -EINVAL; }
+static inline int cap_prctl_getbset(kernel_cap_t *bset)
+{ return -EINVAL; }
+#endif
+
#endif /* __KERNEL__ */
#endif /* !_LINUX_CAPABILITY_H */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index cae35b6..5c84d14 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -147,6 +147,7 @@ extern struct group_info init_groups;
.cap_effective = CAP_INIT_EFF_SET, \
.cap_inheritable = CAP_INIT_INH_SET, \
.cap_permitted = CAP_FULL_SET, \
+ .cap_bset = CAP_INIT_BSET, \
.keep_capabilities = 0, \
.user = INIT_USER, \
.comm = "swapper", \
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index e2eff90..a7de023 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -63,4 +63,8 @@
#define PR_GET_SECCOMP 21
#define PR_SET_SECCOMP 22
+/* Get/set the capability bounding set */
+#define PR_GET_CAPBSET 23
+#define PR_SET_CAPBSET 24
+
#endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1d17f7c..bf51a16 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1041,7 +1041,7 @@ struct task_struct {
...