This is just a first stab at doing hijack by cgroup files. I force
using the 'tasks' file just so that I can (a) predict and check
the name of the file, (b) make sure it's a cgroup file, and then
(c) trust that taking __d_cont(dentry parent) gives me a legitimate
container.
Seems to work at least.
Paul, does this look reasonable? task_from_cgroup_fd() in particular.
thanks,
-serge
>From 8ba6a4ae33da18b95d967a799c5edce4266d1f1f Mon Sep 17 00:00:00 2001
From: sergeh@us.ibm.com <sergeh@us.ibm.com>
Date: Tue, 16 Oct 2007 09:36:49 -0700
Subject: [PATCH] namespaces: introduce sys_hijack (v7)
Move most of do_fork() into a new do_fork_task() which acts on
a new argument, task, rather than on current. do_fork() becomes
a call to do_fork_task(current, ...).
Introduce sys_hijack (for x86 only so far). It is like clone, but
in place of a stack pointer (which is assumed null) it accepts a
pid. The process identified by that pid is the one which is
actually cloned. Some state - including the file table, the signals
and sighand (and hence tty), and the ->parent - is taken from the
calling process.
A process to be hijacked may be identified by process id.
Alternatively, an open fd for a cgroup 'tasks' file may be
specified. The first available task in that cgroup will then
be hijacked.
In order to hijack a process, the calling process must be
allowed to ptrace the target.
The effect is a sort of namespace enter. The following program
uses sys_hijack to 'enter' all namespaces of the specified task.
For instance in one terminal, do
mount -t cgroup -ons /cgroup
hostname
qemu
ns_exec -u /bin/sh
hostname serge
echo $$
1073
cat /proc/$$/cgroup
ns:/node_1073
In another terminal then do
hostname
qemu
cat /proc/$$/cgroup
ns:/
hijack pid 1073
hostname
serge
cat /proc/$$/cgroup
ns:/node_1073
hijack cgroup /cgroup/node_1073/tasks
Changelog:
Aug 23: send a stop signal to the hijacked process
(like ptrace does).
Oct 09: Update for 2.6.23-rc8-mm2 (mainly pidns)
Don't take task_lock under rcu_read_lock
Send hijacked process to cgroup_fork() as
the first argument.
Removed some unneeded task_locks.
Oct 16: Fix bug introduced into alloc_pid.
Oct 16: Add 'int which' argument to sys_hijack to
allow later expansion to use cgroup in place
of pid to specify what to hijack.
Oct 24: Implement hijack by open cgroup file.
==============================================================
hijack.c
==============================================================
#define _BSD_SOURCE
#define _DEFAULT_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
/*
 * Print the command-line synopsis to stdout.
 *
 * me: program name to show in the synopsis (it appears once per
 *     invocation form, hence it is passed twice to the format).
 */
void usage(char *me)
{
	fprintf(stdout,
		"Usage: %s pid <pid> | %s cgroup <cgroup_tasks_file>\n",
		me, me);
}
/*
 * Replace the current process image with /bin/sh.
 *
 * execl() only returns when the exec failed, so any code after it is the
 * error path: report the reason and return a non-zero value. main() uses
 * the return value as the process exit status, so it must be defined.
 *
 * Returns: 1 if the shell could not be executed; does not return otherwise.
 */
int exec_shell(void)
{
	/* The sentinel must be a real pointer in a variadic call. */
	execl("/bin/sh", "/bin/sh", (char *)NULL);

	perror("execl /bin/sh");
	return 1;
}
/*
 * Arguments for the hijack system call.
 *
 * NR_HIJACK is the i386 syscall slot (327). The two selector values are
 * the ones this program has always passed as the second argument;
 * presumably they correspond to the kernel's HIJACK_PID and HIJACK_CGROUP
 * constants -- confirm against the kernel headers.
 */
enum {
	NR_HIJACK = 327,
	HIJACK_BY_PID = 1,
	HIJACK_BY_CGROUP = 2
};

/*
 * hijack pid <pid> | hijack cgroup <cgroup_tasks_file>
 *
 * Calls sys_hijack on either a process id or an open cgroup 'tasks' file.
 * In the cloned child (syscall returns 0) a shell is exec'ed; in the
 * caller (syscall returns the child's pid) all children are waited for.
 *
 * As before, any first argument other than "pid" selects the cgroup mode.
 *
 * Exit status: 0 on success, 1 on usage error, bad pid, open failure,
 * syscall failure, or exec failure in the child.
 */
int main(int argc, char *argv[])
{
	int id;
	int ret;
	int saved_errno;
	int status = 0;	/* stays defined even if waitpid() never succeeds */
	int use_pid;

	if (argc < 3 || !strcmp(argv[1], "-h")) {
		usage(argv[0]);
		return 1;
	}

	use_pid = (strcmp(argv[1], "pid") == 0);

	if (use_pid) {
		char *end;
		long pid_arg;

		/* strtol instead of atoi so garbage is not read as pid 0. */
		errno = 0;
		pid_arg = strtol(argv[2], &end, 10);
		if (end == argv[2] || *end != '\0' || errno == ERANGE ||
		    pid_arg <= 0 || pid_arg > INT_MAX) {
			fprintf(stderr, "invalid pid: %s\n", argv[2]);
			return 1;
		}
		id = (int)pid_arg;
	} else {
		id = open(argv[2], O_RDONLY);
		if (id == -1) {
			perror("cgroup open");
			return 1;
		}
	}

	ret = syscall(NR_HIJACK, SIGCHLD,
		      use_pid ? HIJACK_BY_PID : HIJACK_BY_CGROUP,
		      (unsigned long)id);
	/* Capture errno before close() gets a chance to overwrite it. */
	saved_errno = errno;

	/*
	 * Only the cgroup mode holds a file descriptor in 'id'; in pid mode
	 * 'id' is a process id and must not be passed to close().
	 */
	if (!use_pid)
		close(id);

	if (ret == 0)
		return exec_shell();

	if (ret < 0) {
		errno = saved_errno;
		perror("sys_hijack");
		return 1;
	}

	printf("waiting on cloned process %d\n", ret);
	/* Reap every child; 'status' ends up holding the last one reaped. */
	while (waitpid(-1, &status, __WALL) != -1)
		;
	printf("cloned process exited with %d (waitpid ret %d)\n",
	       status, ret);

	/* Do not leak the child's pid into our own exit status. */
	return 0;
}
==============================================================
Signed-off-by: Serge Hallyn <serue@us.ibm.com>
---
arch/i386/kernel/process.c | 87 +++++++++++++++++++++++++++++++++++++-
arch/i386/kernel/syscall_table.S | 1 +
arch/s390/kernel/process.c | 12 ++++-
include/asm-i386/unistd.h | 3 +-
include/linux/cgroup.h | 10 ++++-
include/linux/ptrace.h | 1 +
include/linux/sched.h | 8 ++++
include/linux/syscalls.h | 1 +
kernel/cgroup.c | 69 ++++++++++++++++++++++++++++--
kernel/fork.c | 67 +++++++++++++++++++++--------
kernel/ptrace.c | 7 +++
11 files changed, 237 insertions(+), 29 deletions(-)
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index eab6c62..4b631b9 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -39,6 +39,7 @@
#include <linux/personality.h>
#include <linux/tick.h>
#include <linux/percpu.h>
+#include <linux/cgroup.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -455,8 +456,15 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
unsigned long unused,
struct task_struct * p, struct pt_regs * regs)
{
+ return copy_a_thread(current, nr, clone_flags, esp, unused,
+ p, regs);
+}
+
+int copy_a_thread(struct task_struct *tsk, int nr, unsigned long clone_flags,
+ unsigned long esp, unsigned long unused,
+ struct task_struct * p, struct pt_regs * regs)
+{
struct pt_regs * childregs;
- struct task_struct *tsk;
int err;
childregs = task_pt_regs(p);
@@ -471,7 +479,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
savesegment(gs,p->thread.gs);
- tsk = current;
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
IO_BITMAP_BYTES, GFP_KERNEL);
@@ -784,6 +791,82 @@ asmlinkage int sys_clone(struct pt_regs regs)
}
/*
+ * Called with task count bumped, drops task count before returning
+ */
+static int hijack_task(struct task_struct *task, struct pt_regs regs)
+{
+ unsigned long clone_flags = regs.ebx;
+ int ret = -EINVAL;
+
+ task_lock(task);
+ put_task_struct(task);
+ if (!ptrace_may_attach_locked(task)) {
+ ret = -EPERM;
+ goto out_put_task;
+ }
+ if (task->ptrace) {
+ ret = -EBUSY;
+ goto out_put_task;
+ }
+ force_sig_specific(SIGSTOP, task);
+
+ task_unlock(task);
+ ret = do_fork_task(task, clone_flags, regs.esp, &regs, 0,
+ NULL, NULL);
+ wake_up_process(task);
+ return ret;
+
+out_put_task:
+ task_unlock(task);
+ return ret;
+}
+
+static int hijack_pid(struct pt_regs regs)
+{
+ pid_t pid = regs.edx;
+ struct task_struct *task;
+
+ rcu_read_lock();
+ task = find_task_by_vpid(pid);
+ if (task)
+ get_task_struct(task);
+ rcu_read_unlock();
+
+ if (!task)
+ return -EINVAL;
+
+ return hijack_task(task, regs);
+}
+
+static int hijack_cgroup(struct pt_regs regs)
+{
+ unsigned int fd;
+ struct task_struct *task;
+
+ fd = (unsigned int) regs.edx;
+ task = task_from_cgroup_fd(fd);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ return hijack_task(task, regs);
+}
+
+asmlinkage int sys_hijack(struct pt_regs regs)
+{
+ int which = regs.ecx;
+
+ switch (which) {
+ case HIJACK_PID:
+ return hijack_pid(regs);
+ case HIJACK_CGROUP:
+ return hijack_cgroup(regs);
+ default:
+ return -EINVAL;
+ }
+
+}
+
+/*
* This is trivial, and on the face of it looks like it
* could equally well be done in user mode.
*
diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index df6e41e..495930c 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -326,3 +326,4 @@ ENTRY(sys_call_table)
.long sys_fallocate
.long sys_revokeat /* 325 */
.long sys_frevoke
+ .long sys_hijack
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 70c5737..f256e7a 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -223,6 +223,14 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp,
unsigned long unused,
struct task_struct * p, struct pt_regs * regs)
{
+ return copy_a_thread(current, nr, clone_flags, new_stackp, unused,
+ p, regs);
+}
+
+int copy_a_thread(struct task_struct *task, int nr, unsigned long clone_flags,
+ unsigned long new_stackp, unsigned long unused,
+ struct task_struct * p, struct pt_regs * regs)
+{
struct fake_frame
{
struct stack_frame sf;
@@ -251,8 +259,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp,
* save fprs to current->thread.fp_regs to merge them with
* the emulated registers and then copy the result to the child.
*/
- save_fp_regs(&current->thread.fp_regs);
- memcpy(&p->thread.fp_regs, &current->thread.fp_regs,
+ save_fp_regs(&task->thread.fp_regs);
+ memcpy(&p->thread.fp_regs, &task->thread.fp_regs,
sizeof(s390_fp_regs));
p->thread.user_seg = __pa((unsigned long) p->mm->pgd) | _SEGMENT_TABLE;
/* Set a new TLS ? */
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 006c1b3..fe6eeb4 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -332,10 +332,11 @@
#define __NR_fallocate 324
#define __NR_revokeat 325
#define __NR_frevoke 326
+#define __NR_hijack 327
#ifdef __KERNEL__
-#define NR_syscalls 327
+#define NR_syscalls 328
#define __ARCH_WANT_IPC_PARSE_VERSION
#define __ARCH_WANT_OLD_READDIR
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8747932..3edb820 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -26,7 +26,7 @@ extern int cgroup_init(void);
extern void cgroup_init_smp(void);
extern void cgroup_lock(void);
extern void cgroup_unlock(void);
-extern void cgroup_fork(struct task_struct *p);
+extern void cgroup_fork(struct task_struct *parent, struct task_struct *p);
extern void cgroup_fork_callbacks(struct task_struct *p);
extern void
...