One thought on this - could we make the API have a "which" parameter
that indicates the type of thing being acted upon? E.g., like
sys_setpriority(), which can specify the target as a process, a pgrp
or a user.
Right now the target would just be a process, but I'd really like to
be able to specify an fd on a cgroup directory to indicate
that I want the child to inherit from that cgroup's namespaces. That
way you wouldn't need to keep a child process alive in the namespace
just to act as a hijack target.
Paul
On 10/9/07, Serge E. Hallyn <serue@us.ibm.com> wrote:
> From 945fe66259cd0cfdc2fe846287b7821e329a558c Mon Sep 17 00:00:00 2001
> From: sergeh@us.ibm.com <hallyn@kernel.(none)>
> Date: Tue, 9 Oct 2007 08:30:30 -0700
> Subject: [PATCH] namespaces: introduce sys_hijack (v4)
>
> Move most of do_fork() into a new do_fork_task() which acts on
> a new argument, task, rather than on current. do_fork() becomes
> a call to do_fork_task(current, ...).
>
> Introduce sys_hijack (for x86 only so far). It is like clone, but
> in place of a stack pointer (which is assumed null) it accepts a
> pid. The process identified by that pid is the one which is
> actually cloned. Some state - including the file table, the signals
> and sighand (and hence tty), and the ->parent are taken from the
> calling process.
>
> The effect is a sort of namespace enter. The following program
> uses sys_hijack to 'enter' all namespaces of the specified pid.
> For instance in one terminal, do
>
> mount -t cgroup -ons /cgroup
> hostname
> qemu
> ns_exec -u /bin/sh
> hostname serge
> echo $$
> 1073
> cat /proc/$$/cgroup
> ns:/node_1073
>
> In another terminal then do
>
> hostname
> qemu
> cat /proc/$$/cgroup
> ns:/
> hijack 1073
> hostname
> serge
> cat /proc/$$/cgroup
> ns:/node_1073
>
> sys_hijack is arch-dependent and is only implemented for i386 so far.
>
> Changelog:
> Aug 23: send a stop signal to the hijacked process
> (like ptrace does).
> Oct 09: Update for 2.6.23-rc8-mm2 (mainly pidns)
> Don't take task_lock under rcu_read_lock
> Send hijacked process to cgroup_fork() as
> the first argument.
> Removed some unneeded task_locks.
>
> ==============================================================
> hijack.c
> ==============================================================
>
> int do_clone_task(void)
> {
> execl("/bin/sh", "/bin/sh", NULL);
> }
>
> int main(int argc, char *argv[])
> {
> int pid;
> int ret;
> int status;
>
> if (argc < 2)
> return 1;
> pid = atoi(argv[1]);
>
> ret = syscall(327, SIGCHLD, pid, NULL, NULL);
>
> if (ret == 0) {
> return do_clone_task();
> } else if (ret < 0) {
> perror("sys_hijack");
> } else {
> printf("waiting on cloned process %d\n", ret);
> while (waitpid(ret, &status, __WCLONE) != ret);
> printf("cloned process %d exited with %d\n", ret, status);
> }
>
> return ret;
> }
> ==============================================================
>
> Signed-off-by: Serge Hallyn <serue@us.ibm.com>
> ---
> arch/i386/kernel/process.c | 58 ++++++++++++++++++++++++++++++-
> arch/i386/kernel/syscall_table.S | 1 +
> arch/s390/kernel/process.c | 12 +++++-
> include/asm-i386/unistd.h | 3 +-
> include/linux/cgroup.h | 5 ++-
> include/linux/pid.h | 2 +-
> include/linux/ptrace.h | 1 +
> include/linux/sched.h | 2 +
> include/linux/syscalls.h | 1 +
> kernel/cgroup.c | 8 ++--
> kernel/fork.c | 69 +++++++++++++++++++++++++++----------
> kernel/pid.c | 5 ++-
> kernel/ptrace.c | 7 ++++
> 13 files changed, 141 insertions(+), 33 deletions(-)
>
> diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
> index bfcd01e..01f4d16 100644
> --- a/arch/i386/kernel/process.c
> +++ b/arch/i386/kernel/process.c
> @@ -455,8 +455,15 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
> unsigned long unused,
> struct task_struct * p, struct pt_regs * regs)
> {
> + return copy_a_thread(current, nr, clone_flags, esp, unused,
> + p, regs);
> +}
> +
> +int copy_a_thread(struct task_struct *tsk, int nr, unsigned long clone_flags,
> + unsigned long esp, unsigned long unused,
> + struct task_struct * p, struct pt_regs * regs)
> +{
> struct pt_regs * childregs;
> - struct task_struct *tsk;
> int err;
>
> childregs = task_pt_regs(p);
> @@ -471,7 +478,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
>
> savesegment(gs,p->thread.gs);
>
> - tsk = current;
> if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
> p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
> IO_BITMAP_BYTES, GFP_KERNEL);
> @@ -783,6 +789,54 @@ asmlinkage int sys_clone(struct pt_regs regs)
> return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
> }
>
> +asmlinkage int sys_hijack(struct pt_regs regs)
> +{
> + unsigned long clone_flags;
> + int __user *parent_tidptr, *child_tidptr;
> + pid_t pid;
> + struct task_struct *task;
> + int ret = -EINVAL;
> +
> + clone_flags = regs.ebx;
> + pid = regs.ecx;
> + parent_tidptr = (int __user *)regs.edx;
> + child_tidptr = (int __user *)regs.edi;
> +
> + rcu_read_lock();
> + task = find_task_by_vpid(pid);
> + if (task)
> + get_task_struct(task);
> + rcu_read_unlock();
> +
> + if (task) {
> + task_lock(task);
> + put_task_struct(task);
> + }
> +
> + if (task) {
> + if (!ptrace_may_attach_locked(task)) {
> + ret = -EPERM;
> + goto out_put_task;
> + }
> + if (task->ptrace) {
> + ret = -EBUSY;
> + goto out_put_task;
> + }
> + force_sig_specific(SIGSTOP, task);
> +
> + task_unlock(task);
> + ret = do_fork_task(task, clone_flags, regs.esp, &regs, 0,
> + parent_tidptr, child_tidptr);
> + wake_up_process(task);
> + task = NULL;
> + }
> +
> +out_put_task:
> + if (task)
> + task_unlock(task);
> + return ret;
> +}
> +
> /*
> * This is trivial, and on the face of it looks like it
> * could equally well be done in user mode.
> diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
> index df6e41e..495930c 100644
> --- a/arch/i386/kernel/syscall_table.S
> +++ b/arch/i386/kernel/syscall_table.S
> @@ -326,3 +326,4 @@ ENTRY(sys_call_table)
> .long sys_fallocate
> .long sys_revokeat /* 325 */
> .long sys_frevoke
> + .long sys_hijack
> diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
> index 70c5737..f256e7a 100644
> --- a/arch/s390/kernel/process.c
> +++ b/arch/s390/kernel/process.c
> @@ -223,6 +223,14 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp,
> unsigned long unused,
> struct task_struct * p, struct pt_regs * regs)
> {
> + return copy_a_thread(current, nr, clone_flags, new_stackp, unused,
> + p, regs);
> +}
> +
> +int copy_a_thread(struct task_struct *task, int nr, unsigned long clone_flags,
> + unsigned long new_stackp, unsigned long unused,
> + struct task_struct * p, struct pt_regs * regs)
> +{
> struct fake_frame
> {
> struct stack_frame sf;
> @@ -251,8 +259,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp,
> * save fprs to current->thread.fp_regs to merge them with
> * the emulated registers and then copy the result to the child.
> */
> - save_fp_regs(&current->thread.fp_regs);
> - memcpy(&p->thread.fp_regs, &current->thread.fp_regs,
> + save_fp_regs(&task->thread.fp_regs);
> + memcpy(&p->thread.fp_regs, &task->thread.fp_regs,
> sizeof(s390_fp_regs));
> p->thread.user_seg = __pa((unsigned long) p->mm->pgd) | _SEGMENT_TABLE;
> /* Set a new TLS ? */
> diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
> index 006c1b3..fe6eeb4 100644
> --- a/include/asm-i386/unistd.h
> +++ b/include/asm-i386/unistd.h
> @@ -332,10 +332,11 @@
> #define __NR_fallocate 324
> #define __NR_revokeat 325
> #define __NR_frevoke
...