Serge E. Hallyn wrote:
> Apparently my hijack test program was just too mean-n-lean to show just
> how fragile sys_hijack was. Once it tried to do any kind of actual work
> before doing an exec(), it would segfault. Apparently just letting
> dup_task_struct() do "*tsk = *hijacked_task" was bogus.
>
> Mark, I'm sorry, this is without your powerpc/x86_64 patches. But note
> that I took out the 'copy_a_thread' stuff which was bogus anyway.
That's fine - I needed to do more testing on them anyway :)
I'll work on re-spinning them against this v8 of sys_hijack after I finish
up with 2.6.23-mm1-lxc1.
Thanks!
Mark.
>
> It's incomplete, I need to walk through the task_struct and copy more of
> the relevant pieces from the hijacked task_struct. But this appears to
> be robust.
>
> Tested with entering unshared uts namespace and with cloned
> pidns+mountns+utsns container.
>
> thanks,
> -serge
>
> PS: also a typo fix in the hijack.c code.
>
>>From 0efe7d1f607438be986d7000571fda406a866b12 Mon Sep 17 00:00:00 2001
> From: sergeh@us.ibm.com <sergeh@us.ibm.com>
> Date: Tue, 16 Oct 2007 09:36:49 -0700
> Subject: [PATCH] namespaces: introduce sys_hijack (v8)
>
> Move most of do_fork() into a new do_fork_task() which acts on
> a new argument, task, rather than on current. do_fork() becomes
> a call to do_fork_task(current, ...).
>
> Introduce sys_hijack (for x86 only so far). It is like clone, but
> in place of a stack pointer (which is assumed null) it accepts a
> pid. The process identified by that pid is the one which is
> actually cloned. Some state - including the file table, the signals
> and sighand (and hence tty), and the ->parent are taken from the
> calling process.
>
> A process to be hijacked may be identified by process id.
> Alternatively, an open fd for a cgroup 'tasks' file may be
> specified. The first available task in that cgroup will then
> be hijacked.
>
> In order to hijack a process, the calling process must be
> allowed to ptrace the target.
>
> The effect is a sort of namespace enter. The following program
> uses sys_hijack to 'enter' all namespaces of the specified task.
> For instance in one terminal, do
>
> mount -t cgroup -ons /cgroup
> hostname
> qemu
> ns_exec -u /bin/sh
> hostname serge
> echo $$
> 1073
> cat /proc/$$/cgroup
> ns:/node_1073
>
> In another terminal then do
>
> hostname
> qemu
> cat /proc/$$/cgroup
> ns:/
> hijack pid 1073
> hostname
> serge
> cat /proc/$$/cgroup
> ns:/node_1073
> hijack cgroup /cgroup/node_1073/tasks
>
> Changelog:
> Aug 23: send a stop signal to the hijacked process
> (like ptrace does).
> Oct 09: Update for 2.6.23-rc8-mm2 (mainly pidns)
> Don't take task_lock under rcu_read_lock
> Send hijacked process to cgroup_fork() as
> the first argument.
> Removed some unneeded task_locks.
> Oct 16: Fix bug introduced into alloc_pid.
> Oct 16: Add 'int which' argument to sys_hijack to
> allow later expansion to use cgroup in place
> of pid to specify what to hijack.
> Oct 24: Implement hijack by open cgroup file.
> Nov 02: Switch copying of task info: do full copy
> from current, then copy relevant pieces from
> hijacked task.
>
> ==============================================================
> hijack.c
> ==============================================================
> #define _BSD_SOURCE
> #include <unistd.h>
> #include <sys/syscall.h>
> #include <sys/types.h>
> #include <sys/wait.h>
> #include <sys/stat.h>
> #include <fcntl.h>
> #include <sched.h>
>
> void usage(char *me)
> {
> printf("Usage: %s pid <pid> | %s cgroup <cgroup_tasks_file>\n", me, me);
> }
>
> int exec_shell(void)
> {
> execl("/bin/sh", "/bin/sh", NULL);
> }
>
> int main(int argc, char *argv[])
> {
> int id;
> int ret;
> int status;
> int use_pid = 0;
>
> if (argc < 3 || !strcmp(argv[1], "-h")) {
> usage(argv[0]);
> return 1;
> }
> if (strcmp(argv[1], "pid") == 0)
> use_pid = 1;
>
> if (use_pid)
> id = atoi(argv[2]);
> else {
> id = open(argv[2], O_RDONLY);
> if (id == -1) {
> perror("cgroup open");
> return 1;
> }
> }
>
> ret = syscall(327, SIGCHLD, use_pid ? 1 : 2, (unsigned long)id);
>
> if (!use_pid)
> close(id);
> if (ret == 0) {
> return exec_shell();
> } else if (ret < 0) {
> perror("sys_hijack");
> } else {
> printf("waiting on cloned process %d\n", ret);
> while(waitpid(-1, &status, __WALL) != -1)
> ;
> printf("cloned process exited with %d (waitpid ret %d)\n",
> status, ret);
> }
>
> return ret;
> }
> ==============================================================
>
> Signed-off-by: Serge Hallyn <serue@us.ibm.com>
> ---
> arch/i386/kernel/process.c | 77 +++++++++++++++++++++++++++++++
> arch/i386/kernel/syscall_table.S | 1 +
> include/asm-i386/unistd.h | 3 +-
> include/linux/cgroup.h | 10 +++-
> include/linux/ptrace.h | 1 +
> include/linux/sched.h | 7 +++
> include/linux/syscalls.h | 1 +
> kernel/cgroup.c | 69 +++++++++++++++++++++++++++--
> kernel/fork.c | 92 +++++++++++++++++++++++++++++++-------
> kernel/ptrace.c | 7 +++
> 10 files changed, 244 insertions(+), 24 deletions(-)
>
> diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
> index eab6c62..8add6e4 100644
> --- a/arch/i386/kernel/process.c
> +++ b/arch/i386/kernel/process.c
> @@ -39,6 +39,7 @@
> #include <linux/personality.h>
> #include <linux/tick.h>
> #include <linux/percpu.h>
> +#include <linux/cgroup.h>
>
> #include <asm/uaccess.h>
> #include <asm/pgtable.h>
> @@ -784,6 +785,82 @@ asmlinkage int sys_clone(struct pt_regs regs)
> }
>
> /*
> + * Called with task count bumped, drops task count before returning
> + */
> +static int hijack_task(struct task_struct *task, struct pt_regs regs)
> +{
> + unsigned long clone_flags = regs.ebx;
> + int ret = -EINVAL;
> +
> + task_lock(task);
> + put_task_struct(task);
> + if (!ptrace_may_attach_locked(task)) {
> + ret = -EPERM;
> + goto out_put_task;
> + }
> + if (task->ptrace) {
> + ret = -EBUSY;
> + goto out_put_task;
> + }
> + force_sig_specific(SIGSTOP, task);
> +
> + task_unlock(task);
> + ret = do_fork_task(task, clone_flags, regs.esp, &regs, 0,
> + NULL, NULL);
> + wake_up_process(task);
> + return ret;
> +
> +out_put_task:
> + task_unlock(task);
> + return ret;
> +}
> +
> +static int hijack_pid(struct pt_regs regs)
> +{
> + pid_t pid = regs.edx;
> + struct task_struct *task;
> +
> + rcu_read_lock();
> + task = find_task_by_vpid(pid);
> + if (task)
> + get_task_struct(task);
> + rcu_read_unlock();
> +
> + if (!task)
> + return -EINVAL;
> +
> + return hijack_task(task, regs);
> +}
> +
> +static int hijack_cgroup(struct pt_regs regs)
> +{
> + unsigned int fd;
> + struct task_struct *task;
> +
> + fd = (unsigned int) regs.edx;
> + task = task_from_cgroup_fd(fd);
> + if (IS_ERR(task))
> + return PTR_ERR(task);
> +
> + return hijack_task(task, regs);
> +}
> +
> +asmlinkage int sys_hijack(struct pt_regs regs)
> +{
> + int which = regs.ecx;
> +
> + switch (which) {
> + case HIJACK_PID:
> + return hijack_pid(regs);
> + case HIJACK_CGROUP:
> + return hijack_cgroup(regs);
> + default:
> + return -EINVAL;
> + }
> +
> +}
> +
> +/*
> * This is trivial, and on the face of it looks like it
> * could equally well be done in user mode.
> *
> diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
> index df6e41e..495930c 100644
> --- a/arch/i386/kernel/syscall_table.S
> +++ b/arch/i386/kernel/syscall_table.S
> @@ -326,3 +326,4 @@ ENTRY(sys_call_table)
> .long sys_fallocate
> .long sys_revokeat /* 325 */
> .long sys_frevoke
> + .long sys_hijack
> diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
> index 006c1b3..fe6eeb4 100644
> --- a/include/asm-i386/unistd.h
> +++ b/include/asm-i386/unistd.h
> @@ -332,10 +332,11 @@
> #define __NR_fallocate 324
> #define __NR_revokeat 325
> #define __NR_frevoke 326
> +#define __NR_hijack 327
>
> #ifdef __KERNEL__
>
> -#define NR_syscalls 327
> +#define NR_syscalls 328
>
> #define __ARCH_WANT_IPC_PARSE_VERSION
> #define __ARCH_WANT_OLD_READDIR
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 8747932..3edb820 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -26,7 +26,7 @@ extern int cgroup_init(void);
> extern void cgroup_init_smp(void);
> extern void cgroup_lock(void);
> extern void cgroup_unlock(void);
> -extern void cgroup_fork(struct task_struct *p);
> +extern void cgroup_fork(struct task_struct *parent, struct task_struct *p);
> extern void cgroup_fork_callbacks(struct task_struct *p);
> extern void cgroup_post_fork(struct task_struct *p);
> extern void cgroup_exit(str
...