Home » Mailing lists » Devel » [RFC][PATCH 0/16] Enable cloning of pid namespace
Re: [RFC][PATCH 07/16] Move alloc_pid call to copy_process [message #18667 is a reply to message #18616] |
Fri, 25 May 2007 06:41   |
Pavel Emelianov
Messages: 1149 Registered: September 2006
|
Senior Member |
|
|
sukadev@us.ibm.com wrote:
> Subject: Move alloc_pid call to copy_process
>
> From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
>
> Move alloc_pid() into copy_process(). This will keep all pid and pid
> namespace code together and simplify error handling when we support
> multiple pid namespaces.
I haven't found this in patches, so I ask it here:
We clone a new task with CLONE_NEWPIDS flag. This task
allocates its PIDTYPE_PID pid and this pid happens in
booth parent and child namespace. This is OK. Then new
task attaches PIDTYPE_SID and PIDTYPE_PGID pids from parent
task. But these ones are in parent namespace only.
Right? Is that good?
> Changelog:
> - [Eric Biederman] Move the check of copy_process_type to alloc_pid()/
> free_pid() and to avoid clutter in copy_process().
>
> Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
> ---
> include/linux/pid.h | 7 ++++++-
> kernel/fork.c | 21 +++++++++++++--------
> kernel/pid.c | 10 +++++++++-
> 3 files changed, 28 insertions(+), 10 deletions(-)
>
> Index: lx26-21-mm2/include/linux/pid.h
> ===================================================================
> --- lx26-21-mm2.orig/include/linux/pid.h 2007-05-22 16:59:40.000000000 -0700
> +++ lx26-21-mm2/include/linux/pid.h 2007-05-22 17:06:48.000000000 -0700
> @@ -3,6 +3,11 @@
>
> #include <linux/rcupdate.h>
>
> +enum copy_process_type {
> + COPY_NON_IDLE_PROCESS,
> + COPY_IDLE_PROCESS,
> +};
> +
> enum pid_type
> {
> PIDTYPE_PID,
> @@ -95,7 +100,7 @@ extern struct pid *FASTCALL(find_pid(int
> extern struct pid *find_get_pid(int nr);
> extern struct pid *find_ge_pid(int nr);
>
> -extern struct pid *alloc_pid(void);
> +extern struct pid *alloc_pid(enum copy_process_type);
> extern void FASTCALL(free_pid(struct pid *pid));
>
> static inline pid_t pid_to_nr(struct pid *pid)
> Index: lx26-21-mm2/kernel/fork.c
> ===================================================================
> --- lx26-21-mm2.orig/kernel/fork.c 2007-05-22 16:59:41.000000000 -0700
> +++ lx26-21-mm2/kernel/fork.c 2007-05-22 17:06:48.000000000 -0700
> @@ -961,10 +961,11 @@ static struct task_struct *copy_process(
> unsigned long stack_size,
> int __user *parent_tidptr,
> int __user *child_tidptr,
> - struct pid *pid)
> + enum copy_process_type copy_src)
> {
> int retval;
> struct task_struct *p = NULL;
> + struct pid *pid;
>
> if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
> return ERR_PTR(-EINVAL);
> @@ -1025,6 +1026,10 @@ static struct task_struct *copy_process(
> if (p->binfmt && !try_module_get(p->binfmt->module))
> goto bad_fork_cleanup_put_domain;
>
> + pid = alloc_pid(copy_src);
> + if (!pid)
> + goto bad_fork_put_binfmt_module;
> +
> p->did_exec = 0;
> delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
> copy_flags(clone_flags, p);
> @@ -1305,6 +1310,8 @@ bad_fork_cleanup_cpuset:
> #endif
> cpuset_exit(p);
> delayacct_tsk_free(p);
> + free_pid(pid);
> +bad_fork_put_binfmt_module:
> if (p->binfmt)
> module_put(p->binfmt->module);
> bad_fork_cleanup_put_domain:
> @@ -1331,7 +1338,7 @@ struct task_struct * __cpuinit fork_idle
> struct pt_regs regs;
>
> task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL,
> - &init_struct_pid);
> + COPY_IDLE_PROCESS);
> if (!IS_ERR(task))
> init_idle(task, cpu);
>
> @@ -1369,19 +1376,16 @@ long do_fork(unsigned long clone_flags,
> {
> struct task_struct *p;
> int trace = 0;
> - struct pid *pid = alloc_pid();
> long nr;
>
> - if (!pid)
> - return -EAGAIN;
> - nr = pid->nr;
> if (unlikely(current->ptrace)) {
> trace = fork_traceflag (clone_flags);
> if (trace)
> clone_flags |= CLONE_PTRACE;
> }
>
> - p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
> + p = copy_process(clone_flags, stack_start, regs, stack_size,
> + parent_tidptr, child_tidptr, COPY_NON_IDLE_PROCESS);
> /*
> * Do this prior waking up the new thread - the thread pointer
> * might get invalid after that point, if the thread exits quickly.
> @@ -1389,6 +1393,8 @@ long do_fork(unsigned long clone_flags,
> if (!IS_ERR(p)) {
> struct completion vfork;
>
> + nr = pid_to_nr(task_pid(p));
> +
> if (clone_flags & CLONE_VFORK) {
> p->vfork_done = &vfork;
> init_completion(&vfork);
> @@ -1422,7 +1428,6 @@ long do_fork(unsigned long clone_flags,
> }
> }
> } else {
> - free_pid(pid);
> nr = PTR_ERR(p);
> }
> return nr;
> Index: lx26-21-mm2/kernel/pid.c
> ===================================================================
> --- lx26-21-mm2.orig/kernel/pid.c 2007-05-22 16:59:46.000000000 -0700
> +++ lx26-21-mm2/kernel/pid.c 2007-05-22 17:06:48.000000000 -0700
> @@ -216,6 +216,10 @@ fastcall void free_pid(struct pid *pid)
> /* We can be called with write_lock_irq(&tasklist_lock) held */
> unsigned long flags;
>
> + /* check this here to keep copy_process() cleaner */
> + if (unlikely(pid == &init_struct_pid))
> + return;
> +
> spin_lock_irqsave(&pidmap_lock, flags);
> hlist_del_rcu(&pid->pid_chain);
> spin_unlock_irqrestore(&pidmap_lock, flags);
> @@ -224,12 +228,16 @@ fastcall void free_pid(struct pid *pid)
> call_rcu(&pid->rcu, delayed_put_pid);
> }
>
> -struct pid *alloc_pid(void)
> +struct pid *alloc_pid(enum copy_process_type copy_src)
> {
> struct pid *pid;
> enum pid_type type;
> int nr = -1;
>
> + /* check this here to keep copy_process() cleaner */
> + if (unlikely(copy_src == COPY_IDLE_PROCESS))
> + return &init_struct_pid;
> +
> pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL);
> if (!pid)
> goto out;
> _______________________________________________
> Containers mailing list
> Containers@lists.linux-foundation.org
> https://lists.linux-foundation.org/mailman/listinfo/containers
>
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
|
Re: [RFC][PATCH 06/16] Define is_global_init() [message #18674 is a reply to message #18641] |
Fri, 25 May 2007 20:44   |
Sukadev Bhattiprolu
Messages: 413 Registered: August 2006
|
Senior Member |
|
|
Dave Hansen [hansendc@us.ibm.com] wrote:
| On Thu, 2007-05-24 at 13:24 +0400, Pavel Emelianov wrote:
| > > | > +int is_global_init(struct task_struct *tsk)
| > > | > +{
| > > | > + return (task_active_pid_ns(tsk) == &init_pid_ns && tsk->pid == 1);
| > > |
| > > | This can OOPS if you pass arbitrary task to this call...
| > > | tsk->nsproxy can already be NULL.
| > >
| > > Hmm. You are right. btw, this could be a bisect issue. Patch 9 of uses
| > > pid_ns from pid->upid_list and removes nsproxy->pid_ns.
| >
| > Yes, but that patch is not good either.
| > task_pid(tsk) may become NULL as well and this will oops.
|
| Have you reviewed the call paths to make sure this can actually happen
| in practice?
task_pid() can be NULL when we are tearing down the task structure in
release_task() and in the tiny window between detach_pid() and attach_pid()
in de_thread().
I think task_pid() is safe as long as it is called for 'current'. (we should
probably add some comments)
I will double check my code, but I think all my calls to task_pid() and hence,
to task_active_pid_ns() are safe, except for two cases:
a) is_global_init(). There are a few calls to process other than
current, but not sure if they are a problem.
For instance in current code, unhandled_signal() checks
tsk->pid == 1 and proceeds to derefernce tsk->sighand.
If task_pid() is NULL because the task was in release_task(),
then so is tsk->sighand.
b) the temporary check I added in check_kill_permissions().
(I need to address Serge's comment here anyway).
To make is_global_init() more efficient and independent of task_pid(),
can we steal a bit from task_struct->flags ? Like PF_KSWAPD, and there
are unused bits :-)
|
| This just seems like another one of those racing-with-task-exit races.
| Shouldn't be too invasive to solve.
A little invasive approach for the release_task() case could be to remove
the 'struct pid' from the hash table, but leave it attached to the
'task_struct' till the 'task_struct' itself is freed.
Removing from hash table ensures no one finds this process anymore, but
keeping it attached allows those who have already found the 'task_struct'
to also use the 'struct pid' as long as they have the task_struct.
Of course, needs investigation and micro surgery.
|
| -- Dave
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
|
Re: [RFC][PATCH 15/16] Enable signaling child reaper from parent ns. [message #18676 is a reply to message #18624] |
Fri, 25 May 2007 20:13   |
serue
Messages: 750 Registered: February 2006
|
Senior Member |
|
|
Quoting sukadev@us.ibm.com (sukadev@us.ibm.com):
>
> Subject: Enable signaling child reaper from parent ns.
>
> From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
>
> The reaper of a child namespace must receive signals from its parent pid
> namespace but not receive any signals from its own namespace.
>
> This is a very early draft :-) and following tests seem to pass
>
> - Successfully kill child reaper from parent namespace (init_pid_ns)
>
> - Fail to kill child reaper from within its namespace (non init_pid_ns)
>
> - kill -1 1 from init_pid_ns seemed to work (rescanned inittab)
>
> TODO:
> - Test async io and SIGIO delivery.
>
> - Allow any legitimate signals that the child reaper can receive
> from within its namespace? (we block all signals now)
>
> - Sending SIGKILL to the child reaper of a namespace terminates the
> namespace But if the namespace remounted /proc from user space,
> /proc would remain mounted even after reaper and other process in
> the namespace go away.
>
> Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
> ---
> kernel/signal.c | 22 +++++++++++++++++++++-
> 1 file changed, 21 insertions(+), 1 deletion(-)
>
> Index: lx26-21-mm2/kernel/signal.c
> ===================================================================
> --- lx26-21-mm2.orig/kernel/signal.c 2007-05-22 16:59:42.000000000 -0700
> +++ lx26-21-mm2/kernel/signal.c 2007-05-22 16:59:57.000000000 -0700
> @@ -507,6 +507,20 @@ static int check_kill_permission(int sig
> && !capable(CAP_KILL))
> return error;
>
> + /*
> + * If t is the reaper of its namespace and someone from that
> + * namespace is trying to send a signal.
> + *
> + * Note: If some one from parent namespace is sending a signal,
> + * task_child_reaper() != t and we allow the signal.
> + *
> + * In the child namespace, does this block even legitimate signals
> + * like the ones telinit sends to /sbin/init ?
> + *
> + */
> + if ((!is_global_init(t)) && (t == task_child_reaper(t)))
> + return -EPERM;
Ok, let's just go over the desired semantics.
Current treatment of init for signals is evident at
kernel/signal.c:get_signal_to_deliver(). There we used to check whether
current->pid == 1, which was turned into (current == child_reaper(current)).
First note on that, (current == child_reaper(current)) is correct if we
want to treat a container init as init no matter who sends the signal,
however I contend that if a signal is sent from an ancester namespace,
then we want to treat the task like any other task.
Could I get some confirmation or rebuttal from Suka, Eric, Dave, or
Pavel?
Next, note where that check is done: If the task has it's own custom
handler for a signal, then that will get called. Only if the handler is
the default, do we check whether the target is the init task. This is
why this patch is wrong, Suka. If init sets up a handler for USR1, then
tasks should be able to do a kill -USR1 1.
Ok, so finally here's the problem to solve. As I said above, if we are
willing to treat a container init like init no matter who signals it,
then we're ok with the current code. But if we want a container init to
be treated like a normal process if signaled from an ancestor pidns,
then we'll need to tack a struct pid or struct pid_ns pointer into
struct siginfo. And this means taking a reference to one of those,
which means slowing things down a touch.
thanks,
-serge
> +
> error = security_task_kill(t, info, sig, 0);
> if (!error)
> audit_signal_info(sig, t); /* Let audit system see the signal */
> @@ -1910,7 +1924,13 @@ relock:
> /*
> * Init of a pid space gets no signals it doesn't want from
> * within that pid space. It can of course get signals from
> - * its parent pid space.
> + * its parent pid space. But we have no way of knowing the
> + * namespace from which the signal was sent. For now check
> + * if we are global init here and add additional checks in
> + * sys_kill() and friends.
> + *
> + * Note that t == task_child_reaper(t) implies t is the global
> + * init (and we are in init_pid_ns).
> */
> if (current == task_child_reaper(current))
> continue;
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
|
|
Re: [RFC][PATCH 07/16] Move alloc_pid call to copy_process [message #18678 is a reply to message #18667] |
Fri, 25 May 2007 23:01   |
Sukadev Bhattiprolu
Messages: 413 Registered: August 2006
|
Senior Member |
|
|
Pavel Emelianov [xemul@openvz.org] wrote:
| sukadev@us.ibm.com wrote:
| > Subject: Move alloc_pid call to copy_process
| >
| > From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
| >
| > Move alloc_pid() into copy_process(). This will keep all pid and pid
| > namespace code together and simplify error handling when we support
| > multiple pid namespaces.
|
| I haven't found this in patches, so I ask it here:
|
| We clone a new task with CLONE_NEWPIDS flag. This task
| allocates its PIDTYPE_PID pid and this pid happens in
| booth parent and child namespace. This is OK.
| Then new task attaches PIDTYPE_SID and PIDTYPE_PGID pids from
| parent task. But these ones are in parent namespace only.
|
| Right? Is that good?
|
In this patch, yes, we still attach to the parent process, bc at this
point we still support only one namespace.
In the patch that actually allows creating multiple namespaces,
(Patch #11), I have the following code which makes the process
that cloned its pid ns a session and pgrp leader, just like
/sbin/init for init_pid_ns.
@@ -1255,11 +1254,17 @@ static struct task_struct *copy_process(
__ptrace_link(p, current->parent);
if (thread_group_leader(p)) {
+ struct pid *pgrp = task_pgrp(current);
+ struct pid *session = task_session(current);
+
+ if (clone_flags & CLONE_NEWPID)
+ pgrp = session = pid;
+
p->signal->tty = current->signal->tty;
- p->signal->pgrp = process_group(current);
- set_signal_session(p->signal, process_session(current));
- attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
- attach_pid(p, PIDTYPE_SID, task_session(current));
+ p->signal->pgrp = pid_to_nr(pgrp);
+ set_signal_session(p->signal, pid_to_nr(session));
+ attach_pid(p, PIDTYPE_PGID, pgrp);
+ attach_pid(p, PIDTYPE_SID, session);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
__get_cpu_var(process_counts)++;
| > Changelog:
| > - [Eric Biederman] Move the check of copy_process_type to alloc_pid()/
| > free_pid() and to avoid clutter in copy_process().
| >
| > Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
| > ---
| > include/linux/pid.h | 7 ++++++-
| > kernel/fork.c | 21 +++++++++++++--------
| > kernel/pid.c | 10 +++++++++-
| > 3 files changed, 28 insertions(+), 10 deletions(-)
| >
| > Index: lx26-21-mm2/include/linux/pid.h
| > ===================================================================
| > --- lx26-21-mm2.orig/include/linux/pid.h 2007-05-22 16:59:40.000000000 -0700
| > +++ lx26-21-mm2/include/linux/pid.h 2007-05-22 17:06:48.000000000 -0700
| > @@ -3,6 +3,11 @@
| >
| > #include <linux/rcupdate.h>
| >
| > +enum copy_process_type {
| > + COPY_NON_IDLE_PROCESS,
| > + COPY_IDLE_PROCESS,
| > +};
| > +
| > enum pid_type
| > {
| > PIDTYPE_PID,
| > @@ -95,7 +100,7 @@ extern struct pid *FASTCALL(find_pid(int
| > extern struct pid *find_get_pid(int nr);
| > extern struct pid *find_ge_pid(int nr);
| >
| > -extern struct pid *alloc_pid(void);
| > +extern struct pid *alloc_pid(enum copy_process_type);
| > extern void FASTCALL(free_pid(struct pid *pid));
| >
| > static inline pid_t pid_to_nr(struct pid *pid)
| > Index: lx26-21-mm2/kernel/fork.c
| > ===================================================================
| > --- lx26-21-mm2.orig/kernel/fork.c 2007-05-22 16:59:41.000000000 -0700
| > +++ lx26-21-mm2/kernel/fork.c 2007-05-22 17:06:48.000000000 -0700
| > @@ -961,10 +961,11 @@ static struct task_struct *copy_process(
| > unsigned long stack_size,
| > int __user *parent_tidptr,
| > int __user *child_tidptr,
| > - struct pid *pid)
| > + enum copy_process_type copy_src)
| > {
| > int retval;
| > struct task_struct *p = NULL;
| > + struct pid *pid;
| >
| > if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
| > return ERR_PTR(-EINVAL);
| > @@ -1025,6 +1026,10 @@ static struct task_struct *copy_process(
| > if (p->binfmt && !try_module_get(p->binfmt->module))
| > goto bad_fork_cleanup_put_domain;
| >
| > + pid = alloc_pid(copy_src);
| > + if (!pid)
| > + goto bad_fork_put_binfmt_module;
| > +
| > p->did_exec = 0;
| > delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
| > copy_flags(clone_flags, p);
| > @@ -1305,6 +1310,8 @@ bad_fork_cleanup_cpuset:
| > #endif
| > cpuset_exit(p);
| > delayacct_tsk_free(p);
| > + free_pid(pid);
| > +bad_fork_put_binfmt_module:
| > if (p->binfmt)
| > module_put(p->binfmt->module);
| > bad_fork_cleanup_put_domain:
| > @@ -1331,7 +1338,7 @@ struct task_struct * __cpuinit fork_idle
| > struct pt_regs regs;
| >
| > task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL,
| > - &init_struct_pid);
| > + COPY_IDLE_PROCESS);
| > if (!IS_ERR(task))
| > init_idle(task, cpu);
| >
| > @@ -1369,19 +1376,16 @@ long do_fork(unsigned long clone_flags,
| > {
| > struct task_struct *p;
| > int trace = 0;
| > - struct pid *pid = alloc_pid();
| > long nr;
| >
| > - if (!pid)
| > - return -EAGAIN;
| > - nr = pid->nr;
| > if (unlikely(current->ptrace)) {
| > trace = fork_traceflag (clone_flags);
| > if (trace)
| > clone_flags |= CLONE_PTRACE;
| > }
| >
| > - p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
| > + p = copy_process(clone_flags, stack_start, regs, stack_size,
| > + parent_tidptr, child_tidptr, COPY_NON_IDLE_PROCESS);
| > /*
| > * Do this prior waking up the new thread - the thread pointer
| > * might get invalid after that point, if the thread exits quickly.
| > @@ -1389,6 +1393,8 @@ long do_fork(unsigned long clone_flags,
| > if (!IS_ERR(p)) {
| > struct completion vfork;
| >
| > + nr = pid_to_nr(task_pid(p));
| > +
| > if (clone_flags & CLONE_VFORK) {
| > p->vfork_done = &vfork;
| > init_completion(&vfork);
| > @@ -1422,7 +1428,6 @@ long do_fork(unsigned long clone_flags,
| > }
| > }
| > } else {
| > - free_pid(pid);
| > nr = PTR_ERR(p);
| > }
| > return nr;
| > Index: lx26-21-mm2/kernel/pid.c
| > ===================================================================
| > --- lx26-21-mm2.orig/kernel/pid.c 2007-05-22 16:59:46.000000000 -0700
| > +++ lx26-21-mm2/kernel/pid.c 2007-05-22 17:06:48.000000000 -0700
| > @@ -216,6 +216,10 @@ fastcall void free_pid(struct pid *pid)
| > /* We can be called with write_lock_irq(&tasklist_lock) held */
| > unsigned long flags;
| >
| > + /* check this here to keep copy_process() cleaner */
| > + if (unlikely(pid == &init_struct_pid))
| > + return;
| > +
| > spin_lock_irqsave(&pidmap_lock, flags);
| > hlist_del_rcu(&pid->pid_chain);
| > spin_unlock_irqrestore(&pidmap_lock, flags);
| > @@ -224,12 +228,16 @@ fastcall void free_pid(struct pid *pid)
| > call_rcu(&pid->rcu, delayed_put_pid);
| > }
| >
| > -struct pid *alloc_pid(void)
| > +struct pid *alloc_pid(enum copy_process_type copy_src)
| > {
| > struct pid *pid;
| > enum pid_type type;
| > int nr = -1;
| >
| > + /* check this here to keep copy_process() cleaner */
| > + if (unlikely(copy_src == COPY_IDLE_PROCESS))
| > + return &init_struct_pid;
| > +
| > pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL);
| > if (!pid)
| > goto out;
| > _______________________________________________
| > Containers mailing list
| > Containers@lists.linux-foundation.org
| > https://lists.linux-foundation.org/mailman/listinfo/containers
| >
| > _______________________________________________
| > Devel mailing list
| > Devel@openvz.org
| > https://openvz.org/mailman/listinfo/devel
| >
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
|
Re: [RFC][PATCH 15/16] Enable signaling child reaper from parent ns. [message #18679 is a reply to message #18646] |
Fri, 25 May 2007 23:11   |
Sukadev Bhattiprolu
Messages: 413 Registered: August 2006
|
Senior Member |
|
|
Serge E. Hallyn [serue@us.ibm.com] wrote:
| Quoting sukadev@us.ibm.com (sukadev@us.ibm.com):
| >
| > Subject: Enable signaling child reaper from parent ns.
| >
| > From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
| >
| > The reaper of a child namespace must receive signals from its parent pid
| > namespace but not receive any signals from its own namespace.
| >
| > This is a very early draft :-) and following tests seem to pass
| >
| > - Successfully kill child reaper from parent namespace (init_pid_ns)
| >
| > - Fail to kill child reaper from within its namespace (non init_pid_ns)
| >
| > - kill -1 1 from init_pid_ns seemed to work (rescanned inittab)
| >
| > TODO:
| > - Test async io and SIGIO delivery.
| >
| > - Allow any legitimate signals that the child reaper can receive
| > from within its namespace? (we block all signals now)
| >
| > - Sending SIGKILL to the child reaper of a namespace terminates the
| > namespace But if the namespace remounted /proc from user space,
| > /proc would remain mounted even after reaper and other process in
| > the namespace go away.
| >
| > Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
| > ---
| > kernel/signal.c | 22 +++++++++++++++++++++-
| > 1 file changed, 21 insertions(+), 1 deletion(-)
| >
| > Index: lx26-21-mm2/kernel/signal.c
| > ===================================================================
| > --- lx26-21-mm2.orig/kernel/signal.c 2007-05-22 16:59:42.000000000 -0700
| > +++ lx26-21-mm2/kernel/signal.c 2007-05-22 16:59:57.000000000 -0700
| > @@ -507,6 +507,20 @@ static int check_kill_permission(int sig
| > && !capable(CAP_KILL))
| > return error;
| >
| > + /*
| > + * If t is the reaper of its namespace and someone from that
| > + * namespace is trying to send a signal.
| > + *
| > + * Note: If some one from parent namespace is sending a signal,
| > + * task_child_reaper() != t and we allow the signal.
| > + *
| > + * In the child namespace, does this block even legitimate signals
| > + * like the ones telinit sends to /sbin/init ?
| > + *
| > + */
| > + if ((!is_global_init(t)) && (t == task_child_reaper(t)))
|
| Couldn't you more clearly achieve what you want by doing:
| if ((!is_global_init(t)) && (t == task_child_reaper(current)))
Yes. I think so. My current implementation of task_child_reaper() returns
reaper of current namespace, if called from within the namespace.
I still need to modify it like you mentioned in the other mail. I will
then go over this signals checks again. It was just a quick fix to allow
terminating the ns from ancestor ns.
|
| Still like you say I think you need to study more how current code does
| the right thing for the global init. Reproduce exactly that if t ==
| task_child_reaper(current), else treat like any other task. And though
| I said "reproduce", I should think you could do it without separate
| checks as you have here.
|
| -serge
|
| > + return -EPERM;
| > +
| > error = security_task_kill(t, info, sig, 0);
| > if (!error)
| > audit_signal_info(sig, t); /* Let audit system see the signal */
| > @@ -1910,7 +1924,13 @@ relock:
| > /*
| > * Init of a pid space gets no signals it doesn't want from
| > * within that pid space. It can of course get signals from
| > - * its parent pid space.
| > + * its parent pid space. But we have no way of knowing the
| > + * namespace from which the signal was sent. For now check
| > + * if we are global init here and add additional checks in
| > + * sys_kill() and friends.
| > + *
| > + * Note that t == task_child_reaper(t) implies t is the global
| > + * init (and we are in init_pid_ns).
| > */
| > if (current == task_child_reaper(current))
| > continue;
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
|
Re: [RFC][PATCH 15/16] Enable signaling child reaper from parent ns. [message #18680 is a reply to message #18676] |
Fri, 25 May 2007 23:18   |
Sukadev Bhattiprolu
Messages: 413 Registered: August 2006
|
Senior Member |
|
|
Serge E. Hallyn [serue@us.ibm.com] wrote:
| Quoting sukadev@us.ibm.com (sukadev@us.ibm.com):
| >
| > Subject: Enable signaling child reaper from parent ns.
| >
| > From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
| >
| > The reaper of a child namespace must receive signals from its parent pid
| > namespace but not receive any signals from its own namespace.
| >
| > This is a very early draft :-) and following tests seem to pass
| >
| > - Successfully kill child reaper from parent namespace (init_pid_ns)
| >
| > - Fail to kill child reaper from within its namespace (non init_pid_ns)
| >
| > - kill -1 1 from init_pid_ns seemed to work (rescanned inittab)
| >
| > TODO:
| > - Test async io and SIGIO delivery.
| >
| > - Allow any legitimate signals that the child reaper can receive
| > from within its namespace? (we block all signals now)
| >
| > - Sending SIGKILL to the child reaper of a namespace terminates the
| > namespace But if the namespace remounted /proc from user space,
| > /proc would remain mounted even after reaper and other process in
| > the namespace go away.
| >
| > Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
| > ---
| > kernel/signal.c | 22 +++++++++++++++++++++-
| > 1 file changed, 21 insertions(+), 1 deletion(-)
| >
| > Index: lx26-21-mm2/kernel/signal.c
| > ===================================================================
| > --- lx26-21-mm2.orig/kernel/signal.c 2007-05-22 16:59:42.000000000 -0700
| > +++ lx26-21-mm2/kernel/signal.c 2007-05-22 16:59:57.000000000 -0700
| > @@ -507,6 +507,20 @@ static int check_kill_permission(int sig
| > && !capable(CAP_KILL))
| > return error;
| >
| > + /*
| > + * If t is the reaper of its namespace and someone from that
| > + * namespace is trying to send a signal.
| > + *
| > + * Note: If some one from parent namespace is sending a signal,
| > + * task_child_reaper() != t and we allow the signal.
| > + *
| > + * In the child namespace, does this block even legitimate signals
| > + * like the ones telinit sends to /sbin/init ?
| > + *
| > + */
| > + if ((!is_global_init(t)) && (t == task_child_reaper(t)))
| > + return -EPERM;
|
| Ok, let's just go over the desired semantics.
|
| Current treatment of init for signals is evident at
| kernel/signal.c:get_signal_to_deliver(). There we used to check whether
| current->pid == 1, which was turned into (current == child_reaper(current)).
|
| First note on that, (current == child_reaper(current)) is correct if we
| want to treat a container init as init no matter who sends the signal,
| however I contend that if a signal is sent from an ancester namespace,
| then we want to treat the task like any other task.
I agree that we need to be able terminate the namespace from an ancestor
namespace. The container init should be special only to other process in
that namespace, its like any other process in the ancestor namespaces.
|
| Could I get some confirmation or rebuttal from Suka, Eric, Dave, or
| Pavel?
|
| Next, note where that check is done: If the task has it's own custom
| handler for a signal, then that will get called. Only if the handler is
| the default, do we check whether the target is the init task. This is
| why this patch is wrong, Suka. If init sets up a handler for USR1, then
| tasks should be able to do a kill -USR1 1.
|
| Ok, so finally here's the problem to solve. As I said above, if we are
| willing to treat a container init like init no matter who signals it,
| then we're ok with the current code. But if we want a container init to
| be treated like a normal process if signaled from an ancestor pidns,
| then we'll need to tack a struct pid or struct pid_ns pointer into
| struct siginfo. And this means taking a reference to one of those,
| which means slowing things down a touch.
I think we need the reference to struct pid in 'struct siginfo', but
again need to investigate more.
|
| thanks,
| -serge
|
|
| > +
| > error = security_task_kill(t, info, sig, 0);
| > if (!error)
| > audit_signal_info(sig, t); /* Let audit system see the signal */
| > @@ -1910,7 +1924,13 @@ relock:
| > /*
| > * Init of a pid space gets no signals it doesn't want from
| > * within that pid space. It can of course get signals from
| > - * its parent pid space.
| > + * its parent pid space. But we have no way of knowing the
| > + * namespace from which the signal was sent. For now check
| > + * if we are global init here and add additional checks in
| > + * sys_kill() and friends.
| > + *
| > + * Note that t == task_child_reaper(t) implies t is the global
| > + * init (and we are in init_pid_ns).
| > */
| > if (current == task_child_reaper(current))
| > continue;
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
|
|
|
|
Re: [RFC][PATCH 06/16] Define is_global_init() [message #18702 is a reply to message #18674] |
Tue, 29 May 2007 16:54   |
Dave Hansen
Messages: 240 Registered: October 2005
|
Senior Member |
|
|
On Fri, 2007-05-25 at 13:44 -0700, sukadev@us.ibm.com wrote:
> Dave Hansen [hansendc@us.ibm.com] wrote:
> | On Thu, 2007-05-24 at 13:24 +0400, Pavel Emelianov wrote:
> | > > | > +int is_global_init(struct task_struct *tsk)
> | > > | > +{
> | > > | > + return (task_active_pid_ns(tsk) == &init_pid_ns && tsk->pid == 1);
> | > > |
> | > > | This can OOPS if you pass arbitrary task to this call...
> | > > | tsk->nsproxy can already be NULL.
> | > >
> | > > Hmm. You are right. btw, this could be a bisect issue. Patch 9 of uses
> | > > pid_ns from pid->upid_list and removes nsproxy->pid_ns.
> | >
> | > Yes, but that patch is not good either.
> | > task_pid(tsk) may become NULL as well and this will oops.
> |
> | Have you reviewed the call paths to make sure this can actually happen
> | in practice?
>
> task_pid() can be NULL when we are tearing down the task structure in
> release_task() and in the tiny window between detach_pid() and attach_pid()
> in de_thread().
>
> I think task_pid() is safe as long as it is called for 'current'. (we should
> probably add some comments)
If we only call it for "current", then perhaps we should just change the
function so that it doesn't take any arguments. That way nobody can
screw it up.
> I will double check my code, but I think all my calls to task_pid() and hence,
> to task_active_pid_ns() are safe, except for two cases:
>
> a) is_global_init(). There are a few calls to process other than
> current, but not sure if they are a problem.
>
> For instance in current code, unhandled_signal() checks
> tsk->pid == 1 and proceeds to derefernce tsk->sighand.
>
> If task_pid() is NULL because the task was in release_task(),
> then so is tsk->sighand.
Really? Are there barriers or locks to make this happen? Can you be
sure that compiler or cpu re-ordered code will keep this true?
> b) the temporary check I added in check_kill_permissions().
> (I need to address Serge's comment here anyway).
>
> To make is_global_init() more efficient and independent of task_pid(),
> can we steal a bit from task_struct->flags ? Like PF_KSWAPD, and there
> are unused bits :-)
It feels to me like we're adding too many hacks on hacks here. Let's
define the problem, because it sounds to me like we don't even know what
it really is. What is the problem here, again?
-- Dave
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
|
Re: [RFC][PATCH 0/16] Enable cloning of pid namespace [message #18703 is a reply to message #18700] |
Tue, 29 May 2007 17:12   |
Badari Pulavarty
Messages: 15 Registered: September 2006
|
Junior Member |
|
|
On Tue, 2007-05-29 at 08:48 -0700, Dave Hansen wrote:
> On Tue, 2007-05-29 at 14:29 +0400, Pavel Emelianov wrote:
> > sukadev@us.ibm.com wrote:
> > > Dave, Serge,
> > >
> > > Here is my current pid namespace patchset. There are still a couple of
> > > issues I am investigating, but appreciate any feedback.
> > >
> > > Suka
> > >
> >
> > I tried to compile your patches with CONFIG_PID_NS=n and got this:
> >
> > CC kernel/configs.o
> > kernel/pid.c:361: warning: function declaration isn't a prototype
> > kernel/pid.c: In function `dup_struct_pid':
> > kernel/pid.c:489: warning: initialization makes pointer from integer without a cast
>
> Could you send along your actual .config?
Not needed :(
Index: linux-2.6.21/kernel/pid.c
===================================================================
--- linux-2.6.21.orig/kernel/pid.c 2007-05-25 09:06:30.000000000 -0700
+++ linux-2.6.21/kernel/pid.c 2007-05-29 10:07:39.000000000 -0700
@@ -357,7 +357,7 @@
#else
-static int alloc_pid_ns()
+static struct pid_namespace *alloc_pid_ns(void)
{
static int warned;
@@ -365,7 +365,7 @@
printk(KERN_INFO "WARNING: CLONE_NEWPID disabled\n");
warned = 1;
}
- return 0;
+ return NULL;
}
void zap_pid_ns_processes(struct pid_namespace *pid_ns)
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
|
|
Re: [RFC][PATCH 06/16] Define is_global_init() [message #18709 is a reply to message #18702] |
Wed, 30 May 2007 00:29   |
Sukadev Bhattiprolu
Messages: 413 Registered: August 2006
|
Senior Member |
|
|
Dave Hansen [hansendc@us.ibm.com] wrote:
| On Fri, 2007-05-25 at 13:44 -0700, sukadev@us.ibm.com wrote:
| > Dave Hansen [hansendc@us.ibm.com] wrote:
| > | On Thu, 2007-05-24 at 13:24 +0400, Pavel Emelianov wrote:
| > | > > | > +int is_global_init(struct task_struct *tsk)
| > | > > | > +{
| > | > > | > + return (task_active_pid_ns(tsk) == &init_pid_ns && tsk->pid == 1);
| > | > > |
| > | > > | This can OOPS if you pass arbitrary task to this call...
| > | > > | tsk->nsproxy can already be NULL.
| > | > >
| > | > > Hmm. You are right. btw, this could be a bisect issue. Patch 9 of uses
| > | > > pid_ns from pid->upid_list and removes nsproxy->pid_ns.
| > | >
| > | > Yes, but that patch is not good either.
| > | > task_pid(tsk) may become NULL as well and this will oops.
| > |
| > | Have you reviewed the call paths to make sure this can actually happen
| > | in practice?
| >
| > task_pid() can be NULL when we are tearing down the task structure in
| > release_task() and in the tiny window between detach_pid() and attach_pid()
| > in de_thread().
| >
| > I think task_pid() is safe as long as it is called for 'current'. (we should
| > probably add some comments)
|
| If we only call it for "current", then perhaps we should just change the
| function so that it doesn't take any arguments. That way nobody can
| screw it up.
Well, if the caller can confirm that the tsk passed in to task_pid() is
not exiting, then it is ok to use. We have one such usage in do_fork()
where we use the task struct we just initialized. The task has not yet
been woken up.
|
| > I will double check my code, but I think all my calls to task_pid() and hence,
| > to task_active_pid_ns() are safe, except for two cases:
| >
| > a) is_global_init(). There are a few calls to process other than
| > current, but not sure if they are a problem.
| >
| > For instance in current code, unhandled_signal() checks
| > tsk->pid == 1 and proceeds to derefernce tsk->sighand.
| >
| > If task_pid() is NULL because the task was in release_task(),
| > then so is tsk->sighand.
|
| Really? Are there barriers or locks to make this happen? Can you be
| sure that compiler or cpu re-ordered code will keep this true?
I don't understand. Here is unhandled_signal() from 2.6.21-mm2.
int unhandled_signal(struct task_struct *tsk, int sig)
{
if (is_init(tsk))
return 1;
if (tsk->ptrace & PT_PTRACED)
return 0;
return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
(tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
}
My patch changed the is_init() to is_global_init() but the theory is
that is_global_init() is not safe since it uses task_pid() which
can return NULL if @tsk is exiting.
But if @tsk is exiting, tsk->sighand will also be NULL. So either
the current callers have somehow ensured @tsk is not exiting or
they are risking accessing tsk->sighand.
Not sure how compiler/cpu-reordering changes things. Can you elaborate ?
|
| > b) the temporary check I added in check_kill_permissions().
| > (I need to address Serge's comment here anyway).
| >
| > To make is_global_init() more efficient and independent of task_pid(),
| > can we steal a bit from task_struct->flags ? Like PF_KSWAPD, and there
| > are unused bits :-)
|
| It feels to me like we're adding too many hacks on hacks here. Let's
| define the problem, because it sounds to me like we don't even know what
| it really is. What is the problem here, again?
We need an interface is_global_init() that tells us if the given process
is /sbin/init (which is the process with pid_t == 1 in init_pid_ns).
Why we need is_global_init(): to allow/deny some facilities (eg: you
cannot send arbitrary signals to the process).
How to implement is_global_init(): One simple/efficient way is to use a
bit in task->flags.
Another way is to:
- ensure tsk->pid == 1 and
- active pid ns of @tsk is init_pid_ns.
To check active pid ns of a process we currently need task_pid() which
*may* not be safe for all processes at all times.
A releated interface is is_container_init() which can be defined as
the process with pid_t == 1 in its active pid namespace.
|
| -- Dave
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
|
|
Re: [RFC][PATCH 14/16] Introduce proc_mnt for pid_ns [message #18727 is a reply to message #18725] |
Thu, 31 May 2007 19:50  |
Sukadev Bhattiprolu
Messages: 413 Registered: August 2006
|
Senior Member |
|
|
Dave Hansen [hansendc@us.ibm.com] wrote:
| On Thu, 2007-05-31 at 15:48 +0400, Pavel Emelianov wrote:
| > > +void proc_flush_task(struct task_struct *task)
| > > +{
| > > + int i;
| > > + struct pid *pid;
| > > + struct upid* upid;
| > > +
| > > + pid = task_pid(task);
| > > + if (!pid)
| > > + return;
| >
| > The code below will never be called as task flushes all his pids
| > in __unhash_process() that happens before this. Or did I miss smth?
|
| Nope, that's a very nice catch. Suka is working on a patch to fix this
| right now. We just need to keep the list of pids accessible a little
| longer, but I think Suka ran into a chicken-and-egg problem while trying
| to solve this. Suka, care to post your workaround?
Yes. the chicken and egg problem is in the order of the __unhash_process()
and proc_flush_task().
With __unhash_process() first, proc_flush_task() has no way of knowing
the pid namespaces the process belongs to. With proc_flush_task() first,
we find the process but have its dentry removed from proc. This crashes
when I run a test in a tight loop.
A quick/dirty fix is to save the pid namespace list before __unhash_process()
and use that in proc_flush_task(). That would make the exit() path quite
expensive. I am still investigating the crash and looking for a better option.
|
| -- Dave
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
|
Re: [RFC][PATCH 14/16] Introduce proc_mnt for pid_ns [message #18731 is a reply to message #18623] |
Thu, 31 May 2007 11:48  |
Pavel Emelianov
Messages: 1149 Registered: September 2006
|
Senior Member |
|
|
sukadev@us.ibm.com wrote:
> Subject: Introduce proc_mnt for pid_ns
>
> From: Dave Hansen <hansendc@us.ibm.com>
>
> The following patch completes the removal of the global proc_mnt.
> It fetches the mnt on which to do dentry invalidations from the
> pid_namespace in which the task appears.
>
> For now, there is only one pid namespace in mainline so this is
> straightforward. In the -lxc tree we'll have to do something
> more complex. The proc_flush_task() code takes a task, and
> needs to be able to find the corresponding proc superblocks on
> which that tasks's /proc/<pid> directories could appear. We
> can tell in which pid namespaces a task appears, so I put a
> pointer from the pid namespace to the corresponding proc_mnt.
>
> /proc currently has some special code to make sure that the root
> directory gets set up correctly. It proc_mnt variable in order
> to find its way to the root inode.
>
> Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
> Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
> ---
>
> fs/proc/base.c | 32 ++++++++++++++++++++++++-
> fs/proc/inode.c | 11 +++++++-
> fs/proc/root.c | 52 ++++++++++++++++++++++++++++--------------
> include/linux/pid_namespace.h | 1
> include/linux/proc_fs.h | 1
> 5 files changed, 75 insertions(+), 22 deletions(-)
>
[snip]
> @@ -2071,6 +2084,21 @@ out:
> return;
> }
>
> +void proc_flush_task(struct task_struct *task)
> +{
> + int i;
> + struct pid *pid;
> + struct upid* upid;
> +
> + pid = task_pid(task);
> + if (!pid)
> + return;
The code below will never be called as task flushes all his pids
in __unhash_process() that happens before this. Or did I miss smth?
> + upid = &pid->upid_list[0];
> + for (i = 0; i < pid->num_upids; i++, upid++)
> + proc_flush_task_from_pid_ns(task, upid->pid_ns);
> +}
> +
> static struct dentry *proc_pid_instantiate(struct inode *dir,
> struct dentry * dentry,
> struct task_struct *task, const void *ptr)
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
|
|
|
Goto Forum:
Current Time: Wed Jun 18 01:46:15 GMT 2025
Total time taken to generate the page: 0.02599 seconds
|