OpenVZ Forum


Home » Mailing lists » Devel » [RFC][PATCH 0/16] Enable cloning of pid namespace
Re: [RFC][PATCH 07/16] Move alloc_pid call to copy_process [message #18667 is a reply to message #18616] Fri, 25 May 2007 06:41 Go to previous messageGo to next message
Pavel Emelianov is currently offline  Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
sukadev@us.ibm.com wrote:
> Subject: Move alloc_pid call to copy_process
> 
> From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
> 
> Move alloc_pid() into copy_process(). This will keep all pid and pid
> namespace code together and simplify error handling when we support
> multiple pid namespaces.

I haven't found this in patches, so I ask it here:

We clone a new task with CLONE_NEWPIDS flag. This task
allocates its PIDTYPE_PID pid and this pid happens in 
booth parent and child namespace. This is OK. Then new 
task attaches PIDTYPE_SID and PIDTYPE_PGID pids from parent
task. But these ones are in parent namespace only. 

Right? Is that good?

> Changelog: 
> 	- [Eric Biederman] Move the check of copy_process_type to alloc_pid()/
> 	  free_pid() and to avoid clutter in copy_process().
> 
> Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
> ---
>  include/linux/pid.h |    7 ++++++-
>  kernel/fork.c       |   21 +++++++++++++--------
>  kernel/pid.c        |   10 +++++++++-
>  3 files changed, 28 insertions(+), 10 deletions(-)
> 
> Index: lx26-21-mm2/include/linux/pid.h
> ===================================================================
> --- lx26-21-mm2.orig/include/linux/pid.h	2007-05-22 16:59:40.000000000 -0700
> +++ lx26-21-mm2/include/linux/pid.h	2007-05-22 17:06:48.000000000 -0700
> @@ -3,6 +3,11 @@
>  
>  #include <linux/rcupdate.h>
>  
> +enum copy_process_type {
> +	COPY_NON_IDLE_PROCESS,
> +	COPY_IDLE_PROCESS,
> +};
> +
>  enum pid_type
>  {
>  	PIDTYPE_PID,
> @@ -95,7 +100,7 @@ extern struct pid *FASTCALL(find_pid(int
>  extern struct pid *find_get_pid(int nr);
>  extern struct pid *find_ge_pid(int nr);
>  
> -extern struct pid *alloc_pid(void);
> +extern struct pid *alloc_pid(enum copy_process_type);
>  extern void FASTCALL(free_pid(struct pid *pid));
>  
>  static inline pid_t pid_to_nr(struct pid *pid)
> Index: lx26-21-mm2/kernel/fork.c
> ===================================================================
> --- lx26-21-mm2.orig/kernel/fork.c	2007-05-22 16:59:41.000000000 -0700
> +++ lx26-21-mm2/kernel/fork.c	2007-05-22 17:06:48.000000000 -0700
> @@ -961,10 +961,11 @@ static struct task_struct *copy_process(
>  					unsigned long stack_size,
>  					int __user *parent_tidptr,
>  					int __user *child_tidptr,
> -					struct pid *pid)
> +					enum copy_process_type copy_src)
>  {
>  	int retval;
>  	struct task_struct *p = NULL;
> +	struct pid *pid;
>  
>  	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
>  		return ERR_PTR(-EINVAL);
> @@ -1025,6 +1026,10 @@ static struct task_struct *copy_process(
>  	if (p->binfmt && !try_module_get(p->binfmt->module))
>  		goto bad_fork_cleanup_put_domain;
>  
> +	pid = alloc_pid(copy_src);
> +	if (!pid)
> +		goto bad_fork_put_binfmt_module;
> +
>  	p->did_exec = 0;
>  	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
>  	copy_flags(clone_flags, p);
> @@ -1305,6 +1310,8 @@ bad_fork_cleanup_cpuset:
>  #endif
>  	cpuset_exit(p);
>  	delayacct_tsk_free(p);
> +	free_pid(pid);
> +bad_fork_put_binfmt_module:
>  	if (p->binfmt)
>  		module_put(p->binfmt->module);
>  bad_fork_cleanup_put_domain:
> @@ -1331,7 +1338,7 @@ struct task_struct * __cpuinit fork_idle
>  	struct pt_regs regs;
>  
>  	task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL,
> -				&init_struct_pid);
> +				COPY_IDLE_PROCESS);
>  	if (!IS_ERR(task))
>  		init_idle(task, cpu);
>  
> @@ -1369,19 +1376,16 @@ long do_fork(unsigned long clone_flags,
>  {
>  	struct task_struct *p;
>  	int trace = 0;
> -	struct pid *pid = alloc_pid();
>  	long nr;
>  
> -	if (!pid)
> -		return -EAGAIN;
> -	nr = pid->nr;
>  	if (unlikely(current->ptrace)) {
>  		trace = fork_traceflag (clone_flags);
>  		if (trace)
>  			clone_flags |= CLONE_PTRACE;
>  	}
>  
> -	p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
> +	p = copy_process(clone_flags, stack_start, regs, stack_size,
> +			parent_tidptr, child_tidptr, COPY_NON_IDLE_PROCESS);
>  	/*
>  	 * Do this prior waking up the new thread - the thread pointer
>  	 * might get invalid after that point, if the thread exits quickly.
> @@ -1389,6 +1393,8 @@ long do_fork(unsigned long clone_flags,
>  	if (!IS_ERR(p)) {
>  		struct completion vfork;
>  
> +		nr = pid_to_nr(task_pid(p));
> +
>  		if (clone_flags & CLONE_VFORK) {
>  			p->vfork_done = &vfork;
>  			init_completion(&vfork);
> @@ -1422,7 +1428,6 @@ long do_fork(unsigned long clone_flags,
>  			}
>  		}
>  	} else {
> -		free_pid(pid);
>  		nr = PTR_ERR(p);
>  	}
>  	return nr;
> Index: lx26-21-mm2/kernel/pid.c
> ===================================================================
> --- lx26-21-mm2.orig/kernel/pid.c	2007-05-22 16:59:46.000000000 -0700
> +++ lx26-21-mm2/kernel/pid.c	2007-05-22 17:06:48.000000000 -0700
> @@ -216,6 +216,10 @@ fastcall void free_pid(struct pid *pid)
>  	/* We can be called with write_lock_irq(&tasklist_lock) held */
>  	unsigned long flags;
>  
> +	/* check this here to keep copy_process() cleaner */
> +	if (unlikely(pid == &init_struct_pid))
> +		return;
> +
>  	spin_lock_irqsave(&pidmap_lock, flags);
>  	hlist_del_rcu(&pid->pid_chain);
>  	spin_unlock_irqrestore(&pidmap_lock, flags);
> @@ -224,12 +228,16 @@ fastcall void free_pid(struct pid *pid)
>  	call_rcu(&pid->rcu, delayed_put_pid);
>  }
>  
> -struct pid *alloc_pid(void)
> +struct pid *alloc_pid(enum copy_process_type copy_src)
>  {
>  	struct pid *pid;
>  	enum pid_type type;
>  	int nr = -1;
>  
> +	/* check this here to keep copy_process() cleaner */
> +	if (unlikely(copy_src == COPY_IDLE_PROCESS))
> +		return &init_struct_pid;
> +
>  	pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL);
>  	if (!pid)
>  		goto out;
> _______________________________________________
> Containers mailing list
> Containers@lists.linux-foundation.org
> https://lists.linux-foundation.org/mailman/listinfo/containers
> 
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
Re: [RFC][PATCH 06/16] Define is_global_init() [message #18674 is a reply to message #18641] Fri, 25 May 2007 20:44 Go to previous messageGo to next message
Sukadev Bhattiprolu is currently offline  Sukadev Bhattiprolu
Messages: 413
Registered: August 2006
Senior Member
Dave Hansen [hansendc@us.ibm.com] wrote:
| On Thu, 2007-05-24 at 13:24 +0400, Pavel Emelianov wrote:
| > > | > +int is_global_init(struct task_struct *tsk)
| > > | > +{
| > > | > + return (task_active_pid_ns(tsk) == &init_pid_ns && tsk->pid == 1);
| > > | 
| > > | This can OOPS if you pass arbitrary task to this call...
| > > | tsk->nsproxy can already be NULL.
| > > 
| > > Hmm. You are right. btw, this could be a bisect issue. Patch 9 of uses
| > > pid_ns from pid->upid_list and removes nsproxy->pid_ns.
| > 
| > Yes, but that patch is not good either.
| > task_pid(tsk) may become NULL as well and this will oops.
| 
| Have you reviewed the call paths to make sure this can actually happen
| in practice?

task_pid() can be NULL when we are tearing down the task structure in
release_task() and in the tiny window between detach_pid() and attach_pid()
in de_thread(). 

I think task_pid() is safe as long as it is called for 'current'. (we should
probably add some comments)

I will double check my code, but I think all my calls to task_pid() and hence,
to task_active_pid_ns() are safe, except for two cases:

        a) is_global_init(). There are a few calls to process other than
           current, but not sure if they are a problem.

           For instance in current code, unhandled_signal() checks
           tsk->pid == 1 and proceeds to derefernce tsk->sighand.

           If task_pid() is NULL because the task was in release_task(),
           then so is tsk->sighand.

        b) the temporary check I added in check_kill_permissions().
           (I need to address Serge's comment here anyway).

To make is_global_init() more efficient and independent of task_pid(),
can we steal a bit from task_struct->flags ? Like PF_KSWAPD, and there
are unused bits :-)

| 
| This just seems like another one of those racing-with-task-exit races.
| Shouldn't be too invasive to solve.

A little invasive approach for the release_task() case could be to remove
the 'struct pid' from the hash table, but leave it attached to the
'task_struct' till the 'task_struct' itself is freed.

Removing from hash table ensures no one finds this process anymore, but
keeping it attached allows those who have already found the 'task_struct'
to also use the 'struct pid' as long as they have the task_struct.

Of course, needs investigation and micro surgery.

| 
| -- Dave
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
Re: [RFC][PATCH 15/16] Enable signaling child reaper from parent ns. [message #18676 is a reply to message #18624] Fri, 25 May 2007 20:13 Go to previous messageGo to next message
serue is currently offline  serue
Messages: 750
Registered: February 2006
Senior Member
Quoting sukadev@us.ibm.com (sukadev@us.ibm.com):
> 
> Subject: Enable signaling child reaper from parent ns.
> 
> From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
> 
> The reaper of a child namespace must receive signals from its parent pid
> namespace but not receive any signals from its own namespace. 
> 
> This is a very early draft :-) and following tests seem to pass 
> 
> 	- Successfully kill child reaper from parent namespace (init_pid_ns)
> 
> 	- Fail to kill child reaper from within its namespace (non init_pid_ns)
> 
> 	- kill -1 1 from init_pid_ns seemed to work (rescanned inittab)
> 
> TODO:
> 	- Test async io and SIGIO delivery.
> 
> 	- Allow any legitimate signals that the child reaper can receive
> 	  from within its namespace? (we block all signals now)
> 
>         - Sending SIGKILL to the child reaper of a namespace terminates the
>           namespace But if the namespace remounted /proc from user space,
> 	  /proc would remain mounted even after reaper and other process in
> 	  the namespace go away.
> 
> Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
> ---
>  kernel/signal.c |   22 +++++++++++++++++++++-
>  1 file changed, 21 insertions(+), 1 deletion(-)
> 
> Index: lx26-21-mm2/kernel/signal.c
> ===================================================================
> --- lx26-21-mm2.orig/kernel/signal.c	2007-05-22 16:59:42.000000000 -0700
> +++ lx26-21-mm2/kernel/signal.c	2007-05-22 16:59:57.000000000 -0700
> @@ -507,6 +507,20 @@ static int check_kill_permission(int sig
>  	    && !capable(CAP_KILL))
>  		return error;
> 
> +	/*
> +	 * If t is the reaper of its namespace and someone from that
> +	 * namespace is trying to send a signal.
> +	 *
> +	 * Note: If some one from parent namespace is sending a signal,
> +	 *       task_child_reaper() != t and we allow the signal.
> +	 *
> +	 * In the child namespace, does this block even legitimate signals
> +	 * like the ones telinit sends to /sbin/init ?
> +	 *
> +	 */
> +	if ((!is_global_init(t)) && (t == task_child_reaper(t)))
> +		return -EPERM;

Ok, let's just go over the desired semantics.

Current treatment of init for signals is evident at
kernel/signal.c:get_signal_to_deliver().  There we used to check whether
current->pid == 1, which was turned into (current == child_reaper(current)).

First note on that, (current == child_reaper(current)) is correct if we
want to treat a container init as init no matter who sends the signal,
however I contend that if a signal is sent from an ancester namespace,
then we want to treat the task like any other task.

Could I get some confirmation or rebuttal from Suka, Eric, Dave, or
Pavel?

Next, note where that check is done: If the task has it's own custom
handler for a signal, then that will get called.  Only if the handler is
the default, do we check whether the target is the init task.  This is
why this patch is wrong, Suka.  If init sets up a handler for USR1, then
tasks should be able to do a kill -USR1 1.

Ok, so finally here's the problem to solve.  As I said above, if we are
willing to treat a container init like init no matter who signals it,
then we're ok with the current code.  But if we want a container init to
be treated like a normal process if signaled from an ancestor pidns,
then we'll need to tack a struct pid or struct pid_ns pointer into
struct siginfo.  And this means taking a reference to one of those,
which means slowing things down a touch.

thanks,
-serge


> +
>  	error = security_task_kill(t, info, sig, 0);
>  	if (!error)
>  		audit_signal_info(sig, t); /* Let audit system see the signal */
> @@ -1910,7 +1924,13 @@ relock:
>  		/*
>  		 * Init of a pid space gets no signals it doesn't want from
>  		 * within that pid space. It can of course get signals from
> -		 * its parent pid space.
> +		 * its parent pid space. But we have no way of knowing the
> +		 * namespace from which the signal was sent. For now check
> +		 * if we are global init here and add additional checks in
> +		 * sys_kill() and friends.
> +		 *
> +		 * Note that t == task_child_reaper(t) implies t is the global
> +		 * init (and we are in init_pid_ns).
>  		 */
>  		if (current == task_child_reaper(current))
>  			continue;
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
Re: [RFC][PATCH 04/16] Use pid_to_nr() in process info functions [message #18677 is a reply to message #18648] Fri, 25 May 2007 22:53 Go to previous messageGo to next message
Sukadev Bhattiprolu is currently offline  Sukadev Bhattiprolu
Messages: 413
Registered: August 2006
Senior Member
Pavel Emelianov [xemul@sw.ru] wrote:
| > ===================================================================
| > --- lx26-21-mm2.orig/kernel/timer.c	2007-05-22 16:58:38.000000000 -0700
| > +++ lx26-21-mm2/kernel/timer.c	2007-05-22 16:59:44.000000000 -0700
| > @@ -945,7 +945,7 @@ asmlinkage long sys_getppid(void)
| >  	int pid;
| >  
| >  	rcu_read_lock();
| > -	pid = rcu_dereference(current->real_parent)->tgid;
| > +	pid = pid_to_nr(task_parent_tgid(current));
| 
| This breaks fsys_getppid() call in ia64...

Good catch. I need to figure out how to make the pid_to_nr()
call in IA64 assembly and if there are other architectures
that implement in assembly.

But in general, do you see any problems with the interfaces
like task_parent_tgid() ?

| 
| >  	rcu_read_unlock();
| >  
| >  	return pid;
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
Re: [RFC][PATCH 07/16] Move alloc_pid call to copy_process [message #18678 is a reply to message #18667] Fri, 25 May 2007 23:01 Go to previous messageGo to next message
Sukadev Bhattiprolu is currently offline  Sukadev Bhattiprolu
Messages: 413
Registered: August 2006
Senior Member
Pavel Emelianov [xemul@openvz.org] wrote:
| sukadev@us.ibm.com wrote:
| > Subject: Move alloc_pid call to copy_process
| > 
| > From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
| > 
| > Move alloc_pid() into copy_process(). This will keep all pid and pid
| > namespace code together and simplify error handling when we support
| > multiple pid namespaces.
| 
| I haven't found this in patches, so I ask it here:
| 
| We clone a new task with CLONE_NEWPIDS flag. This task
| allocates its PIDTYPE_PID pid and this pid happens in 
| booth parent and child namespace. This is OK.

| Then new task attaches PIDTYPE_SID and PIDTYPE_PGID pids from
| parent task. But these ones are in parent namespace only. 
| 
| Right? Is that good?
| 

In this patch, yes, we still attach to the parent process, bc at this
point we still support only one namespace.

In the patch that actually allows creating multiple namespaces,
(Patch #11), I have the following code which makes the process
that cloned its pid ns a session and pgrp leader, just like
/sbin/init for init_pid_ns.

@@ -1255,11 +1254,17 @@ static struct task_struct *copy_process(
                        __ptrace_link(p, current->parent);

                if (thread_group_leader(p)) {
+                       struct pid *pgrp = task_pgrp(current);
+                       struct pid *session = task_session(current);
+
+                       if (clone_flags & CLONE_NEWPID)
+                               pgrp = session = pid;
+
                        p->signal->tty = current->signal->tty;
-                       p->signal->pgrp = process_group(current);
-                       set_signal_session(p->signal, process_session(current));
-                       attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
-                       attach_pid(p, PIDTYPE_SID, task_session(current));
+                       p->signal->pgrp = pid_to_nr(pgrp);
+                       set_signal_session(p->signal, pid_to_nr(session));
+                       attach_pid(p, PIDTYPE_PGID, pgrp);
+                       attach_pid(p, PIDTYPE_SID, session);

                        list_add_tail_rcu(&p->tasks, &init_task.tasks);
                        __get_cpu_var(process_counts)++;

| > Changelog: 
| > 	- [Eric Biederman] Move the check of copy_process_type to alloc_pid()/
| > 	  free_pid() and to avoid clutter in copy_process().
| > 
| > Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
| > ---
| >  include/linux/pid.h |    7 ++++++-
| >  kernel/fork.c       |   21 +++++++++++++--------
| >  kernel/pid.c        |   10 +++++++++-
| >  3 files changed, 28 insertions(+), 10 deletions(-)
| > 
| > Index: lx26-21-mm2/include/linux/pid.h
| > ===================================================================
| > --- lx26-21-mm2.orig/include/linux/pid.h	2007-05-22 16:59:40.000000000 -0700
| > +++ lx26-21-mm2/include/linux/pid.h	2007-05-22 17:06:48.000000000 -0700
| > @@ -3,6 +3,11 @@
| >  
| >  #include <linux/rcupdate.h>
| >  
| > +enum copy_process_type {
| > +	COPY_NON_IDLE_PROCESS,
| > +	COPY_IDLE_PROCESS,
| > +};
| > +
| >  enum pid_type
| >  {
| >  	PIDTYPE_PID,
| > @@ -95,7 +100,7 @@ extern struct pid *FASTCALL(find_pid(int
| >  extern struct pid *find_get_pid(int nr);
| >  extern struct pid *find_ge_pid(int nr);
| >  
| > -extern struct pid *alloc_pid(void);
| > +extern struct pid *alloc_pid(enum copy_process_type);
| >  extern void FASTCALL(free_pid(struct pid *pid));
| >  
| >  static inline pid_t pid_to_nr(struct pid *pid)
| > Index: lx26-21-mm2/kernel/fork.c
| > ===================================================================
| > --- lx26-21-mm2.orig/kernel/fork.c	2007-05-22 16:59:41.000000000 -0700
| > +++ lx26-21-mm2/kernel/fork.c	2007-05-22 17:06:48.000000000 -0700
| > @@ -961,10 +961,11 @@ static struct task_struct *copy_process(
| >  					unsigned long stack_size,
| >  					int __user *parent_tidptr,
| >  					int __user *child_tidptr,
| > -					struct pid *pid)
| > +					enum copy_process_type copy_src)
| >  {
| >  	int retval;
| >  	struct task_struct *p = NULL;
| > +	struct pid *pid;
| >  
| >  	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
| >  		return ERR_PTR(-EINVAL);
| > @@ -1025,6 +1026,10 @@ static struct task_struct *copy_process(
| >  	if (p->binfmt && !try_module_get(p->binfmt->module))
| >  		goto bad_fork_cleanup_put_domain;
| >  
| > +	pid = alloc_pid(copy_src);
| > +	if (!pid)
| > +		goto bad_fork_put_binfmt_module;
| > +
| >  	p->did_exec = 0;
| >  	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
| >  	copy_flags(clone_flags, p);
| > @@ -1305,6 +1310,8 @@ bad_fork_cleanup_cpuset:
| >  #endif
| >  	cpuset_exit(p);
| >  	delayacct_tsk_free(p);
| > +	free_pid(pid);
| > +bad_fork_put_binfmt_module:
| >  	if (p->binfmt)
| >  		module_put(p->binfmt->module);
| >  bad_fork_cleanup_put_domain:
| > @@ -1331,7 +1338,7 @@ struct task_struct * __cpuinit fork_idle
| >  	struct pt_regs regs;
| >  
| >  	task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL,
| > -				&init_struct_pid);
| > +				COPY_IDLE_PROCESS);
| >  	if (!IS_ERR(task))
| >  		init_idle(task, cpu);
| >  
| > @@ -1369,19 +1376,16 @@ long do_fork(unsigned long clone_flags,
| >  {
| >  	struct task_struct *p;
| >  	int trace = 0;
| > -	struct pid *pid = alloc_pid();
| >  	long nr;
| >  
| > -	if (!pid)
| > -		return -EAGAIN;
| > -	nr = pid->nr;
| >  	if (unlikely(current->ptrace)) {
| >  		trace = fork_traceflag (clone_flags);
| >  		if (trace)
| >  			clone_flags |= CLONE_PTRACE;
| >  	}
| >  
| > -	p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
| > +	p = copy_process(clone_flags, stack_start, regs, stack_size,
| > +			parent_tidptr, child_tidptr, COPY_NON_IDLE_PROCESS);
| >  	/*
| >  	 * Do this prior waking up the new thread - the thread pointer
| >  	 * might get invalid after that point, if the thread exits quickly.
| > @@ -1389,6 +1393,8 @@ long do_fork(unsigned long clone_flags,
| >  	if (!IS_ERR(p)) {
| >  		struct completion vfork;
| >  
| > +		nr = pid_to_nr(task_pid(p));
| > +
| >  		if (clone_flags & CLONE_VFORK) {
| >  			p->vfork_done = &vfork;
| >  			init_completion(&vfork);
| > @@ -1422,7 +1428,6 @@ long do_fork(unsigned long clone_flags,
| >  			}
| >  		}
| >  	} else {
| > -		free_pid(pid);
| >  		nr = PTR_ERR(p);
| >  	}
| >  	return nr;
| > Index: lx26-21-mm2/kernel/pid.c
| > ===================================================================
| > --- lx26-21-mm2.orig/kernel/pid.c	2007-05-22 16:59:46.000000000 -0700
| > +++ lx26-21-mm2/kernel/pid.c	2007-05-22 17:06:48.000000000 -0700
| > @@ -216,6 +216,10 @@ fastcall void free_pid(struct pid *pid)
| >  	/* We can be called with write_lock_irq(&tasklist_lock) held */
| >  	unsigned long flags;
| >  
| > +	/* check this here to keep copy_process() cleaner */
| > +	if (unlikely(pid == &init_struct_pid))
| > +		return;
| > +
| >  	spin_lock_irqsave(&pidmap_lock, flags);
| >  	hlist_del_rcu(&pid->pid_chain);
| >  	spin_unlock_irqrestore(&pidmap_lock, flags);
| > @@ -224,12 +228,16 @@ fastcall void free_pid(struct pid *pid)
| >  	call_rcu(&pid->rcu, delayed_put_pid);
| >  }
| >  
| > -struct pid *alloc_pid(void)
| > +struct pid *alloc_pid(enum copy_process_type copy_src)
| >  {
| >  	struct pid *pid;
| >  	enum pid_type type;
| >  	int nr = -1;
| >  
| > +	/* check this here to keep copy_process() cleaner */
| > +	if (unlikely(copy_src == COPY_IDLE_PROCESS))
| > +		return &init_struct_pid;
| > +
| >  	pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL);
| >  	if (!pid)
| >  		goto out;
| > _______________________________________________
| > Containers mailing list
| > Containers@lists.linux-foundation.org
| > https://lists.linux-foundation.org/mailman/listinfo/containers
| > 
| > _______________________________________________
| > Devel mailing list
| > Devel@openvz.org
| > https://openvz.org/mailman/listinfo/devel
| > 
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
Re: [RFC][PATCH 15/16] Enable signaling child reaper from parent ns. [message #18679 is a reply to message #18646] Fri, 25 May 2007 23:11 Go to previous messageGo to next message
Sukadev Bhattiprolu is currently offline  Sukadev Bhattiprolu
Messages: 413
Registered: August 2006
Senior Member
Serge E. Hallyn [serue@us.ibm.com] wrote:
| Quoting sukadev@us.ibm.com (sukadev@us.ibm.com):
| > 
| > Subject: Enable signaling child reaper from parent ns.
| > 
| > From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
| > 
| > The reaper of a child namespace must receive signals from its parent pid
| > namespace but not receive any signals from its own namespace. 
| > 
| > This is a very early draft :-) and following tests seem to pass 
| > 
| > 	- Successfully kill child reaper from parent namespace (init_pid_ns)
| > 
| > 	- Fail to kill child reaper from within its namespace (non init_pid_ns)
| > 
| > 	- kill -1 1 from init_pid_ns seemed to work (rescanned inittab)
| > 
| > TODO:
| > 	- Test async io and SIGIO delivery.
| > 
| > 	- Allow any legitimate signals that the child reaper can receive
| > 	  from within its namespace? (we block all signals now)
| > 
| >         - Sending SIGKILL to the child reaper of a namespace terminates the
| >           namespace But if the namespace remounted /proc from user space,
| > 	  /proc would remain mounted even after reaper and other process in
| > 	  the namespace go away.
| > 
| > Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
| > ---
| >  kernel/signal.c |   22 +++++++++++++++++++++-
| >  1 file changed, 21 insertions(+), 1 deletion(-)
| > 
| > Index: lx26-21-mm2/kernel/signal.c
| > ===================================================================
| > --- lx26-21-mm2.orig/kernel/signal.c	2007-05-22 16:59:42.000000000 -0700
| > +++ lx26-21-mm2/kernel/signal.c	2007-05-22 16:59:57.000000000 -0700
| > @@ -507,6 +507,20 @@ static int check_kill_permission(int sig
| >  	    && !capable(CAP_KILL))
| >  		return error;
| > 
| > +	/*
| > +	 * If t is the reaper of its namespace and someone from that
| > +	 * namespace is trying to send a signal.
| > +	 *
| > +	 * Note: If some one from parent namespace is sending a signal,
| > +	 *       task_child_reaper() != t and we allow the signal.
| > +	 *
| > +	 * In the child namespace, does this block even legitimate signals
| > +	 * like the ones telinit sends to /sbin/init ?
| > +	 *
| > +	 */
| > +	if ((!is_global_init(t)) && (t == task_child_reaper(t)))
| 
| Couldn't you more clearly achieve what you want by doing:
| 	if ((!is_global_init(t)) && (t == task_child_reaper(current)))

Yes. I think so.  My current implementation of task_child_reaper() returns
reaper of current namespace, if called from within the namespace.

I still need to modify it like you mentioned in the other mail. I will
then go over this signals checks again. It was just a quick fix to allow
terminating the ns from ancestor ns.

| 
| Still like you say I think you need to study more how current code does
| the right thing for the global init.  Reproduce exactly that if t ==
| task_child_reaper(current), else treat like any other task.  And though
| I said "reproduce", I should think you could do it without separate
| checks as you have here.
| 
| -serge
| 
| > +		return -EPERM;
| > +
| >  	error = security_task_kill(t, info, sig, 0);
| >  	if (!error)
| >  		audit_signal_info(sig, t); /* Let audit system see the signal */
| > @@ -1910,7 +1924,13 @@ relock:
| >  		/*
| >  		 * Init of a pid space gets no signals it doesn't want from
| >  		 * within that pid space. It can of course get signals from
| > -		 * its parent pid space.
| > +		 * its parent pid space. But we have no way of knowing the
| > +		 * namespace from which the signal was sent. For now check
| > +		 * if we are global init here and add additional checks in
| > +		 * sys_kill() and friends.
| > +		 *
| > +		 * Note that t == task_child_reaper(t) implies t is the global
| > +		 * init (and we are in init_pid_ns).
| >  		 */
| >  		if (current == task_child_reaper(current))
| >  			continue;
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
Re: [RFC][PATCH 15/16] Enable signaling child reaper from parent ns. [message #18680 is a reply to message #18676] Fri, 25 May 2007 23:18 Go to previous messageGo to next message
Sukadev Bhattiprolu is currently offline  Sukadev Bhattiprolu
Messages: 413
Registered: August 2006
Senior Member
Serge E. Hallyn [serue@us.ibm.com] wrote:
| Quoting sukadev@us.ibm.com (sukadev@us.ibm.com):
| > 
| > Subject: Enable signaling child reaper from parent ns.
| > 
| > From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
| > 
| > The reaper of a child namespace must receive signals from its parent pid
| > namespace but not receive any signals from its own namespace. 
| > 
| > This is a very early draft :-) and following tests seem to pass 
| > 
| > 	- Successfully kill child reaper from parent namespace (init_pid_ns)
| > 
| > 	- Fail to kill child reaper from within its namespace (non init_pid_ns)
| > 
| > 	- kill -1 1 from init_pid_ns seemed to work (rescanned inittab)
| > 
| > TODO:
| > 	- Test async io and SIGIO delivery.
| > 
| > 	- Allow any legitimate signals that the child reaper can receive
| > 	  from within its namespace? (we block all signals now)
| > 
| >         - Sending SIGKILL to the child reaper of a namespace terminates the
| >           namespace But if the namespace remounted /proc from user space,
| > 	  /proc would remain mounted even after reaper and other process in
| > 	  the namespace go away.
| > 
| > Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
| > ---
| >  kernel/signal.c |   22 +++++++++++++++++++++-
| >  1 file changed, 21 insertions(+), 1 deletion(-)
| > 
| > Index: lx26-21-mm2/kernel/signal.c
| > ===================================================================
| > --- lx26-21-mm2.orig/kernel/signal.c	2007-05-22 16:59:42.000000000 -0700
| > +++ lx26-21-mm2/kernel/signal.c	2007-05-22 16:59:57.000000000 -0700
| > @@ -507,6 +507,20 @@ static int check_kill_permission(int sig
| >  	    && !capable(CAP_KILL))
| >  		return error;
| > 
| > +	/*
| > +	 * If t is the reaper of its namespace and someone from that
| > +	 * namespace is trying to send a signal.
| > +	 *
| > +	 * Note: If some one from parent namespace is sending a signal,
| > +	 *       task_child_reaper() != t and we allow the signal.
| > +	 *
| > +	 * In the child namespace, does this block even legitimate signals
| > +	 * like the ones telinit sends to /sbin/init ?
| > +	 *
| > +	 */
| > +	if ((!is_global_init(t)) && (t == task_child_reaper(t)))
| > +		return -EPERM;
| 
| Ok, let's just go over the desired semantics.
| 
| Current treatment of init for signals is evident at
| kernel/signal.c:get_signal_to_deliver().  There we used to check whether
| current->pid == 1, which was turned into (current == child_reaper(current)).
| 
| First note on that, (current == child_reaper(current)) is correct if we
| want to treat a container init as init no matter who sends the signal,
| however I contend that if a signal is sent from an ancester namespace,
| then we want to treat the task like any other task.

I agree that we need to be able terminate the namespace from an ancestor
namespace. The container init should be special only to other process in
that namespace, its like any other process in the ancestor namespaces.

| 
| Could I get some confirmation or rebuttal from Suka, Eric, Dave, or
| Pavel?
| 
| Next, note where that check is done: If the task has it's own custom
| handler for a signal, then that will get called.  Only if the handler is
| the default, do we check whether the target is the init task.  This is
| why this patch is wrong, Suka.  If init sets up a handler for USR1, then
| tasks should be able to do a kill -USR1 1.
| 
| Ok, so finally here's the problem to solve.  As I said above, if we are
| willing to treat a container init like init no matter who signals it,
| then we're ok with the current code.  But if we want a container init to
| be treated like a normal process if signaled from an ancestor pidns,
| then we'll need to tack a struct pid or struct pid_ns pointer into
| struct siginfo.  And this means taking a reference to one of those,
| which means slowing things down a touch.

I think we need the reference to struct pid in 'struct siginfo', but
again need to investigate more.

| 
| thanks,
| -serge
| 
| 
| > +
| >  	error = security_task_kill(t, info, sig, 0);
| >  	if (!error)
| >  		audit_signal_info(sig, t); /* Let audit system see the signal */
| > @@ -1910,7 +1924,13 @@ relock:
| >  		/*
| >  		 * Init of a pid space gets no signals it doesn't want from
| >  		 * within that pid space. It can of course get signals from
| > -		 * its parent pid space.
| > +		 * its parent pid space. But we have no way of knowing the
| > +		 * namespace from which the signal was sent. For now check
| > +		 * if we are global init here and add additional checks in
| > +		 * sys_kill() and friends.
| > +		 *
| > +		 * Note that t == task_child_reaper(t) implies t is the global
| > +		 * init (and we are in init_pid_ns).
| >  		 */
| >  		if (current == task_child_reaper(current))
| >  			continue;
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
Re: [RFC][PATCH 14/16] Introduce proc_mnt for pid_ns [message #18682 is a reply to message #18642] Fri, 25 May 2007 12:27 Go to previous messageGo to next message
Pavel Emelianov is currently offline  Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
Dave Hansen wrote:
> On Thu, 2007-05-24 at 14:15 +0400, Pavel Emelianov wrote:
>>>       s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
>>> @@ -466,6 +467,7 @@ int proc_fill_super(struct super_block *
>>>       s->s_magic = PROC_SUPER_MAGIC;
>>>       s->s_op = &proc_sops;
>>>       s->s_time_gran = 1;
>>> +     s->s_fs_info = pid_ns;
>> One more thing I've just noticed - you don't get the namespace
>> here so after all the tasks die and namespace is freed we
>> have a proc mount pointing to freed namespace... 
> 
> Yep, missed reference count.  Thanks for catching this!

refcount is not the only badness. As I said you have 
race in testing superblock's namespace and setting it 
and one more: do you use one struct proc_dir_entry 
proc_root for all the superolocks?

> -- Dave
> 
> 

_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
Re: [RFC][PATCH 0/16] Enable cloning of pid namespace [message #18700 is a reply to message #18609] Tue, 29 May 2007 15:48 Go to previous messageGo to next message
Dave Hansen is currently offline  Dave Hansen
Messages: 240
Registered: October 2005
Senior Member
On Tue, 2007-05-29 at 14:29 +0400, Pavel Emelianov wrote:
> sukadev@us.ibm.com wrote:
> > Dave, Serge,
> > 
> > Here is my current pid namespace patchset. There are still a couple of
> > issues I am investigating, but appreciate any feedback.
> > 
> > Suka
> > 
> 
> I tried to compile your patches with CONFIG_PID_NS=n and got this:
> 
>   CC      kernel/configs.o
> kernel/pid.c:361: warning: function declaration isn't a prototype
> kernel/pid.c: In function `dup_struct_pid':
> kernel/pid.c:489: warning: initialization makes pointer from integer without a cast

Could you send along your actual .config?

-- Dave

_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
Re: [RFC][PATCH 15/16] Enable signaling child reaper from parent ns. [message #18701 is a reply to message #18676] Tue, 29 May 2007 16:47 Go to previous messageGo to next message
Dave Hansen is currently offline  Dave Hansen
Messages: 240
Registered: October 2005
Senior Member
On Fri, 2007-05-25 at 15:13 -0500, Serge E. Hallyn wrote:
> 
> First note on that, (current == child_reaper(current)) is correct if we
> want to treat a container init as init no matter who sends the signal,
> however I contend that if a signal is sent from an ancester namespace,
> then we want to treat the task like any other task.
> 
> Could I get some confirmation or rebuttal from Suka, Eric, Dave, or
> Pavel?

I agree with this.  Signals should only be special when you're sending
them "up the chain" to _your_ init process.  We shouldn't do anything at
all special to them when one process in a set of peers just happens to
be the init for a child namespace.  

-- Dave

_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
Re: [RFC][PATCH 06/16] Define is_global_init() [message #18702 is a reply to message #18674] Tue, 29 May 2007 16:54 Go to previous messageGo to next message
Dave Hansen is currently offline  Dave Hansen
Messages: 240
Registered: October 2005
Senior Member
On Fri, 2007-05-25 at 13:44 -0700, sukadev@us.ibm.com wrote:
> Dave Hansen [hansendc@us.ibm.com] wrote:
> | On Thu, 2007-05-24 at 13:24 +0400, Pavel Emelianov wrote:
> | > > | > +int is_global_init(struct task_struct *tsk)
> | > > | > +{
> | > > | > + return (task_active_pid_ns(tsk) == &init_pid_ns && tsk->pid == 1);
> | > > | 
> | > > | This can OOPS if you pass arbitrary task to this call...
> | > > | tsk->nsproxy can already be NULL.
> | > > 
> | > > Hmm. You are right. btw, this could be a bisect issue. Patch 9 of uses
> | > > pid_ns from pid->upid_list and removes nsproxy->pid_ns.
> | > 
> | > Yes, but that patch is not good either.
> | > task_pid(tsk) may become NULL as well and this will oops.
> | 
> | Have you reviewed the call paths to make sure this can actually happen
> | in practice?
> 
> task_pid() can be NULL when we are tearing down the task structure in
> release_task() and in the tiny window between detach_pid() and attach_pid()
> in de_thread(). 
> 
> I think task_pid() is safe as long as it is called for 'current'. (we should
> probably add some comments)

If we only call it for "current", then perhaps we should just change the
function so that it doesn't take any arguments.  That way nobody can
screw it up.

> I will double check my code, but I think all my calls to task_pid() and hence,
> to task_active_pid_ns() are safe, except for two cases:
> 
>         a) is_global_init(). There are a few calls to process other than
>            current, but not sure if they are a problem.
> 
>            For instance in current code, unhandled_signal() checks
>            tsk->pid == 1 and proceeds to derefernce tsk->sighand.
> 
>            If task_pid() is NULL because the task was in release_task(),
>            then so is tsk->sighand.

Really?  Are there barriers or locks to make this happen?  Can you be
sure that compiler or cpu re-ordered code will keep this true?

>         b) the temporary check I added in check_kill_permissions().
>            (I need to address Serge's comment here anyway).
> 
> To make is_global_init() more efficient and independent of task_pid(),
> can we steal a bit from task_struct->flags ? Like PF_KSWAPD, and there
> are unused bits :-)

It feels to me like we're adding too many hacks on hacks here.  Let's
define the problem, because it sounds to me like we don't even know what
it really is.  What is the problem here, again?

-- Dave

_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
Re: [RFC][PATCH 0/16] Enable cloning of pid namespace [message #18703 is a reply to message #18700] Tue, 29 May 2007 17:12 Go to previous messageGo to next message
Badari Pulavarty is currently offline  Badari Pulavarty
Messages: 15
Registered: September 2006
Junior Member
On Tue, 2007-05-29 at 08:48 -0700, Dave Hansen wrote:
> On Tue, 2007-05-29 at 14:29 +0400, Pavel Emelianov wrote:
> > sukadev@us.ibm.com wrote:
> > > Dave, Serge,
> > > 
> > > Here is my current pid namespace patchset. There are still a couple of
> > > issues I am investigating, but appreciate any feedback.
> > > 
> > > Suka
> > > 
> > 
> > I tried to compile your patches with CONFIG_PID_NS=n and got this:
> > 
> >   CC      kernel/configs.o
> > kernel/pid.c:361: warning: function declaration isn't a prototype
> > kernel/pid.c: In function `dup_struct_pid':
> > kernel/pid.c:489: warning: initialization makes pointer from integer without a cast
> 
> Could you send along your actual .config?

Not needed :(

Index: linux-2.6.21/kernel/pid.c
===================================================================
--- linux-2.6.21.orig/kernel/pid.c	2007-05-25 09:06:30.000000000 -0700
+++ linux-2.6.21/kernel/pid.c	2007-05-29 10:07:39.000000000 -0700
@@ -357,7 +357,7 @@
 
 #else
 
-static int alloc_pid_ns()
+static struct pid_namespace *alloc_pid_ns(void)
 {
 	static int warned;
 
@@ -365,7 +365,7 @@
 		printk(KERN_INFO "WARNING: CLONE_NEWPID disabled\n");
 		warned = 1;
 	}
-	return 0;
+	return NULL;
 }
 
 void zap_pid_ns_processes(struct pid_namespace *pid_ns)


_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
Re: [RFC][PATCH 0/16] Enable cloning of pid namespace [message #18706 is a reply to message #18609] Tue, 29 May 2007 10:29 Go to previous messageGo to next message
Pavel Emelianov is currently offline  Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
sukadev@us.ibm.com wrote:
> Dave, Serge,
> 
> Here is my current pid namespace patchset. There are still a couple of
> issues I am investigating, but appreciate any feedback.
> 
> Suka
> 

I tried to compile your patches with CONFIG_PID_NS=n and got this:

  CC      kernel/configs.o
kernel/pid.c:361: warning: function declaration isn't a prototype
kernel/pid.c: In function `dup_struct_pid':
kernel/pid.c:489: warning: initialization makes pointer from integer without a cast

Thanks,
Pavel.
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
Re: [RFC][PATCH 06/16] Define is_global_init() [message #18709 is a reply to message #18702] Wed, 30 May 2007 00:29 Go to previous messageGo to next message
Sukadev Bhattiprolu is currently offline  Sukadev Bhattiprolu
Messages: 413
Registered: August 2006
Senior Member
Dave Hansen [hansendc@us.ibm.com] wrote:
| On Fri, 2007-05-25 at 13:44 -0700, sukadev@us.ibm.com wrote:
| > Dave Hansen [hansendc@us.ibm.com] wrote:
| > | On Thu, 2007-05-24 at 13:24 +0400, Pavel Emelianov wrote:
| > | > > | > +int is_global_init(struct task_struct *tsk)
| > | > > | > +{
| > | > > | > + return (task_active_pid_ns(tsk) == &init_pid_ns && tsk->pid == 1);
| > | > > | 
| > | > > | This can OOPS if you pass arbitrary task to this call...
| > | > > | tsk->nsproxy can already be NULL.
| > | > > 
| > | > > Hmm. You are right. btw, this could be a bisect issue. Patch 9 of uses
| > | > > pid_ns from pid->upid_list and removes nsproxy->pid_ns.
| > | > 
| > | > Yes, but that patch is not good either.
| > | > task_pid(tsk) may become NULL as well and this will oops.
| > | 
| > | Have you reviewed the call paths to make sure this can actually happen
| > | in practice?
| > 
| > task_pid() can be NULL when we are tearing down the task structure in
| > release_task() and in the tiny window between detach_pid() and attach_pid()
| > in de_thread(). 
| > 
| > I think task_pid() is safe as long as it is called for 'current'. (we should
| > probably add some comments)
| 
| If we only call it for "current", then perhaps we should just change the
| function so that it doesn't take any arguments.  That way nobody can
| screw it up.

Well, if the caller can confirm that the tsk passed in to task_pid() is
not exiting, then it is ok to use. We have one such usage in do_fork()
where we use the task struct we just initialized. The task has not yet
been woken up.

| 
| > I will double check my code, but I think all my calls to task_pid() and hence,
| > to task_active_pid_ns() are safe, except for two cases:
| > 
| >         a) is_global_init(). There are a few calls to process other than
| >            current, but not sure if they are a problem.
| > 
| >            For instance in current code, unhandled_signal() checks
| >            tsk->pid == 1 and proceeds to derefernce tsk->sighand.
| > 
| >            If task_pid() is NULL because the task was in release_task(),
| >            then so is tsk->sighand.
| 
| Really?  Are there barriers or locks to make this happen?  Can you be
| sure that compiler or cpu re-ordered code will keep this true?

I don't understand. Here is unhandled_signal() from 2.6.21-mm2.

int unhandled_signal(struct task_struct *tsk, int sig)
{
        if (is_init(tsk))
                return 1;
        if (tsk->ptrace & PT_PTRACED)
                return 0;
        return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
                (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
}

My patch changed the is_init() to is_global_init() but the theory is
that is_global_init() is not safe since it uses task_pid() which
can return NULL if @tsk is exiting.

But if @tsk is exiting, tsk->sighand will also be NULL. So either
the current callers have somehow ensured @tsk is not exiting or
they are risking accessing tsk->sighand.

Not sure how compiler/cpu-reordering changes things. Can you elaborate ?

| 
| >         b) the temporary check I added in check_kill_permissions().
| >            (I need to address Serge's comment here anyway).
| > 
| > To make is_global_init() more efficient and independent of task_pid(),
| > can we steal a bit from task_struct->flags ? Like PF_KSWAPD, and there
| > are unused bits :-)
| 
| It feels to me like we're adding too many hacks on hacks here.  Let's
| define the problem, because it sounds to me like we don't even know what
| it really is.  What is the problem here, again?

We need an interface is_global_init() that tells us if the given process
is /sbin/init (which is the process with pid_t == 1 in init_pid_ns).

Why we need is_global_init(): to allow/deny some facilities (eg: you
cannot send arbitrary signals to the process).

How to implement is_global_init():  One simple/efficient way is to use a
bit in task->flags.

Another way is to:

	- ensure tsk->pid == 1 and
	- active pid ns of @tsk is init_pid_ns.

To check active pid ns of a process we currently need task_pid() which
*may* not be safe for all processes at all times.

A releated interface is is_container_init() which can be defined as
the process with pid_t == 1 in its active pid namespace.
| 
| -- Dave
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
Re: [RFC][PATCH 14/16] Introduce proc_mnt for pid_ns [message #18725 is a reply to message #18623] Thu, 31 May 2007 16:40 Go to previous messageGo to next message
Dave Hansen is currently offline  Dave Hansen
Messages: 240
Registered: October 2005
Senior Member
On Thu, 2007-05-31 at 15:48 +0400, Pavel Emelianov wrote:
> > +void proc_flush_task(struct task_struct *task)
> > +{
> > +     int i;
> > +     struct pid *pid;
> > +     struct upid* upid;
> > +
> > +     pid = task_pid(task);
> > +     if (!pid)
> > +             return;
> 
> The code below will never be called as task flushes all his pids
> in __unhash_process() that happens before this. Or did I miss smth? 

Nope, that's a very nice catch.  Suka is working on a patch to fix this
right now.  We just need to keep the list of pids accessible a little
longer, but I think Suka ran into a chicken-and-egg problem while trying
to solve this.  Suka, care to post your workaround?

-- Dave

_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
Re: [RFC][PATCH 14/16] Introduce proc_mnt for pid_ns [message #18727 is a reply to message #18725] Thu, 31 May 2007 19:50 Go to previous message
Sukadev Bhattiprolu is currently offline  Sukadev Bhattiprolu
Messages: 413
Registered: August 2006
Senior Member
Dave Hansen [hansendc@us.ibm.com] wrote:
| On Thu, 2007-05-31 at 15:48 +0400, Pavel Emelianov wrote:
| > > +void proc_flush_task(struct task_struct *task)
| > > +{
| > > +     int i;
| > > +     struct pid *pid;
| > > +     struct upid* upid;
| > > +
| > > +     pid = task_pid(task);
| > > +     if (!pid)
| > > +             return;
| > 
| > The code below will never be called as task flushes all his pids
| > in __unhash_process() that happens before this. Or did I miss smth? 
| 
| Nope, that's a very nice catch.  Suka is working on a patch to fix this
| right now.  We just need to keep the list of pids accessible a little
| longer, but I think Suka ran into a chicken-and-egg problem while trying
| to solve this.  Suka, care to post your workaround?

Yes. the chicken and egg problem is in the order of the __unhash_process()
and proc_flush_task().

With __unhash_process() first, proc_flush_task() has no way of knowing
the pid namespaces the process belongs to. With proc_flush_task() first,
we find the process but have its dentry removed from proc. This crashes
when I run a test in a tight loop.

A quick/dirty fix is to save the pid namespace list before __unhash_process()
and use that in proc_flush_task(). That would make the exit() path quite
expensive. I am still investigating the crash and looking for a better option.


| 
| -- Dave
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
Re: [RFC][PATCH 14/16] Introduce proc_mnt for pid_ns [message #18731 is a reply to message #18623] Thu, 31 May 2007 11:48 Go to previous message
Pavel Emelianov is currently offline  Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
sukadev@us.ibm.com wrote:
> Subject: Introduce proc_mnt for pid_ns
> 
> From: Dave Hansen <hansendc@us.ibm.com>
> 
> The following patch completes the removal of the global proc_mnt.
> It fetches the mnt on which to do dentry invalidations from the
> pid_namespace in which the task appears.
> 
> For now, there is only one pid namespace in mainline so this is
> straightforward.  In the -lxc tree we'll have to do something
> more complex.  The proc_flush_task() code takes a task, and
> needs to be able to find the corresponding proc superblocks on
> which that tasks's /proc/<pid> directories could appear.  We
> can tell in which pid namespaces a task appears, so I put a
> pointer from the pid namespace to the corresponding proc_mnt.
> 
> /proc currently has some special code to make sure that the root
> directory gets set up correctly.  It proc_mnt variable in order
> to find its way to the root inode.
> 
> Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
> Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
> ---
> 
>  fs/proc/base.c                |   32 ++++++++++++++++++++++++-
>  fs/proc/inode.c               |   11 +++++++-
>  fs/proc/root.c                |   52 ++++++++++++++++++++++++++++--------------
>  include/linux/pid_namespace.h |    1 
>  include/linux/proc_fs.h       |    1 
>  5 files changed, 75 insertions(+), 22 deletions(-)
> 

[snip]

> @@ -2071,6 +2084,21 @@ out:
>  	return;
>  }
>  
> +void proc_flush_task(struct task_struct *task)
> +{
> +	int i;
> +	struct pid *pid;
> +	struct upid* upid;
> +
> +	pid = task_pid(task);
> +	if (!pid)
> +		return;

The code below will never be called as task flushes all his pids
in __unhash_process() that happens before this. Or did I miss smth?

> +	upid = &pid->upid_list[0];
> +	for (i = 0; i < pid->num_upids; i++, upid++)
> +		proc_flush_task_from_pid_ns(task, upid->pid_ns);
> +}
> +
>  static struct dentry *proc_pid_instantiate(struct inode *dir,
>  					   struct dentry * dentry,
>  					   struct task_struct *task, const void *ptr)
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
Previous Topic: [PATCH 0/13] Pid namespaces (OpenVZ view)
Next Topic: [PATCH 05/10] Containers(V10): Add container_clone() interface
Goto Forum:
  


Current Time: Sun Oct 26 05:31:59 GMT 2025

Total time taken to generate the page: 0.11809 seconds