| Home » Mailing lists » Devel » [PATCH 1/4] Virtualization/containers: introduction Goto Forum:
	| 
		
			| [PATCH 1/4] Virtualization/containers: introduction [message #1331] | Mon, 06 February 2006 21:55  |  
			| 
				
				
					|  Kirill Korotaev Messages: 137
 Registered: January 2006
 | Senior Member |  |  |  
	| Hello, 
 I tried to take into account all the comments from you guys (thanks a
 lot for them!) and prepared a new version of virtualization patches. I
 will send only 4 patches today, just not to overflow everyone and keep
 it clear/tidy/possible to review.
 
 This patch introduces some abstract container kernel structure and a
 number of operations on it.
 
 The important properties of the proposed container implementation:
 - each container has unique ID in the system
 - each process in the kernel can belong to one container only
 - effective container pointer (econtainer()) is used on the task to
 avoid insertion of additional argument "container" to all functions
 where it is required.
 - kernel compilation with disabled virtualization should result in old
 good linux kernel
 
 Patches following this one will be used for virtualization of the kernel
 resources based on this container infrastructure, including those VPID
 patches I sent before. Every virtualized resource can be given separate
 config option if needed (just let me know if it is desired).
 
 Signed-Off-By: Kirill Korotaev <dev@openvz.org>
 
 Kirill
 
 P.S. I understand that this virtualization spam can be uninteresting for
 some of you, just let me know if you want to be removed from CC.
 
 P.P.S. All patches are against 2.6.16-rc2-git2 and compile when
 virtualization=n.
 
 
 --- ./include/linux/container.h.vpsinfo	2006-02-06 22:35:36.000000000 +0300
 +++ ./include/linux/container.h	2006-02-07 00:52:57.000000000 +0300
 @@ -0,0 +1,59 @@
 +#ifndef __LINUX_CONTAINER_H__
 +#define __LINUX_CONTAINER_H__
 +
 +#include <linux/config.h>
 +#include <asm/types.h>
 +#include <asm/atomic.h>
 +
 +struct task_struct;
 +
 +struct container {
 +	u32 id;
 +	struct task_struct *init_task;
 +	atomic_t refcnt;
 +};
 +
 +extern struct container init_container;
 +
 +#ifdef CONFIG_CONTAINER
 +
 +static inline struct container *get_container(struct container *cont)
 +{
 +	atomic_inc(&cont->refcnt);	/* take one more reference on cont */
 +	return cont;			/* same pointer that was passed in */
 +}
 +
 +static inline void put_container(struct container *cont)
 +{
 +	atomic_dec(&cont->refcnt);	/* drop one reference on cont */
 +}
 +
 +#include <linux/sched.h>
 +#include <asm/current.h>
 +
 +static inline struct container *set_econtainer(struct container *cont)
 +{
 +	struct container *ret;
 +
 +	ret = current->econtainer;	/* remember the previous effective container */
 +	current->econtainer = cont;	/* make cont the current task's effective container */
 +	return ret;			/* hand the previous value back to the caller */
 +}
 +
 +#define econtainer()		(current->econtainer)
 +
 +extern void init_containers(void);
 +
 +#else	/* CONFIG_CONTAINER */
 +
 +#define get_container(cont)	(NULL)
 +#define put_container(cont)	do { } while (0)
 +
 +#define set_econtainer(cont)	((struct container *)NULL)
 +#define econtainer()		(NULL)
 +
 +#define init_containers()	do { } while (0)
 +
 +#endif	/* CONFIG_CONTAINER */
 +
 +#endif
 --- ./include/linux/init_task.h.vpsinfo	2006-01-03 06:21:10.000000000 +0300
 +++ ./include/linux/init_task.h	2006-02-07 00:52:52.000000000 +0300
 @@ -3,6 +3,7 @@
 
 #include <linux/file.h>
 #include <linux/rcupdate.h>
 +#include <linux/container.h>
 
 #define INIT_FDTABLE \
 {							\
 @@ -131,5 +132,7 @@ extern struct group_info init_groups;
 LIST_HEAD_INIT(cpu_timers[2]),					\
 }
 
 +#define INIT_CONTAINER(cont)						\
 +	{ .refcnt = ATOMIC_INIT(1) }	/* braced: used as a struct container initializer */
 
 #endif
 --- ./include/linux/sched.h.vpsinfo	2006-02-06 22:15:05.000000000 +0300
 +++ ./include/linux/sched.h	2006-02-07 00:53:32.000000000 +0300
 @@ -687,6 +687,7 @@ static inline void prefetch_stack(struct
 
 struct audit_context;		/* See audit.c */
 struct mempolicy;
 +struct container;
 
 struct task_struct {
 volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 @@ -846,6 +847,11 @@ struct task_struct {
 
 struct io_context *io_context;
 
 +#ifdef CONFIG_CONTAINER
 +	struct container *container;
 +	struct container *econtainer;	/* effective container */
 +#endif
 +
 unsigned long ptrace_message;
 siginfo_t *last_siginfo; /* For ptrace use.  */
 /*
 --- ./init/main.c.vpsinfo	2006-02-06 22:15:06.000000000 +0300
 +++ ./init/main.c	2006-02-07 00:46:12.000000000 +0300
 @@ -47,6 +47,7 @@
 #include <linux/rmap.h>
 #include <linux/mempolicy.h>
 #include <linux/key.h>
 +#include <linux/container.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
 @@ -603,6 +604,8 @@ static void __init do_initcalls(void)
 */
 static void __init do_basic_setup(void)
 {
 +	init_containers();
 +
 /* drivers will send hotplug events */
 init_workqueues();
 usermodehelper_init();
 --- ./kernel/Makefile.vpsinfo	2006-02-06 22:15:06.000000000 +0300
 +++ ./kernel/Makefile	2006-02-07 00:46:12.000000000 +0300
 @@ -34,6 +34,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softl
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 +obj-$(CONFIG_CONTAINER) += container.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
 --- ./kernel/container.c.vpsinfo	2006-02-07 00:01:21.000000000 +0300
 +++ ./kernel/container.c	2006-02-07 00:46:12.000000000 +0300
 @@ -0,0 +1,16 @@
 +#include <linux/container.h>
 +
 +/*
 + * Initial container.
 + *
 + * All tasks and other resources initially belong to it
 + */
 +struct container init_container = INIT_CONTAINER(init_container);
 +
 +EXPORT_SYMBOL(init_container);
 +
 +void init_containers(void)
 +{
 +	/* remember who is init in container */
 +	init_container.init_task = child_reaper;
 +}
 --- ./kernel/exit.c.vpsinfo	2006-02-06 22:15:06.000000000 +0300
 +++ ./kernel/exit.c	2006-02-07 00:46:12.000000000 +0300
 @@ -31,6 +31,7 @@
 #include <linux/signal.h>
 #include <linux/cn_proc.h>
 #include <linux/mutex.h>
 +#include <linux/container.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 @@ -107,6 +108,7 @@ repeat:
 spin_unlock(&p->proc_lock);
 proc_pid_flush(proc_dentry);
 release_thread(p);
 +	put_container(p->container);
 put_task_struct(p);
 
 p = leader;
 --- ./kernel/fork.c.vpsinfo	2006-02-06 22:15:06.000000000 +0300
 +++ ./kernel/fork.c	2006-02-07 00:46:12.000000000 +0300
 @@ -44,6 +44,7 @@
 #include <linux/rmap.h>
 #include <linux/acct.h>
 #include <linux/cn_proc.h>
 +#include <linux/container.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 @@ -1132,6 +1133,7 @@ static task_t *copy_process(unsigned lon
 p->ioprio = current->ioprio;
 
 SET_LINKS(p);
 +	(void)get_container(p->container);
 if (unlikely(p->ptrace & PT_PTRACED))
 __ptrace_link(p, current->parent);
 |  
	|  |  |  
	| 
		
			| [PATCH 2/4] Virtualization/containers: CONFIG_CONTAINER [message #1332 is a reply to message #1331] | Mon, 06 February 2006 22:10   |  
			| 
				
				
					|  Kirill Korotaev Messages: 137
 Registered: January 2006
 | Senior Member |  |  |  
	| This patch simply adds CONFIG_CONTAINER option for virtualization/containers functionality.
 Per-resource config options can be added later if needed.
 
 Signed-Off-By: Kirill Korotaev <dev@openvz.org>
 
 Kirill
 
 --- ./arch/alpha/Kconfig.vkconfig	2006-02-06 22:14:50.000000000 +0300
 +++ ./arch/alpha/Kconfig	2006-02-06 23:26:35.000000000 +0300
 @@ -621,6 +621,8 @@ source "arch/alpha/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
 --- ./arch/arm/Kconfig.vkconfig	2006-02-06 22:14:50.000000000 +0300
 +++ ./arch/arm/Kconfig	2006-02-06 23:27:06.000000000 +0300
 @@ -794,6 +794,8 @@ source "arch/arm/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
 --- ./arch/arm26/Kconfig.vkconfig	2006-02-06 22:14:51.000000000 +0300
 +++ ./arch/arm26/Kconfig	2006-02-06 23:27:14.000000000 +0300
 @@ -232,6 +232,8 @@ source "arch/arm26/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
 --- ./arch/cris/Kconfig.vkconfig	2006-02-06 22:14:51.000000000 +0300
 +++ ./arch/cris/Kconfig	2006-02-06 23:30:51.000000000 +0300
 @@ -175,6 +175,8 @@ source "arch/cris/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
 --- ./arch/frv/Kconfig.vkconfig	2006-02-06 22:14:51.000000000 +0300
 +++ ./arch/frv/Kconfig	2006-02-06 23:30:51.000000000 +0300
 @@ -341,6 +341,8 @@ source "arch/frv/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
 --- ./arch/h8300/Kconfig.vkconfig	2006-02-06 22:14:51.000000000 +0300
 +++ ./arch/h8300/Kconfig	2006-02-06 23:30:51.000000000 +0300
 @@ -189,6 +189,8 @@ source "arch/h8300/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
 --- ./arch/i386/Kconfig.vkconfig	2006-02-06 22:15:14.000000000 +0300
 +++ ./arch/i386/Kconfig	2006-02-06 23:30:51.000000000 +0300
 @@ -1072,6 +1072,8 @@ source "arch/i386/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
 --- ./arch/ia64/Kconfig.vkconfig	2006-01-03 06:21:10.000000000 +0300
 +++ ./arch/ia64/Kconfig	2006-02-06 23:30:51.000000000 +0300
 @@ -463,4 +463,6 @@ source "arch/ia64/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 --- ./arch/m32r/Kconfig.vkconfig	2006-02-06 22:14:52.000000000 +0300
 +++ ./arch/m32r/Kconfig	2006-02-06 23:30:51.000000000 +0300
 @@ -377,6 +377,8 @@ source "arch/m32r/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
 --- ./arch/m68k/Kconfig.vkconfig	2006-02-06 22:14:52.000000000 +0300
 +++ ./arch/m68k/Kconfig	2006-02-06 23:30:51.000000000 +0300
 @@ -648,6 +648,8 @@ source "arch/m68k/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
 --- ./arch/m68knommu/Kconfig.vkconfig	2006-02-06 22:14:52.000000000 +0300
 +++ ./arch/m68knommu/Kconfig	2006-02-06 23:30:51.000000000 +0300
 @@ -644,6 +644,8 @@ source "arch/m68knommu/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
 --- ./arch/mips/Kconfig.vkconfig	2006-02-06 22:14:52.000000000 +0300
 +++ ./arch/mips/Kconfig	2006-02-06 23:30:51.000000000 +0300
 @@ -1811,6 +1811,8 @@ source "arch/mips/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
 --- ./arch/parisc/Kconfig.vkconfig	2006-02-06 22:15:14.000000000 +0300
 +++ ./arch/parisc/Kconfig	2006-02-06 23:30:51.000000000 +0300
 @@ -210,6 +210,8 @@ source "arch/parisc/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
 --- ./arch/powerpc/Kconfig.vkconfig	2006-02-06 22:14:52.000000000 +0300
 +++ ./arch/powerpc/Kconfig	2006-02-06 23:30:51.000000000 +0300
 @@ -975,4 +975,6 @@ config KEYS_COMPAT
 depends on COMPAT && KEYS
 default y
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 --- ./arch/ppc/Kconfig.vkconfig	2006-02-06 22:14:53.000000000 +0300
 +++ ./arch/ppc/Kconfig	2006-02-06 23:30:51.000000000 +0300
 @@ -1396,4 +1396,6 @@ source "arch/ppc/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 --- ./arch/s390/Kconfig.vkconfig	2006-02-06 22:14:53.000000000 +0300
 +++ ./arch/s390/Kconfig	2006-02-06 23:30:51.000000000 +0300
 @@ -470,6 +470,8 @@ source "arch/s390/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
 --- ./arch/sh/Kconfig.vkconfig	2006-02-06 22:14:53.000000000 +0300
 +++ ./arch/sh/Kconfig	2006-02-06 23:30:51.000000000 +0300
 @@ -635,6 +635,8 @@ source "arch/sh/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
 --- ./arch/sh64/Kconfig.vkconfig	2006-02-06 22:14:53.000000000 +0300
 +++ ./arch/sh64/Kconfig	2006-02-06 23:30:51.000000000 +0300
 @@ -272,6 +272,8 @@ source "arch/sh64/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
 --- ./arch/sparc/Kconfig.vkconfig	2006-02-06 22:14:53.000000000 +0300
 +++ ./arch/sparc/Kconfig	2006-02-06 23:30:51.000000000 +0300
 @@ -286,6 +286,8 @@ source "arch/sparc/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
 --- ./arch/sparc64/Kconfig.vkconfig	2006-02-06 22:14:53.000000000 +0300
 +++ ./arch/sparc64/Kconfig	2006-02-06 23:30:51.000000000 +0300
 @@ -395,6 +395,8 @@ source "arch/sparc64/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
 --- ./arch/um/Kconfig.vkconfig	2006-02-06 22:14:53.000000000 +0300
 +++ ./arch/um/Kconfig	2006-02-06 23:30:51.000000000 +0300
 @@ -292,6 +292,8 @@ source "fs/Kconfig"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
 --- ./arch/v850/Kconfig.vkconfig	2006-02-06 22:14:54.000000000 +0300
 +++ ./arch/v850/Kconfig	2006-02-06 23:30:51.000000000 +0300
 @@ -318,6 +318,8 @@ source "arch/v850/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
 --- ./arch/x86_64/Kconfig.vkconfig	2006-02-06 22:14:54.000000000 +0300
 +++ ./arch/x86_64/Kconfig	2006-02-06 23:30:51.000000000 +0300
 @@ -604,6 +604,8 @@ source "arch/x86_64/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
 --- ./arch/xtensa/Kconfig.vkconfig	2006-02-06 22:14:54.000000000 +0300
 +++ ./arch/xtensa/Kconfig	2006-02-06 23:30:51.000000000 +0300
 @@ -247,6 +247,8 @@ source "arch/xtensa/Kconfig.debug"
 
 source "security/Kconfig"
 
 +source "kernel/Kconfig.container"
 +
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
 --- /dev/null	 23:22:33.000000000 +0300
 +++ ./kernel/Kconfig.container	2006-02-06 23:22:33.000000000 +0300
 @@ -0,0 +1,11 @@
 +menu "Virtual Containers"
 +
 +config CONTAINER
 +	bool "Virtual Containers support"
 +	default n
 +	help
 +	  This option enables support of virtual linux containers,
 +	  which can be used for creation of virtual environments,
 +	  Virtual Private Servers, checkpointing, isolation and so on
 +
 +endmenu
...
 
 
 |  
	|  |  |  
	| 
		
			| [PATCH 3/4] Virtualization/containers: UID hash [message #1333 is a reply to message #1331] | Mon, 06 February 2006 22:15   |  
			| 
				
				
					|  Kirill Korotaev Messages: 137
 Registered: January 2006
 | Senior Member |  |  |  
	| This patch virtualizes UID hash, so that processes in a container can use its own UID set.
 Can be done as an option if some virtualization solutions do not require it.
 
 Signed-Off-By: Kirill Korotaev <dev@openvz.org>
 
 Kirill
 
 --- ./include/linux/container.h.uids	2006-02-06 23:46:40.000000000 +0300
 +++ ./include/linux/container.h	2006-02-07 00:05:33.000000000 +0300
 @@ -6,11 +6,14 @@
 #include <asm/atomic.h>
 
 struct task_struct;
 +struct list_head;
 
 struct container {
 u32 id;
 struct task_struct *init_task;
 atomic_t refcnt;
 +
 +	struct list_head *c_uid_hash;
 };
 
 extern struct container init_container;
 --- ./kernel/user.c.uids	2006-02-06 22:15:06.000000000 +0300
 +++ ./kernel/user.c	2006-02-06 23:58:06.000000000 +0300
 @@ -14,6 +14,7 @@
 #include <linux/bitops.h>
 #include <linux/key.h>
 #include <linux/interrupt.h>
 +#include <linux/container.h>
 
 /*
 * UID task count cache, to get fast user lookup in "alloc_uid"
 @@ -24,7 +25,12 @@
 #define UIDHASH_SZ		(1 << UIDHASH_BITS)
 #define UIDHASH_MASK		(UIDHASH_SZ - 1)
 #define __uidhashfn(uid)	(((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
 +
 +#ifdef CONFIG_CONTAINER
 +#define uidhashentry(uid)	(econtainer()->c_uid_hash + __uidhashfn((uid)))
 +#else
 #define uidhashentry(uid)	(uidhash_table + __uidhashfn((uid)))
 +#endif
 
 static kmem_cache_t *uid_cachep;
 static struct list_head uidhash_table[UIDHASH_SZ];
 @@ -200,6 +206,9 @@ static int __init uid_cache_init(void)
 
 /* Insert the root user immediately (init already runs as root) */
 spin_lock_irq(&uidhash_lock);
 +#ifdef CONFIG_CONTAINER
 +	init_container.c_uid_hash = uidhash_table;
 +#endif
 uid_hash_insert(&root_user, uidhashentry(0));
 spin_unlock_irq(&uidhash_lock);
 |  
	|  |  |  
	| 
		
			| [PATCH 4/4] Virtualization/containers: uts name [message #1334 is a reply to message #1331] | Mon, 06 February 2006 22:20   |  
			| 
				
				
					|  Kirill Korotaev Messages: 137
 Registered: January 2006
 | Senior Member |  |  |  
	| This patch virtualizes uts name. Main changes are done in container.h, uts_name.h,
 all other places are just replacement of system_utsname with uts_name.
 
 Signed-Off-By: Kirill Korotaev <dev@openvz.org>
 
 Kirill
 
 --- ./arch/alpha/kernel/osf_sys.c.utsnamex	2006-02-07 01:18:42.000000000 +0300
 +++ ./arch/alpha/kernel/osf_sys.c	2006-02-07 01:18:50.000000000 +0300
 @@ -402,15 +402,15 @@ osf_utsname(char __user *name)
 
 down_read(&uts_sem);
 error = -EFAULT;
 -	if (copy_to_user(name + 0, system_utsname.sysname, 32))
 +	if (copy_to_user(name + 0, uts_name.sysname, 32))
 goto out;
 -	if (copy_to_user(name + 32, system_utsname.nodename, 32))
 +	if (copy_to_user(name + 32, uts_name.nodename, 32))
 goto out;
 -	if (copy_to_user(name + 64, system_utsname.release, 32))
 +	if (copy_to_user(name + 64, uts_name.release, 32))
 goto out;
 -	if (copy_to_user(name + 96, system_utsname.version, 32))
 +	if (copy_to_user(name + 96, uts_name.version, 32))
 goto out;
 -	if (copy_to_user(name + 128, system_utsname.machine, 32))
 +	if (copy_to_user(name + 128, uts_name.machine, 32))
 goto out;
 
 error = 0;
 @@ -449,8 +449,8 @@ osf_getdomainname(char __user *name, int
 
 down_read(&uts_sem);
 for (i = 0; i < len; ++i) {
 -		__put_user(system_utsname.domainname[i], name + i);
 -		if (system_utsname.domainname[i] == '\0')
 +		__put_user(uts_name.domainname[i], name + i);
 +		if (uts_name.domainname[i] == '\0')
 break;
 }
 up_read(&uts_sem);
 @@ -608,11 +608,11 @@ asmlinkage long
 osf_sysinfo(int command, char __user *buf, long count)
 {
 static char * sysinfo_table[] = {
 -		system_utsname.sysname,
 -		system_utsname.nodename,
 -		system_utsname.release,
 -		system_utsname.version,
 -		system_utsname.machine,
 +		uts_name.sysname,
 +		uts_name.nodename,
 +		uts_name.release,
 +		uts_name.version,
 +		uts_name.machine,
 "alpha",	/* instruction set architecture */
 "dummy",	/* hardware serial number */
 "dummy",	/* hardware manufacturer */
 --- ./arch/i386/kernel/sys_i386.c.utsnamex	2006-02-07 01:18:42.000000000 +0300
 +++ ./arch/i386/kernel/sys_i386.c	2006-02-07 01:18:50.000000000 +0300
 @@ -217,7 +217,7 @@ asmlinkage int sys_uname(struct old_utsn
 if (!name)
 return -EFAULT;
 down_read(&uts_sem);
 -	err=copy_to_user(name, &system_utsname, sizeof (*name));
 +	err=copy_to_user(name, &uts_name, sizeof (*name));
 up_read(&uts_sem);
 return err?-EFAULT:0;
 }
 @@ -233,15 +233,15 @@ asmlinkage int sys_olduname(struct oldol
 
 down_read(&uts_sem);
 
 -	error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN);
 +	error = __copy_to_user(&name->sysname,&uts_name.sysname,__OLD_UTS_LEN);
 error |= __put_user(0,name->sysname+__OLD_UTS_LEN);
 -	error |= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN);
 +	error |= __copy_to_user(&name->nodename,&uts_name.nodename,__OLD_UTS_LEN);
 error |= __put_user(0,name->nodename+__OLD_UTS_LEN);
 -	error |= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN);
 +	error |= __copy_to_user(&name->release,&uts_name.release,__OLD_UTS_LEN);
 error |= __put_user(0,name->release+__OLD_UTS_LEN);
 -	error |= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN);
 +	error |= __copy_to_user(&name->version,&uts_name.version,__OLD_UTS_LEN);
 error |= __put_user(0,name->version+__OLD_UTS_LEN);
 -	error |= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN);
 +	error |= __copy_to_user(&name->machine,&uts_name.machine,__OLD_UTS_LEN);
 error |= __put_user(0,name->machine+__OLD_UTS_LEN);
 
 up_read(&uts_sem);
 --- ./arch/m32r/kernel/sys_m32r.c.utsnamex	2006-02-07 01:18:42.000000000 +0300
 +++ ./arch/m32r/kernel/sys_m32r.c	2006-02-07 01:18:50.000000000 +0300
 @@ -199,7 +199,7 @@ asmlinkage int sys_uname(struct old_utsn
 if (!name)
 return -EFAULT;
 down_read(&uts_sem);
 -	err=copy_to_user(name, &system_utsname, sizeof (*name));
 +	err=copy_to_user(name, &uts_name, sizeof (*name));
 up_read(&uts_sem);
 return err?-EFAULT:0;
 }
 --- ./arch/mips/kernel/linux32.c.utsnamex	2006-02-07 01:18:42.000000000 +0300
 +++ ./arch/mips/kernel/linux32.c	2006-02-07 01:18:50.000000000 +0300
 @@ -1150,7 +1150,7 @@ asmlinkage long sys32_newuname(struct ne
 int ret = 0;
 
 down_read(&uts_sem);
 -	if (copy_to_user(name,&system_utsname,sizeof *name))
 +	if (copy_to_user(name,&uts_name,sizeof *name))
 ret = -EFAULT;
 up_read(&uts_sem);
 
 --- ./arch/mips/kernel/syscall.c.utsnamex	2006-02-07 01:18:42.000000000 +0300
 +++ ./arch/mips/kernel/syscall.c	2006-02-07 01:18:50.000000000 +0300
 @@ -229,7 +229,7 @@ out:
 */
 asmlinkage int sys_uname(struct old_utsname * name)
 {
 -	if (name && !copy_to_user(name, &system_utsname, sizeof (*name)))
 +	if (name && !copy_to_user(name, &uts_name, sizeof (*name)))
 return 0;
 return -EFAULT;
 }
 @@ -246,15 +246,15 @@ asmlinkage int sys_olduname(struct oldol
 if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
 return -EFAULT;
 
 -	error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN);
 +	error = __copy_to_user(&name->sysname,&uts_name.sysname,__OLD_UTS_LEN);
 error -= __put_user(0,name->sysname+__OLD_UTS_LEN);
 -	error -= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN);
 +	error -= __copy_to_user(&name->nodename,&uts_name.nodename,__OLD_UTS_LEN);
 error -= __put_user(0,name->nodename+__OLD_UTS_LEN);
 -	error -= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN);
 +	error -= __copy_to_user(&name->release,&uts_name.release,__OLD_UTS_LEN);
 error -= __put_user(0,name->release+__OLD_UTS_LEN);
 -	error -= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN);
 +	error -= __copy_to_user(&name->version,&uts_name.version,__OLD_UTS_LEN);
 error -= __put_user(0,name->version+__OLD_UTS_LEN);
 -	error -= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN);
 +	error -= __copy_to_user(&name->machine,&uts_name.machine,__OLD_UTS_LEN);
 error = __put_user(0,name->machine+__OLD_UTS_LEN);
 error = error ? -EFAULT : 0;
 
 @@ -290,10 +290,10 @@ asmlinkage int _sys_sysmips(int cmd, lon
 return -EFAULT;
 
 down_write(&uts_sem);
 -		strncpy(system_utsname.nodename, nodename, len);
 +		strncpy(uts_name.nodename, nodename, len);
 nodename[__NEW_UTS_LEN] = '\0';
 -		strlcpy(system_utsname.nodename, nodename,
 -		        sizeof(system_utsname.nodename));
 +		strlcpy(uts_name.nodename, nodename,
 +		        sizeof(uts_name.nodename));
 up_write(&uts_sem);
 return 0;
 }
 --- ./arch/mips/kernel/sysirix.c.utsnamex	2006-02-07 01:18:42.000000000 +0300
 +++ ./arch/mips/kernel/sysirix.c	2006-02-07 01:18:50.000000000 +0300
 @@ -904,7 +904,7 @@ asmlinkage int irix_getdomainname(char _
 down_read(&uts_sem);
 if (len > __NEW_UTS_LEN)
 len = __NEW_UTS_LEN;
 -	err = copy_to_user(name, system_utsname.domainname, len) ? -EFAULT : 0;
 +	err = copy_to_user(name, uts_name.domainname, len) ? -EFAULT : 0;
 up_read(&uts_sem);
 
 return err;
 @@ -1147,11 +1147,11 @@ struct iuname {
 asmlinkage int irix_uname(struct iuname __user *buf)
 {
 down_read(&uts_sem);
 -	if (copy_from_user(system_utsname.sysname, buf->sysname, 65)
 -	    || copy_from_user(system_utsname.nodename, buf->nodename, 65)
 -	    || copy_from_user(system_utsname.release, buf->release, 65)
 -	    || copy_from_user(system_utsname.version, buf->version, 65)
 -	    || copy_from_user(system_utsname.machine, buf->machine, 65)) {
 +	if (copy_from_user(uts_name.sysname, buf->sysname, 65)
 +	    || copy_from_user(uts_name.nodename, buf->nodename, 65)
 +	    || copy_from_user(uts_name.release, buf->release, 65)
 +	    || copy_from_user(uts_name.version, buf->version, 65)
 +	    || copy_from_user(uts_name.machine, buf->machine, 65)) {
 return -EFAULT;
 }
 up_read(&uts_sem);
 --- ./arch/parisc/hpux/sys_hpux.c.utsnamex	2006-02-07 01:18:42.000000000 +0300
 +++ ./arch/parisc/hpux/sys_hpux.c	2006-02-07 01:18:50.000000000 +0300
 @@ -266,15 +266,15 @@ static int hpux_uname(struct hpux_utsnam
 
 down_read(&uts_sem);
 
 -	error = __copy_to_user(&name->sysname,&system_utsname.sysname,HPUX_UTSLEN-1);
 +	error = __copy_to_user(&name->sysname,&uts_name.sysname,HPUX_UTSLEN-1);
 error |= __put_user(0,name->sysname+HPUX_UTSLEN-1);
 -	error |= __copy_to_user(&name->nodename,&system_utsname.nodename,HPUX_UTSLEN-1);
 +	error |= __copy_to_user(&name->nodename,&uts_name.nodename,HPUX_UTSLEN-1);
 error |= __put_user(0,name->nodename+HPUX_UTSLEN-1);
 -	error |= __copy_to_user(&name->release,&system_utsname.release,HPUX_UTSLEN-1);
 +	error |= __copy_to_user(&name->release,&uts_name.release,HPUX_UTSLEN-1);
 error |= __put_user(0,name->release+HPUX_UTSLEN-1);
 -	error |= __copy_to_user(&name->version,&system_utsname.version,HPUX_UTSLEN-1);
 +	error |= __copy_to_user(&name->version,&uts_name.version,HPUX_UTSLEN-1);
 error |= __put_user(0,name->version+HPUX_UTSLEN-1);
 -	error |= __copy_to_user(&name->machine,&system_utsname.machine,HPUX_UTSLEN-1);
 +	error |= __copy_to_user(&name->machine,&uts_name.machine,HPUX_UTSLEN-1);
 error |= __put_user(0,name->machine+HPUX_UTSLEN-1);
 
 up_read(&uts_sem);
 @@ -373,8 +373,8 @@ int hpux_utssys(char *ubuf, int n, int t
 /*  TODO:  print a warning about using this?  */
 down_write(&uts_sem);
 error = -EFAULT;
 -		if (!copy_from_user(system_utsname.sysname, ubuf, len)) {
 -			system_utsname.sysname[len] = 0;
 +		if (!copy_from_user(uts_name.sysname, ubuf, len)) {
 +			uts_name.sysname[len] = 0;
 error = 0;
 }
 up_write(&uts_sem);
 @@ -400,8 +4
...
 
 
 |  
	|  |  |  
	| 
		
			| Re: [PATCH 1/4] Virtualization/containers: introduction [message #1337 is a reply to message #1331] | Mon, 06 February 2006 23:00   |  
			| 
				
				
					|  Dave Hansen Messages: 240
 Registered: October 2005
 | Senior Member |  |  |  
	| On Tue, 2006-02-07 at 00:57 +0300, Kirill Korotaev wrote: > @@ -1132,6 +1133,7 @@ static task_t *copy_process(unsigned lon
 >         p->ioprio = current->ioprio;
 >
 >         SET_LINKS(p);
 > +       (void)get_container(p->container);
 >         if (unlikely(p->ptrace & PT_PTRACED))
 >                 __ptrace_link(p, current->parent);
 
 This entire patch looks nice and very straightforward, except for this
 bit. :)  The "(void)" bit isn't usual kernel coding style.  You can
 probably kill it.
 
 BTW, why does get_container() return the container argument?
 get_task_struct(), for instance is just a do{}while(0) loop, so it
 doesn't have a return value.  Is there some magic later on in your patch
 set that utilizes this?
 
 One other really minor thing: I usually try to do is keep the !
 CONFIG_FOO functions static inlines, just like the full versions.  The
 advantage is that you get some compile-time type checking, even when
 your CONFIG option is off.
 
 -- Dave
 |  
	|  |  |  
	|  |  
	|  |  
	| 
		
			| Re: [PATCH 1/4] Virtualization/containers: introduction [message #1354 is a reply to message #1343] | Tue, 07 February 2006 06:30   |  
			| 
				
				
					|  Sam Vilain Messages: 73
 Registered: February 2006
 | Member |  |  |  
	| Rik van Riel wrote: > On Mon, 6 Feb 2006, Eric W. Biederman wrote:
 >
 >
 >>We are never going to form a consensus if all of the people doing
 >>implementations don't talk.
 >
 >
 > Speaking of which - it would be interesting to get Kirill's
 > comments on Eric's patchset ;)
 >
 > Once we know what's good and bad about both patchsets, we'll
 > be a lot closer to knowing what exactly should go upstream.
 
 Let's compare approaches of patchsets before the patchsets themselves.
 
 It seems to be, should we:
 
 A) make a general form of virtualising PIDs, and hope this assists
 later virtualisation efforts (Eric's patch)
 
 B) make a general form of containers/jails/vservers/vpses, and layer
 PID virtualisation on top of it somewhere (as in openvz, vserver)
 
 I can't think of any real use cases where you would specifically want A)
 without B).
 
 Also, the problem space in B) is now very well explored.  To start with
 A) would mean to throw away 4+ years of experience at this approach
 (just counting vserver and variants - not FreeBSD Jail, etc).  Trying to
 re-base B) atop a massive refactoring and new patch like A) would incur
 a lot of work; however fitting it into B) is natural and solved
 conceptually and in practice, with the only drawback I see being that
 the use cases mentioned above wouldn't suffer from the side-effects of
 B).
 
 Perhaps I'm wrong there, but that's my gut feeling.
 
 Sam.
 |  
	|  |  |  
	|  |  
	|  |  
	| 
		
			| Re: [PATCH 1/4] Virtualization/containers: introduction [message #1368 is a reply to message #1337] | Tue, 07 February 2006 12:22   |  
			| 
				
				
					|  dev Messages: 1693
 Registered: September 2005
 Location: Moscow
 | Senior Member |  
 |  |  
	| >>@@ -1132,6 +1133,7 @@ static task_t *copy_process(unsigned lon >>        p->ioprio = current->ioprio;
 >>
 >>        SET_LINKS(p);
 >>+       (void)get_container(p->container);
 >>        if (unlikely(p->ptrace & PT_PTRACED))
 >>                __ptrace_link(p, current->parent);
 >
 >
 > This entire patch looks nice and very straightforward, except for this
 > bit. :)  The "(void)" bit isn't usual kernel coding style.  You can
 > probably kill it.
 it is to avoid warning message the value has no effect.
 
 > BTW, why does get_container() return the container argument?
 > get_task_struct(), for instance is just a do{}while(0) loop, so it
 > doesn't have a return value.  Is there some magic later on in your patch
 > set that utilizes this?
 ok, I will remake it without a return value. not a real problem at all.
 
 > One other really minor thing: I usually try to do is keep the !
 > CONFIG_FOO functions static inlines, just like the full versions.  The
 > advantage is that you get some compile-time type checking, even when
 > your CONFIG option is off.
 it is not always appropriate :( I try to follow this as well :)
 
 Kirill
 |  
	|  |  |  
	|  |  
	|  |  
	|  |  
	|  |  
	| 
		
			| Re: [PATCH 1/4] Virtualization/containers: introduction [message #1384 is a reply to message #1354] | Tue, 07 February 2006 15:42   |  
			| 
				
				
					|  ebiederm Messages: 1354
 Registered: February 2006
 | Senior Member |  |  |  
	| Sam Vilain <sam@vilain.net> writes: 
 > Rik van Riel wrote:
 >> On Mon, 6 Feb 2006, Eric W. Biederman wrote:
 >>
 >>> We are never going to form a consensus if all of the people doing
 >>> implementations don't talk.
 >> Speaking of which - it would be interesting to get Kirill's
 >> comments on Eric's patchset ;)
 >> Once we know what's good and bad about both patchsets, we'll
 >> be a lot closer to knowing what exactly should go upstream.
 >
 > Let's compare approaches of patchsets before the patchsets themselves.
 >
 > It seems to be, should we:
 >
 >    A) make a general form of virtualising PIDs, and hope this assists
 >       later virtualisation efforts (Eric's patch)
 >
 >    B) make a general form of containers/jails/vservers/vpses, and layer
 >       PID virtualisation on top of it somewhere (as in openvz, vserver)
 >
 > I can't think of any real use cases where you would specifically want A)
 > without B).
 
 
 You misrepresent my approach.
 
 First there is a huge commonality in the code bases between the
 different implementations and I have already gotten preliminary
 acceptance from the vserver developers, that my approach is sane.  The
 major difference is what user interface does the kernel export,
 and I posted my user interface.
 
 What user interface to export is a debate worth having.
 
 For a lot of things getting the details just so is very important
 to long term maintainability and it is not my impression that anyone
 has done that yet.
 
 
 Second I am not trying to just implement a form of virtualizing PIDs.
 Heck I don't intend to virtualize anything.  The kernel has already
 virtualized everything I require.  I want to implement multiple
 instances of the current kernel global namespaces.  All I want is
 to be able to use the same name twice in user space and not have
 a conflict.
 
 
 Beyond getting multiple instance of all of the kernel namespaces
 (which is the hard requirement for migration) my approach is to
 see what is needed for projects like vserver and vps and see how
 their needs can be met as well.
 
 
 I disagree with a struct container simply because I do not see what
 value it happens to bring to the table.  I have yet to see a problem
 that it solves that I have not solved yet.
 
 
 In addition I depart from vserver and other implementations in another
 regard.  It is my impression a lot of their work has been done so
 those projects are maintainable outside of the kernel, which makes
 sense as that is where those code bases live.  But I don't think that
 gives the best solution for an in kernel implementation, which is
 what we are implementing.
 
 
 So far I have succeeded in communicating with both the IBM and
 vserver developers.  Hopefully I can do the same with Kirill Korotaev
 and the OpenVz team.   I think my implementation stands up to
 criticism.  But expect surprises in the way I solve a number of
 problems.
 
 I suspect I will find similar surprises in the OpenVz code.
 
 Time to do some more research I guess.
 
 Eric
 |  
	|  |  |  
	|  |  
	| 
		
			| Re: [PATCH 1/4] Virtualization/containers: introduction [message #1386 is a reply to message #1354] | Tue, 07 February 2006 16:57   |  
			| 
				
				
					|  Hubertus Franke Messages: 16
 Registered: February 2006
 | Junior Member |  |  |  
	| Sam Vilain wrote: > Rik van Riel wrote:
 >
 >> On Mon, 6 Feb 2006, Eric W. Biederman wrote:
 >>
 >>
 >>> We are never going to form a consensus if all of the people doing
 >>> implementations don't talk.
 >>
 >>
 >> Speaking of which - it would be interesting to get Kirill's
 >> comments on Eric's patchset ;)
 >>
 >> Once we know what's good and bad about both patchsets, we'll
 >> be a lot closer to knowing what exactly should go upstream.
 >
 >
 > Let's compare approaches of patchsets before the patchsets themselves.
 >
 > It seems to be, should we:
 >
 >   A) make a general form of virtualising PIDs, and hope this assists
 >      later virtualisation efforts (Eric's patch)
 >
 >   B) make a general form of containers/jails/vservers/vpses, and layer
 >      PID virtualisation on top of it somewhere (as in openvz, vserver)
 >
 > I can't think of any real use cases where you would specifically want A)
 > without B).
 >
 > Also, the problem space in B) is now very well explored.  To start with
 > A) would mean to throw away 4+ years of experience at this approach
 > (just counting vserver and variants - not FreeBSD Jail, etc).  Trying to
 > re-base B) atop a massive refactoring and new patch like A) would incur
 > a lot of work; however fitting it into B) is natural and solved
 > conceptually and in practice, with the only drawback I see being that
 > the use cases mentioned above wouldn't suffer from the side-effects of
 > B).
 >
 Sam, that is a bit far fetched. I looked and experienced myself with both
 approaches and there is a lot of functional overlap, with both of them
 having advantages and disadvantages.
 What Eric provides is an alternative to the PID virtualization part of openvz.
 Indeed it is a pid isolation more than anything else (with some dealing at
 the boundary condition).
 I personally don't see much problem in replacing the pid virtualization of
 openvz with that of pidspaces.
 So the correct thing to do here is, as RvR points out, simply to discuss the
 merits and drawbacks of each PID approach for now and settle on one and
 move on....
 
 Here are my two cents on this.
 
 The pid-namespace (pspace) provides an approach of fully separating
 the allocation and maintenance of the pids and treating the <pspace,pid>
 tuple as an entity to uniquely identify a task and vice versa.
 As a result the logic of lookup can be pushed down the find_task_by_pid()
 lookup. There are specific cases where the init_task of a container or
 pspace needs to be checked to ensure that signals/waits and the like are properly
 handled across pspace boundaries. I think this is an intuitive and clean way.
 It also completely avoids the problem of having to think about all the locations
 at the user/kernel boundary where a vpid/pid conversion needs to take place.
 It also avoids the problems that logically vpids and pids are different types and
 therefore it would have been good to have type checking automatically identify
 problem spots.
 On the negative side, it does require to maintain a pidmap per pidspace.
 
 The vpid approach has the drawbacks of having to identify the conversion spots
 of all vpid vs. pid semantics. On the other hand it does take advantage
 of the fact that no virtualization has to take place until a "container"
 has been migrated, thus rendering most of the vpid<->pid calls to be
 noops.
 
 What I like about the pspace approach is that it explicitly defines in the code
 when I am using a different pspace for the lookup. That is kind of hidden in
 the vpid/pid approach.
 
 The container is just an umbrella object that ties every "virtualized" subsystem
 together.
 
 So, what do other folks see as the pluses and minuses of each approach?
 Once we have a more complete listing of these, maybe the decision becomes
 more obvious !
 
 Regards
 -- Hubertus
 |  
	|  |  |  
	|  |  
	| 
		
			| Re: [PATCH 1/4] Virtualization/containers: introduction [message #1391 is a reply to message #1386] | Tue, 07 February 2006 20:19   |  
			| 
				
				
					|  serue Messages: 750
 Registered: February 2006
 | Senior Member |  |  |  
	| Quoting Hubertus Franke (frankeh@watson.ibm.com): > The vpid approach has the drawbacks of having to identify the conversion
 > spots
 > of all vpid vs. pid semantics. On the otherhand it does take advantage
 > of the fact that no virtualization has to take place until a "container"
 > has been migrated, thus rendering most of the vpid<->pid calls to be
 > noops.
 >
 > What I like about the pspace approach is that it explicitely defines in the
 > code
 > when I am using a different pspace for the lookup. That is kind of hidden in
 > the vpid/pid approach.
 
 I agree with this.  From a maintenance pov, imagining making a minor
 change to some pid-related code, if I see something doing effectively
 "if (pspace1 == pspace2 && pid1==pid2)" that is clear, whereas trying
 to remember whether I'm supposed to return the pid or vpid can get
 really confusing.  We actually had some errors with that while we were
 developing the first vpid patchset we posted in december.
 
 I believe that from a vserver point of view, either approach will work.
 You either create a new pspace and make 'init' pid 1 in that pspace, or,
 in the openvz approach, you start virtualizing with a hashtable so
 userspace in the new vserver/container/vz/whateveritscalled sees that
 init as pid1, while the rest of the system sees it as pid 3270 or
 something.
 
 Likewise, for checkpoint/restore and migration, either approach works.
 All we really need is, on restore/migrate, to be able to create
 processes with their original pids, so we can do that either with real
 pids in a new container, or virtualized pids faked for process in the
 same vz.
 
 Are there other uses of pid virtualization which one approach or the
 other cannot accommodate?
 
 If not, then I for one lean towards the more maintainable code.  (which
 I'm sure we're not agreed on is Eric's, but imvho it is)
 
 -serge
 |  
	|  |  |  
	| 
		
			| Re: [PATCH 1/4] Virtualization/containers: introduction [message #1392 is a reply to message #1391] | Tue, 07 February 2006 20:46   |  
			| 
				
				
					|  Hubertus Franke Messages: 16
 Registered: February 2006
 | Junior Member |  |  |  
	| Serge E. Hallyn wrote: > Quoting Hubertus Franke (frankeh@watson.ibm.com):
 >
 >>The vpid approach has the drawbacks of having to identify the conversion
 >>spots
 >>of all vpid vs. pid semantics. On the otherhand it does take advantage
 >>of the fact that no virtualization has to take place until a "container"
 >>has been migrated, thus rendering most of the vpid<->pid calls to be
 >>noops.
 >>
 >>What I like about the pspace approach is that it explicitely defines in the
 >>code
 >>when I am using a different pspace for the lookup. That is kind of hidden in
 >>the vpid/pid approach.
 >
 >
 > I agree with this.  From a maintenance pov, imagining making a minor
 > change to some pid-related code, if I see something doing effectively
 > "if (pspace1 == pspace2 && pid1==pid2)" that is clear, whereas trying
 > to remember whether I'm supposed to return the pid or vpid can get
 > really confusing.  We actually had some errors with that while we were
 > developing the first vpid patchset we posted in december.
 >
 > I believe that from a vserver point of view, either approach will work.
 > You either create a new pspace and make 'init' pid 1 in that pspace, or,
 > in the openvz approach, you start virtualizing with a hashtable so
 > userspace in the new vserver/container/vz/whateveritscalled sees that
 > init as pid1, while the rest of the system sees it as pid 3270 or
 > something.
 >
 > Likewise, for checkpoint/restore and migration, either approach works.
 > All we really need is, on restore/migrate, to be able to create
 > processes with their original pids, so we can do that either with real
 > pids in a new container, or virtualized pids faked for process in the
 > same vz.
 >
 > Are there other uses of pid virtualization which one approach or the
 > other cannot accomodate?
 
 Kirill brought up that VPS can span a cluster..
 if so how do you (Kirill) do that? You pre-partition the pids into allocation
 ranges for each container?
 Either way, if this is an important feature, then one needs to look at
 how that is achieved in pspace (e.g. mod the pidmap_alloc() function
 to take legal ranges into account). Should still be straight forward.
 
 >
 > If not, then I for one lean towards the more maintainable code.  (which
 > I'm sure we're not agreed on is Eric's, but imvho it is)
 >
 > -serge
 >
 |  
	|  |  |  
	|  |  
	|  |  
	|  |  
	| 
		
			| Re: [PATCH 1/4] Virtualization/containers: introduction [message #1400 is a reply to message #1384] | Tue, 07 February 2006 22:43   |  
			| 
				
				
					|  Sam Vilain Messages: 73
 Registered: February 2006
 | Member |  |  |  
	| Eric W. Biederman wrote [note: quoting sections out of order]: > Sam Vilain <sam@vilain.net> writes:
 >>Let's compare approaches of patchsets before the patchsets themselves.
 >>It seems to be, should we:
 >>   A) make a general form of virtualising PIDs, and hope this assists
 >>      later virtualisation efforts (Eric's patch)
 >>I can't think of any real use cases where you would specifically want A)
 >>without B).
 > You misrepresent my approach.
 
 ok, after reading more of your post, agreed.
 
 > What user interface to export is a debate worth having.
 
 This is the bit that needs a long period of prototyping and experimental
 use IMHO.  So in essence, we're agreeing on that point.
 
 > First there is a huge commonality in the code bases between the
 > different implementations and I have already gotten preliminary
 > acceptance from the vserver developers, that my approach is sane.  The
 > major difference is what user interface does the kernel export,
 > and I posted my user interface.
 > Second I am not trying to just implement a form of virtualizing PIDs.
 > Heck I don't intend to virtualize anything.  The kernel has already
 > virtualized everything I require.  I want to implement multiple
 > instances of the current kernel global namespaces.  All I want is
 > to be able to use the same name twice in user space and not have
 > a conflict.
 
 Right, well, I think our approaches might have more in common than
 I previously thought.
 
 Indeed, it seems that at least one of the features of Linux-VServer I am
 preparing for consideration for inclusion into Linus' tree are your work
 :-).
 
 > Beyond getting multiple instance of all of the kernel namespaces
 > (which is the hard requirement for migration) my approach is to
 > see what is needed for projects like vserver and vps and see how
 > their needs can be met as well.
 
 ok, but the question is - doesn't this just constitute a refactoring
 once the stable virtualisation code is in?
 
 I'm just a bit nervous about trying to
 refactor-approach-and-concepts-as-we-go.
 
 But look, I'll take a closer look at your patches, and see if I can
 merge with you anyhow.  Thanks for the git repo!
 
 Sam.
 |  
	|  |  |  
	|  |  
	|  |  
	| 
		
			| Re: The issues for agreeing on a virtualization/namespaces implementation. [message #1403 is a reply to message #1396] | Tue, 07 February 2006 23:35   |  
			| 
				
				
					|  Hubertus Franke Messages: 16
 Registered: February 2006
 | Junior Member |  |  |  
	| Eric W. Biederman wrote: > I think I can boil the discussion down into some of the fundamental
 > questions that we are facing.
 >
 Man, I can barely keep up with this email load. I addressed some in
 previous thread, but will reiterate under this context.
 
 > Currently everyone seems to agree that we need something like
 > my namespace concept that isolates multiple resources.
 >
 > We need these for
 > PIDS
 > UIDS
 > SYSVIPC
 > NETWORK
 > UTSNAME
 > FILESYSTEM
 > etc.
 >
 > The questions seem to break down into:
 > 1) Where do we put the references to the different namespaces?
 >    - Do we put the references in a struct container that we reference from struct task_struct?
 >    - Do we put the references directly in struct task_struct?
 
 You "cache"   task_struct->container->hotsubsys   under task_struct->hotsubsys.
 We don't change containers other than at clone time, so there is no coherency issue here!
 Which subsystem pointers to "cache" should be agreed upon by the experts,
 but the first approach should always be not to cache and to go through the container.
 
 >
 > 2) What is the syscall interface to create these namespaces?
 >    - Do we add clone flags?
 >      (Plan 9 style)
 
 Like that approach .. flexible .. particularly when one has well specified namespaces.
 
 >    - Do we add a syscall (similar to setsid) per namespace?
 >      (Traditional unix style)?
 
 Where does that approach end .. what's wrong with doing it at clone() time ?
 Mainly the naming issue. Just providing a flag does not give me name.
 
 >    - Do we in addition add syscalls to manipulate containers generically?
 >
 >    I don't think having a single system call to create a container and a new
 >    instance of each namespace is reasonable as that does not give us a
 >    path into the future when we create yet another namespace.
 >
 Agreed.
 >    If we have one syscall per each namespace why would we need a container
 >    structure?
 >
 > 3) How do we refer to namespaces and containers when we are not members?
 >    - Do we refer to them indirectly by processes or other objects that
 >      we can see and are members?
 >    - Do we assign some kind of unique id to the containers?
 
 In containers I simply created an explicit name, which of course collides with the
 clone() approach ..
 One possibility is to allow associating a name with a namespace.
 For instance
 int set_namespace_name( long flags, const char *name ) /* the once we are using in clone */
 {
 if (!flag)
 set name of container associated with current.
 if (flag())
 set the name if only one container is associated with the namespace(s)
 identified .. or some similar rule
 }
 
 >
 >
 > 4) How do we implement each of these namespaces?
 >    Besides being maintainable are there other constraints?
 >
 Good question... at least with PID and FS two are there ..
 >
 > 5) How do we control the resource inside a namespace starting
 >    from a process that is outside of that namespace?
 >    - The filesystem mount namespace gave an interesting answer.
 >      So it is quite possible other namespaces will give
 >      equally interesting and surprising answers.
 >
 >
 > 6) How do we do all of this efficiently without a noticeable impact on
 >    performance?
 >    - I have already heard concerns that I might be introducing cache
 >      line bounces and thus increasing tasklist_lock hold time.
 >      Which on big way systems can be a problem.
 
 Possible to split the lock up now.. one for each pidspace ?
 
 >
 > 7) How do we allow a process inside a container to create containers
 >    for it's children?
 >    - In general this is trivial but there are a few ugly issues
 >      here.
 
 Speaking of pids only here ...
 Does it matter? You just hang all those containers off of init.
 Whatever hierarchy they form is external ...
 
 >
 > I think these are the key questions of the conversation.
 >
 >
 > Personally so long as we get true namespaces, implemented in a
 > performant and maintainable way that a process from the inside can't
 > distinguish from what we have now I have no hard requirements.
 >
 >
 > Eric
 >
 
 -- Hubertus
 |  
	|  |  |  
	|  |  
	|  |  
	|  |  
	|  |  
	|  |  
	|  |  
	| 
		
			| Re: The issues for agreeing on a virtualization/namespaces implementation. [message #1412 is a reply to message #1403] | Wed, 08 February 2006 05:23   |  
			| 
				
				
					|  ebiederm Messages: 1354
 Registered: February 2006
 | Senior Member |  |  |  
	| Hubertus Franke <frankeh@watson.ibm.com> writes: 
 > Eric W. Biederman wrote:
 >> I think I can boil the discussion down into some of the fundamental
 >> questions that we are facing.
 >>
 > Man, bearly can keep up with this email load. Addressed some in
 > previous thread, but will reiterate under this context.
 
 :)
 
 >> Currently everyone seems to agree that we need something like
 >> my namespace concept that isolates multiple resources.
 >> We need these for PIDS
 >> UIDS
 >> SYSVIPC
 >> NETWORK
 >> UTSNAME
 >> FILESYSTEM
 >> etc.
 >> The questions seem to break down into:
 >> 1) Where do we put the references to the different namespaces?
 >> - Do we put the references in a struct container that we reference from struct
 > task_struct?
 >>    - Do we put the references directly in struct task_struct?
 >
 > You "cache"   task_struct->container->hotsubsys   under task_struct->hotsubsys.
 > We don't change containers other then at clone time, so no coherency issue here
 > !!!!
 > Which subsystems pointers to "cache", should be agreed by the experts,
 > but first approach should always not to cache and go through the container.
 >
 >> 2) What is the syscall interface to create these namespaces?
 >>    - Do we add clone flags?       (Plan 9 style)
 >
 > Like that approach .. flexible .. particular when one has well specified
 > namespaces.
 >
 >>    - Do we add a syscall (similar to setsid) per namespace?
 >>      (Traditional unix style)?
 >
 > Where does that approach end .. what's wrong with doing it at clone() time ?
 > Mainly the naming issue. Just providing a flag does not give me name.
 
 It really is a fairly even toss up.  The usual argument for doing it
 this way is that you will get an endless stream of arguments added to
 fork+exec otherwise.  Look at posix_spawn or the windows version if
 you want an example.  Bits to clone are skirting the edge of a slippery
 slope.
 
 >> 3) How do we refer to namespaces and containers when we are not members?
 >>    - Do we refer to them indirectly by processes or other objects that
 >>      we can see and are members?
 >>    - Do we assign some kind of unique id to the containers?
 >
 > In containers I simply created an explicite name, which ofcourse colides with
 > the
 > clone() approach ..
 > One possibility is to allow associating a name with a namespace.
 > For instance
 > int set_namespace_name( long flags, const char *name ) /* the once we are using
 > in clone */
 > {
 > 	if (!flag)
 > 		set name of container associated with current.
 > 	if (flag())
 > 		set the name if only one container is associated with the
 > namespace(s)
 > 		identified .. or some similar rule
 > }
 >
 
 What I have done which seems easier than creating new names is to refer
 to the process which has the namespace I want to manipulate.
 
 >> 6) How do we do all of this efficiently without a noticeable impact on
 >>    performance?
 >>    - I have already heard concerns that I might be introducing cache
 >>      line bounces and thus increasing tasklist_lock hold time.
 >>      Which on big way systems can be a problem.
 >
 > Possible to split the lock up now.. one for each pidspace ?
 
 At the moment it is worth thinking about.  If the problem is mild
 enough that people aren't actively working on it, we don't have to
 solve the problem for a little while, just be aware of it.
 
 >> 7) How do we allow a process inside a container to create containers
 >>    for it's children?
 >>    - In general this is trivial but there are a few ugly issues
 >>      here.
 >
 > Speaking of pids only here ...
 > Does it matter, you just hang all those containers hang of init.
 > What ever hierarchy they form is external ...
 
 In general it is simple.  For resource accounting, and for naming so
 you can migrate a container with a nested container it is a question
 you need to be slightly careful with.
 
 Eric
 |  
	|  |  |  
	|  |  
	|  |  
	| 
		
			| Re: [PATCH 1/4] Virtualization/containers: introduction [message #1419 is a reply to message #1405] | Wed, 08 February 2006 04:21   |  
			| 
				
				
					|  Paul Jackson Messages: 157
 Registered: February 2006
 | Senior Member |  |  |  
	| The driving force for cpusets are NUMA architectures. 
 Cpusets represent the topologies of NUMA systems, with hierarchies
 of cabinets, drawers, boards, packages, cores, hyperthreads, and
 with chunks of main memory associated usually with the board, but
 sometimes a layer or two up or down.
 
 Since not all cpus have the same access performance (delay and
 bandwidth) to all memory chunks (nodes), for optimum performance one
 wants to bind tasks, cpus and memory together, so as to run tasks on
 sets of cpus and memory that are "near" to each other, and to size the
 sets appropriately for the workload presented by the tasks.
 
 Cpusets have no explicit awareness of topology; they just provide a
 file system style hierarchy of named, permissioned sets, where each set
 has:
 mems - the memory nodes in that set
 cpus - the cpus in that set
 tasks - the tasks running on these cpus and mems
 
 For any cpuset, the 'cpus' and 'mems' are a subset of its parent in the
 hierarchy, and the root of the hierarchy (usually mounted at /dev/cpuset)
 contains all the online cpus and mems in the system.
 
 --
 I won't rest till it's the best ...
 Programmer, Linux Scalability
 Paul Jackson <pj@sgi.com> 1.925.600.0401
 |  
	|  |  |  
	| 
		
			| Re: The issues for agreeing on a virtualization/namespaces implementation. [message #1420 is a reply to message #1409] | Wed, 08 February 2006 04:37   |  
			| 
				
				
					|  Herbert Poetzl Messages: 239
 Registered: February 2006
 | Senior Member |  |  |  
	| On Tue, Feb 07, 2006 at 08:52:15PM -0700, Eric W. Biederman wrote: > "Serge E. Hallyn" <serue@us.ibm.com> writes:
 >
 > >
 > > What I tried to do in a proof of concept long ago was to have
 > > CLONE_NETNS mean that you get access to all the network devices, but
 > > then you could drop/add them.  Conceptually I prefer that to getting an
 > > empty namespace, but I'm not sure whether there's any practical use
 > > where you'd want that...
 >
 > My observation was that the network stack does not come out cleanly
 > as a namespace unless you adopt the rule that a network device
 > belongs to exactly one network namespace.
 
 yep, that's what the first network virtualization for
 Linux-VServer aimed at, but it was found too complicated;
 the second one uses 'pairs' of communicating devices
 to send between guests/host
 
 > With that rule dealing with the network stack is just a matter of
 > making some currently global variables/data structures per container.
 
 yep, like the universal loopback and so ...
 
 > A pain to do the first round but easy to maintain once you are there
 > and the logic of the code doesn't need to change.
 
 best,
 Herbert
 
 > Eric
 |  
	|  |  |  
	| 
		
			| Re: The issues for agreeing on a virtualization/namespaces implementation. [message #1421 is a reply to message #1396] | Wed, 08 February 2006 04:56   |  
			| 
				
				
					|  Herbert Poetzl Messages: 239
 Registered: February 2006
 | Senior Member |  |  |  
	| On Tue, Feb 07, 2006 at 03:06:51PM -0700, Eric W. Biederman wrote: >
 > I think I can boil the discussion down into some of the fundamental
 > questions that we are facing.
 >
 > Currently everyone seems to agree that we need something like
 > my namespace concept that isolates multiple resources.
 >
 > We need these for
 > PIDS
 > UIDS
 > SYSVIPC
 > NETWORK
 > UTSNAME
 > FILESYSTEM
 > etc.
 >
 > The questions seem to break down into:
 > 1) Where do we put the references to the different namespaces?
 >    - Do we put the references in a struct container that we
 >      reference from struct task_struct?
 
 no, just let the tasks be in groups of disjoint spaces
 so that they can have shared or private structures for
 each of the identified spaces
 
 >    - Do we put the references directly in struct task_struct?
 
 yes, IMHO that's the way to do it .. Linux-VServer is
 moving in this direction for some time now, but we need
 to add a special space for context capabilities and
 context flags, basically a context struct, similar to
 a namespace ...
 
 > 2) What is the syscall interface to create these namespaces?
 >    - Do we add clone flags?
 >      (Plan 9 style)
 
 I'd definitely prefer that, maybe if necessary with a
 'new' clone syscall which allows to do a little more
 than clone does now, e.g.
 
 - clone into container/context/guest
 - set space initializations and limits, etc ...
 
 >    - Do we add a syscall (similar to setsid) per namespace?
 >      (Traditional unix style)?
 
 doesn't make sense for the creation, but a syscall
 for moving between and management of spaces are very
 important ...
 
 >    - Do we in addition add syscalls to manipulate containers
 >      generically?
 >
 >    I don't think having a single system call to create a container
 >    and a new instance of each namespace is reasonable as that does
 >    not give us a path into the future when we create yet another
 >    namespace.
 >
 >    If we have one syscall per each namespace why would we need a
 >    container structure?
 
 for the aforementioned permissions and flags, but
 you can as well see it as separate 'context' space
 
 > 3) How do we refer to namespaces and containers when we are not members?
 >    - Do we refer to them indirectly by processes or other objects that
 >      we can see and are members?
 
 the process will be a unique identifier to the
 namespace, but it might not be easy to use it, so
 IMHO it might at least make sense to ...
 
 >    - Do we assign some kind of unique id to the containers?
 
 have a unique identifier for the context space so
 that somebody can cherry pick namespaces from there
 
 (in this case, the context space would hold references
 to the other namespaces which are the ones used by
 new tasks created into a context, basically a template
 for them)
 
 > 4) How do we implement each of these namespaces?
 >    Besides being maintainable are there other constraints?
 
 extensible and with keeping hierarchical structures
 in mind .. would be bad if we could not do sub-contexts
 without a complete rewrite
 
 > 5) How do we control the resource inside a namespace starting
 >    from a process that is outside of that namespace?
 
 depends on the resource, but limits have to go to the
 context structure, so that they apply for the entire
 context space, not just for a task
 
 >    - The filesystem mount namespace gave an interesting answer.
 >      So it is quite possible other namespaces will give
 >      equally interesting and surprising answers.
 
 > 6) How do we do all of this efficiently without a noticeable impact on
 >    performance?
 
 >    - I have already heard concerns that I might be introducing cache
 >      line bounces and thus increasing tasklist_lock hold time.
 >      Which on big way systems can be a problem.
 
 well, we have to be careful with the complexity ...
 keep things like refcounting and 'magic' on clone
 simple, for the default case ...
 
 > 7) How do we allow a process inside a container to create containers
 >    for it's children?
 
 >    - In general this is trivial but there are a few ugly issues
 >      here.
 
 a flat implementation will work for a hierarchical
 design if certain things are handled properly, just
 think resource management for sub-contexts and
 changes in the parent's limits ...
 
 > I think these are the key questions of the conversation.
 >
 >
 > Personally so long as we get true namespaces, implemented in a
 > performant and maintainable way that a process from the inside can't
 > distinguish from what we have now I have no hard requirements.
 
 keep up the good work!
 
 best,
 Herbert
 
 > Eric
 |  
	|  |  | 
 
 
 Current Time: Sun Oct 26 14:12:06 GMT 2025 
 Total time taken to generate the page: 0.08987 seconds |