OpenVZ Forum


Home » Mailing lists » Devel » [RFC][PATCH 1/5] Virtualization/containers: startup
[RFC][PATCH 1/5] Virtualization/containers: startup [message #1239] Fri, 03 February 2006 16:57 Go to next message
Kirill Korotaev is currently offline  Kirill Korotaev
Messages: 137
Registered: January 2006
Senior Member
This patch introduces some abstract container/VPS kernel structure and
tiny amount of operations on it.

Patches following this one will be used for virtualization of some
resources based on this container infrastructure, including those VPIDs
patches I sent before.

What is important here is that:
- each VPS has unique ID
- each process in kernel can belong to one VPS only

Kirill

--- ./include/linux/vps_info.h.vps_info 2006-02-03 16:38:33.000000000 +0300
+++ ./include/linux/vps_info.h 2006-02-03 16:49:26.000000000 +0300
@@ -0,0 +1,41 @@
+#ifndef __LINUX_VPS_INFO_H_
+#define __LINUX_VPS_INFO_H_
+
+#include <asm/types.h>
+#include <asm/atomic.h>
+
+struct task_struct;
+
+struct vps_info {
+ u32 id;
+ struct task_struct *init_task;
+ atomic_t refcnt;
+};
+
+extern struct vps_info host_vps_info;
+typedef struct vps_info *vps_t;
+
+static inline vps_t get_vps(vps_t vps)
+{
+ atomic_inc(&vps->refcnt);
+ return vps;
+}
+
+static inline void put_vps(vps_t vps)
+{
+ atomic_dec(&vps->refcnt);
+}
+
+#include <linux/sched.h>
+#include <asm/current.h>
+
+static inline vps_t set_current_vps(vps_t vps)
+{
+ vps_t ret;
+
+ ret = current->owner_vps;
+ current->owner_vps = vps;
+ return ret;
+}
+
+#endif
--- ./include/linux/sched.h.vps_info 2006-02-03 16:38:33.000000000 +0300
+++ ./include/linux/sched.h 2006-02-03 16:43:42.000000000 +0300
@@ -687,6 +687,7 @@

struct audit_context; /* See audit.c */
struct mempolicy;
+struct vps_info;

struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
@@ -845,6 +846,7 @@
struct backing_dev_info *backing_dev_info;

struct io_context *io_context;
+ struct vps_info *owner_vps;

unsigned long ptrace_message;
siginfo_t *last_siginfo; /* For ptrace use. */
@@ -875,6 +877,8 @@
struct rcu_head rcu;
};

+#define current_vps() (current->owner_vps)
+
static inline pid_t process_group(struct task_struct *tsk)
{
return tsk->signal->pgrp;
--- ./include/linux/init_task.h.vps_info 2006-02-03 16:38:33.000000000 +0300
+++ ./include/linux/init_task.h 2006-02-03 16:43:18.000000000 +0300
@@ -3,6 +3,7 @@

#include <linux/file.h>
#include <linux/rcupdate.h>
+#include <linux/vps_info.h>

#define INIT_FDTABLE \
{ \
@@ -121,6 +122,7 @@
.journal_info = NULL, \
.cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
.fs_excl = ATOMIC_INIT(0), \
+ .owner_vps = &host_vps_info, \
}


--- ./init/main.c.vps_info 2006-02-03 16:38:33.000000000 +0300
+++ ./init/main.c 2006-02-03 16:43:18.000000000 +0300
@@ -47,6 +47,7 @@
#include <linux/rmap.h>
#include <linux/mempolicy.h>
#include <linux/key.h>
+#include <linux/vps_info.h>

#include <asm/io.h>
#include <asm/bugs.h>
@@ -434,6 +435,13 @@
done = 1;
}

+struct vps_info host_vps_info = {
+ .id = 0,
+ .init_task = &init_task,
+ .refcnt = ATOMIC_INIT(1),
+};
+
+EXPORT_SYMBOL(host_vps_info);
/*
* Activate the first processor.
*/
--- ./kernel/fork.c.vps_info 2006-02-03 16:38:33.000000000 +0300
+++ ./kernel/fork.c 2006-02-03 16:43:18.000000000 +0300
@@ -44,6 +44,7 @@
#include <linux/rmap.h>
#include <linux/acct.h>
#include <linux/cn_proc.h>
+#include <linux/vps_info.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -1132,6 +1133,7 @@
p->ioprio = current->ioprio;

SET_LINKS(p);
+ get_vps(p->owner_vps);
if (unlikely(p->ptrace & PT_PTRACED))
__ptrace_link(p, current->parent);

--- ./kernel/exit.c.vps_info 2006-02-03 16:38:33.000000000 +0300
+++ ./kernel/exit.c 2006-02-03 16:43:18.000000000 +0300
@@ -31,6 +31,7 @@
#include <linux/signal.h>
#include <linux/cn_proc.h>
#include <linux/mutex.h>
+#include <linux/vps_info.h>

#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -107,6 +108,7 @@
spin_unlock(&p->proc_lock);
proc_pid_flush(proc_dentry);
release_thread(p);
+ put_vps(p->owner_vps);
put_task_struct(p);

p = leader;
[RFC][PATCH 2/5] Virtualization/containers: UIDs [message #1240 is a reply to message #1239] Fri, 03 February 2006 17:01 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

The simplest virtualization of UID hashes, just to demonstrate our
approach. Each container has it's own set of uids and should simply
allocate hash/initialize it on startup.

Kirill

--- ./include/linux/vps_info.h.vps_uid_hash 2006-02-03 16:49:26.000000000 +0300
+++ ./include/linux/vps_info.h 2006-02-03 16:49:51.000000000 +0300
@@ -5,11 +5,14 @@
#include <asm/atomic.h>

struct task_struct;
+struct list_head;

struct vps_info {
u32 id;
struct task_struct *init_task;
atomic_t refcnt;
+
+ struct list_head *vps_uid_hash;
};

extern struct vps_info host_vps_info;
--- ./kernel/user.c.vps_uid_hash 2006-02-03 16:49:08.000000000 +0300
+++ ./kernel/user.c 2006-02-03 16:49:51.000000000 +0300
@@ -14,6 +14,7 @@
#include <linux/bitops.h>
#include <linux/key.h>
#include <linux/interrupt.h>
+#include <linux/vps_info.h>

/*
* UID task count cache, to get fast user lookup in "alloc_uid"
@@ -24,7 +25,8 @@
#define UIDHASH_SZ (1 << UIDHASH_BITS)
#define UIDHASH_MASK (UIDHASH_SZ - 1)
#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
-#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid)))
+#define uidhashentry(uid) (current_vps()->vps_uid_hash + \
+ __uidhashfn((uid)))

static kmem_cache_t *uid_cachep;
static struct list_head uidhash_table[UIDHASH_SZ];
@@ -200,6 +202,7 @@

/* Insert the root user immediately (init already runs as root) */
spin_lock_irq(&uidhash_lock);
+ host_vps_info.vps_uid_hash = uidhash_table;
uid_hash_insert(&root_user, uidhashentry(0));
spin_unlock_irq(&uidhash_lock);
[RFC][PATCH 3/5] Virtualization/containers: UTSNAME [message #1241 is a reply to message #1239] Fri, 03 February 2006 17:04 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

Virtualization of UTSNAME.
As simple as UID hashes, just virtualizes system_utsname variable
(diff-vps-uts-name-core) and replaces access to it with vps_utsname in
required places (diff-vps-uts-name-kern).

Kirill

--- ./include/linux/vps_info.h.vps_uts_core 2006-02-03 16:49:51.000000000 +0300
+++ ./include/linux/vps_info.h 2006-02-03 16:50:39.000000000 +0300
@@ -6,6 +6,7 @@

struct task_struct;
struct list_head;
+struct new_utsname;

struct vps_info {
u32 id;
@@ -13,6 +14,7 @@
atomic_t refcnt;

struct list_head *vps_uid_hash;
+ struct new_utsname *vps_uts_name;
};

extern struct vps_info host_vps_info;
--- ./include/linux/utsname.h.vps_uts_core 2006-02-03 16:38:17.000000000 +0300
+++ ./include/linux/utsname.h 2006-02-03 16:50:26.000000000 +0300
@@ -1,6 +1,8 @@
#ifndef _LINUX_UTSNAME_H
#define _LINUX_UTSNAME_H

+#include <linux/vps_info.h>
+
#define __OLD_UTS_LEN 8

struct oldold_utsname {
@@ -31,6 +33,7 @@
};

extern struct new_utsname system_utsname;
+#define vps_utsname (*(current_vps()->vps_uts_name))

extern struct rw_semaphore uts_sem;
#endif
--- ./init/main.c.vps_uts_core 2006-02-03 16:43:18.000000000 +0300
+++ ./init/main.c 2006-02-03 16:50:26.000000000 +0300
@@ -439,6 +439,7 @@
.id = 0,
.init_task = &init_task,
.refcnt = ATOMIC_INIT(1),
+ .vps_uts_name = &system_utsname,
};

EXPORT_SYMBOL(host_vps_info);

--- ./arch/alpha/kernel/osf_sys.c.vps_uts 2006-02-03 11:56:22.000000000 +0300
+++ ./arch/alpha/kernel/osf_sys.c 2006-02-03 13:35:25.000000000 +0300
@@ -402,15 +402,15 @@

down_read(&uts_sem);
error = -EFAULT;
- if (copy_to_user(name + 0, system_utsname.sysname, 32))
+ if (copy_to_user(name + 0, vps_utsname.sysname, 32))
goto out;
- if (copy_to_user(name + 32, system_utsname.nodename, 32))
+ if (copy_to_user(name + 32, vps_utsname.nodename, 32))
goto out;
- if (copy_to_user(name + 64, system_utsname.release, 32))
+ if (copy_to_user(name + 64, vps_utsname.release, 32))
goto out;
- if (copy_to_user(name + 96, system_utsname.version, 32))
+ if (copy_to_user(name + 96, vps_utsname.version, 32))
goto out;
- if (copy_to_user(name + 128, system_utsname.machine, 32))
+ if (copy_to_user(name + 128, vps_utsname.machine, 32))
goto out;

error = 0;
@@ -449,8 +449,8 @@

down_read(&uts_sem);
for (i = 0; i < len; ++i) {
- __put_user(system_utsname.domainname[i], name + i);
- if (system_utsname.domainname[i] == '\0')
+ __put_user(vps_utsname.domainname[i], name + i);
+ if (vps_utsname.domainname[i] == '\0')
break;
}
up_read(&uts_sem);
@@ -608,11 +608,11 @@
osf_sysinfo(int command, char __user *buf, long count)
{
static char * sysinfo_table[] = {
- system_utsname.sysname,
- system_utsname.nodename,
- system_utsname.release,
- system_utsname.version,
- system_utsname.machine,
+ vps_utsname.sysname,
+ vps_utsname.nodename,
+ vps_utsname.release,
+ vps_utsname.version,
+ vps_utsname.machine,
"alpha", /* instruction set architecture */
"dummy", /* hardware serial number */
"dummy", /* hardware manufacturer */
--- ./arch/i386/kernel/sys_i386.c.vps_uts 2006-01-03 06:21:10.000000000 +0300
+++ ./arch/i386/kernel/sys_i386.c 2006-02-03 13:35:25.000000000 +0300
@@ -217,7 +217,7 @@
if (!name)
return -EFAULT;
down_read(&uts_sem);
- err=copy_to_user(name, &system_utsname, sizeof (*name));
+ err=copy_to_user(name, &vps_utsname, sizeof (*name));
up_read(&uts_sem);
return err?-EFAULT:0;
}
@@ -233,15 +233,15 @@

down_read(&uts_sem);

- error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN);
+ error = __copy_to_user(&name->sysname,&vps_utsname.sysname,__OLD_UTS_LEN);
error |= __put_user(0,name->sysname+__OLD_UTS_LEN);
- error |= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN);
+ error |= __copy_to_user(&name->nodename,&vps_utsname.nodename,__OLD_UTS_LEN);
error |= __put_user(0,name->nodename+__OLD_UTS_LEN);
- error |= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN);
+ error |= __copy_to_user(&name->release,&vps_utsname.release,__OLD_UTS_LEN);
error |= __put_user(0,name->release+__OLD_UTS_LEN);
- error |= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN);
+ error |= __copy_to_user(&name->version,&vps_utsname.version,__OLD_UTS_LEN);
error |= __put_user(0,name->version+__OLD_UTS_LEN);
- error |= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN);
+ error |= __copy_to_user(&name->machine,&vps_utsname.machine,__OLD_UTS_LEN);
error |= __put_user(0,name->machine+__OLD_UTS_LEN);

up_read(&uts_sem);
--- ./arch/m32r/kernel/sys_m32r.c.vps_uts 2006-01-03 06:21:10.000000000 +0300
+++ ./arch/m32r/kernel/sys_m32r.c 2006-02-03 13:35:25.000000000 +0300
@@ -199,7 +199,7 @@
if (!name)
return -EFAULT;
down_read(&uts_sem);
- err=copy_to_user(name, &system_utsname, sizeof (*name));
+ err=copy_to_user(name, &vps_utsname, sizeof (*name));
up_read(&uts_sem);
return err?-EFAULT:0;
}
--- ./arch/mips/kernel/linux32.c.vps_uts 2006-02-03 11:56:03.000000000 +0300
+++ ./arch/mips/kernel/linux32.c 2006-02-03 13:35:25.000000000 +0300
@@ -1150,7 +1150,7 @@
int ret = 0;

down_read(&uts_sem);
- if (copy_to_user(name,&system_utsname,sizeof *name))
+ if (copy_to_user(name,&vps_utsname,sizeof *name))
ret = -EFAULT;
up_read(&uts_sem);

--- ./arch/mips/kernel/syscall.c.vps_uts 2006-02-03 11:56:03.000000000 +0300
+++ ./arch/mips/kernel/syscall.c 2006-02-03 13:35:25.000000000 +0300
@@ -229,7 +229,7 @@
*/
asmlinkage int sys_uname(struct old_utsname * name)
{
- if (name && !copy_to_user(name, &system_utsname, sizeof (*name)))
+ if (name && !copy_to_user(name, &vps_utsname, sizeof (*name)))
return 0;
return -EFAULT;
}
@@ -246,15 +246,15 @@
if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
return -EFAULT;

- error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN);
+ error = __copy_to_user(&name->sysname,&vps_utsname.sysname,__OLD_UTS_LEN);
error -= __put_user(0,name->sysname+__OLD_UTS_LEN);
- error -= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN);
+ error -= __copy_to_user(&name->nodename,&vps_utsname.nodename,__OLD_UTS_LEN);
error -= __put_user(0,name->nodename+__OLD_UTS_LEN);
- error -= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN);
+ error -= __copy_to_user(&name->release,&vps_utsname.release,__OLD_UTS_LEN);
error -= __put_user(0,name->release+__OLD_UTS_LEN);
- error -= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN);
+ error -= __copy_to_user(&name->version,&vps_utsname.version,__OLD_UTS_LEN);
error -= __put_user(0,name->version+__OLD_UTS_LEN);
- error -= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN);
+ error -= __copy_to_user(&name->machine,&vps_utsname.machine,__OLD_UTS_LEN);
error = __put_user(0,name->machine+__OLD_UTS_LEN);
error = error ? -EFAULT : 0;

@@ -290,10 +290,10 @@
return -EFAULT;

down_write(&uts_sem);
- strncpy(system_utsname.nodename, nodename, len);
+ strncpy(vps_utsname.nodename, nodename, len);
nodename[__NEW_UTS_LEN] = '\0';
- strlcpy(system_utsname.nodename, nodename,
- sizeof(system_utsname.nodename));
+ strlcpy(vps_utsname.nodename, nodename,
+ sizeof(vps_utsname.nodename));
up_write(&uts_sem);
return 0;
}
--- ./arch/mips/kernel/sysirix.c.vps_uts 2006-02-03 11:56:03.000000000 +0300
+++ ./arch/mips/kernel/sysirix.c 2006-02-03 13:35:25.000000000 +0300
@@ -904,7 +904,7 @@
down_read(&uts_sem);
if (len > __NEW_UTS_LEN)
len = __NEW_UTS_LEN;
- err = copy_to_user(name, system_utsname.domainname, len) ? -EFAULT : 0;
+ err = copy_to_user(name, vps_utsname.domainname, len) ? -EFAULT : 0;
up_read(&uts_sem);

return err;
@@ -1147,11 +1147,11 @@
asmlinkage int irix_uname(struct iuname __user *buf)
{
down_read(&uts_sem);
- if (copy_from_user(system_utsname.sysname, buf->sysname, 65)
- || copy_from_user(system_utsname.nodename, buf->nodename, 65)
- || copy_from_user(system_utsname.release, buf->release, 65)
- || copy_from_user(system_utsname.version, buf->version, 65)
- || copy_from_user(system_utsname.machine, buf->machine, 65)) {
+ if (copy_from_user(vps_utsname.sysname, buf->sysname, 65)
+ || copy_from_user(vps_utsname.nodename, buf->nodename, 65)
+ || copy_from_user(vps_utsname.release, buf->release, 65)
+ || copy_from_user(vps_utsname.version, buf->version, 65)
+ || copy_from_user(vps_utsname.machine, buf->machine, 65)) {
return -EFAULT;
}
up_read(&uts_sem);
--- ./arch/parisc/hpux/sys_hpux.c.vps_uts 2006-02-03 11:56:03.000000000 +0300
+++ ./arch/parisc/hpux/sys_hpux.c 2006-02-03 13:35:25.000000000 +0300
@@ -266,15 +266,15 @@

down_read(&uts_sem);

- error = __copy_to_user(&name->sysname,&system_utsname.sysname,HPUX_UTSLEN-1);
+ error = __copy_to_user(&name->sysname,&vps_utsname.sysname,HPUX_UTSLEN-1);
error |= __put_user(0,name->sysname+HPUX_UTSLEN-1);
- error |= __copy_to_user(&name->nodename,&system_utsname.nodename,HPUX_UTSLEN-1);
+ error |= __copy_to_user(&name->nodename,&vps_utsname.nodename,HPUX_UTSLEN-1);
error |= __put_user(0,name->nodename+HPUX_UTSLEN-1);
- error |= __copy_to_user(&name->release,&system_utsname.release,HPUX_UTSLEN-1);
+ error |= __copy_to_user(&name->release,&vps_utsname.release,HPUX_UTSLEN-1);
error |= __put_user(0,name->release+HPUX_UTSLEN-1);
- error |= __copy_to_user(&name->version,&system_u
...

Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1244 is a reply to message #1239] Fri, 03 February 2006 17:22 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

Linus,

Not a problem and fully agree with you.
Just had to better review patch before sending.

Do you have any other ideas/comments on this?
I will send additional IPC/filesystems virtualization patches a bit later.

Kirill

> On Fri, 3 Feb 2006, Kirill Korotaev wrote:
>> This patch introduces some abstract container/VPS kernel structure and tiny
>> amount of operations on it.
>
> Please don't use things like "vps_t".
>
> It's a _mistake_ to use typedef for structures and pointers. When you see
> a
>
> vps_t a;
>
> in the source, what does it mean?
>
> In contrast, if it says
>
> struct virtual_container *a;
>
> you can actually tell what "a" is.
>
> Lots of people think that typedefs "help readability". Not so. They are
> useful only for
>
> (a) totally opaque objects (where the typedef is actively used to _hide_
> what the object is).
>
> Example: "pte_t" etc opaque objects that you can only access using
> the proper accessor functions.
>
> NOTE! Opaqueness and "accessor functions" are not good in themselves.
> The reason we have them for things like pte_t etc is that there
> really is absolutely _zero_ portably accessible information there.
>
> (b) Clear integer types, where the abstraction _helps_ avoid confusion
> whether it is "int" or "long".
>
> u8/u16/u32 are perfectly fine typedefs.
>
> NOTE! Again - there needs to be a _reason_ for this. If something is
> "unsigned long", then there's no reason to do
>
> typedef long myflags_t;
>
> but if there is a clear reason for why it under certain circumstances
> might be an "unsigned int" and under other configurations might be
> "unsigned long", then by all means go ahead and use a typedef.
>
> (c) when you use sparse to literally create a _new_ type for
> type-checking.
>
> Maybe there are other cases too, but the rule should basically be to NEVER
> EVER use a typedef unless you can clearly match one of those rules.
>
> In general, a pointer, or a struct that has elements that can reasonably
> be directly accessed should _never_ be a typedef.
>
> Linus
>
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1251 is a reply to message #1239] Fri, 03 February 2006 18:34 Go to previous messageGo to next message
Dave Hansen is currently offline  Dave Hansen
Messages: 240
Registered: October 2005
Senior Member
On Fri, 2006-02-03 at 09:49 -0800, Linus Torvalds wrote:
> One thing I don't particularly like is some of the naming. To me "vps"
> doesn't sound particularly generic or logical. I realize that it probably
> makes perfect sense to you (and I assume it just means "virtual private
> servers"), but especially if you see patches 1-3 to really be independent
> of any "actual" virtualization code that is totally generic, I'd actually
> prefer a less specialized name.

I just did a global s/vps/container/ and it looks pretty reasonable, at
least from my point of view.

Couple of minor naming nitpick questions, though. Is vps/container_info
really what we want to call it? It seems to me to be the basis for a
real "container", without the _info part.

"tsk->owner_container" That makes it sound like a pointer to the "task
owner's container". How about "owning_container"? The "container
owning this task". Or, maybe just "container"?

Any particular reason for the "u32 id" in the vps_info struct as opposed
to one of the more generic types? Do we want to abstract this one in
the same way we do pid_t?

The "host" in "host_container_info" doesn't mean much to me. Though, I
guess it has some context in the UML space. Would "init_container_info"
or "root_container_info" be more descriptive?

Lastly, is this a place for krefs? I don't see a real need for a
destructor yet, but the idea is fresh in my mind.

How does the attached patch look?

-- Dave

memhotplug-dave/include/linux/container.h | 40 +++++++++++++++++++++++++
memhotplug-dave/include/linux/init_task.h | 2 +
memhotplug-dave/include/linux/sched.h | 4 ++
memhotplug-dave/init/main.c | 8 +++++
memhotplug-dave/kernel/exit.c | 2 +
memhotplug-dave/kernel/fork.c | 2 +
6 files changed, 58 insertions(+)

diff -puN include/linux/container.h~container-base include/linux/container.h
--- memhotplug/include/linux/container.h~container-base 2006-02-03 10:21:36.000000000 -0800
+++ memhotplug-dave/include/linux/container.h 2006-02-03 10:21:36.000000000 -0800
@@ -0,0 +1,40 @@
+#ifndef __LINUX_VPS_INFO_H_
+#define __LINUX_VPS_INFO_H_
+
+#include <asm/types.h>
+#include <asm/atomic.h>
+
+struct task_struct;
+
+struct container {
+ u32 id;
+ struct task_struct *init_task;
+ atomic_t refcnt;
+};
+
+extern struct container root_container;
+
+static inline container_t get_container(container_t container)
+{
+ atomic_inc(&container->refcnt);
+ return container;
+}
+
+static inline void put_container(container_t container)
+{
+ atomic_dec(&container->refcnt);
+}
+
+#include <linux/sched.h>
+#include <asm/current.h>
+
+static inline container_t set_current_container(container_t container)
+{
+ container_t ret;
+
+ ret = current->owning_container;
+ current->owning_container = container;
+ return ret;
+}
+
+#endif
diff -puN include/linux/init_task.h~container-base include/linux/init_task.h
--- memhotplug/include/linux/init_task.h~container-base 2006-02-03 10:21:36.000000000 -0800
+++ memhotplug-dave/include/linux/init_task.h 2006-02-03 10:23:49.000000000 -0800
@@ -3,6 +3,7 @@

#include <linux/file.h>
#include <linux/rcupdate.h>
+#include <linux/container.h>

#define INIT_FDTABLE \
{ \
@@ -121,6 +122,7 @@ extern struct group_info init_groups;
.journal_info = NULL, \
.cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
.fs_excl = ATOMIC_INIT(0), \
+ .owning_container = &root_container, \
}


diff -puN include/linux/sched.h~container-base include/linux/sched.h
--- memhotplug/include/linux/sched.h~container-base 2006-02-03 10:21:36.000000000 -0800
+++ memhotplug-dave/include/linux/sched.h 2006-02-03 10:21:36.000000000 -0800
@@ -687,6 +687,7 @@ static inline void prefetch_stack(struct

struct audit_context; /* See audit.c */
struct mempolicy;
+struct container;

struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
@@ -844,6 +845,7 @@ struct task_struct {
struct backing_dev_info *backing_dev_info;

struct io_context *io_context;
+ struct container *owning_container;

unsigned long ptrace_message;
siginfo_t *last_siginfo; /* For ptrace use. */
@@ -874,6 +876,8 @@ struct task_struct {
struct rcu_head rcu;
};

+#define current_container() (current->owning_container)
+
static inline pid_t process_group(struct task_struct *tsk)
{
return tsk->signal->pgrp;
diff -puN init/main.c~container-base init/main.c
--- memhotplug/init/main.c~container-base 2006-02-03 10:21:36.000000000 -0800
+++ memhotplug-dave/init/main.c 2006-02-03 10:21:36.000000000 -0800
@@ -47,6 +47,7 @@
#include <linux/rmap.h>
#include <linux/mempolicy.h>
#include <linux/key.h>
+#include <linux/container.h>

#include <asm/io.h>
#include <asm/bugs.h>
@@ -434,6 +435,13 @@ void __init parse_early_param(void)
done = 1;
}

+struct container root_container = {
+ .id = 0,
+ .init_task = &init_task,
+ .refcnt = ATOMIC_INIT(1),
+};
+
+EXPORT_SYMBOL(root_container);
/*
* Activate the first processor.
*/
diff -puN kernel/exit.c~container-base kernel/exit.c
--- memhotplug/kernel/exit.c~container-base 2006-02-03 10:21:36.000000000 -0800
+++ memhotplug-dave/kernel/exit.c 2006-02-03 10:21:36.000000000 -0800
@@ -31,6 +31,7 @@
#include <linux/signal.h>
#include <linux/cn_proc.h>
#include <linux/mutex.h>
+#include <linux/container.h>

#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -107,6 +108,7 @@ repeat:
spin_unlock(&p->proc_lock);
proc_pid_flush(proc_dentry);
release_thread(p);
+ put_container(p->owning_container);
put_task_struct(p);

p = leader;
diff -puN kernel/fork.c~container-base kernel/fork.c
--- memhotplug/kernel/fork.c~container-base 2006-02-03 10:21:36.000000000 -0800
+++ memhotplug-dave/kernel/fork.c 2006-02-03 10:21:36.000000000 -0800
@@ -44,6 +44,7 @@
#include <linux/rmap.h>
#include <linux/acct.h>
#include <linux/cn_proc.h>
+#include <linux/container.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -1132,6 +1133,7 @@ static task_t *copy_process(unsigned lon
p->ioprio = current->ioprio;

SET_LINKS(p);
+ get_container(p->owning_container);
if (unlikely(p->ptrace & PT_PTRACED))
__ptrace_link(p, current->parent);

_
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1253 is a reply to message #1239] Fri, 03 February 2006 17:15 Go to previous messageGo to next message
Linus Torvalds is currently offline  Linus Torvalds
Messages: 11
Registered: February 2006
Junior Member
On Fri, 3 Feb 2006, Kirill Korotaev wrote:
>
> This patch introduces some abstract container/VPS kernel structure and tiny
> amount of operations on it.

Please don't use things like "vps_t".

It's a _mistake_ to use typedef for structures and pointers. When you see
a

vps_t a;

in the source, what does it mean?

In contrast, if it says

struct virtual_container *a;

you can actually tell what "a" is.

Lots of people think that typedefs "help readability". Not so. They are
useful only for

(a) totally opaque objects (where the typedef is actively used to _hide_
what the object is).

Example: "pte_t" etc opaque objects that you can only access using
the proper accessor functions.

NOTE! Opaqueness and "accessor functions" are not good in themselves.
The reason we have them for things like pte_t etc is that there
really is absolutely _zero_ portably accessible information there.

(b) Clear integer types, where the abstraction _helps_ avoid confusion
whether it is "int" or "long".

u8/u16/u32 are perfectly fine typedefs.

NOTE! Again - there needs to be a _reason_ for this. If something is
"unsigned long", then there's no reason to do

typedef long myflags_t;

but if there is a clear reason for why it under certain circumstances
might be an "unsigned int" and under other configurations might be
"unsigned long", then by all means go ahead and use a typedef.

(c) when you use sparse to literally create a _new_ type for
type-checking.

Maybe there are other cases too, but the rule should basically be to NEVER
EVER use a typedef unless you can clearly match one of those rules.

In general, a pointer, or a struct that has elements that can reasonably
be directly accessed should _never_ be a typedef.

Linus
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1254 is a reply to message #1244] Fri, 03 February 2006 17:49 Go to previous messageGo to next message
Linus Torvalds is currently offline  Linus Torvalds
Messages: 11
Registered: February 2006
Junior Member
On Fri, 3 Feb 2006, Kirill Korotaev wrote:
>
> Do you have any other ideas/comments on this?
> I will send additional IPC/filesystems virtualization patches a bit later.

I think that a patch like this - particularly just the 1/5 part - makes
total sense, because regardless of any other details of virtualization,
every single scheme is going to need this.

So I think at least 1/5 (and quite frankly, 2-3/5 look that way too) are
things that we can (and probably should) merge quickly, so that people can
then actually look at the differences in the places that they may actually
disagree about.

One thing I don't particularly like is some of the naming. To me "vps"
doesn't sound particularly generic or logical. I realize that it probably
makes perfect sense to you (and I assume it just means "virtual private
servers"), but especially if you see patches 1-3 to really be independent
of any "actual" virtualization code that is totally generic, I'd actually
prefer a less specialized name.

I realize that what things like this is used for is VPS web hosting etc,
but that's really naming by current common usage, not by any real "logic".

In other words, I personally would have called them "container" or
something similar, rather than "vps_info". See? From a logical
implementation standpoint, the fact that it is right now most commonly
used for VPS hosting is totally irrelevant to the code, no?

(And hey, maybe your "vps" means something different. In which case my
argument makes even more sense ;)

Linus
Summary: PID virtualization , Containers, Migration [message #1255 is a reply to message #1254] Fri, 03 February 2006 18:36 Go to previous messageGo to next message
Hubertus Franke is currently offline  Hubertus Franke
Messages: 16
Registered: February 2006
Junior Member
Seems lots of interest is happening in this direction now
with various ideas and patches having been submitted.

I think the terms should be clarified a bit, even in our own
discussion we are using them loosely. Let's keep focusing
on the PID issues of containers and migration.

- user have come to expect that pids range from [0..PIDMAX]
- we can only migrate a self contained process tree (with some glue at the top)
- when we migrate we expect that the pid is preserved to the user
- we expect that UNIX semantics of pid visibility (e.g. kill etc) do
not cross the container boundary (with some glue at the top)

Now there are two views to achieve the preservation that go at the heart
of the implementation only (not at the concept at all).
Two fundamental concepts are thrown around here:
(a) PID virtualization
(b) PID spaces

Let's be clear what I mean by these concepts:

PID virtualization:
===================
The kernel continues to use its current internal pid management.
Each task has a unique internal pid, throughout its life on that
particular kernel image.
At the user/kernel boundary these pids are virtualized on a
per container base. Hence pids coming from user space are converted to
an internal pid and internal pids handed to the user are converted
to the virtual pid.
As a result, all places along the user/kernel boundary that deal with
pids need to be identified and the conversion function needs to be called.
Also the vpid needs to be allocated/freed on fork/exit.

How this virtualization is maintained is an implementation detail.
One can keep a full "pid->vpid" and "vpi->pid" hash with the container
or play optimizations such as the OpenVZ folks to keep extra virtual
numbers in the task->pids[type] structure and or making
vpid == internal pid for non-containerized processes.

Right now the OpenVZ patch supplies a nice and clean means for this.
In a sense our patch also followes this approach because we explicitely
have these conversion functions and we too have similar optimizations
by masking the container id into the internal pid.
Other then on the details of the pid virtualization, the patches seem
to actually be the same ..

What has been pointed out is that
- calling these conversion function is overhead, cumbersome.
- confusing, hence dangerous, since we are maintaining different logical
pid domains here under the same type pid_t

PID spaces:
===========

Pidspaces drive the isolation based on containers all the way down into the
current pid management. This at first sounds scary, but it will remove
lots of problem with the above approach.

Instead of keeping a single internal pid range and
its associated managment (aka pidspace), we provide one pidspace for each
container. All lookup functions operate on the container's pidspace,
hence this approach should remove the conversion functions and the two
logical domains, which seem to be big problem with the first approach.

Alan Cox made a very important statement, namely there is no reason that
we require unique pids in the kernel, even gave some examples where it isn't
true today. What becomes however a problem is the fact that now rather then
storing the pid as a reference, we need to now store the <pid,container> pair,
or more elegantly the pointer to the task. This later part unfortunately
has the problem of expired references to task structs.

I think that Eric's patch on weak task references, if I understood it correctly,
might come in handy. Instead of storing the task pointer, one stores the
task reference.

What needs to be solved is the container spawner transition into a new container
just like in the pid virtualization case.

----------------

Do people agree on these two characterizations and if so lets agree on what is
the right approach. From Linus's and Alan's comments I read they favor exploring
two, because of the problems listed with the first approach.

----------------

Other issues..

Herbert, brought up the issue of container hierarchy. Again, I think different
things come to mind here.

If indeed the containers are fully isolated from each other then its seems more
a lifecycle management. E.g. migration of a container implies migration of all
its children containers.
From a kernel perspective, containers form a flat hierarchy.
From a user's perspective one might allow certain operations (e.g. running the
process list of another container) and here constraints can be enforced.

e.g. ps --show-container mycontainer will show a specific containers
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1256 is a reply to message #1254] Fri, 03 February 2006 18:36 Go to previous messageGo to next message
Rik van Riel is currently offline  Rik van Riel
Messages: 11
Registered: February 2006
Junior Member
On Fri, 3 Feb 2006, Linus Torvalds wrote:

> So I think at least 1/5 (and quite frankly, 2-3/5 look that way too) are
> things that we can (and probably should) merge quickly, so that people
> can then actually look at the differences in the places that they may
> actually disagree about.

Agreed. There are a lot of common bits between the various
virtualization solutions, if we can get those out of the way
it should be a lot easier to get the last bits done...

--
All Rights Reversed
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1257 is a reply to message #1251] Fri, 03 February 2006 18:55 Go to previous messageGo to next message
Jeff Garzik is currently offline  Jeff Garzik
Messages: 9
Registered: February 2006
Junior Member
Dave Hansen wrote:
> On Fri, 2006-02-03 at 09:49 -0800, Linus Torvalds wrote:
>
>>One thing I don't particularly like is some of the naming. To me "vps"
>>doesn't sound particularly generic or logical. I realize that it probably
>>makes perfect sense to you (and I assume it just means "virtual private
>>servers"), but especially if you see patches 1-3 to really be independent
>>of any "actual" virtualization code that is totally generic, I'd actually
>>prefer a less specialized name.
>
>
> I just did a global s/vps/container/ and it looks pretty reasonable, at
> least from my point of view.

I would have chosen the much shorter "box" or "jar", but that's just me :)


> "tsk->owner_container" That makes it sound like a pointer to the "task
> owner's container". How about "owning_container"? The "container
> owning this task". Or, maybe just "container"?

slip 'parent' in there...


> Any particular reason for the "u32 id" in the vps_info struct as opposed
> to one of the more generic types? Do we want to abstract this one in
> the same way we do pid_t?
>
> The "host" in "host_container_info" doesn't mean much to me. Though, I
> guess it has some context in the UML space. Would "init_container_info"
> or "root_container_info" be more descriptive?

probably

Jeff
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1258 is a reply to message #1251] Fri, 03 February 2006 19:18 Go to previous messageGo to next message
Hubertus Franke is currently offline  Hubertus Franke
Messages: 16
Registered: February 2006
Junior Member
Dave Hansen wrote:
> On Fri, 2006-02-03 at 09:49 -0800, Linus Torvalds wrote:
>
>>One thing I don't particularly like is some of the naming. To me "vps"
>>doesn't sound particularly generic or logical. I realize that it probably
>>makes perfect sense to you (and I assume it just means "virtual private
>>servers"), but especially if you see patches 1-3 to really be independent
>>of any "actual" virtualization code that is totally generic, I'd actually
>>prefer a less specialized name.
>
>
> I just did a global s/vps/container/ and it looks pretty reasonable, at
> least from my point of view.
>
> Couple of minor naming nitpick questions, though. Is vps/container_info
> really what we want to call it? It seems to me to be the basis for a
> real "container", without the _info part.
>
> "tsk->owner_container" That makes it sound like a pointer to the "task
> owner's container". How about "owning_container"? The "container
> owning this task". Or, maybe just "container"?
>
> Any particular reason for the "u32 id" in the vps_info struct as opposed
> to one of the more generic types? Do we want to abstract this one in
> the same way we do pid_t?
>
> The "host" in "host_container_info" doesn't mean much to me. Though, I
> guess it has some context in the UML space. Would "init_container_info"
> or "root_container_info" be more descriptive?
>
> Lastly, is this a place for krefs? I don't see a real need for a
> destructor yet, but the idea is fresh in my mind.
>
> How does the attached patch look?
>

Looks good to me ...

Similar to parts of our patch set, so overlap is good that means on
large parts we agree.
Let's go with Dave's adaption, since it already addresses some of
Linus's concerns and I start moving the pid isolation (in contrast to
pid virtualization) over this.

-- Hubertus
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1259 is a reply to message #1251] Fri, 03 February 2006 19:56 Go to previous messageGo to next message
Hubertus Franke is currently offline  Hubertus Franke
Messages: 16
Registered: February 2006
Junior Member
missed the typedef on container ..
I am still on 2.6.15.2

> Couple of minor naming nitpick questions, though. Is vps/container_info
> really what we want to call it? It seems to me to be the basis for a
> real "container", without the _info part.
>
> "tsk->owner_container" That makes it sound like a pointer to the "task
> owner's container". How about "owning_container"? The "container
> owning this task". Or, maybe just "container"?
>
> Any particular reason for the "u32 id" in the vps_info struct as opposed
> to one of the more generic types? Do we want to abstract this one in
> the same way we do pid_t?
>
> The "host" in "host_container_info" doesn't mean much to me. Though, I
> guess it has some context in the UML space. Would "init_container_info"
> or "root_container_info" be more descriptive?
>
> Lastly, is this a place for krefs? I don't see a real need for a
> destructor yet, but the idea is fresh in my mind.
>
> How does the attached patch look?

here is that adapted ....

Index: linux-2.6.15/include/linux/container.h
============================================================ =======
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.15/include/linux/container.h 2006-02-03 14:52:12.000000000
-0500
@@ -0,0 +1,42 @@
+#ifndef __LINUX_CONTAINER_H_
+#define __LINUX_CONTAINER_H_
+
+#include <asm/types.h>
+#include <asm/atomic.h>
+
+struct task_struct;
+
+struct container {
+ u32 id;
+ struct task_struct *init_task;
+ atomic_t refcnt;
+};
+
+extern struct container root_container;
+
+static inline struct container*
+get_container(struct container *container)
+{
+ atomic_inc(&container->refcnt);
+ return container;
+}
+
+static inline void put_container(struct container *container)
+{
+ atomic_dec(&container->refcnt);
+}
+
+#include <linux/sched.h>
+#include <asm/current.h>
+
+static inline struct container*
+set_current_container(struct container *container)
+{
+ struct container *ret;
+
+ ret = current->container;
+ current->container = container;
+ return ret;
+}
+
+#endif
Index: linux-2.6.15/include/linux/init_task.h
============================================================ =======
--- linux-2.6.15.orig/include/linux/init_task.h 2006-02-03
14:50:39.000000000 -0500
+++ linux-2.6.15/include/linux/init_task.h 2006-02-03 14:52:12.000000000
-0500
@@ -3,6 +3,7 @@

#include <linux/file.h>
#include <linux/rcupdate.h>
+#include <linux/container.h>

#define INIT_FDTABLE \
{ \
@@ -121,6 +122,7 @@ extern struct group_info init_groups;
.journal_info = NULL, \
.cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
.fs_excl = ATOMIC_INIT(0), \
+ .container = &root_container, \
}


Index: linux-2.6.15/include/linux/sched.h
============================================================ =======
--- linux-2.6.15.orig/include/linux/sched.h 2006-02-03
14:50:39.000000000 -0500
+++ linux-2.6.15/include/linux/sched.h 2006-02-03 14:52:12.000000000
-0500
@@ -682,6 +682,7 @@ static inline void prefetch_stack(struct

struct audit_context; /* See audit.c */
struct mempolicy;
+struct container;

struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
@@ -830,6 +831,7 @@ struct task_struct {
struct backing_dev_info *backing_dev_info;

struct io_context *io_context;
+ struct container *container;

unsigned long ptrace_message;
siginfo_t *last_siginfo; /* For ptrace use. */
@@ -859,6 +861,8 @@ struct task_struct {
atomic_t fs_excl; /* holding fs exclusive resources */
};

+#define current_container() (current->container)
+
static inline pid_t process_group(struct task_struct *tsk)
{
return tsk->signal->pgrp;
Index: linux-2.6.15/init/main.c
============================================================ =======
--- linux-2.6.15.orig/init/main.c 2006-02-03 14:50:39.000000000 -0500
+++ linux-2.6.15/init/main.c 2006-02-03 14:52:12.000000000 -0500
@@ -47,8 +47,8 @@
#include <linux/rmap.h>
#include <linux/mempolicy.h>
#include <linux/key.h>
+#include <linux/container.h>
#include <net/sock.h>
-
#include <asm/io.h>
#include <asm/bugs.h>
#include <asm/setup.h>
@@ -438,6 +438,13 @@ void __init parse_early_param(void)
done = 1;
}

+struct container root_container = {
+ .id = 0,
+ .init_task = &init_task,
+ .refcnt = ATOMIC_INIT(1),
+};
+
+EXPORT_SYMBOL(root_container);
/*
* Activate the first processor.
*/
Index: linux-2.6.15/kernel/exit.c
============================================================ =======
--- linux-2.6.15.orig/kernel/exit.c 2006-02-03 14:50:39.000000000 -0500
+++ linux-2.6.15/kernel/exit.c 2006-02-03 14:52:12.000000000 -0500
@@ -29,6 +29,7 @@
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/cn_proc.h>
+#include <linux/container.h>

#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -106,6 +107,7 @@ repeat:
spin_unlock(&p->proc_lock);
proc_pid_flush(proc_dentry);
release_thread(p);
+ put_container(p->container);
put_task_struct(p);

p = leader;
Index: linux-2.6.15/kernel/fork.c
============================================================ =======
--- linux-2.6.15.orig/kernel/fork.c 2006-02-03 14:50:39.000000000 -0500
+++ linux-2.6.15/kernel/fork.c 2006-02-03 14:52:12.000000000 -0500
@@ -43,6 +43,7 @@
#include <linux/rmap.h>
#include <linux/acct.h>
#include <linux/cn_proc.h>
+#include <linux/container.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -1121,6 +1122,7 @@ static task_t *copy_process(unsigned lon
p->ioprio = current->ioprio;

SET_LINKS(p);
+ get_container(p->container);
if (unlikely(p->ptrace & PT_PTRACED))
__ptrace_link(p, current->parent);
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1260 is a reply to message #1239] Fri, 03 February 2006 20:34 Go to previous messageGo to next message
Hubertus Franke is currently offline  Hubertus Franke
Messages: 16
Registered: February 2006
Junior Member
Greg KH wrote:
> On Fri, Feb 03, 2006 at 10:34:01AM -0800, Dave Hansen wrote:
>
>>Lastly, is this a place for krefs? I don't see a real need for a
>>destructor yet, but the idea is fresh in my mind.
>
>
> Well, what happens when you drop the last reference to this container?
> Right now, your patch doesn't cause anything to happen, and if that's
> acceptable, then fine, you don't need to use a struct kref.
>
> But if not, then why have a reference count at all? :)
>
> thanks,
>
> greg k-h
>

Greg ...

In our pid virtualization patches we removed the object automatically.
I presume that Kirill does the same .... although the code for that is not released.

I am working on converting our pid stuff so to follow the recommendation of Linux/Alan
to just do <container,pid> isolation through pidspaces.

I could release an allocation/ before hand. Based on Kirill's 1/5 patch adoption
(which took Linus's & Dave's naming comments into account)

How do we want to create the container?
In our patch we did it through a /proc/container filesystem.
Which created the container object and then on fork/exec switched over.

How about an additional sys_exec_container( exec_args + "container_name").
This does all the work like exec, but creates new container
with name "..." and attaches task to new container.
If name exists, an error -EEXIST will be raised !

-- Hubertus
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1263 is a reply to message #1251] Fri, 03 February 2006 20:19 Go to previous messageGo to next message
Greg KH is currently offline  Greg KH
Messages: 27
Registered: February 2006
Junior Member
On Fri, Feb 03, 2006 at 10:34:01AM -0800, Dave Hansen wrote:
>
> Lastly, is this a place for krefs? I don't see a real need for a
> destructor yet, but the idea is fresh in my mind.

Well, what happens when you drop the last reference to this container?
Right now, your patch doesn't cause anything to happen, and if that's
acceptable, then fine, you don't need to use a struct kref.

But if not, then why have a reference count at all? :)

thanks,

greg k-h
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1280 is a reply to message #1254] Sun, 05 February 2006 14:52 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

>> Do you have any other ideas/comments on this?
>> I will send additional IPC/filesystems virtualization patches a bit later.
>
> I think that a patch like this - particularly just the 1/5 part - makes
> total sense, because regardless of any other details of virtualization,
> every single scheme is going to need this.
>
> So I think at least 1/5 (and quite frankly, 2-3/5 look that way too) are
> things that we can (and probably should) merge quickly, so that people can
> then actually look at the differences in the places that they may actually
> disagree about.
Can we merge also proc/sysfs/network/netfilters virtualization?

> In other words, I personally would have called them "container" or
> something similar, rather than "vps_info". See? From a logical
> implementation standpoint, the fact that it is right now most commonly
> used for VPS hosting is totally irrelevant to the code, no?
>
> (And hey, maybe your "vps" means something different. In which case my
> argument makes even more sense ;)
virtual private sandbox :)

Actually, we call them "virtual environments" (VE) in OpenVZ. It is more
than abstract and have a nice brief name. If this suits you - I will be
happy to commit patches as is :)

other variants:
virtual context (vc, vctx),
virtual containers (vc).

I personally don't like "container", since it is too long and I see no
good abreviations for this...

Kirill
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1281 is a reply to message #1251] Sun, 05 February 2006 15:05 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

> I just did a global s/vps/container/ and it looks pretty reasonable, at
> least from my point of view.
>
> Couple of minor naming nitpick questions, though. Is vps/container_info
> really what we want to call it? It seems to me to be the basis for a
> real "container", without the _info part.
Can be ommited.

> "tsk->owner_container" That makes it sound like a pointer to the "task
> owner's container". How about "owning_container"? The "container
> owning this task". Or, maybe just "container"?
This is why I don't like "container" name.

Please, also note, in OpenVZ we have 2 pointers on task_struct:
One is owner of a task (owner_env), 2nd is a current context (exec_env).
exec_env pointer is used to avoid adding of additional argument to all
the functions where current context is required.

Linus, does such approach makes sense to you or you prefer us to add
additional args to functions where container pointer is needed? This
looks undersiable for embedded guys and increases stack usage/code size
when virtualization is off, which doesn't look good for me.

> Any particular reason for the "u32 id" in the vps_info struct as opposed
> to one of the more generic types? Do we want to abstract this one in
> the same way we do pid_t?
VPS ID is passed to/from user space APIs and when you have a cluster
with different archs and VPSs it is better to have something in common
for managing this.

> The "host" in "host_container_info" doesn't mean much to me. Though, I
> guess it has some context in the UML space. Would "init_container_info"
> or "root_container_info" be more descriptive?
init_container?
(Again, this is why container doesn't sound for me :) )

> Lastly, is this a place for krefs? I don't see a real need for a
> destructor yet, but the idea is fresh in my mind.
I don't see much need for krefs, do you?
In OpenVZ we have 2-level refcounting (mentioned recently by Linus as in
mm). Process counter is used to decide when container should
collapse/cleanuped and real refcounter is used to free the structures
which can be referenced from somewhere else.

Also there is some probability that use of krefs will result in sysfs :)))

> How does the attached patch look?
I will resend all 5 patches tomorrow. And maybe more.

Kirill
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1282 is a reply to message #1263] Sun, 05 February 2006 15:10 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

> On Fri, Feb 03, 2006 at 10:34:01AM -0800, Dave Hansen wrote:
>> Lastly, is this a place for krefs? I don't see a real need for a
>> destructor yet, but the idea is fresh in my mind.
>
> Well, what happens when you drop the last reference to this container?
> Right now, your patch doesn't cause anything to happen, and if that's
> acceptable, then fine, you don't need to use a struct kref.
>
> But if not, then why have a reference count at all? :)

Please note, this patch introduces only small parts of it.
It doesn't introduce the code which creates containers/destroys them etc.

As I mentioned in another email:
In OpenVZ we have 2-level refcounting (mentioned recently by Linus as in
mm). Process counter is used to decide when container should
collapse/cleanuped and real refcounter is used to free the structures
which can be referenced from somewhere else.

Kirill
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1283 is a reply to message #1260] Sun, 05 February 2006 15:11 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

> How do we want to create the container?
> In our patch we did it through a /proc/container filesystem.
> Which created the container object and then on fork/exec switched over.
this doesn't look good for a full virtualization solution, since proc
should be virtualized as well :)

> How about an additional sys_exec_container( exec_args + "container_name").
> This does all the work like exec, but creates new container
> with name "..." and attaches task to new container.
> If name exists, an error -EEXIST will be raised !
Why do you need exec?

Kirill
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1285 is a reply to message #1283] Sun, 05 February 2006 15:39 Go to previous messageGo to next message
Hubertus Franke is currently offline  Hubertus Franke
Messages: 16
Registered: February 2006
Junior Member
Kirill Korotaev wrote:
>> How do we want to create the container?
>> In our patch we did it through a /proc/container filesystem.
>> Which created the container object and then on fork/exec switched over.
>
> this doesn't look good for a full virtualization solution, since proc
> should be virtualized as well :)

Just lazy's man's development version of a faked sys_call to create the container
without having to go through all architectures ...
Nothing permanent..


>
>> How about an additional sys_exec_container( exec_args +
>> "container_name").
>> This does all the work like exec, but creates new container
>> with name "..." and attaches task to new container.
>> If name exists, an error -EEXIST will be raised !
>
> Why do you need exec?

(a) how do you create/destroy a container
(b) how do you attach yourself to it?

-- Hubertus
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1289 is a reply to message #1239] Mon, 06 February 2006 00:56 Go to previous messageGo to next message
Benjamin Herrenschmid is currently offline  Benjamin Herrenschmid
Messages: 10
Registered: February 2006
Junior Member
On Fri, 2006-02-03 at 19:58 +0300, Kirill Korotaev wrote:

> +static inline vps_t get_vps(vps_t vps)
> +{
> + atomic_inc(&vps->refcnt);
> + return vps;
> +}
> +
> +static inline void put_vps(vps_t vps)
> +{
> + atomic_dec(&vps->refcnt);
> +}

I'm not too sure about the refcounting here .. you never destroy the
object ? Also, why introduce your own refcounting mecanism instead of
using existing ones ? You could probably use at least a kref to get a
nice refcount + destructor instead of home made atomics based. Maybe
some higher level structure if you think it makes sense (not too sure
what this virtualization stuff is about so I can't comment on what data
structure is appropriate here).

Cheers,
Ben.
Re: [RFC][PATCH 3/5] Virtualization/containers: UTSNAME [message #1290 is a reply to message #1241] Mon, 06 February 2006 08:21 Go to previous messageGo to next message
ebiederm is currently offline  ebiederm
Messages: 1354
Registered: February 2006
Senior Member
I am disturbed by the introduction of #defines like current_vps() and
vps_utsname.

Magic lower case #defines are usually a bad idea.

These defines hide the cost of the operations you are performing.
At that point you might as well name the thing system_utsname
so you don't have to change the code.

And of course you failed to change several references to
system_utsname.

Eric
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1291 is a reply to message #1239] Mon, 06 February 2006 08:31 Go to previous messageGo to next message
ebiederm is currently offline  ebiederm
Messages: 1354
Registered: February 2006
Senior Member
Kirill Korotaev <dev@openvz.org> writes:

> This patch introduces some abstract container/VPS kernel structure and tiny
> amount of operations on it.
>
> Patches following this one will be used for virtualization of some resources
> based on this container infrastructure, including those VPIDs patches I sent
> before.
>
> What is important here is that:
> - each VPS has unique ID
> - each process in kernel can belong to one VPS only

Next time could you please a diffstat for your patches,
Thanks

Eric
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1292 is a reply to message #1254] Mon, 06 February 2006 08:39 Go to previous messageGo to next message
ebiederm is currently offline  ebiederm
Messages: 1354
Registered: February 2006
Senior Member
Linus Torvalds <torvalds@osdl.org> writes:

> On Fri, 3 Feb 2006, Kirill Korotaev wrote:
>>
>> Do you have any other ideas/comments on this?
>> I will send additional IPC/filesystems virtualization patches a bit later.
>
> I think that a patch like this - particularly just the 1/5 part - makes
> total sense, because regardless of any other details of virtualization,
> every single scheme is going to need this.

I strongly disagree with this approach. I think Al Viro got it
right when he created a separate namespace for filesystems.

First this presumes an all or nothing interface. But that is not
what people are doing. Different people want different subsets
of the functionality. For the migration work I am doing having
multiple meanings for the same uid isn't interesting.

Secondly by implementing this in one big chunk there is no
migration path when you want to isolate an additional part of the
kernel interface.

So I really think an approach that allows for incremental progress
that allows for different subsets of this functionality to
be used is a better approach. In clone we already have
a perfectly serviceable interface for that and I have
seen no one refute that. I'm not sure I have seen anyone
get it though.



My apologies for the late reply I didn't see this thread until
just a couple of minutes ago. linux-kernel can be hard to
follow when you aren't cc'd.


Patches hopefully sometime in the next 24hours. So hopefully
conversation can be carried forward in a productive manner.

Eric
Re: [RFC][PATCH 3/5] Virtualization/containers: UTSNAME [message #1293 is a reply to message #1290] Mon, 06 February 2006 08:51 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

> I am disturbed by the introduction of #defines like current_vps() and
> vps_utsname.
>
> Magic lower case #defines are usually a bad idea.
It is not magic defines, this is done intentionally.
You can take a more detailed view into OpenVZ sources, but the idea is
to make kernel compilable without virtualization.
When virtualization is OFF all this macros are defined to trivial
variables/defines which make it an old good kernel.

For example current_vps() should be (&init_vps), i.e. host system
environment only.

vps_utsname will be defined as system_utsname and so on.

> These defines hide the cost of the operations you are performing.
> At that point you might as well name the thing system_utsname
> so you don't have to change the code.
You mean to have variable and define with the same names?
it is not always good. It works fine, when both are defined in the same
file, but poorly when it is scattered all around...

> And of course you failed to change several references to
> system_utsname.
which one? Maybe intentionally? ;-)
Kirill
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1294 is a reply to message #1292] Mon, 06 February 2006 08:58 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

>>I think that a patch like this - particularly just the 1/5 part - makes
>>total sense, because regardless of any other details of virtualization,
>>every single scheme is going to need this.
> I strongly disagree with this approach. I think Al Viro got it
> right when he created a separate namespace for filesystems.
These patch set introduces separate namespaces as those in filesystems.
What exactly you don't like in this approach? Can you be more specific?

> First this presumes an all or nothing interface. But that is not
> what people are doing. Different people want different subsets
> of the functionality. For the migration work I am doing having
> multiple meanings for the same uid isn't interesting.
What do you mean by that? That you don't care about virtualization of
UIDs? So your migration doesn't care at all whether 2 systems have same
uids? Do you keep /etc/passwd in sync when do migration?
Only full virtualization allows to migrate applications without bugs and
different effects.

> Secondly by implementing this in one big chunk there is no
> migration path when you want to isolate an additional part of the
> kernel interface.
>
> So I really think an approach that allows for incremental progress
> that allows for different subsets of this functionality to
> be used is a better approach. In clone we already have
> a perfectly serviceable interface for that and I have
> seen no one refute that. I'm not sure I have seen anyone
> get it though.
Just introduce config option for each virtualization functionality.
That's it.

> My apologies for the late reply I didn't see this thread until
> just a couple of minutes ago. linux-kernel can be hard to
> follow when you aren't cc'd.
>
>
> Patches hopefully sometime in the next 24hours. So hopefully
> conversation can be carried forward in a productive manner.
Ok. I will remake them either :)

Kirill
Re: Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1295 is a reply to message #1289] Mon, 06 February 2006 09:01 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

>>+static inline vps_t get_vps(vps_t vps)
>>+{
>>+ atomic_inc(&vps->refcnt);
>>+ return vps;
>>+}
>>+
>>+static inline void put_vps(vps_t vps)
>>+{
>>+ atomic_dec(&vps->refcnt);
>>+}
>
>
> I'm not too sure about the refcounting here .. you never destroy the
> object ? Also, why introduce your own refcounting mecanism instead of
> using existing ones ? You could probably use at least a kref to get a
> nice refcount + destructor instead of home made atomics based. Maybe
> some higher level structure if you think it makes sense (not too sure
> what this virtualization stuff is about so I can't comment on what data
> structure is appropriate here).
I replied to this in another email. Briefly:
this patch set doesn't introduce VPS/container creation/destroy
interface yet. Only small parts.
krefs are not needed here.

Kirill
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1296 is a reply to message #1285] Mon, 06 February 2006 09:06 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

> Just lazy's man's development version of a faked sys_call to create the
> container
> without having to go through all architectures ...
> Nothing permanent..

>>> How about an additional sys_exec_container( exec_args +
>>> "container_name").
>>> This does all the work like exec, but creates new container
>>> with name "..." and attaches task to new container.
>>> If name exists, an error -EEXIST will be raised !
>>
>>
>> Why do you need exec?
>
>
> (a) how do you create/destroy a container
> (b) how do you attach yourself to it?
a)
create via syscall.
destroyed when the last process dies in container.
b)
syscall which changes container. you need to close unneeded
resources, change context and fork(). That's it.
The whole process is exactly the same as if you changes UID.
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1297 is a reply to message #1294] Mon, 06 February 2006 09:19 Go to previous messageGo to next message
ebiederm is currently offline  ebiederm
Messages: 1354
Registered: February 2006
Senior Member
Kirill Korotaev <dev@sw.ru> writes:

>>> I think that a patch like this - particularly just the 1/5 part - makes total
>>> sense, because regardless of any other details of virtualization, every
>>> single scheme is going to need this.
>> I strongly disagree with this approach. I think Al Viro got it
>> right when he created a separate namespace for filesystems.

> These patch set introduces separate namespaces as those in filesystems.
> What exactly you don't like in this approach? Can you be more specific?

That you placed the namespaces in a separate structure from task_struct.
That part seems completely unnecessary, that and the addition of a
global id in a completely new namespace that will be a pain to virtualize
when it's time comes.

>> First this presumes an all or nothing interface. But that is not
>> what people are doing. Different people want different subsets
>> of the functionality. For the migration work I am doing having
>> multiple meanings for the same uid isn't interesting.

> What do you mean by that? That you don't care about virtualization of UIDs? So
> your migration doesn't care at all whether 2 systems have same uids? Do you keep
> /etc/passwd in sync when do migration?

It is already in sync. When the only interesting target is inside of
a cluster it is trivial to handle. In addition frequently there is
only one job on a given machine at a time so there is no other
interesting user of UIDs.

> Only full virtualization allows to migrate applications without bugs and
> different effects.

The requirement is that for every global identifier you have a
namespace where you will not have a conflict in definition when you
restore that identifier. (Either that or you have to update all
references to that identifier which is usually to hideous to take
seriously)

Virtualization has nothing to do with it. It is all about
non-conflicting names. The kernel already has everything virtualized.

>> Secondly by implementing this in one big chunk there is no
>> migration path when you want to isolate an additional part of the
>> kernel interface.
>> So I really think an approach that allows for incremental progress
>> that allows for different subsets of this functionality to
>> be used is a better approach. In clone we already have
>> a perfectly serviceable interface for that and I have
>> seen no one refute that. I'm not sure I have seen anyone
>> get it though.
> Just introduce config option for each virtualization functionality. That's it.

If you have different pointers for each one that is sane. I meant in
the api. I did not see patches 4/5 and 5/5. But I presumed since
you allocated a great global structure everything was turned on
off with that.

>> My apologies for the late reply I didn't see this thread until
>> just a couple of minutes ago. linux-kernel can be hard to
>> follow when you aren't cc'd.
>> Patches hopefully sometime in the next 24hours. So hopefully
>> conversation can be carried forward in a productive manner.
> Ok. I will remake them either :)

My not seeing your user space api is one of the problems with incomplete
patches which seem to plague this conversation.

Eric
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1307 is a reply to message #1281] Mon, 06 February 2006 16:35 Go to previous messageGo to next message
Dave Hansen is currently offline  Dave Hansen
Messages: 240
Registered: October 2005
Senior Member
On Sun, 2006-02-05 at 18:05 +0300, Kirill Korotaev wrote:
> > "tsk->owner_container" That makes it sound like a pointer to the "task
> > owner's container". How about "owning_container"? The "container
> > owning this task". Or, maybe just "container"?
> This is why I don't like "container" name.

I worry that using something like "vps" obfuscates the real meaning a
bit. The reason that "owner_vps" doesn't sound weird is that people, by
default, usually won't understand what a "vps" is.

(if you like acronyms a lot, I'm sure I can find a job for you at IBM or
in the US military :)

> Please, also note, in OpenVZ we have 2 pointers on task_struct:
> One is owner of a task (owner_env), 2nd is a current context (exec_env).
> exec_env pointer is used to avoid adding of additional argument to all
> the functions where current context is required.

That makes sense. However, are there many cases in the kernel where a
task ends up doing something temporary like this:

tsk->exec_vnc = bar;
do_something_here(task);
tsk->exec_vnc = foo;

If that's the case very often, we probably want to change the APIs, just
to make the common action explicit. If it never happens, or is a
rarity, I think it should be just fine.

> > Any particular reason for the "u32 id" in the vps_info struct as opposed
> > to one of the more generic types? Do we want to abstract this one in
> > the same way we do pid_t?
> VPS ID is passed to/from user space APIs and when you have a cluster
> with different archs and VPSs it is better to have something in common
> for managing this.

I guess it does keep you from running into issues with mixing 32 and
64-bit processes. But, haven't we solved those problems already? Is it
just a pain?

> > Lastly, is this a place for krefs? I don't see a real need for a
> > destructor yet, but the idea is fresh in my mind.
> I don't see much need for krefs, do you?
> In OpenVZ we have 2-level refcounting (mentioned recently by Linus as in
> mm). Process counter is used to decide when container should
> collapse/cleanuped and real refcounter is used to free the structures
> which can be referenced from somewhere else.

It sounds to me like anything that needs to have an action taken when a
refcount reaches zero is a good candidate for a kref. Both of those
uses sound like they need that. Probably not too big of a deal, though.

-- Dave
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1308 is a reply to message #1297] Mon, 06 February 2006 16:37 Go to previous messageGo to next message
Dave Hansen is currently offline  Dave Hansen
Messages: 240
Registered: October 2005
Senior Member
On Mon, 2006-02-06 at 02:19 -0700, Eric W. Biederman wrote:
> That you placed the namespaces in a separate structure from
> task_struct.
> That part seems completely unnecessary, that and the addition of a
> global id in a completely new namespace that will be a pain to
> virtualize
> when it's time comes.

Could you explain a bit why the container ID would need to be
virtualized?

-- Dave
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1309 is a reply to message #1307] Mon, 06 February 2006 16:50 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

Hello,

> I worry that using something like "vps" obfuscates the real meaning a
> bit. The reason that "owner_vps" doesn't sound weird is that people, by
> default, usually won't understand what a "vps" is.
container or context sounds the same :) it is impossible to feel this
notion naturally without getting into details. IMHO.

> (if you like acronyms a lot, I'm sure I can find a job for you at IBM or
> in the US military :)
We can talk about it separetely :)))

>>Please, also note, in OpenVZ we have 2 pointers on task_struct:
>>One is owner of a task (owner_env), 2nd is a current context (exec_env).
>>exec_env pointer is used to avoid adding of additional argument to all
>>the functions where current context is required.
>
> That makes sense. However, are there many cases in the kernel where a
> task ends up doing something temporary like this:
>
> tsk->exec_vnc = bar;
> do_something_here(task);
> tsk->exec_vnc = foo;
>
> If that's the case very often, we probably want to change the APIs, just
> to make the common action explicit. If it never happens, or is a
> rarity, I think it should be just fine.
It is quite rare. In IRQ, softIRQ, TCP/IP stack and some timers. Not much.

>>VPS ID is passed to/from user space APIs and when you have a cluster
>>with different archs and VPSs it is better to have something in common
>>for managing this.
> I guess it does keep you from running into issues with mixing 32 and
> 64-bit processes. But, haven't we solved those problems already? Is it
> just a pain?
VPSs can live in clusters. It is good to have one VPS ID space.

Kirill
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1310 is a reply to message #1281] Mon, 06 February 2006 16:56 Go to previous messageGo to next message
Linus Torvalds is currently offline  Linus Torvalds
Messages: 11
Registered: February 2006
Junior Member
On Sun, 5 Feb 2006, Kirill Korotaev wrote:
>
> Please, also note, in OpenVZ we have 2 pointers on task_struct:
> One is owner of a task (owner_env), 2nd is a current context (exec_env).
> exec_env pointer is used to avoid adding of additional argument to all the
> functions where current context is required.

That naming _has_ to change.

"exec" has a very clear meaning in unix: it talks about the notion of
switching to another process image, or perhaps the bit that says that a
file contains an image that can be executed. It has nothing to do with
"current".

What you seem to be talking about is the _effective_ environment. Ie the
same way we have "uid" and "euid", you'd have a "container" and the
"effective container".

The "owner" name also makes no sense. The security context doesn't "own"
tasks. A task is _part_ of a context.

So if some people don't like "container", how about just calling it
"context"? The downside of that name is that it's very commonly used in
the kenel, because a lot of things have "contexts". That's why "container"
would be a lot better.

I'd suggest

current->container - the current EFFECTIVE container
current->master_container - the "long term" container.

(replace "master" with some other non-S&M term if you want)

(It would make sense to just have the prepend-"e" semantics of uid/gid,
but the fact is, "euid/egid" has a long unix history and is readable only
for that reason. The same wouldn't be true of containers. And
"effective_container" is probably too long to use for the field that is
actually the _common_ case. Thus the above suggestion).

Linus
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1311 is a reply to message #1310] Mon, 06 February 2006 17:19 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

>>Please, also note, in OpenVZ we have 2 pointers on task_struct:
>>One is owner of a task (owner_env), 2nd is a current context (exec_env).
>>exec_env pointer is used to avoid adding of additional argument to all the
>>functions where current context is required.
>
>
> That naming _has_ to change.
I agree.

> "exec" has a very clear meaning in unix: it talks about the notion of
> switching to another process image, or perhaps the bit that says that a
> file contains an image that can be executed. It has nothing to do with
> "current".
> What you seem to be talking about is the _effective_ environment. Ie the
> same way we have "uid" and "euid", you'd have a "container" and the
> "effective container".
agree on this either. Good point.

> The "owner" name also makes no sense. The security context doesn't "own"
> tasks. A task is _part_ of a context.

> So if some people don't like "container", how about just calling it
> "context"? The downside of that name is that it's very commonly used in
> the kenel, because a lot of things have "contexts". That's why "container"
> would be a lot better.
>
> I'd suggest
>
> current->container - the current EFFECTIVE container
> current->master_container - the "long term" container.
>
> (replace "master" with some other non-S&M term if you want)
maybe task_container? i.e. where task actually is.
Sounds good for you?

The only problem with such names I see, that task will be an exception
then compared to other objects. I mean, on other objects field
"container" will mean the container which object is part of. But for
tasks this will mean effective one. Only tasks need these 2 containers
pointers and I would prefer having the common one to be called simply
"container".

Maybe then
current->econtainer - effective container
current->container - "long term" container

> (It would make sense to just have the prepend-"e" semantics of uid/gid,
> but the fact is, "euid/egid" has a long unix history and is readable only
> for that reason. The same wouldn't be true of containers. And
> "effective_container" is probably too long to use for the field that is
> actually the _common_ case. Thus the above suggestion).
Your proposal looks quite nice.
Then we will have eventually "container" field on objects (not on task
only) which sounds good to me. I will prepare patches right now.

Kirill
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1314 is a reply to message #1308] Mon, 06 February 2006 18:37 Go to previous messageGo to next message
ebiederm is currently offline  ebiederm
Messages: 1354
Registered: February 2006
Senior Member
Dave Hansen <haveblue@us.ibm.com> writes:

> On Mon, 2006-02-06 at 02:19 -0700, Eric W. Biederman wrote:
>> That you placed the namespaces in a separate structure from
>> task_struct.
>> That part seems completely unnecessary, that and the addition of a
>> global id in a completely new namespace that will be a pain to
>> virtualize
>> when it's time comes.
>
> Could you explain a bit why the container ID would need to be
> virtualized?

As someone said to me a little bit ago, for migration or checkpointing
ultimately you have to capture the entire user/kernel interface if
things are going to work properly. Now if we add this facility to
the kernel and it is a general purpose facility. It is only a matter
of time before we need to deal with nested containers.

Not considering the case of having nested containers now is just foolish.
Maybe we don't have to implement it yet but not considering it is silly.

As far as I can tell there is a very reasonable chance that when we
are complete there is a very reasonable chance that software suspend
will just be a special case of migration, done complete in user space.
That is one of the more practical examples I can think of where this
kind of functionality would be used.

Eric
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1315 is a reply to message #1314] Mon, 06 February 2006 19:30 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

> As someone said to me a little bit ago, for migration or checkpointing
> ultimately you have to capture the entire user/kernel interface if
> things are going to work properly. Now if we add this facility to
> the kernel and it is a general purpose facility. It is only a matter
> of time before we need to deal with nested containers.
Fully virtualized container is not a matter of virtualized ID - it is
the easiest thing to do actually, but a whole global problem of other
resources virtualization. We can ommit ID for now, if you like it more.

> Not considering the case of having nested containers now is just foolish.
> Maybe we don't have to implement it yet but not considering it is silly.
No one told that it is not considered. In fact PID virtualization send
both by IBM/us is abstract and doesn't care whether containers are
nested or not.

> As far as I can tell there is a very reasonable chance that when we
> are complete there is a very reasonable chance that software suspend
> will just be a special case of migration, done complete in user space.
> That is one of the more practical examples I can think of where this
> kind of functionality would be used.

Kirill
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1335 is a reply to message #1283] Mon, 06 February 2006 22:31 Go to previous messageGo to next message
Cedric Le Goater is currently offline  Cedric Le Goater
Messages: 443
Registered: February 2006
Senior Member
Kirill Korotaev wrote:
>> How do we want to create the container?
>> In our patch we did it through a /proc/container filesystem.
>> Which created the container object and then on fork/exec switched over.
>
> this doesn't look good for a full virtualization solution, since proc
> should be virtualized as well :)

Well, /proc should be "virtualized" or "isolated", how do you expect a
container to work correctly ? plenty of user space tools depend on it.

C.
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1336 is a reply to message #1314] Mon, 06 February 2006 22:40 Go to previous messageGo to next message
Cedric Le Goater is currently offline  Cedric Le Goater
Messages: 443
Registered: February 2006
Senior Member
Eric W. Biederman wrote:

> As someone said to me a little bit ago, for migration or checkpointing
> ultimately you have to capture the entire user/kernel interface if
> things are going to work properly. Now if we add this facility to
> the kernel and it is a general purpose facility. It is only a matter
> of time before we need to deal with nested containers.
>
> Not considering the case of having nested containers now is just foolish.
> Maybe we don't have to implement it yet but not considering it is silly.

That could be restricted. Today, process groups are not nested. Why do you
think nested containers are inevitable ?

> As far as I can tell there is a very reasonable chance that when we
> are complete there is a very reasonable chance that software suspend
> will just be a special case of migration, done complete in user space.

Being able to sofware suspend one container among many would be a very
interesting feature to have.

C.
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1340 is a reply to message #1336] Tue, 07 February 2006 01:57 Go to previous messageGo to next message
ebiederm is currently offline  ebiederm
Messages: 1354
Registered: February 2006
Senior Member
Cedric Le Goater <clg@fr.ibm.com> writes:

> Eric W. Biederman wrote:
>
>> As someone said to me a little bit ago, for migration or checkpointing
>> ultimately you have to capture the entire user/kernel interface if
>> things are going to work properly. Now if we add this facility to
>> the kernel and it is a general purpose facility. It is only a matter
>> of time before we need to deal with nested containers.
>>
>> Not considering the case of having nested containers now is just foolish.
>> Maybe we don't have to implement it yet but not considering it is silly.
>
> That could be restricted. Today, process groups are not nested. Why do you
> think nested containers are inevitable ?

process groups are a completely different kind of beast.
A closer analogy are hierarchical name spaces and mounts.
If we didn't need things like waitpid outside one pid namespace
to wait for a nested namespace they would be complete disjoint
and the implementation would be trivial.


>> As far as I can tell there is a very reasonable chance that when we
>> are complete there is a very reasonable chance that software suspend
>> will just be a special case of migration, done complete in user space.
>
> Being able to sofware suspend one container among many would be a very
> interesting feature to have.

That is what checkpointing. And that is simply the persistent form of
migration.

Eric
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1353 is a reply to message #1310] Tue, 07 February 2006 00:28 Go to previous messageGo to next message
Sam Vilain is currently offline  Sam Vilain
Messages: 73
Registered: February 2006
Member
Linus Torvalds wrote:
> So if some people don't like "container", how about just calling it
> "context"? The downside of that name is that it's very commonly used in
> the kenel, because a lot of things have "contexts". That's why "container"
> would be a lot better.
>
> I'd suggest
>
> current->container - the current EFFECTIVE container
> current->master_container - the "long term" container.
>
> (replace "master" with some other non-S&M term if you want)

Hmm. You actually need a linked list, otherwise you have replaced a one
level flat structure with a two level one, and you miss out on some of
the applications. VServer uses a special structure for this.

Sam.
Re: [RFC][PATCH 1/5] Virtualization/containers: startup [message #1367 is a reply to message #1353] Tue, 07 February 2006 12:19 Go to previous messageGo to previous message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

>> So if some people don't like "container", how about just calling it
>> "context"? The downside of that name is that it's very commonly used
>> in the kenel, because a lot of things have "contexts". That's why
>> "container" would be a lot better.
>>
>> I'd suggest
>>
>> current->container - the current EFFECTIVE container
>> current->master_container - the "long term" container.
>>
>> (replace "master" with some other non-S&M term if you want)
>
>
> Hmm. You actually need a linked list, otherwise you have replaced a one
> level flat structure with a two level one, and you miss out on some of
> the applications. VServer uses a special structure for this.
Nope! :) This is pointer to current/effective container, which can be
anywhere in the hierarchy. list should be inside container struct.

Kirill
Previous Topic: Versioning issue on vzquota-3.0.0-2
Next Topic: Versioning issue on vzquota-3.0.0-2
Goto Forum:
  


Current Time: Sat Oct 25 09:10:15 GMT 2025

Total time taken to generate the page: 0.11368 seconds