| Home » Mailing lists » Devel » [PATCH 0/2] resource control file system - aka containers on top of nsproxy! Goto Forum:
	| 
		
			| [PATCH 0/2] resource control file system - aka containers on top of nsproxy! [message #17556] | Thu, 01 March 2007 13:35  |  
			| 
				
				
					|  Srivatsa Vaddagiri Messages: 241
 Registered: August 2006
 | Senior Member |  |  |  
	| Paul,
	Based on some of the feedback to container patches, I have
respun them to avoid the "container" structure abstraction and instead use
nsproxy structure in the kernel. User interface (which I felt was neat
in your patches) has been retained to be same.
What follows is the core (big) patch and the cpu_acct subsystem to serve
as an example of how to use it. I suspect we can make cpusets also work
on top of this very easily.
Oh and since most of the code is serving the purpose of being a
filesystem, I have renamed the patch to be a resource control file
system - rcfs!
-- 
Regards,
vatsa
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers |  
	|  |  |  
	| 
		
			| [PATCH 1/2] rcfs core patch [message #17557 is a reply to message #17556] | Thu, 01 March 2007 13:45   |  
			| 
				
				
					|  Srivatsa Vaddagiri Messages: 241
 Registered: August 2006
 | Senior Member |  |  |  
	| Heavily based on Paul Menage's (inturn cpuset) work. The big difference
is that the patch uses task->nsproxy to group tasks for resource control
purpose (instead of task->containers).
The patch retains the same user interface as Paul Menage's patches. In
particular, you can have multiple hierarchies, each hierarchy giving a 
different composition/view of task-groups.
(Ideally this patch should have been split into 2 or 3 sub-patches, but
will do that on a subsequent version post)
Signed-off-by : Srivatsa Vaddagiri <vatsa@in.ibm.com>
Signed-off-by : Paul Menage <menage@google.com>
---
 linux-2.6.20-vatsa/include/linux/init_task.h |    4 
 linux-2.6.20-vatsa/include/linux/nsproxy.h   |    5 
 linux-2.6.20-vatsa/init/Kconfig              |   22 
 linux-2.6.20-vatsa/init/main.c               |    1 
 linux-2.6.20-vatsa/kernel/Makefile           |    1 
---
diff -puN include/linux/init_task.h~rcfs include/linux/init_task.h
--- linux-2.6.20/include/linux/init_task.h~rcfs	2007-03-01 14:20:47.000000000 +0530
+++ linux-2.6.20-vatsa/include/linux/init_task.h	2007-03-01 14:20:47.000000000 +0530
@@ -71,6 +71,16 @@
 }
 
 extern struct nsproxy init_nsproxy;
+
+#ifdef CONFIG_RCFS
+#define INIT_RCFS(nsproxy) 						\
+	.list 		= LIST_HEAD_INIT(nsproxy.list),			\
+	.ctlr_data 	= {[ 0 ... CONFIG_MAX_RC_SUBSYS-1 ] = NULL },
+#else
+#define INIT_RCFS(nsproxy)
+#endif
+
+
 #define INIT_NSPROXY(nsproxy) {						\
 	.pid_ns		= &init_pid_ns,					\
 	.count		= ATOMIC_INIT(1),				\
@@ -78,6 +88,7 @@ extern struct nsproxy init_nsproxy;
 	.uts_ns		= &init_uts_ns,					\
 	.mnt_ns		= NULL,						\
 	INIT_IPC_NS(ipc_ns)						\
+	INIT_RCFS(nsproxy)						\
 }
 
 #define INIT_SIGHAND(sighand) {						\
diff -puN include/linux/nsproxy.h~rcfs include/linux/nsproxy.h
--- linux-2.6.20/include/linux/nsproxy.h~rcfs	2007-03-01 14:20:47.000000000 +0530
+++ linux-2.6.20-vatsa/include/linux/nsproxy.h	2007-03-01 14:20:47.000000000 +0530
@@ -28,6 +28,10 @@ struct nsproxy {
 	struct ipc_namespace *ipc_ns;
 	struct mnt_namespace *mnt_ns;
 	struct pid_namespace *pid_ns;
+#ifdef CONFIG_RCFS
+	struct list_head list;
+	void *ctlr_data[CONFIG_MAX_RC_SUBSYS];
+#endif
 };
 extern struct nsproxy init_nsproxy;
 
@@ -35,6 +39,12 @@ struct nsproxy *dup_namespaces(struct ns
 int copy_namespaces(int flags, struct task_struct *tsk);
 void get_task_namespaces(struct task_struct *tsk);
 void free_nsproxy(struct nsproxy *ns);
+#ifdef CONFIG_RCFS
+struct nsproxy *find_nsproxy(struct nsproxy *ns);
+int namespaces_init(void);
+#else
+static inline int namespaces_init(void) { return 0;}
+#endif
 
 static inline void put_nsproxy(struct nsproxy *ns)
 {
diff -puN /dev/null include/linux/rcfs.h
--- /dev/null	2006-02-25 03:06:56.000000000 +0530
+++ linux-2.6.20-vatsa/include/linux/rcfs.h	2007-03-01 14:20:47.000000000 +0530
@@ -0,0 +1,72 @@
+#ifndef _LINUX_RCFS_H
+#define _LINUX_RCFS_H
+
+#ifdef CONFIG_RCFS
+
+/* struct cftype:
+ *
+ * The files in the container filesystem mostly have a very simple read/write
+ * handling, some common function will take care of it. Nevertheless some cases
+ * (read tasks) are special and therefore I define this structure for every
+ * kind of file.
+ *
+ *
+ * When reading/writing to a file:
+ *	- the container to use in file->f_dentry->d_parent->d_fsdata
+ *	- the 'cftype' of the file is file->f_dentry->d_fsdata
+ */
+
+struct inode;
+#define MAX_CFTYPE_NAME 64
+struct cftype {
+	/* By convention, the name should begin with the name of the
+	 * subsystem, followed by a period */
+	char name[MAX_CFTYPE_NAME];
+	int private;
+	int (*open) (struct inode *inode, struct file *file);
+	ssize_t (*read) (struct nsproxy *ns, struct cftype *cft,
+			 struct file *file,
+			 char __user *buf, size_t nbytes, loff_t *ppos);
+	ssize_t (*write) (struct nsproxy *ns, struct cftype *cft,
+			  struct file *file,
+			  const char __user *buf, size_t nbytes, loff_t *ppos);
+	int (*release) (struct inode *inode, struct file *file);
+};
+
+/* resource control subsystem type. See Documentation/rcfs.txt for details */
+
+struct rc_subsys {
+	int (*create)(struct rc_subsys *ss, struct nsproxy *ns,
+			 struct nsproxy *parent);
+	void (*destroy)(struct rc_subsys *ss, struct nsproxy *ns);
+	int (*can_attach)(struct rc_subsys *ss, struct nsproxy *ns,
+				 struct task_struct *tsk);
+	void (*attach)(struct rc_subsys *ss, void *new, void *old,
+				 struct task_struct *tsk);
+	int (*populate)(struct rc_subsys *ss, struct dentry *d);
+	int subsys_id;
+	int active;
+
+#define MAX_CONTAINER_TYPE_NAMELEN 32
+	const char *name;
+
+	/* Protected by RCU */
+	int hierarchy;
+
+	struct list_head sibling;
+};
+
+int rc_register_subsys(struct rc_subsys *subsys);
+/* Add a new file to the given container directory. Should only be
+ * called by subsystems from within a populate() method */
+int rcfs_add_file(struct dentry *d, const struct cftype *cft);
+extern int rcfs_init(void);
+
+#else
+
+static inline int rcfs_init(void) { return 0; }
+
+#endif
+
+
+#endif
diff -puN init/Kconfig~rcfs init/Kconfig
--- linux-2.6.20/init/Kconfig~rcfs	2007-03-01 14:20:47.000000000 +0530
+++ linux-2.6.20-vatsa/init/Kconfig	2007-03-01 16:52:50.000000000 +0530
@@ -238,6 +238,28 @@ config IKCONFIG_PROC
 	  This option enables access to the kernel configuration file
 	  through /proc/config.gz.
 
+config RCFS
+	bool "Resource control file system support"
+	default n
+	help
+	  This option will let you create and manage resource containers,
+	  which can be used to aggregate multiple processes, e.g. for
+	  the purposes of resource tracking.
+
+	  Say N if unsure
+
+config MAX_RC_SUBSYS
+       int "Number of resource control subsystems to support"
+       depends on RCFS
+       range 1 255
+       default 8
+
+config MAX_RC_HIERARCHIES
+       int "Number of rcfs hierarchies to support"
+       depends on RCFS
+       range 2 255
+       default 4
+
 config CPUSETS
 	bool "Cpuset support"
 	depends on SMP
diff -puN init/main.c~rcfs init/main.c
--- linux-2.6.20/init/main.c~rcfs	2007-03-01 14:20:47.000000000 +0530
+++ linux-2.6.20-vatsa/init/main.c	2007-03-01 14:20:47.000000000 +0530
@@ -52,6 +52,7 @@
 #include <linux/lockdep.h>
 #include <linux/pid_namespace.h>
 #include <linux/device.h>
+#include <linux/rcfs.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -512,6 +513,7 @@ asmlinkage void __init start_kernel(void
 	setup_per_cpu_areas();
 	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
 
+	namespaces_init();
 	/*
 	 * Set up the scheduler prior starting any interrupts (such as the
 	 * timer interrupt). Full topology setup happens at smp_init()
@@ -608,6 +610,7 @@ asmlinkage void __init start_kernel(void
 #ifdef CONFIG_PROC_FS
 	proc_root_init();
 #endif
+	rcfs_init();
 	cpuset_init();
 	taskstats_init_early();
 	delayacct_init();
diff -puN kernel/Makefile~rcfs kernel/Makefile
--- linux-2.6.20/kernel/Makefile~rcfs	2007-03-01 14:20:47.000000000 +0530
+++ linux-2.6.20-vatsa/kernel/Makefile	2007-03-01 16:52:50.000000000 +0530
@@ -50,6 +50,7 @@ obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
+obj-$(CONFIG_RCFS) += rcfs.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff -puN kernel/nsproxy.c~rcfs kernel/nsproxy.c
--- linux-2.6.20/kernel/nsproxy.c~rcfs	2007-03-01 14:20:47.000000000 +0530
+++ linux-2.6.20-vatsa/kernel/nsproxy.c	2007-03-01 14:20:47.000000000 +0530
@@ -23,6 +23,11 @@
 
 struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
 
+#ifdef CONFIG_RCFS
+static LIST_HEAD(nslisthead);
+static DEFINE_SPINLOCK(nslistlock);
+#endif
+
 static inline void get_nsproxy(struct nsproxy *ns)
 {
 	atomic_inc(&ns->count);
@@ -71,6 +76,12 @@ struct nsproxy *dup_namespaces(struct ns
 			get_pid_ns(ns->pid_ns);
 	}
 
+#ifdef CONFIG_RCFS
+	spin_lock(&nslistlock);
+	list_add(&ns->list, &nslisthead);
+	spin_unlock(&nslistlock);
+#endif
+
 	return ns;
 }
 
@@ -145,5 +156,44 @@ void free_nsproxy(struct nsproxy *ns)
 		put_ipc_ns(ns->ipc_ns);
 	if (ns->pid_ns)
 		put_pid_ns(ns->pid_ns);
+#ifdef CONFIG_RCFS
+	spin_lock(&nslistlock);
+	list_del(&ns->list);
+	spin_unlock(&nslistlock);
+#endif
 	kfree(ns);
 }
+
+#ifdef CONFIG_RCFS
+struct nsproxy *find_nsproxy(struct nsproxy *target)
+{
+	struct nsproxy *ns;
+	int i = 0;
+
+	spin_lock(&nslistlock);
+	list_for_each_entry(ns, &nslisthead, list) {
+		for (i= 0; i < CONFIG_MAX_RC_SUBSYS; ++i)
+			if (ns->ctlr_data[i] != target->ctlr_data[i])
+				break;
+
+		if (i == CONFIG_MAX_RC_SUBSYS) {
+			/* Found a hit */
+			get_nsproxy(ns);
+			spin_unlock(&nslistlock);
+			return ns;
+		}
+	}
+
+	spin_unlock(&nslistlock);
+
+	ns = dup_namespaces(target);
+	return ns;
+}
+
+int __init namespaces_init(void)
+{
+	list_add(&init_nsproxy.list, &nslisthead);
+
+	return 0;
+}
+#endif
diff -puN /dev/null kernel/rcfs.c
--- /dev/null	2006-02-25 03:06:56.000000000 +0530
+++ linux-2.6.20-vatsa/kernel/rcfs.c	2007-03-01 16:53:24.000000000 +0530
@@ -0,0 +1,1138 @@
+/*
+ *  kernel/rcfs.c
+ *
+ *  Generic resource container system.
+ *
+ *  Based originally on the cpuset system, extracted by Paul Menage
+ *  Copyright (C) 2006 Google, Inc
+ *
+ *  Copyright notices from the original cpuset code:
+ *  --------------------------------------------------
+ *  Copyright (C) 2003 BULL SA.
+ *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
+ *
+ *  Portions derived from Patrick Mochel's sysfs code.
+ *  sysfs is Copyright (c) 2001-3 Patrick Mochel
+ *
+ *  2003-10-10 Written by Simon Derr.
+ *  2003-10-22 Updates by Stephen Hemminger.
+ *  2004 May-July Rework by Paul Jackson.
+ *  ---------------------------------------------------
+ *
+ *  This file is subject to the terms and condition...
 
 |  
	|  |  |  
	| 
		
			| [PATCH 2/2] cpu_accounting controller [message #17558 is a reply to message #17556] | Thu, 01 March 2007 13:50   |  
			| 
				
				
					|  Srivatsa Vaddagiri Messages: 241
 Registered: August 2006
 | Senior Member |  |  |  
	| This patch demonstrates how a resource controller can work with rcfs.
The controller counts the total CPU time used by all processes in a
resource container, during the time that they're members of the
container.
Written by Paul Menage. Adapted to work with rcfs by Srivatsa.
Signed-off-by : Paul Menage <menage@google.com>
Signed-off-by : Srivatsa Vaddagiri <vatsa@in.ibm.com>
diff -puN /dev/null include/linux/cpu_acct.h
--- /dev/null	2006-02-25 03:06:56.000000000 +0530
+++ linux-2.6.20-vatsa/include/linux/cpu_acct.h	2007-03-01 16:53:39.000000000 +0530
@@ -0,0 +1,14 @@
+
+#ifndef _LINUX_CPU_ACCT_H
+#define _LINUX_CPU_ACCT_H
+
+#include <linux/rcfs.h>
+#include <asm/cputime.h>
+
+#ifdef CONFIG_RC_CPUACCT
+extern void cpuacct_charge(struct task_struct *, cputime_t cputime);
+#else
+static void inline cpuacct_charge(struct task_struct *p, cputime_t cputime) {}
+#endif
+
+#endif
diff -puN init/Kconfig~cpu_acct init/Kconfig
--- linux-2.6.20/init/Kconfig~cpu_acct	2007-03-01 16:53:39.000000000 +0530
+++ linux-2.6.20-vatsa/init/Kconfig	2007-03-01 16:53:39.000000000 +0530
@@ -291,6 +291,13 @@ config SYSFS_DEPRECATED
 	  If you are using a distro that was released in 2006 or later,
 	  it should be safe to say N here.
 
+config RC_CPUACCT
+	bool "Simple CPU accounting container subsystem"
+	select RCFS
+	help
+	  Provides a simple Resource Controller for monitoring the
+	  total CPU consumed by the tasks in a container
+
 config RELAY
 	bool "Kernel->user space relay support (formerly relayfs)"
 	help
diff -puN /dev/null kernel/cpu_acct.c
--- /dev/null	2006-02-25 03:06:56.000000000 +0530
+++ linux-2.6.20-vatsa/kernel/cpu_acct.c	2007-03-01 16:53:39.000000000 +0530
@@ -0,0 +1,221 @@
+/*
+ * kernel/cpu_acct.c - CPU accounting container subsystem
+ *
+ * Copyright (C) Google Inc, 2006
+ *
+ * Developed by Paul Menage (menage@google.com) and Balbir Singh
+ * (balbir@in.ibm.com)
+ *
+ */
+
+/*
+ * Container subsystem for reporting total CPU usage of tasks in a
+ * container, along with percentage load over a time interval
+ */
+
+#include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <linux/rcfs.h>
+#include <linux/fs.h>
+#include <asm/div64.h>
+
+struct cpuacct {
+	spinlock_t lock;
+	/* total time used by this class */
+	cputime64_t time;
+
+	/* time when next load calculation occurs */
+	u64 next_interval_check;
+
+	/* time used in current period */
+	cputime64_t current_interval_time;
+
+	/* time used in last period */
+	cputime64_t last_interval_time;
+};
+
+static struct rc_subsys cpuacct_subsys;
+
+static inline struct cpuacct *nsproxy_ca(struct nsproxy *ns)
+{
+	if (!ns)
+		return NULL;
+
+	return ns->ctlr_data[cpuacct_subsys.subsys_id];
+}
+
+static inline struct cpuacct *task_ca(struct task_struct *task)
+{
+	return nsproxy_ca(task->nsproxy);
+}
+
+#define INTERVAL (HZ * 10)
+
+static inline u64 next_interval_boundary(u64 now) {
+	/* calculate the next interval boundary beyond the
+	 * current time */
+	do_div(now, INTERVAL);
+	return (now + 1) * INTERVAL;
+}
+
+static int cpuacct_create(struct rc_subsys *ss, struct nsproxy *ns,
+					struct nsproxy *parent)
+{
+	struct cpuacct *ca;
+
+	if (parent && (parent != &init_nsproxy))
+		return -EINVAL;
+
+	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+	if (!ca)
+		return -ENOMEM;
+	spin_lock_init(&ca->lock);
+	ca->next_interval_check = next_interval_boundary(get_jiffies_64());
+	ns->ctlr_data[cpuacct_subsys.subsys_id] = ca;
+	return 0;
+}
+
+static void cpuacct_destroy(struct rc_subsys *ss, struct nsproxy *ns)
+{
+	kfree(nsproxy_ca(ns));
+}
+
+/* Lazily update the load calculation if necessary. Called with ca locked */
+static void cpuusage_update(struct cpuacct *ca)
+{
+	u64 now = get_jiffies_64();
+	/* If we're not due for an update, return */
+	if (ca->next_interval_check > now)
+		return;
+
+	if (ca->next_interval_check <= (now - INTERVAL)) {
+		/* If it's been more than an interval since the last
+		 * check, then catch up - the last interval must have
+		 * been zero load */
+		ca->last_interval_time = 0;
+		ca->next_interval_check = next_interval_boundary(now);
+	} else {
+		/* If a steal takes the last interval time negative,
+		 * then we just ignore it */
+		if ((s64)ca->current_interval_time > 0) {
+			ca->last_interval_time = ca->current_interval_time;
+		} else {
+			ca->last_interval_time = 0;
+		}
+		ca->next_interval_check += INTERVAL;
+	}
+	ca->current_interval_time = 0;
+}
+
+static ssize_t cpuusage_read(struct nsproxy *ns,
+			     struct cftype *cft,
+			     struct file *file,
+			     char __user *buf,
+			     size_t nbytes, loff_t *ppos)
+{
+	struct cpuacct *ca = nsproxy_ca(ns);
+	u64 time;
+	char usagebuf[64];
+	char *s = usagebuf;
+
+	spin_lock_irq(&ca->lock);
+	cpuusage_update(ca);
+	time = cputime64_to_jiffies64(ca->time);
+	spin_unlock_irq(&ca->lock);
+
+	/* Convert 64-bit jiffies to seconds */
+	time *= 1000;
+	do_div(time, HZ);
+	s += sprintf(s, "%llu", (unsigned long long) time);
+
+	return simple_read_from_buffer(buf, nbytes, ppos, usagebuf, s - usagebuf);
+}
+
+static ssize_t load_read(struct nsproxy *ns,
+			 struct cftype *cft,
+			 struct file *file,
+			 char __user *buf,
+			 size_t nbytes, loff_t *ppos)
+{
+	struct cpuacct *ca = nsproxy_ca(ns);
+	u64 time;
+	char usagebuf[64];
+	char *s = usagebuf;
+
+	/* Find the time used in the previous interval */
+	spin_lock_irq(&ca->lock);
+	cpuusage_update(ca);
+	time = cputime64_to_jiffies64(ca->last_interval_time);
+	spin_unlock_irq(&ca->lock);
+
+	/* Convert time to a percentage, to give the load in the
+	 * previous period */
+	time *= 100;
+	do_div(time, INTERVAL);
+
+	s += sprintf(s, "%llu", (unsigned long long) time);
+
+	return simple_read_from_buffer(buf, nbytes, ppos, usagebuf, s - usagebuf);
+}
+
+static struct cftype cft_usage = {
+	.name = "cpuacct.usage",
+	.read = cpuusage_read,
+};
+
+static struct cftype cft_load = {
+	.name = "cpuacct.load",
+	.read = load_read,
+};
+
+static int cpuacct_populate(struct rc_subsys *ss,
+			    struct dentry *d)
+{
+	int err;
+
+	if ((err = rcfs_add_file(d, &cft_usage)))
+		return err;
+	if ((err = rcfs_add_file(d, &cft_load)))
+		return err;
+
+	return 0;
+}
+
+
+void cpuacct_charge(struct task_struct *task, cputime_t cputime)
+{
+
+	struct cpuacct *ca;
+	unsigned long flags;
+
+	if (!cpuacct_subsys.active)
+		return;
+	rcu_read_lock();
+	ca = task_ca(task);
+	if (ca) {
+		spin_lock_irqsave(&ca->lock, flags);
+		cpuusage_update(ca);
+		ca->time = cputime64_add(ca->time, cputime);
+		ca->current_interval_time =
+			cputime64_add(ca->current_interval_time, cputime);
+		spin_unlock_irqrestore(&ca->lock, flags);
+	}
+	rcu_read_unlock();
+}
+
+static struct rc_subsys cpuacct_subsys = {
+	.name = "cpuacct",
+	.create = cpuacct_create,
+	.destroy = cpuacct_destroy,
+	.populate = cpuacct_populate,
+	.subsys_id = -1,
+};
+
+
+int __init init_cpuacct(void)
+{
+	int id = rc_register_subsys(&cpuacct_subsys);
+	return id < 0 ? id : 0;
+}
+
+module_init(init_cpuacct)
diff -puN kernel/Makefile~cpu_acct kernel/Makefile
--- linux-2.6.20/kernel/Makefile~cpu_acct	2007-03-01 16:53:39.000000000 +0530
+++ linux-2.6.20-vatsa/kernel/Makefile	2007-03-01 16:53:39.000000000 +0530
@@ -36,6 +36,7 @@ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_RC_CPUACCT) += cpu_acct.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
diff -puN kernel/sched.c~cpu_acct kernel/sched.c
--- linux-2.6.20/kernel/sched.c~cpu_acct	2007-03-01 16:53:39.000000000 +0530
+++ linux-2.6.20-vatsa/kernel/sched.c	2007-03-01 16:53:39.000000000 +0530
@@ -52,6 +52,7 @@
 #include <linux/tsacct_kern.h>
 #include <linux/kprobes.h>
 #include <linux/delayacct.h>
+#include <linux/cpu_acct.h>
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
@@ -3066,9 +3067,13 @@ void account_user_time(struct task_struc
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 	cputime64_t tmp;
+	struct rq *rq = this_rq();
 
 	p->utime = cputime_add(p->utime, cputime);
 
+	if (p != rq->idle)
+		cpuacct_charge(p, cputime);
+
 	/* Add user time to cpustat. */
 	tmp = cputime_to_cputime64(cputime);
 	if (TASK_NICE(p) > 0)
@@ -3098,9 +3103,10 @@ void account_system_time(struct task_str
 		cpustat->irq = cputime64_add(cpustat->irq, tmp);
 	else if (softirq_count())
 		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
-	else if (p != rq->idle)
+	else if (p != rq->idle) {
 		cpustat->system = cputime64_add(cpustat->system, tmp);
-	else if (atomic_read(&rq->nr_iowait) > 0)
+		cpuacct_charge(p, cputime);
+	} else if (atomic_read(&rq->nr_iowait) > 0)
 		cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
 	else
 		cpustat->idle = cputime64_add(cpustat->idle, tmp);
@@ -3125,8 +3131,10 @@ void account_steal_time(struct task_stru
 			cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
 		else
 			cpustat->idle = cputime64_add(cpustat->idle, tmp);
-	} else
+	} else {
 		cpustat->steal = cputime64_add(cpustat->steal, tmp);
+		cpuacct_charge(p, -tmp);
+	}
 }
 
 static void task_running_tick(struct rq *rq, struct task_struct *p)
_
-- 
Regards,
vatsa
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers |  
	|  |  |  
	|  |  
	|  |  
	|  |  
	| 
		
			| Re: [PATCH 0/2] resource control file system - aka containers on top of nsproxy! [message #17566 is a reply to message #17556] | Sat, 03 March 2007 10:21   |  
			| 
				
				
					|  Paul Jackson Messages: 157
 Registered: February 2006
 | Senior Member |  |  |  
	| > Regarding semantics, can you be more specific?
Unfortunately not - sorry.
I've been off in other areas, and not found the time
to read through this current PATCH or think about it
carefully enough to be really useful.
Your reply seemed reasonable enough.
> It should have the same perf overhead as the original container patches
> (basically a double dereference - task->containers/nsproxy->cpuset -
> required to get to the cpuset from a task). 
There is just one spot that this might matter to cpusets.
Except for one hook, cpusets uses the mems_allowed and cpus_allowed
masks in the task struct to avoid having to look at the cpuset on hot
code paths.
There is one RCU guarded reference per memory allocation to
	current->cpuset->mems_generation
in the call to cpuset_update_task_memory_state(), for tasks that are in
some cpuset -other- than the default top cpuset, on systems that have
explicitly created additional (other than the top cpuset) cpusets after
boot.
If that RCU guarded reference turned into taking a global lock, or
pulling in a cache line that was frequently off dirty in some other
node, that would be unfortunate.
But that's the key hook so far as cpuset performance impact is
concerned.
Perhaps you could summarize what becomes of this hook, in this
brave new world of rcfs ...
-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers |  
	|  |  |  
	| 
		
			| Re: [PATCH 0/2] resource control file system - aka containers on top of nsproxy! [message #17567 is a reply to message #17556] | Sat, 03 March 2007 21:22   |  
			| 
				
				
					|  Paul Jackson Messages: 157
 Registered: February 2006
 | Senior Member |  |  |  
	| Herbert wrote:
> I agree here, there is not much difference for the
> following aspects:
Whether two somewhat similar needs should be met by one shared
mechanism, or two distinct mechanisms, cannot really be decided by
listing the similarities.
One has to determine if there are any significant differences in
needs that are too difficult for a shared mechanism to provide.
A couple of things you wrote in your second message might
touch on such possible significant differences:
 - resources must be hierarchically suballocated, and
 - key resource management code hooks can't cause hot cache lines.
In a later message, Herbert wrote:
> well, the thing is, as nsproxy is working now, you
> will get a new one (with a changed subset of entries)
> every time a task does a clone() with one of the 
> space flags set, which means, that you will end up
> with quite a lot of them, but resource limits have
> to address a group of them, not a single nsproxy
> (or act in a deeply hierarchical way which is not
> there atm, and probably will never be, as it simply
> adds too much overhead)
I still can't claim to have my head around this, but what you write
here, Herbert, writes here touches on what I suspect is a key
difference between namespaces and resources that would make it
impractical to accomplish both with a shared mechanism for aggregating
tasks.
It is a natural and desirable capability when managing resources, which
are relatively scarce (that's why they're worth all this trouble)
commodities of which we have some limited amount, to subdivide
allowances of them.  Some group of tasks gets the right to use certain
memory pages or cpu time slices, and in turn suballocates that allotment
to some subgroup of itself.  This naturally leads to a hierarchy of
allocated resources.
There is no such necessary, hierarchy for name spaces. One name space
might be derived from another at setup, by some arbitrary conventions,
but once initialized, this way or that, they are separate name spaces,
or at least naturally can (must?) be separate.
The cpuset hierarchy is an important part of the API that cpusets
presents to user space, where that hierarchy reflects the suballocation
of resources.  If B is a child of A in the cpuset hierarchy, then
the CPUs and Memory Nodes allowed to B -must- be a subset of those allowed
to A.  That is the key semantic of the cpuset hierarchy.  This includes
forcing the removal of a resource from B if for some reason it must
be removed from A, in order to preserve the hierarchical suballocation,
which requirement is causing a fair bit of hard work for the cpu hot
unplug folks.
I am quite willing to believe that name spaces has no need for such a
hierarchy, and further that it probably never will have such ... "too
much overhead" as you say.
> > It should have the same perf overhead as the original
> > container patches (basically a double dereference -
> > task->containers/nsproxy->cpuset - required to get to the 
> > cpuset from a task).
> 
> on every limit accounting or check? I think that
> is quite a lot of overhead ...
Do either of these dereferences require locks?
The two critical resources that cpusets manages, memory pages and
time slices on a cpu, cannot afford such dereferences or locking
in the key code paths (allocating a page or scheduling a cpu.)  The
existing cpuset code is down to one RCU guarded dereference of
current->cpuset in the page allocation code path (and then only on
systems actively using cpusets), and no such dereferences at all in the
scheduler path.
It took a fair bit of hard work (for someone of my modest abilities)
to get that far; I doubt we can accept much regression on this point.
Most likely the other folks doing resource management will have similar
concerns in many cases - memory pages and cpu slices are not the only
resources we're trying to manage on critical code paths.
In short - the issues seem to be:
 - resources need to be hierarchical, name spaces don't (can't?), and
 - no hot cache lines allowed by the resource hooks in key code paths.
-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers |  
	|  |  |  
	|  |  
	|  |  
	| 
		
			| Re: [PATCH 0/2] resource control file system - aka containers on top of nsproxy! [message #17578 is a reply to message #17567] | Mon, 05 March 2007 17:47   |  
			| 
				
				
					|  Srivatsa Vaddagiri Messages: 241
 Registered: August 2006
 | Senior Member |  |  |  
	| On Sat, Mar 03, 2007 at 01:22:44PM -0800, Paul Jackson wrote:
> I still can't claim to have my head around this, but what you write
> here, Herbert, writes here touches on what I suspect is a key
> difference between namespaces and resources that would make it
> impractical to accomplish both with a shared mechanism for aggregating
> tasks.
The way nsproxy is structured, its all pointers to actual namespace (or
in case of rcfs patch) resource objects.  This lets namespaces objects be in a 
flat hierarchy while resource objects are in tree-like hierarchy.
nsproxy itself doesnt decide any hierarchy. Its those objects pointed to
by nsproxy which can form different hierarchies. In fact the rcfs
patches allows such a combination afaics.
> > on every limit accounting or check? I think that
> > is quite a lot of overhead ...
> 
> Do either of these dereferences require locks?
A rcu_read_lock() should be required, which is not that expensive.
-- 
Regards,
vatsa
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers |  
	|  |  |  
	| 
		
			| Re: [PATCH 0/2] resource control file system - aka containers on top of nsproxy! [message #17581 is a reply to message #17556] | Wed, 07 March 2007 02:32   |  
			| 
				
				
					|  Paul Menage Messages: 642
 Registered: September 2006
 | Senior Member |  |  |  
	| Hi Vatsa,
Sorry for the delayed reply - the last week has been very busy ...
On 3/1/07, Srivatsa Vaddagiri <vatsa@in.ibm.com> wrote:
> Paul,
>         Based on some of the feedback to container patches, I have
> respun them to avoid the "container" structure abstraction and instead use
> nsproxy structure in the kernel. User interface (which I felt was neat
> in your patches) has been retained to be same.
I'm not really sure that I see the value of having this be part of
nsproxy rather than the previous independent container (and
container_group) structure. As far as I can see, you're putting the
container subsystem state pointers and the various task namespace
pointers into the same structure (nsproxy) but then they're remaining
pretty much independent in terms of code.
The impression that I'm getting (correct me if I'm wrong) is:
- when you do a mkdir within an rcfs directory, the nsproxy associated
with the parent is duplicated, and then each rcfs subsystem gets to
set a subsystem-state pointer in that nsproxy
- when you move a task into an rcfs container, you create a new
nsproxy consisting of the task's old namespaces and its new subsystem
pointers. Then you look through the current list of nsproxy objects to
see if you find one that matches. If you do, you reuse it, else you
create a new nsproxy and link it into the list
- when you do sys_unshare() or a clone that creates new namespaces,
then the task (or its child) will get a new nsproxy that has the rcfs
subsystem state associated with the old nsproxy, and one or more
namespace pointers cloned to point to new namespaces. So this means
that the nsproxy for the task is no longer the nsproxy associated with
any directory in rcfs. (So the task will disappear from any "tasks"
file in rcfs?)
You seem to have lost some features, including fork/exit subsystem callbacks
>
> What follows is the core (big) patch and the cpu_acct subsystem to serve
> as an example of how to use it. I suspect we can make cpusets also work
> on top of this very easily.
I'd like to see that. I suspect it will be a bit more fiddly than the
simple cpu_acct subsystem.
Paul
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers |  
	|  |  |  
	|  |  
	|  |  
	| 
		
			| Re: [PATCH 0/2] resource control file system - aka containers on top of nsproxy! [message #17586 is a reply to message #17556] | Wed, 07 March 2007 17:43   |  
			| 
				
				
					|  serue Messages: 750
 Registered: February 2006
 | Senior Member |  |  |  
	| Quoting Srivatsa Vaddagiri (vatsa@in.ibm.com):
> On Tue, Mar 06, 2007 at 06:32:07PM -0800, Paul Menage wrote:
> > I'm not really sure that I see the value of having this be part of
> > nsproxy rather than the previous independent container (and
> > container_group) structure. 
> 
> *shrug*
> 
> I wrote the patch mainly to see whether the stuff container folks (Sam Vilain
> et al) were complaining abt (that container structure abstraction
> inside the kernel is redundant/unnecessary) made sense or not. 
I still think the complaint was about terminology, not implementation.
They just didn't want you calling them containers.
> The rcfs patches demonstrate that it is possible to implement resource control
> on top of just nsproxy -and- give the same interface that you now
> have. In essense, I would say that the rcfs patches are about 70% same as your 
> original V7 container patches.
> 
> However as I am converting over cpusets to work on top of nsproxy, I
> have learnt few things:
> 
> container structure in your patches provides for these things:
> 
> a.  A way to group tasks
> b.  A way to maintain several hierarchies of such groups
> 
> If you consider just a. then I agree that container abstraction is
> redundant, esp for vserver resource control (nsproxy can already be used
> to group tasks).
> 
> What nsproxy doesn't provide is b - a way to represent hierarchies of
> groups. 
> 
> So we got several choices here.
> 
> 1. Introduce the container abstraction as is in your patches
> 2. Extend nsproxy somehow to represent hierarchies
> 3. Let individual resource controllers that -actually- support
>    hierarchical resource management maintain hierarchy in their code.
> 
> In the last option, nsproxy still is unaware of any hierarchy. Some of
> the resource objects it points to (for ex: cpuset) may maintain a
> hierarchy. For ex: nsproxy->ctlr_data[cpuset_subsys.subsys_id] points to
> a 'struct cpuset' structure which could maintains the hierarchical
> relationship among cpuset objects.
> 
> If we consider that most resource controllers may not implement hierarchical 
> resource management, then 3 may not be a bad compromise. OTOH if we
> expect *most* resource controllers to support hierarchical resource
> management, then we could be better of with option 1.
> 
> Anyway, summarizing on "why nsproxy", the main point (I think) is about
> using existing abstraction in the kernel.
But nsproxy is not an abstraction, it's an implementation
detail/optimization.  I'm mostly being quiet because i don't
particularly care if it gets expanded upon, but it's nothing more than
that right now.
> > As far as I can see, you're putting the
> > container subsystem state pointers and the various task namespace
> > pointers into the same structure (nsproxy) but then they're remaining
> > pretty much independent in terms of code.
> > 
> > The impression that I'm getting (correct me if I'm wrong) is:
> > 
> > - when you do a mkdir within an rcfs directory, the nsproxy associated
> > with the parent is duplicated, and then each rcfs subsystem gets to
> > set a subsystem-state pointer in that nsproxy
> 
> yes.
> 
> > - when you move a task into an rcfs container, you create a new
> > nsproxy consisting of the task's old namespaces and its new subsystem
> > pointers. Then you look through the current list of nsproxy objects to
> > see if you find one that matches. If you do, you reuse it, else you
> > create a new nsproxy and link it into the list
> 
> yes
> 
> > - when you do sys_unshare() or a clone that creates new namespaces,
> > then the task (or its child) will get a new nsproxy that has the rcfs
> > subsystem state associated with the old nsproxy, and one or more
> > namespace pointers cloned to point to new namespaces. So this means
> > that the nsproxy for the task is no longer the nsproxy associated with
> > any directory in rcfs. (So the task will disappear from any "tasks"
> > file in rcfs?)
> 
> it "should" disappear yes, although I haven't carefully studied the
> unshare requirements yet.
> 
> > You seem to have lost some features, including fork/exit subsystem callbacks
> 
> That was mainly to keep it simple for a proof-of-concept patch! We can add it 
> back later.
> 
> > >What follows is the core (big) patch and the cpu_acct subsystem to serve
> > >as an example of how to use it. I suspect we can make cpusets also work
> > >on top of this very easily.
> > 
> > I'd like to see that. I suspect it will be a bit more fiddly than the
> > simple cpu_acct subsystem.
> 
> I am almost done with the conversion. And yes cpuset is a beast to
> convert over! Will test and send the patches out tomorrow.
> 
> -- 
> Regards,
> vatsa
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers |  
	|  |  |  
	|  |  
	|  |  
	| 
		
			| Re: [PATCH 0/2] resource control file system - aka containers on top of nsproxy! [message #17589 is a reply to message #17556] | Wed, 07 March 2007 20:58   |  
			| 
				
				
					|  serue Messages: 750
 Registered: February 2006
 | Senior Member |  |  |  
	| Quoting Srivatsa Vaddagiri (vatsa@in.ibm.com):
> On Wed, Mar 07, 2007 at 11:43:46AM -0600, Serge E. Hallyn wrote:
> > I still think the complaint was about terminology, not implementation.
> 
> I don't think that is what http://lkml.org/lkml/2007/2/12/426 conveyed!
I don't have that in my inbox anymore so won't reply to it itself
unfortunately, but what it conveyed is also not that nsproxy should be
the 'resource control' object.  If anything it seems to argue that all of
Paul's patchset should be done in userspace.
Sam writes
> That's a great idea for a set of
> tightly integrated userland utilities to simplify the presentation to
> the admin, but I don't see why you need to enshrine this in the kernel. 
> Certainly not for any of the other patches in your set as far as I can
> see.
I disagree.
Sam, there are two very separate concepts here.  Actually three.
What you had originally presented in a patchset was resource
virtualization:  so when process A asks for some resource X, rather than
get resource_table[X] he gets resource_table[hash(x)].  The concensus
you mention at the start, and which you say Eric was arguing for, was to
not do such translation, but just get rid of the global
'resource_table'.  By allowing processes to have and manipulate their
own private resource_table, you implement namespaces.
And as I've said, the nsproxy is just an implementation detail to keep
namespace pointers out of the task struct.  Using nsproxy or not has
nothing to do with the namespace approach versus global resource tables
with id translation at all userspace->kernel boundaries.
So virtualization via explicit translation through global resource
tables is one concept, and private namespaces could be said to be a
second.  The third is resource controls, which Paul's container patchset
implements.  It has nothing to do with the previous two, and it is what
Paul's patches are addressing.
Sam asks:
> Ask yourself this - what do you need the container structure for so
> badly, that virtualising the individual resources does not provide for? 
To keep track of a process' place in any of several hierarchies, where
each node in a tree inherit default values from the parent and can
customize in some way.
> You don't need it to copy the namespaces of another process ("enter")
> and you don't need it for checkpoint/migration.
Because these have nothing to do with resource controls.
> What does it mean to make a new container? 
It means to create a new set of limits, probably inheriting defaults
from a parent set of limits, customizable but probably within some
limits imposed by the parent.  For instance, if there is a cpuset
container which limits its tasks to cpus 7 and 8, then when a new
container is created as a child of that one, it can be further
restricted, but can never have more than cpus 7 and 8.
> That's a great idea for a set of
> tightly integrated userland utilities to simplify the presentation to
> the admin, but I don't see why you need to enshrine this in the kernel.
If you want to argue that resource controls should be done in userspace
i *suspect* you'll find that approach insufficient but am interested to
see attempts to do so.
But just moving the container structure into nsproxy is just that
- moving the container structure.  Since Sam argues vehemently that
there should be no such thing, I don't see how he can be seen as wanting
to move it.
All that being said, if it were going to save space without overly
complicating things I'm actually not opposed to using nsproxy, but it
looks to me like it does complicate things.  One reason for this is that
through the nsproxy subsystem we are mixing pointers to data (the
namespaces) and pointers to pointers to the same data (nsproxy subsystem
containers pointing to nsproxies) in the same structure.  Yuck.
-serge
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers |  
	|  |  |  
	|  |  
	|  |  
	|  |  
	|  |  
	|  |  
	| 
		
			| Re: [PATCH 0/2] resource control file system - aka containers on top of nsproxy! [message #17597 is a reply to message #17556] | Thu, 08 March 2007 00:42   |  
			| 
				
				
					|  Paul Menage Messages: 642
 Registered: September 2006
 | Senior Member |  |  |  
	| On 3/7/07, Sam Vilain <sam@vilain.net> wrote:
> Paul Menage wrote:
> >> In the namespace world when we say container we mean roughly at the level
> >> of nsproxy and container_group.
> >>
> > So you're saying that a task can only be in a single system-wide container.
> >
>
> Nope, we didn't make the mistake of nailing down what a "container" was
> too far before it is implemented.  We talked before about
> containers-within-containers because, inevitably if you provide a
> feature you'll end up having to deal with virtualising systems that in
> turn use that feature.
Sure, my aproach allows containers hierarchically as children of other
containers too.
>
> > My patch provides multiple potentially-independent ways of dividing up
> > the tasks on the system - if the "container" is the set of all
> > divisions that the process is in, what's an appropriate term for the
> > sub-units?
> >
>
> namespace, since 2.4.x
>
> > That assumes the viewpoint that your terminology is "correct" and
> > other people's needs "fixing". :-)
> >
>
> Absolutely.  Please respect the semantics established so far; changing
> them adds nothing at the cost of much confusion.
But "namespace" has well-established historical semantics too - a way
of changing the mappings of local names to global objects. This
doesn't describe things liek resource controllers, cpusets, resource
monitoring, etc.
Trying to extend the well-known term namespace to refer to things that
aren't namespaces isn't a useful approach, IMO.
Paul
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers |  
	|  |  |  
	| 
		
			| Re: [PATCH 1/2] rcfs core patch [message #17603 is a reply to message #17557] | Thu, 08 March 2007 03:12   |  
			| 
				
				
					|  ebiederm Messages: 1354
 Registered: February 2006
 | Senior Member |  |  |  
	| Srivatsa Vaddagiri <vatsa@in.ibm.com> writes:
> Heavily based on Paul Menage's (inturn cpuset) work. The big difference
> is that the patch uses task->nsproxy to group tasks for resource control
> purpose (instead of task->containers).
>
> The patch retains the same user interface as Paul Menage's patches. In
> particular, you can have multiple hierarchies, each hierarchy giving a 
> different composition/view of task-groups.
>
> (Ideally this patch should have been split into 2 or 3 sub-patches, but
> will do that on a subsequent version post)
After looking at the discussion that happened immediately after this was
posted this feels like the right general direction to get the different
parties talking to each other.  I'm not convinced about the whole idea
yet but this looks like a step in a useful direction.
I have a big request.
Please next time this kind of patch is posted add a description of
what is happening and why.  I have yet to see people explain why
this is a good idea.  Why the current semantics were chosen.
The review is still largely happening at the why level but no
one is addressing that yet.  So please can we have a why.
I have a question?  What does rcfs look like if we start with
the code that is in the kernel?  That is start with namespaces
and nsproxy and just build a filesystem to display/manipulate them?
With the code built so it will support adding resource controllers
when they are ready?
> Signed-off-by : Srivatsa Vaddagiri <vatsa@in.ibm.com>
> Signed-off-by : Paul Menage <menage@google.com>
>
>
> ---
>
>  linux-2.6.20-vatsa/include/linux/init_task.h |    4 
>  linux-2.6.20-vatsa/include/linux/nsproxy.h   |    5 
>  linux-2.6.20-vatsa/init/Kconfig              |   22 
>  linux-2.6.20-vatsa/init/main.c               |    1 
>  linux-2.6.20-vatsa/kernel/Makefile           |    1 
>
>
> ---
>
> diff -puN include/linux/nsproxy.h~rcfs include/linux/nsproxy.h
> --- linux-2.6.20/include/linux/nsproxy.h~rcfs 2007-03-01 14:20:47.000000000
> +0530
> +++ linux-2.6.20-vatsa/include/linux/nsproxy.h 2007-03-01 14:20:47.000000000
> +0530
> @@ -28,6 +28,10 @@ struct nsproxy {
We probably want to rename this struct task_proxy....
And then we can rename most of the users things like:
dup_task_proxy, clone_task_proxy, get_task_proxy, free_task_proxy,
put_task_proxy, exit_task_proxy, init_task_proxy....
>  	struct ipc_namespace *ipc_ns;
>  	struct mnt_namespace *mnt_ns;
>  	struct pid_namespace *pid_ns;
> +#ifdef CONFIG_RCFS
> +	struct list_head list;
This extra list of nsproxy's is unneeded and a performance problem the
way it is used.  In general we want to talk about the individual resource
controllers not the nsproxy.
> +	void *ctlr_data[CONFIG_MAX_RC_SUBSYS];
I still don't understand why these pointers are so abstract,
and why we need an array lookup into them?
> +#endif
>  };
>  extern struct nsproxy init_nsproxy;
>  
> @@ -35,6 +39,12 @@ struct nsproxy *dup_namespaces(struct ns
>  int copy_namespaces(int flags, struct task_struct *tsk);
>  void get_task_namespaces(struct task_struct *tsk);
>  void free_nsproxy(struct nsproxy *ns);
> +#ifdef CONFIG_RCFS
> +struct nsproxy *find_nsproxy(struct nsproxy *ns);
> +int namespaces_init(void);
> +#else
> +static inline int namespaces_init(void) { return 0;}
> +#endif
>  
>  static inline void put_nsproxy(struct nsproxy *ns)
>  {
> diff -puN /dev/null include/linux/rcfs.h
> --- /dev/null	2006-02-25 03:06:56.000000000 +0530
> +++ linux-2.6.20-vatsa/include/linux/rcfs.h 2007-03-01 14:20:47.000000000 +0530
> @@ -0,0 +1,72 @@
> +#ifndef _LINUX_RCFS_H
> +#define _LINUX_RCFS_H
> +
> +#ifdef CONFIG_RCFS
> +
> +/* struct cftype:
> + *
> + * The files in the container filesystem mostly have a very simple read/write
> + * handling, some common function will take care of it. Nevertheless some cases
> + * (read tasks) are special and therefore I define this structure for every
> + * kind of file.
I'm still inclined to think this should be part of /proc, instead of a purely
separate fs.  But I might be missing something.
Eric
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers |  
	|  |  |  
	| 
		
			| Re: [PATCH 0/2] resource control file system - aka containers on top of nsproxy! [message #17605 is a reply to message #17581] | Wed, 07 March 2007 17:30   |  
			| 
				
				
					|  Srivatsa Vaddagiri Messages: 241
 Registered: August 2006
 | Senior Member |  |  |  
	| On Tue, Mar 06, 2007 at 06:32:07PM -0800, Paul Menage wrote:
> I'm not really sure that I see the value of having this be part of
> nsproxy rather than the previous independent container (and
> container_group) structure. 
*shrug*
I wrote the patch mainly to see whether the stuff container folks (Sam Vilain
et al) were complaining abt (that container structure abstraction
inside the kernel is redundant/unnecessary) made sense or not. 
The rcfs patches demonstrate that it is possible to implement resource control
on top of just nsproxy -and- give the same interface that you now
have. In essense, I would say that the rcfs patches are about 70% same as your 
original V7 container patches.
However as I am converting over cpusets to work on top of nsproxy, I
have learnt few things:
container structure in your patches provides for these things:
a.  A way to group tasks
b.  A way to maintain several hierarchies of such groups
If you consider just a. then I agree that container abstraction is
redundant, esp for vserver resource control (nsproxy can already be used
to group tasks).
What nsproxy doesn't provide is b - a way to represent hierarchies of
groups. 
So we got several choices here.
1. Introduce the container abstraction as is in your patches
2. Extend nsproxy somehow to represent hierarchies
3. Let individual resource controllers that -actually- support
   hierarchical resource management maintain hierarchy in their code.
In the last option, nsproxy still is unaware of any hierarchy. Some of
the resource objects it points to (for ex: cpuset) may maintain a
hierarchy. For ex: nsproxy->ctlr_data[cpuset_subsys.subsys_id] points to
a 'struct cpuset' structure which could maintains the hierarchical
relationship among cpuset objects.
If we consider that most resource controllers may not implement hierarchical 
resource management, then 3 may not be a bad compromise. OTOH if we
expect *most* resource controllers to support hierarchical resource
management, then we could be better of with option 1.
Anyway, summarizing on "why nsproxy", the main point (I think) is about
using existing abstraction in the kernel.
> As far as I can see, you're putting the
> container subsystem state pointers and the various task namespace
> pointers into the same structure (nsproxy) but then they're remaining
> pretty much independent in terms of code.
> 
> The impression that I'm getting (correct me if I'm wrong) is:
> 
> - when you do a mkdir within an rcfs directory, the nsproxy associated
> with the parent is duplicated, and then each rcfs subsystem gets to
> set a subsystem-state pointer in that nsproxy
yes.
> - when you move a task into an rcfs container, you create a new
> nsproxy consisting of the task's old namespaces and its new subsystem
> pointers. Then you look through the current list of nsproxy objects to
> see if you find one that matches. If you do, you reuse it, else you
> create a new nsproxy and link it into the list
yes
> - when you do sys_unshare() or a clone that creates new namespaces,
> then the task (or its child) will get a new nsproxy that has the rcfs
> subsystem state associated with the old nsproxy, and one or more
> namespace pointers cloned to point to new namespaces. So this means
> that the nsproxy for the task is no longer the nsproxy associated with
> any directory in rcfs. (So the task will disappear from any "tasks"
> file in rcfs?)
it "should" disappear yes, although I haven't carefully studied the
unshare requirements yet.
> You seem to have lost some features, including fork/exit subsystem callbacks
That was mainly to keep it simple for a proof-of-concept patch! We can add it 
back later.
> >What follows is the core (big) patch and the cpu_acct subsystem to serve
> >as an example of how to use it. I suspect we can make cpusets also work
> >on top of this very easily.
> 
> I'd like to see that. I suspect it will be a bit more fiddly than the
> simple cpu_acct subsystem.
I am almost done with the conversion. And yes cpuset is a beast to
convert over! Will test and send the patches out tomorrow.
-- 
Regards,
vatsa
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers |  
	|  |  |  
	|  |  
	| 
		
			| Re: [ckrm-tech] [PATCH 0/2] resource control file system - aka containers on top of nsproxy! [message #17607 is a reply to message #17584] | Wed, 07 March 2007 17:52   |  
			| 
				
				
					|  Srivatsa Vaddagiri Messages: 241
 Registered: August 2006
 | Senior Member |  |  |  
	| On Wed, Mar 07, 2007 at 09:29:12AM -0800, Paul Menage wrote:
> That seems bad. With the current way you're doing it, if I mount
> hierarchies A and B on /mnt/A and /mnt/B, then initially all tasks are
> in /mnt/A/tasks and /mnt/B/tasks. If I then create /mnt/A/foo and move
> a process into it, that process disappears from /mnt/B/tasks, since
> its nsproxy no longer matches the nsproxy of B's root container. Or am
> I missing something?
I realized that bug as I was doing cpuset conversion.
Basically, we can't use just tsk->nsproxy to find what tasks are in
a directory (/mnt/B for ex). Here's what I was think we should be doing instead:
struct nsproxy *ns;
void *data;
ns = dentry_of(/mnt/B/tasks)->d_parent->d_fsdata;
data = ns->ctlr_data[some subsystem id which is bound in /mnt/B hierarchy]
we now scan tasklist and find a match if:
	tsk->nsproxy->ctlr_data[the above id] == data
(maybe we need to match on all data from all subsystems bound to B)
There is a similar bug in rcfs_rmdir also. We can't just use the nsproxy
pointed to by dentry to know whether the resource objects are free or
not. I am thinking (if at all resource control has to be provided on top
of nsproxy) that we should have a get_res_ns, similar to get_mnt_ns or
get_uts_ns, which will track number of nsproxies pointing to the same
resource object. If we do that, then rmdir() needs to go and check
those resource object's refcounts to see if a dir is in use or not.
-- 
Regards,
vatsa
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers |  
	|  |  |  
	|  |  
	| 
		
			| Re: [PATCH 1/2] rcfs core patch [message #17611 is a reply to message #17603] | Thu, 08 March 2007 09:10   |  
			| 
				
				
					|  Paul Menage Messages: 642
 Registered: September 2006
 | Senior Member |  |  |  
	| On 3/7/07, Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> Please next time this kind of patch is posted add a description of
> what is happening and why.  I have yet to see people explain why
> this is a good idea.  Why the current semantics were chosen.
OK. I thought that the descriptions in my last patch 0/7 and
Documentation/containers.txt gave a reasonable amount of "why", but I
can look at adding more details.
>
> I have a question?  What does rcfs look like if we start with
> the code that is in the kernel?  That is start with namespaces
> and nsproxy and just build a filesystem to display/manipulate them?
> With the code built so it will support adding resource controllers
> when they are ready?
There's at least one resource controller that's already in the kernel - cpusets.
> We probably want to rename this struct task_proxy....
> And then we can rename most of the users things like:
> dup_task_proxy, clone_task_proxy, get_task_proxy, free_task_proxy,
> put_task_proxy, exit_task_proxy, init_task_proxy....
That could be a good start.
>
> This extra list of nsproxy's is unneeded and a performance problem the
> way it is used.  In general we want to talk about the individual resource
> controllers not the nsproxy.
There's one important reason why it's needed, and highlights one of
the ways that "resource controllers" are different from the way that
"namespaces" have currently been used.
Currently with a namespace, you can only unshare, either by
sys_unshare() or clone() - you can't "reshare" a namespace with some
other task. But resource controllers tend to have the concept a lot
more of being able to move between resource classes. If you're going
to have an ns_proxy/container_group object that gathers together a
group of pointers to namespaces/subsystem-states, then either:
1) you only allow a task to reshare *all* namespaces/subsystems with
another task, i.e. you can update current->task_proxy to point to
other->task_proxy. But that restricts flexibility of movement. It
would be impossible to have a process that could enter, say, an
existing process' network namespace without also entering its
pid/ipc/uts namespaces and all of its resource limits.
2) you allow a task to selectively reshare namespaces/subsystems with
another task, i.e. you can update current->task_proxy to point to a
proxy that matches your existing task_proxy in some ways and the
task_proxy of your destination in others. In that case a trivial
implementation would be to allocate a new task_proxy and copy some
pointers from the old task_proxy and some from the new. But then
whenever a task moves between different groupings it acquires a new
unique task_proxy. So moving a bunch of tasks between two groupings,
they'd all end up with unique task_proxy objects with identical
contents.
So it would be much more space efficient to be able to locate an
existing task_proxy with an identical set of namespace/subsystem
pointers in that event. The linked list approach that I put in my last
containers patch was a simple way to do that, and Vatsa's reused it
for his patches. My intention is to replace it with a more efficient
lookup (maybe using a hash of the desired pointers?) in a future
patch.
>
> > +     void *ctlr_data[CONFIG_MAX_RC_SUBSYS];
>
> I still don't understand why these pointers are so abstract,
> and why we need an array lookup into them?
>
For the same reason that we have:
- generic notifier chains rather than having a big pile of #ifdef'd
calls to the various notification sites
- linker sections to define initcalls and per-cpu variables, rather
than hard-coding all init calls into init/main.c and having a big
per-cpu structure (both of which would again be full of #ifdefs)
It makes the code much more readable, and makes patches much simpler
and less likely to stomp on one another.
OK, so my current approaches have involved an approach like notifier
chains, i.e. have a generic list/array, and do something to all the
objects on that array.
How about a radically different approach based around the
initcall/percpu way (linker sections)? Something like:
- each namespace or subsystem defines itself in its own code, via a
macro such as:
struct task_subsys {
  const char *name;
  ...
};
#define DECLARE_TASKGROUP_SUBSYSTEM(ss) \
    __attribute__((__section__(".data.tasksubsys"))) struct
task_subsys *ss##_ptr = &ss
It would be used like:
struct taskgroup_subsys uts_ns = {
  .name = "uts",
  .unshare = uts_unshare,
};
DECLARE_TASKGROUP_SUBSYSTEM(uts_ns);
...
struct taskgroup_subsys cpuset_ss {
  .name = "cpuset",
  .create = cpuset_create,
  .attach = cpuset_attach,
};
DECLARE_TASKGROUP_SUBSYSTEM(cpuset_ss);
At boot time, the task_proxy init code would figure out from the size
of the task_subsys section how many pointers had to be in the
task_proxy object (maybe add a few spares for dynamically-loaded
modules?). The offset of the subsystem pointer within the task_subsys
data section would also be the offset of that subsystem's
per-task-group state within the task_proxy object, which should allow
accesses to be pretty efficient (with macros providing user-friendly
access to the appropriate locations in the task_proxy)
The loops in container.c in my patch that iterate over the subsys
array to perform callbacks, and the code in nsproxy.c that performs
the same action for each namespace type, would be replaced with
iterations over the task_subsys data section; possibly some
pre-processing of the various linked-in subsystems could be done to
remove unnecessary iterations. The generic code would handle things
like reference counting.
The existing unshare()/clone() interface would be a way to create a
child "container" (for want of a better term) that shared some
subsystem pointers with its parent and had cloned versions of others
(perhaps only for the namespace-like subsystems?); the filesystem
interface would allow you to create new "containers" that weren't
explicitly associated with processes, and to move processes between
"containers". Also, the filesystem interface would allow you to bind
multiple subsystems together to allow easier manipulation from
userspace, in a similar way to my current containers patch.
So in summary, it takes the concepts that resource controllers and
namespaces share (that of grouping tasks) and unifies them, while not
forcing them to behave exactly the same way. I can envisage some other
per-task pointers that are generally inherited by children being
possibly moved into this in the same way, e.g. task->user and
task->mempolicy, if we could come up with a solution that handles
groupings with sufficiently different lifetimes.
Thoughts?
Paul
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers |  
	|  |  |  
	|  |  
	| 
		
			| Re: [PATCH 0/2] resource control file system - aka containers on top of nsproxy! [message #17614 is a reply to message #17605] | Thu, 08 March 2007 00:50   |  
			| 
				
				
					|  Sam Vilain Messages: 73
 Registered: February 2006
 | Member |  |  |  
	| Srivatsa Vaddagiri wrote:
> container structure in your patches provides for these things:
>
> a.  A way to group tasks
> b.  A way to maintain several hierarchies of such groups
>
> If you consider just a. then I agree that container abstraction is
> redundant, esp for vserver resource control (nsproxy can already be used
> to group tasks).
>
> What nsproxy doesn't provide is b - a way to represent hierarchies of
> groups. 
>   
Well, that's like saying you can't put hierarchical data in a relational
database.
The hierarchy question is an interesting one, though. However I believe
it first needs to be broken down into subsystems and considered on a
subsystem-by-subsystem basis again, and if general patterns are
observed, then a common solution should stand out.
Let's go back to the namespaces we know about and discuss how
hierarchies apply to them. Please those able to brainstorm, do so - I
call green hat time.
1. UTS namespaces
Can a UTS namespace set any value it likes?
Can you inspect or set the UTS namespace values of a subservient UTS
namespace?
2. IPC namespaces
Can a process in an IPC namespace send a signal to those in a
subservient namespace?
3. PID namespaces
Can a process in a PID namespace see the processes in a subservient
namespace?
Do the processes in a subservient namespace appear in a higher level
namespace mapped to a different set of PIDs?
4. Filesystem namespaces
Can we see all of the mounts in a subservient namespace?
Does our namespace receive updates when their namespace mounts change?
(perhaps under a sub-directory)
5. L2 network namespaces
Can we see or alter the subservient network namespace's
interfaces/iptables/routing?
Are any of the subservient network namespace's interfaces visible in our
namespace, and by which mapping?
6. L3 network namespaces
Can we bind to a subservient network namespace's addresses?
Can we give or remove addresses to and from the subservient network
namespace's namespace?
Can we allow the namespace access to modify particular IP tables?
7. resource namespaces
Is the subservient namespace's resource usage counting against ours too?
Can we dynamically alter the subservient namespace's resource allocations?
8. anyone else?
So, we can see some general trends here - but it's never quite the same
question, and I think the best answers will come from a tailored
approach for each subsystem.
Each one *does* have some common questions - for instance, "is the
namespace allowed to create more namespaces of this type". That's
probably a capability bit for each, though.
So let's bring this back to your patches. If they are providing
visibility of ns_proxy, then it should be called namesfs or some such.
It doesn't really matter if processes disappear from namespace
aggregates, because that's what's really happening anyway. The only
problem is that if you try to freeze a namespace that has visibility of
things at this level, you might not be able to reconstruct the
filesystem in the same way. This may or may not be considered a problem,
but open filehandles and directory handles etc surviving a freeze/thaw
is part of what we're trying to achieve. Then again, perhaps some
visibility is better than none for the time being.
If they are restricted entirely to resource control, then don't use the
nsproxy directly - use the structure or structures which hang off the
nsproxy (or even task_struct) related to resource control.
Sam.
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers |  
	|  |  |  
	|  |  
	| 
		
			| Re: [PATCH 1/2] rcfs core patch [message #17616 is a reply to message #17603] | Thu, 08 March 2007 10:13   |  
			| 
				
				
					|  Srivatsa Vaddagiri Messages: 241
 Registered: August 2006
 | Senior Member |  |  |  
	| On Wed, Mar 07, 2007 at 08:12:00PM -0700, Eric W. Biederman wrote:
> The review is still largely happening at the why level but no
> one is addressing that yet.  So please can we have a why.
Here's a brief summary of what's happening and why. If its not clear, pls get 
back to us with specific questions.
There have been various projects attempting to provide resource
management support in Linux, including CKRM/Resource Groups and UBC.
Each had its own task-grouping mechanism. 
Paul Menage observed [1] that cpusets in the kernel already has a grouping
mechanism which was working well for cpusets. He went ahead and
generalized the grouping code in cpusets so that it could be used for
overall resource management purpose. With his patches, it is possible to
even create multiple hierarchies of groups (see [2] on why multiple
hierarchies) as follows:
mount -t container -o cpuset none /dev/cpuset	<- cpuset hierarchy
mount -t container -o mem,cpu none /dev/mem	<- memory/cpu hierarchy
mount -t container -o disk none /dev/disk	<- disk hierarchy
In each hierarchy, you can create task groups and manipulate the
resource parameters of each group. You can also move tasks between
groups at run-time (see [3] on why this is required). Each hierarchy is also 
manipulated independent of the other. 
Paul's patches also introduced a 'struct container' in the kernel, which
serves these key purposes:
- Task-grouping
	'struct container' represents a task-group created in each hierarchy. 
	So every directory created under /dev/cpuset or /dev/mem above will
	have a corresponding 'struct container' inside the kernel.
	All tasks pointing to the same 'struct container' are considered
	to be part of a group
	The 'struct container' in turn has pointers to resource objects 
	which store actual resource parameters for that group. In above
	example, 'struct container' created under /dev/cpuset will have a 
	pointer to 'struct cpuset' while 'struct container' created
	under /dev/disk will have pointer to 'struct disk_quota_or_whatever'.
- Maintain hierarchical information
	The 'struct container' also keeps track of hierarchical relationship 
	between groups.
The filesystem interface in the patches essentially serves these
purposes:
	- Provide an interface to manipulate task-groups. This includes
	  creating/deleting groups, listing tasks present in a group and 
	  moving tasks across groups
	- Provdes an interface to manipulate the resource objects
	  (limits etc) pointed to by 'struct container'.
As you know, the introduction of 'struct container' was objected to and
was felt redundant as a means to group tasks. Thats where I took a shot
at converting over Paul Menage's patch to avoid 'struct container'
abstraction and insead work with 'struct nsproxy'. In the rcfs patch,
each directory (in /dev/cpuset or /dev/disk) is associated with a
'struct nsproxy' instead. The most important need of the filesystem
interface is not to manipulate the nsproxy objects directly, but to manipulate
the resource objects (nsproxy->ctlr_data[] in the patches) which store 
information like limit etc.
> I have a question?  What does rcfs look like if we start with
> the code that is in the kernel?  That is start with namespaces
> and nsproxy and just build a filesystem to display/manipulate them?
> With the code built so it will support adding resource controllers
> when they are ready?
If I am not mistaken, Serge did attempt something in that direction,
only that it was based on Paul's container patches. rcfs can no doubt
support the same feature.
> >  	struct ipc_namespace *ipc_ns;
> >  	struct mnt_namespace *mnt_ns;
> >  	struct pid_namespace *pid_ns;
> > +#ifdef CONFIG_RCFS
> > +	struct list_head list;
> 
> This extra list of nsproxy's is unneeded and a performance problem the
> way it is used.  In general we want to talk about the individual resource
> controllers not the nsproxy.
I think if you consider the multiple hierarchy picture, the need becomes
obvious.
Lets say that you had these hierarchies : /dev/cpuset, /dev/mem, /dev/disk
and the various resource classes (task-groups) under them as below:
	/dev/cpuset/C1, /dev/cpuset/C1/C11, /dev/cpuset/C2
	/dev/mem/M1, /dev/mem/M2, /dev/mem/M3
	/dev/disk/D1, /dev/disk/D2, /dev/disk/D3
The nsproxy structure basically has pointers to a resource objects in
each of these hierarchies. 
	nsproxy { ..., C1, M1, D1} could be one nsproxy
	nsproxy { ..., C1, M2, D3} could be another nsproxy and so on
So you see, because of multi-hierachies, we can have different
combinations of resource classes.
When we support task movement across resource classes, we need to find a
nsproxy which has the right combination of resource classes that the
task's nsproxy can be hooked to.
That's where we need the nsproxy list. Hope this makes it clear.
> > +	void *ctlr_data[CONFIG_MAX_RC_SUBSYS];
> 
> I still don't understand why these pointers are so abstract,
> and why we need an array lookup into them?
we can avoid these abstract pointers and instead have a set of pointers
like this:
	struct nsproxy {
		...
		struct cpu_limit *cpu;	/* cpu control namespace */
		struct rss_limit *rss;	/* rss control namespace */
		struct cpuset *cs;	/* cpuset namespace */
	}
But that will make some code (like searching for a right nsproxy when a
task moves across classes/groups) very awkward.
> I'm still inclined to think this should be part of /proc, instead of a purely
> separate fs.  But I might be missing something.
A separate filesystem would give us more flexibility like the
implementing multi-hierarchy support described above.
-- 
Regards,
vatsa
References:
1. http://lkml.org/lkml/2006/09/20/200 
2. http://lkml.org/lkml/2006/11/6/95
3. http://lkml.org/lkml/2006/09/5/178
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers |  
	|  |  |  
	| 
		
			| Re: [PATCH 0/2] resource control file system - aka containers on top of nsproxy! [message #17619 is a reply to message #17614] | Thu, 08 March 2007 11:30   |  
			| 
				
				
					|  Srivatsa Vaddagiri Messages: 241
 Registered: August 2006
 | Senior Member |  |  |  
	| On Thu, Mar 08, 2007 at 01:50:01PM +1300, Sam Vilain wrote:
> 7. resource namespaces
It should be. Imagine giving 20% bandwidth to a user X. X wants to
divide this bandwidth further between multi-media (10%), kernel
compilation (5%) and rest (5%). So,
> Is the subservient namespace's resource usage counting against ours too?
Yes, the resource usage of children should be accounted when capping
parent resource usage.
> Can we dynamically alter the subservient namespace's resource allocations?
Should be possible yes. That lets user X completely manage his
allocation among whatever sub-groups he creates.
> So let's bring this back to your patches. If they are providing
> visibility of ns_proxy, then it should be called namesfs or some such.
The patches should give visibility to both nsproxy objects (by showing
what tasks share the same nsproxy objects and letting tasks move across
nsproxy objects if allowed) and the resource control objects pointed to
by nsproxy (struct cpuset, struct cpu_limit, struct rss_limit etc).
> It doesn't really matter if processes disappear from namespace
> aggregates, because that's what's really happening anyway. The only
> problem is that if you try to freeze a namespace that has visibility of
> things at this level, you might not be able to reconstruct the
> filesystem in the same way. This may or may not be considered a problem,
> but open filehandles and directory handles etc surviving a freeze/thaw
> is part of what we're trying to achieve. Then again, perhaps some
> visibility is better than none for the time being.
> 
> If they are restricted entirely to resource control, then don't use the
> nsproxy directly - use the structure or structures which hang off the
> nsproxy (or even task_struct) related to resource control.
-- 
Regards,
vatsa
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers |  
	|  |  |  
	|  |  
	| 
		
			| Re: [PATCH 1/2] rcfs core patch [message #17621 is a reply to message #17557] | Fri, 09 March 2007 02:35   |  
			| 
				
				
					|  Paul Jackson Messages: 157
 Registered: February 2006
 | Senior Member |  |  |  
	| Herbert wrote:
> why is the filesystem approach so favored for this
> kind of manipulations?
I don't have any clear sense of whether the additional uses of file
systems being considered here are a good idea or not, but the use of a
file system for cpusets has turned out quite well, in my (vain and
biased ;) view.
Cpusets are subsets of the CPUs and memory nodes on a system.
These subsets naturally form a partial ordering, where one cpuset is
below another if its CPUs and nodes are a subset of the other ones.
This forms a natural hierarchical space.  It is quite convenient to be
able to add names and file system like attributes, so that one can do
things like -name- the set of CPUs to which you are attaching a job, as
in "this job is to run on the CPUs in cpuset /foo/bar", and to further
have file system like permissions on these subsets, to control who can
access or modify them.
For such hierarchical data structures, especially ones where names and
permissions are useful, file systems are a more natural interface than
traditional system call usage patterns.
The key, in my view, is the 'shape' of the data.  If the data schema is
basically a single table, with uniform rows having a few fields each,
where each field is a simple integer or string (not a fancy formatted
string encoding some more elaborate shape) then classic system call
patterns work well.  If the schema is tree shaped, and especially if
the usual file system attributes such as a hierarchical name space and
permissions are useful, then a file system based API is probably best.
-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers |  
	|  |  |  
	| 
		
			| Re: [PATCH 0/2] resource control file system - aka containers on top of nsproxy! [message #17622 is a reply to message #17615] | Fri, 09 March 2007 04:27   |  
			| 
				
				
					|  Paul Jackson Messages: 157
 Registered: February 2006
 | Senior Member |  |  |  
	| > But "namespace" has well-established historical semantics too - a way
> of changing the mappings of local * to global objects. This
> accurately describes things liek resource controllers, cpusets, resource
> monitoring, etc.
No!
Cpusets don't rename or change the mapping of objects.
I suspect you seriously misunderstand cpusets and are trying to cram them
into a 'namespace' remapping role into which they don't fit.
So far as cpusets are concerned, CPU #17 is CPU #17, for all tasks,
regardless of what cpuset they are in.  They just might not happen to
be allowed to execute on CPU #17 at the moment, because that CPU is not
allowed by the cpuset they are in.
But they still call it CPU #17.
Similary the namespace of cpusets and of tasks (pid's) are single
system-wide namespaces, so far as cpusets are concerned.
Cpusets are not about alternative or multiple or variant name spaces.
They are about (considering just CPUs for the moment):
 1) creating a set of maps M0, M1, ... from the set of CPUs to a Boolean,
 2) creating a mapping Q from the set of tasks to these M0, ... maps, and
 3) imposing constraints on where tasks can run, as follows:
	For any task t, that task is allowed to run on CPU x iff Q(t)(x)
	is True.  Here, Q(t) will be one of the maps M0, ... aka a cpuset.
So far as cpusets are concerned, there is only one each of:
 A] a namespace numbering CPUs,
 B] a namespace numbering tasks (the process id),
 C] a namespace naming cpusets (the hierarchical name space normally
    mounted at /dev/cpuset, and corresponding to the Mn maps above) and
 D] a mapping of tasks to cpusets, system wide (just a map, not a namespace.)
All tasks (of sufficient authority) can see each of these, using a single
system wide name space for each of [A], [B], and [C].
Unless, that is, you call any mapping a "way of changing mappings".
To do so would be a senseless abuse of the phrase, in my view.
More generally, these resource managers all tend to divide some external
limited physical resource into multiple separately allocatable units.
If the resource is amorphous (one atom or cycle of it is interchangeable
with another) then we usually do something like divide it into 100
equal units and speak of percentages.  If the resource is naturally
subdivided into sufficiently small units (sufficient for the
granularity of resource management we require) then we take those units
as is.  Occassionally, as in the 'fake numa node' patch by David
Rientjes <rientjes@cs.washington.edu>, who worked at Google over the
last summer, if the natural units are not of sufficient granularity, we
fake up a somewhat finer division.
Then, in any case, and somewhat separately, we divide the tasks running
on the system into subsets.  More precisely, we partition the tasks,
where a partition of a set is a set of subsets of that set, pairwise
disjoint, whose union equals that set.
Then, finally, we map the task subsets (partition element) to the
resource units, and add hooks in the kernel where this particular
resource is allocated or scheduled to constrain the tasks to only using
the units to which their task partition element is mapped.
These hooks are usually the 'interesting' part of a resource management
patch; one needs to minimize impact on both the kernel source code and
on the runtime performance, and for these hooks, that can be a
challenge.  In particular, what are naturally system wide resource
management stuctures cannot be allowed to impose system wide locks on
critical resource allocation code paths (and it's usually the most
critical resources, such as memory, cpu and network, that we most need
to manage in the first place.)
==> This has nothing to do with remapping namespaces as I might use that
    phrase though I cannot claim to be qualified enough to speak on behalf
    of the Generally Established Principles of Computer Science.
I am as qualified as anyone to speak on behalf of cpusets, and I suspect
you are not accurately understanding them if you think of them as remapping
namespaces.
-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers |  
	|  |  |  
	| 
		
			| Re: [PATCH 0/2] resource control file system - aka containers on top of nsproxy! [message #17625 is a reply to message #17605] | Thu, 08 March 2007 18:13   |  
			| 
				
				
					|  Srivatsa Vaddagiri Messages: 241
 Registered: August 2006
 | Senior Member |  |  |  
	| On Wed, Mar 07, 2007 at 11:00:31PM +0530, Srivatsa Vaddagiri wrote:
> > I'd like to see that. I suspect it will be a bit more fiddly than the
> > simple cpu_acct subsystem.
> 
> I am almost done with the conversion. And yes cpuset is a beast to
> convert over! Will test and send the patches out tomorrow.
Ok ..I am not in a state yet where I can post the patches to lkml in the
usual conventions (breaking down neatly/good documentation etc). But I
do have something which seems to work! I could mount cpuset as:
	mount -t rcfs -ocpuset none cpuset
	cd cpuset
	mkdir a
	cd a
	cat tasks	# shows nothing
	echo 7 > cpus
	echo 0 > mems
	echo 1 > cpu_exclusive
	echo some_pid > tasks
	cat tasks	# shows some_pid
top now shows some_pid running on CPU7, as expected :)
	
Instead of the usual convention of inlining patches and sending them in 
separate mails, I am sending all of them as attachments (beware, bugs around!). 
But this gives you an idea on which direction this is proceeding ..
Todo:
        - Introduce refcounting of resource objects (get/put_res_ns)
	- rmdir needs to check resource object refcount rather than
	  nsproxy's
	- Trace couple of other lockdep warnings I have hit
Patches attached.
-- 
Regards,
vatsa
---
 linux-2.6.20-vatsa/include/linux/init_task.h |   11 
 linux-2.6.20-vatsa/include/linux/nsproxy.h   |   11 
 linux-2.6.20-vatsa/init/Kconfig              |   22 
 linux-2.6.20-vatsa/init/main.c               |    3 
 linux-2.6.20-vatsa/kernel/Makefile           |    1 
---
 linux-2.6.20.1-vatsa/include/linux/init_task.h |   11 
 linux-2.6.20.1-vatsa/include/linux/nsproxy.h   |   11 
 linux-2.6.20.1-vatsa/include/linux/rcfs.h      |   76 +
 linux-2.6.20.1-vatsa/init/Kconfig              |   22 
 linux-2.6.20.1-vatsa/init/main.c               |    3 
 linux-2.6.20.1-vatsa/kernel/Makefile           |    1 
 linux-2.6.20.1-vatsa/kernel/nsproxy.c          |   65 +
 linux-2.6.20.1-vatsa/kernel/rcfs.c             | 1202 +++++++++++++++++++++++++
 8 files changed, 1391 insertions(+)
diff -puN include/linux/init_task.h~rcfs include/linux/init_task.h
--- linux-2.6.20.1/include/linux/init_task.h~rcfs	2007-03-08 21:21:33.000000000 +0530
+++ linux-2.6.20.1-vatsa/include/linux/init_task.h	2007-03-08 21:21:34.000000000 +0530
@@ -71,6 +71,16 @@
 }
 
 extern struct nsproxy init_nsproxy;
+
+#ifdef CONFIG_RCFS
+#define INIT_RCFS(nsproxy) 						\
+	.list 		= LIST_HEAD_INIT(nsproxy.list),			\
+	.ctlr_data 	= {[ 0 ... CONFIG_MAX_RC_SUBSYS-1 ] = NULL },
+#else
+#define INIT_RCFS(nsproxy)
+#endif
+
+
 #define INIT_NSPROXY(nsproxy) {						\
 	.pid_ns		= &init_pid_ns,					\
 	.count		= ATOMIC_INIT(1),				\
@@ -78,6 +88,7 @@ extern struct nsproxy init_nsproxy;
 	.uts_ns		= &init_uts_ns,					\
 	.mnt_ns		= NULL,						\
 	INIT_IPC_NS(ipc_ns)						\
+	INIT_RCFS(nsproxy)						\
 }
 
 #define INIT_SIGHAND(sighand) {						\
diff -puN include/linux/nsproxy.h~rcfs include/linux/nsproxy.h
--- linux-2.6.20.1/include/linux/nsproxy.h~rcfs	2007-03-08 21:21:33.000000000 +0530
+++ linux-2.6.20.1-vatsa/include/linux/nsproxy.h	2007-03-08 21:21:34.000000000 +0530
@@ -28,6 +28,10 @@ struct nsproxy {
 	struct ipc_namespace *ipc_ns;
 	struct mnt_namespace *mnt_ns;
 	struct pid_namespace *pid_ns;
+#ifdef CONFIG_RCFS
+	struct list_head list;
+	void *ctlr_data[CONFIG_MAX_RC_SUBSYS];
+#endif
 };
 extern struct nsproxy init_nsproxy;
 
@@ -35,6 +39,13 @@ struct nsproxy *dup_namespaces(struct ns
 int copy_namespaces(int flags, struct task_struct *tsk);
 void get_task_namespaces(struct task_struct *tsk);
 void free_nsproxy(struct nsproxy *ns);
+#ifdef CONFIG_RCFS
+struct nsproxy *find_nsproxy(struct nsproxy *ns);
+int namespaces_init(void);
+int nsproxy_task_count(void *data, int idx);
+#else
+static inline int namespaces_init(void) { return 0;}
+#endif
 
 static inline void put_nsproxy(struct nsproxy *ns)
 {
diff -puN /dev/null include/linux/rcfs.h
--- /dev/null	2007-03-08 22:46:54.325490448 +0530
+++ linux-2.6.20.1-vatsa/include/linux/rcfs.h	2007-03-08 21:21:34.000000000 +0530
@@ -0,0 +1,76 @@
+#ifndef _LINUX_RCFS_H
+#define _LINUX_RCFS_H
+
+#ifdef CONFIG_RCFS
+
+/* struct cftype:
+ *
+ * The files in the container filesystem mostly have a very simple read/write
+ * handling, some common function will take care of it. Nevertheless some cases
+ * (read tasks) are special and therefore I define this structure for every
+ * kind of file.
+ *
+ *
+ * When reading/writing to a file:
+ *	- the container to use in file->f_dentry->d_parent->d_fsdata
+ *	- the 'cftype' of the file is file->f_dentry->d_fsdata
+ */
+
+struct inode;
+#define MAX_CFTYPE_NAME 64
+struct cftype {
+	/* By convention, the name should begin with the name of the
+	 * subsystem, followed by a period */
+	char name[MAX_CFTYPE_NAME];
+	int private;
+	int (*open) (struct inode *inode, struct file *file);
+	ssize_t (*read) (struct nsproxy *ns, struct cftype *cft,
+			 struct file *file,
+			 char __user *buf, size_t nbytes, loff_t *ppos);
+	ssize_t (*write) (struct nsproxy *ns, struct cftype *cft,
+			  struct file *file,
+			  const char __user *buf, size_t nbytes, loff_t *ppos);
+	int (*release) (struct inode *inode, struct file *file);
+};
+
+/* resource control subsystem type. See Documentation/rcfs.txt for details */
+
+struct rc_subsys {
+	int (*create)(struct rc_subsys *ss, struct nsproxy *ns,
+			 struct nsproxy *parent);
+	void (*destroy)(struct rc_subsys *ss, struct nsproxy *ns);
+	int (*can_attach)(struct rc_subsys *ss, struct nsproxy *ns,
+				 struct task_struct *tsk);
+	void (*attach)(struct rc_subsys *ss, struct nsproxy *new,
+				 struct nsproxy *old, struct task_struct *tsk);
+	int (*populate)(struct rc_subsys *ss, struct dentry *d);
+	int subsys_id;
+	int active;
+
+#define MAX_CONTAINER_TYPE_NAMELEN 32
+	const char *name;
+
+	/* Protected by RCU */
+	int hierarchy;
+
+	struct list_head sibling;
+};
+
+int rc_register_subsys(struct rc_subsys *subsys);
+/* Add a new file to the given container directory. Should only be
+ * called by subsystems from within a populate() method */
+int rcfs_add_file(struct dentry *d, const struct cftype *cft);
+extern int rcfs_init(void);
+extern void rcfs_manage_lock(void);
+extern void rcfs_manage_unlock(void);
+extern int rcfs_dir_removed(struct dentry *d);
+extern int rcfs_path(struct dentry *d, char *buf, int len);
+
+#else
+
+static inline int rcfs_init(void) { return 0; }
+
+#endif
+
+
+#endif
diff -puN init/Kconfig~rcfs init/Kconfig
--- linux-2.6.20.1/init/Kconfig~rcfs	2007-03-08 21:21:33.000000000 +0530
+++ linux-2.6.20.1-vatsa/init/Kconfig	2007-03-08 22:47:50.000000000 +0530
@@ -238,6 +238,28 @@ config IKCONFIG_PROC
 	  This option enables access to the kernel configuration file
 	  through /proc/config.gz.
 
+config RCFS
+	bool "Resource control file system support"
+	default n
+	help
+	  This option will let you create and manage resource containers,
+	  which can be used to aggregate multiple processes, e.g. for
+	  the purposes of resource tracking.
+
+	  Say N if unsure
+
+config MAX_RC_SUBSYS
+       int "Number of resource control subsystems to support"
+       depends on RCFS
+       range 1 255
+       default 8
+
+config MAX_RC_HIERARCHIES
+       int "Number of rcfs hierarchies to support"
+       depends on RCFS
+       range 2 255
+       default 4
+
 config CPUSETS
 	bool "Cpuset support"
 	depends on SMP
diff -puN init/main.c~rcfs init/main.c
--- linux-2.6.20.1/init/main.c~rcfs	2007-03-08 21:21:33.000000000 +0530
+++ linux-2.6.20.1-vatsa/init/main.c	2007-03-08 21:21:34.000000000 +0530
@@ -52,6 +52,7 @@
 #include <linux/lockdep.h>
 #include <linux/pid_namespace.h>
 #include <linux/device.h>
+#include <linux/rcfs.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -512,6 +513,7 @@ asmlinkage void __init start_kernel(void
 	setup_per_cpu_areas();
 	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
 
+	namespaces_init();
 	/*
 	 * Set up the scheduler prior starting any interrupts (such as the
 	 * timer interrupt). Full topology setup happens at smp_init()
@@ -578,6 +580,7 @@ asmlinkage void __init start_kernel(void
 	}
 #endif
 	vfs_caches_init_early();
+	rcfs_init();
 	cpuset_init_early();
 	mem_init();
 	kmem_cache_init();
diff -puN kernel/Makefile~rcfs kernel/Makefile
--- linux-2.6.20.1/kernel/Makefile~rcfs	2007-03-08 21:21:34.000000000 +0530
+++ linux-2.6.20.1-vatsa/kernel/Makefile	2007-03-08 22:47:50.000000000 +0530
@@ -50,6 +50,7 @@ obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
+obj-$(CONFIG_RCFS) += rcfs.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff -puN kernel/nsproxy.c~rcfs kernel/nsproxy.c
--- linux-2.6.20.1/kernel/nsproxy.c~rcfs	2007-03-08 21:21:34.000000000 +0530
+++ linux-2.6.20.1-vatsa/kernel/nsproxy.c	2007-03-08 22:54:04.000000000 +0530
@@ -23,6 +23,11 @@
 
 struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
 
+#ifdef CONFIG_RCFS
+static LIST_HEAD(nslisthead);
+static DEFINE_SPINLOCK(nslistlock);
+#endif
+
 static inline void get_nsproxy(struct nsproxy *ns)
 {
 	atomic_inc(&ns->count);
@@ -71,6 +76,12 @@ struct nsproxy *dup_namespaces(struct ns
 			get_pid_ns(ns->pid_ns);
 	}
 
+#ifdef CONFIG_RCFS
+	spin_lock_irq(&nslistlock);
+	list_add(&ns->list, &nslisthead);
+	spin_unlock_irq(&nslistlock);
+#endif
+
 	return ns;
 }
 
@@ -145,5 +156,59 @@ void free_nsproxy(struct nsproxy *ns)
 		put_ipc_ns(ns->ipc_ns);
 	if (ns->pid_ns)
 		put_pid_ns(ns->pid_ns);
+#ifdef CONFIG_RCFS
+	spin_lock_irq(&nslistlock);
+	list_del(&ns->list);
+	spin_unlock_irq(&nslistlock);
+#endif
 	kfree(ns);
 }
+
+#ifdef CONFIG_RCFS
+struct nsproxy *find_nsproxy(struct nsproxy *target)
+{
+	struct nsproxy *ns;
+	int i = ...
 
 |  
	|  |  |  
	| 
		
			| Re: [PATCH 1/2] rcfs core patch [message #17626 is a reply to message #17557] | Fri, 09 March 2007 09:38   |  
			| 
				
				
					|  Paul Jackson Messages: 157
 Registered: February 2006
 | Senior Member |  |  |  
	| Kirill, responding to Herbert:
> > do we need or even want that? IMHO the hierarchical
> > concept CKRM was designed with, was also the reason
> > for it being slow, unuseable and complicated
> 1. cpusets are hierarchical already. So hierarchy is required.
I think that CKRM has a harder time doing a hierarchy than cpusets.
CKRM is trying to account for and control how much of an amorphous
resource is used, whereas cpusets is trying to control whether a
specifically identifiable resource is used, or not used, not how
much of it is used.
A child cpuset gets configured to allow certain CPUs and Nodes, and
then does not need to dynamically pass back any information about
what is actually used - it's a one-way control with no feedback.
That's a relatively easier problem.
CKRM (as I recall it, from long ago ...) has to track the amount
of usage dynamically, across parent and child groups (whatever they
were called.)  That's a harder problem.
So, yes, as Kirill observes, we need the hierarchy because cpusets
has it, cpuset users make good use of the hierarchy, and the hierarchy
works fine in that case, even if a hierarchy is more difficult for CKRM.
-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers |  
	|  |  | 
 
 
 Current Time: Fri Oct 24 21:56:28 GMT 2025 
 Total time taken to generate the page: 0.18726 seconds |