| Home » Mailing lists » Devel » [RFC][PATCH 0/7] Resource controllers based on process containers Goto Forum:
	| 
		
			| [RFC][PATCH 0/7] Resource controllers based on process containers [message #10888] | Tue, 06 March 2007 14:42  |  
			| 
				
				
					|  xemul Messages: 248
 Registered: November 2005
 | Senior Member |  |  |  
	| This patchset adds RSS accounting and control, and limits the number of tasks and files within a container.
 
 Based on top of Paul Menage's container subsystem v7
 
 The RSS controller includes a per-container RSS accounter,
 reclamation and an OOM killer. It behaves like a standalone
 machine - when a container runs out of resources it tries
 to reclaim some pages, and if that does not succeed it
 kills a task whose mm_struct belongs to the container in
 question.
 
 The num tasks and num files containers are very simple and
 self-explanatory from the code.
 
 As discussed before when a task moves from one container
 to another no resources follow it - they keep holding the
 container they were allocated in.
 
 The difficulties encountered while using Paul's containers were:
 
 1. The container fork hook is placed before the new task
 changes. This makes it impossible to handle fork
 properly, i.e. the new mm_struct should have a pointer
 to the RSS container, but we don't have one at that
 early time.
 
 2. Extended containers may register themselves too late.
 Kernel threads/helpers start forking, opening files
 and touching pages much earlier. This patchset
 works around this in a not-so-cute manner and I'm waiting
 for Paul's comments on this issue.
 |  
	|  |  |  
	| 
		
			| [RFC][PATCH 1/7] Resource counters [message #10889 is a reply to message #10888] | Tue, 06 March 2007 14:47   |  
			| 
				
				
					|  xemul Messages: 248
 Registered: November 2005
 | Senior Member |  |  |  
	| Introduce generic structures and routines for resource accounting.
 
 Each resource accounting container is supposed to
 aggregate a res_counter, a container_subsys_state and its
 own resource-specific members.
 
 diff -upr linux-2.6.20.orig/include/linux/res_counter.h linux-2.6.20-0/include/linux/res_counter.h
 --- linux-2.6.20.orig/include/linux/res_counter.h	2007-03-06 13:39:17.000000000 +0300
 +++ linux-2.6.20-0/include/linux/res_counter.h	2007-03-06 13:33:28.000000000 +0300
 @@ -0,0 +1,83 @@
 +#ifndef __RES_COUNTER_H__
 +#define __RES_COUNTER_H__
 +/*
 + * resource counters
 + *
 + * Copyright 2007 OpenVZ SWsoft Inc
 + *
 + * Author: Pavel Emelianov <xemul@openvz.org>
 + *
 + */
 +
 +#include <linux/container.h>
 +
 +struct res_counter {	/* one accounted resource: current usage against a limit */
 +	unsigned long usage;	/* amount currently charged */
 +	unsigned long limit;	/* charges that would push usage above this are refused */
 +	unsigned long failcnt;	/* number of refused charge attempts */
 +	spinlock_t lock;	/* taken by the non-_locked charge/uncharge helpers */
 +};
 +
 +enum {	/* "member" selectors for res_counter_read()/res_counter_write() */
 +	RES_USAGE,	/* selects res_counter.usage */
 +	RES_LIMIT,	/* selects res_counter.limit */
 +	RES_FAILCNT,	/* selects res_counter.failcnt */
 +};
 +
 +ssize_t res_counter_read(struct res_counter *cnt, int member,	/* member is one of RES_* */
 +		const char __user *buf, size_t nbytes, loff_t *pos);	/* copies the value as "%lu\n" text to buf */
 +ssize_t res_counter_write(struct res_counter *cnt, int member,	/* member is one of RES_* */
 +		const char __user *buf, size_t nbytes, loff_t *pos);	/* parses a decimal value from buf and stores it */
 +
 +/*
 + * Prepare a counter for use: set the limit to LONG_MAX and initialize
 + * the lock.  usage and failcnt are not touched here.
 + */
 +static inline void res_counter_init(struct res_counter *cnt)
 +{
 +	cnt->limit = (unsigned long)LONG_MAX;
 +	spin_lock_init(&cnt->lock);
 +}
 +
 +/*
 + * Charge @val units to @cnt.  The caller must hold cnt->lock.
 + *
 + * Returns 0 and raises cnt->usage on success.  If the charge would push
 + * usage above cnt->limit, usage is left alone, cnt->failcnt is bumped
 + * and -ENOMEM is returned.
 + */
 +static inline int res_counter_charge_locked(struct res_counter *cnt,
 +		unsigned long val)
 +{
 +	/*
 +	 * Compare val against the limit first: "limit - val" is unsigned
 +	 * and would wrap to a huge value for val > limit, letting an
 +	 * oversized charge through.
 +	 */
 +	if (val <= cnt->limit && cnt->usage <= cnt->limit - val) {
 +		cnt->usage += val;
 +		return 0;
 +	}
 +
 +	cnt->failcnt++;
 +	return -ENOMEM;
 +}
 +
 +/*
 + * Locked wrapper around res_counter_charge_locked(): holds cnt->lock
 + * with interrupt state saved for the duration of the charge and
 + * returns whatever the locked helper returned.
 + */
 +static inline int res_counter_charge(struct res_counter *cnt,
 +		unsigned long val)
 +{
 +	unsigned long irq_flags;
 +	int err;
 +
 +	spin_lock_irqsave(&cnt->lock, irq_flags);
 +	err = res_counter_charge_locked(cnt, val);
 +	spin_unlock_irqrestore(&cnt->lock, irq_flags);
 +
 +	return err;
 +}
 +
 +/*
 + * Give back @val units.  An attempt to return more than is currently
 + * charged triggers a warning and is clamped so usage ends at zero.
 + */
 +static inline void res_counter_uncharge_locked(struct res_counter *cnt,
 +		unsigned long val)
 +{
 +	unsigned long dec = val;
 +
 +	if (unlikely(dec > cnt->usage)) {
 +		WARN_ON(1);
 +		dec = cnt->usage;
 +	}
 +
 +	cnt->usage -= dec;
 +}
 +
 +/*
 + * Locked wrapper around res_counter_uncharge_locked(): holds cnt->lock
 + * with interrupt state saved while the usage is lowered.
 + */
 +static inline void res_counter_uncharge(struct res_counter *cnt,
 +		unsigned long val)
 +{
 +	unsigned long irq_flags;
 +
 +	spin_lock_irqsave(&cnt->lock, irq_flags);
 +	res_counter_uncharge_locked(cnt, val);
 +	spin_unlock_irqrestore(&cnt->lock, irq_flags);
 +}
 +
 +#endif
 diff -upr linux-2.6.20.orig/init/Kconfig linux-2.6.20-0/init/Kconfig
 --- linux-2.6.20.orig/init/Kconfig	2007-03-06 13:33:28.000000000 +0300
 +++ linux-2.6.20-0/init/Kconfig	2007-03-06 13:33:28.000000000 +0300
 @@ -265,6 +265,10 @@ config CPUSETS
 
 Say N if unsure.
 
 +config RESOURCE_COUNTERS
 +	bool
 +	select CONTAINERS
 +
 config SYSFS_DEPRECATED
 bool "Create deprecated sysfs files"
 default y
 diff -upr linux-2.6.20.orig/kernel/Makefile linux-2.6.20-0/kernel/Makefile
 --- linux-2.6.20.orig/kernel/Makefile	2007-03-06 13:33:28.000000000 +0300
 +++ linux-2.6.20-0/kernel/Makefile	2007-03-06 13:33:28.000000000 +0300
 @@ -51,6 +51,7 @@ obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 +obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
 diff -upr linux-2.6.20.orig/kernel/res_counter.c linux-2.6.20-0/kernel/res_counter.c
 --- linux-2.6.20.orig/kernel/res_counter.c	2007-03-06 13:39:17.000000000 +0300
 +++ linux-2.6.20-0/kernel/res_counter.c	2007-03-06 13:33:28.000000000 +0300
 @@ -0,0 +1,72 @@
 +/*
 + * resource counters
 + *
 + * Copyright 2007 OpenVZ SWsoft Inc
 + *
 + * Author: Pavel Emelianov <xemul@openvz.org>
 + *
 + */
 +
 +#include <linux/parser.h>
 +#include <linux/fs.h>
 +#include <linux/res_counter.h>
 +#include <asm/uaccess.h>
 +
 +/*
 + * Translate a RES_* selector into a pointer to the matching field of
 + * @cnt.  Any other selector value hits BUG().
 + */
 +static inline unsigned long *res_counter_member(struct res_counter *cnt, int member)
 +{
 +	if (member == RES_USAGE)
 +		return &cnt->usage;
 +	if (member == RES_LIMIT)
 +		return &cnt->limit;
 +	if (member == RES_FAILCNT)
 +		return &cnt->failcnt;
 +
 +	BUG();
 +	return NULL;
 +}
 +
 +/*
 + * Format the selected counter field as "<value>\n" and hand it to
 + * simple_read_from_buffer() to serve the read at *pos.
 + */
 +ssize_t res_counter_read(struct res_counter *cnt, int member,
 +		const char __user *userbuf, size_t nbytes, loff_t *pos)
 +{
 +	char text[64];
 +	int len;
 +
 +	len = sprintf(text, "%lu\n", *res_counter_member(cnt, member));
 +	return simple_read_from_buffer((void __user *)userbuf, nbytes,
 +			pos, text, len);
 +}
 +
 +/*
 + * Parse a decimal number written by userspace and store it in the
 + * counter field selected by @member (one of RES_*).
 + *
 + * A single trailing newline is accepted so that "echo N > file" works.
 + * Returns @nbytes on success, -EINVAL for empty, over-long or
 + * non-numeric input and -EFAULT if the user buffer cannot be read.
 + */
 +ssize_t res_counter_write(struct res_counter *cnt, int member,
 +		const char __user *userbuf, size_t nbytes, loff_t *pos)
 +{
 +	/*
 +	 * An unsigned long never needs more than a few dozen digits, so a
 +	 * small on-stack buffer is enough and avoids sizing an allocation
 +	 * from a user-controlled length.
 +	 */
 +	char buf[64], *end;
 +	unsigned long tmp, *val;
 +
 +	if (nbytes == 0 || nbytes >= sizeof(buf))
 +		return -EINVAL;
 +
 +	if (copy_from_user(buf, userbuf, nbytes))
 +		return -EFAULT;
 +	buf[nbytes] = '\0';
 +
 +	tmp = simple_strtoul(buf, &end, 10);
 +	/* No digits consumed: do not silently treat the input as zero. */
 +	if (end == buf)
 +		return -EINVAL;
 +	if (*end == '\n')
 +		end++;
 +	if (*end != '\0')
 +		return -EINVAL;
 +
 +	val = res_counter_member(cnt, member);
 +	*val = tmp;
 +	return nbytes;
 +}
 |  
	|  |  |  
	| 
		
			| [RFC][PATCH 2/7] RSS controller core [message #10890 is a reply to message #10888] | Tue, 06 March 2007 14:53   |  
			| 
				
				
					|  xemul Messages: 248
 Registered: November 2005
 | Senior Member |  |  |  
	| This includes setup of RSS container within generic process containers, all the declarations used in RSS
 accounting, and core code responsible for accounting.
 
 diff -upr linux-2.6.20.orig/include/linux/rss_container.h linux-2.6.20-0/include/linux/rss_container.h
 --- linux-2.6.20.orig/include/linux/rss_container.h	2007-03-06 13:39:17.000000000 +0300
 +++ linux-2.6.20-0/include/linux/rss_container.h	2007-03-06 13:33:28.000000000 +0300
 @@ -0,0 +1,68 @@
 +#ifndef __RSS_CONTAINER_H__
 +#define __RSS_CONTAINER_H__
 +/*
 + * RSS container
 + *
 + * Copyright 2007 OpenVZ SWsoft Inc
 + *
 + * Author: Pavel Emelianov <xemul@openvz.org>
 + *
 + */
 +
 +struct page_container;
 +struct rss_container;
 +
 +#ifdef CONFIG_RSS_CONTAINER
 +int container_rss_prepare(struct page *, struct vm_area_struct *vma,	/* charge one page; 0 or -ENOMEM */
 +		struct page_container **);
 +
 +void container_rss_add(struct page_container *);	/* link onto the container's page list */
 +void container_rss_del(struct page_container *);	/* unlink, uncharge and free */
 +void container_rss_release(struct page_container *);	/* uncharge and free without unlinking */
 +
 +int mm_init_container(struct mm_struct *mm, struct task_struct *tsk);	/* bind mm to tsk's RSS container */
 +void mm_free_container(struct mm_struct *mm);	/* drop the mm's container reference */
 +
 +unsigned long container_isolate_pages(unsigned long nr_to_scan,
 +		struct rss_container *rss, struct list_head *dst,
 +		int active, unsigned long *scanned);
 +unsigned long container_nr_physpages(struct rss_container *rss);	/* current usage counter */
 +
 +unsigned long container_try_to_free_pages(struct rss_container *);	/* defined outside this view */
 +void container_out_of_memory(struct rss_container *);	/* defined outside this view */
 +
 +void container_rss_init_early(void);
 +#else
 +static inline int container_rss_prepare(struct page *pg,	/* !CONFIG_RSS_CONTAINER stub: always succeeds */
 +		struct vm_area_struct *vma, struct page_container **pc)
 +{
 +	*pc = NULL; /* to make gcc happy */
 +	return 0;
 +}
 +
 +static inline void container_rss_add(struct page_container *pc)	/* stub: no-op */
 +{
 +}
 +
 +static inline void container_rss_del(struct page_container *pc)	/* stub: no-op */
 +{
 +}
 +
 +static inline void container_rss_release(struct page_container *pc)	/* stub: no-op */
 +{
 +}
 +
 +static inline int mm_init_container(struct mm_struct *mm, struct task_struct *t)	/* stub: always succeeds */
 +{
 +	return 0;
 +}
 +
 +static inline void mm_free_container(struct mm_struct *mm)	/* stub: no-op */
 +{
 +}
 +
 +static inline void container_rss_init_early(void)	/* stub: no-op */
 +{
 +}
 +#endif
 +#endif
 diff -upr linux-2.6.20.orig/init/Kconfig linux-2.6.20-0/init/Kconfig
 --- linux-2.6.20.orig/init/Kconfig	2007-03-06 13:33:28.000000000 +0300
 +++ linux-2.6.20-0/init/Kconfig	2007-03-06 13:33:28.000000000 +0300
 @@ -265,6 +265,13 @@ config CPUSETS
 bool
 select CONTAINERS
 
 +config RSS_CONTAINER
 +	bool "RSS accounting container"
 +	select RESOURCE_COUNTERS
 +	help
 +	  Provides a simple Resource Controller for monitoring and
 +	  controlling the total Resident Set Size of the tasks in a container
 +
 config SYSFS_DEPRECATED
 bool "Create deprecated sysfs files"
 default y
 diff -upr linux-2.6.20.orig/mm/Makefile linux-2.6.20-0/mm/Makefile
 --- linux-2.6.20.orig/mm/Makefile	2007-02-04 21:44:54.000000000 +0300
 +++ linux-2.6.20-0/mm/Makefile	2007-03-06 13:33:28.000000000 +0300
 @@ -29,3 +29,5 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_h
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 +
 +obj-$(CONFIG_RSS_CONTAINER) += rss_container.o
 diff -upr linux-2.6.20.orig/mm/rss_container.c linux-2.6.20-0/mm/rss_container.c
 --- linux-2.6.20.orig/mm/rss_container.c	2007-03-06 13:39:17.000000000 +0300
 +++ linux-2.6.20-0/mm/rss_container.c	2007-03-06 13:33:28.000000000 +0300
 @@ -0,0 +1,307 @@
 +/*
 + * RSS accounting container
 + *
 + * Copyright 2007 OpenVZ SWsoft Inc
 + *
 + * Author: Pavel Emelianov <xemul@openvz.org>
 + *
 + */
 +
 +#include <linux/list.h>
 +#include <linux/sched.h>
 +#include <linux/mm.h>
 +#include <linux/res_counter.h>
 +#include <linux/rss_container.h>
 +
 +static struct container_subsys rss_subsys;
 +
 +struct rss_container {	/* per-container RSS accounting state */
 +	struct res_counter res;	/* charged pages; res.lock is also held around page_list updates */
 +	struct list_head page_list;	/* page_container entries charged to this container */
 +	struct container_subsys_state css;	/* embedded subsystem state; target of css_get()/css_put() */
 +};
 +
 +struct page_container {	/* ties one charged page to its rss_container */
 +	struct page *page;	/* the accounted page */
 +	struct rss_container *cnt;	/* container the page is charged to */
 +	struct list_head list;	/* entry in cnt->page_list */
 +};
 +
 +/* Map a generic container to the rss_container embedding its state. */
 +static inline struct rss_container *rss_from_cont(struct container *cnt)
 +{
 +	struct container_subsys_state *state;
 +
 +	state = container_subsys_state(cnt, &rss_subsys);
 +	return container_of(state, struct rss_container, css);
 +}
 +
 +/*
 + * Attach @mm to the RSS container @tsk currently belongs to, taking a
 + * reference on it.  Returns 0 on success or -EBUSY when css_get()
 + * reports failure.
 + */
 +int mm_init_container(struct mm_struct *mm, struct task_struct *tsk)
 +{
 +	struct rss_container *rss =
 +		rss_from_cont(task_container(tsk, &rss_subsys));
 +
 +	if (!css_get(&rss->css)) {
 +		mm->rss_container = rss;
 +		return 0;
 +	}
 +
 +	return -EBUSY;
 +}
 +
 +/* Drop the reference @mm holds on its RSS container. */
 +void mm_free_container(struct mm_struct *mm)
 +{
 +	struct rss_container *rss = mm->rss_container;
 +
 +	css_put(&rss->css);
 +}
 +
 +/*
 + * Charge one page to the RSS container of @vma's mm and allocate the
 + * page_container that will describe it.
 + *
 + * On success stores the new page_container in *@ppc and returns 0,
 + * keeping a reference on the container.  On failure returns -ENOMEM
 + * with the reference dropped; *@ppc is not written.
 + */
 +int container_rss_prepare(struct page *page, struct vm_area_struct *vma,
 +		struct page_container **ppc)
 +{
 +	struct page_container *entry;
 +	struct rss_container *owner;
 +
 +	/* Take the reference on the mm's container inside the RCU section. */
 +	rcu_read_lock();
 +	owner = rcu_dereference(vma->vm_mm->rss_container);
 +	css_get_current(&owner->css);
 +	rcu_read_unlock();
 +
 +	entry = kmalloc(sizeof(struct page_container), GFP_KERNEL);
 +	if (entry == NULL)
 +		goto fail;
 +
 +	for (;;) {
 +		if (!res_counter_charge(&owner->res, 1))
 +			break;
 +
 +		/* Over the limit: retry while reclaim reports progress. */
 +		if (container_try_to_free_pages(owner))
 +			continue;
 +
 +		/* Nothing reclaimed: run the container OOM path. */
 +		container_out_of_memory(owner);
 +		if (test_thread_flag(TIF_MEMDIE)) {
 +			kfree(entry);
 +			goto fail;
 +		}
 +	}
 +
 +	entry->page = page;
 +	entry->cnt = owner;
 +	*ppc = entry;
 +	return 0;
 +
 +fail:
 +	css_put(&owner->css);
 +	return -ENOMEM;
 +}
 +
 +/*
 + * Back out a page_container: return the one-page charge, drop the
 + * container reference and free @pc.  No list removal is done here.
 + */
 +void container_rss_release(struct page_container *pc)
 +{
 +	struct rss_container *owner = pc->cnt;
 +
 +	res_counter_uncharge(&owner->res, 1);
 +	css_put(&owner->css);
 +	kfree(pc);
 +}
 +
 +/*
 + * Put @pc on its container's page list (under the counter lock) and
 + * then record it in the page itself.
 + */
 +void container_rss_add(struct page_container *pc)
 +{
 +	struct rss_container *owner = pc->cnt;
 +
 +	spin_lock(&owner->res.lock);
 +	list_add(&pc->list, &owner->page_list);
 +	spin_unlock(&owner->res.lock);
 +
 +	page_container(pc->page) = pc;
 +}
 +
 +/*
 + * Remove @pc from its container's page list and return its one-page
 + * charge under the counter lock, then drop the container reference
 + * and free @pc.
 + */
 +void container_rss_del(struct page_container *pc)
 +{
 +	struct rss_container *owner = pc->cnt;
 +
 +	spin_lock(&owner->res.lock);
 +	list_del(&pc->list);
 +	res_counter_uncharge_locked(&owner->res, 1);
 +	spin_unlock(&owner->res.lock);
 +
 +	css_put(&owner->css);
 +	kfree(pc);
 +}
 +
 +unsigned long container_isolate_pages(unsigned long nr_to_scan,	/* move up to nr_to_scan of rss's pages onto dst */
 +		struct rss_container *rss, struct list_head *dst,
 +		int active, unsigned long *scanned)	/* active selects PageActive vs. !PageActive pages */
 +{
 +	unsigned long nr_taken = 0;	/* pages actually moved to dst; the return value */
 +	struct page *page;
 +	struct page_container *pc;
 +	unsigned long scan;
 +	struct list_head *src;
 +	LIST_HEAD(pc_list);	/* holds entries already visited in this call */
 +	struct zone *z;
 +
 +	spin_lock_irq(&rss->res.lock);
 +	src = &rss->page_list;
 +
 +	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
 +		pc = list_entry(src->prev, struct page_container, list);	/* take from the tail */
 +		page = pc->page;
 +		z = page_zone(page);
 +
 +		list_move(&pc->list, &pc_list);	/* park it so it is not visited twice */
 +
 +		spin_lock(&z->lru_lock);	/* nested inside rss->res.lock */
 +		if (PageLRU(page)) {
 +			if ((active && PageActive(page)) ||
 +					(!active && !PageActive(page))) {
 +				if (likely(get_page_unless_zero(page))) {	/* skip pages whose refcount is already zero */
 +					ClearPageLRU(page);
 +					nr_taken++;
 +					list_move(&page->lru, dst);
 +				}
 +			}
 +		}
 +		spin_unlock(&z->lru_lock);
 +	}
 +
 +	list_splice(&pc_list, src);	/* visited entries go back at the head of the list */
 +	spin_unlock_irq(&rss->res.lock);
 +
 +	*scanned = scan;	/* number of entries examined */
 +	return nr_taken;
 +}
 +
 +/* Report the container's current usage counter (read without locking). */
 +unsigned long container_nr_physpages(struct rss_container *rss)
 +{
 +	const struct res_counter *counter = &rss->res;
 +
 +	return counter->usage;
 +}
 +
 +/*
 + * Task-move callback: if @p's mm is still bound to the container the
 + * task is leaving, rebind it to the new one, moving the reference from
 + * the old container to the new.  Tasks without an mm, or whose mm is
 + * bound to some other container, are left alone.
 + */
 +static void rss_move_task(struct container_subsys *ss,
 +		struct container *cont,
 +		struct container *old_cont,
 +		struct task_struct *p)
 +{
 +	struct rss_container *to, *from;
 +	struct mm_struct *mm = get_task_mm(p);
 +
 +	if (!mm)
 +		return;
 +
 +	to = rss_from_cont(cont);
 +	from = rss_from_cont(old_cont);
 +
 +	if (mm->rss_container == from) {
 +		css_get_current(&to->css);
 +		rcu_assign_pointer(mm->rss_container, to);
 +		css_put(&from->css);
 +	}
 +
 +	mmput(mm);
 +}
 +
 +/*
 + * Allocate and initialize the RSS state for a new container and hook
 + * it into the container's subsystem slot.  Returns 0 or -ENOMEM.
 + */
 +static int rss_create(struct container_subsys *ss, struct container *cont)
 +{
 +	struct rss_container *fresh;
 +
 +	fresh = kzalloc(sizeof(*fresh), GFP_KERNEL);
 +	if (!fresh)
 +		return -ENOMEM;
 +
 +	INIT_LIST_HEAD(&fresh->page_list);
 +	res_counter_init(&fresh->res);
 +	cont->subsys[rss_subsys.subsys_id] = &fresh->css;
 +	return 0;
 +}
 +
 +/* Free the RSS state that belongs to @cont. */
 +static void rss_destroy(struct container_subsys *ss,
 +		struct container *cont)
 +{
 +	struct rss_container *rss = rss_from_cont(cont);
 +
 +	kfree(rss);
 +}
 +
 +
 +/*
 + * Control-file read handler: cft->private carries the RES_* selector
 + * of the counter field to show.
 + */
 +static ssize_t rss_read(struct container *cont, struct cftype *cft,
 +		struct file *file, char __user *userbuf,
 +		size_t nbytes, loff_t *ppos)
 +{
 +	struct rss_container *rss = rss_from_cont(cont);
 +
 +	return res_counter_read(&rss->res, cft->private,
 +			userbuf, nbytes, ppos);
 +}
 +
 +/*
 + * Control-file write handler: cft->private carries the RES_* selector
 + * of the counter field to update.
 + */
 +static ssize_t rss_write(struct container *cont, struct cftype *cft,
 +		struct file *file, const char __user *userbuf,
 +		size_t nbytes, loff_t *ppos)
 +{
 +	struct rss_container *rss = rss_from_cont(cont);
 +
 +	return res_counter_write(&rss->res, cft->private,
 +			userbuf, nbytes, ppos);
 +}
 +
 +
 +static struct cftype rss_usage = {	/* read-only: no .write handler */
 +	.name = "rss_usage",
 +	.private = RES_USAGE,	/* selector passed on to res_counter_read() */
 +	.read = rss_read,
 +};
 +
 +static struct cftype rss_limit = {	/* the only file with a .write handler */
 +	.name = "rss_limit",
 +	.private = RES_LIMIT,	/* selector passed on to res_counter_read()/res_counter_write() */
 +	.read = rss_read,
 +	.write = rss_write,
 +};
 +
 +static struct cftype rss_failcnt = {	/* read-only: no .write handler */
 +	.name = "rss_failcnt",
 +	.private = RES_FAILCNT,	/* selector passed on to res_counter_read() */
 +	.read = rss_read,
 +};
 +
 +/*
 + * Add the three RSS control files to @cont in the order usage,
 + * failcnt, limit.  Stops at and returns the first negative result;
 + * returns 0 when all three were added.
 + */
 +static int rss_populate(struct container_subsys *ss,
 +		struct container *cont)
 +{
 +	int err;
 +
 +	err = container_add_file(cont, &rss_usage);
 +	if (err < 0)
 +		return err;
 +
 +	err = container_add_file(cont, &rss_failcnt);
 +	if (err < 0)
 +		return err;
 +
 +	err = container_add_file(cont, &rss_limit);
 +	if (err < 0)
 +		return err;
 +
 +	return 0;
 +}
 +
 +static struct rss_container init_rss_container;
 +
 +static __init int rss_create_early(struct container_subsys *ss,
 +		struct container *cont)
 +{
 +	struct rss_container *rss;
 +
 +	rss = &init_rss_container;
 +	res_counter_init(&rss->res);
 +	INIT_LIST_HEAD(&rss->page_list);
 +	cont->subsys[rss_
...
 
 
 |  
	|  |  |  
	| 
		
			| [RFC][PATCH 3/7] Data structures changes for RSS accounting [message #10891 is a reply to message #10888] | Tue, 06 March 2007 14:55   |  
			| 
				
				
					|  xemul Messages: 248
 Registered: November 2005
 | Senior Member |  |  |  
	| Adds the needed pointers to mm_struct and struct page, places hooks in core code for mm_struct initialization
 and a hook in container_init_early() to preinitialize
 the RSS accounting subsystem.
 
 diff -upr linux-2.6.20.orig/include/linux/mm.h linux-2.6.20-0/include/linux/mm.h
 --- linux-2.6.20.orig/include/linux/mm.h	2007-02-04 21:44:54.000000000 +0300
 +++ linux-2.6.20-0/include/linux/mm.h	2007-03-06 13:33:28.000000000 +0300
 @@ -220,6 +220,12 @@ struct vm_operations_struct {
 struct mmu_gather;
 struct inode;
 
 +#ifdef CONFIG_RSS_CONTAINER
 +/*
 + * Per-page RSS accounting handle.  Expands to the struct member itself,
 + * so it can be assigned to.  The argument is parenthesized so that any
 + * pointer expression can be passed safely.
 + */
 +#define page_container(page)	((page)->rss_container)
 +#else
 +#define page_container(page)	(NULL)
 +#endif
 +
 #define page_private(page)		((page)->private)
 #define set_page_private(page, v)	((page)->private = (v))
 
 diff -upr linux-2.6.20.orig/include/linux/mm_types.h linux-2.6.20-0/include/linux/mm_types.h
 --- linux-2.6.20.orig/include/linux/mm_types.h	2007-02-04 21:44:54.000000000 +0300
 +++ linux-2.6.20-0/include/linux/mm_types.h	2007-03-06 13:33:28.000000000 +0300
 @@ -62,6 +62,9 @@ struct page {
 void *virtual;			/* Kernel virtual address (NULL if
 not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
 +#ifdef CONFIG_RSS_CONTAINER
 +	struct page_container *rss_container;
 +#endif
 };
 
 #endif /* _LINUX_MM_TYPES_H */
 diff -upr linux-2.6.20.orig/include/linux/sched.h linux-2.6.20-0/include/linux/sched.h
 --- linux-2.6.20.orig/include/linux/sched.h	2007-03-06 13:33:28.000000000 +0300
 +++ linux-2.6.20-0/include/linux/sched.h	2007-03-06 13:33:28.000000000 +0300
 @@ -373,6 +373,9 @@ struct mm_struct {
 /* aio bits */
 rwlock_t		ioctx_list_lock;
 struct kioctx		*ioctx_list;
 +#ifdef CONFIG_RSS_CONTAINER
 +	struct rss_container	*rss_container;
 +#endif
 };
 
 struct sighand_struct {
 diff -upr linux-2.6.20.orig/kernel/fork.c linux-2.6.20-0/kernel/fork.c
 --- linux-2.6.20.orig/kernel/fork.c	2007-03-06 13:33:28.000000000 +0300
 +++ linux-2.6.20-0/kernel/fork.c	2007-03-06 13:33:28.000000000 +0300
 @@ -57,6 +57,8 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
 +#include <linux/rss_container.h>
 +
 /*
 * Protected counters by write_lock_irq(&tasklist_lock)
 */
 @@ -325,7 +328,7 @@ static inline void mm_free_pgd(struct mm
 
 #include <linux/init_task.h>
 
 -static struct mm_struct * mm_init(struct mm_struct * mm)
 +static struct mm_struct * mm_init(struct mm_struct *mm, struct task_struct *tsk)
 {
 atomic_set(&mm->mm_users, 1);
 atomic_set(&mm->mm_count, 1);
 @@ -341,10 +344,18 @@ static struct mm_struct * mm_init(struct
 mm->free_area_cache = TASK_UNMAPPED_BASE;
 mm->cached_hole_size = ~0UL;
 
 -	if (likely(!mm_alloc_pgd(mm))) {
 -		mm->def_flags = 0;
 -		return mm;
 -	}
 +	if (unlikely(mm_init_container(mm, tsk)))
 +		goto out_cont;
 +
 +	if (unlikely(mm_alloc_pgd(mm)))
 +		goto out_pgd;
 +
 +	mm->def_flags = 0;
 +	return mm;
 +
 +out_pgd:
 +	mm_free_container(mm);
 +out_cont:
 free_mm(mm);
 return NULL;
 }
 @@ -359,7 +370,7 @@ struct mm_struct * mm_alloc(void)
 mm = allocate_mm();
 if (mm) {
 memset(mm, 0, sizeof(*mm));
 -		mm = mm_init(mm);
 +		mm = mm_init(mm, current);
 }
 return mm;
 }
 @@ -373,6 +384,7 @@ void fastcall __mmdrop(struct mm_struct
 {
 BUG_ON(mm == &init_mm);
 mm_free_pgd(mm);
 +	mm_free_container(mm);
 destroy_context(mm);
 free_mm(mm);
 }
 @@ -493,7 +505,7 @@ static struct mm_struct *dup_mm(struct t
 mm->token_priority = 0;
 mm->last_interval = 0;
 
 -	if (!mm_init(mm))
 +	if (!mm_init(mm, tsk))
 goto fail_nomem;
 
 if (init_new_context(tsk, mm))
 @@ -520,6 +532,7 @@ fail_nocontext:
 * because it calls destroy_context()
 */
 mm_free_pgd(mm);
 +	mm_free_container(mm);
 free_mm(mm);
 return NULL;
 }
 diff -upr linux-2.6.20.orig/kernel/container.c linux-2.6.20-0/kernel/container.c
 --- linux-2.6.20.orig/kernel/container.c	2007-03-06 13:33:28.000000000 +0300
 +++ linux-2.6.20-0/kernel/container.c	2007-03-06 13:35:48.000000000 +0300
 @@ -60,6 +60,8 @@
 #include <asm/atomic.h>
 #include <linux/mutex.h>
 
 +#include <linux/rss_container.h>
 +
 #define CONTAINER_SUPER_MAGIC		0x27e0eb
 
 static struct container_subsys *subsys[CONFIG_MAX_CONTAINER_SUBSYS];
 @@ -1721,6 +1725,8 @@ int __init container_init_early(void)
 }
 init_task.containers = &init_container_group;
 
 +	container_rss_init_early();
 +
 return 0;
 }
 |  
	|  |  |  
	| 
		
			| [RFC][PATCH 4/7] RSS accounting hooks over the code [message #10892 is a reply to message #10888] | Tue, 06 March 2007 14:57   |  
			| 
				
				
					|  xemul Messages: 248
 Registered: November 2005
 | Senior Member |  |  |  
	| Pages are charged to their first touchers, which are determined using the pages' mapcount manipulations in
 rmap calls.
 
 diff -upr linux-2.6.20.orig/fs/exec.c linux-2.6.20-0/fs/exec.c
 --- linux-2.6.20.orig/fs/exec.c	2007-02-04 21:44:54.000000000 +0300
 +++ linux-2.6.20-0/fs/exec.c	2007-03-06 13:33:28.000000000 +0300
 @@ -58,6 +58,8 @@
 #include <linux/kmod.h>
 #endif
 
 +#include <linux/rss_container.h>
 +
 int core_uses_pid;
 char core_pattern[128] = "core";
 int suid_dumpable = 0;
 @@ -309,27 +311,34 @@ void install_arg_page(struct vm_area_str
 struct mm_struct *mm = vma->vm_mm;
 pte_t * pte;
 spinlock_t *ptl;
 +	struct page_container *pcont;
 
 if (unlikely(anon_vma_prepare(vma)))
 goto out;
 
 +	if (container_rss_prepare(page, vma, &pcont))
 +		goto out;
 +
 flush_dcache_page(page);
 pte = get_locked_pte(mm, address, &ptl);
 if (!pte)
 -		goto out;
 +		goto out_release;
 if (!pte_none(*pte)) {
 pte_unmap_unlock(pte, ptl);
 -		goto out;
 +		goto out_release;
 }
 inc_mm_counter(mm, anon_rss);
 lru_cache_add_active(page);
 set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
 page, vma->vm_page_prot))));
 -	page_add_new_anon_rmap(page, vma, address);
 +	page_add_new_anon_rmap(page, vma, address, pcont);
 pte_unmap_unlock(pte, ptl);
 
 /* no need for flush_tlb */
 return;
 +
 +out_release:
 +	container_rss_release(pcont);
 out:
 __free_page(page);
 force_sig(SIGKILL, current);
 diff -upr linux-2.6.20.orig/include/linux/rmap.h linux-2.6.20-0/include/linux/rmap.h
 --- linux-2.6.20.orig/include/linux/rmap.h	2007-02-04 21:44:54.000000000 +0300
 +++ linux-2.6.20-0/include/linux/rmap.h	2007-03-06 13:33:28.000000000 +0300
 @@ -69,9 +69,13 @@ void __anon_vma_link(struct vm_area_stru
 /*
 * rmap interfaces called when adding or removing pte of page
 */
 -void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
 -void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
 -void page_add_file_rmap(struct page *);
 +struct page_container;
 +
 +void page_add_anon_rmap(struct page *, struct vm_area_struct *,
 +		unsigned long, struct page_container *);
 +void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
 +		unsigned long, struct page_container *);
 +void page_add_file_rmap(struct page *, struct page_container *);
 void page_remove_rmap(struct page *, struct vm_area_struct *);
 
 /**
 diff -upr linux-2.6.20.orig/mm/fremap.c linux-2.6.20-0/mm/fremap.c
 --- linux-2.6.20.orig/mm/fremap.c	2007-02-04 21:44:54.000000000 +0300
 +++ linux-2.6.20-0/mm/fremap.c	2007-03-06 13:33:28.000000000 +0300
 @@ -20,6 +20,8 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
 +#include <linux/rss_container.h>
 +
 static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 unsigned long addr, pte_t *ptep)
 {
 @@ -57,6 +59,10 @@ int install_page(struct mm_struct *mm, s
 pte_t *pte;
 pte_t pte_val;
 spinlock_t *ptl;
 +	struct page_container *pcont;
 +
 +	if (container_rss_prepare(page, vma, &pcont))
 +		goto out_release;
 
 pte = get_locked_pte(mm, addr, &ptl);
 if (!pte)
 @@ -81,13 +87,16 @@ int install_page(struct mm_struct *mm, s
 flush_icache_page(vma, page);
 pte_val = mk_pte(page, prot);
 set_pte_at(mm, addr, pte, pte_val);
 -	page_add_file_rmap(page);
 +	page_add_file_rmap(page, pcont);
 update_mmu_cache(vma, addr, pte_val);
 lazy_mmu_prot_update(pte_val);
 err = 0;
 unlock:
 pte_unmap_unlock(pte, ptl);
 out:
 +	if (err != 0)
 +		container_rss_release(pcont);
 +out_release:
 return err;
 }
 EXPORT_SYMBOL(install_page);
 diff -upr linux-2.6.20.orig/mm/memory.c linux-2.6.20-0/mm/memory.c
 --- linux-2.6.20.orig/mm/memory.c	2007-02-04 21:44:54.000000000 +0300
 +++ linux-2.6.20-0/mm/memory.c	2007-03-06 13:33:28.000000000 +0300
 @@ -60,6 +60,8 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 
 +#include <linux/rss_container.h>
 +
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
 @@ -1126,7 +1128,7 @@ static int zeromap_pte_range(struct mm_s
 break;
 }
 page_cache_get(page);
 -		page_add_file_rmap(page);
 +		page_add_file_rmap(page, NULL);
 inc_mm_counter(mm, file_rss);
 set_pte_at(mm, addr, pte, zero_pte);
 } while (pte++, addr += PAGE_SIZE, addr != end);
 @@ -1234,7 +1236,7 @@ static int insert_page(struct mm_struct
 /* Ok, finally just insert the thing.. */
 get_page(page);
 inc_mm_counter(mm, file_rss);
 -	page_add_file_rmap(page);
 +	page_add_file_rmap(page, NULL);
 set_pte_at(mm, addr, pte, mk_pte(page, prot));
 
 retval = 0;
 @@ -1495,6 +1497,7 @@ static int do_wp_page(struct mm_struct *
 pte_t entry;
 int reuse = 0, ret = VM_FAULT_MINOR;
 struct page *dirty_page = NULL;
 +	struct page_container *pcont;
 
 old_page = vm_normal_page(vma, address, orig_pte);
 if (!old_page)
 @@ -1580,6 +1583,9 @@ gotten:
 cow_user_page(new_page, old_page, address, vma);
 }
 
 +	if (container_rss_prepare(new_page, vma, &pcont))
 +		goto oom;
 +
 /*
 * Re-check the pte - we dropped the lock
 */
 @@ -1607,12 +1613,14 @@ gotten:
 set_pte_at(mm, address, page_table, entry);
 update_mmu_cache(vma, address, entry);
 lru_cache_add_active(new_page);
 -		page_add_new_anon_rmap(new_page, vma, address);
 +		page_add_new_anon_rmap(new_page, vma, address, pcont);
 
 /* Free the old page.. */
 new_page = old_page;
 ret |= VM_FAULT_WRITE;
 -	}
 +	} else
 +		container_rss_release(pcont);
 +
 if (new_page)
 page_cache_release(new_page);
 if (old_page)
 @@ -1988,6 +1996,7 @@ static int do_swap_page(struct mm_struct
 swp_entry_t entry;
 pte_t pte;
 int ret = VM_FAULT_MINOR;
 +	struct page_container *pcont;
 
 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 goto out;
 @@ -2020,6 +2029,11 @@ static int do_swap_page(struct mm_struct
 count_vm_event(PGMAJFAULT);
 }
 
 +	if (container_rss_prepare(page, vma, &pcont)) {
 +		ret = VM_FAULT_OOM;
 +		goto out;
 +	}
 +
 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 mark_page_accessed(page);
 lock_page(page);
 @@ -2033,6 +2047,7 @@ static int do_swap_page(struct mm_struct
 
 if (unlikely(!PageUptodate(page))) {
 ret = VM_FAULT_SIGBUS;
 +		container_rss_release(pcont);
 goto out_nomap;
 }
 
 @@ -2047,7 +2062,7 @@ static int do_swap_page(struct mm_struct
 
 flush_icache_page(vma, page);
 set_pte_at(mm, address, page_table, pte);
 -	page_add_anon_rmap(page, vma, address);
 +	page_add_anon_rmap(page, vma, address, pcont);
 
 swap_free(entry);
 if (vm_swap_full())
 @@ -2069,6 +2084,7 @@ unlock:
 out:
 return ret;
 out_nomap:
 +	container_rss_release(pcont);
 pte_unmap_unlock(page_table, ptl);
 unlock_page(page);
 page_cache_release(page);
 @@ -2087,6 +2103,7 @@ static int do_anonymous_page(struct mm_s
 struct page *page;
 spinlock_t *ptl;
 pte_t entry;
 +	struct page_container *pcont;
 
 if (write_access) {
 /* Allocate our own private page. */
 @@ -2098,15 +2115,19 @@ static int do_anonymous_page(struct mm_s
 if (!page)
 goto oom;
 
 +		if (container_rss_prepare(page, vma, &pcont))
 +			goto oom_release;
 +
 entry = mk_pte(page, vma->vm_page_prot);
 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 if (!pte_none(*page_table))
 -			goto release;
 +			goto release_container;
 +
 inc_mm_counter(mm, anon_rss);
 lru_cache_add_active(page);
 -		page_add_new_anon_rmap(page, vma, address);
 +		page_add_new_anon_rmap(page, vma, address, pcont);
 } else {
 /* Map the ZERO_PAGE - vm_page_prot is readonly */
 page = ZERO_PAGE(address);
 @@ -2118,7 +2139,7 @@ static int do_anonymous_page(struct mm_s
 if (!pte_none(*page_table))
 goto release;
 inc_mm_counter(mm, file_rss);
 -		page_add_file_rmap(page);
 +		page_add_file_rmap(page, NULL);
 }
 
 set_pte_at(mm, address, page_table, entry);
 @@ -2129,9 +2150,14 @@ static int do_anonymous_page(struct mm_s
 unlock:
 pte_unmap_unlock(page_table, ptl);
 return VM_FAULT_MINOR;
 +release_container:
 +	container_rss_release(pcont);
 release:
 page_cache_release(page);
 goto unlock;
 +
 +oom_release:
 +	page_cache_release(page);
 oom:
 return VM_FAULT_OOM;
 }
 @@ -2161,6 +2187,7 @@ static int do_no_page(struct mm_struct *
 int ret = VM_FAULT_MINOR;
 int anon = 0;
 struct page *dirty_page = NULL;
 +	struct page_container *pcont;
 
 pte_unmap(page_table);
 BUG_ON(vma->vm_flags & VM_PFNMAP);
 @@ -2218,6 +2245,9 @@ retry:
 }
 }
 
 +	if (container_rss_prepare(new_page, vma, &pcont))
 +		goto oom;
 +
 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 /*
 * For a file-backed vma, someone could have truncated or otherwise
 @@ -2226,6 +2256,7 @@ retry:
 */
 if (mapping && unlikely(sequence != mapping->truncate_count)) {
 pte_unmap_unlock(page_table, ptl);
 +		container_rss_release(pcont);
 page_cache_release(new_page);
 cond_resched();
 sequence = mapping->truncate_count;
 @@ -2253,10 +2284,10 @@ retry:
 if (anon) {
 inc_mm_counter(mm, anon_rss);
 lru_cache_add_active(new_page);
 -			page_add_new_anon_rmap(new_page, vma, address);
 +			page_add_new_anon_rmap(new_page, vma, address, pcont);
 } else {
 inc_mm_counter(mm, file_rss);
 -			page_add_file_rmap(new_page);
 +			page_add_file_rmap(new_page, pcont);
 if (write_access) {
 dirty_page = new_page;
 get_page(dirty_page);
 @@ -2264,6 +2295,7 @@ retry:
 }
 } else {
 /* One of our sibling threads was faster, back out. */
 +		container_rss_release(pcont);
 page_cache_release(new_page);
 goto unlock;
 }
 diff -upr linux-2.6.20.orig/mm/migrate.c linux-2.6.20-0/mm/migrate.c
 --- linux-2.6.20.orig/mm/migrate.c	2007-02-04 21:44:54.000000000 +0300
 +++ linux-2.6.20-0/mm/migrate.c	2007-03-06 13:33:28.000000000 +0300
 @@ -134,6 +134,7 @@ static void remove_migration_pte(struct
 pte_t *ptep, pte;
 spinlock_t *ptl;
 unsigned long addr = page_address_in_vma(new, vma);
 +	struct page_container *pcont;
 
 if (addr == -EFAULT)
 return;
 @@
...
 
 
 |  
	|  |  |  
	| 
		
			| [RFC][PATCH 5/7] Per-container OOM killer and page reclamation [message #10893 is a reply to message #10888] | Tue, 06 March 2007 15:01   |  
			| 
				
				
					|  xemul Messages: 248
 Registered: November 2005
 | Senior Member |  |  |  
	| * container_try_to_free_pages() walks the container's page list and tries to shrink pages. This is based
 on try_to_free_pages() and Co code.
 Called from core code when no resources are left at the
 moment of page touching.
 
 * container_out_of_memory() selects a process to be
 killed whose mm_struct belongs to the container in question.
 Called from core code when no resources are left and no
 pages were reclaimed.
 
 diff -upr linux-2.6.20.orig/mm/oom_kill.c linux-2.6.20-0/mm/oom_kill.c
 --- linux-2.6.20.orig/mm/oom_kill.c	2007-03-06 13:33:28.000000000 +0300
 +++ linux-2.6.20-0/mm/oom_kill.c	2007-03-06 13:33:28.000000000 +0300
 @@ -24,6 +24,7 @@
 #include <linux/cpuset.h>
 #include <linux/module.h>
 #include <linux/notifier.h>
 +#include <linux/rss_container.h>
 
 int sysctl_panic_on_oom;
 /* #define DEBUG */
 @@ -47,7 +48,8 @@ int sysctl_panic_on_oom;
 *    of least surprise ... (be careful when you change it)
 */
 
 -unsigned long badness(struct task_struct *p, unsigned long uptime)
 +unsigned long badness(struct task_struct *p, unsigned long uptime,
 +		struct rss_container *rss)
 {
 unsigned long points, cpu_time, run_time, s;
 struct mm_struct *mm;
 @@ -60,6 +62,13 @@ unsigned long badness(struct task_struct
 return 0;
 }
 
 +#ifdef CONFIG_RSS_CONTAINER
 +	if (rss != NULL && mm->rss_container != rss) {
 +		task_unlock(p);
 +		return 0;
 +	}
 +#endif
 +
 /*
 * The memory size of the process is the basis for the badness.
 */
 @@ -200,7 +209,8 @@ static inline int constrained_alloc(stru
 *
 * (not docbooked, we don't want this one cluttering up the manual)
 */
 -static struct task_struct *select_bad_process(unsigned long *ppoints)
 +static struct task_struct *select_bad_process(unsigned long *ppoints,
 +		struct rss_container *rss)
 {
 struct task_struct *g, *p;
 struct task_struct *chosen = NULL;
 @@ -254,7 +264,7 @@ static struct task_struct *select_bad_pr
 if (p->oomkilladj == OOM_DISABLE)
 continue;
 
 -		points = badness(p, uptime.tv_sec);
 +		points = badness(p, uptime.tv_sec, rss);
 if (points > *ppoints || !chosen) {
 chosen = p;
 *ppoints = points;
 @@ -435,7 +445,7 @@ retry:
 * Rambo mode: Shoot down a process and hope it solves whatever
 * issues we may have.
 */
 -		p = select_bad_process(&points);
 +		p = select_bad_process(&points, NULL);
 
 if (PTR_ERR(p) == -1UL)
 goto out;
 @@ -464,3 +474,27 @@ out:
 if (!test_thread_flag(TIF_MEMDIE))
 schedule_timeout_uninterruptible(1);
 }
 +
 +#ifdef CONFIG_RSS_CONTAINER
 +void container_out_of_memory(struct rss_container *rss)
 +{
 +	unsigned long points = 0;
 +	struct task_struct *p;
 +
 +	container_lock();
 +	read_lock(&tasklist_lock);
 +retry:
 +	p = select_bad_process(&points, rss);
 +	if (PTR_ERR(p) == -1UL)
 +		goto out;
 +
 +	if (!p)
 +		p = current;
 +
 +	if (oom_kill_process(p, points, "Container out of memory"))
 +		goto retry;
 +out:
 +	read_unlock(&tasklist_lock);
 +	container_unlock();
 +}
 +#endif
 diff -upr linux-2.6.20.orig/mm/vmscan.c linux-2.6.20-0/mm/vmscan.c
 --- linux-2.6.20.orig/mm/vmscan.c	2007-02-04 21:44:54.000000000 +0300
 +++ linux-2.6.20-0/mm/vmscan.c	2007-03-06 13:33:28.000000000 +0300
 @@ -45,6 +45,8 @@
 
 #include "internal.h"
 
 +#include <linux/rss_container.h>
 +
 struct scan_control {
 /* Incremented by the number of inactive pages that were scanned */
 unsigned long nr_scanned;
 @@ -1097,6 +1099,194 @@ out:
 return ret;
 }
 
 +#ifdef CONFIG_RSS_CONTAINER
 +/*
 + * These are the containers' inactive and active page shrinkers.
 + * They work like shrink_inactive_list() and shrink_active_list().
 + *
 + * The two main differences are that container_isolate_pages() is used to
 + * isolate pages, and that reclaim_mapped is considered to be 1, as hitting
 + * the BC limit implies we have to shrink _mapped_ pages.
 + */
 +static unsigned long container_shrink_pages_inactive(unsigned long max_scan,
 +		struct rss_container *rss, struct scan_control *sc)
 +{
 +	LIST_HEAD(page_list);
 +	unsigned long nr_scanned = 0;
 +	unsigned long nr_reclaimed = 0;
 +
 +	do {
 +		struct page *page;
 +		unsigned long nr_taken;
 +		unsigned long nr_scan;
 +		struct zone *z;
 +
 +		nr_taken = container_isolate_pages(sc->swap_cluster_max, rss,
 +				&page_list, 0, &nr_scan);
 +
 +		nr_scanned += nr_scan;
 +		nr_reclaimed += shrink_page_list(&page_list, sc);
 +		if (nr_taken == 0)
 +			goto done;
 +
 +		while (!list_empty(&page_list)) {
 +			page = lru_to_page(&page_list);
 +			z = page_zone(page);
 +
 +			spin_lock_irq(&z->lru_lock);
 +			VM_BUG_ON(PageLRU(page));
 +			SetPageLRU(page);
 +			list_del(&page->lru);
 +			if (PageActive(page))
 +				add_page_to_active_list(z, page);
 +			else
 +				add_page_to_inactive_list(z, page);
 +			spin_unlock_irq(&z->lru_lock);
 +
 +			put_page(page);
 +		}
 +  	} while (nr_scanned < max_scan);
 +done:
 +	return nr_reclaimed;
 +}
 +
 +static void container_shrink_pages_active(unsigned long nr_pages,
 +		struct rss_container *rss, struct scan_control *sc)
 +{
 +	LIST_HEAD(l_hold);
 +	LIST_HEAD(l_inactive);
 +	LIST_HEAD(l_active);
 +	struct page *page;
 +	unsigned long nr_scanned;
 +	unsigned long nr_deactivated = 0;
 +	struct zone *z;
 +
 +	container_isolate_pages(nr_pages, rss, &l_hold, 1, &nr_scanned);
 +
 +	while (!list_empty(&l_hold)) {
 +		cond_resched();
 +		page = lru_to_page(&l_hold);
 +		list_del(&page->lru);
 +		if (page_mapped(page)) {
 +			if ((total_swap_pages == 0 && PageAnon(page)) ||
 +			    page_referenced(page, 0)) {
 +				list_add(&page->lru, &l_active);
 +				continue;
 +			}
 +		}
 +		nr_deactivated++;
 +		list_add(&page->lru, &l_inactive);
 +	}
 +
 +	while (!list_empty(&l_inactive)) {
 +		page = lru_to_page(&l_inactive);
 +		z = page_zone(page);
 +
 +		spin_lock_irq(&z->lru_lock);
 +		VM_BUG_ON(PageLRU(page));
 +		SetPageLRU(page);
 +		VM_BUG_ON(!PageActive(page));
 +		ClearPageActive(page);
 +
 +		list_move(&page->lru, &z->inactive_list);
 +		z->nr_inactive++;
 +		spin_unlock_irq(&z->lru_lock);
 +
 +		put_page(page);
 +	}
 +
 +	while (!list_empty(&l_active)) {
 +		page = lru_to_page(&l_active);
 +		z = page_zone(page);
 +
 +		spin_lock_irq(&z->lru_lock);
 +		VM_BUG_ON(PageLRU(page));
 +		SetPageLRU(page);
 +		VM_BUG_ON(!PageActive(page));
 +		list_move(&page->lru, &z->active_list);
 +		z->nr_active++;
 +		spin_unlock_irq(&z->lru_lock);
 +
 +		put_page(page);
 +	}
 +}
 +
 +/*
 + * This is a reworked shrink_zone() routine - it scans active pages first,
 + * then inactive and returns the number of pages reclaimed
 + */
 +static unsigned long container_shrink_pages(int priority,
 +		struct rss_container *rss, struct scan_control *sc)
 +{
 +	unsigned long nr_pages;
 +	unsigned long nr_to_scan;
 +	unsigned long nr_reclaimed = 0;
 +
 +	nr_pages = (container_nr_physpages(rss) >> priority) + 1;
 +	if (nr_pages < sc->swap_cluster_max)
 +		nr_pages = 0;
 +
 +	while (nr_pages) {
 +		nr_to_scan = min(nr_pages, (unsigned long)sc->swap_cluster_max);
 +		nr_pages -= nr_to_scan;
 +		container_shrink_pages_active(nr_to_scan, rss, sc);
 +	}
 +
 +	nr_pages = (container_nr_physpages(rss) >> priority) + 1;
 +	if (nr_pages < sc->swap_cluster_max)
 +		nr_pages = 0;
 +
 +	while (nr_pages) {
 +		nr_to_scan = min(nr_pages, (unsigned long)sc->swap_cluster_max);
 +		nr_pages -= nr_to_scan;
 +		nr_reclaimed += container_shrink_pages_inactive(nr_to_scan, rss, sc);
 +	}
 +
 +	throttle_vm_writeout();
 +	return nr_reclaimed;
 +}
 +
 +/*
 + * This function works like try_to_free_pages() - it tries
 + * to shrink bc's pages with increasing priority
 + */
 +unsigned long container_try_to_free_pages(struct rss_container *rss)
 +{
 +	int priority;
 +	int ret = 0;
 +	unsigned long total_scanned = 0;
 +	unsigned long nr_reclaimed = 0;
 +	struct scan_control sc = {
 +		.gfp_mask = GFP_KERNEL,
 +		.may_writepage = !laptop_mode,
 +		.swap_cluster_max = SWAP_CLUSTER_MAX,
 +		.may_swap = 1,
 +		.swappiness = vm_swappiness,
 +	};
 +
 +	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
 +		sc.nr_scanned = 0;
 +		nr_reclaimed += container_shrink_pages(priority, rss, &sc);
 +		total_scanned += sc.nr_scanned;
 +		if (nr_reclaimed > 1) {
 +			ret = 1;
 +			goto out;
 +		}
 +
 +		if (total_scanned > sc.swap_cluster_max +
 +					sc.swap_cluster_max / 2) {
 +			wakeup_pdflush(laptop_mode ? 0 : total_scanned);
 +			sc.may_writepage = 1;
 +		}
 +
 +		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
 +			congestion_wait(WRITE, HZ/10);
 +	}
 +out:
 +	return ret;
 +}
 +#endif
 +
 /*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at pages_high.
...
 
 
 |  
	|  |  |  
	| 
		
			| [RFC][PATCH 6/7] Account for the number of tasks within container [message #10894 is a reply to message #10888] | Tue, 06 March 2007 15:02   |  
			| 
				
				
					|  xemul Messages: 248
 Registered: November 2005
 | Senior Member |  |  |  
	| Small and simple - each fork()/clone() is accounted for, and rejected when the limit is hit.
 
 diff -upr linux-2.6.20.orig/include/linux/numproc_container.h linux-2.6.20-0/include/linux/numproc_container.h
 --- linux-2.6.20.orig/include/linux/numproc_container.h	2007-03-06 13:39:17.000000000 +0300
 +++ linux-2.6.20-0/include/linux/numproc_container.h	2007-03-06 13:33:28.000000000 +0300
 @@ -0,0 +1,32 @@
 +#ifndef __NUMPROC_CONTAINER_H__
 +#define __NUMPROC_CONTAINER_H__
 +/*
 + * Numproc container
 + *
 + * Copyright 2007 OpenVZ SWsoft Inc
 + *
 + * Author: Pavel Emelianov <xemul@openvz.org>
 + *
 + */
 +
 +#ifdef CONFIG_PROCESS_CONTAINER
 +int container_proc_charge(struct task_struct *tsk);
 +void container_proc_uncharge(struct task_struct *tsk);
 +
 +void container_numproc_init_early(void);
 +#else
 +static inline int container_proc_charge(struct task_struct *tsk)
 +{
 +	return 0;
 +}
 +
 +static inline void container_proc_uncharge(struct task_struct *tsk)
 +{
 +}
 +
 +static inline void container_numproc_init_early(void)
 +{
 +}
 +#endif
 +
 +#endif
 diff -upr linux-2.6.20.orig/include/linux/sched.h linux-2.6.20-0/include/linux/sched.h
 --- linux-2.6.20.orig/include/linux/sched.h	2007-03-06 13:33:28.000000000 +0300
 +++ linux-2.6.20-0/include/linux/sched.h	2007-03-06 13:33:28.000000000 +0300
 @@ -1052,6 +1055,9 @@ struct task_struct {
 #ifdef CONFIG_FAULT_INJECTION
 int make_it_fail;
 #endif
 +#ifdef CONFIG_PROCESS_CONTAINER
 +	struct numproc_container *numproc_cnt;
 +#endif
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
 diff -upr linux-2.6.20.orig/init/Kconfig linux-2.6.20-0/init/Kconfig
 --- linux-2.6.20.orig/init/Kconfig	2007-03-06 13:33:28.000000000 +0300
 +++ linux-2.6.20-0/init/Kconfig	2007-03-06 13:33:28.000000000 +0300
 @@ -265,6 +265,12 @@ config CPUSETS
 Provides a simple Resource Controller for monitoring and
 controlling the total Resident Set Size of the tasks in a container
 
 +config PROCESS_CONTAINER
 +	bool "Numproc accounting container"
 +	select RESOURCE_COUNTERS
 +	help
 +	  Provides the-number-of-tasks accounting container
 +
 config SYSFS_DEPRECATED
 bool "Create deprecated sysfs files"
 default y
 diff -upr linux-2.6.20.orig/kernel/Makefile linux-2.6.20-0/kernel/Makefile
 --- linux-2.6.20.orig/kernel/Makefile	2007-03-06 13:33:28.000000000 +0300
 +++ linux-2.6.20-0/kernel/Makefile	2007-03-06 13:33:28.000000000 +0300
 @@ -51,6 +51,7 @@ obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
 +obj-$(CONFIG_PROCESS_CONTAINER) += numproc_container.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
 diff -upr linux-2.6.20.orig/kernel/exit.c linux-2.6.20-0/kernel/exit.c
 --- linux-2.6.20.orig/kernel/exit.c	2007-03-06 13:33:28.000000000 +0300
 +++ linux-2.6.20-0/kernel/exit.c	2007-03-06 13:33:28.000000000 +0300
 @@ -48,6 +48,8 @@
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
 
 +#include <linux/numproc_container.h>
 +
 extern void sem_exit (void);
 
 static void exit_mm(struct task_struct * tsk);
 @@ -174,6 +176,7 @@ repeat:
 write_unlock_irq(&tasklist_lock);
 proc_flush_task(p);
 release_thread(p);
 +	container_proc_uncharge(p);
 call_rcu(&p->rcu, delayed_put_task_struct);
 
 p = leader;
 diff -upr linux-2.6.20.orig/kernel/fork.c linux-2.6.20-0/kernel/fork.c
 --- linux-2.6.20.orig/kernel/fork.c	2007-03-06 13:33:28.000000000 +0300
 +++ linux-2.6.20-0/kernel/fork.c	2007-03-06 13:33:28.000000000 +0300
 @@ -57,6 +57,7 @@
 #include <asm/tlbflush.h>
 
 #include <linux/rss_container.h>
 +#include <linux/numproc_container.h>
 
 /*
 * Protected counters by write_lock_irq(&tasklist_lock)
 @@ -986,6 +999,9 @@ static struct task_struct *copy_process(
 if (!p)
 goto fork_out;
 
 +	if (container_proc_charge(p))
 +		goto charge_out;
 +
 rt_mutex_init_task(p);
 
 #ifdef CONFIG_TRACE_IRQFLAGS
 @@ -1302,6 +1318,8 @@ bad_fork_cleanup_count:
 atomic_dec(&p->user->processes);
 free_uid(p->user);
 bad_fork_free:
 +	container_proc_uncharge(p);
 +charge_out:
 free_task(p);
 fork_out:
 return ERR_PTR(retval);
 diff -upr linux-2.6.20.orig/kernel/numproc_container.c linux-2.6.20-0/kernel/numproc_container.c
 --- linux-2.6.20.orig/kernel/numproc_container.c	2007-03-06 13:39:17.000000000 +0300
 +++ linux-2.6.20-0/kernel/numproc_container.c	2007-03-06 13:33:28.000000000 +0300
 @@ -0,0 +1,151 @@
 +/*
 + * Numproc accounting container
 + *
 + * Copyright 2007 OpenVZ SWsoft Inc
 + *
 + * Author: Pavel Emelianov <xemul@openvz.org>
 + *
 + */
 +
 +#include <linux/list.h>
 +#include <linux/sched.h>
 +#include <linux/mm.h>
 +#include <linux/res_counter.h>
 +#include <linux/numproc_container.h>
 +
 +static struct container_subsys numproc_subsys;
 +
 +struct numproc_container {
 +	struct res_counter res;
 +	struct container_subsys_state css;
 +};
 +
 +static inline struct numproc_container *numproc_from_cont(struct container *cnt)
 +{
 +	return container_of(container_subsys_state(cnt, &numproc_subsys),
 +			struct numproc_container, css);
 +}
 +
 +int container_proc_charge(struct task_struct *new)
 +{
 +	struct numproc_container *np;
 +
 +	rcu_read_lock();
 +	np = numproc_from_cont(task_container(current, &numproc_subsys));
 +	css_get_current(&np->css);
 +	rcu_read_unlock();
 +
 +	if (res_counter_charge(&np->res, 1)) {
 +		css_put(&np->css);
 +		return -ENOMEM;
 +	}
 +
 +	new->numproc_cnt = np;
 +	return 0;
 +}
 +
 +void container_proc_uncharge(struct task_struct *tsk)
 +{
 +	struct numproc_container *np;
 +
 +	np = tsk->numproc_cnt;
 +	res_counter_uncharge(&np->res, 1);
 +	css_put(&np->css);
 +}
 +
 +static int numproc_create(struct container_subsys *ss, struct container *cont)
 +{
 +	struct numproc_container *np;
 +
 +	np = kzalloc(sizeof(struct numproc_container), GFP_KERNEL);
 +	if (np == NULL)
 +		return -ENOMEM;
 +
 +	res_counter_init(&np->res);
 +	cont->subsys[numproc_subsys.subsys_id] = &np->css;
 +	return 0;
 +}
 +
 +static void numproc_destroy(struct container_subsys *ss,
 +		struct container *cont)
 +{
 +	kfree(numproc_from_cont(cont));
 +}
 +
 +
 +static ssize_t numproc_read(struct container *cont, struct cftype *cft,
 +		struct file *file, char __user *userbuf,
 +		size_t nbytes, loff_t *ppos)
 +{
 +	return res_counter_read(&numproc_from_cont(cont)->res, cft->private,
 +			userbuf, nbytes, ppos);
 +}
 +
 +static ssize_t numproc_write(struct container *cont, struct cftype *cft,
 +		struct file *file, const char __user *userbuf,
 +		size_t nbytes, loff_t *ppos)
 +{
 +	return res_counter_write(&numproc_from_cont(cont)->res, cft->private,
 +			userbuf, nbytes, ppos);
 +}
 +
 +
 +static struct cftype numproc_usage = {
 +	.name = "numproc_usage",
 +	.private = RES_USAGE,
 +	.read = numproc_read,
 +};
 +
 +static struct cftype numproc_limit = {
 +	.name = "numproc_limit",
 +	.private = RES_LIMIT,
 +	.read = numproc_read,
 +	.write = numproc_write,
 +};
 +
 +static struct cftype numproc_failcnt = {
 +	.name = "numproc_failcnt",
 +	.private = RES_FAILCNT,
 +	.read = numproc_read,
 +};
 +
 +static int numproc_populate(struct container_subsys *ss,
 +		struct container *cont)
 +{
 +	int rc;
 +
 +	if ((rc = container_add_file(cont, &numproc_usage)) < 0)
 +		return rc;
 +	if ((rc = container_add_file(cont, &numproc_failcnt)) < 0)
 +		return rc;
 +	if ((rc = container_add_file(cont, &numproc_limit)) < 0)
 +		return rc;
 +
 +	return 0;
 +}
 +
 +static struct numproc_container init_numproc_container;
 +
 +static __init int numproc_create_early(struct container_subsys *ss,
 +		struct container *cont)
 +{
 +	struct numproc_container *np;
 +
 +	np = &init_numproc_container;
 +	res_counter_init(&np->res);
 +	cont->subsys[numproc_subsys.subsys_id] = &np->css;
 +	ss->create = numproc_create;
 +	return 0;
 +}
 +
 +static struct container_subsys numproc_subsys = {
 +	.name = "numproc",
 +	.create = numproc_create_early,
 +	.destroy = numproc_destroy,
 +	.populate = numproc_populate,
 +};
 +
 +void __init container_numproc_init_early(void)
 +{
 +	container_register_subsys(&numproc_subsys);
 +}
 diff -upr linux-2.6.20.orig/kernel/container.c linux-2.6.20-0/kernel/container.c
 --- linux-2.6.20.orig/kernel/container.c	2007-03-06 13:33:28.000000000 +0300
 +++ linux-2.6.20-0/kernel/container.c	2007-03-06 13:35:48.000000000 +0300
 @@ -60,6 +60,7 @@
 #include <linux/mutex.h>
 
 #include <linux/rss_container.h>
 +#include <linux/numproc_container.h>
 
 #define CONTAINER_SUPER_MAGIC		0x27e0eb
 
 @@ -1721,6 +1725,7 @@ int __init container_init_early(void)
 init_task.containers = &init_container_group;
 
 container_rss_init_early();
 +	container_numproc_init_early();
 
 return 0;
 }
...
 
 
 |  
	|  |  |  
	| 
		
			| [RFC][PATCH 7/7] Account for the number of files opened within container [message #10895 is a reply to message #10888] | Tue, 06 March 2007 15:05   |  
			| 
				
				
					|  xemul Messages: 248
 Registered: November 2005
 | Senior Member |  |  |  
	| Simple again - increment the usage counter at file open and decrement it at file close. Reject the open if the limit is hit.
 
 diff -upr linux-2.6.20.orig/fs/Makefile linux-2.6.20-0/fs/Makefile
 --- linux-2.6.20.orig/fs/Makefile	2007-02-04 21:44:54.000000000 +0300
 +++ linux-2.6.20-0/fs/Makefile	2007-03-06 13:33:28.000000000 +0300
 @@ -19,6 +19,8 @@ else
 obj-y +=	no-block.o
 endif
 
 +obj-$(CONFIG_FILES_CONTAINER)	+= numfiles_container.o
 +
 obj-$(CONFIG_INOTIFY)		+= inotify.o
 obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
 diff -upr linux-2.6.20.orig/fs/file_table.c linux-2.6.20-0/fs/file_table.c
 --- linux-2.6.20.orig/fs/file_table.c	2007-02-04 21:44:54.000000000 +0300
 +++ linux-2.6.20-0/fs/file_table.c	2007-03-06 13:33:28.000000000 +0300
 @@ -21,6 +21,7 @@
 #include <linux/fsnotify.h>
 #include <linux/sysctl.h>
 #include <linux/percpu_counter.h>
 +#include <linux/numfiles_container.h>
 
 #include <asm/atomic.h>
 
 @@ -42,6 +43,7 @@ static inline void file_free_rcu(struct
 
 static inline void file_free(struct file *f)
 {
 +	container_file_uncharge(f);
 percpu_counter_dec(&nr_files);
 call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
 }
 @@ -109,6 +111,10 @@ struct file *get_empty_filp(void)
 
 percpu_counter_inc(&nr_files);
 memset(f, 0, sizeof(*f));
 +
 +	if (container_file_charge(f))
 +		goto fail_charge;
 +
 if (security_file_alloc(f))
 goto fail_sec;
 
 @@ -132,7 +138,10 @@ over:
 goto fail;
 
 fail_sec:
 -	file_free(f);
 +	container_file_uncharge(f);
 +fail_charge:
 +	percpu_counter_dec(&nr_files);
 +	kmem_cache_free(filp_cachep, f);
 fail:
 return NULL;
 }
 diff -upr linux-2.6.20.orig/fs/numfiles_container.c linux-2.6.20-0/fs/numfiles_container.c
 --- linux-2.6.20.orig/fs/numfiles_container.c	2007-03-06 13:39:17.000000000 +0300
 +++ linux-2.6.20-0/fs/numfiles_container.c	2007-03-06 13:33:28.000000000 +0300
 @@ -0,0 +1,152 @@
 +/*
 + * Numfiles accounting container
 + *
 + * Copyright 2007 OpenVZ SWsoft Inc
 + *
 + * Author: Pavel Emelianov <xemul@openvz.org>
 + *
 + */
 +
 +#include <linux/list.h>
 +#include <linux/sched.h>
 +#include <linux/mm.h>
 +#include <linux/res_counter.h>
 +#include <linux/numfiles_container.h>
 +
 +static struct container_subsys numfiles_subsys;
 +
 +struct files_container {
 +	struct res_counter res;
 +	struct container_subsys_state css;
 +};
 +
 +static inline struct files_container *numfiles_from_cont(struct container *cnt)
 +{
 +	return container_of(container_subsys_state(cnt, &numfiles_subsys),
 +			struct files_container, css);
 +}
 +
 +int container_file_charge(struct file *file)
 +{
 +	struct files_container *fc;
 +
 +	rcu_read_lock();
 +	fc = numfiles_from_cont(task_container(current, &numfiles_subsys));
 +	css_get_current(&fc->css);
 +	rcu_read_unlock();
 +
 +	if (res_counter_charge(&fc->res, 1)) {
 +		css_put(&fc->css);
 +		return -ENOMEM;
 +	}
 +
 +	file->f_cont = fc;
 +	return 0;
 +}
 +
 +void container_file_uncharge(struct file *file)
 +{
 +	struct files_container *fc;
 +
 +	fc = file->f_cont;
 +	res_counter_uncharge(&fc->res, 1);
 +	css_put(&fc->css);
 +}
 +
 +static int numfiles_create(struct container_subsys *ss, struct container *cont)
 +{
 +	struct files_container *fc;
 +
 +	fc = kzalloc(sizeof(struct files_container), GFP_KERNEL);
 +	if (fc == NULL)
 +		return -ENOMEM;
 +
 +	res_counter_init(&fc->res);
 +	cont->subsys[numfiles_subsys.subsys_id] = &fc->css;
 +	return 0;
 +}
 +
 +static void numfiles_destroy(struct container_subsys *ss,
 +		struct container *cont)
 +{
 +	kfree(numfiles_from_cont(cont));
 +}
 +
 +
 +static ssize_t numfiles_read(struct container *cont, struct cftype *cft,
 +		struct file *file, char __user *userbuf,
 +		size_t nbytes, loff_t *ppos)
 +{
 +	return res_counter_read(&numfiles_from_cont(cont)->res, cft->private,
 +			userbuf, nbytes, ppos);
 +}
 +
 +static ssize_t numfiles_write(struct container *cont, struct cftype *cft,
 +		struct file *file, const char __user *userbuf,
 +		size_t nbytes, loff_t *ppos)
 +{
 +	return res_counter_write(&numfiles_from_cont(cont)->res, cft->private,
 +			userbuf, nbytes, ppos);
 +}
 +
 +
 +static struct cftype numfiles_usage = {
 +	.name = "numfiles_usage",
 +	.private = RES_USAGE,
 +	.read = numfiles_read,
 +};
 +
 +static struct cftype numfiles_limit = {
 +	.name = "numfiles_limit",
 +	.private = RES_LIMIT,
 +	.read = numfiles_read,
 +	.write = numfiles_write,
 +};
 +
 +static struct cftype numfiles_failcnt = {
 +	.name = "numfiles_failcnt",
 +	.private = RES_FAILCNT,
 +	.read = numfiles_read,
 +};
 +
 +static int numfiles_populate(struct container_subsys *ss,
 +		struct container *cont)
 +{
 +	int rc;
 +
 +	if ((rc = container_add_file(cont, &numfiles_usage)) < 0)
 +		return rc;
 +	if ((rc = container_add_file(cont, &numfiles_failcnt)) < 0)
 +		return rc;
 +	if ((rc = container_add_file(cont, &numfiles_limit)) < 0)
 +		return rc;
 +
 +	return 0;
 +}
 +
 +static struct files_container init_files_container;
 +
 +static __init int numfiles_create_early(struct container_subsys *ss,
 +		struct container *cont)
 +{
 +	struct files_container *np;
 +
 +	np = &init_files_container;
 +	res_counter_init(&np->res);
 +	cont->subsys[numfiles_subsys.subsys_id] = &np->css;
 +	ss->create = numfiles_create;
 +	return 0;
 +}
 +
 +static struct container_subsys numfiles_subsys = {
 +	.name = "numfiles",
 +	.create = numfiles_create_early,
 +	.destroy = numfiles_destroy,
 +	.populate = numfiles_populate,
 +};
 +
 +void __init container_numfiles_init_early(void)
 +{
 +	container_register_subsys(&numfiles_subsys);
 +}
 +
 diff -upr linux-2.6.20.orig/include/linux/fs.h linux-2.6.20-0/include/linux/fs.h
 --- linux-2.6.20.orig/include/linux/fs.h	2007-03-06 13:33:28.000000000 +0300
 +++ linux-2.6.20-0/include/linux/fs.h	2007-03-06 13:33:28.000000000 +0300
 @@ -739,6 +739,9 @@ struct file {
 spinlock_t		f_ep_lock;
 #endif /* #ifdef CONFIG_EPOLL */
 struct address_space	*f_mapping;
 +#ifdef CONFIG_FILES_CONTAINER
 +	struct files_container	*f_cont;
 +#endif
 };
 extern spinlock_t files_lock;
 #define file_list_lock() spin_lock(&files_lock);
 diff -upr linux-2.6.20.orig/include/linux/numfiles_container.h linux-2.6.20-0/include/linux/numfiles_container.h
 --- linux-2.6.20.orig/include/linux/numfiles_container.h	2007-03-06 13:39:17.000000000 +0300
 +++ linux-2.6.20-0/include/linux/numfiles_container.h	2007-03-06 13:33:28.000000000 +0300
 @@ -0,0 +1,33 @@
 +#ifndef __NUMFILES_CONTAINER_H__
 +#define __NUMFILES_CONTAINER_H__
 +/*
 + * Numfiles container
 + *
 + * Copyright 2007 OpenVZ SWsoft Inc
 + *
 + * Author: Pavel Emelianov <xemul@openvz.org>
 + *
 + */
 +
 +#ifdef CONFIG_FILES_CONTAINER
 +int container_file_charge(struct file *file);
 +void container_file_uncharge(struct file *file);
 +
 +void container_numfiles_init_early(void);
 +#else
 +static inline int container_file_charge(struct file *file)
 +{
 +	return 0;
 +}
 +
 +static inline void container_file_uncharge(struct file *file)
 +{
 +}
 +
 +static inline void container_numfiles_init_early(void)
 +{
 +}
 +#endif
 +
 +#endif
 +
 diff -upr linux-2.6.20.orig/init/Kconfig linux-2.6.20-0/init/Kconfig
 --- linux-2.6.20.orig/init/Kconfig	2007-03-06 13:33:28.000000000 +0300
 +++ linux-2.6.20-0/init/Kconfig	2007-03-06 13:33:28.000000000 +0300
 @@ -265,6 +265,12 @@ config CPUSETS
 help
 Provides the-number-of-tasks accounting container
 
 +config FILES_CONTAINER
 +	bool "Numfiles accounting container"
 +	select RESOURCE_COUNTERS
 +	help
 +	  Provides the-number-of-files accounting container
 +
 config SYSFS_DEPRECATED
 bool "Create deprecated sysfs files"
 default y
 diff -upr linux-2.6.20.orig/kernel/container.c linux-2.6.20-0/kernel/container.c
 --- linux-2.6.20.orig/kernel/container.c	2007-03-06 13:33:28.000000000 +0300
 +++ linux-2.6.20-0/kernel/container.c	2007-03-06 13:35:48.000000000 +0300
 @@ -60,6 +60,7 @@
 
 #include <linux/rss_container.h>
 #include <linux/numproc_container.h>
 +#include <linux/numfiles_container.h>
 
 #define CONTAINER_SUPER_MAGIC		0x27e0eb
 
 @@ -1721,6 +1725,7 @@ int __init container_init_early(void)
 
 container_rss_init_early();
 container_numproc_init_early();
 +	container_numfiles_init_early();
 
 return 0;
 }
...
 
 
 |  
	|  |  |  
	|  |  
	|  |  
	|  |  
	| 
		
			| Re: [RFC][PATCH 1/7] Resource counters [message #10908 is a reply to message #10889] | Wed, 07 March 2007 04:03   |  
			| 
				
				
					|  Balbir Singh Messages: 491
 Registered: August 2006
 | Senior Member |  |  |  
	| Pavel Emelianov wrote: > Introduce generic structures and routines for
 > resource accounting.
 >
 > Each resource accounting container is supposed to
 > aggregate it, container_subsystem_state and its
 > resource-specific members within.
 >
 >
 >  ------------------------------------------------------------ ------------
 >
 > diff -upr linux-2.6.20.orig/include/linux/res_counter.h linux-2.6.20-0/include/linux/res_counter.h
 > --- linux-2.6.20.orig/include/linux/res_counter.h	2007-03-06 13:39:17.000000000 +0300
 > +++ linux-2.6.20-0/include/linux/res_counter.h	2007-03-06 13:33:28.000000000 +0300
 > @@ -0,0 +1,83 @@
 > +#ifndef __RES_COUNTER_H__
 > +#define __RES_COUNTER_H__
 > +/*
 > + * resource counters
 > + *
 > + * Copyright 2007 OpenVZ SWsoft Inc
 > + *
 > + * Author: Pavel Emelianov <xemul@openvz.org>
 > + *
 > + */
 > +
 > +#include <linux/container.h>
 > +
 > +struct res_counter {
 > +	unsigned long usage;
 > +	unsigned long limit;
 > +	unsigned long failcnt;
 > +	spinlock_t lock;
 > +};
 > +
 > +enum {
 > +	RES_USAGE,
 > +	RES_LIMIT,
 > +	RES_FAILCNT,
 > +};
 > +
 > +ssize_t res_counter_read(struct res_counter *cnt, int member,
 > +		const char __user *buf, size_t nbytes, loff_t *pos);
 > +ssize_t res_counter_write(struct res_counter *cnt, int member,
 > +		const char __user *buf, size_t nbytes, loff_t *pos);
 > +
 > +static inline void res_counter_init(struct res_counter *cnt)
 > +{
 > +	spin_lock_init(&cnt->lock);
 > +	cnt->limit = (unsigned long)LONG_MAX;
 > +}
 > +
 
 Is there any way to indicate that there are no limits on this container?
 LONG_MAX is quite huge, but still, when the administrator wants to
 configure a container for *unlimited usage*, it becomes hard for
 the administrator.
 
 > +static inline int res_counter_charge_locked(struct res_counter *cnt,
 > +		unsigned long val)
 > +{
 > +	if (cnt->usage <= cnt->limit - val) {
 > +		cnt->usage += val;
 > +		return 0;
 > +	}
 > +
 > +	cnt->failcnt++;
 > +	return -ENOMEM;
 > +}
 > +
 > +static inline int res_counter_charge(struct res_counter *cnt,
 > +		unsigned long val)
 > +{
 > +	int ret;
 > +	unsigned long flags;
 > +
 > +	spin_lock_irqsave(&cnt->lock, flags);
 > +	ret = res_counter_charge_locked(cnt, val);
 > +	spin_unlock_irqrestore(&cnt->lock, flags);
 > +	return ret;
 > +}
 > +
 
 Will atomic counters help here?
 
 > +static inline void res_counter_uncharge_locked(struct res_counter *cnt,
 > +		unsigned long val)
 > +{
 > +	if (unlikely(cnt->usage < val)) {
 > +		WARN_ON(1);
 > +		val = cnt->usage;
 > +	}
 > +
 > +	cnt->usage -= val;
 > +}
 > +
 > +static inline void res_counter_uncharge(struct res_counter *cnt,
 > +		unsigned long val)
 > +{
 > +	unsigned long flags;
 > +
 > +	spin_lock_irqsave(&cnt->lock, flags);
 > +	res_counter_uncharge_locked(cnt, val);
 > +	spin_unlock_irqrestore(&cnt->lock, flags);
 > +}
 > +
 > +#endif
 > diff -upr linux-2.6.20.orig/init/Kconfig linux-2.6.20-0/init/Kconfig
 > --- linux-2.6.20.orig/init/Kconfig	2007-03-06 13:33:28.000000000 +0300
 > +++ linux-2.6.20-0/init/Kconfig	2007-03-06 13:33:28.000000000 +0300
 > @@ -265,6 +265,10 @@ config CPUSETS
 >
 >  	  Say N if unsure.
 >
 > +config RESOURCE_COUNTERS
 > +	bool
 > +	select CONTAINERS
 > +
 >  config SYSFS_DEPRECATED
 >  	bool "Create deprecated sysfs files"
 >  	default y
 > diff -upr linux-2.6.20.orig/kernel/Makefile linux-2.6.20-0/kernel/Makefile
 > --- linux-2.6.20.orig/kernel/Makefile	2007-03-06 13:33:28.000000000 +0300
 > +++ linux-2.6.20-0/kernel/Makefile	2007-03-06 13:33:28.000000000 +0300
 > @@ -51,6 +51,7 @@ obj-$(CONFIG_RELAY) += relay.o
 >  obj-$(CONFIG_UTS_NS) += utsname.o
 >  obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 >  obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 > +obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
 >
 >  ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 >  # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
 > diff -upr linux-2.6.20.orig/kernel/res_counter.c linux-2.6.20-0/kernel/res_counter.c
 > --- linux-2.6.20.orig/kernel/res_counter.c	2007-03-06 13:39:17.000000000 +0300
 > +++ linux-2.6.20-0/kernel/res_counter.c	2007-03-06 13:33:28.000000000 +0300
 > @@ -0,0 +1,72 @@
 > +/*
 > + * resource containers
 > + *
 > + * Copyright 2007 OpenVZ SWsoft Inc
 > + *
 > + * Author: Pavel Emelianov <xemul@openvz.org>
 > + *
 > + */
 > +
 > +#include <linux/parser.h>
 > +#include <linux/fs.h>
 > +#include <linux/res_counter.h>
 > +#include <asm/uaccess.h>
 > +
 > +static inline unsigned long *res_counter_member(struct res_counter *cnt, int member)
 > +{
 > +	switch (member) {
 > +	case RES_USAGE:
 > +		return &cnt->usage;
 > +	case RES_LIMIT:
 > +		return &cnt->limit;
 > +	case RES_FAILCNT:
 > +		return &cnt->failcnt;
 > +	};
 > +
 > +	BUG();
 > +	return NULL;
 > +}
 > +
 > +ssize_t res_counter_read(struct res_counter *cnt, int member,
 > +		const char __user *userbuf, size_t nbytes, loff_t *pos)
 > +{
 > +	unsigned long *val;
 > +	char buf[64], *s;
 > +
 > +	s = buf;
 > +	val = res_counter_member(cnt, member);
 > +	s += sprintf(s, "%lu\n", *val);
 > +	return simple_read_from_buffer((void __user *)userbuf, nbytes,
 > +			pos, buf, s - buf);
 > +}
 > +
 > +ssize_t res_counter_write(struct res_counter *cnt, int member,
 > +		const char __user *userbuf, size_t nbytes, loff_t *pos)
 > +{
 > +	int ret;
 > +	char *buf, *end;
 > +	unsigned long tmp, *val;
 > +
 > +	buf = kmalloc(nbytes + 1, GFP_KERNEL);
 > +	ret = -ENOMEM;
 > +	if (buf == NULL)
 > +		goto out;
 > +
 > +	buf[nbytes] = 0;
 > +	ret = -EFAULT;
 > +	if (copy_from_user(buf, userbuf, nbytes))
 > +		goto out_free;
 > +
 > +	ret = -EINVAL;
 > +	tmp = simple_strtoul(buf, &end, 10);
 > +	if (*end != '\0')
 > +		goto out_free;
 > +
 > +	val = res_counter_member(cnt, member);
 > +	*val = tmp;
 > +	ret = nbytes;
 > +out_free:
 > +	kfree(buf);
 > +out:
 > +	return ret;
 > +}
 >
 
 
 These bits look a little out of sync, with no users for these routines in
 this patch. Won't you get a compiler warning, compiling this bit alone?
 
 --
 Warm Regards,
 Balbir Singh
 Linux Technology Center
 IBM, ISTL
 |  
	|  |  |  
	| 
		
			| Re: [RFC][PATCH 2/7] RSS controller core [message #10909 is a reply to message #10890] | Wed, 07 March 2007 05:37   |  
			| 
				
				
					|  Balbir Singh Messages: 491
 Registered: August 2006
 | Senior Member |  |  |  
	| Pavel Emelianov wrote: > This includes setup of RSS container within generic
 > process containers, all the declarations used in RSS
 > accounting, and core code responsible for accounting.
 >
 >
 >  ------------------------------------------------------------ ------------
 >
 > diff -upr linux-2.6.20.orig/include/linux/rss_container.h linux-2.6.20-0/include/linux/rss_container.h
 > --- linux-2.6.20.orig/include/linux/rss_container.h	2007-03-06 13:39:17.000000000 +0300
 > +++ linux-2.6.20-0/include/linux/rss_container.h	2007-03-06 13:33:28.000000000 +0300
 > @@ -0,0 +1,68 @@
 > +#ifndef __RSS_CONTAINER_H__
 > +#define __RSS_CONTAINER_H__
 > +/*
 > + * RSS container
 > + *
 > + * Copyright 2007 OpenVZ SWsoft Inc
 > + *
 > + * Author: Pavel Emelianov <xemul@openvz.org>
 > + *
 > + */
 > +
 > +struct page_container;
 > +struct rss_container;
 > +
 > +#ifdef CONFIG_RSS_CONTAINER
 > +int container_rss_prepare(struct page *, struct vm_area_struct *vma,
 > +		struct page_container **);
 > +
 > +void container_rss_add(struct page_container *);
 > +void container_rss_del(struct page_container *);
 > +void container_rss_release(struct page_container *);
 > +
 > +int mm_init_container(struct mm_struct *mm, struct task_struct *tsk);
 > +void mm_free_container(struct mm_struct *mm);
 > +
 > +unsigned long container_isolate_pages(unsigned long nr_to_scan,
 > +		struct rss_container *rss, struct list_head *dst,
 > +		int active, unsigned long *scanned);
 > +unsigned long container_nr_physpages(struct rss_container *rss);
 > +
 > +unsigned long container_try_to_free_pages(struct rss_container *);
 > +void container_out_of_memory(struct rss_container *);
 > +
 > +void container_rss_init_early(void);
 > +#else
 > +static inline int container_rss_prepare(struct page *pg,
 > +		struct vm_area_struct *vma, struct page_container **pc)
 > +{
 > +	*pc = NULL; /* to make gcc happy */
 > +	return 0;
 > +}
 > +
 > +static inline void container_rss_add(struct page_container *pc)
 > +{
 > +}
 > +
 > +static inline void container_rss_del(struct page_container *pc)
 > +{
 > +}
 > +
 > +static inline void container_rss_release(struct page_container *pc)
 > +{
 > +}
 > +
 > +static inline int mm_init_container(struct mm_struct *mm, struct task_struct *t)
 > +{
 > +	return 0;
 > +}
 > +
 > +static inline void mm_free_container(struct mm_struct *mm)
 > +{
 > +}
 > +
 > +static inline void container_rss_init_early(void)
 > +{
 > +}
 > +#endif
 > +#endif
 > diff -upr linux-2.6.20.orig/init/Kconfig linux-2.6.20-0/init/Kconfig
 > --- linux-2.6.20.orig/init/Kconfig	2007-03-06 13:33:28.000000000 +0300
 > +++ linux-2.6.20-0/init/Kconfig	2007-03-06 13:33:28.000000000 +0300
 > @@ -265,6 +265,13 @@ config CPUSETS
 >  	bool
 >  	select CONTAINERS
 >
 > +config RSS_CONTAINER
 > +	bool "RSS accounting container"
 > +	select RESOURCE_COUNTERS
 > +	help
 > +	  Provides a simple Resource Controller for monitoring and
 > +	  controlling the total Resident Set Size of the tasks in a container
 > +
 
 The wording looks very familiar :-). It would be useful to add
 "The reclaim logic is now container aware: when the container goes over its
 limit, the page reclaimer reclaims pages belonging to this container. If we are
 unable to reclaim enough pages to satisfy the request, the process is
 killed with an out of memory warning"
 
 >  config SYSFS_DEPRECATED
 >  	bool "Create deprecated sysfs files"
 >  	default y
 > diff -upr linux-2.6.20.orig/mm/Makefile linux-2.6.20-0/mm/Makefile
 > --- linux-2.6.20.orig/mm/Makefile	2007-02-04 21:44:54.000000000 +0300
 > +++ linux-2.6.20-0/mm/Makefile	2007-03-06 13:33:28.000000000 +0300
 > @@ -29,3 +29,5 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_h
 >  obj-$(CONFIG_FS_XIP) += filemap_xip.o
 >  obj-$(CONFIG_MIGRATION) += migrate.o
 >  obj-$(CONFIG_SMP) += allocpercpu.o
 > +
 > +obj-$(CONFIG_RSS_CONTAINER) += rss_container.o
 > diff -upr linux-2.6.20.orig/mm/rss_container.c linux-2.6.20-0/mm/rss_container.c
 > --- linux-2.6.20.orig/mm/rss_container.c	2007-03-06 13:39:17.000000000 +0300
 > +++ linux-2.6.20-0/mm/rss_container.c	2007-03-06 13:33:28.000000000 +0300
 > @@ -0,0 +1,307 @@
 > +/*
 > + * RSS accounting container
 > + *
 > + * Copyright 2007 OpenVZ SWsoft Inc
 > + *
 > + * Author: Pavel Emelianov <xemul@openvz.org>
 > + *
 > + */
 > +
 > +#include <linux/list.h>
 > +#include <linux/sched.h>
 > +#include <linux/mm.h>
 > +#include <linux/res_counter.h>
 > +#include <linux/rss_container.h>
 > +
 > +static struct container_subsys rss_subsys;
 > +
 > +struct rss_container {
 > +	struct res_counter res;
 > +	struct list_head page_list;
 > +	struct container_subsys_state css;
 > +};
 > +
 > +struct page_container {
 > +	struct page *page;
 > +	struct rss_container *cnt;
 > +	struct list_head list;
 > +};
 > +
 
 Yes, this is what I was planning to get to -- a per-container LRU list.
 But you have just one list; don't you need active and inactive lists?
 When the global LRU is manipulated, shouldn't this list be updated as
 well, so that reclaim will pick the best pages?
 
 > +static inline struct rss_container *rss_from_cont(struct container *cnt)
 > +{
 > +	return container_of(container_subsys_state(cnt, &rss_subsys),
 > +			struct rss_container, css);
 > +}
 > +
 > +int mm_init_container(struct mm_struct *mm, struct task_struct *tsk)
 > +{
 > +	struct rss_container *cnt;
 > +
 > +	cnt = rss_from_cont(task_container(tsk, &rss_subsys));
 > +	if (css_get(&cnt->css))
 > +		return -EBUSY;
 > +
 > +	mm->rss_container = cnt;
 > +	return 0;
 > +}
 > +
 > +void mm_free_container(struct mm_struct *mm)
 > +{
 > +	css_put(&mm->rss_container->css);
 > +}
 > +
 > +int container_rss_prepare(struct page *page, struct vm_area_struct *vma,
 > +		struct page_container **ppc)
 > +{
 > +	struct rss_container *rss;
 > +	struct page_container *pc;
 > +
 > +	rcu_read_lock();
 > +	rss = rcu_dereference(vma->vm_mm->rss_container);
 > +	css_get_current(&rss->css);
 > +	rcu_read_unlock();
 > +
 > +	pc = kmalloc(sizeof(struct page_container), GFP_KERNEL);
 > +	if (pc == NULL)
 > +		goto out_nomem;
 > +
 > +	while (res_counter_charge(&rss->res, 1)) {
 > +		if (container_try_to_free_pages(rss))
 > +			continue;
 > +
 
 The return codes of the functions are a bit confusing; ideally
 container_try_to_free_pages() should return 0 on success. Also
 res_counter_charge() has a WARN_ON(1) if the limit is exceeded.
 The system administrator can figure out the details from failcnt, and
 I suspect that when the container is running close to its limit,
 dmesg will have too many WARNING messages.
 
 How much memory do you try to reclaim in container_try_to_free_pages()?
 With my patches, I was planning to export this knob to userspace with
 a default value. This will help the administrator decide how much
 of the working set/container LRU should be freed on reaching the limit.
 I cannot find the definition of container_try_to_free_pages() in
 this patch.
 
 
 
 > +		container_out_of_memory(rss);
 > +		if (test_thread_flag(TIF_MEMDIE))
 > +			goto out_charge;
 > +	}
 > +
 > +	pc->page = page;
 > +	pc->cnt = rss;
 > +	*ppc = pc;
 > +	return 0;
 > +
 > +out_charge:
 > +	kfree(pc);
 > +out_nomem:
 > +	css_put(&rss->css);
 > +	return -ENOMEM;
 > +}
 > +
 > +void container_rss_release(struct page_container *pc)
 > +{
 > +	struct rss_container *rss;
 > +
 > +	rss = pc->cnt;
 > +	res_counter_uncharge(&rss->res, 1);
 > +	css_put(&rss->css);
 > +	kfree(pc);
 > +}
 > +
 > +void container_rss_add(struct page_container *pc)
 > +{
 > +	struct page *pg;
 > +	struct rss_container *rss;
 > +
 > +	pg = pc->page;
 > +	rss = pc->cnt;
 > +
 > +	spin_lock(&rss->res.lock);
 > +	list_add(&pc->list, &rss->page_list);
 
 This is not good: it won't give us LRU behaviour, which is
 useful for determining which pages to free.
 
 > +	spin_unlock(&rss->res.lock);
 > +
 > +	page_container(pg) = pc;
 > +}
 > +
 > +void container_rss_del(struct page_container *pc)
 > +{
 > +	struct page *page;
 > +	struct rss_container *rss;
 > +
 > +	page = pc->page;
 > +	rss = pc->cnt;
 > +
 > +	spin_lock(&rss->res.lock);
 > +	list_del(&pc->list);
 > +	res_counter_uncharge_locked(&rss->res, 1);
 > +	spin_unlock(&rss->res.lock);
 > +
 > +	css_put(&rss->css);
 > +	kfree(pc);
 > +}
 > +
 > +unsigned long container_isolate_pages(unsigned long nr_to_scan,
 > +		struct rss_container *rss, struct list_head *dst,
 > +		int active, unsigned long *scanned)
 > +{
 > +	unsigned long nr_taken = 0;
 > +	struct page *page;
 > +	struct page_container *pc;
 > +	unsigned long scan;
 > +	struct list_head *src;
 > +	LIST_HEAD(pc_list);
 > +	struct zone *z;
 > +
 > +	spin_lock_irq(&rss->res.lock);
 > +	src = &rss->page_list;
 > +
 
 Which part of the working set are we pushing out? This looks like
 we are using FIFO to determine which pages to reclaim. This needs
 to be FIXED.
 
 > +	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
 > +		pc = list_entry(src->prev, struct page_container, list);
 > +		page = pc->page;
 > +		z = page_zone(page);
 > +
 > +		list_move(&pc->list, &pc_list);
 > +
 > +		spin_lock(&z->lru_lock);
 > +		if (PageLRU(page)) {
 > +			if ((active && PageActive(page)) ||
 > +					(!active && !PageActive(page))) {
 > +				if (likely(get_page_un
...
 
 
 |  
	|  |  |  
	|  |  
	|  |  
	| 
		
			| Re: [RFC][PATCH 1/7] Resource counters [message #10913 is a reply to message #10908] | Wed, 07 March 2007 07:17   |  
			| 
				
				
					|  xemul Messages: 248
 Registered: November 2005
 | Senior Member |  |  |  
	| Balbir Singh wrote: > Pavel Emelianov wrote:
 >> Introduce generic structures and routines for
 >> resource accounting.
 >>
 >> Each resource accounting container is supposed to
 >> aggregate it, container_subsystem_state and its
 >> resource-specific members within.
 >>
 >>
 >>  ------------------------------------------------------------ ------------
 >>
 >> diff -upr linux-2.6.20.orig/include/linux/res_counter.h
 >> linux-2.6.20-0/include/linux/res_counter.h
 >> --- linux-2.6.20.orig/include/linux/res_counter.h    2007-03-06
 >> 13:39:17.000000000 +0300
 >> +++ linux-2.6.20-0/include/linux/res_counter.h    2007-03-06
 >> 13:33:28.000000000 +0300
 >> @@ -0,0 +1,83 @@
 >> +#ifndef __RES_COUNTER_H__
 >> +#define __RES_COUNTER_H__
 >> +/*
 >> + * resource counters
 >> + *
 >> + * Copyright 2007 OpenVZ SWsoft Inc
 >> + *
 >> + * Author: Pavel Emelianov <xemul@openvz.org>
 >> + *
 >> + */
 >> +
 >> +#include <linux/container.h>
 >> +
 >> +struct res_counter {
 >> +    unsigned long usage;
 >> +    unsigned long limit;
 >> +    unsigned long failcnt;
 >> +    spinlock_t lock;
 >> +};
 >> +
 >> +enum {
 >> +    RES_USAGE,
 >> +    RES_LIMIT,
 >> +    RES_FAILCNT,
 >> +};
 >> +
 >> +ssize_t res_counter_read(struct res_counter *cnt, int member,
 >> +        const char __user *buf, size_t nbytes, loff_t *pos);
 >> +ssize_t res_counter_write(struct res_counter *cnt, int member,
 >> +        const char __user *buf, size_t nbytes, loff_t *pos);
 >> +
 >> +static inline void res_counter_init(struct res_counter *cnt)
 >> +{
 >> +    spin_lock_init(&cnt->lock);
 >> +    cnt->limit = (unsigned long)LONG_MAX;
 >> +}
 >> +
 >
 > Is there any way to indicate that there are no limits on this container.
 
 Yes - LONG_MAX is essentially a "no limit" value as no
 container will ever have that many files :)
 
 > LONG_MAX is quite huge, but still when the administrator wants to
 > configure a container to *un-limited usage*, it becomes hard for
 > the administrator.
 >
 >> +static inline int res_counter_charge_locked(struct res_counter *cnt,
 >> +        unsigned long val)
 >> +{
 >> +    if (cnt->usage <= cnt->limit - val) {
 >> +        cnt->usage += val;
 >> +        return 0;
 >> +    }
 >> +
 >> +    cnt->failcnt++;
 >> +    return -ENOMEM;
 >> +}
 >> +
 >> +static inline int res_counter_charge(struct res_counter *cnt,
 >> +        unsigned long val)
 >> +{
 >> +    int ret;
 >> +    unsigned long flags;
 >> +
 >> +    spin_lock_irqsave(&cnt->lock, flags);
 >> +    ret = res_counter_charge_locked(cnt, val);
 >> +    spin_unlock_irqrestore(&cnt->lock, flags);
 >> +    return ret;
 >> +}
 >> +
 >
 > Will atomic counters help here.
 
 I'm afraid not. We have to atomically check the limit and alter
 either usage or failcnt depending on the result of the check. Doing
 this with atomic_xxx ops will require at least two ops.
 
 If we remove failcnt this would look like
 while (atomic_cmpxchg(...))
 which is also not that good.
 
 Moreover - in the RSS accounting patches I perform page list
 manipulations under this lock, so this also saves one atomic op.
 
 >> +static inline void res_counter_uncharge_locked(struct res_counter *cnt,
 >> +        unsigned long val)
 >> +{
 >> +    if (unlikely(cnt->usage < val)) {
 >> +        WARN_ON(1);
 >> +        val = cnt->usage;
 >> +    }
 >> +
 >> +    cnt->usage -= val;
 >> +}
 >> +
 >> +static inline void res_counter_uncharge(struct res_counter *cnt,
 >> +        unsigned long val)
 >> +{
 >> +    unsigned long flags;
 >> +
 >> +    spin_lock_irqsave(&cnt->lock, flags);
 >> +    res_counter_uncharge_locked(cnt, val);
 >> +    spin_unlock_irqrestore(&cnt->lock, flags);
 >> +}
 >> +
 >> +#endif
 >> diff -upr linux-2.6.20.orig/init/Kconfig linux-2.6.20-0/init/Kconfig
 >> --- linux-2.6.20.orig/init/Kconfig    2007-03-06 13:33:28.000000000 +0300
 >> +++ linux-2.6.20-0/init/Kconfig    2007-03-06 13:33:28.000000000 +0300
 >> @@ -265,6 +265,10 @@ config CPUSETS
 >>
 >>        Say N if unsure.
 >>
 >> +config RESOURCE_COUNTERS
 >> +    bool
 >> +    select CONTAINERS
 >> +
 >>  config SYSFS_DEPRECATED
 >>      bool "Create deprecated sysfs files"
 >>      default y
 >> diff -upr linux-2.6.20.orig/kernel/Makefile
 >> linux-2.6.20-0/kernel/Makefile
 >> --- linux-2.6.20.orig/kernel/Makefile    2007-03-06 13:33:28.000000000
 >> +0300
 >> +++ linux-2.6.20-0/kernel/Makefile    2007-03-06 13:33:28.000000000 +0300
 >> @@ -51,6 +51,7 @@ obj-$(CONFIG_RELAY) += relay.o
 >>  obj-$(CONFIG_UTS_NS) += utsname.o
 >>  obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 >>  obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 >> +obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
 >>
 >>  ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 >>  # According to Alan Modra <alan@linuxcare.com.au>, the
 >> -fno-omit-frame-pointer is
 >> diff -upr linux-2.6.20.orig/kernel/res_counter.c
 >> linux-2.6.20-0/kernel/res_counter.c
 >> --- linux-2.6.20.orig/kernel/res_counter.c    2007-03-06
 >> 13:39:17.000000000 +0300
 >> +++ linux-2.6.20-0/kernel/res_counter.c    2007-03-06
 >> 13:33:28.000000000 +0300
 >> @@ -0,0 +1,72 @@
 >> +/*
 >> + * resource containers
 >> + *
 >> + * Copyright 2007 OpenVZ SWsoft Inc
 >> + *
 >> + * Author: Pavel Emelianov <xemul@openvz.org>
 >> + *
 >> + */
 >> +
 >> +#include <linux/parser.h>
 >> +#include <linux/fs.h>
 >> +#include <linux/res_counter.h>
 >> +#include <asm/uaccess.h>
 >> +
 >> +static inline unsigned long *res_counter_member(struct res_counter
 >> *cnt, int member)
 >> +{
 >> +    switch (member) {
 >> +    case RES_USAGE:
 >> +        return &cnt->usage;
 >> +    case RES_LIMIT:
 >> +        return &cnt->limit;
 >> +    case RES_FAILCNT:
 >> +        return &cnt->failcnt;
 >> +    };
 >> +
 >> +    BUG();
 >> +    return NULL;
 >> +}
 >> +
 >> +ssize_t res_counter_read(struct res_counter *cnt, int member,
 >> +        const char __user *userbuf, size_t nbytes, loff_t *pos)
 >> +{
 >> +    unsigned long *val;
 >> +    char buf[64], *s;
 >> +
 >> +    s = buf;
 >> +    val = res_counter_member(cnt, member);
 >> +    s += sprintf(s, "%lu\n", *val);
 >> +    return simple_read_from_buffer((void __user *)userbuf, nbytes,
 >> +            pos, buf, s - buf);
 >> +}
 >> +
 >> +ssize_t res_counter_write(struct res_counter *cnt, int member,
 >> +        const char __user *userbuf, size_t nbytes, loff_t *pos)
 >> +{
 >> +    int ret;
 >> +    char *buf, *end;
 >> +    unsigned long tmp, *val;
 >> +
 >> +    buf = kmalloc(nbytes + 1, GFP_KERNEL);
 >> +    ret = -ENOMEM;
 >> +    if (buf == NULL)
 >> +        goto out;
 >> +
 >> +    buf[nbytes] = 0;
 >> +    ret = -EFAULT;
 >> +    if (copy_from_user(buf, userbuf, nbytes))
 >> +        goto out_free;
 >> +
 >> +    ret = -EINVAL;
 >> +    tmp = simple_strtoul(buf, &end, 10);
 >> +    if (*end != '\0')
 >> +        goto out_free;
 >> +
 >> +    val = res_counter_member(cnt, member);
 >> +    *val = tmp;
 >> +    ret = nbytes;
 >> +out_free:
 >> +    kfree(buf);
 >> +out:
 >> +    return ret;
 >> +}
 >>
 >
 >
 > These bits look a little out of sync, with no users for these routines in
 > this patch. Won't you get a compiler warning, compiling this bit alone?
 >
 
 Nope - when you have a non-static function without users in a
 file, no compiler warning is produced.
...
 
 
 |  
	|  |  |  
	| 
		
			| Re: [RFC][PATCH 2/7] RSS controller core [message #10914 is a reply to message #10909] | Wed, 07 March 2007 07:25   |  
			| 
				
				
					|  xemul Messages: 248
 Registered: November 2005
 | Senior Member |  |  |  
	| Balbir Singh wrote: > Pavel Emelianov wrote:
 >> This includes setup of RSS container within generic
 >> process containers, all the declarations used in RSS
 >> accounting, and core code responsible for accounting.
 >>
 >>
 >>  ------------------------------------------------------------ ------------
 >>
 >> diff -upr linux-2.6.20.orig/include/linux/rss_container.h
 >> linux-2.6.20-0/include/linux/rss_container.h
 >> --- linux-2.6.20.orig/include/linux/rss_container.h    2007-03-06
 >> 13:39:17.000000000 +0300
 >> +++ linux-2.6.20-0/include/linux/rss_container.h    2007-03-06
 >> 13:33:28.000000000 +0300
 >> @@ -0,0 +1,68 @@
 >> +#ifndef __RSS_CONTAINER_H__
 >> +#define __RSS_CONTAINER_H__
 >> +/*
 >> + * RSS container
 >> + *
 >> + * Copyright 2007 OpenVZ SWsoft Inc
 >> + *
 >> + * Author: Pavel Emelianov <xemul@openvz.org>
 >> + *
 >> + */
 >> +
 >> +struct page_container;
 >> +struct rss_container;
 >> +
 >> +#ifdef CONFIG_RSS_CONTAINER
 >> +int container_rss_prepare(struct page *, struct vm_area_struct *vma,
 >> +        struct page_container **);
 >> +
 >> +void container_rss_add(struct page_container *);
 >> +void container_rss_del(struct page_container *);
 >> +void container_rss_release(struct page_container *);
 >> +
 >> +int mm_init_container(struct mm_struct *mm, struct task_struct *tsk);
 >> +void mm_free_container(struct mm_struct *mm);
 >> +
 >> +unsigned long container_isolate_pages(unsigned long nr_to_scan,
 >> +        struct rss_container *rss, struct list_head *dst,
 >> +        int active, unsigned long *scanned);
 >> +unsigned long container_nr_physpages(struct rss_container *rss);
 >> +
 >> +unsigned long container_try_to_free_pages(struct rss_container *);
 >> +void container_out_of_memory(struct rss_container *);
 >> +
 >> +void container_rss_init_early(void);
 >> +#else
 >> +static inline int container_rss_prepare(struct page *pg,
 >> +        struct vm_area_struct *vma, struct page_container **pc)
 >> +{
 >> +    *pc = NULL; /* to make gcc happy */
 >> +    return 0;
 >> +}
 >> +
 >> +static inline void container_rss_add(struct page_container *pc)
 >> +{
 >> +}
 >> +
 >> +static inline void container_rss_del(struct page_container *pc)
 >> +{
 >> +}
 >> +
 >> +static inline void container_rss_release(struct page_container *pc)
 >> +{
 >> +}
 >> +
 >> +static inline int mm_init_container(struct mm_struct *mm, struct
 >> task_struct *t)
 >> +{
 >> +    return 0;
 >> +}
 >> +
 >> +static inline void mm_free_container(struct mm_struct *mm)
 >> +{
 >> +}
 >> +
 >> +static inline void container_rss_init_early(void)
 >> +{
 >> +}
 >> +#endif
 >> +#endif
 >> diff -upr linux-2.6.20.orig/init/Kconfig linux-2.6.20-0/init/Kconfig
 >> --- linux-2.6.20.orig/init/Kconfig    2007-03-06 13:33:28.000000000 +0300
 >> +++ linux-2.6.20-0/init/Kconfig    2007-03-06 13:33:28.000000000 +0300
 >> @@ -265,6 +265,13 @@ config CPUSETS
 >>      bool
 >>      select CONTAINERS
 >>
 >> +config RSS_CONTAINER
 >> +    bool "RSS accounting container"
 >> +    select RESOURCE_COUNTERS
 >> +    help
 >> +      Provides a simple Resource Controller for monitoring and
 >> +      controlling the total Resident Set Size of the tasks in a
 >> container
 >> +
 >
 > The wording looks very familiar :-). It would be useful to add
 > "The reclaim logic is now container aware, when the container goes
 > overlimit
 > the page reclaimer reclaims pages belonging to this container. If we are
 > unable to reclaim enough pages to satisfy the request, the process is
 > killed with an out of memory warning"
 
 OK. Thanks.
 
 >
 >>  config SYSFS_DEPRECATED
 >>      bool "Create deprecated sysfs files"
 >>      default y
 >> diff -upr linux-2.6.20.orig/mm/Makefile linux-2.6.20-0/mm/Makefile
 >> --- linux-2.6.20.orig/mm/Makefile    2007-02-04 21:44:54.000000000 +0300
 >> +++ linux-2.6.20-0/mm/Makefile    2007-03-06 13:33:28.000000000 +0300
 >> @@ -29,3 +29,5 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_h
 >>  obj-$(CONFIG_FS_XIP) += filemap_xip.o
 >>  obj-$(CONFIG_MIGRATION) += migrate.o
 >>  obj-$(CONFIG_SMP) += allocpercpu.o
 >> +
 >> +obj-$(CONFIG_RSS_CONTAINER) += rss_container.o
 >> diff -upr linux-2.6.20.orig/mm/rss_container.c
 >> linux-2.6.20-0/mm/rss_container.c
 >> --- linux-2.6.20.orig/mm/rss_container.c    2007-03-06
 >> 13:39:17.000000000 +0300
 >> +++ linux-2.6.20-0/mm/rss_container.c    2007-03-06 13:33:28.000000000
 >> +0300
 >> @@ -0,0 +1,307 @@
 >> +/*
 >> + * RSS accounting container
 >> + *
 >> + * Copyright 2007 OpenVZ SWsoft Inc
 >> + *
 >> + * Author: Pavel Emelianov <xemul@openvz.org>
 >> + *
 >> + */
 >> +
 >> +#include <linux/list.h>
 >> +#include <linux/sched.h>
 >> +#include <linux/mm.h>
 >> +#include <linux/res_counter.h>
 >> +#include <linux/rss_container.h>
 >> +
 >> +static struct container_subsys rss_subsys;
 >> +
 >> +struct rss_container {
 >> +    struct res_counter res;
 >> +    struct list_head page_list;
 >> +    struct container_subsys_state css;
 >> +};
 >> +
 >> +struct page_container {
 >> +    struct page *page;
 >> +    struct rss_container *cnt;
 >> +    struct list_head list;
 >> +};
 >> +
 >
 > Yes, this is what I was planning to get to -- a per container LRU list.
 > But you have just one list, don't you need active and inactive lists?
 > When the global LRU is manipulated, shouldn't this list be updated as
 > well, so that reclaim will pick the best pages.
 >
 >> +static inline struct rss_container *rss_from_cont(struct container *cnt)
 >> +{
 >> +    return container_of(container_subsys_state(cnt, &rss_subsys),
 >> +            struct rss_container, css);
 >> +}
 >> +
 >> +int mm_init_container(struct mm_struct *mm, struct task_struct *tsk)
 >> +{
 >> +    struct rss_container *cnt;
 >> +
 >> +    cnt = rss_from_cont(task_container(tsk, &rss_subsys));
 >> +    if (css_get(&cnt->css))
 >> +        return -EBUSY;
 >> +
 >> +    mm->rss_container = cnt;
 >> +    return 0;
 >> +}
 >> +
 >> +void mm_free_container(struct mm_struct *mm)
 >> +{
 >> +    css_put(&mm->rss_container->css);
 >> +}
 >> +
 >> +int container_rss_prepare(struct page *page, struct vm_area_struct *vma,
 >> +        struct page_container **ppc)
 >> +{
 >> +    struct rss_container *rss;
 >> +    struct page_container *pc;
 >> +
 >> +    rcu_read_lock();
 >> +    rss = rcu_dereference(vma->vm_mm->rss_container);
 >> +    css_get_current(&rss->css);
 >> +    rcu_read_unlock();
 >> +
 >> +    pc = kmalloc(sizeof(struct page_container), GFP_KERNEL);
 >> +    if (pc == NULL)
 >> +        goto out_nomem;
 >> +
 >> +    while (res_counter_charge(&rss->res, 1)) {
 >> +        if (container_try_to_free_pages(rss))
 >> +            continue;
 >> +
 >
 > The return codes of the functions is a bit confusing, ideally
 > container_try_to_free_pages() should return 0 on success. Also
 
 This returns exactly what try_to_free_pages() does.
 
 > res_counter_charge() has a WARN_ON(1) if the limit is exceeded.
 
 Nope - it is res_counter_uncharge() that has it - this is an absolutely
 sane check that we haven't over-uncharged resources.
 
 > The system administrator can figure out the details from failcnt,
 > I suspect when the container is running close to it's limit,
 > dmesg will have too many WARNING messages.
 >
 > How much memory do you try to reclaim in container_try_to_free_pages()?
 
 At least one page. This is enough to charge one page.
 That's the difference from the general try_to_free_pages(), which
 returns success if it freed at least swap_cluster_max pages.
 
 > With my patches, I was planning to export this knob to userspace with
 > a default value. This will help the administrator decide how much
 > of the working set/container LRU should be freed on reaching the limit.
 > I cannot find the definition of container_try_to_free_pages() in
 > this patch.
 
 This is in patch #5.
 Sorry for such a bad split - I'll make it cleaner next time :)
 
 >
 >
 >> +        container_out_of_memory(rss);
 >> +        if (test_thread_flag(TIF_MEMDIE))
 >> +            goto out_charge;
 >> +    }
 >> +
 >> +    pc->page = page;
 >> +    pc->cnt = rss;
 >> +    *ppc = pc;
 >> +    return 0;
 >> +
 >> +out_charge:
 >> +    kfree(pc);
 >> +out_nomem:
 >> +    css_put(&rss->css);
 >> +    return -ENOMEM;
 >> +}
 >> +
 >> +void container_rss_release(struct page_container *pc)
 >> +{
 >> +    struct rss_container *rss;
 >> +
 >> +    rss = pc->cnt;
 >> +    res_counter_uncharge(&rss->res, 1);
 >> +    css_put(&rss->css);
 >> +    kfree(pc);
 >> +}
 >> +
 >> +void container_rss_add(struct page_container *pc)
 >> +{
 >> +    struct page *pg;
 >> +    struct rss_container *rss;
 >> +
 >> +    pg = pc->page;
 >> +    rss = pc->cnt;
 >> +
 >> +    spin_lock(&r
...
 
 
 |  
	|  |  |  
	|  |  
	|  |  
	|  |  
	| 
		
			| Re: [RFC][PATCH 6/7] Account for the number of tasks within container [message #10943 is a reply to message #10912] | Thu, 08 March 2007 13:49   |  
			| 
				
				
					|  Paul Menage Messages: 642
 Registered: September 2006
 | Senior Member |  |  |  
	| On 3/6/07, Pavel Emelianov <xemul@sw.ru> wrote: > The idea is:
 >
 > Task may be "the entity that allocates the resources" and "the
 > entity that is a resource allocated".
 >
 > When task is the first entity it may move across containers
 > (that is implemented in your patches). When task is a resource
 > it shouldn't move across containers like files or pages do.
 >
 > More generally - allocated resources hold reference to original
 > container till they die. No resource migration is performed.
 >
 > Did I express my idea cleanly?
 
 Yes, but I disagree with the premise. The title of your patch is
 "Account for the number of tasks within container", but that's not
 what the subsystem does, it accounts for the number of forks within
 the container that aren't directly accompanied by an exit.
 
 Ideally, resources like files and pages would be able to follow tasks
 as well. The reason that files and pages aren't easily migrated from
 one container to another is that there could be sharing involved;
 figuring out the sharing can be expensive, and it's not clear what to
 do if two users are in different containers.
 
 But in the case of a task count, there are no such issues with
 sharing, so it seems to me to be more sensible (and more efficient) to
 just limit the number of tasks in a container.
 
 i.e. when moving a task into a container or forking a task within a
 container, increment the count; when moving a task out of a container
 or when it exits, decrement the count.
 
 With your approach, if you were to set the task limit of an empty
 container A to 1, and then move a process P from B into A, P would be
 able to fork a new child, since the "task count" would be 0 (as P was
 being charged to B still). Surely the fact that there's 1 process in A
 should prevent P from forking?
 
 Paul
 |  
	|  |  |  
	|  |  
	| 
		
			| Re: [RFC][PATCH 6/7] Account for the number of tasks within container [message #10985 is a reply to message #10943] | Sun, 11 March 2007 08:34   |  
			| 
				
				
					|  xemul Messages: 248
 Registered: November 2005
 | Senior Member |  |  |  
	| Paul Menage wrote: > On 3/6/07, Pavel Emelianov <xemul@sw.ru> wrote:
 >> The idea is:
 >>
 >> Task may be "the entity that allocates the resources" and "the
 >> entity that is a resource allocated".
 >>
 >> When task is the first entity it may move across containers
 >> (that is implemented in your patches). When task is a resource
 >> it shouldn't move across containers like files or pages do.
 >>
 >> More generally - allocated resources hold reference to original
 >> container till they die. No resource migration is performed.
 >>
 >> Did I express my idea cleanly?
 >
 > Yes, but I disagree with the premise. The title of your patch is
 > "Account for the number of tasks within container", but that's not
 > what the subsystem does, it accounts for the number of forks within
 > the container that aren't directly accompanied by an exit.
 >
 > Ideally, resources like files and pages would be able to follow tasks
 > as well. The reason that files and pages aren't easily migrated from
 > one container to another is that there could be sharing involved;
 > figuring out the sharing can be expensive, and it's not clear what to
 > do if two users are in different containers.
 >
 > But in the case of a task count, there are no such issues with
 > sharing, so it seems to me to be more sensible (and more efficient) to
 > just limit the number of tasks in a container.
 >
 > i.e. when moving a task into a container or forking a task within a
 > container, increment the count; when moving a task out of a container
 > or when it exits, decrement the count.
 
 Sounds reasonable.
 I'll take this into account when I make the next iteration.
 Thanks.
 
 > With your approach, if you were to set the task limit of an empty
 > container A to 1, and then move a process P from B into A, P would be
 > able to fork a new child, since the "task count" would be 0 (as P was
 > being charged to B still). Surely the fact that there's 1 process in A
 > should prevent P from forking?
 >
 > Paul
 >
 |  
	|  |  |  
	| 
		
			| Re: [RFC][PATCH 5/7] Per-container OOM killer and page reclamation [message #10986 is a reply to message #10957] | Sun, 11 March 2007 08:39   |  
			| 
				
				
					|  xemul Messages: 248
 Registered: November 2005
 | Senior Member |  |  |  
	| Balbir Singh wrote: > Hi, Pavel,
 >
 > Please find my patch to add LRU behaviour to your latest RSS controller.
 
 Thanks for your participation and additional testing :)
 I'll include this in the next generation of patches.
 
 > Balbir Singh
 > Linux Technology Center
 > IBM, ISTL
 >
 >
 >  ------------------------------------------------------------ ------------
 >
 > Add LRU behaviour to the RSS controller patches posted by Pavel Emelianov
 >
 > 	http://lkml.org/lkml/2007/3/6/198
 >
 > which was in turn similar to the RSS controller posted by me
 >
 > 	http://lkml.org/lkml/2007/2/26/8
 >
 > Pavel's patches have a per container list of pages, which helps reduce
 > reclaim time of the RSS controller but the per container list of pages is
 > in FIFO order. I've implemented active and inactive lists per container to
 > help select the right set of pages to reclaim when the container is under
 > memory pressure.
 >
 > I've tested these patches on a ppc64 machine and they work fine for
 > the minimal testing I've done.
 >
 > Pavel would you please include these patches in your next iteration.
 >
 > Comments, suggestions and further improvements are as always welcome!
 >
 > Signed-off-by: <balbir@in.ibm.com>
 > ---
 >
 >  include/linux/rss_container.h |    1
 >  mm/rss_container.c            |   47 +++++++++++++++++++++++++++++++-----------
 >  mm/swap.c                     |    5 ++++
 >  mm/vmscan.c                   |    3 ++
 >  4 files changed, 44 insertions(+), 12 deletions(-)
 >
 > diff -puN include/linux/rss_container.h~rss-container-lru2 include/linux/rss_container.h
 > ---  linux-2.6.20/include/linux/rss_container.h~rss-container-lru 2	2007-03-09 22:52:56.000000000 +0530
 > +++ linux-2.6.20-balbir/include/linux/rss_container.h	2007-03-10 00:39:59.000000000 +0530
 > @@ -19,6 +19,7 @@ int container_rss_prepare(struct page *,
 >  void container_rss_add(struct page_container *);
 >  void container_rss_del(struct page_container *);
 >  void container_rss_release(struct page_container *);
 > +void container_rss_move_lists(struct page *pg, bool active);
 >
 >  int mm_init_container(struct mm_struct *mm, struct task_struct *tsk);
 >  void mm_free_container(struct mm_struct *mm);
 > diff -puN mm/rss_container.c~rss-container-lru2 mm/rss_container.c
 > --- linux-2.6.20/mm/rss_container.c~rss-container-lru2	2007-03-09 22:52:56.000000000 +0530
 > +++ linux-2.6.20-balbir/mm/rss_container.c	2007-03-10 02:42:54.000000000 +0530
 > @@ -17,7 +17,8 @@ static struct container_subsys rss_subsy
 >
 >  struct rss_container {
 >  	struct res_counter res;
 > -	struct list_head page_list;
 > +	struct list_head inactive_list;
 > +	struct list_head active_list;
 >  	struct container_subsys_state css;
 >  };
 >
 > @@ -96,6 +97,26 @@ void container_rss_release(struct page_c
 >  	kfree(pc);
 >  }
 >
 > +void container_rss_move_lists(struct page *pg, bool active)
 > +{
 > +	struct rss_container *rss;
 > +	struct page_container *pc;
 > +
 > +	if (!page_mapped(pg))
 > +		return;
 > +
 > +	pc = page_container(pg);
 > +	BUG_ON(!pc);
 > +	rss = pc->cnt;
 > +
 > +	spin_lock_irq(&rss->res.lock);
 > +	if (active)
 > +		list_move(&pc->list, &rss->active_list);
 > +	else
 > +		list_move(&pc->list, &rss->inactive_list);
 > +	spin_unlock_irq(&rss->res.lock);
 > +}
 > +
 >  void container_rss_add(struct page_container *pc)
 >  {
 >  	struct page *pg;
 > @@ -105,7 +126,7 @@ void container_rss_add(struct page_conta
 >  	rss = pc->cnt;
 >
 >  	spin_lock(&rss->res.lock);
 > -	list_add(&pc->list, &rss->page_list);
 > +	list_add(&pc->list, &rss->active_list);
 >  	spin_unlock(&rss->res.lock);
 >
 >  	page_container(pg) = pc;
 > @@ -141,7 +162,10 @@ unsigned long container_isolate_pages(un
 >  	struct zone *z;
 >
 >  	spin_lock_irq(&rss->res.lock);
 > -	src = &rss->page_list;
 > +	if (active)
 > +		src = &rss->active_list;
 > +	else
 > +		src = &rss->inactive_list;
 >
 >  	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
 >  		pc = list_entry(src->prev, struct page_container, list);
 > @@ -152,13 +176,10 @@ unsigned long container_isolate_pages(un
 >
 >  		spin_lock(&z->lru_lock);
 >  		if (PageLRU(page)) {
 > -			if ((active && PageActive(page)) ||
 > -					(!active && !PageActive(page))) {
 > -				if (likely(get_page_unless_zero(page))) {
 > -					ClearPageLRU(page);
 > -					nr_taken++;
 > -					list_move(&page->lru, dst);
 > -				}
 > +			if (likely(get_page_unless_zero(page))) {
 > +				ClearPageLRU(page);
 > +				nr_taken++;
 > +				list_move(&page->lru, dst);
 >  			}
 >  		}
 >  		spin_unlock(&z->lru_lock);
 > @@ -212,7 +233,8 @@ static int rss_create(struct container_s
 >  		return -ENOMEM;
 >
 >  	res_counter_init(&rss->res);
 > -	INIT_LIST_HEAD(&rss->page_list);
 > +	INIT_LIST_HEAD(&rss->inactive_list);
 > +	INIT_LIST_HEAD(&rss->active_list);
 >  	cont->subsys[rss_subsys.subsys_id] = &rss->css;
 >  	return 0;
 >  }
 > @@ -284,7 +306,8 @@ static __init int rss_create_early(struc
 >
 >  	rss = &init_rss_container;
 >  	res_counter_init(&rss->res);
 > -	INIT_LIST_HEAD(&rss->page_list);
 > +	INIT_LIST_HEAD(&rss->inactive_list);
 > +	INIT_LIST_HEAD(&rss->active_list);
 >  	cont->subsys[rss_subsys.subsys_id] = &rss->css;
 >  	ss->create = rss_create;
 >  	return 0;
 > diff -puN mm/vmscan.c~rss-container-lru2 mm/vmscan.c
 > --- linux-2.6.20/mm/vmscan.c~rss-container-lru2	2007-03-09 22:52:56.000000000 +0530
 > +++ linux-2.6.20-balbir/mm/vmscan.c	2007-03-10 00:42:35.000000000 +0530
 > @@ -1142,6 +1142,7 @@ static unsigned long container_shrink_pa
 >  			else
 >  				add_page_to_inactive_list(z, page);
 >  			spin_unlock_irq(&z->lru_lock);
 > +			container_rss_move_lists(page, false);
 >
 >  			put_page(page);
 >  		}
 > @@ -1191,6 +1192,7 @@ static void container_shrink_pages_activ
 >  		list_move(&page->lru, &z->inactive_list);
 >  		z->nr_inactive++;
 >  		spin_unlock_irq(&z->lru_lock);
 > +		container_rss_move_lists(page, false);
 >
 >  		put_page(page);
 >  	}
 > @@ -1206,6 +1208,7 @@ static void container_shrink_pages_activ
 >  		list_move(&page->lru, &z->active_list);
 >  		z->nr_active++;
 >  		spin_unlock_irq(&z->lru_lock);
 > +		container_rss_move_lists(page, true);
 >
 >  		put_page(page);
 >  	}
 > diff -puN mm/swap.c~rss-container-lru2 mm/swap.c
 > --- linux-2.6.20/mm/swap.c~rss-container-lru2	2007-03-10 00:42:38.000000000 +0530
 > +++ linux-2.6.20-balbir/mm/swap.c	2007-03-10 01:20:39.000000000 +0530
 > @@ -30,6 +30,7 @@
 >  #include <linux/cpu.h>
 >  #include <linux/notifier.h>
 >  #include <linux/init.h>
 > +#include <linux/rss_container.h>
 >
 >  /* How many pages do we try to swap or page in/out together? */
 >  int page_cluster;
 > @@ -140,6 +141,7 @@ int rotate_reclaimable_page(struct page
 >  void fastcall activate_page(struct page *page)
 >  {
 >  	struct zone *zone = page_zone(page);
 > +	bool moved = false;
 >
 >  	spin_lock_irq(&zone->lru_lock);
 >  	if (PageLRU(page) && !PageActive(page)) {
 > @@ -147,8 +149,11 @@ void fastcall activate_page(struct page
 >  		SetPageActive(page);
 >  		add_page_to_active_list(zone, page);
 >  		__count_vm_event(PGACTIVATE);
 > +		moved = true;
 >  	}
 >  	spin_unlock_irq(&zone->lru_lock);
 > +	if (moved)
 > +		container_rss_move_lists(page, true);
 >  }
 >
 >  /*
 > _
 |  
	|  |  |  
	|  |  
	| 
		
			| Re: [RFC][PATCH 2/7] RSS controller core [message #11001 is a reply to message #10993] | Sun, 11 March 2007 12:51   |  
			| 
				
				
					|  Andrew Morton Messages: 127
 Registered: December 2005
 | Senior Member |  |  |  
	| > On Sun, 11 Mar 2007 15:26:41 +0300 Kirill Korotaev <dev@sw.ru> wrote: > Andrew Morton wrote:
 > > On Tue, 06 Mar 2007 17:55:29 +0300
 > > Pavel Emelianov <xemul@sw.ru> wrote:
 > >
 > >
 > >>+struct rss_container {
 > >>+	struct res_counter res;
 > >>+	struct list_head page_list;
 > >>+	struct container_subsys_state css;
 > >>+};
 > >>+
 > >>+struct page_container {
 > >>+	struct page *page;
 > >>+	struct rss_container *cnt;
 > >>+	struct list_head list;
 > >>+};
 > >
 > >
 > > ah.  This looks good.  I'll find a hunk of time to go through this work
 > > and through Paul's patches.  It'd be good to get both patchsets lined
 > > up in -mm within a couple of weeks.  But..
 > >
 > > We need to decide whether we want to do per-container memory limitation via
 > > these data structures, or whether we do it via a physical scan of some
 > > software zone, possibly based on Mel's patches.
 > i.e. a separate memzone for each container?
 
 Yep.  Straightforward machine partitioning.  An attractive thing is that it
 100% reuses existing page reclaim, unaltered.
 
 > imho memzone approach is inconvinient for pages sharing and shares accounting.
 > it also makes memory management more strict, forbids overcommiting
 > per-container etc.
 
 umm, who said they were requirements?
 
 > Maybe you have some ideas how we can decide on this?
 
 We need to work out what the requirements are before we can settle on an
 implementation.
 
 Sigh.  Who is running this show?   Anyone?
 
 You can actually do a form of overcommitment by allowing multiple
 containers to share one or more of the zones.  Whether that is sufficient
 or suitable I don't know.  That depends on the requirements, and we haven't
 even discussed those, let alone agreed to them.
 |  
	|  |  |  
	| 
		
			| Re: [RFC][PATCH 2/7] RSS controller core [message #11009 is a reply to message #11001] | Sun, 11 March 2007 15:51   |  
			| 
				
				
					|  Balbir Singh Messages: 491
 Registered: August 2006
 | Senior Member |  |  |  
	| On 3/11/07, Andrew Morton <akpm@linux-foundation.org> wrote: > > On Sun, 11 Mar 2007 15:26:41 +0300 Kirill Korotaev <dev@sw.ru> wrote:
 > > Andrew Morton wrote:
 > > > On Tue, 06 Mar 2007 17:55:29 +0300
 > > > Pavel Emelianov <xemul@sw.ru> wrote:
 > > >
 > > >
 > > >>+struct rss_container {
 > > >>+   struct res_counter res;
 > > >>+   struct list_head page_list;
 > > >>+   struct container_subsys_state css;
 > > >>+};
 > > >>+
 > > >>+struct page_container {
 > > >>+   struct page *page;
 > > >>+   struct rss_container *cnt;
 > > >>+   struct list_head list;
 > > >>+};
 > > >
 > > >
 > > > ah.  This looks good.  I'll find a hunk of time to go through this work
 > > > and through Paul's patches.  It'd be good to get both patchsets lined
 > > > up in -mm within a couple of weeks.  But..
 > > >
 > > > We need to decide whether we want to do per-container memory limitation via
 > > > these data structures, or whether we do it via a physical scan of some
 > > > software zone, possibly based on Mel's patches.
 > > i.e. a separate memzone for each container?
 >
 > Yep.  Straightforward machine partitioning.  An attractive thing is that it
 > 100% reuses existing page reclaim, unaltered.
 
 We discussed zones for resource control and some of the disadvantages at
 http://lkml.org/lkml/2006/10/30/222
 
 I need to look at Mel's patches to determine if they are suitable for
 control. But in a thread of discussion on those patches, it was agreed
 that memory fragmentation and resource control are independent issues.
 
 
 >
 > > imho memzone approach is inconvinient for pages sharing and shares accounting.
 > > it also makes memory management more strict, forbids overcommiting
 > > per-container etc.
 >
 > umm, who said they were requirements?
 >
 
 We discussed some of the requirements in the RFC: Memory Controller
 requirements thread
 http://lkml.org/lkml/2006/10/30/51
 
 > > Maybe you have some ideas how we can decide on this?
 >
 > We need to work out what the requirements are before we can settle on an
 > implementation.
 >
 > Sigh.  Who is running this show?   Anyone?
 >
 
 All the stake holders involved in the RFC discussion :-) We've been
 talking and building on top of each others patches. I hope that was a
 good answer ;)
 
 > You can actually do a form of overcommittment by allowing multiple
 > containers to share one or more of the zones.  Whether that is sufficient
 > or suitable I don't know.  That depends on the requirements, and we haven't
 > even discussed those, let alone agreed to them.
 >
 
 There are other things like resizing a zone, finding the right size,
 etc. I'll look
 at Mel's patches to see what is supported.
 
 Warm Regards,
 Balbir Singh
 |  
	|  |  |  
	| 
		
			| Re:  Re: [RFC][PATCH 2/7] RSS controller core [message #11024 is a reply to message #10902] | Mon, 12 March 2007 09:10   |  
			| 
				
				
					|  Kirill Korotaev Messages: 137
 Registered: January 2006
 | Senior Member |  |  |  
	| Eric, 
 > And misses every resource sharing opportunity in sight.
 
 that was my point too.
 
 > Except for
 > filtering the which pages are eligible for reclaim an RSS limit should
 > not need to change the existing reclaim logic, and with things like the
 > memory zones we have had that kind of restriction in the reclaim logic
 > for a long time.  So filtering out ineligible pages isn't anything new.
 
 exactly this is implemented in the current patches from Pavel.
 the only difference is that filtering is not done in general LRU list,
 which is not effective, but via per-container LRU list.
 So the pointer on the page structure does 2 things:
 - fast reclamation
 - correct uncharging of page from where it was charged
 (e.g. shared pages can be mapped first in one container, but the last unmap
 done from another one).
 
 >>We need to work out what the requirements are before we can settle on an
 >>implementation.
 >
 >
 > If you are talking about RSS limits the term is well defined.  The
 > number of pages you can have mapped into your set of address space at
 > any given time.
 >
 > Unless I'm totally blind that isn't what the patchset implements.
 
 Ouch, what makes you think so?
 The fact that a page mapped into 2 different processes is charged only once?
 Imho it is much more correct than the sum of processes' RSS within container, due to:
 1. it is clear how much container uses physical pages, not abstract items
 2. shared pages are charged only once, so the sum of containers RSS is still
 about physical RAM.
 
 > A
 > true RSS limit over multiple processes has a lot of potential to be
 > generally useful, is very understandable, doesn't affect kernel cache
 > decisions so largely performance should not be affected.  There is a
 > little more overhead in the fault logic but that is a moderately
 > expensive path anyway.
 
 100% agree here.
 
 >>You can actually do a form of overcommittment by allowing multiple
 >>containers to share one or more of the zones.  Whether that is sufficient
 >>or suitable I don't know.  That depends on the requirements, and we haven't
 >>even discussed those, let alone agreed to them.
 >
 >
 > Another really nasty issue is the container term as the resource guys
 > are using the term in a subtlety different way then it has been used
 > with namespaces leading to several threads where the participants talked
 > past each other.  We need a different term to designate the group of
 > tasks a resource controller is dealing with.
 taskgrp? resgrp?
 
 > The whole filesystem interface also is over general and makes it too
 > easy to express the hard things (like move an existing task from one
 > group of tasks to another) leading to code complications.
 the things which are not supported are easy to disable.
 
 > On the up side I think the code the focus is likely in the right place
 > to start delivering usable code.
 
 Thanks,
 Kirill
 |  
	|  |  |  
	| 
		
			| Re:  Re: [RFC][PATCH 2/7] RSS controller core [message #11079 is a reply to message #11024] | Tue, 13 March 2007 09:26   |  
			| 
				
				
					|  ebiederm Messages: 1354
 Registered: February 2006
 | Senior Member |  |  |  
	| Kirill Korotaev <dev@openvz.org> writes: 
 > Eric,
 >
 >> And misses every resource sharing opportunity in sight.
 >
 > that was my point too.
 >
 >> Except for
 >> filtering the which pages are eligible for reclaim an RSS limit should
 >> not need to change the existing reclaim logic, and with things like the
 >> memory zones we have had that kind of restriction in the reclaim logic
 >> for a long time.  So filtering out ineligible pages isn't anything new.
 >
 > exactly this is implemented in the current patches from Pavel.
 > the only difference is that filtering is not done in general LRU list,
 > which is not effective, but via per-container LRU list.
 > So the pointer on the page structure does 2 things:
 > - fast reclamation
 Better than the rmap list?
 > - correct uncharging of page from where it was charged
 >   (e.g. shared pages can be mapped first in one container, but the last unmap
 >    done from another one).
 We should charge/uncharge all of them, not just one.
 
 >>>We need to work out what the requirements are before we can settle on an
 >>>implementation.
 >>
 >>
 >> If you are talking about RSS limits the term is well defined.  The
 >> number of pages you can have mapped into your set of address space at
 >> any given time.
 >>
 >> Unless I'm totally blind that isn't what the patchset implements.
 >
 > Ouch, what makes you think so?
 > The fact that a page mapped into 2 different processes is charged only once?
 > Imho it is much more correct then sum of process' RSS within container, due to:
 > 1. it is clear how much container uses physical pages, not abstract items
 > 2. shared pages are charged only once, so the sum of containers RSS is still
 >    about physical RAM.
 
 No the fact that a page mapped into 2 separate mm_structs in two
 separate accounting domains is counted only once.  This is very likely
 to happen with things like glibc if you have a read-only shared copy
 of your distro.  There appears to be no technical reason for such a
 restriction.
 
 A page should not be owned.
 
 Going further unless the limits are draconian I don't expect users to
 hit the rss limits often or frequently.  So in 99% of all cases page
 reclaim should continue to be global.  Which makes me question messing
 with the general page reclaim lists.
 
 Now if the normal limits turn out to be draconian it may make sense to
 split the first level of page lists by some reasonable approximation
 to their rss group, so we don't normally scan unnecessary pages.
 
 >> The whole filesystem interface also is over general and makes it too
 >> easy to express the hard things (like move an existing task from one
 >> group of tasks to another) leading to code complications.
 > the things which are not supported are easy to disable.
 
 Maybe.  The extra locking complexity gives me fits.  But in the grand
 scheme of things it is minor as long as it is not user perceptible we
 can fix it later.  I'm still wrapping my head around the weird fs concepts.
 
 Eric
 |  
	|  |  |  
	|  |  
	|  |  
	| 
		
			| Re:  Re: [RFC][PATCH 2/7] RSS controller core [message #11085 is a reply to message #11083] | Tue, 13 March 2007 10:49   |  
			| 
				
				
					|  Andrew Morton Messages: 127
 Registered: December 2005
 | Senior Member |  |  |  
	| > On Tue, 13 Mar 2007 13:19:53 +0300 Kirill Korotaev <dev@sw.ru> wrote: > Andrew Morton wrote:
 > >>>> - shared mappings of 'shared' files (binaries
 > >>>>   and libraries) to allow for reduced memory
 > >>>>   footprint when N identical guests are running
 > >>>
 > >>>So, it sounds like this can be phrased as a requirement like:
 > >>>
 > >>>	"Guests must be able to share pages."
 > >>>
 > >>>Can you give us an idea why this is so?
 > >>
 > >>sure, one reason for this is that guests tend to
 > >>be similar (or almost identical) which results
 > >>in quite a lot of 'shared' libraries and executables
 > >>which would otherwise get cached for each guest and
 > >>would also be mapped for each guest separately
 > >
 > >
 > > nooooooo.  What you're saying there amounts to text replication.  There is
 > > no proposal here to create duplicated copies of pagecache pages: the VM
 > > just doesn't support that (Nick has soe protopatches which do this as a
 > > possible NUMA optimisation).
 > >
 > > So these mmapped pages will contiue to be shared across all guests.  The
 > > problem boils down to "which guest(s) get charged for each shared page".
 > >
 > > A simple and obvious and easy-to-implement answer is "the guest which paged
 > > it in".  I think we should firstly explain why that is insufficient.
 > I guess by "paged it in" you essentially mean
 > "mapped the page into address space for the *first* time"?
 
 Not really - I mean "first allocated the page".  ie: major fault(), read(),
 write(), etc.
 
 > i.e. how many times the same page mapped into 2 address spaces
 > in the same container should be accounted for?
 >
 > We believe ONE. It is better due to:
 > - it allows better estimate how much RAM container uses.
 > - if one container mapped a single page 10,000 times,
 >   it doesn't mean it is worse than a container which mapped only 200 pages
 >   and that it should be killed in case of OOM.
 
 I'm not sure that we need to account for pages at all, nor care about rss.
 
 If we use a physical zone-based containment scheme: fake-numa,
 variable-sized zones, etc then it all becomes moot.  You set up a container
 which has 1.5GB of physical memory then toss processes into it.  As that
 process set increases in size it will toss out stray pages which shouldn't
 be there, then it will start reclaiming and swapping out its own pages and
 eventually it'll get an oom-killing.
 
 No RSS accounting or page accounting in sight, because we already *have* that
 stuff, at the physical level, in the zone.
 
 Overcommitment can be performed by allowing different containers to share
 the same zone set, or by dynamically increasing or decreasing the size of
 a physical container.
 
 This all works today with fake-numa and cpusets, no kernel changes needed.
 
 It could be made to work fairly simply with a multi-zone approach, or with
 resizeable zones.
 
 I'd be interested in knowing what you think the shortcomings of this are
 likely to be.
 |  
	|  |  |  
	| 
		
			| Re:  Re: [RFC][PATCH 2/7] RSS controller core [message #11103 is a reply to message #11085] | Tue, 13 March 2007 14:59   |  
			| 
				
				
					|  Herbert Poetzl Messages: 239
 Registered: February 2006
 | Senior Member |  |  |  
	| On Tue, Mar 13, 2007 at 03:48:34AM -0800, Andrew Morton wrote: > > On Tue, 13 Mar 2007 13:19:53 +0300 Kirill Korotaev <dev@sw.ru> wrote:
 > > Andrew Morton wrote:
 > > >>>> - shared mappings of 'shared' files (binaries
 > > >>>>   and libraries) to allow for reduced memory
 > > >>>>   footprint when N identical guests are running
 > > >>>
 > > >>>So, it sounds like this can be phrased as a requirement like:
 > > >>>
 > > >>>	"Guests must be able to share pages."
 > > >>>
 > > >>>Can you give us an idea why this is so?
 > > >>
 > > >>sure, one reason for this is that guests tend to
 > > >>be similar (or almost identical) which results
 > > >>in quite a lot of 'shared' libraries and executables
 > > >>which would otherwise get cached for each guest and
 > > >>would also be mapped for each guest separately
 > > >
 > > > nooooooo. What you're saying there amounts to text replication.
 > > > There is no proposal here to create duplicated copies of pagecache
 > > > pages: the VM just doesn't support that (Nick has soe protopatches
 > > > which do this as a possible NUMA optimisation).
 > > >
 > > > So these mmapped pages will contiue to be shared across all
 > > > guests. The problem boils down to "which guest(s) get charged for
 > > > each shared page".
 > > >
 > > > A simple and obvious and easy-to-implement answer is "the guest
 > > > which paged it in". I think we should firstly explain why that is
 > > > insufficient.
 
 > > I guess by "paged it in" you essentially mean
 > > "mapped the page into address space for the *first* time"?
 >
 > Not really - I mean "first allocated the page". ie: major fault(),
 > read(), write(), etc.
 >
 > > i.e. how many times the same page mapped into 2 address spaces
 > > in the same container should be accounted for?
 > >
 > > We believe ONE. It is better due to:
 > > - it allows better estimate how much RAM container uses.
 > > - if one container mapped a single page 10,000 times,
 > >   it doesn't mean it is worse than a container which mapped only 200
 > >   pages and that it should be killed in case of OOM.
 >
 > I'm not sure that we need to account for pages at all, nor care about
 > rss.
 >
 > If we use a physical zone-based containment scheme: fake-numa,
 > variable-sized zones, etc then it all becomes moot.
 
 sounds good to me, just not sure it provides what we
 need, but I'm sure I'll figure that with your help ...
 
 > You set up a container which has 1.5GB of physial memory then toss
 > processes into it. As that process set increases in size it will
 > toss out stray pages which shouldn't be there, then it will start
 > reclaiming and swapping out its own pages and eventually it'll get an
 > oom-killing.
 
 okay, let me ask a few naive questions about this scheme:
 
 how does this work for a _file_ which is shared between
 two guests (e.g. an executable like bash, hardlinked
 between guests) when both guests are in a different
 zone-based container?
 
 + assumed that the file is read in the first time,
 will it be accounted to the first guest doing so?
 
 + assumed it is accessed in the second guest, will
 it cause any additional cache/mapping besides the
 dentry stuff?
 
 + will container A be able to 'toss out' pages
 'shared' with container B (assumed sharing is
 possible :)
 
 + when the container A tosses out the pages for this
 executable, will guest B still be able to use them?
 
 + when the pages are tossed out, will they require
 the system to read them in again, or will they
 stay available ala swap cache?
 
 > No RSS acounting or page acounting in sight, because we already *have*
 > that stuff, at the physical level, in the zone.
 
 I'm fine with that ...
 
 > Overcommitment can be performed by allowing different containers to
 > share the same zone set, or by dynamically increasing or decreasing
 > the size of a physical container.
 
 here the question is, can a guest have several of
 those 'virtual zones' assigned, so that there is a
 container specific and a shared zone for example?
 
 > This all works today with fake-numa and cpusets, no kernel changes
 > needed.
 
 sounds good!
 
 > It could be made to work fairly simply with a multi-zone approach, or
 > with resizeable zones.
 >
 > I'd be interested in knowing what you think the shortcomings of
 > this are likely to be,.
 
 will do so once I have a better understanding how this
 approach will work ...
 
 TIA,
 Herbert
 |  
	|  |  |  
	| 
		
			| Re:  Re: [RFC][PATCH 2/7] RSS controller core [message #11104 is a reply to message #11079] | Tue, 13 March 2007 15:30   |  
			| 
				
				
					|  dev Messages: 1693
 Registered: September 2005
 Location: Moscow
 | Senior Member |  
 |  |  
	| Eric, 
 >>>And misses every resource sharing opportunity in sight.
 >>
 >>that was my point too.
 >>
 >>
 >>>Except for
 >>>filtering the which pages are eligible for reclaim an RSS limit should
 >>>not need to change the existing reclaim logic, and with things like the
 >>>memory zones we have had that kind of restriction in the reclaim logic
 >>>for a long time.  So filtering out ineligible pages isn't anything new.
 >>
 >>exactly this is implemented in the current patches from Pavel.
 >>the only difference is that filtering is not done in general LRU list,
 >>which is not effective, but via per-container LRU list.
 >>So the pointer on the page structure does 2 things:
 >>- fast reclamation
 >
 >     Better than the rmap list?
 >
 >>- correct uncharging of page from where it was charged
 >>  (e.g. shared pages can be mapped first in one container, but the last unmap
 >>   done from another one).
 >
 >     We should charge/uncharge all of them, not just one.
 >
 >
 >>>>We need to work out what the requirements are before we can settle on an
 >>>>implementation.
 >>>
 >>>
 >>>If you are talking about RSS limits the term is well defined.  The
 >>>number of pages you can have mapped into your set of address space at
 >>>any given time.
 >>>
 >>>Unless I'm totally blind that isn't what the patchset implements.
 >>
 >>Ouch, what makes you think so?
 >>The fact that a page mapped into 2 different processes is charged only once?
 >>Imho it is much more correct then sum of process' RSS within container, due to:
 >>1. it is clear how much container uses physical pages, not abstract items
 >>2. shared pages are charged only once, so the sum of containers RSS is still
 >>   about physical RAM.
 >
 >
 > No the fact that a page mapped into 2 separate mm_structs in two
 > separate accounting domains is counted only once.  This is very likely
 > to happen with things like glibc if you have a read-only shared copy
 > of your distro.  There appears to be no technical reason for such a
 > restriction.
 >
 > A page should not be owned.
 
 I would be happy to propose OVZ approach then, where a page is tracked
 with page_beancounter data structure, which ties together
 a page with beancounters which use it like this:
 
 page -> page_beancounter -> list of beancounters which have the page mapped
 
 This gives a number of advantages:
 - the page is accounted to all the VEs which actually use it.
 - allows almost accurate tracking of page fractions used by VEs
 depending on how many VEs mapped the page.
 - allows to track dirty pages, i.e. which VE dirtied the page
 and implement correct disk I/O accounting and CFQ write scheduling
 based on VE priorities.
 
 > Going further unless the limits are draconian I don't expect users to
 > hit the rss limits often or frequently.  So in 99% of all cases page
 > reclaim should continue to be global.  Which makes me question messing
 > with the general page reclaim lists.
 
 It is not that rare when containers hit their limits, believe me :/
 In trusted environments - probably you are right, in hosting - no.
 
 Thanks,
 Kirill
 |  
	|  |  |  
	| 
		
			| Re: [RFC][PATCH 4/7] RSS accounting hooks over the code [message #11169 is a reply to message #10892] | Wed, 14 March 2007 15:43   |  
			| 
				
				
					|  xemul Messages: 248
 Registered: November 2005
 | Senior Member |  |  |  
	| Cedric Le Goater wrote: >> --- linux-2.6.20.orig/mm/migrate.c	2007-02-04 21:44:54.000000000 +0300
 >> +++ linux-2.6.20-0/mm/migrate.c	2007-03-06 13:33:28.000000000 +0300
 >> @@ -134,6 +134,7 @@ static void remove_migration_pte(struct
 >>  	pte_t *ptep, pte;
 >>   	spinlock_t *ptl;
 >>  	unsigned long addr = page_address_in_vma(new, vma);
 >> +	struct page_container *pcont;
 >>
 >>  	if (addr == -EFAULT)
 >>  		return;
 >> @@ -157,6 +158,11 @@ static void remove_migration_pte(struct
 >>   		return;
 >>   	}
 >>
 >> +	if (container_rss_prepare(new, vma, &pcont)) {
 >> +		pte_unmap(ptep);
 >> +		return;
 >> +	}
 >> +
 >>   	ptl = pte_lockptr(mm, pmd);
 >>   	spin_lock(ptl);
 >>  	pte = *ptep;
 >> @@ -175,16 +181,19 @@ static void remove_migration_pte(struct
 >>  	set_pte_at(mm, addr, ptep, pte);
 >>
 >>  	if (PageAnon(new))
 >> -		page_add_anon_rmap(new, vma, addr);
 >> +		page_add_anon_rmap(new, vma, addr, pcont);
 >>  	else
 >> -		page_add_file_rmap(new);
 >> +		page_add_file_rmap(new, pcont);
 >>
 >>  	/* No need to invalidate - it was non-present before */
 >>  	update_mmu_cache(vma, addr, pte);
 >>  	lazy_mmu_prot_update(pte);
 >> +	pte_unmap_unlock(ptep, ptl);
 >> +	return;
 >>
 >>  out:
 >>  	pte_unmap_unlock(ptep, ptl);
 >> +	container_rss_release(pcont);
 >>  }
 >>
 >>  /*
 >
 > you missed out an include in mm/migrate.c
 >
 > cheers,
 
 Thanks! :)
 
 > C.
 > Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
 > ---
 >  mm/migrate.c |    1 +
 >  1 file changed, 1 insertion(+)
 >
 > Index: 2.6.20/mm/migrate.c
 >  ============================================================ =======
 > --- 2.6.20.orig/mm/migrate.c
 > +++ 2.6.20/mm/migrate.c
 > @@ -28,6 +28,7 @@
 >  #include <linux/mempolicy.h>
 >  #include <linux/vmalloc.h>
 >  #include <linux/security.h>
 > +#include <linux/rss_container.h>
 >
 >  #include "internal.h"
 >
 > -
 > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
 > the body of a message to majordomo@vger.kernel.org
 > More majordomo info at  http://vger.kernel.org/majordomo-info.html
 > Please read the FAQ at  http://www.tux.org/lkml/
 >
 |  
	|  |  |  
	| 
		
			| Re: [RFC][PATCH 4/7] RSS accounting hooks over the code [message #11170 is a reply to message #10892] | Wed, 14 March 2007 15:37   |  
			| 
				
				
					|  Cedric Le Goater Messages: 443
 Registered: February 2006
 | Senior Member |  |  |  
	| > --- linux-2.6.20.orig/mm/migrate.c	2007-02-04 21:44:54.000000000 +0300 > +++ linux-2.6.20-0/mm/migrate.c	2007-03-06 13:33:28.000000000 +0300
 > @@ -134,6 +134,7 @@ static void remove_migration_pte(struct
 >  	pte_t *ptep, pte;
 >   	spinlock_t *ptl;
 >  	unsigned long addr = page_address_in_vma(new, vma);
 > +	struct page_container *pcont;
 >
 >  	if (addr == -EFAULT)
 >  		return;
 > @@ -157,6 +158,11 @@ static void remove_migration_pte(struct
 >   		return;
 >   	}
 >
 > +	if (container_rss_prepare(new, vma, &pcont)) {
 > +		pte_unmap(ptep);
 > +		return;
 > +	}
 > +
 >   	ptl = pte_lockptr(mm, pmd);
 >   	spin_lock(ptl);
 >  	pte = *ptep;
 > @@ -175,16 +181,19 @@ static void remove_migration_pte(struct
 >  	set_pte_at(mm, addr, ptep, pte);
 >
 >  	if (PageAnon(new))
 > -		page_add_anon_rmap(new, vma, addr);
 > +		page_add_anon_rmap(new, vma, addr, pcont);
 >  	else
 > -		page_add_file_rmap(new);
 > +		page_add_file_rmap(new, pcont);
 >
 >  	/* No need to invalidate - it was non-present before */
 >  	update_mmu_cache(vma, addr, pte);
 >  	lazy_mmu_prot_update(pte);
 > +	pte_unmap_unlock(ptep, ptl);
 > +	return;
 >
 >  out:
 >  	pte_unmap_unlock(ptep, ptl);
 > +	container_rss_release(pcont);
 >  }
 >
 >  /*
 
 you missed out an include in mm/migrate.c
 
 cheers,
 
 C.
 Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
 ---
 mm/migrate.c |    1 +
 1 file changed, 1 insertion(+)
 
 Index: 2.6.20/mm/migrate.c
 ===================================================================
 --- 2.6.20.orig/mm/migrate.c
 +++ 2.6.20/mm/migrate.c
 @@ -28,6 +28,7 @@
 #include <linux/mempolicy.h>
 #include <linux/vmalloc.h>
 #include <linux/security.h>
 +#include <linux/rss_container.h>
 
 #include "internal.h"
 |  
	|  |  |  
	| 
		
			| Re: [RFC][PATCH 1/7] Resource counters [message #17647 is a reply to message #10913] | Fri, 09 March 2007 16:37   |  
			| 
				
				
					|  Herbert Poetzl Messages: 239
 Registered: February 2006
 | Senior Member |  |  |  
	| On Wed, Mar 07, 2007 at 10:19:05AM +0300, Pavel Emelianov wrote:
> Balbir Singh wrote:
> > Pavel Emelianov wrote:
> >> Introduce generic structures and routines for
> >> resource accounting.
> >>
> >> Each resource accounting container is supposed to
> >> aggregate it, container_subsystem_state and its
> >> resource-specific members within.
> >>
> >>
> >> ------------------------------------------------------------------------
> >>
> >> diff -upr linux-2.6.20.orig/include/linux/res_counter.h
> >> linux-2.6.20-0/include/linux/res_counter.h
> >> --- linux-2.6.20.orig/include/linux/res_counter.h    2007-03-06
> >> 13:39:17.000000000 +0300
> >> +++ linux-2.6.20-0/include/linux/res_counter.h    2007-03-06
> >> 13:33:28.000000000 +0300
> >> @@ -0,0 +1,83 @@
> >> +#ifndef __RES_COUNTER_H__
> >> +#define __RES_COUNTER_H__
> >> +/*
> >> + * resource counters
> >> + *
> >> + * Copyright 2007 OpenVZ SWsoft Inc
> >> + *
> >> + * Author: Pavel Emelianov <xemul@openvz.org>
> >> + *
> >> + */
> >> +
> >> +#include <linux/container.h>
> >> +
> >> +struct res_counter {
> >> +    unsigned long usage;
> >> +    unsigned long limit;
> >> +    unsigned long failcnt;
> >> +    spinlock_t lock;
> >> +};
> >> +
> >> +enum {
> >> +    RES_USAGE,
> >> +    RES_LIMIT,
> >> +    RES_FAILCNT,
> >> +};
> >> +
> >> +ssize_t res_counter_read(struct res_counter *cnt, int member,
> >> +        const char __user *buf, size_t nbytes, loff_t *pos);
> >> +ssize_t res_counter_write(struct res_counter *cnt, int member,
> >> +        const char __user *buf, size_t nbytes, loff_t *pos);
> >> +
> >> +static inline void res_counter_init(struct res_counter *cnt)
> >> +{
> >> +    spin_lock_init(&cnt->lock);
> >> +    cnt->limit = (unsigned long)LONG_MAX;
> >> +}
> >> +
> > 
> > Is there any way to indicate that there are no limits on this container.
> 
> Yes - LONG_MAX is essentially a "no limit" value as no
> container will ever have such many files :)
-1 or ~0 is a viable choice for userspace to
communicate 'infinite' or 'unlimited'
> > LONG_MAX is quite huge, but still when the administrator wants to
> > configure a container to *un-limited usage*, it becomes hard for
> > the administrator.
> > 
> >> +static inline int res_counter_charge_locked(struct res_counter *cnt,
> >> +        unsigned long val)
> >> +{
> >> +    if (cnt->usage <= cnt->limit - val) {
> >> +        cnt->usage += val;
> >> +        return 0;
> >> +    }
> >> +
> >> +    cnt->failcnt++;
> >> +    return -ENOMEM;
> >> +}
> >> +
> >> +static inline int res_counter_charge(struct res_counter *cnt,
> >> +        unsigned long val)
> >> +{
> >> +    int ret;
> >> +    unsigned long flags;
> >> +
> >> +    spin_lock_irqsave(&cnt->lock, flags);
> >> +    ret = res_counter_charge_locked(cnt, val);
> >> +    spin_unlock_irqrestore(&cnt->lock, flags);
> >> +    return ret;
> >> +}
> >> +
> > 
> > Will atomic counters help here.
> 
> I'm afraid no. We have to atomically check for limit and alter
> one of usage or failcnt depending on the checking result. Making
> this with atomic_xxx ops will require at least two ops.
Linux-VServer does the accounting with atomic counters,
so that works quite fine, just do the checks at the
beginning of whatever resource allocation and the
accounting once the resource is acquired ...
> If we'll remove failcnt this would look like
>    while (atomic_cmpxchg(...))
> which is also not that good.
> 
> Moreover - in RSS accounting patches I perform page list
> manipulations under this lock, so this also saves one atomic op.
it still hasn't been shown that this kind of RSS limit
doesn't add big time overhead to normal operations
(inside and outside of such a resource container)
note that the 'usual' memory accounting is much more
lightweight and serves similar purposes ...
best,
Herbert
> >> +static inline void res_counter_uncharge_locked(struct res_counter *cnt,
> >> +        unsigned long val)
> >> +{
> >> +    if (unlikely(cnt->usage < val)) {
> >> +        WARN_ON(1);
> >> +        val = cnt->usage;
> >> +    }
> >> +
> >> +    cnt->usage -= val;
> >> +}
> >> +
> >> +static inline void res_counter_uncharge(struct res_counter *cnt,
> >> +        unsigned long val)
> >> +{
> >> +    unsigned long flags;
> >> +
> >> +    spin_lock_irqsave(&cnt->lock, flags);
> >> +    res_counter_uncharge_locked(cnt, val);
> >> +    spin_unlock_irqrestore(&cnt->lock, flags);
> >> +}
> >> +
> >> +#endif
> >> diff -upr linux-2.6.20.orig/init/Kconfig linux-2.6.20-0/init/Kconfig
> >> --- linux-2.6.20.orig/init/Kconfig    2007-03-06 13:33:28.000000000 +0300
> >> +++ linux-2.6.20-0/init/Kconfig    2007-03-06 13:33:28.000000000 +0300
> >> @@ -265,6 +265,10 @@ config CPUSETS
> >>
> >>        Say N if unsure.
> >>
> >> +config RESOURCE_COUNTERS
> >> +    bool
> >> +    select CONTAINERS
> >> +
> >>  config SYSFS_DEPRECATED
> >>      bool "Create deprecated sysfs files"
> >>      default y
> >> diff -upr linux-2.6.20.orig/kernel/Makefile
> >> linux-2.6.20-0/kernel/Makefile
> >> --- linux-2.6.20.orig/kernel/Makefile    2007-03-06 13:33:28.000000000
> >> +0300
> >> +++ linux-2.6.20-0/kernel/Makefile    2007-03-06 13:33:28.000000000 +0300
> >> @@ -51,6 +51,7 @@ obj-$(CONFIG_RELAY) += relay.o
> >>  obj-$(CONFIG_UTS_NS) += utsname.o
> >>  obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
> >>  obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
> >> +obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
> >>
> >>  ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
> >>  # According to Alan Modra <alan@linuxcare.com.au>, the
> >> -fno-omit-frame-pointer is
> >> diff -upr linux-2.6.20.orig/kernel/res_counter.c
> >> linux-2.6.20-0/kernel/res_counter.c
> >> --- linux-2.6.20.orig/kernel/res_counter.c    2007-03-06
> >> 13:39:17.000000000 +0300
> >> +++ linux-2.6.20-0/kernel/res_counter.c    2007-03-06
> >> 13:33:28.000000000 +0300
> >> @@ -0,0 +1,72 @@
> >> +/*
> >> + * resource containers
> >> + *
> >> + * Copyright 2007 OpenVZ SWsoft Inc
> >> + *
> >> + * Author: Pavel Emelianov <xemul@openvz.org>
> >> + *
> >> + */
> >> +
> >> +#include <linux/parser.h>
> >> +#include <linux/fs.h>
> >> +#include <linux/res_counter.h>
> >> +#include <asm/uaccess.h>
> >> +
> >> +static inline unsigned long *res_counter_member(struct res_counter
> >> *cnt, int member)
> >> +{
> >> +    switch (member) {
> >> +    case RES_USAGE:
> >> +        return &cnt->usage;
> >> +    case RES_LIMIT:
> >> +        return &cnt->limit;
> >> +    case RES_FAILCNT:
> >> +        return &cnt->failcnt;
> >> +    };
> >> +
> >> +    BUG();
> >> +    return NULL;
> >> +}
> >> +
> >> +ssize_t res_counter_read(struct res_counter *cnt, int member,
> >> +        const char __user *userbuf, size_t nbytes, loff_t *pos)
> >> +{
> >> +    unsigned long *val;
> >> +    char buf[64], *s;
> >> +
> >> +    s = buf;
> >> +    val = res_counter_member(cnt, member);
> >> +    s += sprintf(s, "%lu\n", *val);
> >> +    return simple_read_from_buffer((void __user *)userbuf, nbytes,
> >> +            pos, buf, s - buf);
> >> +}
> >> +
> >> +ssize_t res_counter_write(struct res_counter *cnt, int member,
> >> +        const char __user *userbuf, size_t nbytes, loff_t *pos)
> >> +{
> >> +    int ret;
> >> +    char *buf, *end;
> >> +    unsigned long tmp, *val;
> >> +
> >> +    buf = kmalloc(nbytes + 1, GFP_KERNEL);
> >> +    ret = -ENOMEM;
> >> +    if (buf == NULL)
> >> +        goto out;
> >> +
> >> +    buf[nbytes] = 0;
> >> +    ret = -EFAULT;
> >> +    if (copy_from_user(buf, userbuf, nbytes))
> >> +        goto out_free;
> >> +
> >> +    ret = -EINVAL;
> >> +    tmp = simple_strtoul(buf, &end, 10);
> >> +    if (*end != '\0')
> >> +        goto out_free;
> >> +
> >> +    val = res_counter_member(cnt, member);
> >> +    *val = tmp;
> >> +    ret = nbytes;
> >> +out_free:
> >> +    kfree(buf);
> >> +out:
> >> +    return ret;
> >> +}
> >>
> > 
> >...
 
 |  
	|  |  |  
	| 
		
			| Re: [RFC][PATCH 2/7] RSS controller core [message #17648 is a reply to message #10902] | Fri, 09 March 2007 16:48   |  
			| 
				
				
					|  Herbert Poetzl Messages: 239
 Registered: February 2006
 | Senior Member |  |  |  
	| On Tue, Mar 06, 2007 at 02:00:36PM -0800, Andrew Morton wrote:
> On Tue, 06 Mar 2007 17:55:29 +0300
> Pavel Emelianov <xemul@sw.ru> wrote:
> 
> > +struct rss_container {
> > +	struct res_counter res;
> > +	struct list_head page_list;
> > +	struct container_subsys_state css;
> > +};
> > +
> > +struct page_container {
> > +	struct page *page;
> > +	struct rss_container *cnt;
> > +	struct list_head list;
> > +};
> 
> ah. This looks good. I'll find a hunk of time to go through this work
> and through Paul's patches. It'd be good to get both patchsets lined
> up in -mm within a couple of weeks. But..
doesn't look so good for me, mainly because of the
additional per page data and per page processing
on 4GB memory, with 100 guests, 50% shared for each
guest, this basically means ~1mio pages, 500k shared
and 1500k x sizeof(page_container) entries, which
roughly boils down to ~25MB of wasted memory ...
increase the amount of shared pages and it starts
getting worse, but maybe I'm missing something here
> We need to decide whether we want to do per-container memory
> limitation via these data structures, or whether we do it via a
> physical scan of some software zone, possibly based on Mel's patches.
why not do simple page accounting (as done currently
in Linux) and use that for the limits, without
keeping the reference from container to page?
best,
Herbert
> _______________________________________________
> Containers mailing list
> Containers@lists.osdl.org
> https://lists.osdl.org/mailman/listinfo/containers
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers |  
	|  |  |  
	| 
		
			| Re: [RFC][PATCH 1/7] Resource counters [message #17700 is a reply to message #17647] | Sun, 11 March 2007 09:01   |  
			| 
				
				
					|  xemul Messages: 248
 Registered: November 2005
 | Senior Member |  |  |  
	| Herbert Poetzl wrote:
> On Wed, Mar 07, 2007 at 10:19:05AM +0300, Pavel Emelianov wrote:
>> Balbir Singh wrote:
>>> Pavel Emelianov wrote:
>>>> Introduce generic structures and routines for
>>>> resource accounting.
>>>>
>>>> Each resource accounting container is supposed to
>>>> aggregate it, container_subsystem_state and its
>>>> resource-specific members within.
>>>>
>>>>
>>>> ------------------------------------------------------------------------
>>>>
>>>> diff -upr linux-2.6.20.orig/include/linux/res_counter.h
>>>> linux-2.6.20-0/include/linux/res_counter.h
>>>> --- linux-2.6.20.orig/include/linux/res_counter.h    2007-03-06
>>>> 13:39:17.000000000 +0300
>>>> +++ linux-2.6.20-0/include/linux/res_counter.h    2007-03-06
>>>> 13:33:28.000000000 +0300
>>>> @@ -0,0 +1,83 @@
>>>> +#ifndef __RES_COUNTER_H__
>>>> +#define __RES_COUNTER_H__
>>>> +/*
>>>> + * resource counters
>>>> + *
>>>> + * Copyright 2007 OpenVZ SWsoft Inc
>>>> + *
>>>> + * Author: Pavel Emelianov <xemul@openvz.org>
>>>> + *
>>>> + */
>>>> +
>>>> +#include <linux/container.h>
>>>> +
>>>> +struct res_counter {
>>>> +    unsigned long usage;
>>>> +    unsigned long limit;
>>>> +    unsigned long failcnt;
>>>> +    spinlock_t lock;
>>>> +};
>>>> +
>>>> +enum {
>>>> +    RES_USAGE,
>>>> +    RES_LIMIT,
>>>> +    RES_FAILCNT,
>>>> +};
>>>> +
>>>> +ssize_t res_counter_read(struct res_counter *cnt, int member,
>>>> +        const char __user *buf, size_t nbytes, loff_t *pos);
>>>> +ssize_t res_counter_write(struct res_counter *cnt, int member,
>>>> +        const char __user *buf, size_t nbytes, loff_t *pos);
>>>> +
>>>> +static inline void res_counter_init(struct res_counter *cnt)
>>>> +{
>>>> +    spin_lock_init(&cnt->lock);
>>>> +    cnt->limit = (unsigned long)LONG_MAX;
>>>> +}
>>>> +
>>> Is there any way to indicate that there are no limits on this container.
>> Yes - LONG_MAX is essentially a "no limit" value as no
>> container will ever have such many files :)
> 
> -1 or ~0 is a viable choice for userspace to
> communicate 'infinite' or 'unlimited'
OK, I'll make it ULONG_MAX :)
>>> LONG_MAX is quite huge, but still when the administrator wants to
>>> configure a container to *un-limited usage*, it becomes hard for
>>> the administrator.
>>>
>>>> +static inline int res_counter_charge_locked(struct res_counter *cnt,
>>>> +        unsigned long val)
>>>> +{
>>>> +    if (cnt->usage <= cnt->limit - val) {
>>>> +        cnt->usage += val;
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    cnt->failcnt++;
>>>> +    return -ENOMEM;
>>>> +}
>>>> +
>>>> +static inline int res_counter_charge(struct res_counter *cnt,
>>>> +        unsigned long val)
>>>> +{
>>>> +    int ret;
>>>> +    unsigned long flags;
>>>> +
>>>> +    spin_lock_irqsave(&cnt->lock, flags);
>>>> +    ret = res_counter_charge_locked(cnt, val);
>>>> +    spin_unlock_irqrestore(&cnt->lock, flags);
>>>> +    return ret;
>>>> +}
>>>> +
>>> Will atomic counters help here.
>> I'm afraid no. We have to atomically check for limit and alter
>> one of usage or failcnt depending on the checking result. Making
>> this with atomic_xxx ops will require at least two ops.
> 
> Linux-VServer does the accounting with atomic counters,
> so that works quite fine, just do the checks at the
> beginning of whatever resource allocation and the
> accounting once the resource is acquired ...
This works quite fine on non-preempted kernels.
From the time you check for the resource until you actually
account it, the kernel may preempt and let another process
pass through the vx_anything_avail() check.
>> If we'll remove failcnt this would look like
>>    while (atomic_cmpxchg(...))
>> which is also not that good.
>>
>> Moreover - in RSS accounting patches I perform page list
>> manipulations under this lock, so this also saves one atomic op.
> 
> it still hasn't been shown that this kind of RSS limit
> doesn't add big time overhead to normal operations
> (inside and outside of such a resource container)
> 
> note that the 'usual' memory accounting is much more
> lightweight and serves similar purposes ...
It OOM-kills current in case of a limit hit instead of
reclaiming pages or killing the *memory eater* to free memory.
> best,
> Herbert
> 
>>>> +static inline void res_counter_uncharge_locked(struct res_counter *cnt,
>>>> +        unsigned long val)
>>>> +{
>>>> +    if (unlikely(cnt->usage < val)) {
>>>> +        WARN_ON(1);
>>>> +        val = cnt->usage;
>>>> +    }
>>>> +
>>>> +    cnt->usage -= val;
>>>> +}
>>>> +
>>>> +static inline void res_counter_uncharge(struct res_counter *cnt,
>>>> +        unsigned long val)
>>>> +{
>>>> +    unsigned long flags;
>>>> +
>>>> +    spin_lock_irqsave(&cnt->lock, flags);
>>>> +    res_counter_uncharge_locked(cnt, val);
>>>> +    spin_unlock_irqrestore(&cnt->lock, flags);
>>>> +}
>>>> +
>>>> +#endif
>>>> diff -upr linux-2.6.20.orig/init/Kconfig linux-2.6.20-0/init/Kconfig
>>>> --- linux-2.6.20.orig/init/Kconfig    2007-03-06 13:33:28.000000000 +0300
>>>> +++ linux-2.6.20-0/init/Kconfig    2007-03-06 13:33:28.000000000 +0300
>>>> @@ -265,6 +265,10 @@ config CPUSETS
>>>>
>>>>        Say N if unsure.
>>>>
>>>> +config RESOURCE_COUNTERS
>>>> +    bool
>>>> +    select CONTAINERS
>>>> +
>>>>  config SYSFS_DEPRECATED
>>>>      bool "Create deprecated sysfs files"
>>>>      default y
>>>> diff -upr linux-2.6.20.orig/kernel/Makefile
>>>> linux-2.6.20-0/kernel/Makefile
>>>> --- linux-2.6.20.orig/kernel/Makefile    2007-03-06 13:33:28.000000000
>>>> +0300
>>>> +++ linux-2.6.20-0/kernel/Makefile    2007-03-06 13:33:28.000000000 +0300
>>>> @@ -51,6 +51,7 @@ obj-$(CONFIG_RELAY) += relay.o
>>>>  obj-$(CONFIG_UTS_NS) += utsname.o
>>>>  obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
>>>>  obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
>>>> +obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
>>>>
>>>>  ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
>>>>  # According to Alan Modra <alan@linuxcare.com.au>, the
>>>> -fno-omit-frame-pointer is
>>>> diff -upr linux-2.6.20.orig/kernel/res_counter.c
>>>> linux-2.6.20-0/kernel/res_counter.c
>>>> --- linux-2.6.20.orig/kernel/res_counter.c    2007-03-06
>>>> 13:39:17.000000000 +0300
>>>> +++ linux-2.6.20-0/kernel/res_counter.c    2007-03-06
>>>> 13:33:28.000000000 +0300
>>>> @@ -0,0 +1,72 @@
>>>> +/*
>>>> + * resource containers
>>>> + *
>>>> + * Copyright 2007 OpenVZ SWsoft Inc
>>>> + *
>>>> + * Author: Pavel Emelianov <xemul@openvz.org>
>>>> + *
>>>> + */
>>>> +
>>>> +#include <linux/parser.h>
>>>> +#include <linux/fs.h>
>>>> +#include <linux/res_counter.h>
>>>> +#include <asm/uaccess.h>
>>>> +
>>>> +static inline unsigned long *res_counter_member(struct res_counter
>>>> *cnt, int member)
>>>> +{
>>>> +    switch (member) {
>>>> +    case RES_USAGE:
>>>> +        return &cnt->usage;
>>>> +    case RES_LIMIT:
>>>> +        return &cnt->limit;
>>>> +    case RES_FAILCNT:
>>>> +        return &cnt->failcnt;
>>>> +    };
>>>> +
>>>> +    BUG();
>>>> +    return NULL;
>>>> +}
>>>> +
>>>> +ssize_t res_counter_read(struct res_counter *cnt, int member,
>>>> +        const char __user *userbuf, size_t nbytes, loff_t *pos)
>>>> +{
>>>> +    unsigned long *val;
>>>> +    char buf[64], *s;
>>>> +
>>>> +    s = buf;
>>>> +    val = res_counter_member(cnt, member);
>>>> +    s += sprintf(s, "%lu\n", *val);
>>>> +    return simple_read_from_buffer((void __user *)userbuf, nbytes,
>>>> +            pos, buf, s - buf);
>>>> +}
>>>> +
>>>> +ssize_t res_counter_write(struct res_counter *cnt, int member,
>>>>...
 
 |  
	|  |  |  
	| 
		
			| Re: [RFC][PATCH 2/7] RSS controller core [message #17702 is a reply to message #17648] | Sun, 11 March 2007 09:08   |  
			| 
				
				
					|  xemul Messages: 248
 Registered: November 2005
 | Senior Member |  |  |  
	| Herbert Poetzl wrote:
> On Tue, Mar 06, 2007 at 02:00:36PM -0800, Andrew Morton wrote:
>> On Tue, 06 Mar 2007 17:55:29 +0300
>> Pavel Emelianov <xemul@sw.ru> wrote:
>>
>>> +struct rss_container {
>>> +	struct res_counter res;
>>> +	struct list_head page_list;
>>> +	struct container_subsys_state css;
>>> +};
>>> +
>>> +struct page_container {
>>> +	struct page *page;
>>> +	struct rss_container *cnt;
>>> +	struct list_head list;
>>> +};
>> ah. This looks good. I'll find a hunk of time to go through this work
>> and through Paul's patches. It'd be good to get both patchsets lined
>> up in -mm within a couple of weeks. But..
> 
> doesn't look so good for me, mainly becaus of the 
> additional per page data and per page processing
> 
> on 4GB memory, with 100 guests, 50% shared for each
> guest, this basically means ~1mio pages, 500k shared
> and 1500k x sizeof(page_container) entries, which
> roughly boils down to ~25MB of wasted memory ...
> 
> increase the amount of shared pages and it starts
> getting worse, but maybe I'm missing something here
You are. Each page has only one page_container associated
with it regardless of the number of containers it is shared
between.
>> We need to decide whether we want to do per-container memory
>> limitation via these data structures, or whether we do it via a
>> physical scan of some software zone, possibly based on Mel's patches.
> 
> why not do simple page accounting (as done currently
> in Linux) and use that for the limits, without
> keeping the reference from container to page?
As I've already answered in my previous letter, simple
limiting w/o per-container reclamation and a per-container
OOM killer isn't good memory management. It doesn't allow
resource shortage to be handled gracefully.
This patchset provides a more graceful way to handle this.
Full memory management includes accounting of VMA length
as well (returning ENOMEM from the system call), but we've decided
to start with RSS.
> best,
> Herbert
> 
>> _______________________________________________
>> Containers mailing list
>> Containers@lists.osdl.org
>> https://lists.osdl.org/mailman/listinfo/containers
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers |  
	|  |  | 
 
 
 Current Time: Sun Oct 26 14:11:53 GMT 2025 
 Total time taken to generate the page: 0.10498 seconds |