OpenVZ Forum


Home » Mailing lists » Devel » [RFC][PATCH 0/7] Resource controllers based on process containers
[RFC][PATCH 0/7] Resource controllers based on process containers [message #10888] Tue, 06 March 2007 14:42 Go to next message
xemul is currently offline  xemul
Messages: 248
Registered: November 2005
Senior Member
This patchset adds RSS, accounting and control and
limiting the number of tasks and files within container.

Based on top of Paul Menage's container subsystem v7

RSS controller includes per-container RSS accounter,
reclamation and OOM killer. It behaves like standalone
machine - when container runs out of resources it tries
to reclaim some pages and if it doesn't succeed in it
kills some task which mm_struct belongs to container in
question.

Num tasks and files containers are very simple and
self-descriptive from code.

As discussed before when a task moves from one container
to another no resources follow it - they keep holding the
container they were allocated in.

The difficulties met during using of Pauls' containers were:

1. Container fork hook is placed before new task
changes. This makes impossible of handling fork
properly. I.e. new mm_struct should have pointer
to RSS container, but we don't have one at that
early time.

2. Extended containers may register themselves too late.
Kernel threads/helpers start forking, opening files
and touching pages much earlier. This patchset
workarounds this in not-so-cute manner and I'm waiting
for Paul's comments on this issue.
[RFC][PATCH 1/7] Resource counters [message #10889 is a reply to message #10888] Tue, 06 March 2007 14:47 Go to previous messageGo to next message
xemul is currently offline  xemul
Messages: 248
Registered: November 2005
Senior Member
Introduce generic structures and routines for
resource accounting.

Each resource accounting container is supposed to
aggregate it, container_subsystem_state and its
resource-specific members within.

diff -upr linux-2.6.20.orig/include/linux/res_counter.h linux-2.6.20-0/include/linux/res_counter.h
--- linux-2.6.20.orig/include/linux/res_counter.h 2007-03-06 13:39:17.000000000 +0300
+++ linux-2.6.20-0/include/linux/res_counter.h 2007-03-06 13:33:28.000000000 +0300
@@ -0,0 +1,83 @@
+#ifndef __RES_COUNTER_H__
+#define __RES_COUNTER_H__
+/*
+ * resource counters
+ *
+ * Copyright 2007 OpenVZ SWsoft Inc
+ *
+ * Author: Pavel Emelianov <xemul@openvz.org>
+ *
+ */
+
+#include <linux/container.h>
+
+struct res_counter {
+ unsigned long usage;
+ unsigned long limit;
+ unsigned long failcnt;
+ spinlock_t lock;
+};
+
+enum {
+ RES_USAGE,
+ RES_LIMIT,
+ RES_FAILCNT,
+};
+
+ssize_t res_counter_read(struct res_counter *cnt, int member,
+ const char __user *buf, size_t nbytes, loff_t *pos);
+ssize_t res_counter_write(struct res_counter *cnt, int member,
+ const char __user *buf, size_t nbytes, loff_t *pos);
+
+static inline void res_counter_init(struct res_counter *cnt)
+{
+ spin_lock_init(&cnt->lock);
+ cnt->limit = (unsigned long)LONG_MAX;
+}
+
+static inline int res_counter_charge_locked(struct res_counter *cnt,
+ unsigned long val)
+{
+ if (cnt->usage <= cnt->limit - val) {
+ cnt->usage += val;
+ return 0;
+ }
+
+ cnt->failcnt++;
+ return -ENOMEM;
+}
+
+static inline int res_counter_charge(struct res_counter *cnt,
+ unsigned long val)
+{
+ int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cnt->lock, flags);
+ ret = res_counter_charge_locked(cnt, val);
+ spin_unlock_irqrestore(&cnt->lock, flags);
+ return ret;
+}
+
+static inline void res_counter_uncharge_locked(struct res_counter *cnt,
+ unsigned long val)
+{
+ if (unlikely(cnt->usage < val)) {
+ WARN_ON(1);
+ val = cnt->usage;
+ }
+
+ cnt->usage -= val;
+}
+
+static inline void res_counter_uncharge(struct res_counter *cnt,
+ unsigned long val)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cnt->lock, flags);
+ res_counter_uncharge_locked(cnt, val);
+ spin_unlock_irqrestore(&cnt->lock, flags);
+}
+
+#endif
diff -upr linux-2.6.20.orig/init/Kconfig linux-2.6.20-0/init/Kconfig
--- linux-2.6.20.orig/init/Kconfig 2007-03-06 13:33:28.000000000 +0300
+++ linux-2.6.20-0/init/Kconfig 2007-03-06 13:33:28.000000000 +0300
@@ -265,6 +265,10 @@ config CPUSETS

Say N if unsure.

+config RESOURCE_COUNTERS
+ bool
+ select CONTAINERS
+
config SYSFS_DEPRECATED
bool "Create deprecated sysfs files"
default y
diff -upr linux-2.6.20.orig/kernel/Makefile linux-2.6.20-0/kernel/Makefile
--- linux-2.6.20.orig/kernel/Makefile 2007-03-06 13:33:28.000000000 +0300
+++ linux-2.6.20-0/kernel/Makefile 2007-03-06 13:33:28.000000000 +0300
@@ -51,6 +51,7 @@ obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
+obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o

ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff -upr linux-2.6.20.orig/kernel/res_counter.c linux-2.6.20-0/kernel/res_counter.c
--- linux-2.6.20.orig/kernel/res_counter.c 2007-03-06 13:39:17.000000000 +0300
+++ linux-2.6.20-0/kernel/res_counter.c 2007-03-06 13:33:28.000000000 +0300
@@ -0,0 +1,72 @@
+/*
+ * resource containers
+ *
+ * Copyright 2007 OpenVZ SWsoft Inc
+ *
+ * Author: Pavel Emelianov <xemul@openvz.org>
+ *
+ */
+
+#include <linux/parser.h>
+#include <linux/fs.h>
+#include <linux/res_counter.h>
+#include <asm/uaccess.h>
+
+static inline unsigned long *res_counter_member(struct res_counter *cnt, int member)
+{
+ switch (member) {
+ case RES_USAGE:
+ return &cnt->usage;
+ case RES_LIMIT:
+ return &cnt->limit;
+ case RES_FAILCNT:
+ return &cnt->failcnt;
+ };
+
+ BUG();
+ return NULL;
+}
+
+ssize_t res_counter_read(struct res_counter *cnt, int member,
+ const char __user *userbuf, size_t nbytes, loff_t *pos)
+{
+ unsigned long *val;
+ char buf[64], *s;
+
+ s = buf;
+ val = res_counter_member(cnt, member);
+ s += sprintf(s, "%lu\n", *val);
+ return simple_read_from_buffer((void __user *)userbuf, nbytes,
+ pos, buf, s - buf);
+}
+
+ssize_t res_counter_write(struct res_counter *cnt, int member,
+ const char __user *userbuf, size_t nbytes, loff_t *pos)
+{
+ int ret;
+ char *buf, *end;
+ unsigned long tmp, *val;
+
+ buf = kmalloc(nbytes + 1, GFP_KERNEL);
+ ret = -ENOMEM;
+ if (buf == NULL)
+ goto out;
+
+ buf[nbytes] = 0;
+ ret = -EFAULT;
+ if (copy_from_user(buf, userbuf, nbytes))
+ goto out_free;
+
+ ret = -EINVAL;
+ tmp = simple_strtoul(buf, &end, 10);
+ if (*end != '\0')
+ goto out_free;
+
+ val = res_counter_member(cnt, member);
+ *val = tmp;
+ ret = nbytes;
+out_free:
+ kfree(buf);
+out:
+ return ret;
+}
[RFC][PATCH 2/7] RSS controller core [message #10890 is a reply to message #10888] Tue, 06 March 2007 14:53 Go to previous messageGo to next message
xemul is currently offline  xemul
Messages: 248
Registered: November 2005
Senior Member
This includes setup of RSS container within generic
process containers, all the declarations used in RSS
accounting, and core code responsible for accounting.

diff -upr linux-2.6.20.orig/include/linux/rss_container.h linux-2.6.20-0/include/linux/rss_container.h
--- linux-2.6.20.orig/include/linux/rss_container.h 2007-03-06 13:39:17.000000000 +0300
+++ linux-2.6.20-0/include/linux/rss_container.h 2007-03-06 13:33:28.000000000 +0300
@@ -0,0 +1,68 @@
+#ifndef __RSS_CONTAINER_H__
+#define __RSS_CONTAINER_H__
+/*
+ * RSS container
+ *
+ * Copyright 2007 OpenVZ SWsoft Inc
+ *
+ * Author: Pavel Emelianov <xemul@openvz.org>
+ *
+ */
+
+struct page_container;
+struct rss_container;
+
+#ifdef CONFIG_RSS_CONTAINER
+int container_rss_prepare(struct page *, struct vm_area_struct *vma,
+ struct page_container **);
+
+void container_rss_add(struct page_container *);
+void container_rss_del(struct page_container *);
+void container_rss_release(struct page_container *);
+
+int mm_init_container(struct mm_struct *mm, struct task_struct *tsk);
+void mm_free_container(struct mm_struct *mm);
+
+unsigned long container_isolate_pages(unsigned long nr_to_scan,
+ struct rss_container *rss, struct list_head *dst,
+ int active, unsigned long *scanned);
+unsigned long container_nr_physpages(struct rss_container *rss);
+
+unsigned long container_try_to_free_pages(struct rss_container *);
+void container_out_of_memory(struct rss_container *);
+
+void container_rss_init_early(void);
+#else
+static inline int container_rss_prepare(struct page *pg,
+ struct vm_area_struct *vma, struct page_container **pc)
+{
+ *pc = NULL; /* to make gcc happy */
+ return 0;
+}
+
+static inline void container_rss_add(struct page_container *pc)
+{
+}
+
+static inline void container_rss_del(struct page_container *pc)
+{
+}
+
+static inline void container_rss_release(struct page_container *pc)
+{
+}
+
+static inline int mm_init_container(struct mm_struct *mm, struct task_struct *t)
+{
+ return 0;
+}
+
+static inline void mm_free_container(struct mm_struct *mm)
+{
+}
+
+static inline void container_rss_init_early(void)
+{
+}
+#endif
+#endif
diff -upr linux-2.6.20.orig/init/Kconfig linux-2.6.20-0/init/Kconfig
--- linux-2.6.20.orig/init/Kconfig 2007-03-06 13:33:28.000000000 +0300
+++ linux-2.6.20-0/init/Kconfig 2007-03-06 13:33:28.000000000 +0300
@@ -265,6 +265,13 @@ config CPUSETS
bool
select CONTAINERS

+config RSS_CONTAINER
+ bool "RSS accounting container"
+ select RESOURCE_COUNTERS
+ help
+ Provides a simple Resource Controller for monitoring and
+ controlling the total Resident Set Size of the tasks in a container
+
config SYSFS_DEPRECATED
bool "Create deprecated sysfs files"
default y
diff -upr linux-2.6.20.orig/mm/Makefile linux-2.6.20-0/mm/Makefile
--- linux-2.6.20.orig/mm/Makefile 2007-02-04 21:44:54.000000000 +0300
+++ linux-2.6.20-0/mm/Makefile 2007-03-06 13:33:28.000000000 +0300
@@ -29,3 +29,5 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_h
obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
+
+obj-$(CONFIG_RSS_CONTAINER) += rss_container.o
diff -upr linux-2.6.20.orig/mm/rss_container.c linux-2.6.20-0/mm/rss_container.c
--- linux-2.6.20.orig/mm/rss_container.c 2007-03-06 13:39:17.000000000 +0300
+++ linux-2.6.20-0/mm/rss_container.c 2007-03-06 13:33:28.000000000 +0300
@@ -0,0 +1,307 @@
+/*
+ * RSS accounting container
+ *
+ * Copyright 2007 OpenVZ SWsoft Inc
+ *
+ * Author: Pavel Emelianov <xemul@openvz.org>
+ *
+ */
+
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/res_counter.h>
+#include <linux/rss_container.h>
+
+static struct container_subsys rss_subsys;
+
+struct rss_container {
+ struct res_counter res;
+ struct list_head page_list;
+ struct container_subsys_state css;
+};
+
+struct page_container {
+ struct page *page;
+ struct rss_container *cnt;
+ struct list_head list;
+};
+
+static inline struct rss_container *rss_from_cont(struct container *cnt)
+{
+ return container_of(container_subsys_state(cnt, &rss_subsys),
+ struct rss_container, css);
+}
+
+int mm_init_container(struct mm_struct *mm, struct task_struct *tsk)
+{
+ struct rss_container *cnt;
+
+ cnt = rss_from_cont(task_container(tsk, &rss_subsys));
+ if (css_get(&cnt->css))
+ return -EBUSY;
+
+ mm->rss_container = cnt;
+ return 0;
+}
+
+void mm_free_container(struct mm_struct *mm)
+{
+ css_put(&mm->rss_container->css);
+}
+
+int container_rss_prepare(struct page *page, struct vm_area_struct *vma,
+ struct page_container **ppc)
+{
+ struct rss_container *rss;
+ struct page_container *pc;
+
+ rcu_read_lock();
+ rss = rcu_dereference(vma->vm_mm->rss_container);
+ css_get_current(&rss->css);
+ rcu_read_unlock();
+
+ pc = kmalloc(sizeof(struct page_container), GFP_KERNEL);
+ if (pc == NULL)
+ goto out_nomem;
+
+ while (res_counter_charge(&rss->res, 1)) {
+ if (container_try_to_free_pages(rss))
+ continue;
+
+ container_out_of_memory(rss);
+ if (test_thread_flag(TIF_MEMDIE))
+ goto out_charge;
+ }
+
+ pc->page = page;
+ pc->cnt = rss;
+ *ppc = pc;
+ return 0;
+
+out_charge:
+ kfree(pc);
+out_nomem:
+ css_put(&rss->css);
+ return -ENOMEM;
+}
+
+void container_rss_release(struct page_container *pc)
+{
+ struct rss_container *rss;
+
+ rss = pc->cnt;
+ res_counter_uncharge(&rss->res, 1);
+ css_put(&rss->css);
+ kfree(pc);
+}
+
+void container_rss_add(struct page_container *pc)
+{
+ struct page *pg;
+ struct rss_container *rss;
+
+ pg = pc->page;
+ rss = pc->cnt;
+
+ spin_lock(&rss->res.lock);
+ list_add(&pc->list, &rss->page_list);
+ spin_unlock(&rss->res.lock);
+
+ page_container(pg) = pc;
+}
+
+void container_rss_del(struct page_container *pc)
+{
+ struct page *page;
+ struct rss_container *rss;
+
+ page = pc->page;
+ rss = pc->cnt;
+
+ spin_lock(&rss->res.lock);
+ list_del(&pc->list);
+ res_counter_uncharge_locked(&rss->res, 1);
+ spin_unlock(&rss->res.lock);
+
+ css_put(&rss->css);
+ kfree(pc);
+}
+
+unsigned long container_isolate_pages(unsigned long nr_to_scan,
+ struct rss_container *rss, struct list_head *dst,
+ int active, unsigned long *scanned)
+{
+ unsigned long nr_taken = 0;
+ struct page *page;
+ struct page_container *pc;
+ unsigned long scan;
+ struct list_head *src;
+ LIST_HEAD(pc_list);
+ struct zone *z;
+
+ spin_lock_irq(&rss->res.lock);
+ src = &rss->page_list;
+
+ for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+ pc = list_entry(src->prev, struct page_container, list);
+ page = pc->page;
+ z = page_zone(page);
+
+ list_move(&pc->list, &pc_list);
+
+ spin_lock(&z->lru_lock);
+ if (PageLRU(page)) {
+ if ((active && PageActive(page)) ||
+ (!active && !PageActive(page))) {
+ if (likely(get_page_unless_zero(page))) {
+ ClearPageLRU(page);
+ nr_taken++;
+ list_move(&page->lru, dst);
+ }
+ }
+ }
+ spin_unlock(&z->lru_lock);
+ }
+
+ list_splice(&pc_list, src);
+ spin_unlock_irq(&rss->res.lock);
+
+ *scanned = scan;
+ return nr_taken;
+}
+
+unsigned long container_nr_physpages(struct rss_container *rss)
+{
+ return rss->res.usage;
+}
+
+static void rss_move_task(struct container_subsys *ss,
+ struct container *cont,
+ struct container *old_cont,
+ struct task_struct *p)
+{
+ struct mm_struct *mm;
+ struct rss_container *rss, *old_rss;
+
+ mm = get_task_mm(p);
+ if (mm == NULL)
+ goto out;
+
+ rss = rss_from_cont(cont);
+ old_rss = rss_from_cont(old_cont);
+ if (old_rss != mm->rss_container)
+ goto out_put;
+
+ css_get_current(&rss->css);
+ rcu_assign_pointer(mm->rss_container, rss);
+ css_put(&old_rss->css);
+
+out_put:
+ mmput(mm);
+out:
+ return;
+}
+
+static int rss_create(struct container_subsys *ss, struct container *cont)
+{
+ struct rss_container *rss;
+
+ rss = kzalloc(sizeof(struct rss_container), GFP_KERNEL);
+ if (rss == NULL)
+ return -ENOMEM;
+
+ res_counter_init(&rss->res);
+ INIT_LIST_HEAD(&rss->page_list);
+ cont->subsys[rss_subsys.subsys_id] = &rss->css;
+ return 0;
+}
+
+static void rss_destroy(struct container_subsys *ss,
+ struct container *cont)
+{
+ kfree(rss_from_cont(cont));
+}
+
+
+static ssize_t rss_read(struct container *cont, struct cftype *cft,
+ struct file *file, char __user *userbuf,
+ size_t nbytes, loff_t *ppos)
+{
+ return res_counter_read(&rss_from_cont(cont)->res, cft->private,
+ userbuf, nbytes, ppos);
+}
+
+static ssize_t rss_write(struct container *cont, struct cftype *cft,
+ struct file *file, const char __user *userbuf,
+ size_t nbytes, loff_t *ppos)
+{
+ return res_counter_write(&rss_from_cont(cont)->res, cft->private,
+ userbuf, nbytes, ppos);
+}
+
+
+static struct cftype rss_usage = {
+ .name = "rss_usage",
+ .private = RES_USAGE,
+ .read = rss_read,
+};
+
+static struct cftype rss_limit = {
+ .name = "rss_limit",
+ .private = RES_LIMIT,
+ .read = rss_read,
+ .write = rss_write,
+};
+
+static struct cftype rss_failcnt = {
+ .name = "rss_failcnt",
+ .private = RES_FAILCNT,
+ .read = rss_read,
+};
+
+static int rss_populate(struct container_subsys *ss,
+ struct container *cont)
+{
+ int rc;
+
+ if ((rc = container_add_file(cont, &rss_usage)) < 0)
+ return rc;
+ if ((rc = container_add_file(cont, &rss_failcnt)) < 0)
+ return rc;
+ if ((rc = container_add_file(cont, &rss_limit)) < 0)
+ return rc;
+
+ return 0;
+}
+
+static struct rss_container init_rss_container;
+
+static __init int rss_create_early(struct container_subsys *ss,
+ struct container *cont)
+{
+ struct rss_container *rss;
+
+ rss = &init_rss_container;
+ res_counter_init(&rss->res);
+ INIT_LIST_HEAD(&rss->page_list);
+ cont->subsys[rss_
...

[RFC][PATCH 3/7] Data structures changes for RSS accounting [message #10891 is a reply to message #10888] Tue, 06 March 2007 14:55 Go to previous messageGo to next message
xemul is currently offline  xemul
Messages: 248
Registered: November 2005
Senior Member
Adds needed pointers to mm_struct and page struct,
places hooks to core code for mm_struct initialization
and hooks in container_init_early() to preinitialize
RSS accounting subsystem.

diff -upr linux-2.6.20.orig/include/linux/mm.h linux-2.6.20-0/include/linux/mm.h
--- linux-2.6.20.orig/include/linux/mm.h 2007-02-04 21:44:54.000000000 +0300
+++ linux-2.6.20-0/include/linux/mm.h 2007-03-06 13:33:28.000000000 +0300
@@ -220,6 +220,12 @@ struct vm_operations_struct {
struct mmu_gather;
struct inode;

+#ifdef CONFIG_RSS_CONTAINER
+#define page_container(page) (page->rss_container)
+#else
+#define page_container(page) (NULL)
+#endif
+
#define page_private(page) ((page)->private)
#define set_page_private(page, v) ((page)->private = (v))

diff -upr linux-2.6.20.orig/include/linux/mm_types.h linux-2.6.20-0/include/linux/mm_types.h
--- linux-2.6.20.orig/include/linux/mm_types.h 2007-02-04 21:44:54.000000000 +0300
+++ linux-2.6.20-0/include/linux/mm_types.h 2007-03-06 13:33:28.000000000 +0300
@@ -62,6 +62,9 @@ struct page {
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
+#ifdef CONFIG_RSS_CONTAINER
+ struct page_container *rss_container;
+#endif
};

#endif /* _LINUX_MM_TYPES_H */
diff -upr linux-2.6.20.orig/include/linux/sched.h linux-2.6.20-0/include/linux/sched.h
--- linux-2.6.20.orig/include/linux/sched.h 2007-03-06 13:33:28.000000000 +0300
+++ linux-2.6.20-0/include/linux/sched.h 2007-03-06 13:33:28.000000000 +0300
@@ -373,6 +373,9 @@ struct mm_struct {
/* aio bits */
rwlock_t ioctx_list_lock;
struct kioctx *ioctx_list;
+#ifdef CONFIG_RSS_CONTAINER
+ struct rss_container *rss_container;
+#endif
};

struct sighand_struct {
diff -upr linux-2.6.20.orig/kernel/fork.c linux-2.6.20-0/kernel/fork.c
--- linux-2.6.20.orig/kernel/fork.c 2007-03-06 13:33:28.000000000 +0300
+++ linux-2.6.20-0/kernel/fork.c 2007-03-06 13:33:28.000000000 +0300
@@ -57,6 +57,8 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

+#include <linux/rss_container.h>
+
/*
* Protected counters by write_lock_irq(&tasklist_lock)
*/
@@ -325,7 +328,7 @@ static inline void mm_free_pgd(struct mm

#include <linux/init_task.h>

-static struct mm_struct * mm_init(struct mm_struct * mm)
+static struct mm_struct * mm_init(struct mm_struct *mm, struct task_struct *tsk)
{
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
@@ -341,10 +344,18 @@ static struct mm_struct * mm_init(struct
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;

- if (likely(!mm_alloc_pgd(mm))) {
- mm->def_flags = 0;
- return mm;
- }
+ if (unlikely(mm_init_container(mm, tsk)))
+ goto out_cont;
+
+ if (unlikely(mm_alloc_pgd(mm)))
+ goto out_pgd;
+
+ mm->def_flags = 0;
+ return mm;
+
+out_pgd:
+ mm_free_container(mm);
+out_cont:
free_mm(mm);
return NULL;
}
@@ -359,7 +370,7 @@ struct mm_struct * mm_alloc(void)
mm = allocate_mm();
if (mm) {
memset(mm, 0, sizeof(*mm));
- mm = mm_init(mm);
+ mm = mm_init(mm, current);
}
return mm;
}
@@ -373,6 +384,7 @@ void fastcall __mmdrop(struct mm_struct
{
BUG_ON(mm == &init_mm);
mm_free_pgd(mm);
+ mm_free_container(mm);
destroy_context(mm);
free_mm(mm);
}
@@ -493,7 +505,7 @@ static struct mm_struct *dup_mm(struct t
mm->token_priority = 0;
mm->last_interval = 0;

- if (!mm_init(mm))
+ if (!mm_init(mm, tsk))
goto fail_nomem;

if (init_new_context(tsk, mm))
@@ -520,6 +532,7 @@ fail_nocontext:
* because it calls destroy_context()
*/
mm_free_pgd(mm);
+ mm_free_container(mm);
free_mm(mm);
return NULL;
}
diff -upr linux-2.6.20.orig/kernel/container.c linux-2.6.20-0/kernel/container.c
--- linux-2.6.20.orig/kernel/container.c 2007-03-06 13:33:28.000000000 +0300
+++ linux-2.6.20-0/kernel/container.c 2007-03-06 13:35:48.000000000 +0300
@@ -60,6 +60,8 @@
#include <asm/atomic.h>
#include <linux/mutex.h>

+#include <linux/rss_container.h>
+
#define CONTAINER_SUPER_MAGIC 0x27e0eb

static struct container_subsys *subsys[CONFIG_MAX_CONTAINER_SUBSYS];
@@ -1721,6 +1725,8 @@ int __init container_init_early(void)
}
init_task.containers = &init_container_group;

+ container_rss_init_early();
+
return 0;
}
[RFC][PATCH 4/7] RSS accounting hooks over the code [message #10892 is a reply to message #10888] Tue, 06 March 2007 14:57 Go to previous messageGo to next message
xemul is currently offline  xemul
Messages: 248
Registered: November 2005
Senior Member
Pages are charged to their first touchers which are
determined using pages' mapcount manipulations in
rmap calls.

diff -upr linux-2.6.20.orig/fs/exec.c linux-2.6.20-0/fs/exec.c
--- linux-2.6.20.orig/fs/exec.c 2007-02-04 21:44:54.000000000 +0300
+++ linux-2.6.20-0/fs/exec.c 2007-03-06 13:33:28.000000000 +0300
@@ -58,6 +58,8 @@
#include <linux/kmod.h>
#endif

+#include <linux/rss_container.h>
+
int core_uses_pid;
char core_pattern[128] = "core";
int suid_dumpable = 0;
@@ -309,27 +311,34 @@ void install_arg_page(struct vm_area_str
struct mm_struct *mm = vma->vm_mm;
pte_t * pte;
spinlock_t *ptl;
+ struct page_container *pcont;

if (unlikely(anon_vma_prepare(vma)))
goto out;

+ if (container_rss_prepare(page, vma, &pcont))
+ goto out;
+
flush_dcache_page(page);
pte = get_locked_pte(mm, address, &ptl);
if (!pte)
- goto out;
+ goto out_release;
if (!pte_none(*pte)) {
pte_unmap_unlock(pte, ptl);
- goto out;
+ goto out_release;
}
inc_mm_counter(mm, anon_rss);
lru_cache_add_active(page);
set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
page, vma->vm_page_prot))));
- page_add_new_anon_rmap(page, vma, address);
+ page_add_new_anon_rmap(page, vma, address, pcont);
pte_unmap_unlock(pte, ptl);

/* no need for flush_tlb */
return;
+
+out_release:
+ container_rss_release(pcont);
out:
__free_page(page);
force_sig(SIGKILL, current);
diff -upr linux-2.6.20.orig/include/linux/rmap.h linux-2.6.20-0/include/linux/rmap.h
--- linux-2.6.20.orig/include/linux/rmap.h 2007-02-04 21:44:54.000000000 +0300
+++ linux-2.6.20-0/include/linux/rmap.h 2007-03-06 13:33:28.000000000 +0300
@@ -69,9 +69,13 @@ void __anon_vma_link(struct vm_area_stru
/*
* rmap interfaces called when adding or removing pte of page
*/
-void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
-void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
-void page_add_file_rmap(struct page *);
+struct page_container;
+
+void page_add_anon_rmap(struct page *, struct vm_area_struct *,
+ unsigned long, struct page_container *);
+void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
+ unsigned long, struct page_container *);
+void page_add_file_rmap(struct page *, struct page_container *);
void page_remove_rmap(struct page *, struct vm_area_struct *);

/**
diff -upr linux-2.6.20.orig/mm/fremap.c linux-2.6.20-0/mm/fremap.c
--- linux-2.6.20.orig/mm/fremap.c 2007-02-04 21:44:54.000000000 +0300
+++ linux-2.6.20-0/mm/fremap.c 2007-03-06 13:33:28.000000000 +0300
@@ -20,6 +20,8 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

+#include <linux/rss_container.h>
+
static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
@@ -57,6 +59,10 @@ int install_page(struct mm_struct *mm, s
pte_t *pte;
pte_t pte_val;
spinlock_t *ptl;
+ struct page_container *pcont;
+
+ if (container_rss_prepare(page, vma, &pcont))
+ goto out_release;

pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
@@ -81,13 +87,16 @@ int install_page(struct mm_struct *mm, s
flush_icache_page(vma, page);
pte_val = mk_pte(page, prot);
set_pte_at(mm, addr, pte, pte_val);
- page_add_file_rmap(page);
+ page_add_file_rmap(page, pcont);
update_mmu_cache(vma, addr, pte_val);
lazy_mmu_prot_update(pte_val);
err = 0;
unlock:
pte_unmap_unlock(pte, ptl);
out:
+ if (err != 0)
+ container_rss_release(pcont);
+out_release:
return err;
}
EXPORT_SYMBOL(install_page);
diff -upr linux-2.6.20.orig/mm/memory.c linux-2.6.20-0/mm/memory.c
--- linux-2.6.20.orig/mm/memory.c 2007-02-04 21:44:54.000000000 +0300
+++ linux-2.6.20-0/mm/memory.c 2007-03-06 13:33:28.000000000 +0300
@@ -60,6 +60,8 @@
#include <linux/swapops.h>
#include <linux/elf.h>

+#include <linux/rss_container.h>
+
#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
@@ -1126,7 +1128,7 @@ static int zeromap_pte_range(struct mm_s
break;
}
page_cache_get(page);
- page_add_file_rmap(page);
+ page_add_file_rmap(page, NULL);
inc_mm_counter(mm, file_rss);
set_pte_at(mm, addr, pte, zero_pte);
} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -1234,7 +1236,7 @@ static int insert_page(struct mm_struct
/* Ok, finally just insert the thing.. */
get_page(page);
inc_mm_counter(mm, file_rss);
- page_add_file_rmap(page);
+ page_add_file_rmap(page, NULL);
set_pte_at(mm, addr, pte, mk_pte(page, prot));

retval = 0;
@@ -1495,6 +1497,7 @@ static int do_wp_page(struct mm_struct *
pte_t entry;
int reuse = 0, ret = VM_FAULT_MINOR;
struct page *dirty_page = NULL;
+ struct page_container *pcont;

old_page = vm_normal_page(vma, address, orig_pte);
if (!old_page)
@@ -1580,6 +1583,9 @@ gotten:
cow_user_page(new_page, old_page, address, vma);
}

+ if (container_rss_prepare(new_page, vma, &pcont))
+ goto oom;
+
/*
* Re-check the pte - we dropped the lock
*/
@@ -1607,12 +1613,14 @@ gotten:
set_pte_at(mm, address, page_table, entry);
update_mmu_cache(vma, address, entry);
lru_cache_add_active(new_page);
- page_add_new_anon_rmap(new_page, vma, address);
+ page_add_new_anon_rmap(new_page, vma, address, pcont);

/* Free the old page.. */
new_page = old_page;
ret |= VM_FAULT_WRITE;
- }
+ } else
+ container_rss_release(pcont);
+
if (new_page)
page_cache_release(new_page);
if (old_page)
@@ -1988,6 +1996,7 @@ static int do_swap_page(struct mm_struct
swp_entry_t entry;
pte_t pte;
int ret = VM_FAULT_MINOR;
+ struct page_container *pcont;

if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
goto out;
@@ -2020,6 +2029,11 @@ static int do_swap_page(struct mm_struct
count_vm_event(PGMAJFAULT);
}

+ if (container_rss_prepare(page, vma, &pcont)) {
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
+
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
mark_page_accessed(page);
lock_page(page);
@@ -2033,6 +2047,7 @@ static int do_swap_page(struct mm_struct

if (unlikely(!PageUptodate(page))) {
ret = VM_FAULT_SIGBUS;
+ container_rss_release(pcont);
goto out_nomap;
}

@@ -2047,7 +2062,7 @@ static int do_swap_page(struct mm_struct

flush_icache_page(vma, page);
set_pte_at(mm, address, page_table, pte);
- page_add_anon_rmap(page, vma, address);
+ page_add_anon_rmap(page, vma, address, pcont);

swap_free(entry);
if (vm_swap_full())
@@ -2069,6 +2084,7 @@ unlock:
out:
return ret;
out_nomap:
+ container_rss_release(pcont);
pte_unmap_unlock(page_table, ptl);
unlock_page(page);
page_cache_release(page);
@@ -2087,6 +2103,7 @@ static int do_anonymous_page(struct mm_s
struct page *page;
spinlock_t *ptl;
pte_t entry;
+ struct page_container *pcont;

if (write_access) {
/* Allocate our own private page. */
@@ -2098,15 +2115,19 @@ static int do_anonymous_page(struct mm_s
if (!page)
goto oom;

+ if (container_rss_prepare(page, vma, &pcont))
+ goto oom_release;
+
entry = mk_pte(page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);

page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
- goto release;
+ goto release_container;
+
inc_mm_counter(mm, anon_rss);
lru_cache_add_active(page);
- page_add_new_anon_rmap(page, vma, address);
+ page_add_new_anon_rmap(page, vma, address, pcont);
} else {
/* Map the ZERO_PAGE - vm_page_prot is readonly */
page = ZERO_PAGE(address);
@@ -2118,7 +2139,7 @@ static int do_anonymous_page(struct mm_s
if (!pte_none(*page_table))
goto release;
inc_mm_counter(mm, file_rss);
- page_add_file_rmap(page);
+ page_add_file_rmap(page, NULL);
}

set_pte_at(mm, address, page_table, entry);
@@ -2129,9 +2150,14 @@ static int do_anonymous_page(struct mm_s
unlock:
pte_unmap_unlock(page_table, ptl);
return VM_FAULT_MINOR;
+release_container:
+ container_rss_release(pcont);
release:
page_cache_release(page);
goto unlock;
+
+oom_release:
+ page_cache_release(page);
oom:
return VM_FAULT_OOM;
}
@@ -2161,6 +2187,7 @@ static int do_no_page(struct mm_struct *
int ret = VM_FAULT_MINOR;
int anon = 0;
struct page *dirty_page = NULL;
+ struct page_container *pcont;

pte_unmap(page_table);
BUG_ON(vma->vm_flags & VM_PFNMAP);
@@ -2218,6 +2245,9 @@ retry:
}
}

+ if (container_rss_prepare(new_page, vma, &pcont))
+ goto oom;
+
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
/*
* For a file-backed vma, someone could have truncated or otherwise
@@ -2226,6 +2256,7 @@ retry:
*/
if (mapping && unlikely(sequence != mapping->truncate_count)) {
pte_unmap_unlock(page_table, ptl);
+ container_rss_release(pcont);
page_cache_release(new_page);
cond_resched();
sequence = mapping->truncate_count;
@@ -2253,10 +2284,10 @@ retry:
if (anon) {
inc_mm_counter(mm, anon_rss);
lru_cache_add_active(new_page);
- page_add_new_anon_rmap(new_page, vma, address);
+ page_add_new_anon_rmap(new_page, vma, address, pcont);
} else {
inc_mm_counter(mm, file_rss);
- page_add_file_rmap(new_page);
+ page_add_file_rmap(new_page, pcont);
if (write_access) {
dirty_page = new_page;
get_page(dirty_page);
@@ -2264,6 +2295,7 @@ retry:
}
} else {
/* One of our sibling threads was faster, back out. */
+ container_rss_release(pcont);
page_cache_release(new_page);
goto unlock;
}
diff -upr linux-2.6.20.orig/mm/migrate.c linux-2.6.20-0/mm/migrate.c
--- linux-2.6.20.orig/mm/migrate.c 2007-02-04 21:44:54.000000000 +0300
+++ linux-2.6.20-0/mm/migrate.c 2007-03-06 13:33:28.000000000 +0300
@@ -134,6 +134,7 @@ static void remove_migration_pte(struct
pte_t *ptep, pte;
spinlock_t *ptl;
unsigned long addr = page_address_in_vma(new, vma);
+ struct page_container *pcont;

if (addr == -EFAULT)
return;
@@
...

[RFC][PATCH 5/7] Per-container OOM killer and page reclamation [message #10893 is a reply to message #10888] Tue, 06 March 2007 15:01 Go to previous messageGo to next message
xemul is currently offline  xemul
Messages: 248
Registered: November 2005
Senior Member
* container_try_to_free_pages() walks containers
page list and tries to shrink pages. This is based
on try_to_free_pages() and Co code.
Called from core code when no resource left at the
moment of page touching.

* container_out_of_memory() selects a process to be
killed which mm_struct belongs to container in question.
Called from core code when no resources left and no
pages were reclaimed.

diff -upr linux-2.6.20.orig/mm/oom_kill.c linux-2.6.20-0/mm/oom_kill.c
--- linux-2.6.20.orig/mm/oom_kill.c 2007-03-06 13:33:28.000000000 +0300
+++ linux-2.6.20-0/mm/oom_kill.c 2007-03-06 13:33:28.000000000 +0300
@@ -24,6 +24,7 @@
#include <linux/cpuset.h>
#include <linux/module.h>
#include <linux/notifier.h>
+#include <linux/rss_container.h>

int sysctl_panic_on_oom;
/* #define DEBUG */
@@ -47,7 +48,8 @@ int sysctl_panic_on_oom;
* of least surprise ... (be careful when you change it)
*/

-unsigned long badness(struct task_struct *p, unsigned long uptime)
+unsigned long badness(struct task_struct *p, unsigned long uptime,
+ struct rss_container *rss)
{
unsigned long points, cpu_time, run_time, s;
struct mm_struct *mm;
@@ -60,6 +62,13 @@ unsigned long badness(struct task_struct
return 0;
}

+#ifdef CONFIG_RSS_CONTAINER
+ if (rss != NULL && mm->rss_container != rss) {
+ task_unlock(p);
+ return 0;
+ }
+#endif
+
/*
* The memory size of the process is the basis for the badness.
*/
@@ -200,7 +209,8 @@ static inline int constrained_alloc(stru
*
* (not docbooked, we don't want this one cluttering up the manual)
*/
-static struct task_struct *select_bad_process(unsigned long *ppoints)
+static struct task_struct *select_bad_process(unsigned long *ppoints,
+ struct rss_container *rss)
{
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
@@ -254,7 +264,7 @@ static struct task_struct *select_bad_pr
if (p->oomkilladj == OOM_DISABLE)
continue;

- points = badness(p, uptime.tv_sec);
+ points = badness(p, uptime.tv_sec, rss);
if (points > *ppoints || !chosen) {
chosen = p;
*ppoints = points;
@@ -435,7 +445,7 @@ retry:
* Rambo mode: Shoot down a process and hope it solves whatever
* issues we may have.
*/
- p = select_bad_process(&points);
+ p = select_bad_process(&points, NULL);

if (PTR_ERR(p) == -1UL)
goto out;
@@ -464,3 +474,27 @@ out:
if (!test_thread_flag(TIF_MEMDIE))
schedule_timeout_uninterruptible(1);
}
+
+#ifdef CONFIG_RSS_CONTAINER
+void container_out_of_memory(struct rss_container *rss)
+{
+ unsigned long points = 0;
+ struct task_struct *p;
+
+ container_lock();
+ read_lock(&tasklist_lock);
+retry:
+ p = select_bad_process(&points, rss);
+ if (PTR_ERR(p) == -1UL)
+ goto out;
+
+ if (!p)
+ p = current;
+
+ if (oom_kill_process(p, points, "Container out of memory"))
+ goto retry;
+out:
+ read_unlock(&tasklist_lock);
+ container_unlock();
+}
+#endif
diff -upr linux-2.6.20.orig/mm/vmscan.c linux-2.6.20-0/mm/vmscan.c
--- linux-2.6.20.orig/mm/vmscan.c 2007-02-04 21:44:54.000000000 +0300
+++ linux-2.6.20-0/mm/vmscan.c 2007-03-06 13:33:28.000000000 +0300
@@ -45,6 +45,8 @@

#include "internal.h"

+#include <linux/rss_container.h>
+
struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
@@ -1097,6 +1099,194 @@ out:
return ret;
}

+#ifdef CONFIG_RSS_CONTAINER
+/*
+ * These are containers' inactive and active pages shrinkers.
+ * Thes works like shrink_inactive_list() and shrink_active_list()
+ *
+ * Two main differences is that container_isolate_pages() is used to isolate
+ * pages, and that reclaim_mapped is considered to be 1 as hitting BC
+ * limit implies we have to shrink _mapped_ pages
+ */
+static unsigned long container_shrink_pages_inactive(unsigned long max_scan,
+ struct rss_container *rss, struct scan_control *sc)
+{
+ LIST_HEAD(page_list);
+ unsigned long nr_scanned = 0;
+ unsigned long nr_reclaimed = 0;
+
+ do {
+ struct page *page;
+ unsigned long nr_taken;
+ unsigned long nr_scan;
+ struct zone *z;
+
+ nr_taken = container_isolate_pages(sc->swap_cluster_max, rss,
+ &page_list, 0, &nr_scan);
+
+ nr_scanned += nr_scan;
+ nr_reclaimed += shrink_page_list(&page_list, sc);
+ if (nr_taken == 0)
+ goto done;
+
+ while (!list_empty(&page_list)) {
+ page = lru_to_page(&page_list);
+ z = page_zone(page);
+
+ spin_lock_irq(&z->lru_lock);
+ VM_BUG_ON(PageLRU(page));
+ SetPageLRU(page);
+ list_del(&page->lru);
+ if (PageActive(page))
+ add_page_to_active_list(z, page);
+ else
+ add_page_to_inactive_list(z, page);
+ spin_unlock_irq(&z->lru_lock);
+
+ put_page(page);
+ }
+ } while (nr_scanned < max_scan);
+done:
+ return nr_reclaimed;
+}
+
+static void container_shrink_pages_active(unsigned long nr_pages,
+ struct rss_container *rss, struct scan_control *sc)
+{
+ LIST_HEAD(l_hold);
+ LIST_HEAD(l_inactive);
+ LIST_HEAD(l_active);
+ struct page *page;
+ unsigned long nr_scanned;
+ unsigned long nr_deactivated = 0;
+ struct zone *z;
+
+ container_isolate_pages(nr_pages, rss, &l_hold, 1, &nr_scanned);
+
+ while (!list_empty(&l_hold)) {
+ cond_resched();
+ page = lru_to_page(&l_hold);
+ list_del(&page->lru);
+ if (page_mapped(page)) {
+ if ((total_swap_pages == 0 && PageAnon(page)) ||
+ page_referenced(page, 0)) {
+ list_add(&page->lru, &l_active);
+ continue;
+ }
+ }
+ nr_deactivated++;
+ list_add(&page->lru, &l_inactive);
+ }
+
+ while (!list_empty(&l_inactive)) {
+ page = lru_to_page(&l_inactive);
+ z = page_zone(page);
+
+ spin_lock_irq(&z->lru_lock);
+ VM_BUG_ON(PageLRU(page));
+ SetPageLRU(page);
+ VM_BUG_ON(!PageActive(page));
+ ClearPageActive(page);
+
+ list_move(&page->lru, &z->inactive_list);
+ z->nr_inactive++;
+ spin_unlock_irq(&z->lru_lock);
+
+ put_page(page);
+ }
+
+ while (!list_empty(&l_active)) {
+ page = lru_to_page(&l_active);
+ z = page_zone(page);
+
+ spin_lock_irq(&z->lru_lock);
+ VM_BUG_ON(PageLRU(page));
+ SetPageLRU(page);
+ VM_BUG_ON(!PageActive(page));
+ list_move(&page->lru, &z->active_list);
+ z->nr_active++;
+ spin_unlock_irq(&z->lru_lock);
+
+ put_page(page);
+ }
+}
+
+/*
+ * This is a reworked shrink_zone() routine - it scans active pages firts,
+ * then inactive and returns the number of pages reclaimed
+ */
+static unsigned long container_shrink_pages(int priority,
+ struct rss_container *rss, struct scan_control *sc)
+{
+ unsigned long nr_pages;
+ unsigned long nr_to_scan;
+ unsigned long nr_reclaimed = 0;
+
+ nr_pages = (container_nr_physpages(rss) >> priority) + 1;
+ if (nr_pages < sc->swap_cluster_max)
+ nr_pages = 0;
+
+ while (nr_pages) {
+ nr_to_scan = min(nr_pages, (unsigned long)sc->swap_cluster_max);
+ nr_pages -= nr_to_scan;
+ container_shrink_pages_active(nr_to_scan, rss, sc);
+ }
+
+ nr_pages = (container_nr_physpages(rss) >> priority) + 1;
+ if (nr_pages < sc->swap_cluster_max)
+ nr_pages = 0;
+
+ while (nr_pages) {
+ nr_to_scan = min(nr_pages, (unsigned long)sc->swap_cluster_max);
+ nr_pages -= nr_to_scan;
+ nr_reclaimed += container_shrink_pages_inactive(nr_to_scan, rss, sc);
+ }
+
+ throttle_vm_writeout();
+ return nr_reclaimed;
+}
+
+/*
+ * This functions works like try_to_free_pages() - it tries
+ * to shrink bc's pages with increasing priority
+ */
+unsigned long container_try_to_free_pages(struct rss_container *rss)
+{
+ int priority;
+ int ret = 0;
+ unsigned long total_scanned = 0;
+ unsigned long nr_reclaimed = 0;
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .may_writepage = !laptop_mode,
+ .swap_cluster_max = SWAP_CLUSTER_MAX,
+ .may_swap = 1,
+ .swappiness = vm_swappiness,
+ };
+
+ for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+ sc.nr_scanned = 0;
+ nr_reclaimed += container_shrink_pages(priority, rss, &sc);
+ total_scanned += sc.nr_scanned;
+ if (nr_reclaimed > 1) {
+ ret = 1;
+ goto out;
+ }
+
+ if (total_scanned > sc.swap_cluster_max +
+ sc.swap_cluster_max / 2) {
+ wakeup_pdflush(laptop_mode ? 0 : total_scanned);
+ sc.may_writepage = 1;
+ }
+
+ if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
+ congestion_wait(WRITE, HZ/10);
+ }
+out:
+ return ret;
+}
+#endif
+
/*
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at pages_high.
...

[RFC][PATCH 6/7] Account for the number of tasks within container [message #10894 is a reply to message #10888] Tue, 06 March 2007 15:02 Go to previous messageGo to next message
xemul is currently offline  xemul
Messages: 248
Registered: November 2005
Senior Member
Small and simple - each fork()/clone() is accounted
and rejected when limit is hit.

diff -upr linux-2.6.20.orig/include/linux/numproc_container.h linux-2.6.20-0/include/linux/numproc_container.h
--- linux-2.6.20.orig/include/linux/numproc_container.h 2007-03-06 13:39:17.000000000 +0300
+++ linux-2.6.20-0/include/linux/numproc_container.h 2007-03-06 13:33:28.000000000 +0300
@@ -0,0 +1,32 @@
+#ifndef __NUMPROC_CONTAINER_H__
+#define __NUMPROC_CONTAINER_H__
+/*
+ * Numproc container
+ *
+ * Copyright 2007 OpenVZ SWsoft Inc
+ *
+ * Author: Pavel Emelianov <xemul@openvz.org>
+ *
+ */
+
+#ifdef CONFIG_PROCESS_CONTAINER
+int container_proc_charge(struct task_struct *tsk);
+void container_proc_uncharge(struct task_struct *tsk);
+
+void container_numproc_init_early(void);
+#else
+static inline int container_proc_charge(struct task_struct *tsk)
+{
+ return 0;
+}
+
+static inline void container_proc_uncharge(struct task_struct *tsk)
+{
+}
+
+static inline void container_numproc_init_early(void)
+{
+}
+#endif
+
+#endif
diff -upr linux-2.6.20.orig/include/linux/sched.h linux-2.6.20-0/include/linux/sched.h
--- linux-2.6.20.orig/include/linux/sched.h 2007-03-06 13:33:28.000000000 +0300
+++ linux-2.6.20-0/include/linux/sched.h 2007-03-06 13:33:28.000000000 +0300
@@ -1052,6 +1055,9 @@ struct task_struct {
#ifdef CONFIG_FAULT_INJECTION
int make_it_fail;
#endif
+#ifdef CONFIG_PROCESS_CONTAINER
+ struct numproc_container *numproc_cnt;
+#endif
};

static inline pid_t process_group(struct task_struct *tsk)
diff -upr linux-2.6.20.orig/init/Kconfig linux-2.6.20-0/init/Kconfig
--- linux-2.6.20.orig/init/Kconfig 2007-03-06 13:33:28.000000000 +0300
+++ linux-2.6.20-0/init/Kconfig 2007-03-06 13:33:28.000000000 +0300
@@ -265,6 +265,12 @@ config CPUSETS
Provides a simple Resource Controller for monitoring and
controlling the total Resident Set Size of the tasks in a container

+config PROCESS_CONTAINER
+ bool "Numproc accounting container"
+ select RESOURCE_COUNTERS
+ help
+ Provides the-number-of-tasks accounting container
+
config SYSFS_DEPRECATED
bool "Create deprecated sysfs files"
default y
diff -upr linux-2.6.20.orig/kernel/Makefile linux-2.6.20-0/kernel/Makefile
--- linux-2.6.20.orig/kernel/Makefile 2007-03-06 13:33:28.000000000 +0300
+++ linux-2.6.20-0/kernel/Makefile 2007-03-06 13:33:28.000000000 +0300
@@ -51,6 +51,7 @@ obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
+obj-$(CONFIG_PROCESS_CONTAINER) += numproc_container.o

ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff -upr linux-2.6.20.orig/kernel/exit.c linux-2.6.20-0/kernel/exit.c
--- linux-2.6.20.orig/kernel/exit.c 2007-03-06 13:33:28.000000000 +0300
+++ linux-2.6.20-0/kernel/exit.c 2007-03-06 13:33:28.000000000 +0300
@@ -48,6 +48,8 @@
#include <asm/pgtable.h>
#include <asm/mmu_context.h>

+#include <linux/numproc_container.h>
+
extern void sem_exit (void);

static void exit_mm(struct task_struct * tsk);
@@ -174,6 +176,7 @@ repeat:
write_unlock_irq(&tasklist_lock);
proc_flush_task(p);
release_thread(p);
+ container_proc_uncharge(p);
call_rcu(&p->rcu, delayed_put_task_struct);

p = leader;
diff -upr linux-2.6.20.orig/kernel/fork.c linux-2.6.20-0/kernel/fork.c
--- linux-2.6.20.orig/kernel/fork.c 2007-03-06 13:33:28.000000000 +0300
+++ linux-2.6.20-0/kernel/fork.c 2007-03-06 13:33:28.000000000 +0300
@@ -57,6 +57,7 @@
#include <asm/tlbflush.h>

#include <linux/rss_container.h>
+#include <linux/numproc_container.h>

/*
* Protected counters by write_lock_irq(&tasklist_lock)
@@ -986,6 +999,9 @@ static struct task_struct *copy_process(
if (!p)
goto fork_out;

+ if (container_proc_charge(p))
+ goto charge_out;
+
rt_mutex_init_task(p);

#ifdef CONFIG_TRACE_IRQFLAGS
@@ -1302,6 +1318,8 @@ bad_fork_cleanup_count:
atomic_dec(&p->user->processes);
free_uid(p->user);
bad_fork_free:
+ container_proc_uncharge(p);
+charge_out:
free_task(p);
fork_out:
return ERR_PTR(retval);
diff -upr linux-2.6.20.orig/kernel/numproc_container.c linux-2.6.20-0/kernel/numproc_container.c
--- linux-2.6.20.orig/kernel/numproc_container.c 2007-03-06 13:39:17.000000000 +0300
+++ linux-2.6.20-0/kernel/numproc_container.c 2007-03-06 13:33:28.000000000 +0300
@@ -0,0 +1,151 @@
+/*
+ * Numproc accounting container
+ *
+ * Copyright 2007 OpenVZ SWsoft Inc
+ *
+ * Author: Pavel Emelianov <xemul@openvz.org>
+ *
+ */
+
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/res_counter.h>
+#include <linux/numproc_container.h>
+
+static struct container_subsys numproc_subsys;
+
+struct numproc_container {
+ struct res_counter res;
+ struct container_subsys_state css;
+};
+
+static inline struct numproc_container *numproc_from_cont(struct container *cnt)
+{
+ return container_of(container_subsys_state(cnt, &numproc_subsys),
+ struct numproc_container, css);
+}
+
+int container_proc_charge(struct task_struct *new)
+{
+ struct numproc_container *np;
+
+ rcu_read_lock();
+ np = numproc_from_cont(task_container(current, &numproc_subsys));
+ css_get_current(&np->css);
+ rcu_read_unlock();
+
+ if (res_counter_charge(&np->res, 1)) {
+ css_put(&np->css);
+ return -ENOMEM;
+ }
+
+ new->numproc_cnt = np;
+ return 0;
+}
+
+void container_proc_uncharge(struct task_struct *tsk)
+{
+ struct numproc_container *np;
+
+ np = tsk->numproc_cnt;
+ res_counter_uncharge(&np->res, 1);
+ css_put(&np->css);
+}
+
+static int numproc_create(struct container_subsys *ss, struct container *cont)
+{
+ struct numproc_container *np;
+
+ np = kzalloc(sizeof(struct numproc_container), GFP_KERNEL);
+ if (np == NULL)
+ return -ENOMEM;
+
+ res_counter_init(&np->res);
+ cont->subsys[numproc_subsys.subsys_id] = &np->css;
+ return 0;
+}
+
+static void numproc_destroy(struct container_subsys *ss,
+ struct container *cont)
+{
+ kfree(numproc_from_cont(cont));
+}
+
+
+static ssize_t numproc_read(struct container *cont, struct cftype *cft,
+ struct file *file, char __user *userbuf,
+ size_t nbytes, loff_t *ppos)
+{
+ return res_counter_read(&numproc_from_cont(cont)->res, cft->private,
+ userbuf, nbytes, ppos);
+}
+
+static ssize_t numproc_write(struct container *cont, struct cftype *cft,
+ struct file *file, const char __user *userbuf,
+ size_t nbytes, loff_t *ppos)
+{
+ return res_counter_write(&numproc_from_cont(cont)->res, cft->private,
+ userbuf, nbytes, ppos);
+}
+
+
+static struct cftype numproc_usage = {
+ .name = "numproc_usage",
+ .private = RES_USAGE,
+ .read = numproc_read,
+};
+
+static struct cftype numproc_limit = {
+ .name = "numproc_limit",
+ .private = RES_LIMIT,
+ .read = numproc_read,
+ .write = numproc_write,
+};
+
+static struct cftype numproc_failcnt = {
+ .name = "numproc_failcnt",
+ .private = RES_FAILCNT,
+ .read = numproc_read,
+};
+
+static int numproc_populate(struct container_subsys *ss,
+ struct container *cont)
+{
+ int rc;
+
+ if ((rc = container_add_file(cont, &numproc_usage)) < 0)
+ return rc;
+ if ((rc = container_add_file(cont, &numproc_failcnt)) < 0)
+ return rc;
+ if ((rc = container_add_file(cont, &numproc_limit)) < 0)
+ return rc;
+
+ return 0;
+}
+
+static struct numproc_container init_numproc_container;
+
+static __init int numproc_create_early(struct container_subsys *ss,
+ struct container *cont)
+{
+ struct numproc_container *np;
+
+ np = &init_numproc_container;
+ res_counter_init(&np->res);
+ cont->subsys[numproc_subsys.subsys_id] = &np->css;
+ ss->create = numproc_create;
+ return 0;
+}
+
+static struct container_subsys numproc_subsys = {
+ .name = "numproc",
+ .create = numproc_create_early,
+ .destroy = numproc_destroy,
+ .populate = numproc_populate,
+};
+
+void __init container_numproc_init_early(void)
+{
+ container_register_subsys(&numproc_subsys);
+}
diff -upr linux-2.6.20.orig/kernel/container.c linux-2.6.20-0/kernel/container.c
--- linux-2.6.20.orig/kernel/container.c 2007-03-06 13:33:28.000000000 +0300
+++ linux-2.6.20-0/kernel/container.c 2007-03-06 13:35:48.000000000 +0300
@@ -60,6 +60,7 @@
#include <linux/mutex.h>

#include <linux/rss_container.h>
+#include <linux/numproc_container.h>

#define CONTAINER_SUPER_MAGIC 0x27e0eb

@@ -1721,6 +1725,7 @@ int __init container_init_early(void)
init_task.containers = &init_container_group;

container_rss_init_early();
+ container_numproc_init_early();

return 0;
}
...

[RFC][PATCH 7/7] Account for the number of files opened within container [message #10895 is a reply to message #10888] Tue, 06 March 2007 15:05 Go to previous messageGo to next message
xemul is currently offline  xemul
Messages: 248
Registered: November 2005
Senior Member
Simple again - increment usage counter at file open and
decrement at file close. Reject opening if limit is hit.

diff -upr linux-2.6.20.orig/fs/Makefile linux-2.6.20-0/fs/Makefile
--- linux-2.6.20.orig/fs/Makefile 2007-02-04 21:44:54.000000000 +0300
+++ linux-2.6.20-0/fs/Makefile 2007-03-06 13:33:28.000000000 +0300
@@ -19,6 +19,8 @@ else
obj-y += no-block.o
endif

+obj-$(CONFIG_FILES_CONTAINER) += numfiles_container.o
+
obj-$(CONFIG_INOTIFY) += inotify.o
obj-$(CONFIG_INOTIFY_USER) += inotify_user.o
obj-$(CONFIG_EPOLL) += eventpoll.o
diff -upr linux-2.6.20.orig/fs/file_table.c linux-2.6.20-0/fs/file_table.c
--- linux-2.6.20.orig/fs/file_table.c 2007-02-04 21:44:54.000000000 +0300
+++ linux-2.6.20-0/fs/file_table.c 2007-03-06 13:33:28.000000000 +0300
@@ -21,6 +21,7 @@
#include <linux/fsnotify.h>
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>
+#include <linux/numfiles_container.h>

#include <asm/atomic.h>

@@ -42,6 +43,7 @@ static inline void file_free_rcu(struct

static inline void file_free(struct file *f)
{
+ container_file_uncharge(f);
percpu_counter_dec(&nr_files);
call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
}
@@ -109,6 +111,10 @@ struct file *get_empty_filp(void)

percpu_counter_inc(&nr_files);
memset(f, 0, sizeof(*f));
+
+ if (container_file_charge(f))
+ goto fail_charge;
+
if (security_file_alloc(f))
goto fail_sec;

@@ -132,7 +138,10 @@ over:
goto fail;

fail_sec:
- file_free(f);
+ container_file_uncharge(f);
+fail_charge:
+ percpu_counter_dec(&nr_files);
+ kmem_cache_free(filp_cachep, f);
fail:
return NULL;
}
diff -upr linux-2.6.20.orig/fs/numfiles_container.c linux-2.6.20-0/fs/numfiles_container.c
--- linux-2.6.20.orig/fs/numfiles_container.c 2007-03-06 13:39:17.000000000 +0300
+++ linux-2.6.20-0/fs/numfiles_container.c 2007-03-06 13:33:28.000000000 +0300
@@ -0,0 +1,152 @@
+/*
+ * Numfiles accounting container
+ *
+ * Copyright 2007 OpenVZ SWsoft Inc
+ *
+ * Author: Pavel Emelianov <xemul@openvz.org>
+ *
+ */
+
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/res_counter.h>
+#include <linux/numfiles_container.h>
+
+static struct container_subsys numfiles_subsys;
+
+struct files_container {
+ struct res_counter res;
+ struct container_subsys_state css;
+};
+
+static inline struct files_container *numfiles_from_cont(struct container *cnt)
+{
+ return container_of(container_subsys_state(cnt, &numfiles_subsys),
+ struct files_container, css);
+}
+
+int container_file_charge(struct file *file)
+{
+ struct files_container *fc;
+
+ rcu_read_lock();
+ fc = numfiles_from_cont(task_container(current, &numfiles_subsys));
+ css_get_current(&fc->css);
+ rcu_read_unlock();
+
+ if (res_counter_charge(&fc->res, 1)) {
+ css_put(&fc->css);
+ return -ENOMEM;
+ }
+
+ file->f_cont = fc;
+ return 0;
+}
+
+void container_file_uncharge(struct file *file)
+{
+ struct files_container *fc;
+
+ fc = file->f_cont;
+ res_counter_uncharge(&fc->res, 1);
+ css_put(&fc->css);
+}
+
+static int numfiles_create(struct container_subsys *ss, struct container *cont)
+{
+ struct files_container *fc;
+
+ fc = kzalloc(sizeof(struct files_container), GFP_KERNEL);
+ if (fc == NULL)
+ return -ENOMEM;
+
+ res_counter_init(&fc->res);
+ cont->subsys[numfiles_subsys.subsys_id] = &fc->css;
+ return 0;
+}
+
+static void numfiles_destroy(struct container_subsys *ss,
+ struct container *cont)
+{
+ kfree(numfiles_from_cont(cont));
+}
+
+
+static ssize_t numfiles_read(struct container *cont, struct cftype *cft,
+ struct file *file, char __user *userbuf,
+ size_t nbytes, loff_t *ppos)
+{
+ return res_counter_read(&numfiles_from_cont(cont)->res, cft->private,
+ userbuf, nbytes, ppos);
+}
+
+static ssize_t numfiles_write(struct container *cont, struct cftype *cft,
+ struct file *file, const char __user *userbuf,
+ size_t nbytes, loff_t *ppos)
+{
+ return res_counter_write(&numfiles_from_cont(cont)->res, cft->private,
+ userbuf, nbytes, ppos);
+}
+
+
+static struct cftype numfiles_usage = {
+ .name = "numfiles_usage",
+ .private = RES_USAGE,
+ .read = numfiles_read,
+};
+
+static struct cftype numfiles_limit = {
+ .name = "numfiles_limit",
+ .private = RES_LIMIT,
+ .read = numfiles_read,
+ .write = numfiles_write,
+};
+
+static struct cftype numfiles_failcnt = {
+ .name = "numfiles_failcnt",
+ .private = RES_FAILCNT,
+ .read = numfiles_read,
+};
+
+static int numfiles_populate(struct container_subsys *ss,
+ struct container *cont)
+{
+ int rc;
+
+ if ((rc = container_add_file(cont, &numfiles_usage)) < 0)
+ return rc;
+ if ((rc = container_add_file(cont, &numfiles_failcnt)) < 0)
+ return rc;
+ if ((rc = container_add_file(cont, &numfiles_limit)) < 0)
+ return rc;
+
+ return 0;
+}
+
+static struct files_container init_files_container;
+
+static __init int numfiles_create_early(struct container_subsys *ss,
+ struct container *cont)
+{
+ struct files_container *np;
+
+ np = &init_files_container;
+ res_counter_init(&np->res);
+ cont->subsys[numfiles_subsys.subsys_id] = &np->css;
+ ss->create = numfiles_create;
+ return 0;
+}
+
+static struct container_subsys numfiles_subsys = {
+ .name = "numfiles",
+ .create = numfiles_create_early,
+ .destroy = numfiles_destroy,
+ .populate = numfiles_populate,
+};
+
+void __init container_numfiles_init_early(void)
+{
+ container_register_subsys(&numfiles_subsys);
+}
+
diff -upr linux-2.6.20.orig/include/linux/fs.h linux-2.6.20-0/include/linux/fs.h
--- linux-2.6.20.orig/include/linux/fs.h 2007-03-06 13:33:28.000000000 +0300
+++ linux-2.6.20-0/include/linux/fs.h 2007-03-06 13:33:28.000000000 +0300
@@ -739,6 +739,9 @@ struct file {
spinlock_t f_ep_lock;
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping;
+#ifdef CONFIG_FILES_CONTAINER
+ struct files_container *f_cont;
+#endif
};
extern spinlock_t files_lock;
#define file_list_lock() spin_lock(&files_lock);
diff -upr linux-2.6.20.orig/include/linux/numfiles_container.h linux-2.6.20-0/include/linux/numfiles_container.h
--- linux-2.6.20.orig/include/linux/numfiles_container.h 2007-03-06 13:39:17.000000000 +0300
+++ linux-2.6.20-0/include/linux/numfiles_container.h 2007-03-06 13:33:28.000000000 +0300
@@ -0,0 +1,33 @@
+#ifndef __NUMFILES_CONTAINER_H__
+#define __NUMFILES_CONTAINER_H__
+/*
+ * Numfiles container
+ *
+ * Copyright 2007 OpenVZ SWsoft Inc
+ *
+ * Author: Pavel Emelianov <xemul@openvz.org>
+ *
+ */
+
+#ifdef CONFIG_FILES_CONTAINER
+int container_file_charge(struct file *file);
+void container_file_uncharge(struct file *file);
+
+void container_numfiles_init_early(void);
+#else
+static inline int container_file_charge(struct file *file)
+{
+ return 0;
+}
+
+static inline void container_file_uncharge(struct file *file)
+{
+}
+
+static inline void container_numfiles_init_early(void)
+{
+}
+#endif
+
+#endif
+
diff -upr linux-2.6.20.orig/init/Kconfig linux-2.6.20-0/init/Kconfig
--- linux-2.6.20.orig/init/Kconfig 2007-03-06 13:33:28.000000000 +0300
+++ linux-2.6.20-0/init/Kconfig 2007-03-06 13:33:28.000000000 +0300
@@ -265,6 +265,12 @@ config CPUSETS
help
Provides the-number-of-tasks accounting container

+config FILES_CONTAINER
+ bool "Numfiles accounting container"
+ select RESOURCE_COUNTERS
+ help
+ Provides the-number-of-files accounting container
+
config SYSFS_DEPRECATED
bool "Create deprecated sysfs files"
default y
diff -upr linux-2.6.20.orig/kernel/container.c linux-2.6.20-0/kernel/container.c
--- linux-2.6.20.orig/kernel/container.c 2007-03-06 13:33:28.000000000 +0300
+++ linux-2.6.20-0/kernel/container.c 2007-03-06 13:35:48.000000000 +0300
@@ -60,6 +60,7 @@

#include <linux/rss_container.h>
#include <linux/numproc_container.h>
+#include <linux/numfiles_container.h>

#define CONTAINER_SUPER_MAGIC 0x27e0eb

@@ -1721,6 +1725,7 @@ int __init container_init_early(void)

container_rss_init_early();
container_numproc_init_early();
+ container_numfiles_init_early();

return 0;
}
...

Re: [RFC][PATCH 2/7] RSS controller core [message #10902 is a reply to message #10890] Tue, 06 March 2007 22:00 Go to previous messageGo to next message
Andrew Morton is currently offline  Andrew Morton
Messages: 127
Registered: December 2005
Senior Member
On Tue, 06 Mar 2007 17:55:29 +0300
Pavel Emelianov <xemul@sw.ru> wrote:

> +struct rss_container {
> + struct res_counter res;
> + struct list_head page_list;
> + struct container_subsys_state css;
> +};
> +
> +struct page_container {
> + struct page *page;
> + struct rss_container *cnt;
> + struct list_head list;
> +};

ah. This looks good. I'll find a hunk of time to go through this work
and through Paul's patches. It'd be good to get both patchsets lined
up in -mm within a couple of weeks. But..

We need to decide whether we want to do per-container memory limitation via
these data structures, or whether we do it via a physical scan of some
software zone, possibly based on Mel's patches.
Re: [RFC][PATCH 6/7] Account for the number of tasks within container [message #10906 is a reply to message #10894] Wed, 07 March 2007 02:00 Go to previous messageGo to next message
Paul Menage is currently offline  Paul Menage
Messages: 642
Registered: September 2006
Senior Member
Hi Pavel,

On 3/6/07, Pavel Emelianov <xemul@sw.ru> wrote:
> diff -upr linux-2.6.20.orig/include/linux/sched.h linux-2.6.20-0/include/linux/sched.h
> --- linux-2.6.20.orig/include/linux/sched.h 2007-03-06 13:33:28.000000000 +0300
> +++ linux-2.6.20-0/include/linux/sched.h 2007-03-06 13:33:28.000000000 +0300
> @@ -1052,6 +1055,9 @@ struct task_struct {
> #ifdef CONFIG_FAULT_INJECTION
> int make_it_fail;
> #endif
> +#ifdef CONFIG_PROCESS_CONTAINER
> + struct numproc_container *numproc_cnt;
> +#endif
> };

Why do you need a pointer added to task_struct? One of the main points
of the generic containers is to avoid every different subsystem and
resource controller having to add new pointers there.

> +
> + rcu_read_lock();
> + np = numproc_from_cont(task_container(current, &numproc_subsys));
> + css_get_current(&np->css);

There's no need to hold a reference here - by definition, the task's
container can't go away while the task is in it.

Also, shouldn't you have an attach() method to move the count from one
container to another when a task moves?

Paul
Re: [RFC][PATCH 0/7] Resource controllers based on process containers [message #10907 is a reply to message #10888] Wed, 07 March 2007 02:02 Go to previous messageGo to next message
Paul Menage is currently offline  Paul Menage
Messages: 642
Registered: September 2006
Senior Member
On 3/6/07, Pavel Emelianov <xemul@sw.ru> wrote:
> 2. Extended containers may register themselves too late.
> Kernel threads/helpers start forking, opening files
> and touching pages much earlier. This patchset
> workarounds this in not-so-cute manner and I'm waiting
> for Paul's comments on this issue.
>

Can we not make sure that each subsystem registers itself before any
of its resources become usable? So the file counting subsystem should
register at some point before filp_open() becomes usable, and the
process counting subsystem should register before it's possible to
fork, etc.

Paul
Re: [RFC][PATCH 1/7] Resource counters [message #10908 is a reply to message #10889] Wed, 07 March 2007 04:03 Go to previous messageGo to next message
Balbir Singh is currently offline  Balbir Singh
Messages: 491
Registered: August 2006
Senior Member
Pavel Emelianov wrote:
> Introduce generic structures and routines for
> resource accounting.
>
> Each resource accounting container is supposed to
> aggregate it, container_subsystem_state and its
> resource-specific members within.
>
>
> ------------------------------------------------------------ ------------
>
> diff -upr linux-2.6.20.orig/include/linux/res_counter.h linux-2.6.20-0/include/linux/res_counter.h
> --- linux-2.6.20.orig/include/linux/res_counter.h 2007-03-06 13:39:17.000000000 +0300
> +++ linux-2.6.20-0/include/linux/res_counter.h 2007-03-06 13:33:28.000000000 +0300
> @@ -0,0 +1,83 @@
> +#ifndef __RES_COUNTER_H__
> +#define __RES_COUNTER_H__
> +/*
> + * resource counters
> + *
> + * Copyright 2007 OpenVZ SWsoft Inc
> + *
> + * Author: Pavel Emelianov <xemul@openvz.org>
> + *
> + */
> +
> +#include <linux/container.h>
> +
> +struct res_counter {
> + unsigned long usage;
> + unsigned long limit;
> + unsigned long failcnt;
> + spinlock_t lock;
> +};
> +
> +enum {
> + RES_USAGE,
> + RES_LIMIT,
> + RES_FAILCNT,
> +};
> +
> +ssize_t res_counter_read(struct res_counter *cnt, int member,
> + const char __user *buf, size_t nbytes, loff_t *pos);
> +ssize_t res_counter_write(struct res_counter *cnt, int member,
> + const char __user *buf, size_t nbytes, loff_t *pos);
> +
> +static inline void res_counter_init(struct res_counter *cnt)
> +{
> + spin_lock_init(&cnt->lock);
> + cnt->limit = (unsigned long)LONG_MAX;
> +}
> +

Is there any way to indicate that there are no limits on this container.
LONG_MAX is quite huge, but still when the administrator wants to
configure a container to *un-limited usage*, it becomes hard for
the administrator.

> +static inline int res_counter_charge_locked(struct res_counter *cnt,
> + unsigned long val)
> +{
> + if (cnt->usage <= cnt->limit - val) {
> + cnt->usage += val;
> + return 0;
> + }
> +
> + cnt->failcnt++;
> + return -ENOMEM;
> +}
> +
> +static inline int res_counter_charge(struct res_counter *cnt,
> + unsigned long val)
> +{
> + int ret;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&cnt->lock, flags);
> + ret = res_counter_charge_locked(cnt, val);
> + spin_unlock_irqrestore(&cnt->lock, flags);
> + return ret;
> +}
> +

Will atomic counters help here.

> +static inline void res_counter_uncharge_locked(struct res_counter *cnt,
> + unsigned long val)
> +{
> + if (unlikely(cnt->usage < val)) {
> + WARN_ON(1);
> + val = cnt->usage;
> + }
> +
> + cnt->usage -= val;
> +}
> +
> +static inline void res_counter_uncharge(struct res_counter *cnt,
> + unsigned long val)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&cnt->lock, flags);
> + res_counter_uncharge_locked(cnt, val);
> + spin_unlock_irqrestore(&cnt->lock, flags);
> +}
> +
> +#endif
> diff -upr linux-2.6.20.orig/init/Kconfig linux-2.6.20-0/init/Kconfig
> --- linux-2.6.20.orig/init/Kconfig 2007-03-06 13:33:28.000000000 +0300
> +++ linux-2.6.20-0/init/Kconfig 2007-03-06 13:33:28.000000000 +0300
> @@ -265,6 +265,10 @@ config CPUSETS
>
> Say N if unsure.
>
> +config RESOURCE_COUNTERS
> + bool
> + select CONTAINERS
> +
> config SYSFS_DEPRECATED
> bool "Create deprecated sysfs files"
> default y
> diff -upr linux-2.6.20.orig/kernel/Makefile linux-2.6.20-0/kernel/Makefile
> --- linux-2.6.20.orig/kernel/Makefile 2007-03-06 13:33:28.000000000 +0300
> +++ linux-2.6.20-0/kernel/Makefile 2007-03-06 13:33:28.000000000 +0300
> @@ -51,6 +51,7 @@ obj-$(CONFIG_RELAY) += relay.o
> obj-$(CONFIG_UTS_NS) += utsname.o
> obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
> obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
> +obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
>
> ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
> # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
> diff -upr linux-2.6.20.orig/kernel/res_counter.c linux-2.6.20-0/kernel/res_counter.c
> --- linux-2.6.20.orig/kernel/res_counter.c 2007-03-06 13:39:17.000000000 +0300
> +++ linux-2.6.20-0/kernel/res_counter.c 2007-03-06 13:33:28.000000000 +0300
> @@ -0,0 +1,72 @@
> +/*
> + * resource containers
> + *
> + * Copyright 2007 OpenVZ SWsoft Inc
> + *
> + * Author: Pavel Emelianov <xemul@openvz.org>
> + *
> + */
> +
> +#include <linux/parser.h>
> +#include <linux/fs.h>
> +#include <linux/res_counter.h>
> +#include <asm/uaccess.h>
> +
> +static inline unsigned long *res_counter_member(struct res_counter *cnt, int member)
> +{
> + switch (member) {
> + case RES_USAGE:
> + return &cnt->usage;
> + case RES_LIMIT:
> + return &cnt->limit;
> + case RES_FAILCNT:
> + return &cnt->failcnt;
> + };
> +
> + BUG();
> + return NULL;
> +}
> +
> +ssize_t res_counter_read(struct res_counter *cnt, int member,
> + const char __user *userbuf, size_t nbytes, loff_t *pos)
> +{
> + unsigned long *val;
> + char buf[64], *s;
> +
> + s = buf;
> + val = res_counter_member(cnt, member);
> + s += sprintf(s, "%lu\n", *val);
> + return simple_read_from_buffer((void __user *)userbuf, nbytes,
> + pos, buf, s - buf);
> +}
> +
> +ssize_t res_counter_write(struct res_counter *cnt, int member,
> + const char __user *userbuf, size_t nbytes, loff_t *pos)
> +{
> + int ret;
> + char *buf, *end;
> + unsigned long tmp, *val;
> +
> + buf = kmalloc(nbytes + 1, GFP_KERNEL);
> + ret = -ENOMEM;
> + if (buf == NULL)
> + goto out;
> +
> + buf[nbytes] = 0;
> + ret = -EFAULT;
> + if (copy_from_user(buf, userbuf, nbytes))
> + goto out_free;
> +
> + ret = -EINVAL;
> + tmp = simple_strtoul(buf, &end, 10);
> + if (*end != '\0')
> + goto out_free;
> +
> + val = res_counter_member(cnt, member);
> + *val = tmp;
> + ret = nbytes;
> +out_free:
> + kfree(buf);
> +out:
> + return ret;
> +}
>


These bits look a little out of sync, with no users for these routines in
this patch. Won't you get a compiler warning, compiling this bit alone?

--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL
Re: [RFC][PATCH 2/7] RSS controller core [message #10909 is a reply to message #10890] Wed, 07 March 2007 05:37 Go to previous messageGo to next message
Balbir Singh is currently offline  Balbir Singh
Messages: 491
Registered: August 2006
Senior Member
Pavel Emelianov wrote:
> This includes setup of RSS container within generic
> process containers, all the declarations used in RSS
> accounting, and core code responsible for accounting.
>
>
> ------------------------------------------------------------ ------------
>
> diff -upr linux-2.6.20.orig/include/linux/rss_container.h linux-2.6.20-0/include/linux/rss_container.h
> --- linux-2.6.20.orig/include/linux/rss_container.h 2007-03-06 13:39:17.000000000 +0300
> +++ linux-2.6.20-0/include/linux/rss_container.h 2007-03-06 13:33:28.000000000 +0300
> @@ -0,0 +1,68 @@
> +#ifndef __RSS_CONTAINER_H__
> +#define __RSS_CONTAINER_H__
> +/*
> + * RSS container
> + *
> + * Copyright 2007 OpenVZ SWsoft Inc
> + *
> + * Author: Pavel Emelianov <xemul@openvz.org>
> + *
> + */
> +
> +struct page_container;
> +struct rss_container;
> +
> +#ifdef CONFIG_RSS_CONTAINER
> +int container_rss_prepare(struct page *, struct vm_area_struct *vma,
> + struct page_container **);
> +
> +void container_rss_add(struct page_container *);
> +void container_rss_del(struct page_container *);
> +void container_rss_release(struct page_container *);
> +
> +int mm_init_container(struct mm_struct *mm, struct task_struct *tsk);
> +void mm_free_container(struct mm_struct *mm);
> +
> +unsigned long container_isolate_pages(unsigned long nr_to_scan,
> + struct rss_container *rss, struct list_head *dst,
> + int active, unsigned long *scanned);
> +unsigned long container_nr_physpages(struct rss_container *rss);
> +
> +unsigned long container_try_to_free_pages(struct rss_container *);
> +void container_out_of_memory(struct rss_container *);
> +
> +void container_rss_init_early(void);
> +#else
> +static inline int container_rss_prepare(struct page *pg,
> + struct vm_area_struct *vma, struct page_container **pc)
> +{
> + *pc = NULL; /* to make gcc happy */
> + return 0;
> +}
> +
> +static inline void container_rss_add(struct page_container *pc)
> +{
> +}
> +
> +static inline void container_rss_del(struct page_container *pc)
> +{
> +}
> +
> +static inline void container_rss_release(struct page_container *pc)
> +{
> +}
> +
> +static inline int mm_init_container(struct mm_struct *mm, struct task_struct *t)
> +{
> + return 0;
> +}
> +
> +static inline void mm_free_container(struct mm_struct *mm)
> +{
> +}
> +
> +static inline void container_rss_init_early(void)
> +{
> +}
> +#endif
> +#endif
> diff -upr linux-2.6.20.orig/init/Kconfig linux-2.6.20-0/init/Kconfig
> --- linux-2.6.20.orig/init/Kconfig 2007-03-06 13:33:28.000000000 +0300
> +++ linux-2.6.20-0/init/Kconfig 2007-03-06 13:33:28.000000000 +0300
> @@ -265,6 +265,13 @@ config CPUSETS
> bool
> select CONTAINERS
>
> +config RSS_CONTAINER
> + bool "RSS accounting container"
> + select RESOURCE_COUNTERS
> + help
> + Provides a simple Resource Controller for monitoring and
> + controlling the total Resident Set Size of the tasks in a container
> +

The wording looks very familiar :-). It would be useful to add
"The reclaim logic is now container aware, when the container goes overlimit
the page reclaimer reclaims pages belonging to this container. If we are
unable to reclaim enough pages to satisfy the request, the process is
killed with an out of memory warning"

> config SYSFS_DEPRECATED
> bool "Create deprecated sysfs files"
> default y
> diff -upr linux-2.6.20.orig/mm/Makefile linux-2.6.20-0/mm/Makefile
> --- linux-2.6.20.orig/mm/Makefile 2007-02-04 21:44:54.000000000 +0300
> +++ linux-2.6.20-0/mm/Makefile 2007-03-06 13:33:28.000000000 +0300
> @@ -29,3 +29,5 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_h
> obj-$(CONFIG_FS_XIP) += filemap_xip.o
> obj-$(CONFIG_MIGRATION) += migrate.o
> obj-$(CONFIG_SMP) += allocpercpu.o
> +
> +obj-$(CONFIG_RSS_CONTAINER) += rss_container.o
> diff -upr linux-2.6.20.orig/mm/rss_container.c linux-2.6.20-0/mm/rss_container.c
> --- linux-2.6.20.orig/mm/rss_container.c 2007-03-06 13:39:17.000000000 +0300
> +++ linux-2.6.20-0/mm/rss_container.c 2007-03-06 13:33:28.000000000 +0300
> @@ -0,0 +1,307 @@
> +/*
> + * RSS accounting container
> + *
> + * Copyright 2007 OpenVZ SWsoft Inc
> + *
> + * Author: Pavel Emelianov <xemul@openvz.org>
> + *
> + */
> +
> +#include <linux/list.h>
> +#include <linux/sched.h>
> +#include <linux/mm.h>
> +#include <linux/res_counter.h>
> +#include <linux/rss_container.h>
> +
> +static struct container_subsys rss_subsys;
> +
> +struct rss_container {
> + struct res_counter res;
> + struct list_head page_list;
> + struct container_subsys_state css;
> +};
> +
> +struct page_container {
> + struct page *page;
> + struct rss_container *cnt;
> + struct list_head list;
> +};
> +

Yes, this is what I was planning to get to -- a per container LRU list.
But you have just one list, don't you need active and inactive lists?
When the global LRU is manipulated, shouldn't this list be updated as
well, so that reclaim will pick the best pages.

> +static inline struct rss_container *rss_from_cont(struct container *cnt)
> +{
> + return container_of(container_subsys_state(cnt, &rss_subsys),
> + struct rss_container, css);
> +}
> +
> +int mm_init_container(struct mm_struct *mm, struct task_struct *tsk)
> +{
> + struct rss_container *cnt;
> +
> + cnt = rss_from_cont(task_container(tsk, &rss_subsys));
> + if (css_get(&cnt->css))
> + return -EBUSY;
> +
> + mm->rss_container = cnt;
> + return 0;
> +}
> +
> +void mm_free_container(struct mm_struct *mm)
> +{
> + css_put(&mm->rss_container->css);
> +}
> +
> +int container_rss_prepare(struct page *page, struct vm_area_struct *vma,
> + struct page_container **ppc)
> +{
> + struct rss_container *rss;
> + struct page_container *pc;
> +
> + rcu_read_lock();
> + rss = rcu_dereference(vma->vm_mm->rss_container);
> + css_get_current(&rss->css);
> + rcu_read_unlock();
> +
> + pc = kmalloc(sizeof(struct page_container), GFP_KERNEL);
> + if (pc == NULL)
> + goto out_nomem;
> +
> + while (res_counter_charge(&rss->res, 1)) {
> + if (container_try_to_free_pages(rss))
> + continue;
> +

The return codes of the functions is a bit confusing, ideally
container_try_to_free_pages() should return 0 on success. Also
res_counter_charge() has a WARN_ON(1) if the limit is exceeded.
The system administrator can figure out the details from failcnt,
I suspect when the container is running close to it's limit,
dmesg will have too many WARNING messages.

How much memory do you try to reclaim in container_try_to_free_pages()?
With my patches, I was planning to export this knob to userspace with
a default value. This will help the administrator decide how much
of the working set/container LRU should be freed on reaching the limit.
I cannot find the definition of container_try_to_free_pages() in
this patch.



> + container_out_of_memory(rss);
> + if (test_thread_flag(TIF_MEMDIE))
> + goto out_charge;
> + }
> +
> + pc->page = page;
> + pc->cnt = rss;
> + *ppc = pc;
> + return 0;
> +
> +out_charge:
> + kfree(pc);
> +out_nomem:
> + css_put(&rss->css);
> + return -ENOMEM;
> +}
> +
> +void container_rss_release(struct page_container *pc)
> +{
> + struct rss_container *rss;
> +
> + rss = pc->cnt;
> + res_counter_uncharge(&rss->res, 1);
> + css_put(&rss->css);
> + kfree(pc);
> +}
> +
> +void container_rss_add(struct page_container *pc)
> +{
> + struct page *pg;
> + struct rss_container *rss;
> +
> + pg = pc->page;
> + rss = pc->cnt;
> +
> + spin_lock(&rss->res.lock);
> + list_add(&pc->list, &rss->page_list);

This is not good, it won't give us LRU behaviour which is
useful for determining which pages to free.

> + spin_unlock(&rss->res.lock);
> +
> + page_container(pg) = pc;
> +}
> +
> +void container_rss_del(struct page_container *pc)
> +{
> + struct page *page;
> + struct rss_container *rss;
> +
> + page = pc->page;
> + rss = pc->cnt;
> +
> + spin_lock(&rss->res.lock);
> + list_del(&pc->list);
> + res_counter_uncharge_locked(&rss->res, 1);
> + spin_unlock(&rss->res.lock);
> +
> + css_put(&rss->css);
> + kfree(pc);
> +}
> +
> +unsigned long container_isolate_pages(unsigned long nr_to_scan,
> + struct rss_container *rss, struct list_head *dst,
> + int active, unsigned long *scanned)
> +{
> + unsigned long nr_taken = 0;
> + struct page *page;
> + struct page_container *pc;
> + unsigned long scan;
> + struct list_head *src;
> + LIST_HEAD(pc_list);
> + struct zone *z;
> +
> + spin_lock_irq(&rss->res.lock);
> + src = &rss->page_list;
> +

Which part of the working set are we pushing out, this looks like
we are using FIFO to determine which pages to reclaim. This needs
to be FIXED.

> + for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
> + pc = list_entry(src->prev, struct page_container, list);
> + page = pc->page;
> + z = page_zone(page);
> +
> + list_move(&pc->list, &pc_list);
> +
> + spin_lock(&z->lru_lock);
> + if (PageLRU(page)) {
> + if ((active && PageActive(page)) ||
> + (!active && !PageActive(page))) {
> + if (likely(get_page_un
...

Re: [RFC][PATCH 0/7] Resource controllers based on process containers [message #10910 is a reply to message #10888] Wed, 07 March 2007 06:52 Go to previous messageGo to next message
Balbir Singh is currently offline  Balbir Singh
Messages: 491
Registered: August 2006
Senior Member
Pavel Emelianov wrote:
> This patchset adds RSS, accounting and control and
> limiting the number of tasks and files within container.
>
> Based on top of Paul Menage's container subsystem v7
>
> RSS controller includes per-container RSS accounter,
> reclamation and OOM killer. It behaves like standalone
> machine - when container runs out of resources it tries
> to reclaim some pages and if it doesn't succeed in it
> kills some task which mm_struct belongs to container in
> question.
>
> Num tasks and files containers are very simple and
> self-descriptive from code.
>
> As discussed before when a task moves from one container
> to another no resources follow it - they keep holding the
> container they were allocated in.
>

I have one problem with the patchset, I cannot compile
the patches individually and some of the code is hard
to read as it depends on functions from future patches.
Patch 2, 3 and 4 fail to compile without patch 5 applied.

Patch 1 failed to apply with a reject in kernel/Makefile
I applied it on top of 2.6.20 with all of Paul Menage's
patches (all 7).



--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL
Re: [RFC][PATCH 6/7] Account for the number of tasks within container [message #10912 is a reply to message #10906] Wed, 07 March 2007 07:10 Go to previous messageGo to next message
xemul is currently offline  xemul
Messages: 248
Registered: November 2005
Senior Member
Paul Menage wrote:
> Hi Pavel,
>
> On 3/6/07, Pavel Emelianov <xemul@sw.ru> wrote:
>> diff -upr linux-2.6.20.orig/include/linux/sched.h
>> linux-2.6.20-0/include/linux/sched.h
>> --- linux-2.6.20.orig/include/linux/sched.h 2007-03-06
>> 13:33:28.000000000 +0300
>> +++ linux-2.6.20-0/include/linux/sched.h 2007-03-06
>> 13:33:28.000000000 +0300
>> @@ -1052,6 +1055,9 @@ struct task_struct {
>> #ifdef CONFIG_FAULT_INJECTION
>> int make_it_fail;
>> #endif
>> +#ifdef CONFIG_PROCESS_CONTAINER
>> + struct numproc_container *numproc_cnt;
>> +#endif
>> };
>
> Why do you need a pointer added to task_struct? One of the main points
> of the generic containers is to avoid every different subsystem and
> resource controller having to add new pointers there.
>
>> +
>> + rcu_read_lock();
>> + np = numproc_from_cont(task_container(current, &numproc_subsys));
>> + css_get_current(&np->css);
>
> There's no need to hold a reference here - by definition, the task's
> container can't go away while the task is in it.
>
> Also, shouldn't you have an attach() method to move the count from one
> container to another when a task moves?

The idea is:

Task may be "the entity that allocates the resources" and "the
entity that is a resource allocated".

When task is the first entity it may move across containers
(that is implemented in your patches). When task is a resource
it shouldn't move across containers like files or pages do.

More generally - allocated resources hold reference to original
container till they die. No resource migration is performed.

Did I express my idea cleanly?

> Paul
>
Re: [RFC][PATCH 1/7] Resource counters [message #10913 is a reply to message #10908] Wed, 07 March 2007 07:17 Go to previous messageGo to next message
xemul is currently offline  xemul
Messages: 248
Registered: November 2005
Senior Member
Balbir Singh wrote:
> Pavel Emelianov wrote:
>> Introduce generic structures and routines for
>> resource accounting.
>>
>> Each resource accounting container is supposed to
>> aggregate it, container_subsystem_state and its
>> resource-specific members within.
>>
>>
>> ------------------------------------------------------------ ------------
>>
>> diff -upr linux-2.6.20.orig/include/linux/res_counter.h
>> linux-2.6.20-0/include/linux/res_counter.h
>> --- linux-2.6.20.orig/include/linux/res_counter.h 2007-03-06
>> 13:39:17.000000000 +0300
>> +++ linux-2.6.20-0/include/linux/res_counter.h 2007-03-06
>> 13:33:28.000000000 +0300
>> @@ -0,0 +1,83 @@
>> +#ifndef __RES_COUNTER_H__
>> +#define __RES_COUNTER_H__
>> +/*
>> + * resource counters
>> + *
>> + * Copyright 2007 OpenVZ SWsoft Inc
>> + *
>> + * Author: Pavel Emelianov <xemul@openvz.org>
>> + *
>> + */
>> +
>> +#include <linux/container.h>
>> +
>> +struct res_counter {
>> + unsigned long usage;
>> + unsigned long limit;
>> + unsigned long failcnt;
>> + spinlock_t lock;
>> +};
>> +
>> +enum {
>> + RES_USAGE,
>> + RES_LIMIT,
>> + RES_FAILCNT,
>> +};
>> +
>> +ssize_t res_counter_read(struct res_counter *cnt, int member,
>> + const char __user *buf, size_t nbytes, loff_t *pos);
>> +ssize_t res_counter_write(struct res_counter *cnt, int member,
>> + const char __user *buf, size_t nbytes, loff_t *pos);
>> +
>> +static inline void res_counter_init(struct res_counter *cnt)
>> +{
>> + spin_lock_init(&cnt->lock);
>> + cnt->limit = (unsigned long)LONG_MAX;
>> +}
>> +
>
> Is there any way to indicate that there are no limits on this container.

Yes - LONG_MAX is essentially a "no limit" value as no
container will ever have such many files :)

> LONG_MAX is quite huge, but still when the administrator wants to
> configure a container to *un-limited usage*, it becomes hard for
> the administrator.
>
>> +static inline int res_counter_charge_locked(struct res_counter *cnt,
>> + unsigned long val)
>> +{
>> + if (cnt->usage <= cnt->limit - val) {
>> + cnt->usage += val;
>> + return 0;
>> + }
>> +
>> + cnt->failcnt++;
>> + return -ENOMEM;
>> +}
>> +
>> +static inline int res_counter_charge(struct res_counter *cnt,
>> + unsigned long val)
>> +{
>> + int ret;
>> + unsigned long flags;
>> +
>> + spin_lock_irqsave(&cnt->lock, flags);
>> + ret = res_counter_charge_locked(cnt, val);
>> + spin_unlock_irqrestore(&cnt->lock, flags);
>> + return ret;
>> +}
>> +
>
> Will atomic counters help here.

I'm afraid no. We have to atomically check for limit and alter
one of usage or failcnt depending on the checking result. Making
this with atomic_xxx ops will require at least two ops.

If we'll remove failcnt this would look like
while (atomic_cmpxchg(...))
which is also not that good.

Moreover - in RSS accounting patches I perform page list
manipulations under this lock, so this also saves one atomic op.

>> +static inline void res_counter_uncharge_locked(struct res_counter *cnt,
>> + unsigned long val)
>> +{
>> + if (unlikely(cnt->usage < val)) {
>> + WARN_ON(1);
>> + val = cnt->usage;
>> + }
>> +
>> + cnt->usage -= val;
>> +}
>> +
>> +static inline void res_counter_uncharge(struct res_counter *cnt,
>> + unsigned long val)
>> +{
>> + unsigned long flags;
>> +
>> + spin_lock_irqsave(&cnt->lock, flags);
>> + res_counter_uncharge_locked(cnt, val);
>> + spin_unlock_irqrestore(&cnt->lock, flags);
>> +}
>> +
>> +#endif
>> diff -upr linux-2.6.20.orig/init/Kconfig linux-2.6.20-0/init/Kconfig
>> --- linux-2.6.20.orig/init/Kconfig 2007-03-06 13:33:28.000000000 +0300
>> +++ linux-2.6.20-0/init/Kconfig 2007-03-06 13:33:28.000000000 +0300
>> @@ -265,6 +265,10 @@ config CPUSETS
>>
>> Say N if unsure.
>>
>> +config RESOURCE_COUNTERS
>> + bool
>> + select CONTAINERS
>> +
>> config SYSFS_DEPRECATED
>> bool "Create deprecated sysfs files"
>> default y
>> diff -upr linux-2.6.20.orig/kernel/Makefile
>> linux-2.6.20-0/kernel/Makefile
>> --- linux-2.6.20.orig/kernel/Makefile 2007-03-06 13:33:28.000000000
>> +0300
>> +++ linux-2.6.20-0/kernel/Makefile 2007-03-06 13:33:28.000000000 +0300
>> @@ -51,6 +51,7 @@ obj-$(CONFIG_RELAY) += relay.o
>> obj-$(CONFIG_UTS_NS) += utsname.o
>> obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
>> obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
>> +obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
>>
>> ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
>> # According to Alan Modra <alan@linuxcare.com.au>, the
>> -fno-omit-frame-pointer is
>> diff -upr linux-2.6.20.orig/kernel/res_counter.c
>> linux-2.6.20-0/kernel/res_counter.c
>> --- linux-2.6.20.orig/kernel/res_counter.c 2007-03-06
>> 13:39:17.000000000 +0300
>> +++ linux-2.6.20-0/kernel/res_counter.c 2007-03-06
>> 13:33:28.000000000 +0300
>> @@ -0,0 +1,72 @@
>> +/*
>> + * resource containers
>> + *
>> + * Copyright 2007 OpenVZ SWsoft Inc
>> + *
>> + * Author: Pavel Emelianov <xemul@openvz.org>
>> + *
>> + */
>> +
>> +#include <linux/parser.h>
>> +#include <linux/fs.h>
>> +#include <linux/res_counter.h>
>> +#include <asm/uaccess.h>
>> +
>> +static inline unsigned long *res_counter_member(struct res_counter
>> *cnt, int member)
>> +{
>> + switch (member) {
>> + case RES_USAGE:
>> + return &cnt->usage;
>> + case RES_LIMIT:
>> + return &cnt->limit;
>> + case RES_FAILCNT:
>> + return &cnt->failcnt;
>> + };
>> +
>> + BUG();
>> + return NULL;
>> +}
>> +
>> +ssize_t res_counter_read(struct res_counter *cnt, int member,
>> + const char __user *userbuf, size_t nbytes, loff_t *pos)
>> +{
>> + unsigned long *val;
>> + char buf[64], *s;
>> +
>> + s = buf;
>> + val = res_counter_member(cnt, member);
>> + s += sprintf(s, "%lu\n", *val);
>> + return simple_read_from_buffer((void __user *)userbuf, nbytes,
>> + pos, buf, s - buf);
>> +}
>> +
>> +ssize_t res_counter_write(struct res_counter *cnt, int member,
>> + const char __user *userbuf, size_t nbytes, loff_t *pos)
>> +{
>> + int ret;
>> + char *buf, *end;
>> + unsigned long tmp, *val;
>> +
>> + buf = kmalloc(nbytes + 1, GFP_KERNEL);
>> + ret = -ENOMEM;
>> + if (buf == NULL)
>> + goto out;
>> +
>> + buf[nbytes] = 0;
>> + ret = -EFAULT;
>> + if (copy_from_user(buf, userbuf, nbytes))
>> + goto out_free;
>> +
>> + ret = -EINVAL;
>> + tmp = simple_strtoul(buf, &end, 10);
>> + if (*end != '\0')
>> + goto out_free;
>> +
>> + val = res_counter_member(cnt, member);
>> + *val = tmp;
>> + ret = nbytes;
>> +out_free:
>> + kfree(buf);
>> +out:
>> + return ret;
>> +}
>>
>
>
> These bits look a little out of sync, with no users for these routines in
> this patch. Won't you get a compiler warning, compiling this bit alone?
>

Nope - when you have a non-static function without users in a
file no compiler warning produced.
...

Re: [RFC][PATCH 2/7] RSS controller core [message #10914 is a reply to message #10909] Wed, 07 March 2007 07:25 Go to previous messageGo to next message
xemul is currently offline  xemul
Messages: 248
Registered: November 2005
Senior Member
Balbir Singh wrote:
> Pavel Emelianov wrote:
>> This includes setup of RSS container within generic
>> process containers, all the declarations used in RSS
>> accounting, and core code responsible for accounting.
>>
>>
>> ------------------------------------------------------------ ------------
>>
>> diff -upr linux-2.6.20.orig/include/linux/rss_container.h
>> linux-2.6.20-0/include/linux/rss_container.h
>> --- linux-2.6.20.orig/include/linux/rss_container.h 2007-03-06
>> 13:39:17.000000000 +0300
>> +++ linux-2.6.20-0/include/linux/rss_container.h 2007-03-06
>> 13:33:28.000000000 +0300
>> @@ -0,0 +1,68 @@
>> +#ifndef __RSS_CONTAINER_H__
>> +#define __RSS_CONTAINER_H__
>> +/*
>> + * RSS container
>> + *
>> + * Copyright 2007 OpenVZ SWsoft Inc
>> + *
>> + * Author: Pavel Emelianov <xemul@openvz.org>
>> + *
>> + */
>> +
>> +struct page_container;
>> +struct rss_container;
>> +
>> +#ifdef CONFIG_RSS_CONTAINER
>> +int container_rss_prepare(struct page *, struct vm_area_struct *vma,
>> + struct page_container **);
>> +
>> +void container_rss_add(struct page_container *);
>> +void container_rss_del(struct page_container *);
>> +void container_rss_release(struct page_container *);
>> +
>> +int mm_init_container(struct mm_struct *mm, struct task_struct *tsk);
>> +void mm_free_container(struct mm_struct *mm);
>> +
>> +unsigned long container_isolate_pages(unsigned long nr_to_scan,
>> + struct rss_container *rss, struct list_head *dst,
>> + int active, unsigned long *scanned);
>> +unsigned long container_nr_physpages(struct rss_container *rss);
>> +
>> +unsigned long container_try_to_free_pages(struct rss_container *);
>> +void container_out_of_memory(struct rss_container *);
>> +
>> +void container_rss_init_early(void);
>> +#else
>> +static inline int container_rss_prepare(struct page *pg,
>> + struct vm_area_struct *vma, struct page_container **pc)
>> +{
>> + *pc = NULL; /* to make gcc happy */
>> + return 0;
>> +}
>> +
>> +static inline void container_rss_add(struct page_container *pc)
>> +{
>> +}
>> +
>> +static inline void container_rss_del(struct page_container *pc)
>> +{
>> +}
>> +
>> +static inline void container_rss_release(struct page_container *pc)
>> +{
>> +}
>> +
>> +static inline int mm_init_container(struct mm_struct *mm, struct
>> task_struct *t)
>> +{
>> + return 0;
>> +}
>> +
>> +static inline void mm_free_container(struct mm_struct *mm)
>> +{
>> +}
>> +
>> +static inline void container_rss_init_early(void)
>> +{
>> +}
>> +#endif
>> +#endif
>> diff -upr linux-2.6.20.orig/init/Kconfig linux-2.6.20-0/init/Kconfig
>> --- linux-2.6.20.orig/init/Kconfig 2007-03-06 13:33:28.000000000 +0300
>> +++ linux-2.6.20-0/init/Kconfig 2007-03-06 13:33:28.000000000 +0300
>> @@ -265,6 +265,13 @@ config CPUSETS
>> bool
>> select CONTAINERS
>>
>> +config RSS_CONTAINER
>> + bool "RSS accounting container"
>> + select RESOURCE_COUNTERS
>> + help
>> + Provides a simple Resource Controller for monitoring and
>> + controlling the total Resident Set Size of the tasks in a
>> container
>> +
>
> The wording looks very familiar :-). It would be useful to add
> "The reclaim logic is now container aware, when the container goes
> overlimit
> the page reclaimer reclaims pages belonging to this container. If we are
> unable to reclaim enough pages to satisfy the request, the process is
> killed with an out of memory warning"

OK. Thanks.

>
>> config SYSFS_DEPRECATED
>> bool "Create deprecated sysfs files"
>> default y
>> diff -upr linux-2.6.20.orig/mm/Makefile linux-2.6.20-0/mm/Makefile
>> --- linux-2.6.20.orig/mm/Makefile 2007-02-04 21:44:54.000000000 +0300
>> +++ linux-2.6.20-0/mm/Makefile 2007-03-06 13:33:28.000000000 +0300
>> @@ -29,3 +29,5 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_h
>> obj-$(CONFIG_FS_XIP) += filemap_xip.o
>> obj-$(CONFIG_MIGRATION) += migrate.o
>> obj-$(CONFIG_SMP) += allocpercpu.o
>> +
>> +obj-$(CONFIG_RSS_CONTAINER) += rss_container.o
>> diff -upr linux-2.6.20.orig/mm/rss_container.c
>> linux-2.6.20-0/mm/rss_container.c
>> --- linux-2.6.20.orig/mm/rss_container.c 2007-03-06
>> 13:39:17.000000000 +0300
>> +++ linux-2.6.20-0/mm/rss_container.c 2007-03-06 13:33:28.000000000
>> +0300
>> @@ -0,0 +1,307 @@
>> +/*
>> + * RSS accounting container
>> + *
>> + * Copyright 2007 OpenVZ SWsoft Inc
>> + *
>> + * Author: Pavel Emelianov <xemul@openvz.org>
>> + *
>> + */
>> +
>> +#include <linux/list.h>
>> +#include <linux/sched.h>
>> +#include <linux/mm.h>
>> +#include <linux/res_counter.h>
>> +#include <linux/rss_container.h>
>> +
>> +static struct container_subsys rss_subsys;
>> +
>> +struct rss_container {
>> + struct res_counter res;
>> + struct list_head page_list;
>> + struct container_subsys_state css;
>> +};
>> +
>> +struct page_container {
>> + struct page *page;
>> + struct rss_container *cnt;
>> + struct list_head list;
>> +};
>> +
>
> Yes, this is what I was planning to get to -- a per container LRU list.
> But you have just one list, don't you need active and inactive lists?
> When the global LRU is manipulated, shouldn't this list be updated as
> well, so that reclaim will pick the best pages.
>
>> +static inline struct rss_container *rss_from_cont(struct container *cnt)
>> +{
>> + return container_of(container_subsys_state(cnt, &rss_subsys),
>> + struct rss_container, css);
>> +}
>> +
>> +int mm_init_container(struct mm_struct *mm, struct task_struct *tsk)
>> +{
>> + struct rss_container *cnt;
>> +
>> + cnt = rss_from_cont(task_container(tsk, &rss_subsys));
>> + if (css_get(&cnt->css))
>> + return -EBUSY;
>> +
>> + mm->rss_container = cnt;
>> + return 0;
>> +}
>> +
>> +void mm_free_container(struct mm_struct *mm)
>> +{
>> + css_put(&mm->rss_container->css);
>> +}
>> +
>> +int container_rss_prepare(struct page *page, struct vm_area_struct *vma,
>> + struct page_container **ppc)
>> +{
>> + struct rss_container *rss;
>> + struct page_container *pc;
>> +
>> + rcu_read_lock();
>> + rss = rcu_dereference(vma->vm_mm->rss_container);
>> + css_get_current(&rss->css);
>> + rcu_read_unlock();
>> +
>> + pc = kmalloc(sizeof(struct page_container), GFP_KERNEL);
>> + if (pc == NULL)
>> + goto out_nomem;
>> +
>> + while (res_counter_charge(&rss->res, 1)) {
>> + if (container_try_to_free_pages(rss))
>> + continue;
>> +
>
> The return codes of the functions is a bit confusing, ideally
> container_try_to_free_pages() should return 0 on success. Also

This returns exactly what try_to_free_pages() does.

> res_counter_charge() has a WARN_ON(1) if the limit is exceeded.

Nope - res_counter_uncharge() has - this is an absolutely
sane check that we haven't over-uncharged resources.

> The system administrator can figure out the details from failcnt,
> I suspect when the container is running close to it's limit,
> dmesg will have too many WARNING messages.
>
> How much memory do you try to reclaim in container_try_to_free_pages()?

At least one page. This is enough to make one page charge.
That's the difference from general try_to_free_pages() that
returns success if it freed swap_cluster_max pages at least.

> With my patches, I was planning to export this knob to userspace with
> a default value. This will help the administrator decide how much
> of the working set/container LRU should be freed on reaching the limit.
> I cannot find the definition of container_try_to_free_pages() in
> this patch.

This is in patch #5.
Sorry for such a bad split - I'll make it cleaner next time :)

>
>
>> + container_out_of_memory(rss);
>> + if (test_thread_flag(TIF_MEMDIE))
>> + goto out_charge;
>> + }
>> +
>> + pc->page = page;
>> + pc->cnt = rss;
>> + *ppc = pc;
>> + return 0;
>> +
>> +out_charge:
>> + kfree(pc);
>> +out_nomem:
>> + css_put(&rss->css);
>> + return -ENOMEM;
>> +}
>> +
>> +void container_rss_release(struct page_container *pc)
>> +{
>> + struct rss_container *rss;
>> +
>> + rss = pc->cnt;
>> + res_counter_uncharge(&rss->res, 1);
>> + css_put(&rss->css);
>> + kfree(pc);
>> +}
>> +
>> +void container_rss_add(struct page_container *pc)
>> +{
>> + struct page *pg;
>> + struct rss_container *rss;
>> +
>> + pg = pc->page;
>> + rss = pc->cnt;
>> +
>> + spin_lock(&r
...

Re: [RFC][PATCH 0/7] Resource controllers based on process containers [message #10915 is a reply to message #10907] Wed, 07 March 2007 07:27 Go to previous messageGo to next message
xemul is currently offline  xemul
Messages: 248
Registered: November 2005
Senior Member
Paul Menage wrote:
> On 3/6/07, Pavel Emelianov <xemul@sw.ru> wrote:
>> 2. Extended containers may register themselves too late.
>> Kernel threads/helpers start forking, opening files
>> and touching pages much earlier. This patchset
>> workarounds this in not-so-cute manner and I'm waiting
>> for Paul's comments on this issue.
>>
>
> Can we not make sure that each subsystem registers itself before any
> of its resources become usable? So the file counting subsystem should

Actually all the subsystems I've sent became usable very early.
Much earlier that initcalls started. I didn't found where exactly
but I can make it if we really need it.

> register at some point before filp_open() becomes usable, and the
> process counting subsystem should register before it's possible to
> fork, etc.
>
> Paul
>
Re: [RFC][PATCH 0/7] Resource controllers based on process containers [message #10917 is a reply to message #10910] Wed, 07 March 2007 07:30 Go to previous messageGo to next message
xemul is currently offline  xemul
Messages: 248
Registered: November 2005
Senior Member
Balbir Singh wrote:
> Pavel Emelianov wrote:
>> This patchset adds RSS, accounting and control and
>> limiting the number of tasks and files within container.
>>
>> Based on top of Paul Menage's container subsystem v7
>>
>> RSS controller includes per-container RSS accounter,
>> reclamation and OOM killer. It behaves like standalone
>> machine - when container runs out of resources it tries
>> to reclaim some pages and if it doesn't succeed in it
>> kills some task which mm_struct belongs to container in
>> question.
>>
>> Num tasks and files containers are very simple and
>> self-descriptive from code.
>>
>> As discussed before when a task moves from one container
>> to another no resources follow it - they keep holding the
>> container they were allocated in.
>>
>
> I have one problem with the patchset, I cannot compile
> the patches individually and some of the code is hard
> to read as it depends on functions from future patches.
> Patch 2, 3 and 4 fail to compile without patch 5 applied.
>
> Patch 1 failed to apply with a reject in kernel/Makefile
> I applied it on top of 2.6.20 with all of Paul Menage's
> patches (all 7).

This sounds weird for me :( I've taken a stock 2.6.20
and applied Paul's patches. This is what this patchset
is applicable for.
Re: [RFC][PATCH 0/7] Resource controllers based on process containers [message #10920 is a reply to message #10917] Wed, 07 March 2007 09:30 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

Pavel Emelianov wrote:
> Balbir Singh wrote:
>
>>Pavel Emelianov wrote:
>>
>>>This patchset adds RSS, accounting and control and
>>>limiting the number of tasks and files within container.
>>>
>>>Based on top of Paul Menage's container subsystem v7
>>>
>>>RSS controller includes per-container RSS accounter,
>>>reclamation and OOM killer. It behaves like standalone
>>>machine - when container runs out of resources it tries
>>>to reclaim some pages and if it doesn't succeed in it
>>>kills some task which mm_struct belongs to container in
>>>question.
>>>
>>>Num tasks and files containers are very simple and
>>>self-descriptive from code.
>>>
>>>As discussed before when a task moves from one container
>>>to another no resources follow it - they keep holding the
>>>container they were allocated in.
>>>
>>
>>I have one problem with the patchset, I cannot compile
>>the patches individually and some of the code is hard
>>to read as it depends on functions from future patches.
>>Patch 2, 3 and 4 fail to compile without patch 5 applied.
>>
>>Patch 1 failed to apply with a reject in kernel/Makefile
>>I applied it on top of 2.6.20 with all of Paul Menage's
>>patches (all 7).
maybe Paul's patch should be taken w/o subsystems examples
(CKRM, UBC), i.e. first 3 patches only?

Kirill
Re: [RFC][PATCH 6/7] Account for the number of tasks within container [message #10943 is a reply to message #10912] Thu, 08 March 2007 13:49 Go to previous messageGo to next message
Paul Menage is currently offline  Paul Menage
Messages: 642
Registered: September 2006
Senior Member
On 3/6/07, Pavel Emelianov <xemul@sw.ru> wrote:
> The idea is:
>
> Task may be "the entity that allocates the resources" and "the
> entity that is a resource allocated".
>
> When task is the first entity it may move across containers
> (that is implemented in your patches). When task is a resource
> it shouldn't move across containers like files or pages do.
>
> More generally - allocated resources hold reference to original
> container till they die. No resource migration is performed.
>
> Did I express my idea cleanly?

Yes, but I disagree with the premise. The title of your patch is
"Account for the number of tasks within container", but that's not
what the subsystem does, it accounts for the number of forks within
the container that aren't directly accompanied by an exit.

Ideally, resources like files and pages would be able to follow tasks
as well. The reason that files and pages aren't easily migrated from
one container to another is that there could be sharing involved;
figuring out the sharing can be expensive, and it's not clear what to
do if two users are in different containers.

But in the case of a task count, there are no such issues with
sharing, so it seems to me to be more sensible (and more efficient) to
just limit the number of tasks in a container.

i.e. when moving a task into a container or forking a task within a
container, increment the count; when moving a task out of a container
or when it exits, decrement the count.

With your approach, if you were to set the task limit of an empty
container A to 1, and then move a process P from B into A, P would be
able to fork a new child, since the "task count" would be 0 (as P was
being charged to B still). Surely the fact that there's 1 process in A
should prevent P from forking?

Paul
Re: [RFC][PATCH 5/7] Per-container OOM killer and page reclamation [message #10957 is a reply to message #10893] Fri, 09 March 2007 21:21 Go to previous messageGo to next message
Balbir Singh is currently offline  Balbir Singh
Messages: 491
Registered: August 2006
Senior Member
Hi, Pavel,

Please find my patch to add LRU behaviour to your latest RSS controller.

Balbir Singh
Linux Technology Center
IBM, ISTL
Re: [RFC][PATCH 6/7] Account for the number of tasks within container [message #10985 is a reply to message #10943] Sun, 11 March 2007 08:34 Go to previous messageGo to next message
xemul is currently offline  xemul
Messages: 248
Registered: November 2005
Senior Member
Paul Menage wrote:
> On 3/6/07, Pavel Emelianov <xemul@sw.ru> wrote:
>> The idea is:
>>
>> Task may be "the entity that allocates the resources" and "the
>> entity that is a resource allocated".
>>
>> When task is the first entity it may move across containers
>> (that is implemented in your patches). When task is a resource
>> it shouldn't move across containers like files or pages do.
>>
>> More generally - allocated resources hold reference to original
>> container till they die. No resource migration is performed.
>>
>> Did I express my idea cleanly?
>
> Yes, but I disagree with the premise. The title of your patch is
> "Account for the number of tasks within container", but that's not
> what the subsystem does, it accounts for the number of forks within
> the container that aren't directly accompanied by an exit.
>
> Ideally, resources like files and pages would be able to follow tasks
> as well. The reason that files and pages aren't easily migrated from
> one container to another is that there could be sharing involved;
> figuring out the sharing can be expensive, and it's not clear what to
> do if two users are in different containers.
>
> But in the case of a task count, there are no such issues with
> sharing, so it seems to me to be more sensible (and more efficient) to
> just limit the number of tasks in a container.
>
> i.e. when moving a task into a container or forking a task within a
> container, increment the count; when moving a task out of a container
> or when it exits, decrement the count.

Sounds reasonable.
I'll take this into account when I make the next iteration.
Thanks.

> With your approach, if you were to set the task limit of an empty
> container A to 1, and then move a process P from B into A, P would be
> able to fork a new child, since the "task count" would be 0 (as P was
> being charged to B still). Surely the fact that there's 1 process in A
> should prevent P from forking?
>
> Paul
>
Re: [RFC][PATCH 5/7] Per-container OOM killer and page reclamation [message #10986 is a reply to message #10957] Sun, 11 March 2007 08:39 Go to previous messageGo to next message
xemul is currently offline  xemul
Messages: 248
Registered: November 2005
Senior Member
Balbir Singh wrote:
> Hi, Pavel,
>
> Please find my patch to add LRU behaviour to your latest RSS controller.

Thanks for participation and additional testing :)
I'll include this into next generation of patches.

> Balbir Singh
> Linux Technology Center
> IBM, ISTL
>
>
> ------------------------------------------------------------ ------------
>
> Add LRU behaviour to the RSS controller patches posted by Pavel Emelianov
>
> http://lkml.org/lkml/2007/3/6/198
>
> which was in turn similar to the RSS controller posted by me
>
> http://lkml.org/lkml/2007/2/26/8
>
> Pavel's patches have a per container list of pages, which helps reduce
> reclaim time of the RSS controller but the per container list of pages is
> in FIFO order. I've implemented active and inactive lists per container to
> help select the right set of pages to reclaim when the container is under
> memory pressure.
>
> I've tested these patches on a ppc64 machine and they work fine for
> the minimal testing I've done.
>
> Pavel would you please include these patches in your next iteration.
>
> Comments, suggestions and further improvements are as always welcome!
>
> Signed-off-by: <balbir@in.ibm.com>
> ---
>
> include/linux/rss_container.h | 1
> mm/rss_container.c | 47 +++++++++++++++++++++++++++++++-----------
> mm/swap.c | 5 ++++
> mm/vmscan.c | 3 ++
> 4 files changed, 44 insertions(+), 12 deletions(-)
>
> diff -puN include/linux/rss_container.h~rss-container-lru2 include/linux/rss_container.h
> --- linux-2.6.20/include/linux/rss_container.h~rss-container-lru 2 2007-03-09 22:52:56.000000000 +0530
> +++ linux-2.6.20-balbir/include/linux/rss_container.h 2007-03-10 00:39:59.000000000 +0530
> @@ -19,6 +19,7 @@ int container_rss_prepare(struct page *,
> void container_rss_add(struct page_container *);
> void container_rss_del(struct page_container *);
> void container_rss_release(struct page_container *);
> +void container_rss_move_lists(struct page *pg, bool active);
>
> int mm_init_container(struct mm_struct *mm, struct task_struct *tsk);
> void mm_free_container(struct mm_struct *mm);
> diff -puN mm/rss_container.c~rss-container-lru2 mm/rss_container.c
> --- linux-2.6.20/mm/rss_container.c~rss-container-lru2 2007-03-09 22:52:56.000000000 +0530
> +++ linux-2.6.20-balbir/mm/rss_container.c 2007-03-10 02:42:54.000000000 +0530
> @@ -17,7 +17,8 @@ static struct container_subsys rss_subsy
>
> struct rss_container {
> struct res_counter res;
> - struct list_head page_list;
> + struct list_head inactive_list;
> + struct list_head active_list;
> struct container_subsys_state css;
> };
>
> @@ -96,6 +97,26 @@ void container_rss_release(struct page_c
> kfree(pc);
> }
>
> +void container_rss_move_lists(struct page *pg, bool active)
> +{
> + struct rss_container *rss;
> + struct page_container *pc;
> +
> + if (!page_mapped(pg))
> + return;
> +
> + pc = page_container(pg);
> + BUG_ON(!pc);
> + rss = pc->cnt;
> +
> + spin_lock_irq(&rss->res.lock);
> + if (active)
> + list_move(&pc->list, &rss->active_list);
> + else
> + list_move(&pc->list, &rss->inactive_list);
> + spin_unlock_irq(&rss->res.lock);
> +}
> +
> void container_rss_add(struct page_container *pc)
> {
> struct page *pg;
> @@ -105,7 +126,7 @@ void container_rss_add(struct page_conta
> rss = pc->cnt;
>
> spin_lock(&rss->res.lock);
> - list_add(&pc->list, &rss->page_list);
> + list_add(&pc->list, &rss->active_list);
> spin_unlock(&rss->res.lock);
>
> page_container(pg) = pc;
> @@ -141,7 +162,10 @@ unsigned long container_isolate_pages(un
> struct zone *z;
>
> spin_lock_irq(&rss->res.lock);
> - src = &rss->page_list;
> + if (active)
> + src = &rss->active_list;
> + else
> + src = &rss->inactive_list;
>
> for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
> pc = list_entry(src->prev, struct page_container, list);
> @@ -152,13 +176,10 @@ unsigned long container_isolate_pages(un
>
> spin_lock(&z->lru_lock);
> if (PageLRU(page)) {
> - if ((active && PageActive(page)) ||
> - (!active && !PageActive(page))) {
> - if (likely(get_page_unless_zero(page))) {
> - ClearPageLRU(page);
> - nr_taken++;
> - list_move(&page->lru, dst);
> - }
> + if (likely(get_page_unless_zero(page))) {
> + ClearPageLRU(page);
> + nr_taken++;
> + list_move(&page->lru, dst);
> }
> }
> spin_unlock(&z->lru_lock);
> @@ -212,7 +233,8 @@ static int rss_create(struct container_s
> return -ENOMEM;
>
> res_counter_init(&rss->res);
> - INIT_LIST_HEAD(&rss->page_list);
> + INIT_LIST_HEAD(&rss->inactive_list);
> + INIT_LIST_HEAD(&rss->active_list);
> cont->subsys[rss_subsys.subsys_id] = &rss->css;
> return 0;
> }
> @@ -284,7 +306,8 @@ static __init int rss_create_early(struc
>
> rss = &init_rss_container;
> res_counter_init(&rss->res);
> - INIT_LIST_HEAD(&rss->page_list);
> + INIT_LIST_HEAD(&rss->inactive_list);
> + INIT_LIST_HEAD(&rss->active_list);
> cont->subsys[rss_subsys.subsys_id] = &rss->css;
> ss->create = rss_create;
> return 0;
> diff -puN mm/vmscan.c~rss-container-lru2 mm/vmscan.c
> --- linux-2.6.20/mm/vmscan.c~rss-container-lru2 2007-03-09 22:52:56.000000000 +0530
> +++ linux-2.6.20-balbir/mm/vmscan.c 2007-03-10 00:42:35.000000000 +0530
> @@ -1142,6 +1142,7 @@ static unsigned long container_shrink_pa
> else
> add_page_to_inactive_list(z, page);
> spin_unlock_irq(&z->lru_lock);
> + container_rss_move_lists(page, false);
>
> put_page(page);
> }
> @@ -1191,6 +1192,7 @@ static void container_shrink_pages_activ
> list_move(&page->lru, &z->inactive_list);
> z->nr_inactive++;
> spin_unlock_irq(&z->lru_lock);
> + container_rss_move_lists(page, false);
>
> put_page(page);
> }
> @@ -1206,6 +1208,7 @@ static void container_shrink_pages_activ
> list_move(&page->lru, &z->active_list);
> z->nr_active++;
> spin_unlock_irq(&z->lru_lock);
> + container_rss_move_lists(page, true);
>
> put_page(page);
> }
> diff -puN mm/swap.c~rss-container-lru2 mm/swap.c
> --- linux-2.6.20/mm/swap.c~rss-container-lru2 2007-03-10 00:42:38.000000000 +0530
> +++ linux-2.6.20-balbir/mm/swap.c 2007-03-10 01:20:39.000000000 +0530
> @@ -30,6 +30,7 @@
> #include <linux/cpu.h>
> #include <linux/notifier.h>
> #include <linux/init.h>
> +#include <linux/rss_container.h>
>
> /* How many pages do we try to swap or page in/out together? */
> int page_cluster;
> @@ -140,6 +141,7 @@ int rotate_reclaimable_page(struct page
> void fastcall activate_page(struct page *page)
> {
> struct zone *zone = page_zone(page);
> + bool moved = false;
>
> spin_lock_irq(&zone->lru_lock);
> if (PageLRU(page) && !PageActive(page)) {
> @@ -147,8 +149,11 @@ void fastcall activate_page(struct page
> SetPageActive(page);
> add_page_to_active_list(zone, page);
> __count_vm_event(PGACTIVATE);
> + moved = true;
> }
> spin_unlock_irq(&zone->lru_lock);
> + if (moved)
> + container_rss_move_lists(page, true);
> }
>
> /*
> _
Re: [RFC][PATCH 2/7] RSS controller core [message #10993 is a reply to message #10902] Sun, 11 March 2007 12:13 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

Andrew Morton wrote:
> On Tue, 06 Mar 2007 17:55:29 +0300
> Pavel Emelianov <xemul@sw.ru> wrote:
>
>
>>+struct rss_container {
>>+ struct res_counter res;
>>+ struct list_head page_list;
>>+ struct container_subsys_state css;
>>+};
>>+
>>+struct page_container {
>>+ struct page *page;
>>+ struct rss_container *cnt;
>>+ struct list_head list;
>>+};
>
>
> ah. This looks good. I'll find a hunk of time to go through this work
> and through Paul's patches. It'd be good to get both patchsets lined
> up in -mm within a couple of weeks. But..
>
> We need to decide whether we want to do per-container memory limitation via
> these data structures, or whether we do it via a physical scan of some
> software zone, possibly based on Mel's patches.
i.e. a separate memzone for each container?
imho memzone approach is inconvinient for pages sharing and shares accounting.
it also makes memory management more strict, forbids overcommiting
per-container etc.
Maybe you have some ideas how we can decide on this?

Thanks,
Kirill
Re: [RFC][PATCH 2/7] RSS controller core [message #11001 is a reply to message #10993] Sun, 11 March 2007 12:51 Go to previous messageGo to next message
Andrew Morton is currently offline  Andrew Morton
Messages: 127
Registered: December 2005
Senior Member
> On Sun, 11 Mar 2007 15:26:41 +0300 Kirill Korotaev <dev@sw.ru> wrote:
> Andrew Morton wrote:
> > On Tue, 06 Mar 2007 17:55:29 +0300
> > Pavel Emelianov <xemul@sw.ru> wrote:
> >
> >
> >>+struct rss_container {
> >>+ struct res_counter res;
> >>+ struct list_head page_list;
> >>+ struct container_subsys_state css;
> >>+};
> >>+
> >>+struct page_container {
> >>+ struct page *page;
> >>+ struct rss_container *cnt;
> >>+ struct list_head list;
> >>+};
> >
> >
> > ah. This looks good. I'll find a hunk of time to go through this work
> > and through Paul's patches. It'd be good to get both patchsets lined
> > up in -mm within a couple of weeks. But..
> >
> > We need to decide whether we want to do per-container memory limitation via
> > these data structures, or whether we do it via a physical scan of some
> > software zone, possibly based on Mel's patches.
> i.e. a separate memzone for each container?

Yep. Straightforward machine partitioning. An attractive thing is that it
100% reuses existing page reclaim, unaltered.

> imho memzone approach is inconvinient for pages sharing and shares accounting.
> it also makes memory management more strict, forbids overcommiting
> per-container etc.

umm, who said they were requirements?

> Maybe you have some ideas how we can decide on this?

We need to work out what the requirements are before we can settle on an
implementation.

Sigh. Who is running this show? Anyone?

You can actually do a form of overcommittment by allowing multiple
containers to share one or more of the zones. Whether that is sufficient
or suitable I don't know. That depends on the requirements, and we haven't
even discussed those, let alone agreed to them.
Re: [RFC][PATCH 2/7] RSS controller core [message #11009 is a reply to message #11001] Sun, 11 March 2007 15:51 Go to previous messageGo to next message
Balbir Singh is currently offline  Balbir Singh
Messages: 491
Registered: August 2006
Senior Member
On 3/11/07, Andrew Morton <akpm@linux-foundation.org> wrote:
> > On Sun, 11 Mar 2007 15:26:41 +0300 Kirill Korotaev <dev@sw.ru> wrote:
> > Andrew Morton wrote:
> > > On Tue, 06 Mar 2007 17:55:29 +0300
> > > Pavel Emelianov <xemul@sw.ru> wrote:
> > >
> > >
> > >>+struct rss_container {
> > >>+ struct res_counter res;
> > >>+ struct list_head page_list;
> > >>+ struct container_subsys_state css;
> > >>+};
> > >>+
> > >>+struct page_container {
> > >>+ struct page *page;
> > >>+ struct rss_container *cnt;
> > >>+ struct list_head list;
> > >>+};
> > >
> > >
> > > ah. This looks good. I'll find a hunk of time to go through this work
> > > and through Paul's patches. It'd be good to get both patchsets lined
> > > up in -mm within a couple of weeks. But..
> > >
> > > We need to decide whether we want to do per-container memory limitation via
> > > these data structures, or whether we do it via a physical scan of some
> > > software zone, possibly based on Mel's patches.
> > i.e. a separate memzone for each container?
>
> Yep. Straightforward machine partitioning. An attractive thing is that it
> 100% reuses existing page reclaim, unaltered.

We discussed zones for resource control and some of the disadvantages at
http://lkml.org/lkml/2006/10/30/222

I need to look at Mel's patches to determine if they are suitable for
control. But in a thread of discussion on those patches, it was agreed
that memory fragmentation and resource control are independent issues.


>
> > imho memzone approach is inconvinient for pages sharing and shares accounting.
> > it also makes memory management more strict, forbids overcommiting
> > per-container etc.
>
> umm, who said they were requirements?
>

We discussed some of the requirements in the RFC: Memory Controller
requirements thread
http://lkml.org/lkml/2006/10/30/51

> > Maybe you have some ideas how we can decide on this?
>
> We need to work out what the requirements are before we can settle on an
> implementation.
>
> Sigh. Who is running this show? Anyone?
>

All the stake holders involved in the RFC discussion :-) We've been
talking and building on top of each others patches. I hope that was a
good answer ;)

> You can actually do a form of overcommittment by allowing multiple
> containers to share one or more of the zones. Whether that is sufficient
> or suitable I don't know. That depends on the requirements, and we haven't
> even discussed those, let alone agreed to them.
>

There are other things like resizing a zone, finding the right size,
etc. I'll look
at Mel's patches to see what is supported.

Warm Regards,
Balbir Singh
Re: Re: [RFC][PATCH 2/7] RSS controller core [message #11024 is a reply to message #10902] Mon, 12 March 2007 09:10 Go to previous messageGo to next message
Kirill Korotaev is currently offline  Kirill Korotaev
Messages: 137
Registered: January 2006
Senior Member
Eric,

> And misses every resource sharing opportunity in sight.

that was my point too.

> Except for
> filtering the which pages are eligible for reclaim an RSS limit should
> not need to change the existing reclaim logic, and with things like the
> memory zones we have had that kind of restriction in the reclaim logic
> for a long time. So filtering out ineligible pages isn't anything new.

exactly this is implemented in the current patches from Pavel.
the only difference is that filtering is not done in general LRU list,
which is not effective, but via per-container LRU list.
So the pointer on the page structure does 2 things:
- fast reclamation
- correct uncharging of page from where it was charged
(e.g. shared pages can be mapped first in one container, but the last unmap
done from another one).

>>We need to work out what the requirements are before we can settle on an
>>implementation.
>
>
> If you are talking about RSS limits the term is well defined. The
> number of pages you can have mapped into your set of address space at
> any given time.
>
> Unless I'm totally blind that isn't what the patchset implements.

Ouch, what makes you think so?
The fact that a page mapped into 2 different processes is charged only once?
Imho it is much more correct then sum of process' RSS within container, due to:
1. it is clear how much container uses physical pages, not abstract items
2. shared pages are charged only once, so the sum of containers RSS is still
about physical RAM.

> A
> true RSS limit over multiple processes has a lot of potential to be
> generally useful, is very understandable, doesn't affect kernel cache
> decisions so largely performance should not be affected. There is a
> little more overhead in the fault logic but that is a moderately
> expensive path anyway.

100% agree here.

>>You can actually do a form of overcommittment by allowing multiple
>>containers to share one or more of the zones. Whether that is sufficient
>>or suitable I don't know. That depends on the requirements, and we haven't
>>even discussed those, let alone agreed to them.
>
>
> Another really nasty issue is the container term as the resource guys
> are using the term in a subtlety different way then it has been used
> with namespaces leading to several threads where the participants talked
> past each other. We need a different term to designate the group of
> tasks a resource controller is dealing with.
taskgrp? resgrp?

> The whole filesystem interface also is over general and makes it too
> easy to express the hard things (like move an existing task from one
> group of tasks to another) leading to code complications.
the things which are not supported are easy to disable.

> On the up side I think the code the focus is likely in the right place
> to start delivering usable code.

Thanks,
Kirill
Re: Re: [RFC][PATCH 2/7] RSS controller core [message #11079 is a reply to message #11024] Tue, 13 March 2007 09:26 Go to previous messageGo to next message
ebiederm is currently offline  ebiederm
Messages: 1354
Registered: February 2006
Senior Member
Kirill Korotaev <dev@openvz.org> writes:

> Eric,
>
>> And misses every resource sharing opportunity in sight.
>
> that was my point too.
>
>> Except for
>> filtering the which pages are eligible for reclaim an RSS limit should
>> not need to change the existing reclaim logic, and with things like the
>> memory zones we have had that kind of restriction in the reclaim logic
>> for a long time. So filtering out ineligible pages isn't anything new.
>
> exactly this is implemented in the current patches from Pavel.
> the only difference is that filtering is not done in general LRU list,
> which is not effective, but via per-container LRU list.
> So the pointer on the page structure does 2 things:
> - fast reclamation
Better than the rmap list?
> - correct uncharging of page from where it was charged
> (e.g. shared pages can be mapped first in one container, but the last unmap
> done from another one).
We should charge/uncharge all of them, not just one.

>>>We need to work out what the requirements are before we can settle on an
>>>implementation.
>>
>>
>> If you are talking about RSS limits the term is well defined. The
>> number of pages you can have mapped into your set of address space at
>> any given time.
>>
>> Unless I'm totally blind that isn't what the patchset implements.
>
> Ouch, what makes you think so?
> The fact that a page mapped into 2 different processes is charged only once?
> Imho it is much more correct then sum of process' RSS within container, due to:
> 1. it is clear how much container uses physical pages, not abstract items
> 2. shared pages are charged only once, so the sum of containers RSS is still
> about physical RAM.

No the fact that a page mapped into 2 separate mm_structs in two
separate accounting domains is counted only once. This is very likely
to happen with things like glibc if you have a read-only shared copy
of your distro. There appears to be no technical reason for such a
restriction.

A page should not be owned.

Going further unless the limits are draconian I don't expect users to
hit the rss limits often or frequently. So in 99% of all cases page
reclaim should continue to be global. Which makes me question messing
with the general page reclaim lists.

Now if the normal limits turn out to be draconian it may make sense to
split the first level of page lists by some reasonable approximation
to their rss group, so we don't normally scan unnecessary pages.

>> The whole filesystem interface also is over general and makes it too
>> easy to express the hard things (like move an existing task from one
>> group of tasks to another) leading to code complications.
> the things which are not supported are easy to disable.

Maybe. The extra locking complexity gives me fits. But in the grand
scheme of things it is minor as long as it is not user perceptible we
can fix it later. I'm still wrapping my head around the weird fs concepts.

Eric
Re: Re: [RFC][PATCH 1/7] Resource counters [message #11081 is a reply to message #10908] Tue, 13 March 2007 09:36 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

>> - doesn't store the accounted value but
>> limit - accounted (i.e. the free resource)
>> - uses atomic_add_return()
>> - when negative, an error is returned and
>> the resource amount is added back
>>
>>changes to the limit have to adjust the 'current'
>>value too, but that is again simple and atomic
>>
>>best,
>>Herbert
>>
>>PS: atomic_add_unless() didn't exist back then
>>(at least I think so) but that might be an option
>>too ...
>
>
> I think as far as having this discussion if you can remove that race
> people will be more willing to talk about what vserver does.
>
> That said anything that uses locks or atomic operations (finer grained locks)
> because of the cache line ping pong is going to have scaling issues on large
> boxes.

> So in that sense anything short of per cpu variables sucks at scale. That said
> I would much rather get a simple correct version without the complexity of
> per cpu counters, before we optimize the counters that much.
fully agree with it. We need to get a working version first.

FYI, in OVZ we recently added such optimizations: reserves like in TCP/IP,
e.g. for kmemsize, numfile these reserves are done on task-basis for
fast charges/uncharges w/o involving lock operations.
On task exit reserves are returned back to the beancounter.

As it demonstrated atomic counters can be replaced with
task-reserves on the next step.

Thanks,
Kirill
Re: Re: [RFC][PATCH 2/7] RSS controller core [message #11083 is a reply to message #10902] Tue, 13 March 2007 10:06 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

Andrew Morton wrote:
>>>> - shared mappings of 'shared' files (binaries
>>>> and libraries) to allow for reduced memory
>>>> footprint when N identical guests are running
>>>
>>>So, it sounds like this can be phrased as a requirement like:
>>>
>>> "Guests must be able to share pages."
>>>
>>>Can you give us an idea why this is so?
>>
>>sure, one reason for this is that guests tend to
>>be similar (or almost identical) which results
>>in quite a lot of 'shared' libraries and executables
>>which would otherwise get cached for each guest and
>>would also be mapped for each guest separately
>
>
> nooooooo. What you're saying there amounts to text replication. There is
> no proposal here to create duplicated copies of pagecache pages: the VM
> just doesn't support that (Nick has soe protopatches which do this as a
> possible NUMA optimisation).
>
> So these mmapped pages will contiue to be shared across all guests. The
> problem boils down to "which guest(s) get charged for each shared page".
>
> A simple and obvious and easy-to-implement answer is "the guest which paged
> it in". I think we should firstly explain why that is insufficient.
I guess by "paged it in" you essentially mean
"mapped the page into address space for the *first* time"?

i.e. how many times the same page mapped into 2 address spaces
in the same container should be accounted for?

We believe ONE. It is better due to:
- it allows better estimate how much RAM container uses.
- if one container mapped a single page 10,000 times,
it doesn't mean it is worse than a container which mapped only 200 pages
and that it should be killed in case of OOM.

Thanks,
Kirill
Re: Re: [RFC][PATCH 2/7] RSS controller core [message #11085 is a reply to message #11083] Tue, 13 March 2007 10:49 Go to previous messageGo to next message
Andrew Morton is currently offline  Andrew Morton
Messages: 127
Registered: December 2005
Senior Member
> On Tue, 13 Mar 2007 13:19:53 +0300 Kirill Korotaev <dev@sw.ru> wrote:
> Andrew Morton wrote:
> >>>> - shared mappings of 'shared' files (binaries
> >>>> and libraries) to allow for reduced memory
> >>>> footprint when N identical guests are running
> >>>
> >>>So, it sounds like this can be phrased as a requirement like:
> >>>
> >>> "Guests must be able to share pages."
> >>>
> >>>Can you give us an idea why this is so?
> >>
> >>sure, one reason for this is that guests tend to
> >>be similar (or almost identical) which results
> >>in quite a lot of 'shared' libraries and executables
> >>which would otherwise get cached for each guest and
> >>would also be mapped for each guest separately
> >
> >
> > nooooooo. What you're saying there amounts to text replication. There is
> > no proposal here to create duplicated copies of pagecache pages: the VM
> > just doesn't support that (Nick has soe protopatches which do this as a
> > possible NUMA optimisation).
> >
> > So these mmapped pages will contiue to be shared across all guests. The
> > problem boils down to "which guest(s) get charged for each shared page".
> >
> > A simple and obvious and easy-to-implement answer is "the guest which paged
> > it in". I think we should firstly explain why that is insufficient.
> I guess by "paged it in" you essentially mean
> "mapped the page into address space for the *first* time"?

Not really - I mean "first allocated the page". ie: major fault(), read(),
write(), etc.

> i.e. how many times the same page mapped into 2 address spaces
> in the same container should be accounted for?
>
> We believe ONE. It is better due to:
> - it allows better estimate how much RAM container uses.
> - if one container mapped a single page 10,000 times,
> it doesn't mean it is worse than a container which mapped only 200 pages
> and that it should be killed in case of OOM.

I'm not sure that we need to account for pages at all, nor care about rss.

If we use a physical zone-based containment scheme: fake-numa,
variable-sized zones, etc then it all becomes moot. You set up a container
which has 1.5GB of physial memory then toss processes into it. As that
process set increases in size it will toss out stray pages which shouldn't
be there, then it will start reclaiming and swapping out its own pages and
eventually it'll get an oom-killing.

No RSS acounting or page acounting in sight, because we already *have* that
stuff, at the physical level, in the zone.

Overcommitment can be performed by allowing different containers to share
the same zone set, or by dynamically increasing or decreasing the size of
a physical container.

This all works today with fake-numa and cpusets, no kernel changes needed.

It could be made to work fairly simply with a multi-zone approach, or with
resizeable zones.

I'd be interested in knowing what you think the shortcomings of this are
likely to be,.
Re: Re: [RFC][PATCH 2/7] RSS controller core [message #11103 is a reply to message #11085] Tue, 13 March 2007 14:59 Go to previous messageGo to next message
Herbert Poetzl is currently offline  Herbert Poetzl
Messages: 239
Registered: February 2006
Senior Member
On Tue, Mar 13, 2007 at 03:48:34AM -0800, Andrew Morton wrote:
> > On Tue, 13 Mar 2007 13:19:53 +0300 Kirill Korotaev <dev@sw.ru> wrote:
> > Andrew Morton wrote:
> > >>>> - shared mappings of 'shared' files (binaries
> > >>>> and libraries) to allow for reduced memory
> > >>>> footprint when N identical guests are running
> > >>>
> > >>>So, it sounds like this can be phrased as a requirement like:
> > >>>
> > >>> "Guests must be able to share pages."
> > >>>
> > >>>Can you give us an idea why this is so?
> > >>
> > >>sure, one reason for this is that guests tend to
> > >>be similar (or almost identical) which results
> > >>in quite a lot of 'shared' libraries and executables
> > >>which would otherwise get cached for each guest and
> > >>would also be mapped for each guest separately
> > >
> > > nooooooo. What you're saying there amounts to text replication.
> > > There is no proposal here to create duplicated copies of pagecache
> > > pages: the VM just doesn't support that (Nick has soe protopatches
> > > which do this as a possible NUMA optimisation).
> > >
> > > So these mmapped pages will contiue to be shared across all
> > > guests. The problem boils down to "which guest(s) get charged for
> > > each shared page".
> > >
> > > A simple and obvious and easy-to-implement answer is "the guest
> > > which paged it in". I think we should firstly explain why that is
> > > insufficient.

> > I guess by "paged it in" you essentially mean
> > "mapped the page into address space for the *first* time"?
>
> Not really - I mean "first allocated the page". ie: major fault(),
> read(), write(), etc.
>
> > i.e. how many times the same page mapped into 2 address spaces
> > in the same container should be accounted for?
> >
> > We believe ONE. It is better due to:
> > - it allows better estimate how much RAM container uses.
> > - if one container mapped a single page 10,000 times,
> > it doesn't mean it is worse than a container which mapped only 200
> > pages and that it should be killed in case of OOM.
>
> I'm not sure that we need to account for pages at all, nor care about
> rss.
>
> If we use a physical zone-based containment scheme: fake-numa,
> variable-sized zones, etc then it all becomes moot.

sounds good to me, just not sure it provides what we
need, but I'm sure I'll figure that with your help ...

> You set up a container which has 1.5GB of physial memory then toss
> processes into it. As that process set increases in size it will
> toss out stray pages which shouldn't be there, then it will start
> reclaiming and swapping out its own pages and eventually it'll get an
> oom-killing.

okay, let me ask a few naive questions about this scheme:

how does this work for a _file_ which is shared between
two guests (e.g. an executable like bash, hardlinked
between guests) when both guests are in a different
zone-based container?

+ assumed that the file is read in the first time,
will it be accounted to the first guest doing so?

+ assumed it is accessed in the second guest, will
it cause any additional cache/mapping besides the
dentry stuff?

+ will container A be able to 'toss out' pages
'shared' with container B (assumed sharing is
possible :)

+ when the container A tosses out the pages for this
executable, will guest B still be able to use them?

+ when the pages are tossed out, will they require
the system to read them in again, or will they
stay available ala swap cache?

> No RSS acounting or page acounting in sight, because we already *have*
> that stuff, at the physical level, in the zone.

I'm fine with that ...

> Overcommitment can be performed by allowing different containers to
> share the same zone set, or by dynamically increasing or decreasing
> the size of a physical container.

here the question is, can a guest have several of
those 'virtual zones' assigned, so that there is a
container specific and a shared zone for example?

> This all works today with fake-numa and cpusets, no kernel changes
> needed.

sounds good!

> It could be made to work fairly simply with a multi-zone approach, or
> with resizeable zones.
>
> I'd be interested in knowing what you think the shortcomings of
> this are likely to be,.

will do so once I have a better understanding how this
approach will work ...

TIA,
Herbert
Re: Re: [RFC][PATCH 2/7] RSS controller core [message #11104 is a reply to message #11079] Tue, 13 March 2007 15:30 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

Eric,

>>>And misses every resource sharing opportunity in sight.
>>
>>that was my point too.
>>
>>
>>>Except for
>>>filtering the which pages are eligible for reclaim an RSS limit should
>>>not need to change the existing reclaim logic, and with things like the
>>>memory zones we have had that kind of restriction in the reclaim logic
>>>for a long time. So filtering out ineligible pages isn't anything new.
>>
>>exactly this is implemented in the current patches from Pavel.
>>the only difference is that filtering is not done in general LRU list,
>>which is not effective, but via per-container LRU list.
>>So the pointer on the page structure does 2 things:
>>- fast reclamation
>
> Better than the rmap list?
>
>>- correct uncharging of page from where it was charged
>> (e.g. shared pages can be mapped first in one container, but the last unmap
>> done from another one).
>
> We should charge/uncharge all of them, not just one.
>
>
>>>>We need to work out what the requirements are before we can settle on an
>>>>implementation.
>>>
>>>
>>>If you are talking about RSS limits the term is well defined. The
>>>number of pages you can have mapped into your set of address space at
>>>any given time.
>>>
>>>Unless I'm totally blind that isn't what the patchset implements.
>>
>>Ouch, what makes you think so?
>>The fact that a page mapped into 2 different processes is charged only once?
>>Imho it is much more correct then sum of process' RSS within container, due to:
>>1. it is clear how much container uses physical pages, not abstract items
>>2. shared pages are charged only once, so the sum of containers RSS is still
>> about physical RAM.
>
>
> No the fact that a page mapped into 2 separate mm_structs in two
> separate accounting domains is counted only once. This is very likely
> to happen with things like glibc if you have a read-only shared copy
> of your distro. There appears to be no technical reason for such a
> restriction.
>
> A page should not be owned.

I would be happy to propose OVZ approach then, where a page is tracked
with page_beancounter data structure, which ties together
a page with beancounters which use it like this:

page -> page_beancounter -> list of beanocunters which has the page mapped

This gives a number of advantages:
- the page is accounted to all the VEs which actually use it.
- allows almost accurate tracking of page fractions used by VEs
depending on how many VEs mapped the page.
- allows to track dirty pages, i.e. which VE dirtied the page
and implement correct disk I/O accounting and CFQ write scheduling
based on VE priorities.

> Going further unless the limits are draconian I don't expect users to
> hit the rss limits often or frequently. So in 99% of all cases page
> reclaim should continue to be global. Which makes me question messing
> with the general page reclaim lists.

It is not that rare when containers hit their limits, believe me :/
In trusted environments - probably you are right, in hosting - no.

Thanks,
Kirill
Re: [RFC][PATCH 4/7] RSS accounting hooks over the code [message #11169 is a reply to message #10892] Wed, 14 March 2007 15:43 Go to previous messageGo to next message
xemul is currently offline  xemul
Messages: 248
Registered: November 2005
Senior Member
Cedric Le Goater wrote:
>> --- linux-2.6.20.orig/mm/migrate.c 2007-02-04 21:44:54.000000000 +0300
>> +++ linux-2.6.20-0/mm/migrate.c 2007-03-06 13:33:28.000000000 +0300
>> @@ -134,6 +134,7 @@ static void remove_migration_pte(struct
>> pte_t *ptep, pte;
>> spinlock_t *ptl;
>> unsigned long addr = page_address_in_vma(new, vma);
>> + struct page_container *pcont;
>>
>> if (addr == -EFAULT)
>> return;
>> @@ -157,6 +158,11 @@ static void remove_migration_pte(struct
>> return;
>> }
>>
>> + if (container_rss_prepare(new, vma, &pcont)) {
>> + pte_unmap(ptep);
>> + return;
>> + }
>> +
>> ptl = pte_lockptr(mm, pmd);
>> spin_lock(ptl);
>> pte = *ptep;
>> @@ -175,16 +181,19 @@ static void remove_migration_pte(struct
>> set_pte_at(mm, addr, ptep, pte);
>>
>> if (PageAnon(new))
>> - page_add_anon_rmap(new, vma, addr);
>> + page_add_anon_rmap(new, vma, addr, pcont);
>> else
>> - page_add_file_rmap(new);
>> + page_add_file_rmap(new, pcont);
>>
>> /* No need to invalidate - it was non-present before */
>> update_mmu_cache(vma, addr, pte);
>> lazy_mmu_prot_update(pte);
>> + pte_unmap_unlock(ptep, ptl);
>> + return;
>>
>> out:
>> pte_unmap_unlock(ptep, ptl);
>> + container_rss_release(pcont);
>> }
>>
>> /*
>
> you missed out an include in mm/migrate.c
>
> cheers,

Thanks! :)

> C.
> Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
> ---
> mm/migrate.c | 1 +
> 1 file changed, 1 insertion(+)
>
> Index: 2.6.20/mm/migrate.c
> ============================================================ =======
> --- 2.6.20.orig/mm/migrate.c
> +++ 2.6.20/mm/migrate.c
> @@ -28,6 +28,7 @@
> #include <linux/mempolicy.h>
> #include <linux/vmalloc.h>
> #include <linux/security.h>
> +#include <linux/rss_container.h>
>
> #include "internal.h"
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
Re: [RFC][PATCH 4/7] RSS accounting hooks over the code [message #11170 is a reply to message #10892] Wed, 14 March 2007 15:37 Go to previous messageGo to next message
Cedric Le Goater is currently offline  Cedric Le Goater
Messages: 443
Registered: February 2006
Senior Member
> --- linux-2.6.20.orig/mm/migrate.c 2007-02-04 21:44:54.000000000 +0300
> +++ linux-2.6.20-0/mm/migrate.c 2007-03-06 13:33:28.000000000 +0300
> @@ -134,6 +134,7 @@ static void remove_migration_pte(struct
> pte_t *ptep, pte;
> spinlock_t *ptl;
> unsigned long addr = page_address_in_vma(new, vma);
> + struct page_container *pcont;
>
> if (addr == -EFAULT)
> return;
> @@ -157,6 +158,11 @@ static void remove_migration_pte(struct
> return;
> }
>
> + if (container_rss_prepare(new, vma, &pcont)) {
> + pte_unmap(ptep);
> + return;
> + }
> +
> ptl = pte_lockptr(mm, pmd);
> spin_lock(ptl);
> pte = *ptep;
> @@ -175,16 +181,19 @@ static void remove_migration_pte(struct
> set_pte_at(mm, addr, ptep, pte);
>
> if (PageAnon(new))
> - page_add_anon_rmap(new, vma, addr);
> + page_add_anon_rmap(new, vma, addr, pcont);
> else
> - page_add_file_rmap(new);
> + page_add_file_rmap(new, pcont);
>
> /* No need to invalidate - it was non-present before */
> update_mmu_cache(vma, addr, pte);
> lazy_mmu_prot_update(pte);
> + pte_unmap_unlock(ptep, ptl);
> + return;
>
> out:
> pte_unmap_unlock(ptep, ptl);
> + container_rss_release(pcont);
> }
>
> /*

you missed out an include in mm/migrate.c

cheers,

C.
Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
---
mm/migrate.c | 1 +
1 file changed, 1 insertion(+)

Index: 2.6.20/mm/migrate.c
============================================================ =======
--- 2.6.20.orig/mm/migrate.c
+++ 2.6.20/mm/migrate.c
@@ -28,6 +28,7 @@
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
+#include <linux/rss_container.h>

#include "internal.h"
Re: [RFC][PATCH 1/7] Resource counters [message #17647 is a reply to message #10913] Fri, 09 March 2007 16:37 Go to previous messageGo to next message
Herbert Poetzl is currently offline  Herbert Poetzl
Messages: 239
Registered: February 2006
Senior Member
On Wed, Mar 07, 2007 at 10:19:05AM +0300, Pavel Emelianov wrote:
> Balbir Singh wrote:
> > Pavel Emelianov wrote:
> >> Introduce generic structures and routines for
> >> resource accounting.
> >>
> >> Each resource accounting container is supposed to
> >> aggregate it, container_subsystem_state and its
> >> resource-specific members within.
> >>
> >>
> >> ------------------------------------------------------------------------
> >>
> >> diff -upr linux-2.6.20.orig/include/linux/res_counter.h
> >> linux-2.6.20-0/include/linux/res_counter.h
> >> --- linux-2.6.20.orig/include/linux/res_counter.h    2007-03-06
> >> 13:39:17.000000000 +0300
> >> +++ linux-2.6.20-0/include/linux/res_counter.h    2007-03-06
> >> 13:33:28.000000000 +0300
> >> @@ -0,0 +1,83 @@
> >> +#ifndef __RES_COUNTER_H__
> >> +#define __RES_COUNTER_H__
> >> +/*
> >> + * resource counters
> >> + *
> >> + * Copyright 2007 OpenVZ SWsoft Inc
> >> + *
> >> + * Author: Pavel Emelianov <xemul@openvz.org>
> >> + *
> >> + */
> >> +
> >> +#include <linux/container.h>
> >> +
> >> +struct res_counter {
> >> +    unsigned long usage;
> >> +    unsigned long limit;
> >> +    unsigned long failcnt;
> >> +    spinlock_t lock;
> >> +};
> >> +
> >> +enum {
> >> +    RES_USAGE,
> >> +    RES_LIMIT,
> >> +    RES_FAILCNT,
> >> +};
> >> +
> >> +ssize_t res_counter_read(struct res_counter *cnt, int member,
> >> +        const char __user *buf, size_t nbytes, loff_t *pos);
> >> +ssize_t res_counter_write(struct res_counter *cnt, int member,
> >> +        const char __user *buf, size_t nbytes, loff_t *pos);
> >> +
> >> +static inline void res_counter_init(struct res_counter *cnt)
> >> +{
> >> +    spin_lock_init(&cnt->lock);
> >> +    cnt->limit = (unsigned long)LONG_MAX;
> >> +}
> >> +
> > 
> > Is there any way to indicate that there are no limits on this container.
> 
> Yes - LONG_MAX is essentially a "no limit" value as no
> container will ever have such many files :)

-1 or ~0 is a viable choice for userspace to
communicate 'infinite' or 'unlimited'

> > LONG_MAX is quite huge, but still when the administrator wants to
> > configure a container to *un-limited usage*, it becomes hard for
> > the administrator.
> > 
> >> +static inline int res_counter_charge_locked(struct res_counter *cnt,
> >> +        unsigned long val)
> >> +{
> >> +    if (cnt->usage <= cnt->limit - val) {
> >> +        cnt->usage += val;
> >> +        return 0;
> >> +    }
> >> +
> >> +    cnt->failcnt++;
> >> +    return -ENOMEM;
> >> +}
> >> +
> >> +static inline int res_counter_charge(struct res_counter *cnt,
> >> +        unsigned long val)
> >> +{
> >> +    int ret;
> >> +    unsigned long flags;
> >> +
> >> +    spin_lock_irqsave(&cnt->lock, flags);
> >> +    ret = res_counter_charge_locked(cnt, val);
> >> +    spin_unlock_irqrestore(&cnt->lock, flags);
> >> +    return ret;
> >> +}
> >> +
> > 
> > Will atomic counters help here.
> 
> I'm afraid no. We have to atomically check for limit and alter
> one of usage or failcnt depending on the checking result. Making
> this with atomic_xxx ops will require at least two ops.

Linux-VServer does the accounting with atomic counters,
so that works quite fine, just do the checks at the
beginning of whatever resource allocation and the
accounting once the resource is acquired ...

> If we'll remove failcnt this would look like
>    while (atomic_cmpxchg(...))
> which is also not that good.
> 
> Moreover - in RSS accounting patches I perform page list
> manipulations under this lock, so this also saves one atomic op.

it still hasn't been shown that this kind of RSS limit
doesn't add big time overhead to normal operations
(inside and outside of such a resource container)

note that the 'usual' memory accounting is much more
lightweight and serves similar purposes ...

best,
Herbert

> >> +static inline void res_counter_uncharge_locked(struct res_counter *cnt,
> >> +        unsigned long val)
> >> +{
> >> +    if (unlikely(cnt->usage < val)) {
> >> +        WARN_ON(1);
> >> +        val = cnt->usage;
> >> +    }
> >> +
> >> +    cnt->usage -= val;
> >> +}
> >> +
> >> +static inline void res_counter_uncharge(struct res_counter *cnt,
> >> +        unsigned long val)
> >> +{
> >> +    unsigned long flags;
> >> +
> >> +    spin_lock_irqsave(&cnt->lock, flags);
> >> +    res_counter_uncharge_locked(cnt, val);
> >> +    spin_unlock_irqrestore(&cnt->lock, flags);
> >> +}
> >> +
> >> +#endif
> >> diff -upr linux-2.6.20.orig/init/Kconfig linux-2.6.20-0/init/Kconfig
> >> --- linux-2.6.20.orig/init/Kconfig    2007-03-06 13:33:28.000000000 +0300
> >> +++ linux-2.6.20-0/init/Kconfig    2007-03-06 13:33:28.000000000 +0300
> >> @@ -265,6 +265,10 @@ config CPUSETS
> >>
> >>        Say N if unsure.
> >>
> >> +config RESOURCE_COUNTERS
> >> +    bool
> >> +    select CONTAINERS
> >> +
> >>  config SYSFS_DEPRECATED
> >>      bool "Create deprecated sysfs files"
> >>      default y
> >> diff -upr linux-2.6.20.orig/kernel/Makefile
> >> linux-2.6.20-0/kernel/Makefile
> >> --- linux-2.6.20.orig/kernel/Makefile    2007-03-06 13:33:28.000000000
> >> +0300
> >> +++ linux-2.6.20-0/kernel/Makefile    2007-03-06 13:33:28.000000000 +0300
> >> @@ -51,6 +51,7 @@ obj-$(CONFIG_RELAY) += relay.o
> >>  obj-$(CONFIG_UTS_NS) += utsname.o
> >>  obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
> >>  obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
> >> +obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
> >>
> >>  ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
> >>  # According to Alan Modra <alan@linuxcare.com.au>, the
> >> -fno-omit-frame-pointer is
> >> diff -upr linux-2.6.20.orig/kernel/res_counter.c
> >> linux-2.6.20-0/kernel/res_counter.c
> >> --- linux-2.6.20.orig/kernel/res_counter.c    2007-03-06
> >> 13:39:17.000000000 +0300
> >> +++ linux-2.6.20-0/kernel/res_counter.c    2007-03-06
> >> 13:33:28.000000000 +0300
> >> @@ -0,0 +1,72 @@
> >> +/*
> >> + * resource containers
> >> + *
> >> + * Copyright 2007 OpenVZ SWsoft Inc
> >> + *
> >> + * Author: Pavel Emelianov <xemul@openvz.org>
> >> + *
> >> + */
> >> +
> >> +#include <linux/parser.h>
> >> +#include <linux/fs.h>
> >> +#include <linux/res_counter.h>
> >> +#include <asm/uaccess.h>
> >> +
> >> +static inline unsigned long *res_counter_member(struct res_counter
> >> *cnt, int member)
> >> +{
> >> +    switch (member) {
> >> +    case RES_USAGE:
> >> +        return &cnt->usage;
> >> +    case RES_LIMIT:
> >> +        return &cnt->limit;
> >> +    case RES_FAILCNT:
> >> +        return &cnt->failcnt;
> >> +    };
> >> +
> >> +    BUG();
> >> +    return NULL;
> >> +}
> >> +
> >> +ssize_t res_counter_read(struct res_counter *cnt, int member,
> >> +        const char __user *userbuf, size_t nbytes, loff_t *pos)
> >> +{
> >> +    unsigned long *val;
> >> +    char buf[64], *s;
> >> +
> >> +    s = buf;
> >> +    val = res_counter_member(cnt, member);
> >> +    s += sprintf(s, "%lu\n", *val);
> >> +    return simple_read_from_buffer((void __user *)userbuf, nbytes,
> >> +            pos, buf, s - buf);
> >> +}
> >> +
> >> +ssize_t res_counter_write(struct res_counter *cnt, int member,
> >> +        const char __user *userbuf, size_t nbytes, loff_t *pos)
> >> +{
> >> +    int ret;
> >> +    char *buf, *end;
> >> +    unsigned long tmp, *val;
> >> +
> >> +    buf = kmalloc(nbytes + 1, GFP_KERNEL);
> >> +    ret = -ENOMEM;
> >> +    if (buf == NULL)
> >> +        goto out;
> >> +
> >> +    buf[nbytes] = 0;
> >> +    ret = -EFAULT;
> >> +    if (copy_from_user(buf, userbuf, nbytes))
> >> +        goto out_free;
> >> +
> >> +    ret = -EINVAL;
> >> +    tmp = simple_strtoul(buf, &end, 10);
> >> +    if (*end != '\0')
> >> +        goto out_free;
> >> +
> >> +    val = res_counter_member(cnt, member);
> >> +    *val = tmp;
> >> +    ret = nbytes;
> >> +out_free:
> >> +    kfree(buf);
> >> +out:
> >> +    return ret;
> >> +}
> >>
> > 
> &g
...

Re: [RFC][PATCH 2/7] RSS controller core [message #17648 is a reply to message #10902] Fri, 09 March 2007 16:48 Go to previous messageGo to next message
Herbert Poetzl is currently offline  Herbert Poetzl
Messages: 239
Registered: February 2006
Senior Member
On Tue, Mar 06, 2007 at 02:00:36PM -0800, Andrew Morton wrote:
> On Tue, 06 Mar 2007 17:55:29 +0300
> Pavel Emelianov <xemul@sw.ru> wrote:
> 
> > +struct rss_container {
> > +	struct res_counter res;
> > +	struct list_head page_list;
> > +	struct container_subsys_state css;
> > +};
> > +
> > +struct page_container {
> > +	struct page *page;
> > +	struct rss_container *cnt;
> > +	struct list_head list;
> > +};
> 
> ah. This looks good. I'll find a hunk of time to go through this work
> and through Paul's patches. It'd be good to get both patchsets lined
> up in -mm within a couple of weeks. But..

doesn't look so good for me, mainly becaus of the 
additional per page data and per page processing

on 4GB memory, with 100 guests, 50% shared for each
guest, this basically means ~1mio pages, 500k shared
and 1500k x sizeof(page_container) entries, which
roughly boils down to ~25MB of wasted memory ...

increase the amount of shared pages and it starts
getting worse, but maybe I'm missing something here

> We need to decide whether we want to do per-container memory
> limitation via these data structures, or whether we do it via a
> physical scan of some software zone, possibly based on Mel's patches.

why not do simple page accounting (as done currently
in Linux) and use that for the limits, without
keeping the reference from container to page?

best,
Herbert

> _______________________________________________
> Containers mailing list
> Containers@lists.osdl.org
> https://lists.osdl.org/mailman/listinfo/containers
_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers
Re: [RFC][PATCH 1/7] Resource counters [message #17700 is a reply to message #17647] Sun, 11 March 2007 09:01 Go to previous messageGo to next message
xemul is currently offline  xemul
Messages: 248
Registered: November 2005
Senior Member
Herbert Poetzl wrote:
> On Wed, Mar 07, 2007 at 10:19:05AM +0300, Pavel Emelianov wrote:
>> Balbir Singh wrote:
>>> Pavel Emelianov wrote:
>>>> Introduce generic structures and routines for
>>>> resource accounting.
>>>>
>>>> Each resource accounting container is supposed to
>>>> aggregate it, container_subsystem_state and its
>>>> resource-specific members within.
>>>>
>>>>
>>>> ------------------------------------------------------------------------
>>>>
>>>> diff -upr linux-2.6.20.orig/include/linux/res_counter.h
>>>> linux-2.6.20-0/include/linux/res_counter.h
>>>> --- linux-2.6.20.orig/include/linux/res_counter.h    2007-03-06
>>>> 13:39:17.000000000 +0300
>>>> +++ linux-2.6.20-0/include/linux/res_counter.h    2007-03-06
>>>> 13:33:28.000000000 +0300
>>>> @@ -0,0 +1,83 @@
>>>> +#ifndef __RES_COUNTER_H__
>>>> +#define __RES_COUNTER_H__
>>>> +/*
>>>> + * resource counters
>>>> + *
>>>> + * Copyright 2007 OpenVZ SWsoft Inc
>>>> + *
>>>> + * Author: Pavel Emelianov <xemul@openvz.org>
>>>> + *
>>>> + */
>>>> +
>>>> +#include <linux/container.h>
>>>> +
>>>> +struct res_counter {
>>>> +    unsigned long usage;
>>>> +    unsigned long limit;
>>>> +    unsigned long failcnt;
>>>> +    spinlock_t lock;
>>>> +};
>>>> +
>>>> +enum {
>>>> +    RES_USAGE,
>>>> +    RES_LIMIT,
>>>> +    RES_FAILCNT,
>>>> +};
>>>> +
>>>> +ssize_t res_counter_read(struct res_counter *cnt, int member,
>>>> +        const char __user *buf, size_t nbytes, loff_t *pos);
>>>> +ssize_t res_counter_write(struct res_counter *cnt, int member,
>>>> +        const char __user *buf, size_t nbytes, loff_t *pos);
>>>> +
>>>> +static inline void res_counter_init(struct res_counter *cnt)
>>>> +{
>>>> +    spin_lock_init(&cnt->lock);
>>>> +    cnt->limit = (unsigned long)LONG_MAX;
>>>> +}
>>>> +
>>> Is there any way to indicate that there are no limits on this container.
>> Yes - LONG_MAX is essentially a "no limit" value as no
>> container will ever have such many files :)
> 
> -1 or ~0 is a viable choice for userspace to
> communicate 'infinite' or 'unlimited'

OK, I'll make ULONG_MAX :)

>>> LONG_MAX is quite huge, but still when the administrator wants to
>>> configure a container to *un-limited usage*, it becomes hard for
>>> the administrator.
>>>
>>>> +static inline int res_counter_charge_locked(struct res_counter *cnt,
>>>> +        unsigned long val)
>>>> +{
>>>> +    if (cnt->usage <= cnt->limit - val) {
>>>> +        cnt->usage += val;
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    cnt->failcnt++;
>>>> +    return -ENOMEM;
>>>> +}
>>>> +
>>>> +static inline int res_counter_charge(struct res_counter *cnt,
>>>> +        unsigned long val)
>>>> +{
>>>> +    int ret;
>>>> +    unsigned long flags;
>>>> +
>>>> +    spin_lock_irqsave(&cnt->lock, flags);
>>>> +    ret = res_counter_charge_locked(cnt, val);
>>>> +    spin_unlock_irqrestore(&cnt->lock, flags);
>>>> +    return ret;
>>>> +}
>>>> +
>>> Will atomic counters help here.
>> I'm afraid no. We have to atomically check for limit and alter
>> one of usage or failcnt depending on the checking result. Making
>> this with atomic_xxx ops will require at least two ops.
> 
> Linux-VServer does the accounting with atomic counters,
> so that works quite fine, just do the checks at the
> beginning of whatever resource allocation and the
> accounting once the resource is acquired ...

This works quite fine on non-preempted kernels.
>From the time you checked for resource till you really
account it kernel may preempt and let another process
pass through vx_anything_avail() check.

>> If we'll remove failcnt this would look like
>>    while (atomic_cmpxchg(...))
>> which is also not that good.
>>
>> Moreover - in RSS accounting patches I perform page list
>> manipulations under this lock, so this also saves one atomic op.
> 
> it still hasn't been shown that this kind of RSS limit
> doesn't add big time overhead to normal operations
> (inside and outside of such a resource container)
> 
> note that the 'usual' memory accounting is much more
> lightweight and serves similar purposes ...

It OOM-kills current int case of limit hit instead of
reclaiming pages or killing *memory eater* to free memory.

> best,
> Herbert
> 
>>>> +static inline void res_counter_uncharge_locked(struct res_counter *cnt,
>>>> +        unsigned long val)
>>>> +{
>>>> +    if (unlikely(cnt->usage < val)) {
>>>> +        WARN_ON(1);
>>>> +        val = cnt->usage;
>>>> +    }
>>>> +
>>>> +    cnt->usage -= val;
>>>> +}
>>>> +
>>>> +static inline void res_counter_uncharge(struct res_counter *cnt,
>>>> +        unsigned long val)
>>>> +{
>>>> +    unsigned long flags;
>>>> +
>>>> +    spin_lock_irqsave(&cnt->lock, flags);
>>>> +    res_counter_uncharge_locked(cnt, val);
>>>> +    spin_unlock_irqrestore(&cnt->lock, flags);
>>>> +}
>>>> +
>>>> +#endif
>>>> diff -upr linux-2.6.20.orig/init/Kconfig linux-2.6.20-0/init/Kconfig
>>>> --- linux-2.6.20.orig/init/Kconfig    2007-03-06 13:33:28.000000000 +0300
>>>> +++ linux-2.6.20-0/init/Kconfig    2007-03-06 13:33:28.000000000 +0300
>>>> @@ -265,6 +265,10 @@ config CPUSETS
>>>>
>>>>        Say N if unsure.
>>>>
>>>> +config RESOURCE_COUNTERS
>>>> +    bool
>>>> +    select CONTAINERS
>>>> +
>>>>  config SYSFS_DEPRECATED
>>>>      bool "Create deprecated sysfs files"
>>>>      default y
>>>> diff -upr linux-2.6.20.orig/kernel/Makefile
>>>> linux-2.6.20-0/kernel/Makefile
>>>> --- linux-2.6.20.orig/kernel/Makefile    2007-03-06 13:33:28.000000000
>>>> +0300
>>>> +++ linux-2.6.20-0/kernel/Makefile    2007-03-06 13:33:28.000000000 +0300
>>>> @@ -51,6 +51,7 @@ obj-$(CONFIG_RELAY) += relay.o
>>>>  obj-$(CONFIG_UTS_NS) += utsname.o
>>>>  obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
>>>>  obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
>>>> +obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
>>>>
>>>>  ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
>>>>  # According to Alan Modra <alan@linuxcare.com.au>, the
>>>> -fno-omit-frame-pointer is
>>>> diff -upr linux-2.6.20.orig/kernel/res_counter.c
>>>> linux-2.6.20-0/kernel/res_counter.c
>>>> --- linux-2.6.20.orig/kernel/res_counter.c    2007-03-06
>>>> 13:39:17.000000000 +0300
>>>> +++ linux-2.6.20-0/kernel/res_counter.c    2007-03-06
>>>> 13:33:28.000000000 +0300
>>>> @@ -0,0 +1,72 @@
>>>> +/*
>>>> + * resource containers
>>>> + *
>>>> + * Copyright 2007 OpenVZ SWsoft Inc
>>>> + *
>>>> + * Author: Pavel Emelianov <xemul@openvz.org>
>>>> + *
>>>> + */
>>>> +
>>>> +#include <linux/parser.h>
>>>> +#include <linux/fs.h>
>>>> +#include <linux/res_counter.h>
>>>> +#include <asm/uaccess.h>
>>>> +
>>>> +static inline unsigned long *res_counter_member(struct res_counter
>>>> *cnt, int member)
>>>> +{
>>>> +    switch (member) {
>>>> +    case RES_USAGE:
>>>> +        return &cnt->usage;
>>>> +    case RES_LIMIT:
>>>> +        return &cnt->limit;
>>>> +    case RES_FAILCNT:
>>>> +        return &cnt->failcnt;
>>>> +    };
>>>> +
>>>> +    BUG();
>>>> +    return NULL;
>>>> +}
>>>> +
>>>> +ssize_t res_counter_read(struct res_counter *cnt, int member,
>>>> +        const char __user *userbuf, size_t nbytes, loff_t *pos)
>>>> +{
>>>> +    unsigned long *val;
>>>> +    char buf[64], *s;
>>>> +
>>>> +    s = buf;
>>>> +    val = res_counter_member(cnt, member);
>>>> +    s += sprintf(s, "%lu\n", *val);
>>>> +    return simple_read_from_buffer((void __user *)userbuf, nbytes,
>>>> +            pos, buf, s - buf);
>>>> +}
>>>> +
>>>> +ssize_t res_counter_write(struct res_counter *cnt, int member,
>>>&g
...

Re: [RFC][PATCH 2/7] RSS controller core [message #17702 is a reply to message #17648] Sun, 11 March 2007 09:08 Go to previous messageGo to previous message
xemul is currently offline  xemul
Messages: 248
Registered: November 2005
Senior Member
Herbert Poetzl wrote:
> On Tue, Mar 06, 2007 at 02:00:36PM -0800, Andrew Morton wrote:
>> On Tue, 06 Mar 2007 17:55:29 +0300
>> Pavel Emelianov <xemul@sw.ru> wrote:
>>
>>> +struct rss_container {
>>> +	struct res_counter res;
>>> +	struct list_head page_list;
>>> +	struct container_subsys_state css;
>>> +};
>>> +
>>> +struct page_container {
>>> +	struct page *page;
>>> +	struct rss_container *cnt;
>>> +	struct list_head list;
>>> +};
>> ah. This looks good. I'll find a hunk of time to go through this work
>> and through Paul's patches. It'd be good to get both patchsets lined
>> up in -mm within a couple of weeks. But..
> 
> doesn't look so good for me, mainly becaus of the 
> additional per page data and per page processing
> 
> on 4GB memory, with 100 guests, 50% shared for each
> guest, this basically means ~1mio pages, 500k shared
> and 1500k x sizeof(page_container) entries, which
> roughly boils down to ~25MB of wasted memory ...
> 
> increase the amount of shared pages and it starts
> getting worse, but maybe I'm missing something here

You are. Each page has only one page_container associated
with it despite the number of containers it is shared
between.

>> We need to decide whether we want to do per-container memory
>> limitation via these data structures, or whether we do it via a
>> physical scan of some software zone, possibly based on Mel's patches.
> 
> why not do simple page accounting (as done currently
> in Linux) and use that for the limits, without
> keeping the reference from container to page?

As I've already answered in my previous letter simple
limiting w/o per-container reclamation and per-container
oom killer isn't a good memory management. It doesn't allow
to handle resource shortage gracefully.

This patchset provides more grace way to handle this, but
full memory management includes accounting of VMA-length
as well (returning ENOMEM from system call) but we've decided
to start with RSS.

> best,
> Herbert
> 
>> _______________________________________________
>> Containers mailing list
>> Containers@lists.osdl.org
>> https://lists.osdl.org/mailman/listinfo/containers
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

_______________________________________________
Containers mailing list
Containers@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers
Previous Topic: Re: [ckrm-tech] [PATCH 7/7] containers (V7): Container interface to nsproxy subsystem
Next Topic: Linux-VServer example results for sharing vs. separate mappings ...
Goto Forum:
  


Current Time: Sun Oct 26 14:11:53 GMT 2025

Total time taken to generate the page: 0.10498 seconds