| Home » Mailing lists » Devel » [RFC][PATCH] UBC: user resource beancounters Goto Forum:
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH 2/7] UBC: core (structures, API) [message #5396 is a reply to message #5196] | Fri, 18 August 2006 01:58   |  
			| 
				
				
					|  Matt Helsley Messages: 86
 Registered: August 2006
 | Member |  |  |  
	| On Wed, 2006-08-16 at 19:37 +0400, Kirill Korotaev wrote: > Core functionality and interfaces of UBC:
 > find/create beancounter, initialization,
 > charge/uncharge of resource, core objects' declarations.
 >
 > Basic structures:
 >   ubparm           - resource description
 >   user_beancounter - set of resources, id, lock
 >
 > Signed-Off-By: Pavel Emelianov <xemul@sw.ru>
 > Signed-Off-By: Kirill Korotaev <dev@sw.ru>
 >
 > ---
 >  include/ub/beancounter.h |  157 ++++++++++++++++++
 >  init/main.c              |    4
 >  kernel/Makefile          |    1
 >  kernel/ub/Makefile       |    7
 >  kernel/ub/beancounter.c  |  398 +++++++++++++++++++++++++++++++++++++++++++++++
 >  5 files changed, 567 insertions(+)
 >
 > --- /dev/null	2006-07-18 14:52:43.075228448 +0400
 > +++ ./include/ub/beancounter.h	2006-08-10 14:58:27.000000000 +0400
 > @@ -0,0 +1,157 @@
 > +/*
 > + *  include/ub/beancounter.h
 > + *
 > + *  Copyright (C) 2006 OpenVZ. SWsoft Inc
 > + *
 > + */
 > +
 > +#ifndef _LINUX_BEANCOUNTER_H
 > +#define _LINUX_BEANCOUNTER_H
 > +
 > +/*
 > + *	Resource list.
 > + */
 > +
 > +#define UB_RESOURCES	0
 > +
 > +struct ubparm {
 > +	/*
 > +	 * A barrier over which resource allocations are failed gracefully.
 > +	 * e.g. if the amount of consumed memory is over the barrier further
 > +	 * sbrk() or mmap() calls fail, the existing processes are not killed.
 > +	 */
 > +	unsigned long	barrier;
 > +	/* hard resource limit */
 > +	unsigned long	limit;
 > +	/* consumed resources */
 > +	unsigned long	held;
 > +	/* maximum amount of consumed resources through the last period */
 > +	unsigned long	maxheld;
 > +	/* minimum amount of consumed resources through the last period */
 > +	unsigned long	minheld;
 > +	/* count of failed charges */
 > +	unsigned long	failcnt;
 > +};
 > +
 > +/*
 > + * Kernel internal part.
 > + */
 > +
 > +#ifdef __KERNEL__
 > +
 > +#include <linux/config.h>
 > +#include <linux/spinlock.h>
 > +#include <linux/list.h>
 > +#include <asm/atomic.h>
 > +
 > +/*
 > + * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form.
 > + */
 > +#define UB_MAXVALUE	( (1UL << (sizeof(unsigned long)*8-1)) - 1)
 > +
 > +
 > +/*
 > + *	Resource management structures
 > + * Serialization issues:
 > + *   beancounter list management is protected via ub_hash_lock
 > + *   task pointers are set only for current task and only once
 > + *   refcount is managed atomically
 > + *   value and limit comparison and change are protected by per-ub spinlock
 > + */
 > +
 > +struct user_beancounter
 > +{
 > +	atomic_t		ub_refcount;
 > +	spinlock_t		ub_lock;
 > +	uid_t			ub_uid;
 > +	struct hlist_node	hash;
 > +
 > +	struct user_beancounter	*parent;
 > +	void			*private_data;
 > +
 > +	/* resources statistics and settings */
 > +	struct ubparm		ub_parms[UB_RESOURCES];
 > +};
 > +
 > +enum severity { UB_BARRIER, UB_LIMIT, UB_FORCE };
 > +
 > +/* Flags passed to beancounter_findcreate() */
 > +#define UB_LOOKUP_SUB		0x01 /* Lookup subbeancounter */
 > +#define UB_ALLOC		0x02 /* May allocate new one */
 > +#define UB_ALLOC_ATOMIC		0x04 /* Allocate with GFP_ATOMIC */
 > +
 > +#define UB_HASH_SIZE		256
 > +
 > +#ifdef CONFIG_USER_RESOURCE
 > +extern struct hlist_head ub_hash[];
 > +extern spinlock_t ub_hash_lock;
 > +
 > +static inline void ub_adjust_held_minmax(struct user_beancounter *ub,
 > +		int resource)
 > +{
 > +	if (ub->ub_parms[resource].maxheld < ub->ub_parms[resource].held)
 > +		ub->ub_parms[resource].maxheld = ub->ub_parms[resource].held;
 > +	if (ub->ub_parms[resource].minheld > ub->ub_parms[resource].held)
 > +		ub->ub_parms[resource].minheld = ub->ub_parms[resource].held;
 > +}
 > +
 > +void ub_print_resource_warning(struct user_beancounter *ub, int res,
 > +		char *str, unsigned long val, unsigned long held);
 > +void ub_print_uid(struct user_beancounter *ub, char *str, int size);
 > +
 > +int  __charge_beancounter_locked(struct user_beancounter *ub,
 > +		int resource, unsigned long val, enum severity strict);
 > +void charge_beancounter_notop(struct user_beancounter *ub,
 > +		int resource, unsigned long val);
 > +int  charge_beancounter(struct user_beancounter *ub,
 > +		int resource, unsigned long val, enum severity strict);
 > +
 > +void __uncharge_beancounter_locked(struct user_beancounter *ub,
 > +		int resource, unsigned long val);
 > +void uncharge_beancounter_notop(struct user_beancounter *ub,
 > +		int resource, unsigned long val);
 > +void uncharge_beancounter(struct user_beancounter *ub,
 > +		int resource, unsigned long val);
 > +
 > +struct user_beancounter *beancounter_findcreate(uid_t uid,
 > +		struct user_beancounter *parent, int flags);
 > +
 > +static inline struct user_beancounter *get_beancounter(
 > +		struct user_beancounter *ub)
 > +{
 > +	atomic_inc(&ub->ub_refcount);
 > +	return ub;
 > +}
 > +
 > +void __put_beancounter(struct user_beancounter *ub);
 > +static inline void put_beancounter(struct user_beancounter *ub)
 > +{
 > +	__put_beancounter(ub);
 > +}
 > +
 > +void ub_init_early(void);
 > +void ub_init_late(void);
 > +void ub_init_proc(void);
 > +
 > +extern struct user_beancounter ub0;
 > +extern const char *ub_rnames[];
 > +
 > +#else /* CONFIG_USER_RESOURCE */
 > +
 > +#define beancounter_findcreate(id, p, f)		(NULL)
 > +#define get_beancounter(ub)				(NULL)
 > +#define put_beancounter(ub)				do { } while (0)
 > +#define __charge_beancounter_locked(ub, r, v, s)	(0)
 > +#define charge_beancounter(ub, r, v, s)			(0)
 > +#define charge_beancounter_notop(ub, r, v)		do { } while (0)
 > +#define __uncharge_beancounter_locked(ub, r, v)		do { } while (0)
 > +#define uncharge_beancounter(ub, r, v)			do { } while (0)
 > +#define uncharge_beancounter_notop(ub, r, v)		do { } while (0)
 > +#define ub_init_early()					do { } while (0)
 > +#define ub_init_late()					do { } while (0)
 > +#define ub_init_proc()					do { } while (0)
 > +
 > +#endif /* CONFIG_USER_RESOURCE */
 > +#endif /* __KERNEL__ */
 > +
 > +#endif /* _LINUX_BEANCOUNTER_H */
 > --- ./init/main.c.ubcore	2006-08-10 14:55:47.000000000 +0400
 > +++ ./init/main.c	2006-08-10 14:57:01.000000000 +0400
 > @@ -52,6 +52,8 @@
 >  #include <linux/debug_locks.h>
 >  #include <linux/lockdep.h>
 >
 > +#include <ub/beancounter.h>
 > +
 >  #include <asm/io.h>
 >  #include <asm/bugs.h>
 >  #include <asm/setup.h>
 > @@ -470,6 +472,7 @@ asmlinkage void __init start_kernel(void
 >  	early_boot_irqs_off();
 >  	early_init_irq_lock_class();
 >
 > +	ub_init_early();
 >  /*
 >   * Interrupts are still disabled. Do necessary setups, then
 >   * enable them
 > @@ -563,6 +566,7 @@ asmlinkage void __init start_kernel(void
 >  #endif
 >  	fork_init(num_physpages);
 >  	proc_caches_init();
 > +	ub_init_late();
 >  	buffer_init();
 >  	unnamed_dev_init();
 >  	key_init();
 > --- ./kernel/Makefile.ubcore	2006-08-10 14:55:47.000000000 +0400
 > +++ ./kernel/Makefile	2006-08-10 14:57:01.000000000 +0400
 > @@ -12,6 +12,7 @@ obj-y     = sched.o fork.o exec_domain.o
 >
 >  obj-$(CONFIG_STACKTRACE) += stacktrace.o
 >  obj-y += time/
 > +obj-y += ub/
 >  obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 >  obj-$(CONFIG_LOCKDEP) += lockdep.o
 >  ifeq ($(CONFIG_PROC_FS),y)
 > --- /dev/null	2006-07-18 14:52:43.075228448 +0400
 > +++ ./kernel/ub/Makefile	2006-08-10 14:57:01.000000000 +0400
 > @@ -0,0 +1,7 @@
 > +#
 > +# User resources part (UBC)
 > +#
 > +# Copyright (C) 2006 OpenVZ. SWsoft Inc
 > +#
 > +
 > +obj-$(CONFIG_USER_RESOURCE) += beancounter.o
 > --- /dev/null	2006-07-18 14:52:43.075228448 +0400
 > +++ ./kernel/ub/beancounter.c	2006-08-10 15:09:34.000000000 +0400
 > @@ -0,0 +1,398 @@
 > +/*
 > + *  kernel/ub/beancounter.c
 > + *
 > + *  Copyright (C) 2006 OpenVZ. SWsoft Inc
 > + *  Original code by (C) 1998      Alan Cox
 > + *                       1998-2000 Andrey Savochkin <saw@saw.sw.com.sg>
 > + */
 > +
 > +#include <linux/slab.h>
 > +#include <linux/module.h>
 > +
 > +#include <ub/beancounter.h>
 > +
 > +static kmem_cache_t *ub_cachep;
 > +static struct user_beancounter default_beancounter;
 > +static struct user_beancounter default_subbeancounter;
 > +
 > +static void init_beancounter_struct(struct user_beancounter *ub, uid_t id);
 > +
 > +struct user_beancounter ub0;
 > +
 > +const char *ub_rnames[] = {
 > +};
 > +
 > +#define ub_hash_fun(x) ((((x) >> 8) ^ (x)) & (UB_HASH_SIZE - 1))
 > +#define ub_subhash_fun(p, id) ub_hash_fun((p)->ub_uid + (id) * 17)
 > +
 > +struct hlist_head ub_hash[UB_HASH_SIZE];
 > +spinlock_t ub_hash_lock;
 > +
 > +EXPORT_SYMBOL(ub_hash);
 > +EXPORT_SYMBOL(ub_hash_lock);
 > +
 > +/*
 > + *	Per user resource beancounting. Resources are tied to their luid.
 
 You haven't explained what an luid is at this point in the patch series.
 Patch 0 says:
 
 diff-ubc-syscalls.patch:
 Patch adds system calls for UB management:
 1. sys_getluid    - get current UB id
 
 But I have no idea what that l is there for. Why not sys_get_ubid() for
 instance?
 
 > + *	The resource structure itself is tagged both to the process and
 > + *	the charging resources (a socket doesn't want to have to search for
 > + *	things at irq time for example). Reference counters keep things in
 > + *	hand.
 > + *
 > + *	The case where a user creates resource, kills all his processes and
 > + *	then starts new ones is correctly handled this way. The refcounters
 > + *	will mean the old entry is still around with resource tied to it.
 > + */
 > +
 
 So we create one beancounter object for every resource the user's tasks
 allocate? For instance, one per socket? Or does "resource structure"
 refer to something else?
 
 > +struct user_beancounter *beancounter_findcreate(uid_t uid,
 > +		struct user_beancounter *p, int mask)
 > +{
 > +	struct user_beancounter *new_ub, *ub, *tmpl_ub;
 > +	unsigned long flags;
 > +	struct hlist_head *slot;
 > +	struct hlist_node *pos;
 > +
 > +	if (mask & UB_LOOKUP_SUB) {
 > +		WARN_ON(p == NULL);
 > +		tmpl_ub = &default_subbeancounter;
 > +		slot = &ub_hash[ub_subhash_fun(p, uid)];
 > +	} else {
 > +		WARN_ON(p != NULL);
 > +		tmpl_ub = &default_beancounter;
 > +		slot = &ub_hash[ub_hash_fun(uid)];
 > +	}
 > +	new_ub = NULL;
 > +
 > +retry:
 > +	spin_lock_irqsave(&ub_hash_lock, flags);
 > +	hlist_for_each_entry (ub, pos, slot, hash)
 > +		if (ub->ub_uid == uid && ub->parent == p)
 > +			break;
 > +
 > +	if (pos != NULL) {
 > +		get_beancounter(ub);
 > +		spin_unlock_irqrestore(&ub_hash_lock, flags);
 > +
 > +		if (new_ub != NULL) {
 > +			put_beancounter(new_ub->parent);
 > +			kmem_cache_free(ub_cachep, new_ub);
 > +		}
 > +		return ub;
 > +	}
 > +
 > +	if (!(mask & UB_ALLOC))
 > +		goto out_unlock;
 > +
 > +	if (new_ub != NULL)
 > +		goto out_install;
 > +
 > +	if (mask & UB_ALLOC_ATOMIC) {
 
 This block..
 
 > +		new_ub = kmem_cache_alloc(ub_cachep, GFP_ATOMIC);
 > +		if (new_ub == NULL)
 > +			goto out_unlock;
 > +
 > +		memcpy(new_ub, tmpl_ub, sizeof(*new_ub));
 > +		init_beancounter_struct(new_ub, uid);
 > +		if (p)
 > +			new_ub->parent = get_beancounter(p);
 
 ending here is almost exactly the same as the block ..
 
 > +		goto out_install;
 > +	}
 > +
 > +	spin_unlock_irqrestore(&ub_hash_lock, flags);
 > +
 
 >From here..
 
 > +	new_ub = kmem_cache_alloc(ub_cachep, GFP_KERNEL);
 > +	if (new_ub == NULL)
 > +		goto out;
 > +
 > +	memcpy(new_ub, tmpl_ub, sizeof(*new_ub));
 > +	init_beancounter_struct(new_ub, uid);
 > +	if (p)
 > +		new_ub->parent = get_beancounter(p);
 
 to here. You could make a flag variable that holds GFP_ATOMIC or
 GFP_KERNEL based on mask & UB_ALLOC_ATOMIC and perhaps turn this block
 into a small helper.
 
 > +	goto retry;
 > +
 > +out_install:
 > +	hlist_add_head(&new_ub->hash, slot);
 > +out_unlock:
 > +	spin_unlock_irqrestore(&ub_hash_lock, flags);
 > +out:
 > +	return new_ub;
 > +}
 > +
 > +EXPORT_SYMBOL(beancounter_findcreate);
 > +
 > +void ub_print_uid(struct user_beancounter *ub, char *str, int size)
 > +{
 > +	if (ub->parent != NULL)
 > +		snprintf(str, size, "%u.%u", ub->parent->ub_uid, ub->ub_uid);
 > +	else
 > +		snprintf(str, size, "%u", ub->ub_uid);
 > +}
 > +
 > +EXPORT_SYMBOL(ub_print_uid);
 
 >From what I can see this patch doesn't really justify the need for the
 EXPORT_SYMBOL. Shouldn't that be done in the patch where it's needed
 outside of the kernel/ub code itself?
 
 > +void ub_print_resource_warning(struct user_beancounter *ub, int res,
 > +		char *str, unsigned long val, unsigned long held)
 > +{
 > +	char uid[64];
 > +
 > +	ub_print_uid(ub, uid, sizeof(uid));
 > +	printk(KERN_WARNING "UB %s %s warning: %s "
 > +			"(held %lu, fails %lu, val %lu)\n",
 > +			uid, ub_rnames[res], str,
 > +			(res < UB_RESOURCES ? ub->ub_parms[res].held : held),
 > +			(res < UB_RESOURCES ? ub->ub_parms[res].failcnt : 0),
 > +			val);
 > +}
 > +
 > +EXPORT_SYMBOL(ub_print_resource_warning);
 > +
 > +static inline void verify_held(struct user_beancounter *ub)
 > +{
 > +	int i;
 > +
 > +	for (i = 0; i < UB_RESOURCES; i++)
 > +		if (ub->ub_parms[i].held != 0)
 > +			ub_print_resource_warning(ub, i,
 > +					"resource is held on put", 0, 0);
 > +}
 > +
 > +void __put_beancounter(struct user_beancounter *ub)
 > +{
 > +	unsigned long flags;
 > +	struct user_beancounter *parent;
 > +
 > +again:
 > +	parent = ub->parent;
 > +	/* equevalent to atomic_dec_and_lock_irqsave() */
 
 nit: s/que/qui/
 
 > +	local_irq_save(flags);
 > +	if (likely(!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock))) {
 > +		if (unlikely(atomic_read(&ub->ub_refcount) < 0))
 > +			printk(KERN_ERR "UB: Bad ub refcount: ub=%p, "
 > +					"luid=%d, ref=%d\n",
 > +					ub, ub->ub_uid,
 > +					atomic_read(&ub->ub_refcount));
 
 This seems to be for debugging purposes only. If not, perhaps this
 printk ought to be rate limited?
 
 > +		local_irq_restore(flags);
 > +		return;
 > +	}
 > +
 > +	if (unlikely(ub == &ub0)) {
 > +		printk(KERN_ERR "Trying to put ub0\n");
 
 Same thing for this printk.
 
 > +		spin_unlock_irqrestore(&ub_hash_lock, flags);
 > +		return;
 > +	}
 > +
 > +	verify_held(ub);
 > +	hlist_del(&ub->hash);
 > +	spin_unlock_irqrestore(&ub_hash_lock, flags);
 > +
 > +	kmem_cache_free(ub_cachep, ub);
 > +
 > +	ub = parent;
 > +	if (ub != NULL)
 > +		goto again;
 
 Couldn't this be replaced by a do { } while (ub != NULL); loop?
 
 > +}
 > +
 > +EXPORT_SYMBOL(__put_beancounter);
 > +
 > +/*
 > + *	Generic resource charging stuff
 > + */
 > +
 > +int __charge_beancounter_locked(struct user_beancounter *ub,
 > +		int resource, unsigned long val, enum severity strict)
 > +{
 > +	/*
 > +	 * ub_value <= UB_MAXVALUE, value <= UB_MAXVALUE, and only one addition
 > +	 * at the moment is possible so an overflow is impossible.
 > +	 */
 > +	ub->ub_parms[resource].held += val;
 > +
 > +	switch (strict) {
 > +		case UB_BARRIER:
 > +			if (ub->ub_parms[resource].held >
 > +					ub->ub_parms[resource].barrier)
 > +				break;
 > +			/* fallthrough */
 > +		case UB_LIMIT:
 > +			if (ub->ub_parms[resource].held >
 > +					ub->ub_parms[resource].limit)
 > +				break;
 > +			/* fallthrough */
 > +		case UB_FORCE:
 > +			ub_adjust_held_minmax(ub, resource);
 > +			return 0;
 > +		default:
 > +			BUG();
 > +	}
 > +
 > +	ub->ub_parms[resource].failcnt++;
 > +	ub->ub_parms[resource].held -= val;
 > +	return -ENOMEM;
 > +}
 > +
 > +int charge_beancounter(struct user_beancounter *ub,
 > +		int resource, unsigned long val, enum severity strict)
 > +{
 > +	int retval;
 > +	struct user_beancounter *p, *q;
 > +	unsigned long flags;
 > +
 > +	retval = -EINVAL;
 > +	BUG_ON(val > UB_MAXVALUE);
 > +
 > +	local_irq_save(flags);
 
 <factor>
 
 > +	for (p = ub; p != NULL; p = p->parent) {
 
 Seems rather expensive to walk up the tree for every charge. Especially
 if the administrator wants a fine degree of resource control and makes a
 tall tree. This would be a problem especially when it comes to resources
 that require frequent and fast allocation.
 
 > +		spin_lock(&p->ub_lock);
 > +		retval = __charge_beancounter_locked(p, resource, val, strict);
 > +		spin_unlock(&p->ub_lock);
 > +		if (retval)
 > +			goto unroll;
 
 This can be factored by passing a flag that breaks the loop on an error:
 
 if (retval && do_break_err)
 return retval;
 
 > +	}
 
 </factor>
 
 > +out_restore:
 > +	local_irq_restore(flags);
 > +	return retval;
 > +
 
 <factor>
 
 > +unroll:
 > +	for (q = ub; q != p; q = q->parent) {
 > +		spin_lock(&q->ub_lock);
 > +		__uncharge_beancounter_locked(q, resource, val);
 > +		spin_unlock(&q->ub_lock);
 > +	}
 
 </factor>
 
 > +	goto out_restore;
 > +}
 > +
 > +EXPORT_SYMBOL(charge_beancounter);
 > +
 > +void charge_beancounter_notop(struct user_beancounter *ub,
 > +		int resource, unsigned long val)
 > +{
 > +	struct user_beancounter *p;
 > +	unsigned long flags;
 > +
 > +	local_irq_save(flags);
 
 <factor>
 
 > +	for (p = ub; p->parent != NULL; p = p->parent) {
 > +		spin_lock(&p->ub_lock);
 > +		__charge_beancounter_locked(p, resource, val, UB_FORCE);
 > +		spin_unlock(&p->ub_lock);
 > +	}
 
 <factor>
 
 > +	local_irq_restore(flags);
 
 Again, this could be factored with charge_beancounter using a helper
 function.
 
 > +}
 > +
 > +EXPORT_SYMBOL(charge_beancounter_notop);
 > +
 > +void __uncharge_beancounter_locked(struct user_beancounter *ub,
 > +		int resource, unsigned long val)
 > +{
 > +	if (unlikely(ub->ub_parms[resource].held < val)) {
 > +		ub_print_resource_warning(ub, resource,
 > +				"uncharging too much", val, 0);
 > +		val = ub->ub_parms[resource].held;
 > +	}
 > +	ub->ub_parms[resource].held -= val;
 > +	ub_adjust_held_minmax(ub, resource);
 > +}
 > +
 > +void uncharge_beancounter(struct user_beancounter *ub,
 > +		int resource, unsigned long val)
 > +{
 > +	unsigned long flags;
 > +	struct user_beancounter *p;
 > +
 > +	for (p = ub; p != NULL; p = p->parent) {
 > +		spin_lock_irqsave(&p->ub_lock, flags);
 > +		__uncharge_beancounter_locked(p, resource, val);
 > +		spin_unlock_irqrestore(&p->ub_lock, flags);
 > +	}
 > +}
 
 Looks like your unroll: label in charge_beancounter above.
 
 > +
 > +EXPORT_SYMBOL(uncharge_beancounter);
 > +
 > +void uncharge_beancounter_notop(struct user_beancounter *ub,
 > +		int resource, unsigned long val)
 > +{
 > +	struct user_beancounter *p;
 > +	unsigned long flags;
 > +
 > +	local_irq_save(flags);
 
 <factor>
 
 > +	for (p = ub; p->parent != NULL; p = p->parent) {
 > +		spin_lock(&p->ub_lock);
 > +		__uncharge_beancounter_locked(p, resource, val);
 > +		spin_unlock(&p->ub_lock);
 > +	}
 
 </factor>
 
 > +	local_irq_restore(flags);
 > +}
 > +
 > +EXPORT_SYMBOL(uncharge_beancounter_notop);
 > +
 > +/*
 > + *	Initialization
 > + *
 > + *	struct user_beancounter contains
 > + *	 - limits and other configuration settings
 > + *	 - structural fields: lists, spinlocks and so on.
 > + *
 > + *	Before these parts are initialized, the structure should be memset
 > + *	to 0 or copied from a known clean structure.  That takes care of a lot
 > + *	of fields not initialized explicitly.
 > + */
 > +
 > +static void init_beancounter_struct(struct user_beancounter *ub, uid_t id)
 > +{
 > +	atomic_set(&ub->ub_refcount, 1);
 > +	spin_lock_init(&ub->ub_lock);
 > +	ub->ub_uid = id;
 > +}
 > +
 > +static void init_beancounter_nolimits(struct user_beancounter *ub)
 > +{
 > +	int k;
 > +
 > +	for (k = 0; k < UB_RESOURCES; k++) {
 > +		ub->ub_parms[k].limit = UB_MAXVALUE;
 > +		ub->ub_parms[k].barrier = UB_MAXVALUE;
 > +	}
 > +}
 > +
 > +static void init_beancounter_syslimits(struct user_beancounter *ub)
 > +{
 > +	int k;
 > +
 > +	for (k = 0; k < UB_RESOURCES; k++)
 > +		ub->ub_parms[k].barrier = ub->ub_parms[k].limit;
 > +}
 > +
 > +void __init ub_init_early(void)
 > +{
 > +	struct user_beancounter *ub;
 > +	struct hlist_head *slot;
 > +
 > +	ub = &ub0;
 > +
 
 <factor>
 
 > +	memset(ub, 0, sizeof(*ub));
 > +	init_beancounter_nolimits(ub);
 > +	init_beancounter_struct(ub, 0);
 > +
 
 </factor>
 
 > +	spin_lock_init(&ub_hash_lock);
 > +	slot = &ub_hash[ub_hash_fun(ub->ub_uid)];
 > +	hlist_add_head(&ub->hash, slot);
 > +}
 > +
 > +void __init ub_init_late(void)
 > +{
 > +	struct user_beancounter *ub;
 > +
 > +	ub_cachep = kmem_cache_create("user_beancounters",
 > +			sizeof(struct user_beancounter),
 > +			0, SLAB_HWCACHE_ALIGN, NULL, NULL);
 > +	if (ub_cachep == NULL)
 > +		panic("Can't create ubc caches\n");
 > +
 > +	ub = &default_beancounter;
 
 <factor>
 
 > +	memset(ub, 0, sizeof(default_beancounter));
 > +	init_beancounter_syslimits(ub);
 > +	init_beancounter_struct(ub, 0);
 > +
 
 </factor>
 
 > +	ub = &default_subbeancounter;
 
 <factor>
 
 > +	memset(ub, 0, sizeof(default_subbeancounter));
 > +	init_beancounter_nolimits(ub);
 > +	init_beancounter_struct(ub, 0);
 
 </factor>
 
 > +}
 
 Cheers,
 -Matt Helsley
 |  
	|  |  |  
	|  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH 5/7] UBC: kernel memory accounting (core) [message #5405 is a reply to message #5359] | Fri, 18 August 2006 14:43   |  
			| 
				
				
					|  Dave Hansen Messages: 240
 Registered: October 2005
 | Senior Member |  |  |  
	| On Fri, 2006-08-18 at 12:12 +0400, Kirill Korotaev wrote: > LDT takes from 1 to 16 pages. and is allocated by vmalloc.
 > do you propose to replace it with slab which can fail due to memory
 > fragmentation?
 
 Nope.  ;)
 
 > the same applies to fdset, fdarray, ipc ids and iptables entries.
 
 The vmalloc area, along with all of those other structures _have_ other
 data structures.  Now, it will take a wee bit more patching to directly
 tag those thing with explicit container pointers (or accounting
 references), but I would much prefer that, especially for the things
 that are larger than a page.
 
 I worry that this approach was used instead of patching all of the
 individual subsystems because this was easier to maintain as an
 out-of-tree patch, and it isn't necessarily the best approach.
 
 -- Dave
 |  
	|  |  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH 5/7] UBC: kernel memory accounting (core) [message #5407 is a reply to message #5372] | Fri, 18 August 2006 15:06   |  
			| 
				
				
					|  Dave Hansen Messages: 240
 Registered: October 2005
 | Senior Member |  |  |  
	| On Fri, 2006-08-18 at 13:31 +0400, Kirill Korotaev wrote: > they all are troublesome :/
 > user can create lots of vmas, w/o page tables.
 > lots of fdsets, ipcids.
 > These are not reclaimable.
 
 In the real world, with the customers to which you've given these
 patches, which of these objects is most likely to be consuming the most
 space?  Is there one set of objects that we could work on that would fix
 _most_ of the cases which you have encountered?
 
 > Also consider the following scenario with reclaimable page tables.
 > e.g. user hit kmemsize limit due to fat page tables.
 > kernel reclaims some of the page tables and frees user kenerl memory.
 > after that user creates some uncreclaimable objects like fdsets or ipcs
 > and then accesses memory with reclaimed page tables.
 > Sooner or later we kill user with SIGSEGV from page fault due to
 > no memory. This is worse then returning ENOMEM from poll() or
 > mmap() where user allocates kernel objects.
 
 I think you're claiming that doing reclaim of kernel objects causes much
 more severe and less recoverable errors than does reclaiming of user
 pages.  That might generally be true, but I have one example that's
 killing me.  (You shouldn't have mentioned mmap ;)
 
 Let's say you have a memory area mapped by one pagetable page, and with
 1 user page mapped in it.  The system is out of memory, and if you
 reclaim either the pagetable page or the user page, you're never going
 to get it back.
 
 So, you pick the user page to reclaim.  The user touches it, the memory
 allocation fails, and the process gets killed.
 
 Instead, you reclaim the pagetable page.  The user touches their page,
 the memory allocation for the pagetable fails, and the process gets
 killed.
 
 Seems like the same end result to me.
 
 -- Dave
 |  
	|  |  |  
	|  |  
	|  |  
	|  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH 1/7] UBC: kconfig [message #5424 is a reply to message #5195] | Fri, 18 August 2006 19:57   |  
			| 
				
				
					|  Chandra Seetharaman Messages: 88
 Registered: August 2006
 | Member |  |  |  
	| As pointed in an earlier email, it would be better if we could have this in a arch-independent Kconfig, unless there is any problem with that.
 
 On Wed, 2006-08-16 at 19:35 +0400, Kirill Korotaev wrote:
 > Add kernel/ub/Kconfig file with UBC options and
 > includes it into arch Kconfigs
 >
 > Signed-Off-By: Pavel Emelianov <xemul@sw.ru>
 > Signed-Off-By: Kirill Korotaev <dev@sw.ru>
 >
 > ---
 >  arch/i386/Kconfig    |    2 ++
 >  arch/ia64/Kconfig    |    2 ++
 >  arch/powerpc/Kconfig |    2 ++
 >  arch/ppc/Kconfig     |    2 ++
 >  arch/sparc/Kconfig   |    2 ++
 >  arch/sparc64/Kconfig |    2 ++
 >  arch/x86_64/Kconfig  |    2 ++
 >  kernel/ub/Kconfig    |   25 +++++++++++++++++++++++++
 >  8 files changed, 39 insertions(+)
 >
 > --- ./arch/i386/Kconfig.ubkm	2006-07-10 12:39:10.000000000 +0400
 > +++ ./arch/i386/Kconfig	2006-07-28 14:10:41.000000000 +0400
 > @@ -1146,6 +1146,8 @@ source "crypto/Kconfig"
 >
 >  source "lib/Kconfig"
 >
 > +source "kernel/ub/Kconfig"
 > +
 >  #
 >  # Use the generic interrupt handling code in kernel/irq/:
 >  #
 > --- ./arch/ia64/Kconfig.ubkm	2006-07-10 12:39:10.000000000 +0400
 > +++ ./arch/ia64/Kconfig	2006-07-28 14:10:56.000000000 +0400
 > @@ -481,6 +481,8 @@ source "fs/Kconfig"
 >
 >  source "lib/Kconfig"
 >
 > +source "kernel/ub/Kconfig"
 > +
 >  #
 >  # Use the generic interrupt handling code in kernel/irq/:
 >  #
 > --- ./arch/powerpc/Kconfig.arkcfg	2006-08-07 14:07:12.000000000 +0400
 > +++ ./arch/powerpc/Kconfig	2006-08-10 17:55:58.000000000 +0400
 > @@ -1038,6 +1038,8 @@ source "arch/powerpc/platforms/iseries/K
 >
 >  source "lib/Kconfig"
 >
 > +source "ub/Kconfig"
 > +
 >  menu "Instrumentation Support"
 >          depends on EXPERIMENTAL
 >
 > --- ./arch/ppc/Kconfig.arkcfg	2006-07-10 12:39:10.000000000 +0400
 > +++ ./arch/ppc/Kconfig	2006-08-10 17:56:13.000000000 +0400
 > @@ -1414,6 +1414,8 @@ endmenu
 >
 >  source "lib/Kconfig"
 >
 > +source "ub/Kconfig"
 > +
 >  source "arch/powerpc/oprofile/Kconfig"
 >
 >  source "arch/ppc/Kconfig.debug"
 > --- ./arch/sparc/Kconfig.arkcfg	2006-04-21 11:59:32.000000000 +0400
 > +++ ./arch/sparc/Kconfig	2006-08-10 17:56:24.000000000 +0400
 > @@ -296,3 +296,5 @@ source "security/Kconfig"
 >  source "crypto/Kconfig"
 >
 >  source "lib/Kconfig"
 > +
 > +source "ub/Kconfig"
 > --- ./arch/sparc64/Kconfig.arkcfg	2006-07-17 17:01:11.000000000 +0400
 > +++ ./arch/sparc64/Kconfig	2006-08-10 17:56:36.000000000 +0400
 > @@ -432,3 +432,5 @@ source "security/Kconfig"
 >  source "crypto/Kconfig"
 >
 >  source "lib/Kconfig"
 > +
 > +source "lib/Kconfig"
 > --- ./arch/x86_64/Kconfig.ubkm	2006-07-10 12:39:11.000000000 +0400
 > +++ ./arch/x86_64/Kconfig	2006-07-28 14:10:49.000000000 +0400
 > @@ -655,3 +655,5 @@ source "security/Kconfig"
 >  source "crypto/Kconfig"
 >
 >  source "lib/Kconfig"
 > +
 > +source "kernel/ub/Kconfig"
 > --- ./kernel/ub/Kconfig.ubkm	2006-07-28 13:07:38.000000000 +0400
 > +++ ./kernel/ub/Kconfig	2006-07-28 13:09:51.000000000 +0400
 > @@ -0,0 +1,25 @@
 > +#
 > +# User resources part (UBC)
 > +#
 > +# Copyright (C) 2006 OpenVZ. SWsoft Inc
 > +
 > +menu "User resources"
 > +
 > +config USER_RESOURCE
 > +	bool "Enable user resource accounting"
 > +	default y
 > +	help
 > +          This patch provides accounting and allows to configure
 > +          limits for user's consumption of exhaustible system resources.
 > +          The most important resource controlled by this patch is unswappable
 > +          memory (either mlock'ed or used by internal kernel structures and
 > +          buffers). The main goal of this patch is to protect processes
 > +          from running short of important resources because of an accidental
 > +          misbehavior of processes or malicious activity aiming to ``kill''
 > +          the system. It's worth to mention that resource limits configured
 > +          by setrlimit(2) do not give an acceptable level of protection
 > +          because they cover only small fraction of resources and work on a
 > +          per-process basis.  Per-process accounting doesn't prevent malicious
 > +          users from spawning a lot of resource-consuming processes.
 > +
 > +endmenu
 >
 >  ------------------------------------------------------------ -------------
 > Using Tomcat but need to do more? Need to support web services, security?
 > Get stuff done quickly with pre-integrated technology to make your job easier
 > Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
 >  http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&b id=263057&dat=121642
 > _______________________________________________
 > ckrm-tech mailing list
 > https://lists.sourceforge.net/lists/listinfo/ckrm-tech
 --
 
 ------------------------------------------------------------ ----------
 Chandra Seetharaman               | Be careful what you choose....
 - sekharan@us.ibm.com   |      .......you may get it.
 ------------------------------------------------------------ ----------
 |  
	|  |  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH] UBC: user resource beancounters [message #5425 is a reply to message #5192] | Fri, 18 August 2006 19:39   |  
			| 
				
				
					|  Chandra Seetharaman Messages: 88
 Registered: August 2006
 | Member |  |  |  
	| Kirill, 
 Here are some concerns I have (as of now) w.r.t using UBC for resource
 management (in the context of resource groups).
 
 - guarantee support is missing. I do not see any code to provide the
 minimum amount of resource a group can get. It is important for
 providing QoS. (In a different email you did mention guarantee, i am
 referring it here for completeness).
 - Creation of a UBC and assignment of task to a UBC always happen in
 the context of the task that is affected. I can understand it works in
 OpenVZ environment, but IMO has issues if one wants it to be used for
 basic resource management
 - application needs to be changed to use this feature.
 - System administrator does not have the control to assign tasks to a
 UBC. Application does by itself.
 - Assignment of task to a UBC need to be transparent to the
 application.
 - UBC is deleted when the last task (in that UBC) exits. For resource
 management purposes, UBC should be deleted only when the administrator
 deletes it.
 - No ability to set resource specific configuration information.
 - No ability to maintain resource specific data in the controller.
 - No ability to get the list of tasks belonging to a UBC.
 - Doesn't inform the resource controllers when limits(shares) change.
 - Doesn't inform the resource controllers when a task's UBC has changed.
 - Doesn't recalculate the resource usage when a task's UBC has changed.
 i.e doesn't uncharge the old UBC and charge new UBC.
 - For a system administrator name for identification of a UBC is
 better than a number (uid).
 
 regards,
 
 chandra
 
 On Wed, 2006-08-16 at 19:24 +0400, Kirill Korotaev wrote:
 > The following patch set presents base of
 > User Resource Beancounters (UBC).
 > UBC allows to account and control consumption
 > of kernel resources used by group of processes.
 >
 > The full UBC patch set allows to control:
 > - kernel memory. All the kernel objects allocatable
 >   on user demand should be accounted and limited
 >   for DoS protection.
 >   E.g. page tables, task structs, vmas etc.
 >
 > - virtual memory pages. UBC allows to
 >   limit a container to some amount of memory and
 >   introduces 2-level OOM killer taking into account
 >   container's consumption.
 >   pages shared between containers are correctly
 >   charged as fractions (tunable).
 >
 > - network buffers. These includes TCP/IP rcv/snd
 >   buffers, dgram snd buffers, unix, netlinks and
 >   other buffers.
 >
 > - minor resources accounted/limited by number:
 >   tasks, files, flocks, ptys, siginfo, pinned dcache
 >   mem, sockets, iptentries (for containers with
 >   virtualized networking)
 >
 > As the first step we want to propose for discussion
 > the most complicated parts of resource management:
 > kernel memory and virtual memory.
 > The patch set to be sent provides core for UBC and
 > management of kernel memory only. Virtual memory
 > management will be sent in a couple of days.
 >
 > The patches in these series are:
 > diff-ubc-kconfig.patch:
 >     Adds kernel/ub/Kconfig file with UBC options and
 >     includes it into arch Kconfigs
 >
 > diff-ubc-core.patch:
 >     Contains core functionality and interfaces of UBC:
 >     find/create beancounter, initialization,
 >     charge/uncharge of resource, core objects' declarations.
 >
 > diff-ubc-task.patch:
 >     Contains code responsible for setting UB on task,
 >     it's inheriting and setting host context in interrupts.
 >
 >     Task contains three beancounters:
 >     1. exec_ub  - current context. all resources are charged
 >                   to this beancounter.
 >     2. task_ub  - beancounter to which task_struct is charged
 >                   itself.
 >     3. fork_sub - beancounter which is inherited by
 >                   task's children on fork
 >
 > diff-ubc-syscalls.patch:
 >     Patch adds system calls for UB management:
 >     1. sys_getluid    - get current UB id
 >     2. sys_setluid    - changes exec_ and fork_ UBs on current
 >     3. sys_setublimit - set limits for resources consumtions
 >
 > diff-ubc-kmem-core.patch:
 >     Introduces UB_KMEMSIZE resource which accounts kernel
 >     objects allocated by task's request.
 >
 >     Objects are accounted via struct page and slab objects.
 >     For the latter ones each slab contains a set of pointers
 >     corresponding object is charged to.
 >
 >     Allocation charge rules:
 >     1. Pages - if allocation is performed with __GFP_UBC flag - page
 >        is charged to current's exec_ub.
 >     2. Slabs - kmem_cache may be created with SLAB_UBC flag - in this
 >        case each allocation is charged. Caches used by kmalloc are
 >        created with SLAB_UBC | SLAB_UBC_NOCHARGE flags. In this case
 >        only __GFP_UBC allocations are charged.
 >
 > diff-ubc-kmem-charge.patch:
 >     Adds SLAB_UBC and __GFP_UBC flags in appropriate places
 >     to cause charging/limiting of specified resources.
 >
 > diff-ubc-proc.patch:
 >     Adds two proc entries user_beancounters and user_beancounters_sub
 >     allowing to see current state (usage/limits/fails for each UB).
 >     Implemented via seq files.
 >
 > Patch set is applicable to 2.6.18-rc4-mm1
 >
 > Thanks,
 > Kirill
 >
 >
 >  ------------------------------------------------------------ -------------
 > Using Tomcat but need to do more? Need to support web services, security?
 > Get stuff done quickly with pre-integrated technology to make your job easier
 > Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
 >  http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&b id=263057&dat=121642
 > _______________________________________________
 > ckrm-tech mailing list
 > https://lists.sourceforge.net/lists/listinfo/ckrm-tech
 --
 
 ------------------------------------------------------------ ----------
 Chandra Seetharaman               | Be careful what you choose....
 - sekharan@us.ibm.com   |      .......you may get it.
 ------------------------------------------------------------ ----------
 |  
	|  |  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH 4/7] UBC: syscalls (user interface) [message #5426 is a reply to message #5199] | Fri, 18 August 2006 20:13   |  
			| 
				
				
					|  Chandra Seetharaman Messages: 88
 Registered: August 2006
 | Member |  |  |  
	| On Wed, 2006-08-16 at 19:39 +0400, Kirill Korotaev wrote: 
 <snip>
 
 > +/*
 > + *	The setbeanlimit syscall
 > + */
 > +asmlinkage long sys_setublimit(uid_t uid, unsigned long resource,
 > +		unsigned long *limits)
 > +{
 > +	int error;
 > +	unsigned long flags;
 > +	struct user_beancounter *ub;
 > +	unsigned long new_limits[2];
 > +
 > +	error = -EPERM;
 > +	if(!capable(CAP_SYS_RESOURCE))
 > +		goto out;
 > +
 > +	error = -EINVAL;
 > +	if (resource >= UB_RESOURCES)
 > +		goto out;
 > +
 > +	error = -EFAULT;
 > +	if (copy_from_user(&new_limits, limits, sizeof(new_limits)))
 > +		goto out;
 > +
 > +	error = -EINVAL;
 > +	if (new_limits[0] > UB_MAXVALUE || new_limits[1] > UB_MAXVALUE)
 > +		goto out;
 > +
 > +	error = -ENOENT;
 > +	ub = beancounter_findcreate(uid, NULL, 0);
 > +	if (ub == NULL)
 > +		goto out;
 > +
 > +	spin_lock_irqsave(&ub->ub_lock, flags);
 > +	ub->ub_parms[resource].barrier = new_limits[0];
 > +	ub->ub_parms[resource].limit = new_limits[1];
 
 >From my understanding it appear that barrier <= limit. But, the check is
 missing here.
 > +	spin_unlock_irqrestore(&ub->ub_lock, flags);
 > +
 > +	put_beancounter(ub);
 > +	error = 0;
 > +out:
 > +	return error;
 > +}
 > +#endif
 >
 >  ------------------------------------------------------------ -------------
 > Using Tomcat but need to do more? Need to support web services, security?
 > Get stuff done quickly with pre-integrated technology to make your job easier
 > Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
 >  http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&b id=263057&dat=121642
 > _______________________________________________
 > ckrm-tech mailing list
 > https://lists.sourceforge.net/lists/listinfo/ckrm-tech
 --
 
 ------------------------------------------------------------ ----------
 Chandra Seetharaman               | Be careful what you choose....
 - sekharan@us.ibm.com   |      .......you may get it.
 ------------------------------------------------------------ ----------
 |  
	|  |  |  
	|  |  
	| 
		
			| Re: [ckrm-tech] [PATCH 4/7] UBC: syscalls (user interface) [message #5434 is a reply to message #5357] | Fri, 18 August 2006 14:45   |  
			| 
				
				
					|  Dave Hansen Messages: 240
 Registered: October 2005
 | Senior Member |  |  |  
	| On Fri, 2006-08-18 at 12:08 +0400, Andrey Savochkin wrote: >
 > A) Have separate memory management for each container,
 >    with separate buddy allocator, lru lists, page replacement mechanism.
 >    That implies a considerable overhead, and the main challenge there
 >    is sharing of pages between these separate memory managers.
 
 Hold on here for just a sec...
 
 It is quite possible to do memory management aimed at one container
 while that container's memory still participates in the main VM.
 
 There is overhead here, as the LRU scanning mechanisms get less
 efficient, but I'd rather pay a penalty at LRU scanning time than divide
 up the VM, or coarsely start failing allocations.
 
 -- Dave
 |  
	|  |  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH] UBC: user resource beancounters [message #5437 is a reply to message #5301] | Thu, 17 August 2006 19:55   |  
			| 
				
				
					|  Chandra Seetharaman Messages: 88
 Registered: August 2006
 | Member |  |  |  
	| On Thu, 2006-08-17 at 17:55 +0400, Kirill Korotaev wrote: > > On Wed, Aug 16, 2006 at 07:24:03PM +0400, Kirill Korotaev wrote:
 > >
 > >>As the first step we want to propose for discussion
 > >>the most complicated parts of resource management:
 > >>kernel memory and virtual memory.
 > >
 > > Do you have any plans to post a CPU controller? Is that tied to UBC
 > > interface as well?
 >
 > Not everything at once :) To tell the truth I think CPU controller
 > is even more complicated than user memory accounting/limiting.
 >
 > No, fair CPU scheduler is not tied to UBC in any regard.
 
 Not having the CPU controller on UBC doesn't sound good for the
 infrastructure. IMHO, the infrastructure (for resource management) we
 are going to have should be able to support different resource
 controllers, without each controllers needing to have their own
 infrastructure/interface etc.,
 
 > As we discussed before, it is valuable to have an ability to limit
 > different resources separately (CPU, disk I/O, memory, etc.).
 
 Having ability to limit/control different resources separately not
 necessarily mean we should have different infrastructure for each.
 
 > For example, it can be possible to place some mission critical
 > kernel threads (like kjournald) in a separate contanier.
 
 I don't understand the comment above (in this context).
 >
 > This patches are related to kernel memory and nothing more :)
 >
 > Thanks,
 > Kirill
 >
 >
 >  ------------------------------------------------------------ -------------
 > Using Tomcat but need to do more? Need to support web services, security?
 > Get stuff done quickly with pre-integrated technology to make your job easier
 > Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
 >  http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&b id=263057&dat=121642
 > _______________________________________________
 > ckrm-tech mailing list
 > https://lists.sourceforge.net/lists/listinfo/ckrm-tech
 --
 
 ------------------------------------------------------------ ----------
 Chandra Seetharaman               | Be careful what you choose....
 - sekharan@us.ibm.com   |      .......you may get it.
 ------------------------------------------------------------ ----------
 |  
	|  |  |  
	| 
		
			| Re: [RFC][PATCH 4/7] UBC: syscalls (user interface) [message #5438 is a reply to message #5199] | Fri, 18 August 2006 11:40   |  
			| 
				
				
					|  Arnd Bergmann Messages: 10
 Registered: February 2006
 | Junior Member |  |  |  
	| On Wednesday 16 August 2006 17:39, Kirill Korotaev wrote: > --- ./include/asm-powerpc/systbl.h.arsys        2006-07-10 12:39:19.000000000 +0400
 > +++ ./include/asm-powerpc/systbl.h      2006-08-10 17:05:53.000000000 +0400
 > @@ -304,3 +304,6 @@ SYSCALL_SPU(fchmodat)
 >  SYSCALL_SPU(faccessat)
 >  COMPAT_SYS_SPU(get_robust_list)
 >  COMPAT_SYS_SPU(set_robust_list)
 > +SYSCALL(sys_getluid)
 > +SYSCALL(sys_setluid)
 > +SYSCALL(sys_setublimit)
 ...
 > --- ./include/asm-x86_64/unistd.h.ubsys 2006-07-10 12:39:19.000000000 +0400
 > +++ ./include/asm-x86_64/unistd.h       2006-07-31 16:00:01.000000000 +0400
 > @@ -619,10 +619,16 @@ __SYSCALL(__NR_sync_file_range, sys_sync
 >  __SYSCALL(__NR_vmsplice, sys_vmsplice)
 >  #define __NR_move_pages                279
 >  __SYSCALL(__NR_move_pages, sys_move_pages)
 > +#define __NR_getluid           280
 > +__SYSCALL(__NR_getluid, sys_getluid)
 > +#define __NR_setluid           281
 > +__SYSCALL(__NR_setluid, sys_setluid)
 > +#define __NR_setublimit                282
 > +__SYSCALL(__NR_setublimit, sys_setublimit)
 >
 ...
 > +/*
 > + *     The setbeanlimit syscall
 > + */
 > +asmlinkage long sys_setublimit(uid_t uid, unsigned long resource,
 > +               unsigned long *limits)
 
 While I don't yet understand what this call does, it looks to me that
 the way it's implemented breaks in 32 bit emulation mode on x86_64 and
 powerpc.
 
 You either need to pass a pointer a something that is the same on 32 and
 64 bit (e.g. __u64 __user *limits), or need to provide a different
 entry point for 32 bit applications:
 
 long compat_sys_setublimit(compat_uid_t uid, compat_ulong_t resource,
 compat_ulong_t __user *limits);
 
 You should also add the prototypes to include/linux/syscalls.h.
 
 Arnd <><
 |  
	|  |  |  
	|  |  
	| 
		
			| Re: [RFC][PATCH 1/7] UBC: kconfig [message #5444 is a reply to message #5195] | Fri, 18 August 2006 21:14   |  
			| 
				
				
					|  Adrian Bunk Messages: 10
 Registered: August 2006
 | Junior Member |  |  |  
	| On Wed, Aug 16, 2006 at 07:35:34PM +0400, Kirill Korotaev wrote: > Add kernel/ub/Kconfig file with UBC options and
 > includes it into arch Kconfigs
 >
 > Signed-Off-By: Pavel Emelianov <xemul@sw.ru>
 > Signed-Off-By: Kirill Korotaev <dev@sw.ru>
 >
 > ---
 > arch/i386/Kconfig    |    2 ++
 > arch/ia64/Kconfig    |    2 ++
 > arch/powerpc/Kconfig |    2 ++
 > arch/ppc/Kconfig     |    2 ++
 > arch/sparc/Kconfig   |    2 ++
 > arch/sparc64/Kconfig |    2 ++
 > arch/x86_64/Kconfig  |    2 ++
 > kernel/ub/Kconfig    |   25 +++++++++++++++++++++++++
 > 8 files changed, 39 insertions(+)
 >...
 > --- ./arch/powerpc/Kconfig.arkcfg	2006-08-07 14:07:12.000000000 +0400
 > +++ ./arch/powerpc/Kconfig	2006-08-10 17:55:58.000000000 +0400
 > @@ -1038,6 +1038,8 @@ source "arch/powerpc/platforms/iseries/K
 >
 > source "lib/Kconfig"
 >
 > +source "ub/Kconfig"
 
 kernel/ub/Kconfig
 
 >...
 > --- ./arch/ppc/Kconfig.arkcfg	2006-07-10 12:39:10.000000000 +0400
 > +++ ./arch/ppc/Kconfig	2006-08-10 17:56:13.000000000 +0400
 > @@ -1414,6 +1414,8 @@ endmenu
 >
 > source "lib/Kconfig"
 >
 > +source "ub/Kconfig"
 
 kernel/ub/Kconfig
 
 >...
 > --- ./arch/sparc/Kconfig.arkcfg	2006-04-21 11:59:32.000000000 +0400
 > +++ ./arch/sparc/Kconfig	2006-08-10 17:56:24.000000000 +0400
 > @@ -296,3 +296,5 @@ source "security/Kconfig"
 > source "crypto/Kconfig"
 >
 > source "lib/Kconfig"
 > +
 > +source "ub/Kconfig"
 
 kernel/ub/Kconfig
 
 > --- ./arch/sparc64/Kconfig.arkcfg	2006-07-17 17:01:11.000000000 +0400
 > +++ ./arch/sparc64/Kconfig	2006-08-10 17:56:36.000000000 +0400
 > @@ -432,3 +432,5 @@ source "security/Kconfig"
 > source "crypto/Kconfig"
 >
 > source "lib/Kconfig"
 > +
 > +source "lib/Kconfig"
 
 kernel/ub/Kconfig
 
 >...
 > --- ./kernel/ub/Kconfig.ubkm	2006-07-28 13:07:38.000000000 +0400
 > +++ ./kernel/ub/Kconfig	2006-07-28 13:09:51.000000000 +0400
 > @@ -0,0 +1,25 @@
 > +#
 > +# User resources part (UBC)
 > +#
 > +# Copyright (C) 2006 OpenVZ. SWsoft Inc
 > +
 > +menu "User resources"
 > +
 > +config USER_RESOURCE
 > +	bool "Enable user resource accounting"
 > +	default y
 >...
 
 Optional functionality shouldn't default to y.
 
 cu
 Adrian
 
 --
 
 Gentoo kernels are 42 times more popular than SUSE kernels among
 KLive users  (a service by SUSE contractor Andrea Arcangeli that
 gathers data about kernels from many users worldwide).
 
 There are three kinds of lies: Lies, Damn Lies, and Statistics.
 Benjamin Disraeli
 |  
	|  |  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH 3/7] UBC: ub context and inheritance [message #5445 is a reply to message #5371] | Sat, 19 August 2006 02:19   |  
			| 
				
				
					|  Matt Helsley Messages: 86
 Registered: August 2006
 | Member |  |  |  
	| On Fri, 2006-08-18 at 13:23 +0400, Kirill Korotaev wrote: > Matt Helsley wrote:
 > > On Wed, 2006-08-16 at 19:38 +0400, Kirill Korotaev wrote:
 > >
 > >>Contains code responsible for setting UB on task,
 > >>it's inheriting and setting host context in interrupts.
 > >>
 > >>Task references three beancounters:
 > >>  1. exec_ub  current context. all resources are
 > >>              charged to this beancounter.
 > >
 > >
 > > nit: 2-3 below seem to contradict "all". If you mean "the rest" then
 > > perhaps you ought to reorder these:
 > >
 > > 1. task_ub ...
 > > 2. fork_sub ...
 > > 3. exec_ub Current context. Resources not charged to task_ub
 > >            or fork_sub are charged to this beancounter.
 > not sure what you mean.
 > task_ub - where _task_ _itself_ is charged as an object.
 >           following patches will add charging of "number of tasks" using it.
 > fork_sub - beancounter which is inherited on fork() (chaning task beancounter).
 > exec_ub - is current context.
 >
 >
 > >>  2. task_ub  beancounter to which task_struct is
 > >>              charged itself.
 > >
 > >
 > > Is task_ub frequently the parent beancounter of exec_ub? If it's always
 > > the parent then perhaps the one or more of these _ub fields in the task
 > > struct are not necessary.
 > no, task_ub != exec_ub of parent task
 > when task is created anything can happen: task can change ub, parent can change ub,
 > task can be reparented. But the UB we charged task to should be known.
 >
 > > Also in that case keeping copies of the
 > > "parent" user_beancounter pointers in the task_beancounters would seem
 > > bug-prone -- if the hierarchy of beancounters changes then these would
 > > need to be changed too.
 > >
 > >
 > >>  3. fork_sub beancounter which is inherited by
 > >>              task's children on fork
 > >
 > >
 > > Is this frequently the same as exec_ub?
 > frequently, but not always. exec_ub is changed in softirq for example.
 > consider exec_ub as 'current' pointer in kernel.
 >
 > see other comments below
 >
 > >>Signed-Off-By: Pavel Emelianov <xemul@sw.ru>
 > >>Signed-Off-By: Kirill Korotaev <dev@sw.ru>
 > >>
 > >>---
 > >> include/linux/sched.h   |    5 +++++
 > >> include/ub/task.h       |   42 ++++++++++++++++++++++++++++++++++++++++++
 > >> kernel/fork.c           |   21 ++++++++++++++++-----
 > >> kernel/irq/handle.c     |    9 +++++++++
 > >> kernel/softirq.c        |    8 ++++++++
 > >> kernel/ub/Makefile      |    1 +
 > >> kernel/ub/beancounter.c |    4 ++++
 > >> kernel/ub/misc.c        |   34 ++++++++++++++++++++++++++++++++++
 > >> 8 files changed, 119 insertions(+), 5 deletions(-)
 > >>
 > >>--- ./include/linux/sched.h.ubfork	2006-07-17 17:01:12.000000000 +0400
 > >>+++ ./include/linux/sched.h	2006-07-31 16:01:54.000000000 +0400
 > >>@@ -81,6 +81,8 @@ struct sched_param {
 > >> #include <linux/timer.h>
 > >> #include <linux/hrtimer.h>
 > >>
 > >>+#include <ub/task.h>
 > >>+
 > >> #include <asm/processor.h>
 > >>
 > >> struct exec_domain;
 > >>@@ -997,6 +999,9 @@ struct task_struct {
 > >> 	spinlock_t delays_lock;
 > >> 	struct task_delay_info *delays;
 > >> #endif
 > >>+#ifdef CONFIG_USER_RESOURCE
 > >>+	struct task_beancounter	task_bc;
 > >>+#endif
 > >> };
 > >>
 > >> static inline pid_t process_group(struct task_struct *tsk)
 > >>--- ./include/ub/task.h.ubfork	2006-07-28 18:53:52.000000000 +0400
 > >>+++ ./include/ub/task.h	2006-08-01 15:26:08.000000000 +0400
 > >>@@ -0,0 +1,42 @@
 > >>+/*
 > >>+ *  include/ub/task.h
 > >>+ *
 > >>+ *  Copyright (C) 2006 OpenVZ. SWsoft Inc
 > >>+ *
 > >>+ */
 > >>+
 > >>+#ifndef __UB_TASK_H_
 > >>+#define __UB_TASK_H_
 > >>+
 > >>+#include <linux/config.h>
 > >>+
 > >>+struct user_beancounter;
 > >>+
 > >>+struct task_beancounter {
 > >>+	struct user_beancounter *exec_ub;
 > >>+	struct user_beancounter *task_ub;
 > >>+	struct user_beancounter *fork_sub;
 > >>+};
 > >>+
 > >>+#ifdef CONFIG_USER_RESOURCE
 > >>+#define get_exec_ub()		(current->task_bc.exec_ub)
 > >>+#define set_exec_ub(newub)			\
 > >>+	({					\
 > >>+		 struct user_beancounter *old;	\
 > >>+		 struct task_beancounter *tbc;	\
 > >>+		 tbc = ¤t->task_bc;	\
 > >>+		 old = tbc->exec_ub;		\
 > >>+		 tbc->exec_ub = newub;		\
 > >>+		 old;				\
 > >>+	 })
 > >>+
 > >
 > >
 > > How about making these static inlines?
 > possible, but this requires including sched.h, which includes this file...
 > so this one is easier and more separated.
 >
 > >>+int ub_task_charge(struct task_struct *parent, struct task_struct *new);
 > >>+void ub_task_uncharge(struct task_struct *tsk);
 > >>+
 > >>+#else /* CONFIG_USER_RESOURCE */
 > >>+#define get_exec_ub()		(NULL)
 > >>+#define set_exec_ub(__ub)	(NULL)
 > >>+#define ub_task_charge(p, t)	(0)
 > >>+#define ub_task_uncharge(t)	do { } while (0)
 > >>+#endif /* CONFIG_USER_RESOURCE */
 > >>+#endif /* __UB_TASK_H_ */
 > >>--- ./kernel/irq/handle.c.ubirq	2006-07-10 12:39:20.000000000 +0400
 > >>+++ ./kernel/irq/handle.c	2006-08-01 12:39:34.000000000 +0400
 > >>@@ -16,6 +16,9 @@
 > >> #include <linux/interrupt.h>
 > >> #include <linux/kernel_stat.h>
 > >>
 > >>+#include <ub/beancounter.h>
 > >>+#include <ub/task.h>
 > >>+
 > >> #include "internals.h"
 > >>
 > >> /**
 > >>@@ -166,6 +169,9 @@ fastcall unsigned int __do_IRQ(unsigned
 > >> 	struct irq_desc *desc = irq_desc + irq;
 > >> 	struct irqaction *action;
 > >> 	unsigned int status;
 > >>+	struct user_beancounter *ub;
 > >>+
 > >>+	ub = set_exec_ub(&ub0);
 > >
 > >
 > > Perhaps a comment: "/* Don't charge resources gained in interrupts to current */
 > ok, will add comment:
 > /* UBC charges should be done to host system */
 > >
 > >
 > >> 	kstat_this_cpu.irqs[irq]++;
 > >> 	if (CHECK_IRQ_PER_CPU(desc->status)) {
 > >>@@ -178,6 +184,8 @@ fastcall unsigned int __do_IRQ(unsigned
 > >> 			desc->chip->ack(irq);
 > >> 		action_ret = handle_IRQ_event(irq, regs, desc->action);
 > >> 		desc->chip->end(irq);
 > >>+
 > >>+		(void) set_exec_ub(ub);
 > >> 		return 1;
 > >> 	}
 > >>
 > >>@@ -246,6 +254,7 @@ out:
 > >> 	desc->chip->end(irq);
 > >> 	spin_unlock(&desc->lock);
 > >>
 > >>+	(void) set_exec_ub(ub);
 > >
 > >
 > >
 > > Seems like a WARN_ON() would be appropriate rather than ignoring the
 > > return code.
 > BUG_ON(ret != &ub0) ?
 
 Oops, yes, it's not a return code and BUG_ON() does seem more
 appropriate.
 
 >
 > maybe introduce a kind of
 > reset_exec_ub(old_ub, expected_current_ub)
 > {
 >   ret = set_exec_ub(old_ub);
 >   BUG_ON(ret != expected_current_ub);
 > }
 > ?
 
 Seems like a good idea to me. This way when UBC is not configured
 there'd also be no BUG_ON().
 
 > >> 	return 1;
 > >> }
 > >>
 > >>--- ./kernel/softirq.c.ubirq	2006-07-17 17:01:12.000000000 +0400
 > >>+++ ./kernel/softirq.c	2006-08-01 12:40:44.000000000 +0400
 > >>@@ -18,6 +18,9 @@
 > >> #include <linux/rcupdate.h>
 > >> #include <linux/smp.h>
 > >>
 > >>+#include <ub/beancounter.h>
 > >>+#include <ub/task.h>
 > >>+
 > >> #include <asm/irq.h>
 > >> /*
 > >>    - No shared variables, all the data are CPU local.
 > >>@@ -191,6 +194,9 @@ asmlinkage void __do_softirq(void)
 > >> 	__u32 pending;
 > >> 	int max_restart = MAX_SOFTIRQ_RESTART;
 > >> 	int cpu;
 > >>+	struct user_beancounter *ub;
 > >>+
 > >>+	ub = set_exec_ub(&ub0);
 > >
 > >
 > > Perhaps add the same comment...
 > ok
 >
 > >
 > >
 > >> 	pending = local_softirq_pending();
 > >> 	account_system_vtime(current);
 > >>@@ -229,6 +235,8 @@ restart:
 > >>
 > >> 	account_system_vtime(current);
 > >> 	_local_bh_enable();
 > >>+
 > >>+	(void) set_exec_ub(ub);
 > >
 > >
 > > .. and the same WARN_ON.
 > >
 > >
 > >> }
 > >>
 > >> #ifndef __ARCH_HAS_DO_SOFTIRQ
 > >>--- ./kernel/fork.c.ubfork	2006-07-17 17:01:12.000000000 +0400
 > >>+++ ./kernel/fork.c	2006-08-01 12:58:36.000000000 +0400
 > >>@@ -46,6 +46,8 @@
 > >> #include <linux/delayacct.h>
 > >> #include <linux/taskstats_kern.h>
 > >>
 > >>+#include <ub/task.h>
 > >>+
 > >> #include <asm/pgtable.h>
 > >> #include <asm/pgalloc.h>
 > >> #include <asm/uaccess.h>
 > >>@@ -102,6 +104,7 @@ static kmem_cache_t *mm_cachep;
 > >>
 > >> void free_task(struct task_struct *tsk)
 > >> {
 > >>+	ub_task_uncharge(tsk);
 > >> 	free_thread_info(tsk->thread_info);
 > >> 	rt_mutex_debug_task_free(tsk);
 > >> 	free_task_struct(tsk);
 > >>@@ -162,18 +165,19 @@ static struct task_struct *dup_task_stru
 > >>
 > >> 	tsk = alloc_task_struct();
 > >> 	if (!tsk)
 > >>-		return NULL;
 > >>+		goto out;
 > >>
 > >> 	ti = alloc_thread_info(tsk);
 > >>-	if (!ti) {
 > >>-		free_task_struct(tsk);
 > >>-		return NULL;
 > >>-	}
 > >>+	if (!ti)
 > >>+		goto out_tsk;
 > >>
 > >> 	*tsk = *orig;
 > >> 	tsk->thread_info = ti;
 > >> 	setup_thread_stack(tsk, orig);
 > >>
 > >>+	if (ub_task_charge(orig, tsk))
 > >>+		goto out_ti;
 > >>+
 > >> 	/* One for us, one for whoever does the "release_task()" (usually parent) */
 > >> 	atomic_set(&tsk->usage,2);
 > >> 	atomic_set(&tsk->fs_excl, 0);
 > >>@@ -180,6 +184,13 @@ static struct task_struct *dup_task_stru
 > >> #endif
 > >> 	tsk->splice_pipe = NULL;
 > >> 	return tsk;
 > >>+
 > >>+out_ti:
 > >>+	free_thread_info(ti);
 > >>+out_tsk:
 > >>+	free_task_struct(tsk);
 > >>+out:
 > >>+	return NULL;
 > >
 > >
 > > Ugh. This is starting to look like copy_process(). Any reason you
 > > couldn't move the bean counter bits to copy_process() instead?
 > This is more logical place since we _will_ charge task here
 > (next patchset for numproc).
 > It is logically better to charge objects in places where
 > they are allocated. At the same time we inherit tasks ubs here.
 
 Most other systems that aren't so critically related to task struct
 allocation and copying place their code in copy_process() and not in
 dup_task_struct().
 
 Frankly this still seems to belong in copy_process(). The pattern (with
 the gotos) is already there, as are accounting, audit, and security
 pieces for example.
 
 > >> }
 > >>
 > >> #ifdef CONFIG_MMU
 > >>--- ./kernel/ub/Makefile.ubcore	2006-08-03 16:24:56.000000000 +0400
 > >>+++ ./kernel/ub/Makefile	2006-08-01 11:08:39.000000000 +0400
 > >>@@ -5,3 +5,4 @@
 > >> #
 > >>
 > >> obj-$(CONFIG_USER_RESOURCE) += beancounter.o
 > >>+obj-$(CONFIG_USER_RESOURCE) += misc.o
 > >>--- ./kernel/ub/beancounter.c.ubcore	2006-07-28 13:07:44.000000000 +0400
 > >>+++ ./kernel/ub/beancounter.c	2006-08-03 16:14:17.000000000 +0400
 > >>@@ -395,6 +395,10 @@
 > >> 	spin_lock_init(&ub_hash_lock);
 > >> 	slot = &ub_hash[ub_hash_fun(ub->ub_uid)];
 > >> 	hlist_add_head(&ub->hash, slot);
 > >>+
 > >>+	current->task_bc.exec_ub = ub;
 > >>+	current->task_bc.task_ub = get_beancounter(ub);
 > >>+	current->task_bc.fork_sub = get_beancounter(ub);
 > >> }
 > >>
 > >> void __init ub_init_late(void)
 > >>--- ./kernel/ub/misc.c.ubfork	2006-07-31 16:23:44.000000000 +0400
 > >>+++ ./kernel/ub/misc.c	2006-07-31 16:28:47.000000000 +0400
 > >>@@ -0,0 +1,34 @@
 > >>+/*
 > >>+ * kernel/ub/misc.c
 > >>+ *
 > >>+ * Copyright (C) 2006 OpenVZ. SWsoft Inc.
 > >>+ *
 > >>+ */
 > >>+
 > >>+#include <linux/sched.h>
 > >>+
 > >>+#include <ub/beancounter.h>
 > >>+#include <ub/task.h>
 > >>+
 > >>+int ub_task_charge(struct task_struct *parent, struct task_struct *new)
 > >>+{
 > >
 > >
 > > parent could be derived from new if you move the charge to copy_process
 > > instead of dup_task_struct.
 > we can split it into:
 > ub_charge_task() in dup_task_struct to account _task_ itself.
 > ub_copy_process() in copy_process() to inherit and initialize
 >   exec_ub and fork_sub
 >
 > what do you think?
 
 I do like the idea of splitting this up.
 
 Though as I said I'm still against adding stuff to dup_task_struct() if
 it's not allocating/copying the task struct or thread info. I think of
 "dup_task_struct()" as dealing with the core purpose of the task_struct
 -- copy_process() seems to be for all of the things that have plugged
 their own fields into task_struct over the years.
 
 Since I haven't seen the numproc patches that follow this I can't
 really comment (one way or the other) on whether plugging that into
 dup_task_struct() seems appropriate or necessary.
 
 > >>+	struct task_beancounter *old_bc;
 > >>+	struct task_beancounter *new_bc;
 > >>+	struct user_beancounter *ub;
 > >>+
 > >>+	old_bc = &parent->task_bc;
 > >>+	new_bc = &new->task_bc;
 > >>+
 > >>+	ub = old_bc->fork_sub;
 > >>+	new_bc->exec_ub = get_beancounter(ub);
 > >>+	new_bc->task_ub = get_beancounter(ub);
 > >>+	new_bc->fork_sub = get_beancounter(ub);
 > >>+	return 0;
 > >>+}
 
 <snip>
 
 Cheers,
 -Matt Helsley
 |  
	|  |  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH 4/7] UBC: syscalls (user interface) [message #5446 is a reply to message #5388] | Sat, 19 August 2006 02:43   |  
			| 
				
				
					|  Matt Helsley Messages: 86
 Registered: August 2006
 | Member |  |  |  
	| On Fri, 2006-08-18 at 15:45 +0400, Kirill Korotaev wrote: > Matt Helsley wrote:
 >
 > [... snip ...]
 > >>--- ./kernel/ub/sys.c.ubsys	2006-07-28 18:52:18.000000000 +0400
 > >>+++ ./kernel/ub/sys.c	2006-08-03 16:14:23.000000000 +0400
 > >>@@ -0,0 +1,126 @@
 
 <snip>
 
 > >>+#else /* CONFIG_USER_RESOURCE */
 > >>+
 > >>+/*
 > >>+ *	The (rather boring) getluid syscall
 > >>+ */
 > >>+asmlinkage long sys_getluid(void)
 > >>+{
 > >>+	struct user_beancounter *ub;
 > >>+
 > >>+	ub = get_exec_ub();
 > >>+	if (ub == NULL)
 > >>+		return -EINVAL;
 > >>+
 > >>+	return ub->ub_uid;
 > >>+}
 > >>+
 > >>+/*
 > >>+ *	The setluid syscall
 > >>+ */
 > >>+asmlinkage long sys_setluid(uid_t uid)
 > >>+{
 > >>+	int error;
 > >>+	struct user_beancounter *ub;
 > >>+	struct task_beancounter *task_bc;
 > >>+
 > >>+	task_bc = ¤t->task_bc;
 > >>+
 > >>+	/* You may not disown a setluid */
 > >>+	error = -EINVAL;
 > >>+	if (uid == (uid_t)-1)
 > >>+		goto out;
 > >>+
 > >>+	/* You may only set an ub as root */
 > >>+	error = -EPERM;
 > >>+	if (!capable(CAP_SETUID))
 > >>+		goto out;
 > >
 > >
 > > With resource groups you don't necessarily have to be root -- just the
 > > owner of the group and task.
 > the question is - who is the owner of group?
 
 Whoever is made the 'owner' of the directory is the owner of the group.
 If you own both then you can add your task to your group.
 
 > user, user group or who?
 > Both are bad, since the same user can run inside the container and thus
 > container will be potentially controllable/breakable from inside.
 
 No, that's not a problem. The way shares work is you get a "portion" of
 the parent group's resources and if the parent has limited your portion
 you cannot exceed that. At the same time you can control how your
 portion is dealt out within the child group.
 
 > > Filesystems and appropriate share representations offer a way to give
 > > regular users the ability to manage their resources without requiring
 > > CAP_FOO.
 > not sure what you propose...
 
 A filesystem interface.
 
 > we can introduce the following rules:
 >
 > containers (UB) can be created by process with SETUID cap only.
 > subcontainers (SUB) can be created by any process.
 
 Can subsubcontainers be created?
 
 > what do you think?
 
 I think a filesystem interface would work better. ;)
 
 >
 > >>+	/* Ok - set up a beancounter entry for this user */
 > >>+	error = -ENOBUFS;
 > >>+	ub = beancounter_findcreate(uid, NULL, UB_ALLOC);
 > >>+	if (ub == NULL)
 > >>+		goto out;
 > >>+
 > >>+	/* install bc */
 > >>+	put_beancounter(task_bc->exec_ub);
 > >>+	task_bc->exec_ub = ub;
 > >>+	put_beancounter(task_bc->fork_sub);
 > >>+	task_bc->fork_sub = get_beancounter(ub);
 > >>+	error = 0;
 > >>+out:
 > >>+	return error;
 > >>+}
 > >>+
 > >>+/*
 > >>+ *	The setbeanlimit syscall
 > >>+ */
 > >>+asmlinkage long sys_setublimit(uid_t uid, unsigned long resource,
 > >>+		unsigned long *limits)
 > >>+{
 > >>+	int error;
 > >>+	unsigned long flags;
 > >>+	struct user_beancounter *ub;
 > >>+	unsigned long new_limits[2];
 > >>+
 > >>+	error = -EPERM;
 > >>+	if(!capable(CAP_SYS_RESOURCE))
 > >>+		goto out;
 > >
 > >
 > > Again, a filesystem interface would give us more flexibility when it
 > > comes to allowing users to manage their resources while still preventing
 > > them from exceeding limits.
 > we can have 2 different root users with uid = 0 in 2 different containers.
 
 You shouldn't need to have the 2 containers to give resource control to
 other users. In other words you shouldn't need to use containers in
 order to do resource management. The container model  is by no means the
 only way to model resource management.
 
 > > I doubt you really want to give owners of a container CAP_SYS_RESOURCE
 > > and CAP_USER (i.e. total control over resource management) just to allow
 > > them to manage their subset of the resources.
 > The origin idea is that administator of the node can manage user
 > resources only. Users can't, since otherwise they can increase the limits.
 
 The user may wish to manage the resource usage of her applications
 within restrictions imposed by an administrator. If the user has a
 portion of resources then you only need to ensure that the sum of her
 resources does not exceed the administrator-provided limit.
 
 > But we can allow them to manage sub beancoutners imho...
 
 And subsubbeancounters?
 
 <snip>
 
 Cheers,
 -Matt Helsley
 |  
	|  |  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH 2/7] UBC: core (structures, API) [message #5459 is a reply to message #5196] | Sun, 20 August 2006 05:01   |  
			| 
				
				
					|  Balbir Singh Messages: 491
 Registered: August 2006
 | Senior Member |  |  |  
	| > + > +void __put_beancounter(struct user_beancounter *ub)
 > +{
 > +	unsigned long flags;
 > +	struct user_beancounter *parent;
 > +
 > +again:
 > +	parent = ub->parent;
 > +	/* equevalent to atomic_dec_and_lock_irqsave() */
 > +	local_irq_save(flags);
 > +	if (likely(!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock))) {
 > +		if (unlikely(atomic_read(&ub->ub_refcount) < 0))
 > +			printk(KERN_ERR "UB: Bad ub refcount: ub=%p, "
 > +					"luid=%d, ref=%d\n",
 > +					ub, ub->ub_uid,
 > +					atomic_read(&ub->ub_refcount));
 > +		local_irq_restore(flags);
 > +		return;
 
 Minor comment - the printk (I think there is one other place) could come after
 the local_irq_restore()
 
 --
 
 Balbir Singh,
 Linux Technology Center,
 IBM Software Labs
 |  
	|  |  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH 2/7] UBC: core (structures, API) [message #5460 is a reply to message #5196] | Sun, 20 August 2006 04:58   |  
			| 
				
				
					|  Balbir Singh Messages: 491
 Registered: August 2006
 | Senior Member |  |  |  
	| Kirill Korotaev wrote: 
 > +/*
 > + *	Resource list.
 > + */
 > +
 > +#define UB_RESOURCES	0
 > +
 > +struct ubparm {
 > +	/*
 > +	 * A barrier over which resource allocations are failed gracefully.
 > +	 * e.g. if the amount of consumed memory is over the barrier further
 > +	 * sbrk() or mmap() calls fail, the existing processes are not killed.
 > +	 */
 > +	unsigned long	barrier;
 > +	/* hard resource limit */
 > +	unsigned long	limit;
 > +	/* consumed resources */
 > +	unsigned long	held;
 > +	/* maximum amount of consumed resources through the last period */
 > +	unsigned long	maxheld;
 > +	/* minimum amount of consumed resources through the last period */
 > +	unsigned long	minheld;
 > +	/* count of failed charges */
 > +	unsigned long	failcnt;
 > +};
 > +
 
 Comments to the side of the field would make it easier to read and understand
 the structure. I think there are already other comments requesting for renaming
 of the barrier field to hard_limit.
 
 <snip>
 
 > +static inline void ub_adjust_held_minmax(struct user_beancounter *ub,
 > +		int resource)
 > +{
 > +	if (ub->ub_parms[resource].maxheld < ub->ub_parms[resource].held)
 > +		ub->ub_parms[resource].maxheld = ub->ub_parms[resource].held;
 > +	if (ub->ub_parms[resource].minheld > ub->ub_parms[resource].held)
 > +		ub->ub_parms[resource].minheld = ub->ub_parms[resource].held;
 > +}
 
 A comment here to clarify what the function does would be helpful, specially due
 to the comparison above
 
 if (maxheld < held)
 maxheld = held
 if (minheld > held)
 minheld = held
 
 <snip>
 
 > +struct user_beancounter ub0;
 
 How about global_ub or init_ub?
 
 > +
 > +#define ub_hash_fun(x) ((((x) >> 8) ^ (x)) & (UB_HASH_SIZE - 1))
 > +#define ub_subhash_fun(p, id) ub_hash_fun((p)->ub_uid + (id) * 17)
 > +
 
 What hash properties are we looking for in the hash function? Is the hash
 function universal?
 
 > +struct hlist_head ub_hash[UB_HASH_SIZE];
 > +spinlock_t ub_hash_lock;
 > +
 > +EXPORT_SYMBOL(ub_hash);
 > +EXPORT_SYMBOL(ub_hash_lock);
 > +
 > +/*
 > + *	Per user resource beancounting. Resources are tied to their luid.
 > + *	The resource structure itself is tagged both to the process and
 > + *	the charging resources (a socket doesn't want to have to search for
 > + *	things at irq time for example). Reference counters keep things in
 > + *	hand.
 > + *
 > + *	The case where a user creates resource, kills all his processes and
 > + *	then starts new ones is correctly handled this way. The refcounters
 > + *	will mean the old entry is still around with resource tied to it.
 > + */
 > +
 > +struct user_beancounter *beancounter_findcreate(uid_t uid,
 > +		struct user_beancounter *p, int mask)
 > +{
 > +	struct user_beancounter *new_ub, *ub, *tmpl_ub;
 > +	unsigned long flags;
 > +	struct hlist_head *slot;
 > +	struct hlist_node *pos;
 > +
 > +	if (mask & UB_LOOKUP_SUB) {
 > +		WARN_ON(p == NULL);
 > +		tmpl_ub = &default_subbeancounter;
 > +		slot = &ub_hash[ub_subhash_fun(p, uid)];
 > +	} else {
 > +		WARN_ON(p != NULL);
 > +		tmpl_ub = &default_beancounter;
 > +		slot = &ub_hash[ub_hash_fun(uid)];
 > +	}
 > +	new_ub = NULL;
 > +
 > +retry:
 > +	spin_lock_irqsave(&ub_hash_lock, flags);
 > +	hlist_for_each_entry (ub, pos, slot, hash)
 > +		if (ub->ub_uid == uid && ub->parent == p)
 > +			break;
 > +
 > +	if (pos != NULL) {
 > +		get_beancounter(ub);
 > +		spin_unlock_irqrestore(&ub_hash_lock, flags);
 > +
 > +		if (new_ub != NULL) {
 > +			put_beancounter(new_ub->parent);
 > +			kmem_cache_free(ub_cachep, new_ub);
 > +		}
 
 A comment indicative of this being a part of race handing would be useful.
 Could you please consider refactoring this function if possible.
 
 > +		return ub;
 > +	}
 > +
 > +	if (!(mask & UB_ALLOC))
 > +		goto out_unlock;
 > +
 > +	if (new_ub != NULL)
 > +		goto out_install;
 > +
 > +	if (mask & UB_ALLOC_ATOMIC) {
 > +		new_ub = kmem_cache_alloc(ub_cachep, GFP_ATOMIC);
 > +		if (new_ub == NULL)
 > +			goto out_unlock;
 > +
 > +		memcpy(new_ub, tmpl_ub, sizeof(*new_ub));
 > +		init_beancounter_struct(new_ub, uid);
 > +		if (p)
 > +			new_ub->parent = get_beancounter(p);
 > +		goto out_install;
 > +	}
 > +
 > +	spin_unlock_irqrestore(&ub_hash_lock, flags);
 > +
 > +	new_ub = kmem_cache_alloc(ub_cachep, GFP_KERNEL);
 > +	if (new_ub == NULL)
 > +		goto out;
 > +
 > +	memcpy(new_ub, tmpl_ub, sizeof(*new_ub));
 > +	init_beancounter_struct(new_ub, uid);
 > +	if (p)
 > +		new_ub->parent = get_beancounter(p);
 > +	goto retry;
 > +
 > +out_install:
 > +	hlist_add_head(&new_ub->hash, slot);
 > +out_unlock:
 > +	spin_unlock_irqrestore(&ub_hash_lock, flags);
 > +out:
 > +	return new_ub;
 > +}
 > +
 > +EXPORT_SYMBOL(beancounter_findcreate);
 > +
 
 <snip>
 
 > +void __put_beancounter(struct user_beancounter *ub)
 > +{
 > +	unsigned long flags;
 > +	struct user_beancounter *parent;
 > +
 > +again:
 > +	parent = ub->parent;
 > +	/* equevalent to atomic_dec_and_lock_irqsave() */
 > +	local_irq_save(flags);
 > +	if (likely(!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock))) {
 > +		if (unlikely(atomic_read(&ub->ub_refcount) < 0))
 > +			printk(KERN_ERR "UB: Bad ub refcount: ub=%p, "
 > +					"luid=%d, ref=%d\n",
 > +					ub, ub->ub_uid,
 > +					atomic_read(&ub->ub_refcount));
 > +		local_irq_restore(flags);
 > +		return;
 > +	}
 > +
 > +	if (unlikely(ub == &ub0)) {
 > +		printk(KERN_ERR "Trying to put ub0\n");
 > +		spin_unlock_irqrestore(&ub_hash_lock, flags);
 > +		return;
 > +	}
 > +
 > +	verify_held(ub);
 > +	hlist_del(&ub->hash);
 > +	spin_unlock_irqrestore(&ub_hash_lock, flags);
 
 Is this function called with the ub_hash_lock held()? A comment would be useful
 or you could call it __put_beancounter_locked :-)
 
 > +
 > +	kmem_cache_free(ub_cachep, ub);
 > +
 > +	ub = parent;
 > +	if (ub != NULL)
 > +		goto again;
 
 Could you please convert this to a do {} while() loop.
 
 > +}
 > +
 > +EXPORT_SYMBOL(__put_beancounter);
 
 <snip>
 
 > +int charge_beancounter(struct user_beancounter *ub,
 > +		int resource, unsigned long val, enum severity strict)
 > +{
 > +	int retval;
 > +	struct user_beancounter *p, *q;
 > +	unsigned long flags;
 > +
 > +	retval = -EINVAL;
 > +	BUG_ON(val > UB_MAXVALUE);
 > +
 > +	local_irq_save(flags);
 > +	for (p = ub; p != NULL; p = p->parent) {
 > +		spin_lock(&p->ub_lock);
 > +		retval = __charge_beancounter_locked(p, resource, val, strict);
 
 Everyone in the hierarchy is charged the same amount - val?
 
 > +		spin_unlock(&p->ub_lock);
 > +		if (retval)
 > +			goto unroll;
 > +	}
 > +out_restore:
 > +	local_irq_restore(flags);
 > +	return retval;
 > +
 > +unroll:
 > +	for (q = ub; q != p; q = q->parent) {
 > +		spin_lock(&q->ub_lock);
 > +		__uncharge_beancounter_locked(q, resource, val);
 > +		spin_unlock(&q->ub_lock);
 > +	}
 > +	goto out_restore;
 
 Too many goto's in both directions - please consider refactoring
 
 > +void charge_beancounter_notop(struct user_beancounter *ub,
 > +		int resource, unsigned long val)
 
 Whats the meaning of notop?
 
 > +{
 > +	struct user_beancounter *p;
 > +	unsigned long flags;
 > +
 > +	local_irq_save(flags);
 > +	for (p = ub; p->parent != NULL; p = p->parent) {
 > +		spin_lock(&p->ub_lock);
 > +		__charge_beancounter_locked(p, resource, val, UB_FORCE);
 > +		spin_unlock(&p->ub_lock);
 > +	}
 > +	local_irq_restore(flags);
 > +}
 > +
 
 Could some of this code be shared with charge_beancounter to avoid duplication?
 
 > +EXPORT_SYMBOL(charge_beancounter_notop);
 > +
 > +void __uncharge_beancounter_locked(struct user_beancounter *ub,
 > +		int resource, unsigned long val)
 > +{
 > +	if (unlikely(ub->ub_parms[resource].held < val)) {
 > +		ub_print_resource_warning(ub, resource,
 > +				"uncharging too much", val, 0);
 > +		val = ub->ub_parms[resource].held;
 > +	}
 > +	ub->ub_parms[resource].held -= val;
 > +	ub_adjust_held_minmax(ub, resource);
 > +}
 > +
 > +void uncharge_beancounter(struct user_beancounter *ub,
 > +		int resource, unsigned long val)
 > +{
 > +	unsigned long flags;
 > +	struct user_beancounter *p;
 > +
 > +	for (p = ub; p != NULL; p = p->parent) {
 > +		spin_lock_irqsave(&p->ub_lock, flags);
 > +		__uncharge_beancounter_locked(p, resource, val);
 > +		spin_unlock_irqrestore(&p->ub_lock, flags);
 > +	}
 > +}
 > +
 > +EXPORT_SYMBOL(uncharge_beancounter);
 > +
 > +void uncharge_beancounter_notop(struct user_beancounter *ub,
 > +		int resource, unsigned long val)
 > +{
 > +	struct user_beancounter *p;
 > +	unsigned long flags;
 > +
 > +	local_irq_save(flags);
 > +	for (p = ub; p->parent != NULL; p = p->parent) {
 > +		spin_lock(&p->ub_lock);
 > +		__uncharge_beancounter_locked(p, resource, val);
 > +		spin_unlock(&p->ub_lock);
 > +	}
 > +	local_irq_restore(flags);
 > +}
 > +
 
 The code for both uncharge_beancounter() and uncharge_beancounter_notop() seems
 to do the same thing
 
 > +
 > +void __init ub_init_late(void)
 > +{
 > +	struct user_beancounter *ub;
 > +
 > +	ub_cachep = kmem_cache_create("user_beancounters",
 > +			sizeof(struct user_beancounter),
 > +			0, SLAB_HWCACHE_ALIGN, NULL, NULL);
 > +	if (ub_cachep == NULL)
 > +		panic("Can't create ubc caches\n");
 > +
 > +	ub = &default_beancounter;
 
 Whats the relationship b
...
 
 
 |  
	|  |  |  
	| 
		
			| Re: [ckrm-tech] [PATCH 4/7] UBC: syscalls (user interface) [message #5465 is a reply to message #5434] | Mon, 21 August 2006 02:47   |  
			| 
				
				
					|  Magnus Damm Messages: 8
 Registered: August 2006
 | Junior Member |  |  |  
	| On Fri, 2006-08-18 at 07:45 -0700, Dave Hansen wrote: > On Fri, 2006-08-18 at 12:08 +0400, Andrey Savochkin wrote:
 > >
 > > A) Have separate memory management for each container,
 > >    with separate buddy allocator, lru lists, page replacement mechanism.
 > >    That implies a considerable overhead, and the main challenge there
 > >    is sharing of pages between these separate memory managers.
 >
 > Hold on here for just a sec...
 >
 > It is quite possible to do memory management aimed at one container
 > while that container's memory still participates in the main VM.
 >
 > There is overhead here, as the LRU scanning mechanisms get less
 > efficient, but I'd rather pay a penalty at LRU scanning time than divide
 > up the VM, or coarsely start failing allocations.
 
 This could of course be solved with one LRU per container, which is how
 the CKRM memory controller implemented things about a year ago.
 
 / magnus
 |  
	|  |  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH 5/7] UBC: kernel memory accounting	(core) [message #5473 is a reply to message #5405] | Mon, 21 August 2006 08:56   |  
			| 
				
				
					|  dev Messages: 1693
 Registered: September 2005
 Location: Moscow
 | Senior Member |  
 |  |  
	| Dave Hansen wrote: > On Fri, 2006-08-18 at 12:12 +0400, Kirill Korotaev wrote:
 >
 >>LDT takes from 1 to 16 pages. and is allocated by vmalloc.
 >>do you propose to replace it with slab which can fail due to memory
 >>fragmentation?
 >
 >
 > Nope.  ;)
 so what is your proposal then? Sorry, probably missed it due to lots of emails :)
 
 >>the same applies to fdset, fdarray, ipc ids and iptables entries.
 >
 >
 > The vmalloc area, along with all of those other structures _have_ other
 > data structures.  Now, it will take a wee bit more patching to directly
 > tag those thing with explicit container pointers (or accounting
 > references), but I would much prefer that, especially for the things
 > that are larger than a page.
 do you mean that you prefer adding a explicit pointer to the structures
 itself?
 
 > I worry that this approach was used instead of patching all of the
 > individual subsystems because this was easier to maintain as an
 > out-of-tree patch, and it isn't necessarily the best approach.
 :) if we were to optimize for patch size then we would select vserver
 approach and be happy...
 
 Dave, we used to add UBC pointers on each data structure and then do
 a separate accounting in the places where objects are allocated.
 We spent a lot of time and investigation on how to make it better,
 because it was leading to often accounting errors, wrong error paths etc.
 The approach provided in this patchset proved to be much more efficient
 and more error prone. And it is much much more elegant!
 
 Thanks,
 Kirill
 |  
	|  |  |  
	|  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH 5/7] UBC: kernel memory accounting	(core) [message #5478 is a reply to message #5403] | Mon, 21 August 2006 10:38   |  
			| 
				
				
					|  dev Messages: 1693
 Registered: September 2005
 Location: Moscow
 | Senior Member |  
 |  |  
	| Dave Hansen wrote: > On Fri, 2006-08-18 at 13:31 +0400, Kirill Korotaev wrote:
 >
 >>they all are troublesome :/
 >>user can create lots of vmas, w/o page tables.
 >>lots of fdsets, ipcids.
 >>These are not reclaimable.
 >
 >
 > I guess one of my big questions surrounding these patches is why the
 > accounting is done with pages.
 probably you missed patch details a bit.
 accounting is done:
 1. in pages for objects allocated by buddy allocator
 2. in slabs for objects allocated from caches
 
 > If there really is a need to limit these
 > different kernel objects, then why not simply write patches to limit
 > *these* *objects*?  I trust there is a very good technical reason for
 > doing this, I just don't understand why, yet.
 The one reason is that such an accounting allows to estimate the memory
 used/required by containers, while limitations by objects:
 - per object accounting/limitations do not provide any memory estimation
 - having a big number of reasonably high limits still allows the user
 to consume big amount of memory. I.e. the sum of all the limits tend
 to be high and potentially DoS exploitable :/
 - memory is easier to setup/control from user POV.
 having hundreds of controls is good, but not much user friendly.
 
 Thanks,
 Kirill
 |  
	|  |  |  
	|  |  
	|  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH 5/7] UBC: kernel memory accounting	(core) [message #5484 is a reply to message #5407] | Mon, 21 August 2006 12:36   |  
			| 
				
				
					|  dev Messages: 1693
 Registered: September 2005
 Location: Moscow
 | Senior Member |  
 |  |  
	| Dave Hansen wrote: > On Fri, 2006-08-18 at 13:31 +0400, Kirill Korotaev wrote:
 >
 >>they all are troublesome :/
 >>user can create lots of vmas, w/o page tables.
 >>lots of fdsets, ipcids.
 >>These are not reclaimable.
 >
 >
 > In the real world, with the customers to which you've given these
 > patches, which of these objects is most likely to be consuming the most
 > space?  Is there one set of objects that we could work on that would fix
 > _most_ of the cases which you have encountered?
 the question is not about which one consumes more in "real life".
 The question is "which of the resources are allocated on user demand
 and should be limited for the environment to be secure and isolated".
 
 >>Also consider the following scenario with reclaimable page tables.
 >>e.g. user hit kmemsize limit due to fat page tables.
 >>kernel reclaims some of the page tables and frees user kenerl memory.
 >>after that user creates some uncreclaimable objects like fdsets or ipcs
 >>and then accesses memory with reclaimed page tables.
 >>Sooner or later we kill user with SIGSEGV from page fault due to
 >>no memory. This is worse then returning ENOMEM from poll() or
 >>mmap() where user allocates kernel objects.
 >
 >
 > I think you're claiming that doing reclaim of kernel objects causes much
 > more severe and less recoverable errors than does reclaiming of user
 > pages.
 I also claim that reclaiming some of kernel pages is almost undoable :)
 Look, the page can be used by slab objects from 2 different containers.
 Why one container should suffer from the second one which needs to be reclaimed?
 What to do? separate allocators per container? And if you want to reclaim
 say a page with vma you need to replaces tons of pointers all over the objects
 in SMP safe manner.
 
 That might generally be true, but I have one example that's
 > killing me.  (You shouldn't have mentioned mmap ;)
 >
 > Let's say you have a memory area mapped by one pagetable page, and with
 > 1 user page mapped in it.  The system is out of memory, and if you
 > reclaim either the pagetable page or the user page, you're never going
 > to get it back.
 >
 > So, you pick the user page to reclaim.  The user touches it, the memory
 > allocation fails, and the process gets killed.
 >
 > Instead, you reclaim the pagetable page.  The user touches their page,
 > the memory allocation for the pagetable fails, and the process gets
 > killed.
 >
 > Seems like the same end result to me.
 because you suggest the same limit for pagetables and user memory.
 Thats why we have separate kmemsize limit for kernel objects and privvmpages for
 user memory.
 privvmpages limit hit will result in -ENOMEM on mmap() system call,
 which is memory friendly.
 
 Kirill
 |  
	|  |  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH] UBC: user resource beancounters [message #5485 is a reply to message #5425] | Mon, 21 August 2006 13:21   |  
			| 
				
				
					|  dev Messages: 1693
 Registered: September 2005
 Location: Moscow
 | Senior Member |  
 |  |  
	| Chandra Seetharaman wrote: > Kirill,
 >
 > Here are some concerns I have (as of now) w.r.t using UBC for resource
 > management (in the context of resource groups).
 >
 > - guarantee support is missing. I do not see any code to provide the
 >   minimum amount of resource a group can get. It is important for
 >   providing QoS. (In a different email you did mention guarantee, i am
 >   referring it here for completeness).
 I mentioned a couple of times that this is a limited core functionality
 in this patch set.
 guarantees are implementable as a separate UBC parameters.
 
 > - Creation of a UBC and assignment of task to a UBC always happen in
 >   the context of the task that is affected. I can understand it works in
 >   OpenVZ environment, but IMO has issues if one wants it to be used for
 >   basic resource management
 >    - application needs to be changed to use this feature.
 >    - System administrator does not have the control to assign tasks to a
 >      UBC. Application does by itself.
 >    - Assignment of task to a UBC need to be transparent to the
 >      application.
 this is not 100% true.
 UBC itself doesn't prevent from changing context on the fly.
 But since this leads to part of resources to be charged to
 one UBC and another part to another UBC and so long and so
 forth, we believe that more clear and correct interface is
 something like fork()/exec()-required-application.
 
 So you can always execute new applications in desired UB and
 NO application modification are required.
 
 
 > - UBC is deleted when the last task (in that UBC) exits. For resource
 >   management purposes, UBC should be deleted only when the administrator
 >   deletes it.
 1. UBCs are freed when last _resource_ using it puts the last reference.
 not the task. And it is a BIG error IMHO to think that resource
 management should group tasks. No, it should group _objects_. Tasks
 are just the same objects like say sockets.
 2. this is easily changeable. You are the only who requested it so far.
 3. kernel does so for many other objects like users and no one complains :)
 
 > - No ability to set resource specific configuration information.
 UBC model allows to _limit_ users. It is _core_.
 We want to do resource management step by step and send it patch by patch,
 while you are trying to solve everything at once.
 
 sys_open() for example doesn't allow to open sockets, does it?
 the same for UBC. They do what they are supposed to do.
 
 > - No ability to maintain resource specific data in the controller.
 it's false. fields can be added to user_beancounter struct easily.
 and that's what our controllers do.
 
 > - No ability to get the list of tasks belonging to a UBC.
 it is not true. it can be read from /proc or system calls interface,
 just like the way one finds all tasks belonging to one user :)
 
 BTW, what is so valueable in this feature?
 do you want to have interfaces to find kernel structures and even pages
 which belong to the container? tasks are just one type of objects...
 
 > - Doesn't inform the resource controllers when limits(shares) change.
 As was answered and noted by Alan Cox:
 1. no one defined what type of action should be done when limits change
 2. it is extendable _when_ needed. Do you want to introduce hooks just
 to have them?
 3. is it so BIG obstacle for UBC patch? These 3-lines hooks code which
 is not used?
 
 > - Doesn't inform the resource controllers when a task's UBC has changed.
 the same as above. we don't add functionality which is not used YET
 (and no one even knows HOW).
 
 > - Doesn't recalculate the resource usage when a task's UBC has changed.
 >   i.e doesn't uncharge the old UBC and charge new UBC.
 You probably missed my explanation, that most
 resources (except for the simplest one - numproc) can't be recharged
 easily. And  nothing in UBC code prevents such recharge to be added later
 if requested.
 
 > - For a system administrator name for identification of a UBC is
 >   better than a number (uid).
 Have you any problems with pids, uids, gids and signals?
 It is a question of interface. I don't mind in changing UBC interface even
 to configfs if someone really wants it.
 
 Thanks,
 Kirill
 |  
	|  |  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH 5/7] UBC: kernel memory accounting (core) [message #5487 is a reply to message #5478] | Mon, 21 August 2006 15:10   |  
			| 
				
				
					|  Dave Hansen Messages: 240
 Registered: October 2005
 | Senior Member |  |  |  
	| On Mon, 2006-08-21 at 14:40 +0400, Kirill Korotaev wrote: > The one reason is that such an accounting allows to estimate the
 > memory
 > used/required by containers, while limitations by objects:
 > - per object accounting/limitations do not provide any memory
 > estimation
 
 I know you're more clever than _that_. ;)
 
 > - having a big number of reasonably high limits still allows the user
 >   to consume big amount of memory. I.e. the sum of all the limits tend
 >   to be high and potentially DoS exploitable :/
 > - memory is easier to setup/control from user POV.
 >   having hundreds of controls is good, but not much user friendly.
 
 I'm actually starting to think that some accounting memory usage *only*
 in the slab, plus a few other structures for any stragglers not using
 the slab would suffice.  Since the slab knows the size of the objects,
 there is no ambiguity about how many are being used.  It should also be
 a pretty generic way to control individual object types, if anyone else
 should ever need it.
 
 The high level pages-used-per-container metric is really nice for just
 that, containers.  But, I wonder if other users would find it useful if
 there were more precise ways of controlling individual object usage.
 
 -- Dave
 |  
	|  |  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH 3/7] UBC: ub context and inheritance [message #5492 is a reply to message #5477] | Mon, 21 August 2006 20:48   |  
			| 
				
				
					|  Chandra Seetharaman Messages: 88
 Registered: August 2006
 | Member |  |  |  
	| On Mon, 2006-08-21 at 14:32 +0400, Kirill Korotaev wrote: > Chandra Seetharaman wrote:
 > > On Wed, 2006-08-16 at 19:38 +0400, Kirill Korotaev wrote:
 > >
 > >>Contains code responsible for setting UB on task,
 > >>it's inheriting and setting host context in interrupts.
 > >>
 > >>Task references three beancounters:
 > >>  1. exec_ub  current context. all resources are
 > >>              charged to this beancounter.
 > >>  2. task_ub  beancounter to which task_struct is
 > >>              charged itself.
 > >
 > >
 > > I do not see why task_ub is needed ? i do not see it being used
 > > anywhere.
 > it is used to charge task itself. will be heavily used in next patch set
 > adding "numproc" UBC parameter.
 
 Well, from your other responses it sounded like you are including
 code/data structure/functionality only when they are used. So, I wasn't
 clear if this is missed out on cleanup or really part of UBC.
 
 Besides, if this is needed only for a specific controller, shouldn't the
 controller worry about maintaining it ?
 
 >
 > >>  3. fork_sub beancounter which is inherited by
 > >>              task's children on fork
 > >
 > >
 > >>From other emails it looks like renaming fork/exec to be real/effective
 > > will be easier to understand.
 > there is no "real". exec_ub is effective indeed,
 > but fork_sub is the one to inherit on fork().
 
 IMO, fork_cb/exec_cb doesn't convey the real meaning of the usage.
 >
 > Kirill
 >
 >
 >  ------------------------------------------------------------ -------------
 > Using Tomcat but need to do more? Need to support web services, security?
 > Get stuff done quickly with pre-integrated technology to make your job easier
 > Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
 >  http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&b id=263057&dat=121642
 > _______________________________________________
 > ckrm-tech mailing list
 > https://lists.sourceforge.net/lists/listinfo/ckrm-tech
 --
 
 ------------------------------------------------------------ ----------
 Chandra Seetharaman               | Be careful what you choose....
 - sekharan@us.ibm.com   |      .......you may get it.
 ------------------------------------------------------------ ----------
 |  
	|  |  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH 5/7] UBC: kernel memory accounting	(core) [message #5493 is a reply to message #5480] | Mon, 21 August 2006 20:55   |  
			| 
				
				
					|  Chandra Seetharaman Messages: 88
 Registered: August 2006
 | Member |  |  |  
	| On Mon, 2006-08-21 at 14:51 +0400, Kirill Korotaev wrote: > Chandra Seetharaman wrote:
 > > Kirill,
 > >
 > > IMO, a UBC with resource constraint(limit in this case) should behave no
 > > different than a kernel with limited memory. i.e it should do
 > > reclamation before it starts failing allocation requests. It could even
 > > do it preemptively.
 > first, please notice, that this thread is not about user memory.
 > we can discuss it later when about to control user memory. And
 > I still need to notice, that different models of user memory control
 > can exist. With and without reclamation.
 >
 we can talk about it then :)
 
 > > There is no guarantee support which is required for providing QoS.
 > where? in UBC? in UBC _there_ are guarentees, even in regard to OOM killer.
 
 I do not see it in the patches you have submitted. May be I overlooked.
 Can you please point me the code where guarantee is handled.
 
 >
 > > Each controller modifying the infrastructure code doesn't look good. We
 > > can have proper interfaces to add a new resource controller.
 > controllers do not modify interfaces nor core. They just add
 > themself to the list of resources and setup default limits.
 > do you think it is worth creating infrastructure for these
 > 2 one-line-changes?
 
 Yes, IMO, it is cleaner.
 
 Think of the documentation that explains how to write a controller for
 UBC.
 
 With a proper interface it will read something like: One have to call
 register_controller(char *name) and on success it returns a unique id
 which is the id for the controller.
 
 Vs
 
 With changing lines in the core code: One have to edit the file
 filename.c and add a macro to this of macros with an incremented value
 for their controller and add the name of their controller to the array
 named controller_names[].
 
 I think the first one is cleaner, what do you think ?
 
 <snip>
 
 --
 
 ------------------------------------------------------------ ----------
 Chandra Seetharaman               | Be careful what you choose....
 - sekharan@us.ibm.com   |      .......you may get it.
 ------------------------------------------------------------ ----------
 |  
	|  |  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH] UBC: user resource beancounters [message #5494 is a reply to message #5485] | Mon, 21 August 2006 21:45   |  
			| 
				
				
					|  Chandra Seetharaman Messages: 88
 Registered: August 2006
 | Member |  |  |  
	| On Mon, 2006-08-21 at 17:24 +0400, Kirill Korotaev wrote: > Chandra Seetharaman wrote:
 > > Kirill,
 > >
 > > Here are some concerns I have (as of now) w.r.t using UBC for resource
 > > management (in the context of resource groups).
 > >
 > > - guarantee support is missing. I do not see any code to provide the
 > >   minimum amount of resource a group can get. It is important for
 > >   providing QoS. (In a different email you did mention guarantee, i am
 > >   referring it here for completeness).
 > I mentioned a couple of times that this is a limited core functionality
 > in this patch set.
 > guarantees are implementable as a separate UBC parameters.
 
 I will wait for oomguarpages patches :)
 
 >
 > > - Creation of a UBC and assignment of task to a UBC always happen in
 > >   the context of the task that is affected. I can understand it works in
 > >   OpenVZ environment, but IMO has issues if one wants it to be used for
 > >   basic resource management
 > >    - application needs to be changed to use this feature.
 > >    - System administrator does not have the control to assign tasks to a
 > >      UBC. Application does by itself.
 > >    - Assignment of task to a UBC need to be transparent to the
 > >      application.
 > this is not 100% true.
 > UBC itself doesn't prevent from changing context on the fly.
 > But since this leads to part of resources to be charged to
 > one UBC and another part to another UBC and so long and so
 
 Let the controllers and the users worry about that part.
 
 As I mentioned UBC might be perfect for container resource management,
 but what I am talking for is resource management _without_ a container.
 
 > forth, we believe that more clear and correct interface is
 > something like fork()/exec()-required-application.
 >
 > So you can always execute new applications in desired UB and
 > NO application modification are required.
 
 For generic workload management/resource management _desired UB_ is not
 necessarily decided at fork/exec time. It can happen anytime during the
 life cycle of a task.
 
 >
 >
 > > - UBC is deleted when the last task (in that UBC) exits. For resource
 > >   management purposes, UBC should be deleted only when the administrator
 > >   deletes it.
 > 1. UBCs are freed when last _resource_ using it puts the last reference.
 >  not the task. And it is a BIG error IMHO to think that resource
 >  management should group tasks. No, it should group _objects_. Tasks
 >  are just the same objects like say sockets.
 
 No argument there, that is how CKRM was early last year. But, I do not
 see how is related to the point I am making above (" UBC should be
 deleted only when the administrator deletes it").
 
 > 2. this is easily changeable. You are the only who requested it so far.
 
 It may be because I am the only one looking at it without the
 "container" goggles on :).
 
 > 3. kernel does so for many other objects like users and no one complains :)
 >
 > > - No ability to set resource specific configuration information.
 > UBC model allows to _limit_ users. It is _core_.
 
 I think you got me wrong here. What I want is the ability to
 set/maintain a generic controller specific information.
 
 For example, if the CPU controller wants to allow the user to set the
 number of seconds over which the user wants the guarantee/limit to be
 imposed.
 
 > We want to do resource management step by step and send it patch by patch,
 > while you are trying to solve everything at once.
 >
 > sys_open() for example doesn't allow to open sockets, does it?
 > the same for UBC. They do what they are supposed to do.
 
 I do not see how this relates !!
 
 >
 > > - No ability to maintain resource specific data in the controller.
 > it's false. fields can be added to user_beancounter struct easily.
 > and that's what our controllers do.
 
 With the model of static array for resources (struct ubparm ub_parms
 [UB_RESOURCES] in struct user_beancounter), it is not be possible to
 attach _different_ "controller specific" information to each of the
 entries.
 
 I do not think it is good idea to add controller specific information of
 _different_ controllers to the user_beancounter. Think of all the fields
 it will have when all the numproc controller needs is just the basic 3-4
 fields.
 
 >
 > > - No ability to get the list of tasks belonging to a UBC.
 > it is not true. it can be read from /proc or system calls interface,
 > just like the way one finds all tasks belonging to one user :)
 >
 > BTW, what is so valueable in this feature?
 
 Again, it may not be useful for container type usages (you can probably
 get the list from somewhere else, but for resource management it is
 useful for sysadmins).
 
 > do you want to have interfaces to find kernel structures and even pages
 > which belong to the container? tasks are just one type of objects...
 >
 > > - Doesn't inform the resource controllers when limits(shares) change.
 > As was answered and noted by Alan Cox:
 > 1. no one defined what type of action should be done when limits change
 
 let the controller decide it.
 > 2. it is extendable _when_ needed. Do you want to introduce hooks just
 >    to have them?
 > 3. is it so BIG obstacle for UBC patch? These 3-lines hooks code which
 >    is not used?
 >
 > > - Doesn't inform the resource controllers when a task's UBC has changed.
 > the same as above. we don't add functionality which is not used YET
 > (and no one even knows HOW).
 >
 > > - Doesn't recalculate the resource usage when a task's UBC has changed.
 > >   i.e doesn't uncharge the old UBC and charge new UBC.
 > You probably missed my explanation, that most
 > resources (except for the simplest one - numproc) can't be recharged
 > easily. And  nothing in UBC code prevents such recharge to be added later
 > if requested.
 
 My point is that controllers should have this control. I am ok with
 these being added later. Wondering if there is any design limitations
 that would prevent the later additions (like the _controller specific
 data above).
 
 >
 > > - For a system administrator name for identification of a UBC is
 > >   better than a number (uid).
 > Have you any problems with pids, uids, gids and signals?
 
 Again, in container land each UB is attached with a container hence no
 issue.
 
 In a non-container situation IMO it will be easier to manage/associate
 "gold", "silver", "bronze", "plastic" groups than 0, 11, 83 and 113.
 
 
 > It is a question of interface. I don't mind in changing UBC interface even
 > to configfs if someone really wants it.
 >
 > Thanks,
 > Kirill
 >
 >  ------------------------------------------------------------ -------------
 > Using Tomcat but need to do more? Need to support web services, security?
 > Get stuff done quickly with pre-integrated technology to make your job easier
 > Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
 >  http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&b id=263057&dat=121642
 > _______________________________________________
 > ckrm-tech mailing list
 > https://lists.sourceforge.net/lists/listinfo/ckrm-tech
 --
 
 ------------------------------------------------------------ ----------
 Chandra Seetharaman               | Be careful what you choose....
 - sekharan@us.ibm.com   |      .......you may get it.
 ------------------------------------------------------------ ----------
 |  
	|  |  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH] UBC: user resource beancounters [message #5495 is a reply to message #5494] | Mon, 21 August 2006 22:01   |  
			| 
				
				
					|  Alan Cox Messages: 48
 Registered: May 2006
 | Member |  |  |  
	| Ar Llu, 2006-08-21 am 14:45 -0700, ysgrifennodd Chandra Seetharaman: > As I mentioned UBC might be perfect for container resource management,
 > but what I am talking for is resource management _without_ a container.
 
 There isn't really a difference. UBC counts usage of things. It has to
 know who to charge the thing to but its core concept of the luid isn't a
 container, its more akin to the a departmental or project billing code.
 
 > > 3. is it so BIG obstacle for UBC patch? These 3-lines hooks code which
 > >    is not used?
 
 Add them later when they prove to be needed. If IBM send a feature that
 needs it then add them in that feature. Everyone is happy it is possible
 to add that hook when needed.
 
 > In a non-container situation IMO it will be easier to manage/associate
 > "gold", "silver", "bronze", "plastic" groups than 0, 11, 83 and 113.
 
 User space issue. Doing that in kernel will lead to some limitations
 later on and end up needing the user space anyway. Consider wanting to
 keep the container name and properties in LDAP.
 |  
	|  |  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH] UBC: user resource beancounters [message #5496 is a reply to message #5495] | Mon, 21 August 2006 22:44   |  
			| 
				
				
					|  Chandra Seetharaman Messages: 88
 Registered: August 2006
 | Member |  |  |  
	| On Mon, 2006-08-21 at 23:20 +0100, Alan Cox wrote: > Ar Llu, 2006-08-21 am 14:45 -0700, ysgrifennodd Chandra Seetharaman:
 > > As I mentioned UBC might be perfect for container resource management,
 > > but what I am talking for is resource management _without_ a container.
 >
 > There isn't really a difference. UBC counts usage of things. It has to
 > know who to charge the thing to but its core concept of the luid isn't a
 > container, its more akin to the a departmental or project billing code.
 
 I didn't say it is different. The way it is implemented now has some
 restrictions for generic resource management purposes (like ability to
 move task around), but they are not a problem for container type usage.
 
 >
 > > > 3. is it so BIG obstacle for UBC patch? These 3-lines hooks code which
 > > >    is not used?
 >
 > Add them later when they prove to be needed. If IBM send a feature that
 > needs it then add them in that feature. Everyone is happy it is possible
 > to add that hook when needed.
 
 As I mentioned in my reply, I am ok with adding it later.
 
 >
 > > In a non-container situation IMO it will be easier to manage/associate
 > > "gold", "silver", "bronze", "plastic" groups than 0, 11, 83 and 113.
 >
 > User space issue. Doing that in kernel will lead to some limitations
 > later on and end up needing the user space anyway. Consider wanting to
 > keep the container name and properties in LDAP.
 >
 >
 >
 >
 >  ------------------------------------------------------------ -------------
 > Using Tomcat but need to do more? Need to support web services, security?
 > Get stuff done quickly with pre-integrated technology to make your job easier
 > Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
 >  http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&b id=263057&dat=121642
 > _______________________________________________
 > ckrm-tech mailing list
 > https://lists.sourceforge.net/lists/listinfo/ckrm-tech
 --
 
 ------------------------------------------------------------ ----------
 Chandra Seetharaman               | Be careful what you choose....
 - sekharan@us.ibm.com   |      .......you may get it.
 ------------------------------------------------------------ ----------
 |  
	|  |  |  
	| 
		
			| Re: [ckrm-tech] [PATCH 4/7] UBC: syscalls (user interface) [message #5500 is a reply to message #5465] | Tue, 22 August 2006 01:16   |  
			| 
				
				
					|  Rohit Seth Messages: 101
 Registered: August 2006
 | Senior Member |  |  |  
	| On Mon, 2006-08-21 at 11:47 +0900, Magnus Damm wrote: > On Fri, 2006-08-18 at 07:45 -0700, Dave Hansen wrote:
 > > On Fri, 2006-08-18 at 12:08 +0400, Andrey Savochkin wrote:
 > > >
 > > > A) Have separate memory management for each container,
 > > >    with separate buddy allocator, lru lists, page replacement mechanism.
 > > >    That implies a considerable overhead, and the main challenge there
 > > >    is sharing of pages between these separate memory managers.
 > >
 > > Hold on here for just a sec...
 > >
 > > It is quite possible to do memory management aimed at one container
 > > while that container's memory still participates in the main VM.
 > >
 > > There is overhead here, as the LRU scanning mechanisms get less
 > > efficient, but I'd rather pay a penalty at LRU scanning time than divide
 > > up the VM, or coarsely start failing allocations.
 >
 > This could of course be solved with one LRU per container, which is how
 > the CKRM memory controller implemented things about a year ago.
 
 Effectively Andrew's idea of faking up nodes is also giving per
 container LRUs.
 
 -rohit
 |  
	|  |  |  
	|  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH] UBC: user resource beancounters [message #5502 is a reply to message #5494] | Tue, 22 August 2006 01:45   |  
			| 
				
				
					|  Rohit Seth Messages: 101
 Registered: August 2006
 | Senior Member |  |  |  
	| On Mon, 2006-08-21 at 14:45 -0700, Chandra Seetharaman wrote: > On Mon, 2006-08-21 at 17:24 +0400, Kirill Korotaev wrote:
 > > Chandra Seetharaman wrote:
 > > > Kirill,
 > > >
 > > > Here are some concerns I have (as of now) w.r.t using UBC for resource
 > > > management (in the context of resource groups).
 > > >
 > > > - guarantee support is missing. I do not see any code to provide the
 > > >   minimum amount of resource a group can get. It is important for
 > > >   providing QoS. (In a different email you did mention guarantee, i am
 > > >   referring it here for completeness).
 > > I mentioned a couple of times that this is a limited core functionality
 > > in this patch set.
 > > guarantees are implementable as a separate UBC parameters.
 >
 > I will wait for oomguarpages patches :)
 >
 > >
 > > > - Creation of a UBC and assignment of task to a UBC always happen in
 > > >   the context of the task that is affected. I can understand it works in
 > > >   OpenVZ environment, but IMO has issues if one wants it to be used for
 > > >   basic resource management
 > > >    - application needs to be changed to use this feature.
 > > >    - System administrator does not have the control to assign tasks to a
 > > >      UBC. Application does by itself.
 > > >    - Assignment of task to a UBC need to be transparent to the
 > > >      application.
 
 I agree with the above points.  Just want to add that assignment of a
 task to a container may not be transparent to the application.  For
 example it may hit some limits and some reclaim may happen...
 
 > > this is not 100% true.
 > > UBC itself doesn't prevent from changing context on the fly.
 > > But since this leads to part of resources to be charged to
 > > one UBC and another part to another UBC and so long and so
 >
 > Let the controllers and the users worry about that part.
 >
 
 I think as the tasks move around, it becomes very heavy to move all the
 pages belonging to previous container to a new container.
 
 > As I mentioned UBC might be perfect for container resource management,
 > but what I am talking for is resource management _without_ a container.
 >
 
 Can you explain that part a bit more?
 
 > >
 > > > - No ability to maintain resource specific data in the controller.
 > > it's false. fields can be added to user_beancounter struct easily.
 > > and that's what our controllers do.
 >
 > With the model of static array for resources (struct ubparm ub_parms
 > [UB_RESOURCES] in struct user_beancounter), it is not be possible to
 > attach _different_ "controller specific" information to each of the
 > entries.
 >
 > I do not think it is good idea to add controller specific information of
 > _different_ controllers to the user_beancounter. Think of all the fields
 > it will have when all the numproc controller needs is just the basic 3-4
 > fields.
 >
 
 IMO it is okay to add  the fields whenever necessary as Kirill
 suggested.  I think once the container subject gets baked a little more,
 the controllers will also get tightly coupled.
 
 > >
 > > > - No ability to get the list of tasks belonging to a UBC.
 > > it is not true. it can be read from /proc or system calls interface,
 > > just like the way one finds all tasks belonging to one user :)
 > >
 > > BTW, what is so valueable in this feature?
 >
 > Again, it may not be useful for container type usages (you can probably
 > get the list from somewhere else, but for resource management it is
 > useful for sysadmins).
 >
 
 I'm also debating about whether printing task information is really any
 useful.  If a sysadm wants to get information about any particular task
 then that can come from /proc/<pid>/container  Though container list
 will be one place where one can easily get the list of all the contained
 tasks (and other resources like files).
 
 
 > >
 > > > - For a system administrator name for identification of a UBC is
 > > >   better than a number (uid).
 > > Have you any problems with pids, uids, gids and signals?
 >
 > Again, in container land each UB is attached with a container hence no
 > issue.
 >
 > In a non-container situation IMO it will be easier to manage/associate
 > "gold", "silver", "bronze", "plastic" groups than 0, 11, 83 and 113.
 >
 >
 > > It is a question of interface. I don't mind in changing UBC interface even
 > > to configfs if someone really wants it.
 > >
 
 Yes please.  Thanks.
 -rohit
 |  
	|  |  |  
	| 
		
			| Re: [ckrm-tech] [PATCH 4/7] UBC: syscalls (user interface) [message #5504 is a reply to message #5500] | Tue, 22 August 2006 03:58   |  
			| 
				
				
					|  Magnus Damm Messages: 8
 Registered: August 2006
 | Junior Member |  |  |  
	| On Mon, 2006-08-21 at 18:16 -0700, Rohit Seth wrote: > On Mon, 2006-08-21 at 11:47 +0900, Magnus Damm wrote:
 > > On Fri, 2006-08-18 at 07:45 -0700, Dave Hansen wrote:
 > > > On Fri, 2006-08-18 at 12:08 +0400, Andrey Savochkin wrote:
 > > > >
 > > > > A) Have separate memory management for each container,
 > > > >    with separate buddy allocator, lru lists, page replacement mechanism.
 > > > >    That implies a considerable overhead, and the main challenge there
 > > > >    is sharing of pages between these separate memory managers.
 > > >
 > > > Hold on here for just a sec...
 > > >
 > > > It is quite possible to do memory management aimed at one container
 > > > while that container's memory still participates in the main VM.
 > > >
 > > > There is overhead here, as the LRU scanning mechanisms get less
 > > > efficient, but I'd rather pay a penalty at LRU scanning time than divide
 > > > up the VM, or coarsely start failing allocations.
 > >
 > > This could of course be solved with one LRU per container, which is how
 > > the CKRM memory controller implemented things about a year ago.
 >
 > Effectively Andrew's idea of faking up nodes is also giving per
 > container LRUs.
 
 Yes, but the NUMA emulation approach is using fixed size containers
 where the size is selectable at the kernel command line, while the CKRM
 (and pzone) approach provides a more dynamic (and complex) solution.
 
 / magnus
 |  
	|  |  |  
	|  |  
	| 
		
			| Re: [ckrm-tech] [RFC][PATCH] UBC: user resource beancounters [message #5516 is a reply to message #5512] | Tue, 22 August 2006 09:57   |  
			| 
				
				
					|  Arjan van de Ven Messages: 14
 Registered: March 2006
 | Junior Member |  |  |  
	| On Tue, 2006-08-22 at 11:02 +0100, Alan Cox wrote: > Ar Llu, 2006-08-21 am 18:45 -0700, ysgrifennodd Rohit Seth:
 > > I think as the tasks move around, it becomes very heavy to move all the
 > > pages belonging to previous container to a new container.
 >
 > Its not a meaningful thing to do. Remember an object may be passed
 > around or shared. The simple "creator pays" model avoids all the heavy
 > overheads while maintaining the constraints.
 
 Hi,
 
 there is one issue with the "creator pays" model: if the creator can
 decide to die/go away/respawn then you can create orphan resources. This
 is a leak at least, but if a malicious user can control the
 death/respawn cycle it can even be abused to bypass the controls in the
 first place. Keeping the owner alive until all shared users are gone is
 not always a good idea either; if a container significantly malfunctions
 (or requires a restart due to, say, a very urgent glibc security
 update), keeping it around anyway is not a valid option for the admin.
 (And it forms another opportunity for a malicious user, keep a
 (vulnerable) container alive by hanging on to a shared resource
 deliberately)
 
 A general "unshare me out" function that finds a new to-blame owner
 might work, just the decision whom to blame is not an easy one in that
 scenario.
 
 Greetings,
 Arjan van de Ven
 
 
 --
 if you want to mail me at work (you don't), use arjan (at) linux.intel.com
 |  
	|  |  | 
 
 
 Current Time: Sun Oct 26 19:29:22 GMT 2025 
 Total time taken to generate the page: 0.08762 seconds |