OpenVZ Forum


Home » Mailing lists » Devel » [RFC][PATCH] UBC: user resource beancounters
Re: [ckrm-tech] [RFC][PATCH 2/7] UBC: core (structures, API) [message #5396 is a reply to message #5196] Fri, 18 August 2006 01:58 Go to previous messageGo to next message
Matt Helsley is currently offline  Matt Helsley
Messages: 86
Registered: August 2006
Member
On Wed, 2006-08-16 at 19:37 +0400, Kirill Korotaev wrote:
> Core functionality and interfaces of UBC:
> find/create beancounter, initialization,
> charge/uncharge of resource, core objects' declarations.
>
> Basic structures:
> ubparm - resource description
> user_beancounter - set of resources, id, lock
>
> Signed-Off-By: Pavel Emelianov <xemul@sw.ru>
> Signed-Off-By: Kirill Korotaev <dev@sw.ru>
>
> ---
> include/ub/beancounter.h | 157 ++++++++++++++++++
> init/main.c | 4
> kernel/Makefile | 1
> kernel/ub/Makefile | 7
> kernel/ub/beancounter.c | 398 +++++++++++++++++++++++++++++++++++++++++++++++
> 5 files changed, 567 insertions(+)
>
> --- /dev/null 2006-07-18 14:52:43.075228448 +0400
> +++ ./include/ub/beancounter.h 2006-08-10 14:58:27.000000000 +0400
> @@ -0,0 +1,157 @@
> +/*
> + * include/ub/beancounter.h
> + *
> + * Copyright (C) 2006 OpenVZ. SWsoft Inc
> + *
> + */
> +
> +#ifndef _LINUX_BEANCOUNTER_H
> +#define _LINUX_BEANCOUNTER_H
> +
> +/*
> + * Resource list.
> + */
> +
> +#define UB_RESOURCES 0
> +
> +struct ubparm {
> + /*
> + * A barrier over which resource allocations are failed gracefully.
> + * e.g. if the amount of consumed memory is over the barrier further
> + * sbrk() or mmap() calls fail, the existing processes are not killed.
> + */
> + unsigned long barrier;
> + /* hard resource limit */
> + unsigned long limit;
> + /* consumed resources */
> + unsigned long held;
> + /* maximum amount of consumed resources through the last period */
> + unsigned long maxheld;
> + /* minimum amount of consumed resources through the last period */
> + unsigned long minheld;
> + /* count of failed charges */
> + unsigned long failcnt;
> +};
> +
> +/*
> + * Kernel internal part.
> + */
> +
> +#ifdef __KERNEL__
> +
> +#include <linux/config.h>
> +#include <linux/spinlock.h>
> +#include <linux/list.h>
> +#include <asm/atomic.h>
> +
> +/*
> + * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form.
> + */
> +#define UB_MAXVALUE ( (1UL << (sizeof(unsigned long)*8-1)) - 1)
> +
> +
> +/*
> + * Resource management structures
> + * Serialization issues:
> + * beancounter list management is protected via ub_hash_lock
> + * task pointers are set only for current task and only once
> + * refcount is managed atomically
> + * value and limit comparison and change are protected by per-ub spinlock
> + */
> +
> +struct user_beancounter
> +{
> + atomic_t ub_refcount;
> + spinlock_t ub_lock;
> + uid_t ub_uid;
> + struct hlist_node hash;
> +
> + struct user_beancounter *parent;
> + void *private_data;
> +
> + /* resources statistics and settings */
> + struct ubparm ub_parms[UB_RESOURCES];
> +};
> +
> +enum severity { UB_BARRIER, UB_LIMIT, UB_FORCE };
> +
> +/* Flags passed to beancounter_findcreate() */
> +#define UB_LOOKUP_SUB 0x01 /* Lookup subbeancounter */
> +#define UB_ALLOC 0x02 /* May allocate new one */
> +#define UB_ALLOC_ATOMIC 0x04 /* Allocate with GFP_ATOMIC */
> +
> +#define UB_HASH_SIZE 256
> +
> +#ifdef CONFIG_USER_RESOURCE
> +extern struct hlist_head ub_hash[];
> +extern spinlock_t ub_hash_lock;
> +
> +static inline void ub_adjust_held_minmax(struct user_beancounter *ub,
> + int resource)
> +{
> + if (ub->ub_parms[resource].maxheld < ub->ub_parms[resource].held)
> + ub->ub_parms[resource].maxheld = ub->ub_parms[resource].held;
> + if (ub->ub_parms[resource].minheld > ub->ub_parms[resource].held)
> + ub->ub_parms[resource].minheld = ub->ub_parms[resource].held;
> +}
> +
> +void ub_print_resource_warning(struct user_beancounter *ub, int res,
> + char *str, unsigned long val, unsigned long held);
> +void ub_print_uid(struct user_beancounter *ub, char *str, int size);
> +
> +int __charge_beancounter_locked(struct user_beancounter *ub,
> + int resource, unsigned long val, enum severity strict);
> +void charge_beancounter_notop(struct user_beancounter *ub,
> + int resource, unsigned long val);
> +int charge_beancounter(struct user_beancounter *ub,
> + int resource, unsigned long val, enum severity strict);
> +
> +void __uncharge_beancounter_locked(struct user_beancounter *ub,
> + int resource, unsigned long val);
> +void uncharge_beancounter_notop(struct user_beancounter *ub,
> + int resource, unsigned long val);
> +void uncharge_beancounter(struct user_beancounter *ub,
> + int resource, unsigned long val);
> +
> +struct user_beancounter *beancounter_findcreate(uid_t uid,
> + struct user_beancounter *parent, int flags);
> +
> +static inline struct user_beancounter *get_beancounter(
> + struct user_beancounter *ub)
> +{
> + atomic_inc(&ub->ub_refcount);
> + return ub;
> +}
> +
> +void __put_beancounter(struct user_beancounter *ub);
> +static inline void put_beancounter(struct user_beancounter *ub)
> +{
> + __put_beancounter(ub);
> +}
> +
> +void ub_init_early(void);
> +void ub_init_late(void);
> +void ub_init_proc(void);
> +
> +extern struct user_beancounter ub0;
> +extern const char *ub_rnames[];
> +
> +#else /* CONFIG_USER_RESOURCE */
> +
> +#define beancounter_findcreate(id, p, f) (NULL)
> +#define get_beancounter(ub) (NULL)
> +#define put_beancounter(ub) do { } while (0)
> +#define __charge_beancounter_locked(ub, r, v, s) (0)
> +#define charge_beancounter(ub, r, v, s) (0)
> +#define charge_beancounter_notop(ub, r, v) do { } while (0)
> +#define __uncharge_beancounter_locked(ub, r, v) do { } while (0)
> +#define uncharge_beancounter(ub, r, v) do { } while (0)
> +#define uncharge_beancounter_notop(ub, r, v) do { } while (0)
> +#define ub_init_early() do { } while (0)
> +#define ub_init_late() do { } while (0)
> +#define ub_init_proc() do { } while (0)
> +
> +#endif /* CONFIG_USER_RESOURCE */
> +#endif /* __KERNEL__ */
> +
> +#endif /* _LINUX_BEANCOUNTER_H */
> --- ./init/main.c.ubcore 2006-08-10 14:55:47.000000000 +0400
> +++ ./init/main.c 2006-08-10 14:57:01.000000000 +0400
> @@ -52,6 +52,8 @@
> #include <linux/debug_locks.h>
> #include <linux/lockdep.h>
>
> +#include <ub/beancounter.h>
> +
> #include <asm/io.h>
> #include <asm/bugs.h>
> #include <asm/setup.h>
> @@ -470,6 +472,7 @@ asmlinkage void __init start_kernel(void
> early_boot_irqs_off();
> early_init_irq_lock_class();
>
> + ub_init_early();
> /*
> * Interrupts are still disabled. Do necessary setups, then
> * enable them
> @@ -563,6 +566,7 @@ asmlinkage void __init start_kernel(void
> #endif
> fork_init(num_physpages);
> proc_caches_init();
> + ub_init_late();
> buffer_init();
> unnamed_dev_init();
> key_init();
> --- ./kernel/Makefile.ubcore 2006-08-10 14:55:47.000000000 +0400
> +++ ./kernel/Makefile 2006-08-10 14:57:01.000000000 +0400
> @@ -12,6 +12,7 @@ obj-y = sched.o fork.o exec_domain.o
>
> obj-$(CONFIG_STACKTRACE) += stacktrace.o
> obj-y += time/
> +obj-y += ub/
> obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
> obj-$(CONFIG_LOCKDEP) += lockdep.o
> ifeq ($(CONFIG_PROC_FS),y)
> --- /dev/null 2006-07-18 14:52:43.075228448 +0400
> +++ ./kernel/ub/Makefile 2006-08-10 14:57:01.000000000 +0400
> @@ -0,0 +1,7 @@
> +#
> +# User resources part (UBC)
> +#
> +# Copyright (C) 2006 OpenVZ. SWsoft Inc
> +#
> +
> +obj-$(CONFIG_USER_RESOURCE) += beancounter.o
> --- /dev/null 2006-07-18 14:52:43.075228448 +0400
> +++ ./kernel/ub/beancounter.c 2006-08-10 15:09:34.000000000 +0400
> @@ -0,0 +1,398 @@
> +/*
> + * kernel/ub/beancounter.c
> + *
> + * Copyright (C) 2006 OpenVZ. SWsoft Inc
> + * Original code by (C) 1998 Alan Cox
> + * 1998-2000 Andrey Savochkin <saw@saw.sw.com.sg>
> + */
> +
> +#include <linux/slab.h>
> +#include <linux/module.h>
> +
> +#include <ub/beancounter.h>
> +
> +static kmem_cache_t *ub_cachep;
> +static struct user_beancounter default_beancounter;
> +static struct user_beancounter default_subbeancounter;
> +
> +static void init_beancounter_struct(struct user_beancounter *ub, uid_t id);
> +
> +struct user_beancounter ub0;
> +
> +const char *ub_rnames[] = {
> +};
> +
> +#define ub_hash_fun(x) ((((x) >> 8) ^ (x)) & (UB_HASH_SIZE - 1))
> +#define ub_subhash_fun(p, id) ub_hash_fun((p)->ub_uid + (id) * 17)
> +
> +struct hlist_head ub_hash[UB_HASH_SIZE];
> +spinlock_t ub_hash_lock;
> +
> +EXPORT_SYMBOL(ub_hash);
> +EXPORT_SYMBOL(ub_hash_lock);
> +
> +/*
> + * Per user resource beancounting. Resources are tied to their luid.

You haven't explained what an luid is at this point in the patch series.
Patch 0 says:

diff-ubc-syscalls.patch:
Patch adds system calls for UB management:
1. sys_getluid - get current UB id

But I have no idea what that l is there for. Why not sys_get_ubid() for
instance?

> + * The resource structure itself is tagged both to the process and
> + * the charging resources (a socket doesn't want to have to search for
> + * things at irq time for example). Reference counters keep things in
> + * hand.
> + *
> + * The case where a user creates resource, kills all his processes and
> + * then starts new ones is correctly handled this way. The refcounters
> + * will mean the old entry is still around with resource tied to it.
> + */
> +

So we create one beancounter object for every resource the user's tasks
allocate? For instance, one per socket? Or does "resource structure"
refer to something else?

> +struct user_beancounter *beancounter_findcreate(uid_t uid,
> + struct user_beancounter *p, int mask)
> +{
> + struct user_beancounter *new_ub, *ub, *tmpl_ub;
> + unsigned long flags;
> + struct hlist_head *slot;
> + struct hlist_node *pos;
> +
> + if (mask & UB_LOOKUP_SUB) {
> + WARN_ON(p == NULL);
> + tmpl_ub = &default_subbeancounter;
> + slot = &ub_hash[ub_subhash_fun(p, uid)];
> + } else {
> + WARN_ON(p != NULL);
> + tmpl_ub = &default_beancounter;
> + slot = &ub_hash[ub_hash_fun(uid)];
> + }
> + new_ub = NULL;
> +
> +retry:
> + spin_lock_irqsave(&ub_hash_lock, flags);
> + hlist_for_each_entry (ub, pos, slot, hash)
> + if (ub->ub_uid == uid && ub->parent == p)
> + break;
> +
> + if (pos != NULL) {
> + get_beancounter(ub);
> + spin_unlock_irqrestore(&ub_hash_lock, flags);
> +
> + if (new_ub != NULL) {
> + put_beancounter(new_ub->parent);
> + kmem_cache_free(ub_cachep, new_ub);
> + }
> + return ub;
> + }
> +
> + if (!(mask & UB_ALLOC))
> + goto out_unlock;
> +
> + if (new_ub != NULL)
> + goto out_install;
> +
> + if (mask & UB_ALLOC_ATOMIC) {

This block..

> + new_ub = kmem_cache_alloc(ub_cachep, GFP_ATOMIC);
> + if (new_ub == NULL)
> + goto out_unlock;
> +
> + memcpy(new_ub, tmpl_ub, sizeof(*new_ub));
> + init_beancounter_struct(new_ub, uid);
> + if (p)
> + new_ub->parent = get_beancounter(p);

ending here is almost exactly the same as the block ..

> + goto out_install;
> + }
> +
> + spin_unlock_irqrestore(&ub_hash_lock, flags);
> +

>From here..

> + new_ub = kmem_cache_alloc(ub_cachep, GFP_KERNEL);
> + if (new_ub == NULL)
> + goto out;
> +
> + memcpy(new_ub, tmpl_ub, sizeof(*new_ub));
> + init_beancounter_struct(new_ub, uid);
> + if (p)
> + new_ub->parent = get_beancounter(p);

to here. You could make a flag variable that holds GFP_ATOMIC or
GFP_KERNEL based on mask & UB_ALLOC_ATOMIC and perhaps turn this block
into a small helper.

> + goto retry;
> +
> +out_install:
> + hlist_add_head(&new_ub->hash, slot);
> +out_unlock:
> + spin_unlock_irqrestore(&ub_hash_lock, flags);
> +out:
> + return new_ub;
> +}
> +
> +EXPORT_SYMBOL(beancounter_findcreate);
> +
> +void ub_print_uid(struct user_beancounter *ub, char *str, int size)
> +{
> + if (ub->parent != NULL)
> + snprintf(str, size, "%u.%u", ub->parent->ub_uid, ub->ub_uid);
> + else
> + snprintf(str, size, "%u", ub->ub_uid);
> +}
> +
> +EXPORT_SYMBOL(ub_print_uid);

>From what I can see this patch doesn't really justify the need for the
EXPORT_SYMBOL. Shouldn't that be done in the patch where it's needed
outside of the kernel/ub code itself?

> +void ub_print_resource_warning(struct user_beancounter *ub, int res,
> + char *str, unsigned long val, unsigned long held)
> +{
> + char uid[64];
> +
> + ub_print_uid(ub, uid, sizeof(uid));
> + printk(KERN_WARNING "UB %s %s warning: %s "
> + "(held %lu, fails %lu, val %lu)\n",
> + uid, ub_rnames[res], str,
> + (res < UB_RESOURCES ? ub->ub_parms[res].held : held),
> + (res < UB_RESOURCES ? ub->ub_parms[res].failcnt : 0),
> + val);
> +}
> +
> +EXPORT_SYMBOL(ub_print_resource_warning);
> +
> +static inline void verify_held(struct user_beancounter *ub)
> +{
> + int i;
> +
> + for (i = 0; i < UB_RESOURCES; i++)
> + if (ub->ub_parms[i].held != 0)
> + ub_print_resource_warning(ub, i,
> + "resource is held on put", 0, 0);
> +}
> +
> +void __put_beancounter(struct user_beancounter *ub)
> +{
> + unsigned long flags;
> + struct user_beancounter *parent;
> +
> +again:
> + parent = ub->parent;
> + /* equevalent to atomic_dec_and_lock_irqsave() */

nit: s/que/qui/

> + local_irq_save(flags);
> + if (likely(!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock))) {
> + if (unlikely(atomic_read(&ub->ub_refcount) < 0))
> + printk(KERN_ERR "UB: Bad ub refcount: ub=%p, "
> + "luid=%d, ref=%d\n",
> + ub, ub->ub_uid,
> + atomic_read(&ub->ub_refcount));

This seems to be for debugging purposes only. If not, perhaps this
printk ought to be rate limited?

> + local_irq_restore(flags);
> + return;
> + }
> +
> + if (unlikely(ub == &ub0)) {
> + printk(KERN_ERR "Trying to put ub0\n");

Same thing for this printk.

> + spin_unlock_irqrestore(&ub_hash_lock, flags);
> + return;
> + }
> +
> + verify_held(ub);
> + hlist_del(&ub->hash);
> + spin_unlock_irqrestore(&ub_hash_lock, flags);
> +
> + kmem_cache_free(ub_cachep, ub);
> +
> + ub = parent;
> + if (ub != NULL)
> + goto again;

Couldn't this be replaced by a do { } while (ub != NULL); loop?

> +}
> +
> +EXPORT_SYMBOL(__put_beancounter);
> +
> +/*
> + * Generic resource charging stuff
> + */
> +
> +int __charge_beancounter_locked(struct user_beancounter *ub,
> + int resource, unsigned long val, enum severity strict)
> +{
> + /*
> + * ub_value <= UB_MAXVALUE, value <= UB_MAXVALUE, and only one addition
> + * at the moment is possible so an overflow is impossible.
> + */
> + ub->ub_parms[resource].held += val;
> +
> + switch (strict) {
> + case UB_BARRIER:
> + if (ub->ub_parms[resource].held >
> + ub->ub_parms[resource].barrier)
> + break;
> + /* fallthrough */
> + case UB_LIMIT:
> + if (ub->ub_parms[resource].held >
> + ub->ub_parms[resource].limit)
> + break;
> + /* fallthrough */
> + case UB_FORCE:
> + ub_adjust_held_minmax(ub, resource);
> + return 0;
> + default:
> + BUG();
> + }
> +
> + ub->ub_parms[resource].failcnt++;
> + ub->ub_parms[resource].held -= val;
> + return -ENOMEM;
> +}
> +
> +int charge_beancounter(struct user_beancounter *ub,
> + int resource, unsigned long val, enum severity strict)
> +{
> + int retval;
> + struct user_beancounter *p, *q;
> + unsigned long flags;
> +
> + retval = -EINVAL;
> + BUG_ON(val > UB_MAXVALUE);
> +
> + local_irq_save(flags);

<factor>

> + for (p = ub; p != NULL; p = p->parent) {

Seems rather expensive to walk up the tree for every charge. Especially
if the administrator wants a fine degree of resource control and makes a
tall tree. This would be a problem especially when it comes to resources
that require frequent and fast allocation.

> + spin_lock(&p->ub_lock);
> + retval = __charge_beancounter_locked(p, resource, val, strict);
> + spin_unlock(&p->ub_lock);
> + if (retval)
> + goto unroll;

This can be factored by passing a flag that breaks the loop on an error:

if (retval && do_break_err)
return retval;

> + }

</factor>

> +out_restore:
> + local_irq_restore(flags);
> + return retval;
> +

<factor>

> +unroll:
> + for (q = ub; q != p; q = q->parent) {
> + spin_lock(&q->ub_lock);
> + __uncharge_beancounter_locked(q, resource, val);
> + spin_unlock(&q->ub_lock);
> + }

</factor>

> + goto out_restore;
> +}
> +
> +EXPORT_SYMBOL(charge_beancounter);
> +
> +void charge_beancounter_notop(struct user_beancounter *ub,
> + int resource, unsigned long val)
> +{
> + struct user_beancounter *p;
> + unsigned long flags;
> +
> + local_irq_save(flags);

<factor>

> + for (p = ub; p->parent != NULL; p = p->parent) {
> + spin_lock(&p->ub_lock);
> + __charge_beancounter_locked(p, resource, val, UB_FORCE);
> + spin_unlock(&p->ub_lock);
> + }

<factor>

> + local_irq_restore(flags);

Again, this could be factored with charge_beancounter using a helper
function.

> +}
> +
> +EXPORT_SYMBOL(charge_beancounter_notop);
> +
> +void __uncharge_beancounter_locked(struct user_beancounter *ub,
> + int resource, unsigned long val)
> +{
> + if (unlikely(ub->ub_parms[resource].held < val)) {
> + ub_print_resource_warning(ub, resource,
> + "uncharging too much", val, 0);
> + val = ub->ub_parms[resource].held;
> + }
> + ub->ub_parms[resource].held -= val;
> + ub_adjust_held_minmax(ub, resource);
> +}
> +
> +void uncharge_beancounter(struct user_beancounter *ub,
> + int resource, unsigned long val)
> +{
> + unsigned long flags;
> + struct user_beancounter *p;
> +
> + for (p = ub; p != NULL; p = p->parent) {
> + spin_lock_irqsave(&p->ub_lock, flags);
> + __uncharge_beancounter_locked(p, resource, val);
> + spin_unlock_irqrestore(&p->ub_lock, flags);
> + }
> +}

Looks like your unroll: label in charge_beancounter above.

> +
> +EXPORT_SYMBOL(uncharge_beancounter);
> +
> +void uncharge_beancounter_notop(struct user_beancounter *ub,
> + int resource, unsigned long val)
> +{
> + struct user_beancounter *p;
> + unsigned long flags;
> +
> + local_irq_save(flags);

<factor>

> + for (p = ub; p->parent != NULL; p = p->parent) {
> + spin_lock(&p->ub_lock);
> + __uncharge_beancounter_locked(p, resource, val);
> + spin_unlock(&p->ub_lock);
> + }

</factor>

> + local_irq_restore(flags);
> +}
> +
> +EXPORT_SYMBOL(uncharge_beancounter_notop);
> +
> +/*
> + * Initialization
> + *
> + * struct user_beancounter contains
> + * - limits and other configuration settings
> + * - structural fields: lists, spinlocks and so on.
> + *
> + * Before these parts are initialized, the structure should be memset
> + * to 0 or copied from a known clean structure. That takes care of a lot
> + * of fields not initialized explicitly.
> + */
> +
> +static void init_beancounter_struct(struct user_beancounter *ub, uid_t id)
> +{
> + atomic_set(&ub->ub_refcount, 1);
> + spin_lock_init(&ub->ub_lock);
> + ub->ub_uid = id;
> +}
> +
> +static void init_beancounter_nolimits(struct user_beancounter *ub)
> +{
> + int k;
> +
> + for (k = 0; k < UB_RESOURCES; k++) {
> + ub->ub_parms[k].limit = UB_MAXVALUE;
> + ub->ub_parms[k].barrier = UB_MAXVALUE;
> + }
> +}
> +
> +static void init_beancounter_syslimits(struct user_beancounter *ub)
> +{
> + int k;
> +
> + for (k = 0; k < UB_RESOURCES; k++)
> + ub->ub_parms[k].barrier = ub->ub_parms[k].limit;
> +}
> +
> +void __init ub_init_early(void)
> +{
> + struct user_beancounter *ub;
> + struct hlist_head *slot;
> +
> + ub = &ub0;
> +

<factor>

> + memset(ub, 0, sizeof(*ub));
> + init_beancounter_nolimits(ub);
> + init_beancounter_struct(ub, 0);
> +

</factor>

> + spin_lock_init(&ub_hash_lock);
> + slot = &ub_hash[ub_hash_fun(ub->ub_uid)];
> + hlist_add_head(&ub->hash, slot);
> +}
> +
> +void __init ub_init_late(void)
> +{
> + struct user_beancounter *ub;
> +
> + ub_cachep = kmem_cache_create("user_beancounters",
> + sizeof(struct user_beancounter),
> + 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
> + if (ub_cachep == NULL)
> + panic("Can't create ubc caches\n");
> +
> + ub = &default_beancounter;

<factor>

> + memset(ub, 0, sizeof(default_beancounter));
> + init_beancounter_syslimits(ub);
> + init_beancounter_struct(ub, 0);
> +

</factor>

> + ub = &default_subbeancounter;

<factor>

> + memset(ub, 0, sizeof(default_subbeancounter));
> + init_beancounter_nolimits(ub);
> + init_beancounter_struct(ub, 0);

</factor>

> +}

Cheers,
-Matt Helsley
Re: [ckrm-tech] [RFC][PATCH 5/7] UBC: kernel memory accounting (core) [message #5403 is a reply to message #5372] Fri, 18 August 2006 14:58 Go to previous messageGo to next message
Dave Hansen is currently offline  Dave Hansen
Messages: 240
Registered: October 2005
Senior Member
On Fri, 2006-08-18 at 13:31 +0400, Kirill Korotaev wrote:
> they all are troublesome :/
> user can create lots of vmas, w/o page tables.
> lots of fdsets, ipcids.
> These are not reclaimable.

I guess one of my big questions surrounding these patches is why the
accounting is done with pages. If there really is a need to limit these
different kernel objects, then why not simply write patches to limit
*these* *objects*? I trust there is a very good technical reason for
doing this, I just don't understand why, yet.

-- Dave
Re: [ckrm-tech] [RFC][PATCH 5/7] UBC: kernel memory accounting (core) [message #5405 is a reply to message #5359] Fri, 18 August 2006 14:43 Go to previous messageGo to next message
Dave Hansen is currently offline  Dave Hansen
Messages: 240
Registered: October 2005
Senior Member
On Fri, 2006-08-18 at 12:12 +0400, Kirill Korotaev wrote:
> LDT takes from 1 to 16 pages. and is allocated by vmalloc.
> do you propose to replace it with slab which can fail due to memory
> fragmentation?

Nope. ;)

> the same applies to fdset, fdarray, ipc ids and iptables entries.

The vmalloc area, along with all of those other structures _have_ other
data structures. Now, it will take a wee bit more patching to directly
tag those thing with explicit container pointers (or accounting
references), but I would much prefer that, especially for the things
that are larger than a page.

I worry that this approach was used instead of patching all of the
individual subsystems because this was easier to maintain as an
out-of-tree patch, and it isn't necessarily the best approach.

-- Dave
Re: [ckrm-tech] [RFC][PATCH 5/7] UBC: kernel memory accounting (core) [message #5407 is a reply to message #5372] Fri, 18 August 2006 15:06 Go to previous messageGo to next message
Dave Hansen is currently offline  Dave Hansen
Messages: 240
Registered: October 2005
Senior Member
On Fri, 2006-08-18 at 13:31 +0400, Kirill Korotaev wrote:
> they all are troublesome :/
> user can create lots of vmas, w/o page tables.
> lots of fdsets, ipcids.
> These are not reclaimable.

In the real world, with the customers to which you've given these
patches, which of these objects is most likely to be consuming the most
space? Is there one set of objects that we could work on that would fix
_most_ of the cases which you have encountered?

> Also consider the following scenario with reclaimable page tables.
> e.g. user hit kmemsize limit due to fat page tables.
> kernel reclaims some of the page tables and frees user kenerl memory.
> after that user creates some uncreclaimable objects like fdsets or ipcs
> and then accesses memory with reclaimed page tables.
> Sooner or later we kill user with SIGSEGV from page fault due to
> no memory. This is worse then returning ENOMEM from poll() or
> mmap() where user allocates kernel objects.

I think you're claiming that doing reclaim of kernel objects causes much
more severe and less recoverable errors than does reclaiming of user
pages. That might generally be true, but I have one example that's
killing me. (You shouldn't have mentioned mmap ;)

Let's say you have a memory area mapped by one pagetable page, and with
1 user page mapped in it. The system is out of memory, and if you
reclaim either the pagetable page or the user page, you're never going
to get it back.

So, you pick the user page to reclaim. The user touches it, the memory
allocation fails, and the process gets killed.

Instead, you reclaim the pagetable page. The user touches their page,
the memory allocation for the pagetable fails, and the process gets
killed.

Seems like the same end result to me.

-- Dave
Re: [RFC][PATCH 2/7] UBC: core (structures, API) [message #5408 is a reply to message #5347] Fri, 18 August 2006 15:39 Go to previous messageGo to next message
Alan Cox is currently offline  Alan Cox
Messages: 48
Registered: May 2006
Member
Ar Iau, 2006-08-17 am 22:31 -0700, ysgrifennodd Andrew Morton:
> > oh, its a misname. Should be ub_id. it is ID of user_beancounter
> > and has nothing to do with user id.
>
> But it uses a uid_t. That's more than a misnaming?

A container id in UBC is an luid which is a type of uid, and uid_t. That
follows setluid() in other operating system environments.
Re: [RFC][PATCH 5/7] UBC: kernel memory accounting (core) [message #5411 is a reply to message #5374] Fri, 18 August 2006 16:55 Go to previous messageGo to next message
Rohit Seth is currently offline  Rohit Seth
Messages: 101
Registered: August 2006
Senior Member
On Fri, 2006-08-18 at 13:38 +0400, Kirill Korotaev wrote:
> Rohit Seth wrote:
> > On Thu, 2006-08-17 at 17:27 +0400, Kirill Korotaev wrote:
> >
> >>>If I'm reading this patch right then seems like you are making page
> >>>allocations to fail w/o (for example) trying to purge some pages from
> >>>the page cache belonging to this container. Or is that reclaim going to
> >>>come later?
> >>
> >>charged kernel objects can't be _reclaimed_. how do you propose
> >>to reclaim tasks page tables or files or task struct or vma or etc.?
> >
> >
> >
> > I agree that kernel objects cann't be reclaimed easily. But what you
> > are proposing is also not right. Returning failure w/o doing any
> > reclaim on pages (that are reclaimable) is not useful. And this is why
> > I asked, is this change going to be part of next set of patches (as
> > current set of patches are only tracking kernel usage).

> 1. reclaiming user resources is not that good idea as it looks to you.
> such solutions end up with lots of resources spent on reclaim.
> for user memory reclaims mean consumption of expensive disk I/O bandwidth
> which reduces overall system throughput and influences other users.
>

May be I'm overlooking something very obvious. Please tell me, what
happens when a user hits a page fault and the page allocator is easily
able to give a page from its pcp list. But container is over its limit
of physical memory. In your patch there is no attempt by container
support to see if some of the user pages are easily reclaimable. What
options a user will have to make sure some room is created.

> 2. kernel memory is mostly not reclaimable. can you reclaim vma structs or ipc ids?

I'm not arguing about that at all. If people want to talk about
reclaiming kernel pages then that should be done independent of this
subject.



-rohit
Re: [RFC][PATCH 2/7] UBC: core (structures, API) [message #5416 is a reply to message #5384] Fri, 18 August 2006 17:51 Go to previous messageGo to next message
Rohit Seth is currently offline  Rohit Seth
Messages: 101
Registered: August 2006
Senior Member
On Fri, 2006-08-18 at 15:14 +0400, Kirill Korotaev wrote:

> >>2. if you think a bit more about it, adding UB parameters doesn't
> >> require user space changes as well.
> >>3. it is possible to add any kind of interface for UBC. but do you like the idea
> >> to grep 200(containers)x20(parameters) files for getting current usages?
> >
> >
> > How are you doing it currently and how much more efficient it is in
> > comparison to configfs?
> currently it is done with a single file read.
> you can grep it, sum up resources or do what ever you want from bash.
> what is important! you can check whether container hits its limits
> with a single command, while with configs you would have to look through
> 20 files...
>

I think configfs provides all the required functionality that you
listed. You can define the attributes in a such a away that it prints
all the information that you need in one single read operation (I think
the limit is PAGE_SIZE....which is kind of sad).

I've just started playing with configfs for a container implementation
that I'm trying to get a better idea of details.

> IMHO it is convinient to have a text file representing the whole information state
> and system call for applications.
>

There should be an easy interface for shell to be able to do the needful
as well, for example, set the limits.


-rohit
Re: [ckrm-tech] [RFC][PATCH 1/7] UBC: kconfig [message #5424 is a reply to message #5195] Fri, 18 August 2006 19:57 Go to previous messageGo to next message
Chandra Seetharaman is currently offline  Chandra Seetharaman
Messages: 88
Registered: August 2006
Member
As pointed in an earlier email, it would be better if we could have this
in a arch-independent Kconfig, unless there is any problem with that.

On Wed, 2006-08-16 at 19:35 +0400, Kirill Korotaev wrote:
> Add kernel/ub/Kconfig file with UBC options and
> includes it into arch Kconfigs
>
> Signed-Off-By: Pavel Emelianov <xemul@sw.ru>
> Signed-Off-By: Kirill Korotaev <dev@sw.ru>
>
> ---
> arch/i386/Kconfig | 2 ++
> arch/ia64/Kconfig | 2 ++
> arch/powerpc/Kconfig | 2 ++
> arch/ppc/Kconfig | 2 ++
> arch/sparc/Kconfig | 2 ++
> arch/sparc64/Kconfig | 2 ++
> arch/x86_64/Kconfig | 2 ++
> kernel/ub/Kconfig | 25 +++++++++++++++++++++++++
> 8 files changed, 39 insertions(+)
>
> --- ./arch/i386/Kconfig.ubkm 2006-07-10 12:39:10.000000000 +0400
> +++ ./arch/i386/Kconfig 2006-07-28 14:10:41.000000000 +0400
> @@ -1146,6 +1146,8 @@ source "crypto/Kconfig"
>
> source "lib/Kconfig"
>
> +source "kernel/ub/Kconfig"
> +
> #
> # Use the generic interrupt handling code in kernel/irq/:
> #
> --- ./arch/ia64/Kconfig.ubkm 2006-07-10 12:39:10.000000000 +0400
> +++ ./arch/ia64/Kconfig 2006-07-28 14:10:56.000000000 +0400
> @@ -481,6 +481,8 @@ source "fs/Kconfig"
>
> source "lib/Kconfig"
>
> +source "kernel/ub/Kconfig"
> +
> #
> # Use the generic interrupt handling code in kernel/irq/:
> #
> --- ./arch/powerpc/Kconfig.arkcfg 2006-08-07 14:07:12.000000000 +0400
> +++ ./arch/powerpc/Kconfig 2006-08-10 17:55:58.000000000 +0400
> @@ -1038,6 +1038,8 @@ source "arch/powerpc/platforms/iseries/K
>
> source "lib/Kconfig"
>
> +source "ub/Kconfig"
> +
> menu "Instrumentation Support"
> depends on EXPERIMENTAL
>
> --- ./arch/ppc/Kconfig.arkcfg 2006-07-10 12:39:10.000000000 +0400
> +++ ./arch/ppc/Kconfig 2006-08-10 17:56:13.000000000 +0400
> @@ -1414,6 +1414,8 @@ endmenu
>
> source "lib/Kconfig"
>
> +source "ub/Kconfig"
> +
> source "arch/powerpc/oprofile/Kconfig"
>
> source "arch/ppc/Kconfig.debug"
> --- ./arch/sparc/Kconfig.arkcfg 2006-04-21 11:59:32.000000000 +0400
> +++ ./arch/sparc/Kconfig 2006-08-10 17:56:24.000000000 +0400
> @@ -296,3 +296,5 @@ source "security/Kconfig"
> source "crypto/Kconfig"
>
> source "lib/Kconfig"
> +
> +source "ub/Kconfig"
> --- ./arch/sparc64/Kconfig.arkcfg 2006-07-17 17:01:11.000000000 +0400
> +++ ./arch/sparc64/Kconfig 2006-08-10 17:56:36.000000000 +0400
> @@ -432,3 +432,5 @@ source "security/Kconfig"
> source "crypto/Kconfig"
>
> source "lib/Kconfig"
> +
> +source "lib/Kconfig"
> --- ./arch/x86_64/Kconfig.ubkm 2006-07-10 12:39:11.000000000 +0400
> +++ ./arch/x86_64/Kconfig 2006-07-28 14:10:49.000000000 +0400
> @@ -655,3 +655,5 @@ source "security/Kconfig"
> source "crypto/Kconfig"
>
> source "lib/Kconfig"
> +
> +source "kernel/ub/Kconfig"
> --- ./kernel/ub/Kconfig.ubkm 2006-07-28 13:07:38.000000000 +0400
> +++ ./kernel/ub/Kconfig 2006-07-28 13:09:51.000000000 +0400
> @@ -0,0 +1,25 @@
> +#
> +# User resources part (UBC)
> +#
> +# Copyright (C) 2006 OpenVZ. SWsoft Inc
> +
> +menu "User resources"
> +
> +config USER_RESOURCE
> + bool "Enable user resource accounting"
> + default y
> + help
> + This patch provides accounting and allows to configure
> + limits for user's consumption of exhaustible system resources.
> + The most important resource controlled by this patch is unswappable
> + memory (either mlock'ed or used by internal kernel structures and
> + buffers). The main goal of this patch is to protect processes
> + from running short of important resources because of an accidental
> + misbehavior of processes or malicious activity aiming to ``kill''
> + the system. It's worth to mention that resource limits configured
> + by setrlimit(2) do not give an acceptable level of protection
> + because they cover only small fraction of resources and work on a
> + per-process basis. Per-process accounting doesn't prevent malicious
> + users from spawning a lot of resource-consuming processes.
> +
> +endmenu
>
> ------------------------------------------------------------ -------------
> Using Tomcat but need to do more? Need to support web services, security?
> Get stuff done quickly with pre-integrated technology to make your job easier
> Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
> http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&b id=263057&dat=121642
> _______________________________________________
> ckrm-tech mailing list
> https://lists.sourceforge.net/lists/listinfo/ckrm-tech
--

------------------------------------------------------------ ----------
Chandra Seetharaman | Be careful what you choose....
- sekharan@us.ibm.com | .......you may get it.
------------------------------------------------------------ ----------
Re: [ckrm-tech] [RFC][PATCH] UBC: user resource beancounters [message #5425 is a reply to message #5192] Fri, 18 August 2006 19:39 Go to previous messageGo to next message
Chandra Seetharaman is currently offline  Chandra Seetharaman
Messages: 88
Registered: August 2006
Member
Kirill,

Here are some concerns I have (as of now) w.r.t using UBC for resource
management (in the context of resource groups).

- guarantee support is missing. I do not see any code to provide the
minimum amount of resource a group can get. It is important for
providing QoS. (In a different email you did mention guarantee, i am
referring it here for completeness).
- Creation of a UBC and assignment of task to a UBC always happen in
the context of the task that is affected. I can understand it works in
OpenVZ environment, but IMO has issues if one wants it to be used for
basic resource management
- application needs to be changed to use this feature.
- System administrator does not have the control to assign tasks to a
UBC. Application does by itself.
- Assignment of task to a UBC need to be transparent to the
application.
- UBC is deleted when the last task (in that UBC) exits. For resource
management purposes, UBC should be deleted only when the administrator
deletes it.
- No ability to set resource specific configuration information.
- No ability to maintain resource specific data in the controller.
- No ability to get the list of tasks belonging to a UBC.
- Doesn't inform the resource controllers when limits(shares) change.
- Doesn't inform the resource controllers when a task's UBC has changed.
- Doesn't recalculate the resource usage when a task's UBC has changed.
i.e doesn't uncharge the old UBC and charge new UBC.
- For a system administrator name for identification of a UBC is
better than a number (uid).

regards,

chandra

On Wed, 2006-08-16 at 19:24 +0400, Kirill Korotaev wrote:
> The following patch set presents base of
> User Resource Beancounters (UBC).
> UBC allows to account and control consumption
> of kernel resources used by group of processes.
>
> The full UBC patch set allows to control:
> - kernel memory. All the kernel objects allocatable
> on user demand should be accounted and limited
> for DoS protection.
> E.g. page tables, task structs, vmas etc.
>
> - virtual memory pages. UBC allows to
> limit a container to some amount of memory and
> introduces 2-level OOM killer taking into account
> container's consumption.
> pages shared between containers are correctly
> charged as fractions (tunable).
>
> - network buffers. These includes TCP/IP rcv/snd
> buffers, dgram snd buffers, unix, netlinks and
> other buffers.
>
> - minor resources accounted/limited by number:
> tasks, files, flocks, ptys, siginfo, pinned dcache
> mem, sockets, iptentries (for containers with
> virtualized networking)
>
> As the first step we want to propose for discussion
> the most complicated parts of resource management:
> kernel memory and virtual memory.
> The patch set to be sent provides core for UBC and
> management of kernel memory only. Virtual memory
> management will be sent in a couple of days.
>
> The patches in these series are:
> diff-ubc-kconfig.patch:
> Adds kernel/ub/Kconfig file with UBC options and
> includes it into arch Kconfigs
>
> diff-ubc-core.patch:
> Contains core functionality and interfaces of UBC:
> find/create beancounter, initialization,
> charge/uncharge of resource, core objects' declarations.
>
> diff-ubc-task.patch:
> Contains code responsible for setting UB on task,
> it's inheriting and setting host context in interrupts.
>
> Task contains three beancounters:
> 1. exec_ub - current context. all resources are charged
> to this beancounter.
> 2. task_ub - beancounter to which task_struct is charged
> itself.
> 3. fork_sub - beancounter which is inherited by
> task's children on fork
>
> diff-ubc-syscalls.patch:
> Patch adds system calls for UB management:
> 1. sys_getluid - get current UB id
> 2. sys_setluid - changes exec_ and fork_ UBs on current
> 3. sys_setublimit - set limits for resources consumtions
>
> diff-ubc-kmem-core.patch:
> Introduces UB_KMEMSIZE resource which accounts kernel
> objects allocated by task's request.
>
> Objects are accounted via struct page and slab objects.
> For the latter ones each slab contains a set of pointers
> corresponding object is charged to.
>
> Allocation charge rules:
> 1. Pages - if allocation is performed with __GFP_UBC flag - page
> is charged to current's exec_ub.
> 2. Slabs - kmem_cache may be created with SLAB_UBC flag - in this
> case each allocation is charged. Caches used by kmalloc are
> created with SLAB_UBC | SLAB_UBC_NOCHARGE flags. In this case
> only __GFP_UBC allocations are charged.
>
> diff-ubc-kmem-charge.patch:
> Adds SLAB_UBC and __GFP_UBC flags in appropriate places
> to cause charging/limiting of specified resources.
>
> diff-ubc-proc.patch:
> Adds two proc entries user_beancounters and user_beancounters_sub
> allowing to see current state (usage/limits/fails for each UB).
> Implemented via seq files.
>
> Patch set is applicable to 2.6.18-rc4-mm1
>
> Thanks,
> Kirill
>
>
> ------------------------------------------------------------ -------------
> Using Tomcat but need to do more? Need to support web services, security?
> Get stuff done quickly with pre-integrated technology to make your job easier
> Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
> http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&b id=263057&dat=121642
> _______________________________________________
> ckrm-tech mailing list
> https://lists.sourceforge.net/lists/listinfo/ckrm-tech
--

------------------------------------------------------------ ----------
Chandra Seetharaman | Be careful what you choose....
- sekharan@us.ibm.com | .......you may get it.
------------------------------------------------------------ ----------
Re: [ckrm-tech] [RFC][PATCH 4/7] UBC: syscalls (user interface) [message #5426 is a reply to message #5199] Fri, 18 August 2006 20:13 Go to previous messageGo to next message
Chandra Seetharaman is currently offline  Chandra Seetharaman
Messages: 88
Registered: August 2006
Member
On Wed, 2006-08-16 at 19:39 +0400, Kirill Korotaev wrote:

<snip>

> +/*
> + * The setbeanlimit syscall
> + */
> +asmlinkage long sys_setublimit(uid_t uid, unsigned long resource,
> + unsigned long *limits)
> +{
> + int error;
> + unsigned long flags;
> + struct user_beancounter *ub;
> + unsigned long new_limits[2];
> +
> + error = -EPERM;
> + if(!capable(CAP_SYS_RESOURCE))
> + goto out;
> +
> + error = -EINVAL;
> + if (resource >= UB_RESOURCES)
> + goto out;
> +
> + error = -EFAULT;
> + if (copy_from_user(&new_limits, limits, sizeof(new_limits)))
> + goto out;
> +
> + error = -EINVAL;
> + if (new_limits[0] > UB_MAXVALUE || new_limits[1] > UB_MAXVALUE)
> + goto out;
> +
> + error = -ENOENT;
> + ub = beancounter_findcreate(uid, NULL, 0);
> + if (ub == NULL)
> + goto out;
> +
> + spin_lock_irqsave(&ub->ub_lock, flags);
> + ub->ub_parms[resource].barrier = new_limits[0];
> + ub->ub_parms[resource].limit = new_limits[1];

>From my understanding it appear that barrier <= limit. But, the check is
missing here.
> + spin_unlock_irqrestore(&ub->ub_lock, flags);
> +
> + put_beancounter(ub);
> + error = 0;
> +out:
> + return error;
> +}
> +#endif
>
> ------------------------------------------------------------ -------------
> Using Tomcat but need to do more? Need to support web services, security?
> Get stuff done quickly with pre-integrated technology to make your job easier
> Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
> http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&b id=263057&dat=121642
> _______________________________________________
> ckrm-tech mailing list
> https://lists.sourceforge.net/lists/listinfo/ckrm-tech
--

------------------------------------------------------------ ----------
Chandra Seetharaman | Be careful what you choose....
- sekharan@us.ibm.com | .......you may get it.
------------------------------------------------------------ ----------
Re: [ckrm-tech] [RFC][PATCH 5/7] UBC: kernel memory accounting (core) [message #5427 is a reply to message #5200] Fri, 18 August 2006 20:26 Go to previous messageGo to next message
Chandra Seetharaman is currently offline  Chandra Seetharaman
Messages: 88
Registered: August 2006
Member
Kirill,

IMO, a UBC with resource constraint(limit in this case) should behave no
different than a kernel with limited memory. i.e it should do
reclamation before it starts failing allocation requests. It could even
do it preemptively.

There is no guarantee support which is required for providing QoS.

Each controller modifying the infrastructure code doesn't look good. We
can have proper interfaces to add a new resource controller.

chandra
On Wed, 2006-08-16 at 19:40 +0400, Kirill Korotaev wrote:
> Introduce UB_KMEMSIZE resource which accounts kernel
> objects allocated by task's request.
>
> Reference to UB is kept on struct page or slab object.
> For slabs each struct slab contains a set of pointers
> corresponding objects are charged to.
>
> Allocation charge rules:
> define1. Pages - if allocation is performed with __GFP_UBC flag - page
> is charged to current's exec_ub.
> 2. Slabs - kmem_cache may be created with SLAB_UBC flag - in this
> case each allocation is charged. Caches used by kmalloc are
> created with SLAB_UBC | SLAB_UBC_NOCHARGE flags. In this case
> only __GFP_UBC allocations are charged.
>
> Signed-Off-By: Pavel Emelianov <xemul@sw.ru>
> Signed-Off-By: Kirill Korotaev <dev@sw.ru>
>
<snip>
--

------------------------------------------------------------ ----------
Chandra Seetharaman | Be careful what you choose....
- sekharan@us.ibm.com | .......you may get it.
------------------------------------------------------------ ----------
Re: [ckrm-tech] [PATCH 4/7] UBC: syscalls (user interface) [message #5434 is a reply to message #5357] Fri, 18 August 2006 14:45 Go to previous messageGo to next message
Dave Hansen is currently offline  Dave Hansen
Messages: 240
Registered: October 2005
Senior Member
On Fri, 2006-08-18 at 12:08 +0400, Andrey Savochkin wrote:
>
> A) Have separate memory management for each container,
> with separate buddy allocator, lru lists, page replacement mechanism.
> That implies a considerable overhead, and the main challenge there
> is sharing of pages between these separate memory managers.

Hold on here for just a sec...

It is quite possible to do memory management aimed at one container
while that container's memory still participates in the main VM.

There is overhead here, as the LRU scanning mechanisms get less
efficient, but I'd rather pay a penalty at LRU scanning time than divide
up the VM, or coarsely start failing allocations.

-- Dave
Re: [ckrm-tech] [RFC][PATCH] UBC: user resource beancounters [message #5437 is a reply to message #5301] Thu, 17 August 2006 19:55 Go to previous messageGo to next message
Chandra Seetharaman is currently offline  Chandra Seetharaman
Messages: 88
Registered: August 2006
Member
On Thu, 2006-08-17 at 17:55 +0400, Kirill Korotaev wrote:
> > On Wed, Aug 16, 2006 at 07:24:03PM +0400, Kirill Korotaev wrote:
> >
> >>As the first step we want to propose for discussion
> >>the most complicated parts of resource management:
> >>kernel memory and virtual memory.
> >
> > Do you have any plans to post a CPU controller? Is that tied to UBC
> > interface as well?
>
> Not everything at once :) To tell the truth I think CPU controller
> is even more complicated than user memory accounting/limiting.
>
> No, fair CPU scheduler is not tied to UBC in any regard.

Not having the CPU controller on UBC doesn't sound good for the
infrastructure. IMHO, the infrastructure (for resource management) we
are going to have should be able to support different resource
controllers, without each controllers needing to have their own
infrastructure/interface etc.,

> As we discussed before, it is valuable to have an ability to limit
> different resources separately (CPU, disk I/O, memory, etc.).

Having ability to limit/control different resources separately not
necessarily mean we should have different infrastructure for each.

> For example, it can be possible to place some mission critical
> kernel threads (like kjournald) in a separate contanier.

I don't understand the comment above (in this context).
>
> This patches are related to kernel memory and nothing more :)
>
> Thanks,
> Kirill
>
>
> ------------------------------------------------------------ -------------
> Using Tomcat but need to do more? Need to support web services, security?
> Get stuff done quickly with pre-integrated technology to make your job easier
> Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
> http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&b id=263057&dat=121642
> _______________________________________________
> ckrm-tech mailing list
> https://lists.sourceforge.net/lists/listinfo/ckrm-tech
--

------------------------------------------------------------ ----------
Chandra Seetharaman | Be careful what you choose....
- sekharan@us.ibm.com | .......you may get it.
------------------------------------------------------------ ----------
Re: [RFC][PATCH 4/7] UBC: syscalls (user interface) [message #5438 is a reply to message #5199] Fri, 18 August 2006 11:40 Go to previous messageGo to next message
Arnd Bergmann is currently offline  Arnd Bergmann
Messages: 10
Registered: February 2006
Junior Member
On Wednesday 16 August 2006 17:39, Kirill Korotaev wrote:
> --- ./include/asm-powerpc/systbl.h.arsys        2006-07-10 12:39:19.000000000 +0400
> +++ ./include/asm-powerpc/systbl.h      2006-08-10 17:05:53.000000000 +0400
> @@ -304,3 +304,6 @@ SYSCALL_SPU(fchmodat)
>  SYSCALL_SPU(faccessat)
>  COMPAT_SYS_SPU(get_robust_list)
>  COMPAT_SYS_SPU(set_robust_list)
> +SYSCALL(sys_getluid)
> +SYSCALL(sys_setluid)
> +SYSCALL(sys_setublimit)
...
> --- ./include/asm-x86_64/unistd.h.ubsys 2006-07-10 12:39:19.000000000 +0400
> +++ ./include/asm-x86_64/unistd.h       2006-07-31 16:00:01.000000000 +0400
> @@ -619,10 +619,16 @@ __SYSCALL(__NR_sync_file_range, sys_sync
>  __SYSCALL(__NR_vmsplice, sys_vmsplice)
>  #define __NR_move_pages                279
>  __SYSCALL(__NR_move_pages, sys_move_pages)
> +#define __NR_getluid           280
> +__SYSCALL(__NR_getluid, sys_getluid)
> +#define __NR_setluid           281
> +__SYSCALL(__NR_setluid, sys_setluid)
> +#define __NR_setublimit                282
> +__SYSCALL(__NR_setublimit, sys_setublimit)
>  
...
> +/*
> + *     The setbeanlimit syscall
> + */
> +asmlinkage long sys_setublimit(uid_t uid, unsigned long resource,
> +               unsigned long *limits)

While I don't yet understand what this call does, it looks to me that
the way it's implemented breaks in 32 bit emulation mode on x86_64 and
powerpc.

You either need to pass a pointer a something that is the same on 32 and
64 bit (e.g. __u64 __user *limits), or need to provide a different
entry point for 32 bit applications:

long compat_sys_setublimit(compat_uid_t uid, compat_ulong_t resource,
compat_ulong_t __user *limits);

You should also add the prototypes to include/linux/syscalls.h.

Arnd <><
Re: [ckrm-tech] [RFC][PATCH 3/7] UBC: ub context and inheritance [message #5440 is a reply to message #5198] Fri, 18 August 2006 20:03 Go to previous messageGo to next message
Chandra Seetharaman is currently offline  Chandra Seetharaman
Messages: 88
Registered: August 2006
Member
On Wed, 2006-08-16 at 19:38 +0400, Kirill Korotaev wrote:
> Contains code responsible for setting UB on task,
> it's inheriting and setting host context in interrupts.
>
> Task references three beancounters:
> 1. exec_ub current context. all resources are
> charged to this beancounter.
> 2. task_ub beancounter to which task_struct is
> charged itself.

I do not see why task_ub is needed ? i do not see it being used
anywhere.

> 3. fork_sub beancounter which is inherited by
> task's children on fork

>From other emails it looks like renaming fork/exec to be real/effective
will be easier to understand.

>
>
<snip>

--

------------------------------------------------------------ ----------
Chandra Seetharaman | Be careful what you choose....
- sekharan@us.ibm.com | .......you may get it.
------------------------------------------------------------ ----------
Re: [RFC][PATCH 1/7] UBC: kconfig [message #5444 is a reply to message #5195] Fri, 18 August 2006 21:14 Go to previous messageGo to next message
Adrian Bunk is currently offline  Adrian Bunk
Messages: 10
Registered: August 2006
Junior Member
On Wed, Aug 16, 2006 at 07:35:34PM +0400, Kirill Korotaev wrote:
> Add kernel/ub/Kconfig file with UBC options and
> includes it into arch Kconfigs
>
> Signed-Off-By: Pavel Emelianov <xemul@sw.ru>
> Signed-Off-By: Kirill Korotaev <dev@sw.ru>
>
> ---
> arch/i386/Kconfig | 2 ++
> arch/ia64/Kconfig | 2 ++
> arch/powerpc/Kconfig | 2 ++
> arch/ppc/Kconfig | 2 ++
> arch/sparc/Kconfig | 2 ++
> arch/sparc64/Kconfig | 2 ++
> arch/x86_64/Kconfig | 2 ++
> kernel/ub/Kconfig | 25 +++++++++++++++++++++++++
> 8 files changed, 39 insertions(+)
>...
> --- ./arch/powerpc/Kconfig.arkcfg 2006-08-07 14:07:12.000000000 +0400
> +++ ./arch/powerpc/Kconfig 2006-08-10 17:55:58.000000000 +0400
> @@ -1038,6 +1038,8 @@ source "arch/powerpc/platforms/iseries/K
>
> source "lib/Kconfig"
>
> +source "ub/Kconfig"

kernel/ub/Kconfig

>...
> --- ./arch/ppc/Kconfig.arkcfg 2006-07-10 12:39:10.000000000 +0400
> +++ ./arch/ppc/Kconfig 2006-08-10 17:56:13.000000000 +0400
> @@ -1414,6 +1414,8 @@ endmenu
>
> source "lib/Kconfig"
>
> +source "ub/Kconfig"

kernel/ub/Kconfig

>...
> --- ./arch/sparc/Kconfig.arkcfg 2006-04-21 11:59:32.000000000 +0400
> +++ ./arch/sparc/Kconfig 2006-08-10 17:56:24.000000000 +0400
> @@ -296,3 +296,5 @@ source "security/Kconfig"
> source "crypto/Kconfig"
>
> source "lib/Kconfig"
> +
> +source "ub/Kconfig"

kernel/ub/Kconfig

> --- ./arch/sparc64/Kconfig.arkcfg 2006-07-17 17:01:11.000000000 +0400
> +++ ./arch/sparc64/Kconfig 2006-08-10 17:56:36.000000000 +0400
> @@ -432,3 +432,5 @@ source "security/Kconfig"
> source "crypto/Kconfig"
>
> source "lib/Kconfig"
> +
> +source "lib/Kconfig"

kernel/ub/Kconfig

>...
> --- ./kernel/ub/Kconfig.ubkm 2006-07-28 13:07:38.000000000 +0400
> +++ ./kernel/ub/Kconfig 2006-07-28 13:09:51.000000000 +0400
> @@ -0,0 +1,25 @@
> +#
> +# User resources part (UBC)
> +#
> +# Copyright (C) 2006 OpenVZ. SWsoft Inc
> +
> +menu "User resources"
> +
> +config USER_RESOURCE
> + bool "Enable user resource accounting"
> + default y
>...

Optional functionality shouldn't default to y.

cu
Adrian

--

Gentoo kernels are 42 times more popular than SUSE kernels among
KLive users (a service by SUSE contractor Andrea Arcangeli that
gathers data about kernels from many users worldwide).

There are three kinds of lies: Lies, Damn Lies, and Statistics.
Benjamin Disraeli
Re: [ckrm-tech] [RFC][PATCH 3/7] UBC: ub context and inheritance [message #5445 is a reply to message #5371] Sat, 19 August 2006 02:19 Go to previous messageGo to next message
Matt Helsley is currently offline  Matt Helsley
Messages: 86
Registered: August 2006
Member
On Fri, 2006-08-18 at 13:23 +0400, Kirill Korotaev wrote:
> Matt Helsley wrote:
> > On Wed, 2006-08-16 at 19:38 +0400, Kirill Korotaev wrote:
> >
> >>Contains code responsible for setting UB on task,
> >>it's inheriting and setting host context in interrupts.
> >>
> >>Task references three beancounters:
> >> 1. exec_ub current context. all resources are
> >> charged to this beancounter.
> >
> >
> > nit: 2-3 below seem to contradict "all". If you mean "the rest" then
> > perhaps you ought to reorder these:
> >
> > 1. task_ub ...
> > 2. fork_sub ...
> > 3. exec_ub Current context. Resources not charged to task_ub
> > or fork_sub are charged to this beancounter.
> not sure what you mean.
> task_ub - where _task_ _itself_ is charged as an object.
> following patches will add charging of "number of tasks" using it.
> fork_sub - beancounter which is inherited on fork() (chaning task beancounter).
> exec_ub - is current context.
>
>
> >> 2. task_ub beancounter to which task_struct is
> >> charged itself.
> >
> >
> > Is task_ub frequently the parent beancounter of exec_ub? If it's always
> > the parent then perhaps the one or more of these _ub fields in the task
> > struct are not necessary.
> no, task_ub != exec_ub of parent task
> when task is created anything can happen: task can change ub, parent can change ub,
> task can be reparented. But the UB we charged task to should be known.
>
> > Also in that case keeping copies of the
> > "parent" user_beancounter pointers in the task_beancounters would seem
> > bug-prone -- if the hierarchy of beancounters changes then these would
> > need to be changed too.
> >
> >
> >> 3. fork_sub beancounter which is inherited by
> >> task's children on fork
> >
> >
> > Is this frequently the same as exec_ub?
> frequently, but not always. exec_ub is changed in softirq for example.
> consider exec_ub as 'current' pointer in kernel.
>
> see other comments below
>
> >>Signed-Off-By: Pavel Emelianov <xemul@sw.ru>
> >>Signed-Off-By: Kirill Korotaev <dev@sw.ru>
> >>
> >>---
> >> include/linux/sched.h | 5 +++++
> >> include/ub/task.h | 42 ++++++++++++++++++++++++++++++++++++++++++
> >> kernel/fork.c | 21 ++++++++++++++++-----
> >> kernel/irq/handle.c | 9 +++++++++
> >> kernel/softirq.c | 8 ++++++++
> >> kernel/ub/Makefile | 1 +
> >> kernel/ub/beancounter.c | 4 ++++
> >> kernel/ub/misc.c | 34 ++++++++++++++++++++++++++++++++++
> >> 8 files changed, 119 insertions(+), 5 deletions(-)
> >>
> >>--- ./include/linux/sched.h.ubfork 2006-07-17 17:01:12.000000000 +0400
> >>+++ ./include/linux/sched.h 2006-07-31 16:01:54.000000000 +0400
> >>@@ -81,6 +81,8 @@ struct sched_param {
> >> #include <linux/timer.h>
> >> #include <linux/hrtimer.h>
> >>
> >>+#include <ub/task.h>
> >>+
> >> #include <asm/processor.h>
> >>
> >> struct exec_domain;
> >>@@ -997,6 +999,9 @@ struct task_struct {
> >> spinlock_t delays_lock;
> >> struct task_delay_info *delays;
> >> #endif
> >>+#ifdef CONFIG_USER_RESOURCE
> >>+ struct task_beancounter task_bc;
> >>+#endif
> >> };
> >>
> >> static inline pid_t process_group(struct task_struct *tsk)
> >>--- ./include/ub/task.h.ubfork 2006-07-28 18:53:52.000000000 +0400
> >>+++ ./include/ub/task.h 2006-08-01 15:26:08.000000000 +0400
> >>@@ -0,0 +1,42 @@
> >>+/*
> >>+ * include/ub/task.h
> >>+ *
> >>+ * Copyright (C) 2006 OpenVZ. SWsoft Inc
> >>+ *
> >>+ */
> >>+
> >>+#ifndef __UB_TASK_H_
> >>+#define __UB_TASK_H_
> >>+
> >>+#include <linux/config.h>
> >>+
> >>+struct user_beancounter;
> >>+
> >>+struct task_beancounter {
> >>+ struct user_beancounter *exec_ub;
> >>+ struct user_beancounter *task_ub;
> >>+ struct user_beancounter *fork_sub;
> >>+};
> >>+
> >>+#ifdef CONFIG_USER_RESOURCE
> >>+#define get_exec_ub() (current->task_bc.exec_ub)
> >>+#define set_exec_ub(newub) \
> >>+ ({ \
> >>+ struct user_beancounter *old; \
> >>+ struct task_beancounter *tbc; \
> >>+ tbc = &current->task_bc; \
> >>+ old = tbc->exec_ub; \
> >>+ tbc->exec_ub = newub; \
> >>+ old; \
> >>+ })
> >>+
> >
> >
> > How about making these static inlines?
> possible, but this requires including sched.h, which includes this file...
> so this one is easier and more separated.
>
> >>+int ub_task_charge(struct task_struct *parent, struct task_struct *new);
> >>+void ub_task_uncharge(struct task_struct *tsk);
> >>+
> >>+#else /* CONFIG_USER_RESOURCE */
> >>+#define get_exec_ub() (NULL)
> >>+#define set_exec_ub(__ub) (NULL)
> >>+#define ub_task_charge(p, t) (0)
> >>+#define ub_task_uncharge(t) do { } while (0)
> >>+#endif /* CONFIG_USER_RESOURCE */
> >>+#endif /* __UB_TASK_H_ */
> >>--- ./kernel/irq/handle.c.ubirq 2006-07-10 12:39:20.000000000 +0400
> >>+++ ./kernel/irq/handle.c 2006-08-01 12:39:34.000000000 +0400
> >>@@ -16,6 +16,9 @@
> >> #include <linux/interrupt.h>
> >> #include <linux/kernel_stat.h>
> >>
> >>+#include <ub/beancounter.h>
> >>+#include <ub/task.h>
> >>+
> >> #include "internals.h"
> >>
> >> /**
> >>@@ -166,6 +169,9 @@ fastcall unsigned int __do_IRQ(unsigned
> >> struct irq_desc *desc = irq_desc + irq;
> >> struct irqaction *action;
> >> unsigned int status;
> >>+ struct user_beancounter *ub;
> >>+
> >>+ ub = set_exec_ub(&ub0);
> >
> >
> > Perhaps a comment: "/* Don't charge resources gained in interrupts to current */
> ok, will add comment:
> /* UBC charges should be done to host system */
> >
> >
> >> kstat_this_cpu.irqs[irq]++;
> >> if (CHECK_IRQ_PER_CPU(desc->status)) {
> >>@@ -178,6 +184,8 @@ fastcall unsigned int __do_IRQ(unsigned
> >> desc->chip->ack(irq);
> >> action_ret = handle_IRQ_event(irq, regs, desc->action);
> >> desc->chip->end(irq);
> >>+
> >>+ (void) set_exec_ub(ub);
> >> return 1;
> >> }
> >>
> >>@@ -246,6 +254,7 @@ out:
> >> desc->chip->end(irq);
> >> spin_unlock(&desc->lock);
> >>
> >>+ (void) set_exec_ub(ub);
> >
> >
> >
> > Seems like a WARN_ON() would be appropriate rather than ignoring the
> > return code.
> BUG_ON(ret != &ub0) ?

Oops, yes, it's not a return code and BUG_ON() does seem more
appropriate.

>
> maybe introduce a kind of
> reset_exec_ub(old_ub, expected_current_ub)
> {
> ret = set_exec_ub(old_ub);
> BUG_ON(ret != expected_current_ub);
> }
> ?

Seems like a good idea to me. This way when UBC is not configured
there'd also be no BUG_ON().

> >> return 1;
> >> }
> >>
> >>--- ./kernel/softirq.c.ubirq 2006-07-17 17:01:12.000000000 +0400
> >>+++ ./kernel/softirq.c 2006-08-01 12:40:44.000000000 +0400
> >>@@ -18,6 +18,9 @@
> >> #include <linux/rcupdate.h>
> >> #include <linux/smp.h>
> >>
> >>+#include <ub/beancounter.h>
> >>+#include <ub/task.h>
> >>+
> >> #include <asm/irq.h>
> >> /*
> >> - No shared variables, all the data are CPU local.
> >>@@ -191,6 +194,9 @@ asmlinkage void __do_softirq(void)
> >> __u32 pending;
> >> int max_restart = MAX_SOFTIRQ_RESTART;
> >> int cpu;
> >>+ struct user_beancounter *ub;
> >>+
> >>+ ub = set_exec_ub(&ub0);
> >
> >
> > Perhaps add the same comment...
> ok
>
> >
> >
> >> pending = local_softirq_pending();
> >> account_system_vtime(current);
> >>@@ -229,6 +235,8 @@ restart:
> >>
> >> account_system_vtime(current);
> >> _local_bh_enable();
> >>+
> >>+ (void) set_exec_ub(ub);
> >
> >
> > .. and the same WARN_ON.
> >
> >
> >> }
> >>
> >> #ifndef __ARCH_HAS_DO_SOFTIRQ
> >>--- ./kernel/fork.c.ubfork 2006-07-17 17:01:12.000000000 +0400
> >>+++ ./kernel/fork.c 2006-08-01 12:58:36.000000000 +0400
> >>@@ -46,6 +46,8 @@
> >> #include <linux/delayacct.h>
> >> #include <linux/taskstats_kern.h>
> >>
> >>+#include <ub/task.h>
> >>+
> >> #include <asm/pgtable.h>
> >> #include <asm/pgalloc.h>
> >> #include <asm/uaccess.h>
> >>@@ -102,6 +104,7 @@ static kmem_cache_t *mm_cachep;
> >>
> >> void free_task(struct task_struct *tsk)
> >> {
> >>+ ub_task_uncharge(tsk);
> >> free_thread_info(tsk->thread_info);
> >> rt_mutex_debug_task_free(tsk);
> >> free_task_struct(tsk);
> >>@@ -162,18 +165,19 @@ static struct task_struct *dup_task_stru
> >>
> >> tsk = alloc_task_struct();
> >> if (!tsk)
> >>- return NULL;
> >>+ goto out;
> >>
> >> ti = alloc_thread_info(tsk);
> >>- if (!ti) {
> >>- free_task_struct(tsk);
> >>- return NULL;
> >>- }
> >>+ if (!ti)
> >>+ goto out_tsk;
> >>
> >> *tsk = *orig;
> >> tsk->thread_info = ti;
> >> setup_thread_stack(tsk, orig);
> >>
> >>+ if (ub_task_charge(orig, tsk))
> >>+ goto out_ti;
> >>+
> >> /* One for us, one for whoever does the "release_task()" (usually parent) */
> >> atomic_set(&tsk->usage,2);
> >> atomic_set(&tsk->fs_excl, 0);
> >>@@ -180,6 +184,13 @@ static struct task_struct *dup_task_stru
> >> #endif
> >> tsk->splice_pipe = NULL;
> >> return tsk;
> >>+
> >>+out_ti:
> >>+ free_thread_info(ti);
> >>+out_tsk:
> >>+ free_task_struct(tsk);
> >>+out:
> >>+ return NULL;
> >
> >
> > Ugh. This is starting to look like copy_process(). Any reason you
> > couldn't move the bean counter bits to copy_process() instead?
> This is more logical place since we _will_ charge task here
> (next patchset for numproc).
> It is logically better to charge objects in places where
> they are allocated. At the same time we inherit tasks ubs here.

Most other systems that aren't so critically related to task struct
allocation and copying place their code in copy_process() and not in
dup_task_struct().

Frankly this still seems to belong in copy_process(). The pattern (with
the gotos) is already there, as are accounting, audit, and security
pieces for example.

> >> }
> >>
> >> #ifdef CONFIG_MMU
> >>--- ./kernel/ub/Makefile.ubcore 2006-08-03 16:24:56.000000000 +0400
> >>+++ ./kernel/ub/Makefile 2006-08-01 11:08:39.000000000 +0400
> >>@@ -5,3 +5,4 @@
> >> #
> >>
> >> obj-$(CONFIG_USER_RESOURCE) += beancounter.o
> >>+obj-$(CONFIG_USER_RESOURCE) += misc.o
> >>--- ./kernel/ub/beancounter.c.ubcore 2006-07-28 13:07:44.000000000 +0400
> >>+++ ./kernel/ub/beancounter.c 2006-08-03 16:14:17.000000000 +0400
> >>@@ -395,6 +395,10 @@
> >> spin_lock_init(&ub_hash_lock);
> >> slot = &ub_hash[ub_hash_fun(ub->ub_uid)];
> >> hlist_add_head(&ub->hash, slot);
> >>+
> >>+ current->task_bc.exec_ub = ub;
> >>+ current->task_bc.task_ub = get_beancounter(ub);
> >>+ current->task_bc.fork_sub = get_beancounter(ub);
> >> }
> >>
> >> void __init ub_init_late(void)
> >>--- ./kernel/ub/misc.c.ubfork 2006-07-31 16:23:44.000000000 +0400
> >>+++ ./kernel/ub/misc.c 2006-07-31 16:28:47.000000000 +0400
> >>@@ -0,0 +1,34 @@
> >>+/*
> >>+ * kernel/ub/misc.c
> >>+ *
> >>+ * Copyright (C) 2006 OpenVZ. SWsoft Inc.
> >>+ *
> >>+ */
> >>+
> >>+#include <linux/sched.h>
> >>+
> >>+#include <ub/beancounter.h>
> >>+#include <ub/task.h>
> >>+
> >>+int ub_task_charge(struct task_struct *parent, struct task_struct *new)
> >>+{
> >
> >
> > parent could be derived from new if you move the charge to copy_process
> > instead of dup_task_struct.
> we can split it into:
> ub_charge_task() in dup_task_struct to account _task_ itself.
> ub_copy_process() in copy_process() to inherit and initialize
> exec_ub and fork_sub
>
> what do you think?

I do like the idea of splitting this up.

Though as I said I'm still against adding stuff to dup_task_struct() if
it's not allocating/copying the task struct or thread info. I think of
"dup_task_struct()" as dealing with the core purpose of the task_struct
-- copy_process() seems to be for all of the things that have plugged
their own fields into task_struct over the years.

Since I haven't seen the numproc patches that follow this I can't
really comment (one way or the other) on whether plugging that into
dup_task_struct() seems appropriate or necessary.

> >>+ struct task_beancounter *old_bc;
> >>+ struct task_beancounter *new_bc;
> >>+ struct user_beancounter *ub;
> >>+
> >>+ old_bc = &parent->task_bc;
> >>+ new_bc = &new->task_bc;
> >>+
> >>+ ub = old_bc->fork_sub;
> >>+ new_bc->exec_ub = get_beancounter(ub);
> >>+ new_bc->task_ub = get_beancounter(ub);
> >>+ new_bc->fork_sub = get_beancounter(ub);
> >>+ return 0;
> >>+}

<snip>

Cheers,
-Matt Helsley
Re: [ckrm-tech] [RFC][PATCH 4/7] UBC: syscalls (user interface) [message #5446 is a reply to message #5388] Sat, 19 August 2006 02:43 Go to previous messageGo to next message
Matt Helsley is currently offline  Matt Helsley
Messages: 86
Registered: August 2006
Member
On Fri, 2006-08-18 at 15:45 +0400, Kirill Korotaev wrote:
> Matt Helsley wrote:
>
> [... snip ...]
> >>--- ./kernel/ub/sys.c.ubsys 2006-07-28 18:52:18.000000000 +0400
> >>+++ ./kernel/ub/sys.c 2006-08-03 16:14:23.000000000 +0400
> >>@@ -0,0 +1,126 @@

<snip>

> >>+#else /* CONFIG_USER_RESOURCE */
> >>+
> >>+/*
> >>+ * The (rather boring) getluid syscall
> >>+ */
> >>+asmlinkage long sys_getluid(void)
> >>+{
> >>+ struct user_beancounter *ub;
> >>+
> >>+ ub = get_exec_ub();
> >>+ if (ub == NULL)
> >>+ return -EINVAL;
> >>+
> >>+ return ub->ub_uid;
> >>+}
> >>+
> >>+/*
> >>+ * The setluid syscall
> >>+ */
> >>+asmlinkage long sys_setluid(uid_t uid)
> >>+{
> >>+ int error;
> >>+ struct user_beancounter *ub;
> >>+ struct task_beancounter *task_bc;
> >>+
> >>+ task_bc = &current->task_bc;
> >>+
> >>+ /* You may not disown a setluid */
> >>+ error = -EINVAL;
> >>+ if (uid == (uid_t)-1)
> >>+ goto out;
> >>+
> >>+ /* You may only set an ub as root */
> >>+ error = -EPERM;
> >>+ if (!capable(CAP_SETUID))
> >>+ goto out;
> >
> >
> > With resource groups you don't necessarily have to be root -- just the
> > owner of the group and task.
> the question is - who is the owner of group?

Whoever is made the 'owner' of the directory is the owner of the group.
If you own both then you can add your task to your group.

> user, user group or who?
> Both are bad, since the same user can run inside the container and thus
> container will be potentially controllable/breakable from inside.

No, that's not a problem. The way shares work is you get a "portion" of
the parent group's resources and if the parent has limited your portion
you cannot exceed that. At the same time you can control how your
portion is dealt out within the child group.

> > Filesystems and appropriate share representations offer a way to give
> > regular users the ability to manage their resources without requiring
> > CAP_FOO.
> not sure what you propose...

A filesystem interface.

> we can introduce the following rules:
>
> containers (UB) can be created by process with SETUID cap only.
> subcontainers (SUB) can be created by any process.

Can subsubcontainers be created?

> what do you think?

I think a filesystem interface would work better. ;)

>
> >>+ /* Ok - set up a beancounter entry for this user */
> >>+ error = -ENOBUFS;
> >>+ ub = beancounter_findcreate(uid, NULL, UB_ALLOC);
> >>+ if (ub == NULL)
> >>+ goto out;
> >>+
> >>+ /* install bc */
> >>+ put_beancounter(task_bc->exec_ub);
> >>+ task_bc->exec_ub = ub;
> >>+ put_beancounter(task_bc->fork_sub);
> >>+ task_bc->fork_sub = get_beancounter(ub);
> >>+ error = 0;
> >>+out:
> >>+ return error;
> >>+}
> >>+
> >>+/*
> >>+ * The setbeanlimit syscall
> >>+ */
> >>+asmlinkage long sys_setublimit(uid_t uid, unsigned long resource,
> >>+ unsigned long *limits)
> >>+{
> >>+ int error;
> >>+ unsigned long flags;
> >>+ struct user_beancounter *ub;
> >>+ unsigned long new_limits[2];
> >>+
> >>+ error = -EPERM;
> >>+ if(!capable(CAP_SYS_RESOURCE))
> >>+ goto out;
> >
> >
> > Again, a filesystem interface would give us more flexibility when it
> > comes to allowing users to manage their resources while still preventing
> > them from exceeding limits.
> we can have 2 different root users with uid = 0 in 2 different containers.

You shouldn't need to have the 2 containers to give resource control to
other users. In other words you shouldn't need to use containers in
order to do resource management. The container model is by no means the
only way to model resource management.

> > I doubt you really want to give owners of a container CAP_SYS_RESOURCE
> > and CAP_USER (i.e. total control over resource management) just to allow
> > them to manage their subset of the resources.
> The origin idea is that administator of the node can manage user
> resources only. Users can't, since otherwise they can increase the limits.

The user may wish to manage the resource usage of her applications
within restrictions imposed by an administrator. If the user has a
portion of resources then you only need to ensure that the sum of her
resources does not exceed the administrator-provided limit.

> But we can allow them to manage sub beancoutners imho...

And subsubbeancounters?

<snip>

Cheers,
-Matt Helsley
Re: [ckrm-tech] [RFC][PATCH 2/7] UBC: core (structures, API) [message #5459 is a reply to message #5196] Sun, 20 August 2006 05:01 Go to previous messageGo to next message
Balbir Singh is currently offline  Balbir Singh
Messages: 491
Registered: August 2006
Senior Member
> +
> +void __put_beancounter(struct user_beancounter *ub)
> +{
> + unsigned long flags;
> + struct user_beancounter *parent;
> +
> +again:
> + parent = ub->parent;
> + /* equevalent to atomic_dec_and_lock_irqsave() */
> + local_irq_save(flags);
> + if (likely(!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock))) {
> + if (unlikely(atomic_read(&ub->ub_refcount) < 0))
> + printk(KERN_ERR "UB: Bad ub refcount: ub=%p, "
> + "luid=%d, ref=%d\n",
> + ub, ub->ub_uid,
> + atomic_read(&ub->ub_refcount));
> + local_irq_restore(flags);
> + return;

Minor comment - the printk (I think there is one other place) could come after
the local_irq_restore()

--

Balbir Singh,
Linux Technology Center,
IBM Software Labs
Re: [ckrm-tech] [RFC][PATCH 2/7] UBC: core (structures, API) [message #5460 is a reply to message #5196] Sun, 20 August 2006 04:58 Go to previous messageGo to next message
Balbir Singh is currently offline  Balbir Singh
Messages: 491
Registered: August 2006
Senior Member
Kirill Korotaev wrote:

> +/*
> + * Resource list.
> + */
> +
> +#define UB_RESOURCES 0
> +
> +struct ubparm {
> + /*
> + * A barrier over which resource allocations are failed gracefully.
> + * e.g. if the amount of consumed memory is over the barrier further
> + * sbrk() or mmap() calls fail, the existing processes are not killed.
> + */
> + unsigned long barrier;
> + /* hard resource limit */
> + unsigned long limit;
> + /* consumed resources */
> + unsigned long held;
> + /* maximum amount of consumed resources through the last period */
> + unsigned long maxheld;
> + /* minimum amount of consumed resources through the last period */
> + unsigned long minheld;
> + /* count of failed charges */
> + unsigned long failcnt;
> +};
> +

Comments to the side of the field would make it easier to read and understand
the structure. I think there are already other comments requesting for renaming
of the barrier field to hard_limit.

<snip>

> +static inline void ub_adjust_held_minmax(struct user_beancounter *ub,
> + int resource)
> +{
> + if (ub->ub_parms[resource].maxheld < ub->ub_parms[resource].held)
> + ub->ub_parms[resource].maxheld = ub->ub_parms[resource].held;
> + if (ub->ub_parms[resource].minheld > ub->ub_parms[resource].held)
> + ub->ub_parms[resource].minheld = ub->ub_parms[resource].held;
> +}

A comment here to clarify what the function does would be helpful, specially due
to the comparison above

if (maxheld < held)
maxheld = held
if (minheld > held)
minheld = held

<snip>

> +struct user_beancounter ub0;

How about global_ub or init_ub?

> +
> +#define ub_hash_fun(x) ((((x) >> 8) ^ (x)) & (UB_HASH_SIZE - 1))
> +#define ub_subhash_fun(p, id) ub_hash_fun((p)->ub_uid + (id) * 17)
> +

What hash properties are we looking for in the hash function? Is the hash
function universal?

> +struct hlist_head ub_hash[UB_HASH_SIZE];
> +spinlock_t ub_hash_lock;
> +
> +EXPORT_SYMBOL(ub_hash);
> +EXPORT_SYMBOL(ub_hash_lock);
> +
> +/*
> + * Per user resource beancounting. Resources are tied to their luid.
> + * The resource structure itself is tagged both to the process and
> + * the charging resources (a socket doesn't want to have to search for
> + * things at irq time for example). Reference counters keep things in
> + * hand.
> + *
> + * The case where a user creates resource, kills all his processes and
> + * then starts new ones is correctly handled this way. The refcounters
> + * will mean the old entry is still around with resource tied to it.
> + */
> +
> +struct user_beancounter *beancounter_findcreate(uid_t uid,
> + struct user_beancounter *p, int mask)
> +{
> + struct user_beancounter *new_ub, *ub, *tmpl_ub;
> + unsigned long flags;
> + struct hlist_head *slot;
> + struct hlist_node *pos;
> +
> + if (mask & UB_LOOKUP_SUB) {
> + WARN_ON(p == NULL);
> + tmpl_ub = &default_subbeancounter;
> + slot = &ub_hash[ub_subhash_fun(p, uid)];
> + } else {
> + WARN_ON(p != NULL);
> + tmpl_ub = &default_beancounter;
> + slot = &ub_hash[ub_hash_fun(uid)];
> + }
> + new_ub = NULL;
> +
> +retry:
> + spin_lock_irqsave(&ub_hash_lock, flags);
> + hlist_for_each_entry (ub, pos, slot, hash)
> + if (ub->ub_uid == uid && ub->parent == p)
> + break;
> +
> + if (pos != NULL) {
> + get_beancounter(ub);
> + spin_unlock_irqrestore(&ub_hash_lock, flags);
> +
> + if (new_ub != NULL) {
> + put_beancounter(new_ub->parent);
> + kmem_cache_free(ub_cachep, new_ub);
> + }

A comment indicative of this being a part of race handing would be useful.
Could you please consider refactoring this function if possible.

> + return ub;
> + }
> +
> + if (!(mask & UB_ALLOC))
> + goto out_unlock;
> +
> + if (new_ub != NULL)
> + goto out_install;
> +
> + if (mask & UB_ALLOC_ATOMIC) {
> + new_ub = kmem_cache_alloc(ub_cachep, GFP_ATOMIC);
> + if (new_ub == NULL)
> + goto out_unlock;
> +
> + memcpy(new_ub, tmpl_ub, sizeof(*new_ub));
> + init_beancounter_struct(new_ub, uid);
> + if (p)
> + new_ub->parent = get_beancounter(p);
> + goto out_install;
> + }
> +
> + spin_unlock_irqrestore(&ub_hash_lock, flags);
> +
> + new_ub = kmem_cache_alloc(ub_cachep, GFP_KERNEL);
> + if (new_ub == NULL)
> + goto out;
> +
> + memcpy(new_ub, tmpl_ub, sizeof(*new_ub));
> + init_beancounter_struct(new_ub, uid);
> + if (p)
> + new_ub->parent = get_beancounter(p);
> + goto retry;
> +
> +out_install:
> + hlist_add_head(&new_ub->hash, slot);
> +out_unlock:
> + spin_unlock_irqrestore(&ub_hash_lock, flags);
> +out:
> + return new_ub;
> +}
> +
> +EXPORT_SYMBOL(beancounter_findcreate);
> +

<snip>

> +void __put_beancounter(struct user_beancounter *ub)
> +{
> + unsigned long flags;
> + struct user_beancounter *parent;
> +
> +again:
> + parent = ub->parent;
> + /* equevalent to atomic_dec_and_lock_irqsave() */
> + local_irq_save(flags);
> + if (likely(!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock))) {
> + if (unlikely(atomic_read(&ub->ub_refcount) < 0))
> + printk(KERN_ERR "UB: Bad ub refcount: ub=%p, "
> + "luid=%d, ref=%d\n",
> + ub, ub->ub_uid,
> + atomic_read(&ub->ub_refcount));
> + local_irq_restore(flags);
> + return;
> + }
> +
> + if (unlikely(ub == &ub0)) {
> + printk(KERN_ERR "Trying to put ub0\n");
> + spin_unlock_irqrestore(&ub_hash_lock, flags);
> + return;
> + }
> +
> + verify_held(ub);
> + hlist_del(&ub->hash);
> + spin_unlock_irqrestore(&ub_hash_lock, flags);

Is this function called with the ub_hash_lock held()? A comment would be useful
or you could call it __put_beancounter_locked :-)

> +
> + kmem_cache_free(ub_cachep, ub);
> +
> + ub = parent;
> + if (ub != NULL)
> + goto again;

Could you please convert this to a do {} while() loop.

> +}
> +
> +EXPORT_SYMBOL(__put_beancounter);

<snip>

> +int charge_beancounter(struct user_beancounter *ub,
> + int resource, unsigned long val, enum severity strict)
> +{
> + int retval;
> + struct user_beancounter *p, *q;
> + unsigned long flags;
> +
> + retval = -EINVAL;
> + BUG_ON(val > UB_MAXVALUE);
> +
> + local_irq_save(flags);
> + for (p = ub; p != NULL; p = p->parent) {
> + spin_lock(&p->ub_lock);
> + retval = __charge_beancounter_locked(p, resource, val, strict);

Everyone in the hierarchy is charged the same amount - val?

> + spin_unlock(&p->ub_lock);
> + if (retval)
> + goto unroll;
> + }
> +out_restore:
> + local_irq_restore(flags);
> + return retval;
> +
> +unroll:
> + for (q = ub; q != p; q = q->parent) {
> + spin_lock(&q->ub_lock);
> + __uncharge_beancounter_locked(q, resource, val);
> + spin_unlock(&q->ub_lock);
> + }
> + goto out_restore;

Too many goto's in both directions - please consider refactoring

> +void charge_beancounter_notop(struct user_beancounter *ub,
> + int resource, unsigned long val)

Whats the meaning of notop?

> +{
> + struct user_beancounter *p;
> + unsigned long flags;
> +
> + local_irq_save(flags);
> + for (p = ub; p->parent != NULL; p = p->parent) {
> + spin_lock(&p->ub_lock);
> + __charge_beancounter_locked(p, resource, val, UB_FORCE);
> + spin_unlock(&p->ub_lock);
> + }
> + local_irq_restore(flags);
> +}
> +

Could some of this code be shared with charge_beancounter to avoid duplication?

> +EXPORT_SYMBOL(charge_beancounter_notop);
> +
> +void __uncharge_beancounter_locked(struct user_beancounter *ub,
> + int resource, unsigned long val)
> +{
> + if (unlikely(ub->ub_parms[resource].held < val)) {
> + ub_print_resource_warning(ub, resource,
> + "uncharging too much", val, 0);
> + val = ub->ub_parms[resource].held;
> + }
> + ub->ub_parms[resource].held -= val;
> + ub_adjust_held_minmax(ub, resource);
> +}
> +
> +void uncharge_beancounter(struct user_beancounter *ub,
> + int resource, unsigned long val)
> +{
> + unsigned long flags;
> + struct user_beancounter *p;
> +
> + for (p = ub; p != NULL; p = p->parent) {
> + spin_lock_irqsave(&p->ub_lock, flags);
> + __uncharge_beancounter_locked(p, resource, val);
> + spin_unlock_irqrestore(&p->ub_lock, flags);
> + }
> +}
> +
> +EXPORT_SYMBOL(uncharge_beancounter);
> +
> +void uncharge_beancounter_notop(struct user_beancounter *ub,
> + int resource, unsigned long val)
> +{
> + struct user_beancounter *p;
> + unsigned long flags;
> +
> + local_irq_save(flags);
> + for (p = ub; p->parent != NULL; p = p->parent) {
> + spin_lock(&p->ub_lock);
> + __uncharge_beancounter_locked(p, resource, val);
> + spin_unlock(&p->ub_lock);
> + }
> + local_irq_restore(flags);
> +}
> +

The code for both uncharge_beancounter() and uncharge_beancounter_notop() seems
to do the same thing

> +
> +void __init ub_init_late(void)
> +{
> + struct user_beancounter *ub;
> +
> + ub_cachep = kmem_cache_create("user_beancounters",
> + sizeof(struct user_beancounter),
> + 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
> + if (ub_cachep == NULL)
> + panic("Can't create ubc caches\n");
> +
> + ub = &default_beancounter;

Whats the relationship b
...

Re: [ckrm-tech] [PATCH 4/7] UBC: syscalls (user interface) [message #5465 is a reply to message #5434] Mon, 21 August 2006 02:47 Go to previous messageGo to next message
Magnus Damm is currently offline  Magnus Damm
Messages: 8
Registered: August 2006
Junior Member
On Fri, 2006-08-18 at 07:45 -0700, Dave Hansen wrote:
> On Fri, 2006-08-18 at 12:08 +0400, Andrey Savochkin wrote:
> >
> > A) Have separate memory management for each container,
> > with separate buddy allocator, lru lists, page replacement mechanism.
> > That implies a considerable overhead, and the main challenge there
> > is sharing of pages between these separate memory managers.
>
> Hold on here for just a sec...
>
> It is quite possible to do memory management aimed at one container
> while that container's memory still participates in the main VM.
>
> There is overhead here, as the LRU scanning mechanisms get less
> efficient, but I'd rather pay a penalty at LRU scanning time than divide
> up the VM, or coarsely start failing allocations.

This could of course be solved with one LRU per container, which is how
the CKRM memory controller implemented things about a year ago.

/ magnus
Re: [ckrm-tech] [RFC][PATCH 5/7] UBC: kernel memory accounting (core) [message #5473 is a reply to message #5405] Mon, 21 August 2006 08:56 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

Dave Hansen wrote:
> On Fri, 2006-08-18 at 12:12 +0400, Kirill Korotaev wrote:
>
>>LDT takes from 1 to 16 pages. and is allocated by vmalloc.
>>do you propose to replace it with slab which can fail due to memory
>>fragmentation?
>
>
> Nope. ;)
so what is your proposal then? Sorry, probably missed it due to lots of emails :)

>>the same applies to fdset, fdarray, ipc ids and iptables entries.
>
>
> The vmalloc area, along with all of those other structures _have_ other
> data structures. Now, it will take a wee bit more patching to directly
> tag those thing with explicit container pointers (or accounting
> references), but I would much prefer that, especially for the things
> that are larger than a page.
do you mean that you prefer adding a explicit pointer to the structures
itself?

> I worry that this approach was used instead of patching all of the
> individual subsystems because this was easier to maintain as an
> out-of-tree patch, and it isn't necessarily the best approach.
:) if we were to optimize for patch size then we would select vserver
approach and be happy...

Dave, we used to add UBC pointers on each data structure and then do
a separate accounting in the places where objects are allocated.
We spent a lot of time and investigation on how to make it better,
because it was leading to often accounting errors, wrong error paths etc.
The approach provided in this patchset proved to be much more efficient
and more error prone. And it is much much more elegant!

Thanks,
Kirill
Re: [ckrm-tech] [RFC][PATCH 3/7] UBC: ub context and inheritance [message #5477 is a reply to message #5440] Mon, 21 August 2006 10:30 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

Chandra Seetharaman wrote:
> On Wed, 2006-08-16 at 19:38 +0400, Kirill Korotaev wrote:
>
>>Contains code responsible for setting UB on task,
>>it's inheriting and setting host context in interrupts.
>>
>>Task references three beancounters:
>> 1. exec_ub current context. all resources are
>> charged to this beancounter.
>> 2. task_ub beancounter to which task_struct is
>> charged itself.
>
>
> I do not see why task_ub is needed ? i do not see it being used
> anywhere.
it is used to charge task itself. will be heavily used in next patch set
adding "numproc" UBC parameter.

>> 3. fork_sub beancounter which is inherited by
>> task's children on fork
>
>
>>From other emails it looks like renaming fork/exec to be real/effective
> will be easier to understand.
there is no "real". exec_ub is effective indeed,
but fork_sub is the one to inherit on fork().

Kirill
Re: [ckrm-tech] [RFC][PATCH 5/7] UBC: kernel memory accounting (core) [message #5478 is a reply to message #5403] Mon, 21 August 2006 10:38 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

Dave Hansen wrote:
> On Fri, 2006-08-18 at 13:31 +0400, Kirill Korotaev wrote:
>
>>they all are troublesome :/
>>user can create lots of vmas, w/o page tables.
>>lots of fdsets, ipcids.
>>These are not reclaimable.
>
>
> I guess one of my big questions surrounding these patches is why the
> accounting is done with pages.
probably you missed patch details a bit.
accounting is done:
1. in pages for objects allocated by buddy allocator
2. in slabs for objects allocated from caches

> If there really is a need to limit these
> different kernel objects, then why not simply write patches to limit
> *these* *objects*? I trust there is a very good technical reason for
> doing this, I just don't understand why, yet.
The one reason is that such an accounting allows to estimate the memory
used/required by containers, while limitations by objects:
- per object accounting/limitations do not provide any memory estimation
- having a big number of reasonably high limits still allows the user
to consume big amount of memory. I.e. the sum of all the limits tend
to be high and potentially DoS exploitable :/
- memory is easier to setup/control from user POV.
having hundreds of controls is good, but not much user friendly.

Thanks,
Kirill
Re: [RFC][PATCH 5/7] UBC: kernel memory accounting (core) [message #5479 is a reply to message #5411] Mon, 21 August 2006 10:41 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

>>1. reclaiming user resources is not that good idea as it looks to you.
>>such solutions end up with lots of resources spent on reclaim.
>>for user memory reclaims mean consumption of expensive disk I/O bandwidth
>>which reduces overall system throughput and influences other users.
>>
>
>
> May be I'm overlooking something very obvious. Please tell me, what
> happens when a user hits a page fault and the page allocator is easily
> able to give a page from its pcp list. But container is over its limit
> of physical memory. In your patch there is no attempt by container
> support to see if some of the user pages are easily reclaimable. What
> options a user will have to make sure some room is created.
The patch set send doesn't control user memory!
This topic is about kernel memory...

>>2. kernel memory is mostly not reclaimable. can you reclaim vma structs or ipc ids?
>
>
> I'm not arguing about that at all. If people want to talk about
> reclaiming kernel pages then that should be done independent of this
> subject.
Then why do you mess user pages accounting into this thread then?

Kirill
Re: [ckrm-tech] [RFC][PATCH 5/7] UBC: kernel memory accounting (core) [message #5480 is a reply to message #5427] Mon, 21 August 2006 10:48 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

Chandra Seetharaman wrote:
> Kirill,
>
> IMO, a UBC with resource constraint(limit in this case) should behave no
> different than a kernel with limited memory. i.e it should do
> reclamation before it starts failing allocation requests. It could even
> do it preemptively.
first, please notice, that this thread is not about user memory.
we can discuss it later when about to control user memory. And
I still need to notice, that different models of user memory control
can exist. With and without reclamation.

> There is no guarantee support which is required for providing QoS.
where? in UBC? in UBC _there_ are guarentees, even in regard to OOM killer.

> Each controller modifying the infrastructure code doesn't look good. We
> can have proper interfaces to add a new resource controller.
controllers do not modify interfaces nor core. They just add
themself to the list of resources and setup default limits.
do you think it is worth creating infrastructure for these
2 one-line-changes?

> chandra
> On Wed, 2006-08-16 at 19:40 +0400, Kirill Korotaev wrote:
>
>>Introduce UB_KMEMSIZE resource which accounts kernel
>>objects allocated by task's request.
>>
>>Reference to UB is kept on struct page or slab object.
>>For slabs each struct slab contains a set of pointers
>>corresponding objects are charged to.
>>
>>Allocation charge rules:
>> define1. Pages - if allocation is performed with __GFP_UBC flag - page
>> is charged to current's exec_ub.
>> 2. Slabs - kmem_cache may be created with SLAB_UBC flag - in this
>> case each allocation is charged. Caches used by kmalloc are
>> created with SLAB_UBC | SLAB_UBC_NOCHARGE flags. In this case
>> only __GFP_UBC allocations are charged.
>>
>>Signed-Off-By: Pavel Emelianov <xemul@sw.ru>
>>Signed-Off-By: Kirill Korotaev <dev@sw.ru>
>>
> <snip>
Re: [ckrm-tech] [RFC][PATCH 5/7] UBC: kernel memory accounting (core) [message #5484 is a reply to message #5407] Mon, 21 August 2006 12:36 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

Dave Hansen wrote:
> On Fri, 2006-08-18 at 13:31 +0400, Kirill Korotaev wrote:
>
>>they all are troublesome :/
>>user can create lots of vmas, w/o page tables.
>>lots of fdsets, ipcids.
>>These are not reclaimable.
>
>
> In the real world, with the customers to which you've given these
> patches, which of these objects is most likely to be consuming the most
> space? Is there one set of objects that we could work on that would fix
> _most_ of the cases which you have encountered?
the question is not about which one consumes more in "real life".
The question is "which of the resources are allocated on user demand
and should be limited for the environment to be secure and isolated".

>>Also consider the following scenario with reclaimable page tables.
>>e.g. user hit kmemsize limit due to fat page tables.
>>kernel reclaims some of the page tables and frees user kenerl memory.
>>after that user creates some uncreclaimable objects like fdsets or ipcs
>>and then accesses memory with reclaimed page tables.
>>Sooner or later we kill user with SIGSEGV from page fault due to
>>no memory. This is worse then returning ENOMEM from poll() or
>>mmap() where user allocates kernel objects.
>
>
> I think you're claiming that doing reclaim of kernel objects causes much
> more severe and less recoverable errors than does reclaiming of user
> pages.
I also claim that reclaiming some of kernel pages is almost undoable :)
Look, the page can be used by slab objects from 2 different containers.
Why one container should suffer from the second one which needs to be reclaimed?
What to do? separate allocators per container? And if you want to reclaim
say a page with vma you need to replaces tons of pointers all over the objects
in SMP safe manner.

That might generally be true, but I have one example that's
> killing me. (You shouldn't have mentioned mmap ;)
>
> Let's say you have a memory area mapped by one pagetable page, and with
> 1 user page mapped in it. The system is out of memory, and if you
> reclaim either the pagetable page or the user page, you're never going
> to get it back.
>
> So, you pick the user page to reclaim. The user touches it, the memory
> allocation fails, and the process gets killed.
>
> Instead, you reclaim the pagetable page. The user touches their page,
> the memory allocation for the pagetable fails, and the process gets
> killed.
>
> Seems like the same end result to me.
because you suggest the same limit for pagetables and user memory.
Thats why we have separate kmemsize limit for kernel objects and privvmpages for
user memory.
privvmpages limit hit will result in -ENOMEM on mmap() system call,
which is memory friendly.

Kirill
Re: [ckrm-tech] [RFC][PATCH] UBC: user resource beancounters [message #5485 is a reply to message #5425] Mon, 21 August 2006 13:21 Go to previous messageGo to next message
dev is currently offline  dev
Messages: 1693
Registered: September 2005
Location: Moscow
Senior Member

Chandra Seetharaman wrote:
> Kirill,
>
> Here are some concerns I have (as of now) w.r.t using UBC for resource
> management (in the context of resource groups).
>
> - guarantee support is missing. I do not see any code to provide the
> minimum amount of resource a group can get. It is important for
> providing QoS. (In a different email you did mention guarantee, i am
> referring it here for completeness).
I mentioned a couple of times that this is a limited core functionality
in this patch set.
guarantees are implementable as a separate UBC parameters.

> - Creation of a UBC and assignment of task to a UBC always happen in
> the context of the task that is affected. I can understand it works in
> OpenVZ environment, but IMO has issues if one wants it to be used for
> basic resource management
> - application needs to be changed to use this feature.
> - System administrator does not have the control to assign tasks to a
> UBC. Application does by itself.
> - Assignment of task to a UBC need to be transparent to the
> application.
this is not 100% true.
UBC itself doesn't prevent from changing context on the fly.
But since this leads to part of resources to be charged to
one UBC and another part to another UBC and so long and so
forth, we believe that more clear and correct interface is
something like fork()/exec()-required-application.

So you can always execute new applications in desired UB and
NO application modification are required.


> - UBC is deleted when the last task (in that UBC) exits. For resource
> management purposes, UBC should be deleted only when the administrator
> deletes it.
1. UBCs are freed when last _resource_ using it puts the last reference.
not the task. And it is a BIG error IMHO to think that resource
management should group tasks. No, it should group _objects_. Tasks
are just the same objects like say sockets.
2. this is easily changeable. You are the only who requested it so far.
3. kernel does so for many other objects like users and no one complains :)

> - No ability to set resource specific configuration information.
UBC model allows to _limit_ users. It is _core_.
We want to do resource management step by step and send it patch by patch,
while you are trying to solve everything at once.

sys_open() for example doesn't allow to open sockets, does it?
the same for UBC. They do what they are supposed to do.

> - No ability to maintain resource specific data in the controller.
it's false. fields can be added to user_beancounter struct easily.
and that's what our controllers do.

> - No ability to get the list of tasks belonging to a UBC.
it is not true. it can be read from /proc or system calls interface,
just like the way one finds all tasks belonging to one user :)

BTW, what is so valueable in this feature?
do you want to have interfaces to find kernel structures and even pages
which belong to the container? tasks are just one type of objects...

> - Doesn't inform the resource controllers when limits(shares) change.
As was answered and noted by Alan Cox:
1. no one defined what type of action should be done when limits change
2. it is extendable _when_ needed. Do you want to introduce hooks just
to have them?
3. is it so BIG obstacle for UBC patch? These 3-lines hooks code which
is not used?

> - Doesn't inform the resource controllers when a task's UBC has changed.
the same as above. we don't add functionality which is not used YET
(and no one even knows HOW).

> - Doesn't recalculate the resource usage when a task's UBC has changed.
> i.e doesn't uncharge the old UBC and charge new UBC.
You probably missed my explanation, that most
resources (except for the simplest one - numproc) can't be recharged
easily. And nothing in UBC code prevents such recharge to be added later
if requested.

> - For a system administrator name for identification of a UBC is
> better than a number (uid).
Have you any problems with pids, uids, gids and signals?
It is a question of interface. I don't mind in changing UBC interface even
to configfs if someone really wants it.

Thanks,
Kirill
Re: [ckrm-tech] [RFC][PATCH 5/7] UBC: kernel memory accounting (core) [message #5487 is a reply to message #5478] Mon, 21 August 2006 15:10 Go to previous messageGo to next message
Dave Hansen is currently offline  Dave Hansen
Messages: 240
Registered: October 2005
Senior Member
On Mon, 2006-08-21 at 14:40 +0400, Kirill Korotaev wrote:
> The one reason is that such an accounting allows to estimate the
> memory
> used/required by containers, while limitations by objects:
> - per object accounting/limitations do not provide any memory
> estimation

I know you're more clever than _that_. ;)

> - having a big number of reasonably high limits still allows the user
> to consume big amount of memory. I.e. the sum of all the limits tend
> to be high and potentially DoS exploitable :/
> - memory is easier to setup/control from user POV.
> having hundreds of controls is good, but not much user friendly.

I'm actually starting to think that some accounting memory usage *only*
in the slab, plus a few other structures for any stragglers not using
the slab would suffice. Since the slab knows the size of the objects,
there is no ambiguity about how many are being used. It should also be
a pretty generic way to control individual object types, if anyone else
should ever need it.

The high level pages-used-per-container metric is really nice for just
that, containers. But, I wonder if other users would find it useful if
there were more precise ways of controlling individual object usage.

-- Dave
Re: [ckrm-tech] [RFC][PATCH 3/7] UBC: ub context and inheritance [message #5492 is a reply to message #5477] Mon, 21 August 2006 20:48 Go to previous messageGo to next message
Chandra Seetharaman is currently offline  Chandra Seetharaman
Messages: 88
Registered: August 2006
Member
On Mon, 2006-08-21 at 14:32 +0400, Kirill Korotaev wrote:
> Chandra Seetharaman wrote:
> > On Wed, 2006-08-16 at 19:38 +0400, Kirill Korotaev wrote:
> >
> >>Contains code responsible for setting UB on task,
> >>it's inheriting and setting host context in interrupts.
> >>
> >>Task references three beancounters:
> >> 1. exec_ub current context. all resources are
> >> charged to this beancounter.
> >> 2. task_ub beancounter to which task_struct is
> >> charged itself.
> >
> >
> > I do not see why task_ub is needed ? i do not see it being used
> > anywhere.
> it is used to charge task itself. will be heavily used in next patch set
> adding "numproc" UBC parameter.

Well, from your other responses it sounded like you are including
code/data structure/functionality only when they are used. So, I wasn't
clear if this is missed out on cleanup or really part of UBC.

Besides, if this is needed only for a specific controller, shouldn't the
controller worry about maintaining it ?

>
> >> 3. fork_sub beancounter which is inherited by
> >> task's children on fork
> >
> >
> >>From other emails it looks like renaming fork/exec to be real/effective
> > will be easier to understand.
> there is no "real". exec_ub is effective indeed,
> but fork_sub is the one to inherit on fork().

IMO, fork_cb/exec_cb doesn't convey the real meaning of the usage.
>
> Kirill
>
>
> ------------------------------------------------------------ -------------
> Using Tomcat but need to do more? Need to support web services, security?
> Get stuff done quickly with pre-integrated technology to make your job easier
> Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
> http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&b id=263057&dat=121642
> _______________________________________________
> ckrm-tech mailing list
> https://lists.sourceforge.net/lists/listinfo/ckrm-tech
--

------------------------------------------------------------ ----------
Chandra Seetharaman | Be careful what you choose....
- sekharan@us.ibm.com | .......you may get it.
------------------------------------------------------------ ----------
Re: [ckrm-tech] [RFC][PATCH 5/7] UBC: kernel memory accounting (core) [message #5493 is a reply to message #5480] Mon, 21 August 2006 20:55 Go to previous messageGo to next message
Chandra Seetharaman is currently offline  Chandra Seetharaman
Messages: 88
Registered: August 2006
Member
On Mon, 2006-08-21 at 14:51 +0400, Kirill Korotaev wrote:
> Chandra Seetharaman wrote:
> > Kirill,
> >
> > IMO, a UBC with resource constraint(limit in this case) should behave no
> > different than a kernel with limited memory. i.e it should do
> > reclamation before it starts failing allocation requests. It could even
> > do it preemptively.
> first, please notice, that this thread is not about user memory.
> we can discuss it later when about to control user memory. And
> I still need to notice, that different models of user memory control
> can exist. With and without reclamation.
>
we can talk about it then :)

> > There is no guarantee support which is required for providing QoS.
> where? in UBC? in UBC _there_ are guarentees, even in regard to OOM killer.

I do not see it in the patches you have submitted. May be I overlooked.
Can you please point me the code where guarantee is handled.

>
> > Each controller modifying the infrastructure code doesn't look good. We
> > can have proper interfaces to add a new resource controller.
> controllers do not modify interfaces nor core. They just add
> themself to the list of resources and setup default limits.
> do you think it is worth creating infrastructure for these
> 2 one-line-changes?

Yes, IMO, it is cleaner.

Think of the documentation that explains how to write a controller for
UBC.

With a proper interface it will read something like: One have to call
register_controller(char *name) and on success it returns a unique id
which is the id for the controller.

Vs

With changing lines in the core code: One have to edit the file
filename.c and add a macro to this of macros with an incremented value
for their controller and add the name of their controller to the array
named controller_names[].

I think the first one is cleaner, what do you think ?

<snip>

--

------------------------------------------------------------ ----------
Chandra Seetharaman | Be careful what you choose....
- sekharan@us.ibm.com | .......you may get it.
------------------------------------------------------------ ----------
Re: [ckrm-tech] [RFC][PATCH] UBC: user resource beancounters [message #5494 is a reply to message #5485] Mon, 21 August 2006 21:45 Go to previous messageGo to next message
Chandra Seetharaman is currently offline  Chandra Seetharaman
Messages: 88
Registered: August 2006
Member
On Mon, 2006-08-21 at 17:24 +0400, Kirill Korotaev wrote:
> Chandra Seetharaman wrote:
> > Kirill,
> >
> > Here are some concerns I have (as of now) w.r.t using UBC for resource
> > management (in the context of resource groups).
> >
> > - guarantee support is missing. I do not see any code to provide the
> > minimum amount of resource a group can get. It is important for
> > providing QoS. (In a different email you did mention guarantee, i am
> > referring it here for completeness).
> I mentioned a couple of times that this is a limited core functionality
> in this patch set.
> guarantees are implementable as a separate UBC parameters.

I will wait for oomguarpages patches :)

>
> > - Creation of a UBC and assignment of task to a UBC always happen in
> > the context of the task that is affected. I can understand it works in
> > OpenVZ environment, but IMO has issues if one wants it to be used for
> > basic resource management
> > - application needs to be changed to use this feature.
> > - System administrator does not have the control to assign tasks to a
> > UBC. Application does by itself.
> > - Assignment of task to a UBC need to be transparent to the
> > application.
> this is not 100% true.
> UBC itself doesn't prevent from changing context on the fly.
> But since this leads to part of resources to be charged to
> one UBC and another part to another UBC and so long and so

Let the controllers and the users worry about that part.

As I mentioned UBC might be perfect for container resource management,
but what I am talking for is resource management _without_ a container.

> forth, we believe that more clear and correct interface is
> something like fork()/exec()-required-application.
>
> So you can always execute new applications in desired UB and
> NO application modification are required.

For generic workload management/resource management _desired UB_ is not
necessarily decided at fork/exec time. It can happen anytime during the
life cycle of a task.

>
>
> > - UBC is deleted when the last task (in that UBC) exits. For resource
> > management purposes, UBC should be deleted only when the administrator
> > deletes it.
> 1. UBCs are freed when last _resource_ using it puts the last reference.
> not the task. And it is a BIG error IMHO to think that resource
> management should group tasks. No, it should group _objects_. Tasks
> are just the same objects like say sockets.

No argument there, that is how CKRM was early last year. But, I do not
see how is related to the point I am making above (" UBC should be
deleted only when the administrator deletes it").

> 2. this is easily changeable. You are the only who requested it so far.

It may be because I am the only one looking at it without the
"container" goggles on :).

> 3. kernel does so for many other objects like users and no one complains :)
>
> > - No ability to set resource specific configuration information.
> UBC model allows to _limit_ users. It is _core_.

I think you got me wrong here. What I want is the ability to
set/maintain a generic controller specific information.

For example, if the CPU controller wants to allow the user to set the
number of seconds over which the user wants the guarantee/limit to be
imposed.

> We want to do resource management step by step and send it patch by patch,
> while you are trying to solve everything at once.
>
> sys_open() for example doesn't allow to open sockets, does it?
> the same for UBC. They do what they are supposed to do.

I do not see how this relates !!

>
> > - No ability to maintain resource specific data in the controller.
> it's false. fields can be added to user_beancounter struct easily.
> and that's what our controllers do.

With the model of static array for resources (struct ubparm ub_parms
[UB_RESOURCES] in struct user_beancounter), it is not be possible to
attach _different_ "controller specific" information to each of the
entries.

I do not think it is good idea to add controller specific information of
_different_ controllers to the user_beancounter. Think of all the fields
it will have when all the numproc controller needs is just the basic 3-4
fields.

>
> > - No ability to get the list of tasks belonging to a UBC.
> it is not true. it can be read from /proc or system calls interface,
> just like the way one finds all tasks belonging to one user :)
>
> BTW, what is so valueable in this feature?

Again, it may not be useful for container type usages (you can probably
get the list from somewhere else, but for resource management it is
useful for sysadmins).

> do you want to have interfaces to find kernel structures and even pages
> which belong to the container? tasks are just one type of objects...
>
> > - Doesn't inform the resource controllers when limits(shares) change.
> As was answered and noted by Alan Cox:
> 1. no one defined what type of action should be done when limits change

let the controller decide it.
> 2. it is extendable _when_ needed. Do you want to introduce hooks just
> to have them?
> 3. is it so BIG obstacle for UBC patch? These 3-lines hooks code which
> is not used?
>
> > - Doesn't inform the resource controllers when a task's UBC has changed.
> the same as above. we don't add functionality which is not used YET
> (and no one even knows HOW).
>
> > - Doesn't recalculate the resource usage when a task's UBC has changed.
> > i.e doesn't uncharge the old UBC and charge new UBC.
> You probably missed my explanation, that most
> resources (except for the simplest one - numproc) can't be recharged
> easily. And nothing in UBC code prevents such recharge to be added later
> if requested.

My point is that controllers should have this control. I am ok with
these being added later. Wondering if there is any design limitations
that would prevent the later additions (like the _controller specific
data above).

>
> > - For a system administrator name for identification of a UBC is
> > better than a number (uid).
> Have you any problems with pids, uids, gids and signals?

Again, in container land each UB is attached with a container hence no
issue.

In a non-container situation IMO it will be easier to manage/associate
"gold", "silver", "bronze", "plastic" groups than 0, 11, 83 and 113.


> It is a question of interface. I don't mind in changing UBC interface even
> to configfs if someone really wants it.
>
> Thanks,
> Kirill
>
> ------------------------------------------------------------ -------------
> Using Tomcat but need to do more? Need to support web services, security?
> Get stuff done quickly with pre-integrated technology to make your job easier
> Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
> http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&b id=263057&dat=121642
> _______________________________________________
> ckrm-tech mailing list
> https://lists.sourceforge.net/lists/listinfo/ckrm-tech
--

------------------------------------------------------------ ----------
Chandra Seetharaman | Be careful what you choose....
- sekharan@us.ibm.com | .......you may get it.
------------------------------------------------------------ ----------
Re: [ckrm-tech] [RFC][PATCH] UBC: user resource beancounters [message #5495 is a reply to message #5494] Mon, 21 August 2006 22:01 Go to previous messageGo to next message
Alan Cox is currently offline  Alan Cox
Messages: 48
Registered: May 2006
Member
Ar Llu, 2006-08-21 am 14:45 -0700, ysgrifennodd Chandra Seetharaman:
> As I mentioned UBC might be perfect for container resource management,
> but what I am talking for is resource management _without_ a container.

There isn't really a difference. UBC counts usage of things. It has to
know who to charge the thing to but its core concept of the luid isn't a
container, its more akin to the a departmental or project billing code.

> > 3. is it so BIG obstacle for UBC patch? These 3-lines hooks code which
> > is not used?

Add them later when they prove to be needed. If IBM send a feature that
needs it then add them in that feature. Everyone is happy it is possible
to add that hook when needed.

> In a non-container situation IMO it will be easier to manage/associate
> "gold", "silver", "bronze", "plastic" groups than 0, 11, 83 and 113.

User space issue. Doing that in kernel will lead to some limitations
later on and end up needing the user space anyway. Consider wanting to
keep the container name and properties in LDAP.
Re: [ckrm-tech] [RFC][PATCH] UBC: user resource beancounters [message #5496 is a reply to message #5495] Mon, 21 August 2006 22:44 Go to previous messageGo to next message
Chandra Seetharaman is currently offline  Chandra Seetharaman
Messages: 88
Registered: August 2006
Member
On Mon, 2006-08-21 at 23:20 +0100, Alan Cox wrote:
> Ar Llu, 2006-08-21 am 14:45 -0700, ysgrifennodd Chandra Seetharaman:
> > As I mentioned UBC might be perfect for container resource management,
> > but what I am talking for is resource management _without_ a container.
>
> There isn't really a difference. UBC counts usage of things. It has to
> know who to charge the thing to but its core concept of the luid isn't a
> container, its more akin to the a departmental or project billing code.

I didn't say it is different. The way it is implemented now has some
restrictions for generic resource management purposes (like ability to
move task around), but they are not a problem for container type usage.

>
> > > 3. is it so BIG obstacle for UBC patch? These 3-lines hooks code which
> > > is not used?
>
> Add them later when they prove to be needed. If IBM send a feature that
> needs it then add them in that feature. Everyone is happy it is possible
> to add that hook when needed.

As I mentioned in my reply, I am ok with adding it later.

>
> > In a non-container situation IMO it will be easier to manage/associate
> > "gold", "silver", "bronze", "plastic" groups than 0, 11, 83 and 113.
>
> User space issue. Doing that in kernel will lead to some limitations
> later on and end up needing the user space anyway. Consider wanting to
> keep the container name and properties in LDAP.
>
>
>
>
> ------------------------------------------------------------ -------------
> Using Tomcat but need to do more? Need to support web services, security?
> Get stuff done quickly with pre-integrated technology to make your job easier
> Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
> http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&b id=263057&dat=121642
> _______________________________________________
> ckrm-tech mailing list
> https://lists.sourceforge.net/lists/listinfo/ckrm-tech
--

------------------------------------------------------------ ----------
Chandra Seetharaman | Be careful what you choose....
- sekharan@us.ibm.com | .......you may get it.
------------------------------------------------------------ ----------
Re: [ckrm-tech] [PATCH 4/7] UBC: syscalls (user interface) [message #5500 is a reply to message #5465] Tue, 22 August 2006 01:16 Go to previous messageGo to next message
Rohit Seth is currently offline  Rohit Seth
Messages: 101
Registered: August 2006
Senior Member
On Mon, 2006-08-21 at 11:47 +0900, Magnus Damm wrote:
> On Fri, 2006-08-18 at 07:45 -0700, Dave Hansen wrote:
> > On Fri, 2006-08-18 at 12:08 +0400, Andrey Savochkin wrote:
> > >
> > > A) Have separate memory management for each container,
> > > with separate buddy allocator, lru lists, page replacement mechanism.
> > > That implies a considerable overhead, and the main challenge there
> > > is sharing of pages between these separate memory managers.
> >
> > Hold on here for just a sec...
> >
> > It is quite possible to do memory management aimed at one container
> > while that container's memory still participates in the main VM.
> >
> > There is overhead here, as the LRU scanning mechanisms get less
> > efficient, but I'd rather pay a penalty at LRU scanning time than divide
> > up the VM, or coarsely start failing allocations.
>
> This could of course be solved with one LRU per container, which is how
> the CKRM memory controller implemented things about a year ago.

Effectively Andrew's idea of faking up nodes is also giving per
container LRUs.

-rohit
Re: [RFC][PATCH 5/7] UBC: kernel memory accounting (core) [message #5501 is a reply to message #5479] Tue, 22 August 2006 01:23 Go to previous messageGo to next message
Rohit Seth is currently offline  Rohit Seth
Messages: 101
Registered: August 2006
Senior Member
On Mon, 2006-08-21 at 14:43 +0400, Kirill Korotaev wrote:
> >>1. reclaiming user resources is not that good idea as it looks to you.
> >>such solutions end up with lots of resources spent on reclaim.
> >>for user memory reclaims mean consumption of expensive disk I/O bandwidth
> >>which reduces overall system throughput and influences other users.
> >>
> >
> >
> > May be I'm overlooking something very obvious. Please tell me, what
> > happens when a user hits a page fault and the page allocator is easily
> > able to give a page from its pcp list. But container is over its limit
> > of physical memory. In your patch there is no attempt by container
> > support to see if some of the user pages are easily reclaimable. What
> > options a user will have to make sure some room is created.
> The patch set send doesn't control user memory!
> This topic is about kernel memory...
>


And that is why I asked the question in the very first mail (if this
support is going to come later).

-rohit
Re: [ckrm-tech] [RFC][PATCH] UBC: user resource beancounters [message #5502 is a reply to message #5494] Tue, 22 August 2006 01:45 Go to previous messageGo to next message
Rohit Seth is currently offline  Rohit Seth
Messages: 101
Registered: August 2006
Senior Member
On Mon, 2006-08-21 at 14:45 -0700, Chandra Seetharaman wrote:
> On Mon, 2006-08-21 at 17:24 +0400, Kirill Korotaev wrote:
> > Chandra Seetharaman wrote:
> > > Kirill,
> > >
> > > Here are some concerns I have (as of now) w.r.t using UBC for resource
> > > management (in the context of resource groups).
> > >
> > > - guarantee support is missing. I do not see any code to provide the
> > > minimum amount of resource a group can get. It is important for
> > > providing QoS. (In a different email you did mention guarantee, i am
> > > referring it here for completeness).
> > I mentioned a couple of times that this is a limited core functionality
> > in this patch set.
> > guarantees are implementable as a separate UBC parameters.
>
> I will wait for oomguarpages patches :)
>
> >
> > > - Creation of a UBC and assignment of task to a UBC always happen in
> > > the context of the task that is affected. I can understand it works in
> > > OpenVZ environment, but IMO has issues if one wants it to be used for
> > > basic resource management
> > > - application needs to be changed to use this feature.
> > > - System administrator does not have the control to assign tasks to a
> > > UBC. Application does by itself.
> > > - Assignment of task to a UBC need to be transparent to the
> > > application.

I agree with the above points. Just want to add that assignment of a
task to a container may not be transparent to the application. For
example it may hit some limits and some reclaim may happen...

> > this is not 100% true.
> > UBC itself doesn't prevent from changing context on the fly.
> > But since this leads to part of resources to be charged to
> > one UBC and another part to another UBC and so long and so
>
> Let the controllers and the users worry about that part.
>

I think as the tasks move around, it becomes very heavy to move all the
pages belonging to previous container to a new container.

> As I mentioned UBC might be perfect for container resource management,
> but what I am talking for is resource management _without_ a container.
>

Can you explain that part a bit more?

> >
> > > - No ability to maintain resource specific data in the controller.
> > it's false. fields can be added to user_beancounter struct easily.
> > and that's what our controllers do.
>
> With the model of static array for resources (struct ubparm ub_parms
> [UB_RESOURCES] in struct user_beancounter), it is not be possible to
> attach _different_ "controller specific" information to each of the
> entries.
>
> I do not think it is good idea to add controller specific information of
> _different_ controllers to the user_beancounter. Think of all the fields
> it will have when all the numproc controller needs is just the basic 3-4
> fields.
>

IMO it is okay to add the fields whenever necessary as Kirill
suggested. I think once the container subject gets baked a little more,
the controllers will also get tightly coupled.

> >
> > > - No ability to get the list of tasks belonging to a UBC.
> > it is not true. it can be read from /proc or system calls interface,
> > just like the way one finds all tasks belonging to one user :)
> >
> > BTW, what is so valueable in this feature?
>
> Again, it may not be useful for container type usages (you can probably
> get the list from somewhere else, but for resource management it is
> useful for sysadmins).
>

I'm also debating about whether printing task information is really any
useful. If a sysadm wants to get information about any particular task
then that can come from /proc/<pid>/container Though container list
will be one place where one can easily get the list of all the contained
tasks (and other resources like files).


> >
> > > - For a system administrator name for identification of a UBC is
> > > better than a number (uid).
> > Have you any problems with pids, uids, gids and signals?
>
> Again, in container land each UB is attached with a container hence no
> issue.
>
> In a non-container situation IMO it will be easier to manage/associate
> "gold", "silver", "bronze", "plastic" groups than 0, 11, 83 and 113.
>
>
> > It is a question of interface. I don't mind in changing UBC interface even
> > to configfs if someone really wants it.
> >

Yes please. Thanks.
-rohit
Re: [ckrm-tech] [PATCH 4/7] UBC: syscalls (user interface) [message #5504 is a reply to message #5500] Tue, 22 August 2006 03:58 Go to previous messageGo to next message
Magnus Damm is currently offline  Magnus Damm
Messages: 8
Registered: August 2006
Junior Member
On Mon, 2006-08-21 at 18:16 -0700, Rohit Seth wrote:
> On Mon, 2006-08-21 at 11:47 +0900, Magnus Damm wrote:
> > On Fri, 2006-08-18 at 07:45 -0700, Dave Hansen wrote:
> > > On Fri, 2006-08-18 at 12:08 +0400, Andrey Savochkin wrote:
> > > >
> > > > A) Have separate memory management for each container,
> > > > with separate buddy allocator, lru lists, page replacement mechanism.
> > > > That implies a considerable overhead, and the main challenge there
> > > > is sharing of pages between these separate memory managers.
> > >
> > > Hold on here for just a sec...
> > >
> > > It is quite possible to do memory management aimed at one container
> > > while that container's memory still participates in the main VM.
> > >
> > > There is overhead here, as the LRU scanning mechanisms get less
> > > efficient, but I'd rather pay a penalty at LRU scanning time than divide
> > > up the VM, or coarsely start failing allocations.
> >
> > This could of course be solved with one LRU per container, which is how
> > the CKRM memory controller implemented things about a year ago.
>
> Effectively Andrew's idea of faking up nodes is also giving per
> container LRUs.

Yes, but the NUMA emulation approach is using fixed size containers
where the size is selectable at the kernel command line, while the CKRM
(and pzone) approach provides a more dynamic (and complex) solution.

/ magnus
Re: [ckrm-tech] [RFC][PATCH] UBC: user resource beancounters [message #5512 is a reply to message #5502] Tue, 22 August 2006 09:42 Go to previous messageGo to next message
Alan Cox is currently offline  Alan Cox
Messages: 48
Registered: May 2006
Member
Ar Llu, 2006-08-21 am 18:45 -0700, ysgrifennodd Rohit Seth:
> I think as the tasks move around, it becomes very heavy to move all the
> pages belonging to previous container to a new container.

Its not a meaningful thing to do. Remember an object may be passed
around or shared. The simple "creator pays" model avoids all the heavy
overheads while maintaining the constraints.

Its only user space pages that some of this (AS and RSS) become
interesting as "movable" objects
Re: [ckrm-tech] [RFC][PATCH] UBC: user resource beancounters [message #5516 is a reply to message #5512] Tue, 22 August 2006 09:57 Go to previous messageGo to previous message
Arjan van de Ven is currently offline  Arjan van de Ven
Messages: 14
Registered: March 2006
Junior Member
On Tue, 2006-08-22 at 11:02 +0100, Alan Cox wrote:
> Ar Llu, 2006-08-21 am 18:45 -0700, ysgrifennodd Rohit Seth:
> > I think as the tasks move around, it becomes very heavy to move all the
> > pages belonging to previous container to a new container.
>
> Its not a meaningful thing to do. Remember an object may be passed
> around or shared. The simple "creator pays" model avoids all the heavy
> overheads while maintaining the constraints.

Hi,

there is one issue with the "creator pays" model: if the creator can
decide to die/go away/respawn then you can create orphan resources. This
is a leak at least, but if a malicious user can control the
death/respawn cycle it can even be abused to bypass the controls in the
first place. Keeping the owner alive until all shared users are gone is
not always a good idea either; if a container significantly malfunctions
(or requires a restart due to, say, a very urgent glibc security
update), keeping it around anyway is not a valid option for the admin.
(And it forms another opportunity for a malicious user, keep a
(vulnerable) container alive by hanging on to a shared resource
deliberately)

A general "unshare me out" function that finds a new to-blame owner
might work, just the decision whom to blame is not an easy one in that
scenario.

Greetings,
Arjan van de Ven


--
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Previous Topic: [PATCH] vzlist: Fix "cast from pointer to integer of different size" warnings
Next Topic: [RFC][PATCH 1/2] add user namespace [try #2]
Goto Forum:
  


Current Time: Sat Oct 25 17:10:16 GMT 2025

Total time taken to generate the page: 0.07622 seconds