| Home » Mailing lists » Devel » [PATCH v3 04/13] kmem accounting basic infrastructure Goto Forum:
	| 
		
			| [PATCH v3 04/13] kmem accounting basic infrastructure [message #47884] | Tue, 18 September 2012 14:04  |  
			| 
				
				
					|  Glauber Costa Messages: 916
 Registered: October 2011
 | Senior Member |  |  |  
	| This patch adds the basic infrastructure for the accounting of the slab caches. To control that, the following files are created:
 
 * memory.kmem.usage_in_bytes
 * memory.kmem.limit_in_bytes
 * memory.kmem.failcnt
 * memory.kmem.max_usage_in_bytes
 
 They have the same meaning of their user memory counterparts. They
 reflect the state of the "kmem" res_counter.
 
 The code is not enabled until a limit is set. This can be tested by the
 flag "kmem_accounted". This means that after the patch is applied, no
 behavioral changes exists for whoever is still using memcg to control
 their memory usage.
 
 We always account to both user and kernel resource_counters. This
 effectively means that an independent kernel limit is in place when the
 limit is set to a lower value than the user memory. A equal or higher
 value means that the user limit will always hit first, meaning that kmem
 is effectively unlimited.
 
 People who want to track kernel memory but not limit it, can set this
 limit to a very high number (like RESOURCE_MAX - 1page - that no one
 will ever hit, or equal to the user memory)
 
 Signed-off-by: Glauber Costa <glommer@parallels.com>
 CC: Michal Hocko <mhocko@suse.cz>
 CC: Johannes Weiner <hannes@cmpxchg.org>
 Acked-by: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
 ---
 mm/memcontrol.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 1 deletion(-)
 
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index d6ad138..f3fd354 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
 @@ -265,6 +265,10 @@ struct mem_cgroup {
 };
 
 /*
 +	 * the counter to account for kernel memory usage.
 +	 */
 +	struct res_counter kmem;
 +	/*
 * Per cgroup active and inactive list, similar to the
 * per zone LRU lists.
 */
 @@ -279,6 +283,7 @@ struct mem_cgroup {
 * Should the accounting and control be hierarchical, per subtree?
 */
 bool use_hierarchy;
 +	bool kmem_accounted;
 
 bool		oom_lock;
 atomic_t	under_oom;
 @@ -389,6 +394,7 @@ enum res_type {
 _MEM,
 _MEMSWAP,
 _OOM_TYPE,
 +	_KMEM,
 };
 
 #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
 @@ -1439,6 +1445,10 @@ done:
 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
 +	printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n",
 +		res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
 +		res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
 +		res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
 }
 
 /*
 @@ -3946,6 +3956,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
 else
 val = res_counter_read_u64(&memcg->memsw, name);
 break;
 +	case _KMEM:
 +		val = res_counter_read_u64(&memcg->kmem, name);
 +		break;
 default:
 BUG();
 }
 @@ -3984,8 +3997,18 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 break;
 if (type == _MEM)
 ret = mem_cgroup_resize_limit(memcg, val);
 -		else
 +		else if (type == _MEMSWAP)
 ret = mem_cgroup_resize_memsw_limit(memcg, val);
 +		else if (type == _KMEM) {
 +			ret = res_counter_set_limit(&memcg->kmem, val);
 +			if (ret)
 +				break;
 +
 +			/* For simplicity, we won't allow this to be disabled */
 +			if (!memcg->kmem_accounted && val != RESOURCE_MAX)
 +				memcg->kmem_accounted = true;
 +		} else
 +			return -EINVAL;
 break;
 case RES_SOFT_LIMIT:
 ret = res_counter_memparse_write_strategy(buffer, &val);
 @@ -4051,12 +4074,16 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 case RES_MAX_USAGE:
 if (type == _MEM)
 res_counter_reset_max(&memcg->res);
 +		else if (type == _KMEM)
 +			res_counter_reset_max(&memcg->kmem);
 else
 res_counter_reset_max(&memcg->memsw);
 break;
 case RES_FAILCNT:
 if (type == _MEM)
 res_counter_reset_failcnt(&memcg->res);
 +		else if (type == _KMEM)
 +			res_counter_reset_failcnt(&memcg->kmem);
 else
 res_counter_reset_failcnt(&memcg->memsw);
 break;
 @@ -4618,6 +4645,33 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 }
 
 #ifdef CONFIG_MEMCG_KMEM
 +static struct cftype kmem_cgroup_files[] = {
 +	{
 +		.name = "kmem.limit_in_bytes",
 +		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
 +		.write_string = mem_cgroup_write,
 +		.read = mem_cgroup_read,
 +	},
 +	{
 +		.name = "kmem.usage_in_bytes",
 +		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
 +		.read = mem_cgroup_read,
 +	},
 +	{
 +		.name = "kmem.failcnt",
 +		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
 +		.trigger = mem_cgroup_reset,
 +		.read = mem_cgroup_read,
 +	},
 +	{
 +		.name = "kmem.max_usage_in_bytes",
 +		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
 +		.trigger = mem_cgroup_reset,
 +		.read = mem_cgroup_read,
 +	},
 +	{},
 +};
 +
 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
 return mem_cgroup_sockets_init(memcg, ss);
 @@ -4961,6 +5015,12 @@ mem_cgroup_create(struct cgroup *cont)
 int cpu;
 enable_swap_cgroup();
 parent = NULL;
 +
 +#ifdef CONFIG_MEMCG_KMEM
 +		WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys,
 +					   kmem_cgroup_files));
 +#endif
 +
 if (mem_cgroup_soft_limit_tree_init())
 goto free_out;
 root_mem_cgroup = memcg;
 @@ -4979,6 +5039,7 @@ mem_cgroup_create(struct cgroup *cont)
 if (parent && parent->use_hierarchy) {
 res_counter_init(&memcg->res, &parent->res);
 res_counter_init(&memcg->memsw, &parent->memsw);
 +		res_counter_init(&memcg->kmem, &parent->kmem);
 /*
 * We increment refcnt of the parent to ensure that we can
 * safely access it on res_counter_charge/uncharge.
 @@ -4989,6 +5050,7 @@ mem_cgroup_create(struct cgroup *cont)
 } else {
 res_counter_init(&memcg->res, NULL);
 res_counter_init(&memcg->memsw, NULL);
 +		res_counter_init(&memcg->kmem, NULL);
 }
 memcg->last_scanned_node = MAX_NUMNODES;
 INIT_LIST_HEAD(&memcg->oom_notify);
 --
 1.7.11.4
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #47977 is a reply to message #47884] | Fri, 21 September 2012 16:34   |  
			| 
				
				
					|  Tejun Heo Messages: 184
 Registered: November 2006
 | Senior Member |  |  |  
	| On Tue, Sep 18, 2012 at 06:04:01PM +0400, Glauber Costa wrote: >  #ifdef CONFIG_MEMCG_KMEM
 > +static struct cftype kmem_cgroup_files[] = {
 > +	{
 > +		.name = "kmem.limit_in_bytes",
 > +		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
 > +		.write_string = mem_cgroup_write,
 > +		.read = mem_cgroup_read,
 > +	},
 > +	{
 > +		.name = "kmem.usage_in_bytes",
 > +		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
 > +		.read = mem_cgroup_read,
 > +	},
 > +	{
 > +		.name = "kmem.failcnt",
 > +		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
 > +		.trigger = mem_cgroup_reset,
 > +		.read = mem_cgroup_read,
 > +	},
 > +	{
 > +		.name = "kmem.max_usage_in_bytes",
 > +		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
 > +		.trigger = mem_cgroup_reset,
 > +		.read = mem_cgroup_read,
 > +	},
 > +	{},
 > +};
 > +
 >  static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 >  {
 >  	return mem_cgroup_sockets_init(memcg, ss);
 > @@ -4961,6 +5015,12 @@ mem_cgroup_create(struct cgroup *cont)
 >  		int cpu;
 >  		enable_swap_cgroup();
 >  		parent = NULL;
 > +
 > +#ifdef CONFIG_MEMCG_KMEM
 > +		WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys,
 > +					   kmem_cgroup_files));
 > +#endif
 > +
 
 Why not just make it part of mem_cgroup_files[]?
 
 Thanks.
 
 --
 tejun
 |  
	|  |  |  
	|  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48051 is a reply to message #47884] | Wed, 26 September 2012 14:03   |  
			| 
				
				
					|  Michal Hocko Messages: 109
 Registered: December 2011
 | Senior Member |  |  |  
	| On Tue 18-09-12 18:04:01, Glauber Costa wrote: > This patch adds the basic infrastructure for the accounting of the slab
 > caches. To control that, the following files are created:
 >
 >  * memory.kmem.usage_in_bytes
 >  * memory.kmem.limit_in_bytes
 >  * memory.kmem.failcnt
 >  * memory.kmem.max_usage_in_bytes
 >
 > They have the same meaning of their user memory counterparts. They
 > reflect the state of the "kmem" res_counter.
 
 > The code is not enabled until a limit is set.
 
 "Per cgroup slab memory accounting is not enabled until a limit is set
 for the group. Once the limit is set the accounting cannot be disabled
 such a group."
 
 Better?
 
 > This can be tested by the flag "kmem_accounted".
 
 Sounds as if it could be done from userspace (because you were talking
 about an user interface) which it cannot and we do not see it in this
 patch because it is not used anywhere. So please be more specific.
 
 > This means that after the patch is applied, no behavioral changes
 > exists for whoever is still using memcg to control their memory usage.
 >
 > We always account to both user and kernel resource_counters.
 
 This is in contradiction with your claim that there is no behavioral
 change for memcg users. Please clarify when we use u and when u+k
 accounting.
 "
 There is no behavioral change if the kmem accounting is turned off for
 memcg users but when there is a kmem.limit_in_bytes is set then the
 memory.usage_in_bytes will include both user and kmem memory.
 "
 
 > This
 > effectively means that an independent kernel limit is in place when the
 > limit is set to a lower value than the user memory. A equal or higher
 > value means that the user limit will always hit first, meaning that kmem
 > is effectively unlimited.
 >
 > People who want to track kernel memory but not limit it, can set this
 > limit to a very high number (like RESOURCE_MAX - 1page - that no one
 > will ever hit, or equal to the user memory)
 >
 > Signed-off-by: Glauber Costa <glommer@parallels.com>
 > CC: Michal Hocko <mhocko@suse.cz>
 > CC: Johannes Weiner <hannes@cmpxchg.org>
 > Acked-by: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
 > ---
 >  mm/memcontrol.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 >  1 file changed, 63 insertions(+), 1 deletion(-)
 >
 > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 > index d6ad138..f3fd354 100644
 > --- a/mm/memcontrol.c
 > +++ b/mm/memcontrol.c
 > @@ -265,6 +265,10 @@ struct mem_cgroup {
 >  	};
 >
 >  	/*
 > +	 * the counter to account for kernel memory usage.
 > +	 */
 > +	struct res_counter kmem;
 > +	/*
 >  	 * Per cgroup active and inactive list, similar to the
 >  	 * per zone LRU lists.
 >  	 */
 > @@ -279,6 +283,7 @@ struct mem_cgroup {
 >  	 * Should the accounting and control be hierarchical, per subtree?
 >  	 */
 >  	bool use_hierarchy;
 > +	bool kmem_accounted;
 >
 >  	bool		oom_lock;
 >  	atomic_t	under_oom;
 > @@ -389,6 +394,7 @@ enum res_type {
 >  	_MEM,
 >  	_MEMSWAP,
 >  	_OOM_TYPE,
 > +	_KMEM,
 >  };
 >
 >  #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
 > @@ -1439,6 +1445,10 @@ done:
 >  		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
 >  		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
 >  		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
 > +	printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n",
 > +		res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
 > +		res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
 > +		res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
 >  }
 >
 >  /*
 > @@ -3946,6 +3956,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
 >  		else
 >  			val = res_counter_read_u64(&memcg->memsw, name);
 >  		break;
 > +	case _KMEM:
 > +		val = res_counter_read_u64(&memcg->kmem, name);
 > +		break;
 >  	default:
 >  		BUG();
 >  	}
 > @@ -3984,8 +3997,18 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 >  			break;
 >  		if (type == _MEM)
 >  			ret = mem_cgroup_resize_limit(memcg, val);
 > -		else
 > +		else if (type == _MEMSWAP)
 >  			ret = mem_cgroup_resize_memsw_limit(memcg, val);
 > +		else if (type == _KMEM) {
 > +			ret = res_counter_set_limit(&memcg->kmem, val);
 > +			if (ret)
 > +				break;
 > +
 > +			/* For simplicity, we won't allow this to be disabled */
 > +			if (!memcg->kmem_accounted && val != RESOURCE_MAX)
 > +				memcg->kmem_accounted = true;
 > +		} else
 > +			return -EINVAL;
 >  		break;
 >  	case RES_SOFT_LIMIT:
 >  		ret = res_counter_memparse_write_strategy(buffer, &val);
 > @@ -4051,12 +4074,16 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 >  	case RES_MAX_USAGE:
 >  		if (type == _MEM)
 >  			res_counter_reset_max(&memcg->res);
 > +		else if (type == _KMEM)
 > +			res_counter_reset_max(&memcg->kmem);
 >  		else
 >  			res_counter_reset_max(&memcg->memsw);
 >  		break;
 >  	case RES_FAILCNT:
 >  		if (type == _MEM)
 >  			res_counter_reset_failcnt(&memcg->res);
 > +		else if (type == _KMEM)
 > +			res_counter_reset_failcnt(&memcg->kmem);
 >  		else
 >  			res_counter_reset_failcnt(&memcg->memsw);
 >  		break;
 > @@ -4618,6 +4645,33 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 >  }
 >
 >  #ifdef CONFIG_MEMCG_KMEM
 
 Some things are guarded CONFIG_MEMCG_KMEM but some are not (e.g. struct
 mem_cgroup.kmem). I do understand you want to keep ifdefs on the leash
 but we should clean this up one day.
 
 > +static struct cftype kmem_cgroup_files[] = {
 > +	{
 > +		.name = "kmem.limit_in_bytes",
 > +		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
 > +		.write_string = mem_cgroup_write,
 > +		.read = mem_cgroup_read,
 > +	},
 > +	{
 > +		.name = "kmem.usage_in_bytes",
 > +		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
 > +		.read = mem_cgroup_read,
 > +	},
 > +	{
 > +		.name = "kmem.failcnt",
 > +		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
 > +		.trigger = mem_cgroup_reset,
 > +		.read = mem_cgroup_read,
 > +	},
 > +	{
 > +		.name = "kmem.max_usage_in_bytes",
 > +		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
 > +		.trigger = mem_cgroup_reset,
 > +		.read = mem_cgroup_read,
 > +	},
 > +	{},
 > +};
 > +
 >  static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 >  {
 >  	return mem_cgroup_sockets_init(memcg, ss);
 > @@ -4961,6 +5015,12 @@ mem_cgroup_create(struct cgroup *cont)
 >  		int cpu;
 >  		enable_swap_cgroup();
 >  		parent = NULL;
 > +
 > +#ifdef CONFIG_MEMCG_KMEM
 > +		WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys,
 > +					   kmem_cgroup_files));
 > +#endif
 > +
 >  		if (mem_cgroup_soft_limit_tree_init())
 >  			goto free_out;
 >  		root_mem_cgroup = memcg;
 > @@ -4979,6 +5039,7 @@ mem_cgroup_create(struct cgroup *cont)
 >  	if (parent && parent->use_hierarchy) {
 >  		res_counter_init(&memcg->res, &parent->res);
 >  		res_counter_init(&memcg->memsw, &parent->memsw);
 > +		res_counter_init(&memcg->kmem, &parent->kmem);
 
 Haven't we already discussed that a new memcg should inherit kmem_accounted
 from its parent for use_hierarchy?
 Say we have
 root
 |
 A (kmem_accounted = 1, use_hierachy = 1)
 \
 B (kmem_accounted = 0)
 \
 C (kmem_accounted = 1)
 
 B find's itself in an awkward situation becuase it doesn't want to
 account u+k but it ends up doing so becuase C.
 --
 Michal Hocko
 SUSE Labs
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48052 is a reply to message #48051] | Wed, 26 September 2012 14:33   |  
			| 
				
				
					|  Glauber Costa Messages: 916
 Registered: October 2011
 | Senior Member |  |  |  
	| On 09/26/2012 06:03 PM, Michal Hocko wrote: > On Tue 18-09-12 18:04:01, Glauber Costa wrote:
 >> This patch adds the basic infrastructure for the accounting of the slab
 >> caches. To control that, the following files are created:
 >>
 >>  * memory.kmem.usage_in_bytes
 >>  * memory.kmem.limit_in_bytes
 >>  * memory.kmem.failcnt
 >>  * memory.kmem.max_usage_in_bytes
 >>
 >> They have the same meaning of their user memory counterparts. They
 >> reflect the state of the "kmem" res_counter.
 >
 >> The code is not enabled until a limit is set.
 >
 > "Per cgroup slab memory accounting is not enabled until a limit is set
 > for the group. Once the limit is set the accounting cannot be disabled
 > such a group."
 >
 > Better?
 >
 >> This can be tested by the flag "kmem_accounted".
 >
 > Sounds as if it could be done from userspace (because you were talking
 > about an user interface) which it cannot and we do not see it in this
 > patch because it is not used anywhere. So please be more specific.
 >
 >> This means that after the patch is applied, no behavioral changes
 >> exists for whoever is still using memcg to control their memory usage.
 >>
 >> We always account to both user and kernel resource_counters.
 >
 > This is in contradiction with your claim that there is no behavioral
 > change for memcg users. Please clarify when we use u and when u+k
 > accounting.
 > "
 > There is no behavioral change if the kmem accounting is turned off for
 > memcg users but when there is a kmem.limit_in_bytes is set then the
 > memory.usage_in_bytes will include both user and kmem memory.
 > "
 >
 >> This
 >> effectively means that an independent kernel limit is in place when the
 >> limit is set to a lower value than the user memory. A equal or higher
 >> value means that the user limit will always hit first, meaning that kmem
 >> is effectively unlimited.
 >>
 >> People who want to track kernel memory but not limit it, can set this
 >> limit to a very high number (like RESOURCE_MAX - 1page - that no one
 >> will ever hit, or equal to the user memory)
 >>
 >> Signed-off-by: Glauber Costa <glommer@parallels.com>
 >> CC: Michal Hocko <mhocko@suse.cz>
 >> CC: Johannes Weiner <hannes@cmpxchg.org>
 >> Acked-by: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
 >> ---
 >>  mm/memcontrol.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 >>  1 file changed, 63 insertions(+), 1 deletion(-)
 >>
 >> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 >> index d6ad138..f3fd354 100644
 >> --- a/mm/memcontrol.c
 >> +++ b/mm/memcontrol.c
 >> @@ -265,6 +265,10 @@ struct mem_cgroup {
 >>  	};
 >>
 >>  	/*
 >> +	 * the counter to account for kernel memory usage.
 >> +	 */
 >> +	struct res_counter kmem;
 >> +	/*
 >>  	 * Per cgroup active and inactive list, similar to the
 >>  	 * per zone LRU lists.
 >>  	 */
 >> @@ -279,6 +283,7 @@ struct mem_cgroup {
 >>  	 * Should the accounting and control be hierarchical, per subtree?
 >>  	 */
 >>  	bool use_hierarchy;
 >> +	bool kmem_accounted;
 >>
 >>  	bool		oom_lock;
 >>  	atomic_t	under_oom;
 >> @@ -389,6 +394,7 @@ enum res_type {
 >>  	_MEM,
 >>  	_MEMSWAP,
 >>  	_OOM_TYPE,
 >> +	_KMEM,
 >>  };
 >>
 >>  #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
 >> @@ -1439,6 +1445,10 @@ done:
 >>  		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
 >>  		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
 >>  		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
 >> +	printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n",
 >> +		res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
 >> +		res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
 >> +		res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
 >>  }
 >>
 >>  /*
 >> @@ -3946,6 +3956,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
 >>  		else
 >>  			val = res_counter_read_u64(&memcg->memsw, name);
 >>  		break;
 >> +	case _KMEM:
 >> +		val = res_counter_read_u64(&memcg->kmem, name);
 >> +		break;
 >>  	default:
 >>  		BUG();
 >>  	}
 >> @@ -3984,8 +3997,18 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 >>  			break;
 >>  		if (type == _MEM)
 >>  			ret = mem_cgroup_resize_limit(memcg, val);
 >> -		else
 >> +		else if (type == _MEMSWAP)
 >>  			ret = mem_cgroup_resize_memsw_limit(memcg, val);
 >> +		else if (type == _KMEM) {
 >> +			ret = res_counter_set_limit(&memcg->kmem, val);
 >> +			if (ret)
 >> +				break;
 >> +
 >> +			/* For simplicity, we won't allow this to be disabled */
 >> +			if (!memcg->kmem_accounted && val != RESOURCE_MAX)
 >> +				memcg->kmem_accounted = true;
 >> +		} else
 >> +			return -EINVAL;
 >>  		break;
 >>  	case RES_SOFT_LIMIT:
 >>  		ret = res_counter_memparse_write_strategy(buffer, &val);
 >> @@ -4051,12 +4074,16 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 >>  	case RES_MAX_USAGE:
 >>  		if (type == _MEM)
 >>  			res_counter_reset_max(&memcg->res);
 >> +		else if (type == _KMEM)
 >> +			res_counter_reset_max(&memcg->kmem);
 >>  		else
 >>  			res_counter_reset_max(&memcg->memsw);
 >>  		break;
 >>  	case RES_FAILCNT:
 >>  		if (type == _MEM)
 >>  			res_counter_reset_failcnt(&memcg->res);
 >> +		else if (type == _KMEM)
 >> +			res_counter_reset_failcnt(&memcg->kmem);
 >>  		else
 >>  			res_counter_reset_failcnt(&memcg->memsw);
 >>  		break;
 >> @@ -4618,6 +4645,33 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 >>  }
 >>
 >>  #ifdef CONFIG_MEMCG_KMEM
 >
 > Some things are guarded CONFIG_MEMCG_KMEM but some are not (e.g. struct
 > mem_cgroup.kmem). I do understand you want to keep ifdefs on the leash
 > but we should clean this up one day.
 >
 >> +static struct cftype kmem_cgroup_files[] = {
 >> +	{
 >> +		.name = "kmem.limit_in_bytes",
 >> +		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
 >> +		.write_string = mem_cgroup_write,
 >> +		.read = mem_cgroup_read,
 >> +	},
 >> +	{
 >> +		.name = "kmem.usage_in_bytes",
 >> +		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
 >> +		.read = mem_cgroup_read,
 >> +	},
 >> +	{
 >> +		.name = "kmem.failcnt",
 >> +		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
 >> +		.trigger = mem_cgroup_reset,
 >> +		.read = mem_cgroup_read,
 >> +	},
 >> +	{
 >> +		.name = "kmem.max_usage_in_bytes",
 >> +		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
 >> +		.trigger = mem_cgroup_reset,
 >> +		.read = mem_cgroup_read,
 >> +	},
 >> +	{},
 >> +};
 >> +
 >>  static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 >>  {
 >>  	return mem_cgroup_sockets_init(memcg, ss);
 >> @@ -4961,6 +5015,12 @@ mem_cgroup_create(struct cgroup *cont)
 >>  		int cpu;
 >>  		enable_swap_cgroup();
 >>  		parent = NULL;
 >> +
 >> +#ifdef CONFIG_MEMCG_KMEM
 >> +		WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys,
 >> +					   kmem_cgroup_files));
 >> +#endif
 >> +
 >>  		if (mem_cgroup_soft_limit_tree_init())
 >>  			goto free_out;
 >>  		root_mem_cgroup = memcg;
 >> @@ -4979,6 +5039,7 @@ mem_cgroup_create(struct cgroup *cont)
 >>  	if (parent && parent->use_hierarchy) {
 >>  		res_counter_init(&memcg->res, &parent->res);
 >>  		res_counter_init(&memcg->memsw, &parent->memsw);
 >> +		res_counter_init(&memcg->kmem, &parent->kmem);
 >
 > Haven't we already discussed that a new memcg should inherit kmem_accounted
 > from its parent for use_hierarchy?
 > Say we have
 > root
 > |
 > A (kmem_accounted = 1, use_hierachy = 1)
 >  \
 >   B (kmem_accounted = 0)
 >    \
 >     C (kmem_accounted = 1)
 >
 > B find's itself in an awkward situation becuase it doesn't want to
 > account u+k but it ends up doing so becuase C.
 >
 
 Ok, I haven't updated it here. But that should be taken care of in the
 lifecycle patch.
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48055 is a reply to message #48052] | Wed, 26 September 2012 16:01   |  
			| 
				
				
					|  Michal Hocko Messages: 109
 Registered: December 2011
 | Senior Member |  |  |  
	| On Wed 26-09-12 18:33:10, Glauber Costa wrote: > On 09/26/2012 06:03 PM, Michal Hocko wrote:
 > > On Tue 18-09-12 18:04:01, Glauber Costa wrote:
 [...]
 > >> @@ -4961,6 +5015,12 @@ mem_cgroup_create(struct cgroup *cont)
 > >>  		int cpu;
 > >>  		enable_swap_cgroup();
 > >>  		parent = NULL;
 > >> +
 > >> +#ifdef CONFIG_MEMCG_KMEM
 > >> +		WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys,
 > >> +					   kmem_cgroup_files));
 > >> +#endif
 > >> +
 > >>  		if (mem_cgroup_soft_limit_tree_init())
 > >>  			goto free_out;
 > >>  		root_mem_cgroup = memcg;
 > >> @@ -4979,6 +5039,7 @@ mem_cgroup_create(struct cgroup *cont)
 > >>  	if (parent && parent->use_hierarchy) {
 > >>  		res_counter_init(&memcg->res, &parent->res);
 > >>  		res_counter_init(&memcg->memsw, &parent->memsw);
 > >> +		res_counter_init(&memcg->kmem, &parent->kmem);
 > >
 > > Haven't we already discussed that a new memcg should inherit kmem_accounted
 > > from its parent for use_hierarchy?
 > > Say we have
 > > root
 > > |
 > > A (kmem_accounted = 1, use_hierachy = 1)
 > >  \
 > >   B (kmem_accounted = 0)
 > >    \
 > >     C (kmem_accounted = 1)
 > >
 > > B find's itself in an awkward situation becuase it doesn't want to
 > > account u+k but it ends up doing so becuase C.
 > >
 >
 > Ok, I haven't updated it here. But that should be taken care of in the
 > lifecycle patch.
 
 I am not sure which patch you are thinking about but I would prefer to
 have it here because it is safe wrt. races and it is more obvious as
 well.
 
 --
 Michal Hocko
 SUSE Labs
 |  
	|  |  |  
	|  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48060 is a reply to message #48055] | Wed, 26 September 2012 17:34   |  
			| 
				
				
					|  Glauber Costa Messages: 916
 Registered: October 2011
 | Senior Member |  |  |  
	| On 09/26/2012 08:01 PM, Michal Hocko wrote: > On Wed 26-09-12 18:33:10, Glauber Costa wrote:
 >> On 09/26/2012 06:03 PM, Michal Hocko wrote:
 >>> On Tue 18-09-12 18:04:01, Glauber Costa wrote:
 > [...]
 >>>> @@ -4961,6 +5015,12 @@ mem_cgroup_create(struct cgroup *cont)
 >>>>  		int cpu;
 >>>>  		enable_swap_cgroup();
 >>>>  		parent = NULL;
 >>>> +
 >>>> +#ifdef CONFIG_MEMCG_KMEM
 >>>> +		WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys,
 >>>> +					   kmem_cgroup_files));
 >>>> +#endif
 >>>> +
 >>>>  		if (mem_cgroup_soft_limit_tree_init())
 >>>>  			goto free_out;
 >>>>  		root_mem_cgroup = memcg;
 >>>> @@ -4979,6 +5039,7 @@ mem_cgroup_create(struct cgroup *cont)
 >>>>  	if (parent && parent->use_hierarchy) {
 >>>>  		res_counter_init(&memcg->res, &parent->res);
 >>>>  		res_counter_init(&memcg->memsw, &parent->memsw);
 >>>> +		res_counter_init(&memcg->kmem, &parent->kmem);
 >>>
 >>> Haven't we already discussed that a new memcg should inherit kmem_accounted
 >>> from its parent for use_hierarchy?
 >>> Say we have
 >>> root
 >>> |
 >>> A (kmem_accounted = 1, use_hierachy = 1)
 >>>  \
 >>>   B (kmem_accounted = 0)
 >>>    \
 >>>     C (kmem_accounted = 1)
 >>>
 >>> B find's itself in an awkward situation becuase it doesn't want to
 >>> account u+k but it ends up doing so becuase C.
 >>>
 >>
 >> Ok, I haven't updated it here. But that should be taken care of in the
 >> lifecycle patch.
 >
 > I am not sure which patch you are thinking about but I would prefer to
 > have it here because it is safe wrt. races and it is more obvious as
 > well.
 >
 
 The patch where I make kmem_accounted into a bitfield. So any code here
 will eventually disappear.
 
 But BTW, I am not saying I won't update the patch - I like that all
 patches work and make sense in their own, I am just saying that I forgot
 to update this patch, because I added the code in its final version to
 the end and then squashed it.
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48061 is a reply to message #48057] | Wed, 26 September 2012 17:36   |  
			| 
				
				
					|  Glauber Costa Messages: 916
 Registered: October 2011
 | Senior Member |  |  |  
	| On 09/26/2012 08:36 PM, Tejun Heo wrote: > Hello, Michal, Glauber.
 >
 > On Wed, Sep 26, 2012 at 04:03:47PM +0200, Michal Hocko wrote:
 >> Haven't we already discussed that a new memcg should inherit kmem_accounted
 >> from its parent for use_hierarchy?
 >> Say we have
 >> root
 >> |
 >> A (kmem_accounted = 1, use_hierachy = 1)
 >>  \
 >>   B (kmem_accounted = 0)
 >>    \
 >>     C (kmem_accounted = 1)
 >>
 >> B find's itself in an awkward situation becuase it doesn't want to
 >> account u+k but it ends up doing so becuase C.
 >
 > Do we really want this level of flexibility?  What's wrong with a
 > global switch at the root?  I'm not even sure we want this to be
 > optional at all.  The only reason I can think of is that it might
 > screw up some configurations in use which are carefully crafted to
 > suit userland-only usage but for that isn't what we need a transition
 > plan rather than another ultra flexible config option that not many
 > really understand the implication of?
 >
 > In the same vein, do we really need both .kmem_accounted and config
 > option?  If someone is turning on MEMCG, just include kmem accounting.
 >
 
 Yes, we do.
 
 This was discussed multiple times. Our interest is to preserve existing
 deployed setup, that were tuned in a world where kmem didn't exist.
 Because we also feed kmem to the user counter, this may very well
 disrupt their setup.
 
 User memory, unlike kernel memory, may very well be totally in control
 of the userspace application, so it is not unreasonable to believe that
 extra pages appearing in a new kernel version may break them.
 
 It is actually a much worse compatibility problem than flipping
 hierarchy, in comparison
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48062 is a reply to message #48061] | Wed, 26 September 2012 17:44   |  
			| 
				
				
					|  Tejun Heo Messages: 184
 Registered: November 2006
 | Senior Member |  |  |  
	| Hello, Glauber. 
 On Wed, Sep 26, 2012 at 10:36 AM, Glauber Costa <glommer@parallels.com> wrote:
 > This was discussed multiple times. Our interest is to preserve existing
 > deployed setup, that were tuned in a world where kmem didn't exist.
 > Because we also feed kmem to the user counter, this may very well
 > disrupt their setup.
 
 So, that can be served by .kmem_accounted at root, no?
 
 > User memory, unlike kernel memory, may very well be totally in control
 > of the userspace application, so it is not unreasonable to believe that
 > extra pages appearing in a new kernel version may break them.
 >
 > It is actually a much worse compatibility problem than flipping
 > hierarchy, in comparison
 
 Again, what's wrong with one switch at the root?
 
 Thanks.
 
 --
 tejun
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48064 is a reply to message #48062] | Wed, 26 September 2012 17:53   |  
			| 
				
				
					|  Glauber Costa Messages: 916
 Registered: October 2011
 | Senior Member |  |  |  
	| On 09/26/2012 09:44 PM, Tejun Heo wrote: > Hello, Glauber.
 >
 > On Wed, Sep 26, 2012 at 10:36 AM, Glauber Costa <glommer@parallels.com> wrote:
 >> This was discussed multiple times. Our interest is to preserve existing
 >> deployed setup, that were tuned in a world where kmem didn't exist.
 >> Because we also feed kmem to the user counter, this may very well
 >> disrupt their setup.
 >
 > So, that can be served by .kmem_accounted at root, no?
 >
 >> User memory, unlike kernel memory, may very well be totally in control
 >> of the userspace application, so it is not unreasonable to believe that
 >> extra pages appearing in a new kernel version may break them.
 >>
 >> It is actually a much worse compatibility problem than flipping
 >> hierarchy, in comparison
 >
 > Again, what's wrong with one switch at the root?
 >
 
 I understand your trauma about over flexibility, and you know I share of
 it. But I don't think there is any need to cap it here. Given kmem
 accounted is perfectly hierarchical, and there seem to be plenty of
 people who only care about user memory, I see no reason to disallow a
 mixed use case here.
 
 I must say that for my particular use case, enabling it unconditionally
 would just work, so it is not that what I have in mind.
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48065 is a reply to message #48064] | Wed, 26 September 2012 18:01   |  
			| 
				
				
					|  Tejun Heo Messages: 184
 Registered: November 2006
 | Senior Member |  |  |  
	| Hello, 
 On Wed, Sep 26, 2012 at 09:53:09PM +0400, Glauber Costa wrote:
 > I understand your trauma about over flexibility, and you know I share of
 > it. But I don't think there is any need to cap it here. Given kmem
 > accounted is perfectly hierarchical, and there seem to be plenty of
 > people who only care about user memory, I see no reason to disallow a
 > mixed use case here.
 >
 > I must say that for my particular use case, enabling it unconditionally
 > would just work, so it is not that what I have in mind.
 
 So, I'm not gonna go as far as pushing for enabling it unconditionally
 but would really like to hear why it's necessary to make it per node
 instead of one global switch.  Maybe it has already been discussed to
 hell and back.  Care to summarize / point me to it?
 
 Thanks.
 
 --
 tejun
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48066 is a reply to message #48065] | Wed, 26 September 2012 18:56   |  
			| 
				
				
					|  Glauber Costa Messages: 916
 Registered: October 2011
 | Senior Member |  |  |  
	| On 09/26/2012 10:01 PM, Tejun Heo wrote: > Hello,
 >
 > On Wed, Sep 26, 2012 at 09:53:09PM +0400, Glauber Costa wrote:
 >> I understand your trauma about over flexibility, and you know I share of
 >> it. But I don't think there is any need to cap it here. Given kmem
 >> accounted is perfectly hierarchical, and there seem to be plenty of
 >> people who only care about user memory, I see no reason to disallow a
 >> mixed use case here.
 >>
 >> I must say that for my particular use case, enabling it unconditionally
 >> would just work, so it is not that what I have in mind.
 >
 > So, I'm not gonna go as far as pushing for enabling it unconditionally
 > but would really like to hear why it's necessary to make it per node
 > instead of one global switch.  Maybe it has already been discussed to
 > hell and back.  Care to summarize / point me to it?
 >
 
 For me, it is the other way around: it makes perfect sense to have a
 per-subtree selection of features where it doesn't hurt us, provided it
 is hierarchical. For the mere fact that not every application is
 interested in this (Michal is the one that is being so far more vocal
 about this not being needed in some use cases), and it is perfectly
 valid to imagine such applications would coexist.
 
 So given the flexibility it brings, the real question is, as I said,
 backwards: what is it necessary to make it a global switch ?
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48069 is a reply to message #48066] | Wed, 26 September 2012 19:34   |  
			| 
				
				
					|  Tejun Heo Messages: 184
 Registered: November 2006
 | Senior Member |  |  |  
	| Hello, 
 On Wed, Sep 26, 2012 at 10:56:09PM +0400, Glauber Costa wrote:
 > For me, it is the other way around: it makes perfect sense to have a
 > per-subtree selection of features where it doesn't hurt us, provided it
 > is hierarchical. For the mere fact that not every application is
 > interested in this (Michal is the one that is being so far more vocal
 > about this not being needed in some use cases), and it is perfectly
 > valid to imagine such applications would coexist.
 >
 > So given the flexibility it brings, the real question is, as I said,
 > backwards: what is it necessary to make it a global switch ?
 
 Because it hurts my head and it's better to keep things simple.  We're
 planning to retire .use_hierarhcy in sub hierarchies and I'd really
 like to prevent another fiasco like that unless there absolutely is no
 way around it.  Flexibility where necessary is fine but let's please
 try our best to avoid over-designing things.  We've been far too good
 at getting lost in flexbility maze.  Michal, care to chime in?
 
 Thanks.
 
 --
 tejun
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48071 is a reply to message #48069] | Wed, 26 September 2012 19:46   |  
			| 
				
				
					|  Glauber Costa Messages: 916
 Registered: October 2011
 | Senior Member |  |  |  
	| On 09/26/2012 11:34 PM, Tejun Heo wrote: > Hello,
 >
 > On Wed, Sep 26, 2012 at 10:56:09PM +0400, Glauber Costa wrote:
 >> For me, it is the other way around: it makes perfect sense to have a
 >> per-subtree selection of features where it doesn't hurt us, provided it
 >> is hierarchical. For the mere fact that not every application is
 >> interested in this (Michal is the one that is being so far more vocal
 >> about this not being needed in some use cases), and it is perfectly
 >> valid to imagine such applications would coexist.
 >>
 >> So given the flexibility it brings, the real question is, as I said,
 >> backwards: what is it necessary to make it a global switch ?
 >
 > Because it hurts my head and it's better to keep things simple.  We're
 > planning to retire .use_hierarhcy in sub hierarchies and I'd really
 > like to prevent another fiasco like that unless there absolutely is no
 > way around it.  Flexibility where necessary is fine but let's please
 > try our best to avoid over-designing things.  We've been far too good
 > at getting lost in flexbility maze.  Michal, care to chime in?
 >
 
 I would very much like to hear Michal here as well, sure.
 
 But as I said in this very beginning of this, you pretty much know that
 I am heavily involved in trying to get rid of use_hierarchy, and by no
 means I consider this en pair with that.
 
 use_hierarchy is a hack around a core property of cgroups, the fact that
 they are hierarchical. Its mere existence came to be to overcome a
 performance limitation.
 
 It puts you in contradictory situation where you have cgroups organized
 as directories, and then not satisfied in making this hierarchical
 representation be gravely ignored, forces you to use nonsensical terms
 like "flat hierarchy", making us grasp at how it is to be a politician
 once in our lifetimes.
 
 Besides not being part of cgroup core, and respecting very much both
 cgroups' and basic sanity properties, kmem is an actual feature that
 some people want, and some people don't. There is no reason to believe
 that applications that want will live in the same environment with ones
 that don't want.
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48072 is a reply to message #48071] | Wed, 26 September 2012 19:56   |  
			| 
				
				
					|  Tejun Heo Messages: 184
 Registered: November 2006
 | Senior Member |  |  |  
	| Hello, 
 On Wed, Sep 26, 2012 at 11:46:37PM +0400, Glauber Costa wrote:
 > Besides not being part of cgroup core, and respecting very much both
 > cgroups' and basic sanity properties, kmem is an actual feature that
 > some people want, and some people don't. There is no reason to believe
 > that applications that want will live in the same environment with ones
 > that don't want.
 
 I don't know.  It definitely is less crazy than .use_hierarchy but I
 wouldn't say it's an inherently different thing.  I mean, what does it
 even mean to have u+k limit on one subtree and not on another branch?
 And we worry about things like what if parent doesn't enable it but
 its chlidren do.
 
 This is a feature which adds complexity.  If the feature is necessary
 and justified, sure.  If not, let's please not and let's err on the
 side of conservativeness.  We can always add it later but the other
 direction is much harder.
 
 Thanks.
 
 --
 tejun
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48073 is a reply to message #48072] | Wed, 26 September 2012 20:02   |  
			| 
				
				
					|  Glauber Costa Messages: 916
 Registered: October 2011
 | Senior Member |  |  |  
	| On 09/26/2012 11:56 PM, Tejun Heo wrote: > Hello,
 >
 > On Wed, Sep 26, 2012 at 11:46:37PM +0400, Glauber Costa wrote:
 >> Besides not being part of cgroup core, and respecting very much both
 >> cgroups' and basic sanity properties, kmem is an actual feature that
 >> some people want, and some people don't. There is no reason to believe
 >> that applications that want will live in the same environment with ones
 >> that don't want.
 >
 > I don't know.  It definitely is less crazy than .use_hierarchy but I
 > wouldn't say it's an inherently different thing.  I mean, what does it
 > even mean to have u+k limit on one subtree and not on another branch?
 > And we worry about things like what if parent doesn't enable it but
 > its chlidren do.
 >
 
 It is inherently different. To begin with, it actually contemplates two
 use cases. It is not a work around.
 
 The meaning is also very well defined. The meaning of having this
 enabled in one subtree and not in other is: Subtree A wants to track
 kernel memory. Subtree B does not. It's that, and never more than that.
 There is no maybes and no buts, no magic knobs that makes it behave in a
 crazy way.
 
 If a children enables it but the parent does not, this does what every
 tree does: enable it from that point downwards.
 
 > This is a feature which adds complexity.  If the feature is necessary
 > and justified, sure.  If not, let's please not and let's err on the
 > side of conservativeness.  We can always add it later but the other
 > direction is much harder.
 >
 
 I disagree. Having kmem tracking adds complexity. Having to cope with
 the use case where we turn it on dynamically to cope with the "user page
 only" use case adds complexity. But I see no significant complexity
 being added by having it per subtree. Really.
 
 You have the use_hierarchy fiasco in mind, and I do understand that you
 are raising the flag and all that.
 
 But think in terms of functionality: This thing here is a lot more
 similar to swap than use_hierarchy. Would you argue that memsw should be
 per-root ?
 
 The reason why it shouldn't: Some people want to limit memory
 consumption all the way to the swap, some people don't. Same with kmem.
 |  
	|  |  |  
	|  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48075 is a reply to message #48074] | Wed, 26 September 2012 21:24   |  
			| 
				
				
					|  Glauber Costa Messages: 916
 Registered: October 2011
 | Senior Member |  |  |  
	| On 09/27/2012 12:16 AM, Tejun Heo wrote: > On Thu, Sep 27, 2012 at 12:02:14AM +0400, Glauber Costa wrote:
 >> But think in terms of functionality: This thing here is a lot more
 >> similar to swap than use_hierarchy. Would you argue that memsw should be
 >> per-root ?
 >
 > I'm fairly sure you can make about the same argument about
 > use_hierarchy.  There is a choice to make here and one is simpler than
 > the other.  I want the additional complexity justified by actual use
 > cases which isn't too much to ask for especially when the complexity
 > is something visible to userland.
 >
 > So let's please stop arguing semantics.  If this is definitely
 > necessary for some use cases, sure let's have it.  If not, let's
 > consider it later.  I'll stop responding on "inherent differences."  I
 > don't think we'll get anywhere with that.
 >
 
 If you stop responding, we are for sure not getting anywhere. I agree
 with you here.
 
 Let me point out one issue that you seem to be missing, and you respond
 or not, your call.
 
 "kmem_accounted" is not a switch. It is an internal representation only.
 The semantics, that we discussed exhaustively in San Diego, is that a
 group that is not limited is not accounted. This is simple and consistent.
 
 Since the limits are still per-cgroup, you are actually proposing more
 user-visible complexity than me, since you are adding yet another file,
 with its own semantics.
 
 About use cases, I've already responded: my containers use case is kmem
 limited. There are people like Michal that specifically asked for
 user-only semantics to be preserved. So your question for global vs
 local switch (that again, doesn't exist; only a local *limit* exists)
 should really be posed in the following way:
 "Can two different use cases with different needs be hosted in the same
 box?"
 
 
 
 > Michal, Johannes, Kamezawa, what are your thoughts?
 >
 waiting! =)
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48076 is a reply to message #48075] | Wed, 26 September 2012 22:10   |  
			| 
				
				
					|  Tejun Heo Messages: 184
 Registered: November 2006
 | Senior Member |  |  |  
	| Hello, Glauber. 
 On Thu, Sep 27, 2012 at 01:24:40AM +0400, Glauber Costa wrote:
 > "kmem_accounted" is not a switch. It is an internal representation only.
 > The semantics, that we discussed exhaustively in San Diego, is that a
 > group that is not limited is not accounted. This is simple and consistent.
 >
 > Since the limits are still per-cgroup, you are actually proposing more
 > user-visible complexity than me, since you are adding yet another file,
 > with its own semantics.
 
 I was confused.  I thought it was exposed as a switch to userland (it
 being right below .use_hierarchy tripped red alert).  This is internal
 flag dependent upon kernel limit being set.  My apologies.
 
 So, the proposed behavior is to allow enabling kmemcg anytime but
 ignore what happened inbetween?  Where the knob is changes but the
 weirdity seems all the same.  What prevents us from having a single
 switch at root which can only be flipped when there's no children?
 
 Backward compatibility is covered with single switch and I really
 don't think "you can enable limits for kernel memory anytime but we
 don't keep track of whatever happened before it was flipped the first
 time because the first time is always special" is a sane thing to
 expose to userland.  Or am I misunderstanding the proposed behavior
 again?
 
 Thanks.
 
 --
 tejun
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48077 is a reply to message #48073] | Wed, 26 September 2012 22:11   |  
			| 
				
				
					|  Johannes Weiner Messages: 9
 Registered: November 2010
 | Junior Member |  |  |  
	| On Thu, Sep 27, 2012 at 12:02:14AM +0400, Glauber Costa wrote: > On 09/26/2012 11:56 PM, Tejun Heo wrote:
 > > Hello,
 > >
 > > On Wed, Sep 26, 2012 at 11:46:37PM +0400, Glauber Costa wrote:
 > >> Besides not being part of cgroup core, and respecting very much both
 > >> cgroups' and basic sanity properties, kmem is an actual feature that
 > >> some people want, and some people don't. There is no reason to believe
 > >> that applications that want will live in the same environment with ones
 > >> that don't want.
 > >
 > > I don't know.  It definitely is less crazy than .use_hierarchy but I
 > > wouldn't say it's an inherently different thing.  I mean, what does it
 > > even mean to have u+k limit on one subtree and not on another branch?
 > > And we worry about things like what if parent doesn't enable it but
 > > its chlidren do.
 > >
 >
 > It is inherently different. To begin with, it actually contemplates two
 > use cases. It is not a work around.
 >
 > The meaning is also very well defined. The meaning of having this
 > enabled in one subtree and not in other is: Subtree A wants to track
 > kernel memory. Subtree B does not. It's that, and never more than that.
 > There is no maybes and no buts, no magic knobs that makes it behave in a
 > crazy way.
 >
 > If a children enables it but the parent does not, this does what every
 > tree does: enable it from that point downwards.
 >
 > > This is a feature which adds complexity.  If the feature is necessary
 > > and justified, sure.  If not, let's please not and let's err on the
 > > side of conservativeness.  We can always add it later but the other
 > > direction is much harder.
 >
 > I disagree. Having kmem tracking adds complexity. Having to cope with
 > the use case where we turn it on dynamically to cope with the "user page
 > only" use case adds complexity. But I see no significant complexity
 > being added by having it per subtree. Really.
 
 Maybe not in code, but you are adding an extra variable into the
 system.  "One switch per subtree" is more complex than "one switch."
 Yes, the toggle is hidden behind setting the limit, but it's still a
 toggle.  The use_hierarchy complexity comes not from the file that
 enables it, but from the resulting semantics.
 
 kmem accounting is expensive and we definitely want to allow enabling
 it separately from traditional user memory accounting.  But I think
 there is no good reason to not demand an all-or-nothing answer from
 the admin; either he wants kmem tracking on a machine or not.  At
 least you haven't presented a convincing case, IMO.
 
 I don't think there is strong/any demand for per-node toggles, but
 once we add this behavior, people will rely on it and expect kmem
 tracking to stay local and we are stuck with it.  Adding it for the
 reason that people will use it is a self-fulfilling prophecy.
 
 > You have the use_hierarchy fiasco in mind, and I do understand that you
 > are raising the flag and all that.
 >
 > But think in terms of functionality: This thing here is a lot more
 > similar to swap than use_hierarchy. Would you argue that memsw should be
 > per-root ?
 
 We actually do have a per-root flag that controls accounting for swap.
 
 > The reason why it shouldn't: Some people want to limit memory
 > consumption all the way to the swap, some people don't. Same with kmem.
 
 That lies in the nature of the interface: we chose k & u+k rather than
 u & u+k, so our memory.limit_in_bytes will necessarily include kmem,
 while swap is not included there.  But I really doubt that there is a
 strong case for turning on swap accounting intentionally and then
 limiting memory+swap only on certain subtrees.  Where would be the
 sense in that?
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48078 is a reply to message #48076] | Wed, 26 September 2012 22:29   |  
			| 
				
				
					|  Glauber Costa Messages: 916
 Registered: October 2011
 | Senior Member |  |  |  
	| On 09/27/2012 02:10 AM, Tejun Heo wrote: > Hello, Glauber.
 >
 > On Thu, Sep 27, 2012 at 01:24:40AM +0400, Glauber Costa wrote:
 >> "kmem_accounted" is not a switch. It is an internal representation only.
 >> The semantics, that we discussed exhaustively in San Diego, is that a
 >> group that is not limited is not accounted. This is simple and consistent.
 >>
 >> Since the limits are still per-cgroup, you are actually proposing more
 >> user-visible complexity than me, since you are adding yet another file,
 >> with its own semantics.
 >
 > I was confused.  I thought it was exposed as a switch to userland (it
 > being right below .use_hierarchy tripped red alert).
 
 Remember I was the one more vocally and radically so far trying to get
 rid of use_hierarchy. I should have been more clear - and I was, as soon
 as I better understood the nature of your opposition - but this is
 precisely what I meant by "inherently different".
 
 >
 > So, the proposed behavior is to allow enabling kmemcg anytime but
 > ignore what happened inbetween?  Where the knob is changes but the
 > weirdity seems all the same.  What prevents us from having a single
 > switch at root which can only be flipped when there's no children?
 
 So I view this very differently from you. We have no root-only switches
 in memcg. This would be a first, and this is the kind of thing that adds
 complexity, in my view.
 
 You have someone like libvirt or a systemd service using memcg. It
 probably starts at boot. Once it is started, it will pretty much prevent
 switching of any global switch like this.
 
 And then what? If you want a different behavior you need to go kill all
 your services that are using memcg so you can get the behavior you want?
 And if they happen to be making a specific flag choice by design, you
 just say "you really can't run A + B together" ?
 
 I myself think global switches are an unnecessary complication. And let
 us not talk about use_hierarchy, please. If it becomes global, it is
 going to be as part of a phase out plan anyway. The problem with that is
 not that it is global, is that it shouldn't even exist.
 
 >
 > Backward compatibility is covered with single switch and I really
 > don't think "you can enable limits for kernel memory anytime but we
 > don't keep track of whatever happened before it was flipped the first
 > time because the first time is always special" is a sane thing to
 > expose to userland.  Or am I misunderstanding the proposed behavior
 > again?
 >
 
 You do keep track. Before you switch it for the first time, it all
 belongs to the root memcg.
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48079 is a reply to message #48078] | Wed, 26 September 2012 22:42   |  
			| 
				
				
					|  Tejun Heo Messages: 184
 Registered: November 2006
 | Senior Member |  |  |  
	| Hello, Glauber. 
 On Thu, Sep 27, 2012 at 02:29:06AM +0400, Glauber Costa wrote:
 > And then what? If you want a different behavior you need to go kill all
 > your services that are using memcg so you can get the behavior you want?
 > And if they happen to be making a specific flag choice by design, you
 > just say "you really can't run A + B together" ?
 >
 > I myself think global switches are an unnecessary complication. And let
 > us not talk about use_hierarchy, please. If it becomes global, it is
 > going to be as part of a phase out plan anyway. The problem with that is
 > not that it is global, is that it shouldn't even exist.
 
 I would consider it more of a compatibility thing which is set during
 boot and configurable by sysadmin.  Let the newer systems enable it by
 default on boot and old configs / special ones disable it as
 necessary.
 
 > > Backward compatibility is covered with single switch and I really
 > > don't think "you can enable limits for kernel memory anytime but we
 > > don't keep track of whatever happened before it was flipped the first
 > > time because the first time is always special" is a sane thing to
 > > expose to userland.  Or am I misunderstanding the proposed behavior
 > > again?
 >
 > You do keep track. Before you switch it for the first time, it all
 > belongs to the root memcg.
 
 Well, that's really playing with words.  Limit is per cgroup and
 before the limit is set for the first time, everything is accounted to
 something else.  How is that keeping track?
 
 The proposed behavior seems really crazy to me.  Do people really
 think this is a good idea?
 
 Thanks.
 
 --
 tejun
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48080 is a reply to message #48077] | Wed, 26 September 2012 22:45   |  
			| 
				
				
					|  Glauber Costa Messages: 916
 Registered: October 2011
 | Senior Member |  |  |  
	| On 09/27/2012 02:11 AM, Johannes Weiner wrote: > On Thu, Sep 27, 2012 at 12:02:14AM +0400, Glauber Costa wrote:
 >> On 09/26/2012 11:56 PM, Tejun Heo wrote:
 >>> Hello,
 >>>
 >>> On Wed, Sep 26, 2012 at 11:46:37PM +0400, Glauber Costa wrote:
 >>>> Besides not being part of cgroup core, and respecting very much both
 >>>> cgroups' and basic sanity properties, kmem is an actual feature that
 >>>> some people want, and some people don't. There is no reason to believe
 >>>> that applications that want will live in the same environment with ones
 >>>> that don't want.
 >>>
 >>> I don't know.  It definitely is less crazy than .use_hierarchy but I
 >>> wouldn't say it's an inherently different thing.  I mean, what does it
 >>> even mean to have u+k limit on one subtree and not on another branch?
 >>> And we worry about things like what if parent doesn't enable it but
 >>> its chlidren do.
 >>>
 >>
 >> It is inherently different. To begin with, it actually contemplates two
 >> use cases. It is not a work around.
 >>
 >> The meaning is also very well defined. The meaning of having this
 >> enabled in one subtree and not in other is: Subtree A wants to track
 >> kernel memory. Subtree B does not. It's that, and never more than that.
 >> There is no maybes and no buts, no magic knobs that makes it behave in a
 >> crazy way.
 >>
 >> If a children enables it but the parent does not, this does what every
 >> tree does: enable it from that point downwards.
 >>
 >>> This is a feature which adds complexity.  If the feature is necessary
 >>> and justified, sure.  If not, let's please not and let's err on the
 >>> side of conservativeness.  We can always add it later but the other
 >>> direction is much harder.
 >>
 >> I disagree. Having kmem tracking adds complexity. Having to cope with
 >> the use case where we turn it on dynamically to cope with the "user page
 >> only" use case adds complexity. But I see no significant complexity
 >> being added by having it per subtree. Really.
 >
 > Maybe not in code, but you are adding an extra variable into the
 > system.  "One switch per subtree" is more complex than "one switch."
 > Yes, the toggle is hidden behind setting the limit, but it's still a
 > toggle.  The use_hierarchy complexity comes not from the file that
 > enables it, but from the resulting semantics.
 >
 
 I didn't claim the complexity was in the code. I actually think the
 other way around that you do, and claim that a global switch is more
 complex than a per-subtree. All properties we have so far applies to
 subtrees, due to cgroup's hierarchical nature. We have no global
 switches like this so far, and adding one would just add a new concept
 that wasn't here.
 
 
 > kmem accounting is expensive and we definitely want to allow enabling
 > it separately from traditional user memory accounting.  But I think
 > there is no good reason to not demand an all-or-nothing answer from
 > the admin; either he wants kmem tracking on a machine or not.  At
 > least you haven't presented a convincing case, IMO.
 >
 > I don't think there is strong/any demand for per-node toggles, but
 > once we add this behavior, people will rely on it and expect kmem
 > tracking to stay local and we are stuck with it.  Adding it for the
 > reason that people will use it is a self-fulfilling prophecy.
 
 
 I don't think this is a compatibility only switch. Much has been said in
 the past about the problem of sharing. A lot of the kernel objects are
 shared by nature, this is pretty much unavoidable. The answer we have
 been giving to this inquiry, is that the workloads (us) interested
 in kmem accounted tend to be quite local in their file accesses (and
 other kernel objects as well).
 
 It should be obvious that not all workloads are like this, and some of
 them would actually prefer to have their umem limited only.
 
 I really don't think, and correct me if I am wrong, that the problem
 lays in "is there a use case for umem?", but rather, if they should be
 allowed to coexist in a box.
 
 And honestly, it seems to me totally reasonable to avoid restricting
 people to run as many workloads they think they can in the same box.
 
 
 >> You have the use_hierarchy fiasco in mind, and I do understand that you
 >> are raising the flag and all that.
 >>
 >> But think in terms of functionality: This thing here is a lot more
 >> similar to swap than use_hierarchy. Would you argue that memsw should be
 >> per-root ?
 >
 > We actually do have a per-root flag that controls accounting for swap.
 >
 >> The reason why it shouldn't: Some people want to limit memory
 >> consumption all the way to the swap, some people don't. Same with kmem.
 >
 > That lies in the nature of the interface: we chose k & u+k rather than
 > u & u+k, so our memory.limit_in_bytes will necessarily include kmem,
 > while swap is not included there.  But I really doubt that there is a
 > strong case for turning on swap accounting intentionally and then
 > limiting memory+swap only on certain subtrees.  Where would be the
 > sense in that?
 
 It makes absolute sense. Because until I go set
 memory.memsw.limit_in_bytes, my subtree is not limited, which is
 precisely what kmem does.
 
 And the use cases for that are:
 
 1) I, application A, want to use 2G of mem, and I can never swap
 2) I, application B, want to use 2G of mem, but I am fine using extra 1G
 in swap.
 
 There are plenty of workloads in both the "can swap" and "can't swap"
 category around.
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48081 is a reply to message #48079] | Wed, 26 September 2012 22:54   |  
			| 
				
				
					|  Glauber Costa Messages: 916
 Registered: October 2011
 | Senior Member |  |  |  
	| On 09/27/2012 02:42 AM, Tejun Heo wrote: > Hello, Glauber.
 >
 > On Thu, Sep 27, 2012 at 02:29:06AM +0400, Glauber Costa wrote:
 >> And then what? If you want a different behavior you need to go kill all
 >> your services that are using memcg so you can get the behavior you want?
 >> And if they happen to be making a specific flag choice by design, you
 >> just say "you really can't run A + B together" ?
 >>
 >> I myself think global switches are an unnecessary complication. And let
 >> us not talk about use_hierarchy, please. If it becomes global, it is
 >> going to be as part of a phase out plan anyway. The problem with that is
 >> not that it is global, is that it shouldn't even exist.
 >
 > I would consider it more of a compatibility thing which is set during
 > boot and configurable by sysadmin.  Let the newer systems enable it by
 > default on boot and old configs / special ones disable it as
 > necessary.
 >
 
 I don't. Much has been said in the past about the problem of sharing. A
 lot of the kernel objects are shared by nature, this is pretty much
 unavoidable. The answer we have been giving to this inquiry, is that the
 workloads (us) interested in kmem accounted tend to be quite local in
 their file accesses (and other kernel objects as well).
 
 It should be obvious that not all workloads are like this, and some of
 them would actually prefer to have their umem limited only.
 
 There is nothing unreasonable in tracking user memory only.
 
 If we have a global switch for "tracking all kernel memory", who would
 you account the objects that are heavily shared to? I solve this by not
 tracking kernel memory for cgroups in such workloads. What do you propose?
 
 >>> Backward compatibility is covered with single switch and I really
 >>> don't think "you can enable limits for kernel memory anytime but we
 >>> don't keep track of whatever happened before it was flipped the first
 >>> time because the first time is always special" is a sane thing to
 >>> expose to userland.  Or am I misunderstanding the proposed behavior
 >>> again?
 >>
 >> You do keep track. Before you switch it for the first time, it all
 >> belongs to the root memcg.
 >
 > Well, that's really playing with words.  Limit is per cgroup and
 > before the limit is set for the first time, everything is accounted to
 > something else.  How is that keeping track?
 >
 
 Even after the limit is set, it is set only by workloads that want kmem
 to be tracked. If you want to track it during the whole lifetime of the
 cgroup, you switch it before you put tasks to it. What is so crazy about it?
 
 > The proposed behavior seems really crazy to me.  Do people really
 > think this is a good idea?
 >
 
 It is really sad that you lost the opportunity to say that in a room
 full of mm developers that could add to this discussion in real time,
 when after an explanation about this was given, Mel asked if anyone
 would have any objections to this.
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48082 is a reply to message #48081] | Wed, 26 September 2012 23:08   |  
			| 
				
				
					|  Tejun Heo Messages: 184
 Registered: November 2006
 | Senior Member |  |  |  
	| Hello, Glauber. 
 On Thu, Sep 27, 2012 at 02:54:11AM +0400, Glauber Costa wrote:
 > I don't. Much has been said in the past about the problem of sharing. A
 > lot of the kernel objects are shared by nature, this is pretty much
 > unavoidable. The answer we have been giving to this inquiry, is that the
 > workloads (us) interested in kmem accounted tend to be quite local in
 > their file accesses (and other kernel objects as well).
 >
 > It should be obvious that not all workloads are like this, and some of
 > them would actually prefer to have their umem limited only.
 >
 > There is nothing unreasonable in tracking user memory only.
 >
 > If we have a global switch for "tracking all kernel memory", who would
 > you account the objects that are heavily shared to? I solve this by not
 > tracking kernel memory for cgroups in such workloads. What do you propose?
 
 One of the things wrong with that is that it exposes the limitation of
 the current implementation as interface to userland, which is never a
 good idea.  In addition, how is userland supposed to know which
 workload is shared kmem heavy or not?  Details like that are not even
 inherent to workloads.  It's highly dependent on kernel implementation
 which may change any day.  If we hit workloads like that the right
 thing to do is improving kmemcg so that such problems don't occur, not
 exposing another switch.
 
 If we can't make that work in reasonable (doesn't have to be perfect)
 way, we might as well just give up on kmem controller.  If userland
 has to second-guess kernel implementation details to make use of it,
 it's useless.
 
 > > Well, that's really playing with words.  Limit is per cgroup and
 > > before the limit is set for the first time, everything is accounted to
 > > something else.  How is that keeping track?
 > >
 >
 > Even after the limit is set, it is set only by workloads that want kmem
 > to be tracked. If you want to track it during the whole lifetime of the
 > cgroup, you switch it before you put tasks to it. What is so crazy about it?
 
 The fact that the numbers don't really mean what they apparently
 should mean.
 
 > > The proposed behavior seems really crazy to me.  Do people really
 > > think this is a good idea?
 >
 > It is really sad that you lost the opportunity to say that in a room
 > full of mm developers that could add to this discussion in real time,
 > when after an explanation about this was given, Mel asked if anyone
 > would have any objections to this.
 
 Sure, conferences are useful for building consensus but that's the
 extent of it.  Sorry that I didn't realize the implications then but
 conferences don't really add any finality to decisions.
 
 So, this seems properly crazy to me at the similar level of
 use_hierarchy fiasco.  I'm gonna NACK on this.
 
 Thanks.
 
 --
 tejun
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48083 is a reply to message #48082] | Wed, 26 September 2012 23:20   |  
			| 
				
				
					|  Glauber Costa Messages: 916
 Registered: October 2011
 | Senior Member |  |  |  
	| On 09/27/2012 03:08 AM, Tejun Heo wrote: > Hello, Glauber.
 >
 > On Thu, Sep 27, 2012 at 02:54:11AM +0400, Glauber Costa wrote:
 >> I don't. Much has been said in the past about the problem of sharing. A
 >> lot of the kernel objects are shared by nature, this is pretty much
 >> unavoidable. The answer we have been giving to this inquiry, is that the
 >> workloads (us) interested in kmem accounted tend to be quite local in
 >> their file accesses (and other kernel objects as well).
 >>
 >> It should be obvious that not all workloads are like this, and some of
 >> them would actually prefer to have their umem limited only.
 >>
 >> There is nothing unreasonable in tracking user memory only.
 >>
 >> If we have a global switch for "tracking all kernel memory", who would
 >> you account the objects that are heavily shared to? I solve this by not
 >> tracking kernel memory for cgroups in such workloads. What do you propose?
 >
 > One of the things wrong with that is that it exposes the limitation of
 > the current implementation as interface to userland, which is never a
 > good idea.  In addition, how is userland supposed to know which
 > workload is shared kmem heavy or not?  Details like that are not even
 > inherent to workloads.  It's highly dependent on kernel implementation
 > which may change any day.  If we hit workloads like that the right
 > thing to do is improving kmemcg so that such problems don't occur, not
 > exposing another switch.
 >
 
 Sorry, there is nothing implementation dependent in here. One of the
 biggest consumers of all this, are dentries. Dentries are related to
 the paths you touch. If you touch files in a self-contained directory,
 where you don't expect anyone else to touch, this can safely be
 considered local. If you touch files all around, this can safely be
 considered not local. Where is the implementation dependent part?
 
 > If we can't make that work in reasonable (doesn't have to be perfect)
 > way, we might as well just give up on kmem controller.  If userland
 > has to second-guess kernel implementation details to make use of it,
 > it's useless.
 >
 As I said above, it shouldn't.
 
 >>> Well, that's really playing with words.  Limit is per cgroup and
 >>> before the limit is set for the first time, everything is accounted to
 >>> something else.  How is that keeping track?
 >>>
 >>
 >> Even after the limit is set, it is set only by workloads that want kmem
 >> to be tracked. If you want to track it during the whole lifetime of the
 >> cgroup, you switch it before you put tasks to it. What is so crazy about it?
 >
 > The fact that the numbers don't really mean what they apparently
 > should mean.
 >
 
 This is vague. The usage file in the cgroup means how much kernel memory
 was used by that cgroup. If it really bothers you that this may not be
 set through the whole group's lifetime, it is also easily solvable.
 
 > So, this seems properly crazy to me at the similar level of
 > use_hierarchy fiasco.  I'm gonna NACK on this.
 >
 
 As I said: all use cases I particularly care about are covered by a
 global switch.
 
 I am laying down my views because I really believe they make more sense.
 But at some point, of course, I'll shut up if I believe I am a lone voice.
 
 I believe it should still be good to hear from mhocko and kame, but from
 your point of view, would all the rest, plus the introduction of a
 global switch make it acceptable to you?
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48084 is a reply to message #48083] | Wed, 26 September 2012 23:33   |  
			| 
				
				
					|  Tejun Heo Messages: 184
 Registered: November 2006
 | Senior Member |  |  |  
	| Hello, 
 On Thu, Sep 27, 2012 at 03:20:27AM +0400, Glauber Costa wrote:
 > > One of the things wrong with that is that it exposes the limitation of
 > > the current implementation as interface to userland, which is never a
 > > good idea.  In addition, how is userland supposed to know which
 > > workload is shared kmem heavy or not?  Details like that are not even
 > > inherent to workloads.  It's highly dependent on kernel implementation
 > > which may change any day.  If we hit workloads like that the right
 > > thing to do is improving kmemcg so that such problems don't occur, not
 > > exposing another switch.
 >
 > Sorry, there is nothing implementation dependent in here. One of the
 > biggest consumers of all this, are dentries. Dentries are related to
 > the paths you touch. If you touch files in a self-contained directory,
 > where you don't expect anyone else to touch, this can safely be
 > considered local. If you touch files all around, this can safely be
 > considered not local. Where is the implementation dependent part?
 
 For things like dentries and inodes and if that really matters, we
 should be able to account for the usage better, no?  And frankly I'm
 not even sold on that usecase.  Unless there's a way to detect and
 inform about these, userland isn't gonna know that they're doing
 something which consumes a lot of shared memory even that activity is
 filesystem walking.  You're still asking userland to tune something
 depending on parameters not easily visible from userland.  It's a lose
 lose situation.
 
 > >> Even after the limit is set, it is set only by workloads that want kmem
 > >> to be tracked. If you want to track it during the whole lifetime of the
 > >> cgroup, you switch it before you put tasks to it. What is so crazy about it?
 > >
 > > The fact that the numbers don't really mean what they apparently
 > > should mean.
 > >
 >
 > This is vague. The usage file in the cgroup means how much kernel memory
 > was used by that cgroup. If it really bothers you that this may not be
 > set through the whole group's lifetime, it is also easily solvable.
 
 Yes, easily by having a global switch which can be manipulated when
 there's no children.  It really seems like a no brainer to me.
 
 > > So, this seems properly crazy to me at the similar level of
 > > use_hierarchy fiasco.  I'm gonna NACK on this.
 >
 > As I said: all use cases I particularly care about are covered by a
 > global switch.
 >
 > I am laying down my views because I really believe they make more sense.
 > But at some point, of course, I'll shut up if I believe I am a lone voice.
 >
 > I believe it should still be good to hear from mhocko and kame, but from
 > your point of view, would all the rest, plus the introduction of a
 > global switch make it acceptable to you?
 
 The only thing I'm whining about is per-node switch + silently
 ignoring past accounting, so if those two are solved, I think I'm
 pretty happy with the rest.
 
 Thanks.
 
 --
 tejun
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48091 is a reply to message #48075] | Thu, 27 September 2012 12:08   |  
			| 
				
				
					|  Michal Hocko Messages: 109
 Registered: December 2011
 | Senior Member |  |  |  
	| On Thu 27-09-12 01:24:40, Glauber Costa wrote: [...]
 > About use cases, I've already responded: my containers use case is kmem
 > limited. There are people like Michal that specifically asked for
 > user-only semantics to be preserved.
 
 Yes, because we have many users (basically almost all) who care only
 about the user memory because that's what occupies the vast majority of
 the memory. They usually want to isolate workload which would disrupt
 the global memory otherwise (e.g. backup process vs. database). You
 really do not want to pay an additional overhead for kmem accounting
 here.
 
 > So your question for global vs local switch (that again, doesn't
 > exist; only a local *limit* exists) should really be posed in the
 > following way:  "Can two different use cases with different needs be
 > hosted in the same box?"
 
 I think this is a good and a relevant question. I think this boils down
 to whether you want to have trusted and untrusted workloads at the same
 machine.
 Trusted loads usually only need user memory accounting because kmem
 consumption should be really negligible (unless kernel is doing
 something really stupid and no kmem limit will help here).
 On the other hand, untrusted workloads can do nasty things that
 administrator has hard time to mitigate and setting a kmem limit can
 help significantly.
 
 IMHO such a different loads exist on a single machine quite often (Web
 server and a back up process as the most simplistic one). The per
 hierarchy accounting, therefore, sounds like a good idea without too
 much added complexity (actually the only added complexity is in the
 proper kmem.limit_in_bytes handling which is a single place).
 
 So I would rather go with per-hierarchy thing.
 
 > > Michal, Johannes, Kamezawa, what are your thoughts?
 > >
 > waiting! =)
 
 Well, you guys generated a lot of discussion that one has to read
 through, didn't you :P
 
 --
 Michal Hocko
 SUSE Labs
 |  
	|  |  |  
	|  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48094 is a reply to message #48084] | Thu, 27 September 2012 12:15   |  
			| 
				
				
					|  Michal Hocko Messages: 109
 Registered: December 2011
 | Senior Member |  |  |  
	| On Wed 26-09-12 16:33:34, Tejun Heo wrote: [...]
 > > > So, this seems properly crazy to me at the similar level of
 > > > use_hierarchy fiasco.  I'm gonna NACK on this.
 > >
 > > As I said: all use cases I particularly care about are covered by a
 > > global switch.
 > >
 > > I am laying down my views because I really believe they make more sense.
 > > But at some point, of course, I'll shut up if I believe I am a lone voice.
 > >
 > > I believe it should still be good to hear from mhocko and kame, but from
 > > your point of view, would all the rest, plus the introduction of a
 > > global switch make it acceptable to you?
 >
 > The only thing I'm whining about is per-node switch + silently
 > ignoring past accounting, so if those two are solved, I think I'm
 > pretty happy with the rest.
 
 I think that per-group "switch" is not nice as well but if we make it
 hierarchy specific (which I am proposing for quite some time) and do not
 let enable accounting for a group with tasks then we get both
 flexibility and reasonable semantic. A global switch sounds too coars to
 me and it really not necessary.
 
 Would this work with you?
 --
 Michal Hocko
 SUSE Labs
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48096 is a reply to message #48094] | Thu, 27 September 2012 12:20   |  
			| 
				
				
					|  Glauber Costa Messages: 916
 Registered: October 2011
 | Senior Member |  |  |  
	| On 09/27/2012 04:15 PM, Michal Hocko wrote: > On Wed 26-09-12 16:33:34, Tejun Heo wrote:
 > [...]
 >>>> So, this seems properly crazy to me at the similar level of
 >>>> use_hierarchy fiasco.  I'm gonna NACK on this.
 >>>
 >>> As I said: all use cases I particularly care about are covered by a
 >>> global switch.
 >>>
 >>> I am laying down my views because I really believe they make more sense.
 >>> But at some point, of course, I'll shut up if I believe I am a lone voice.
 >>>
 >>> I believe it should still be good to hear from mhocko and kame, but from
 >>> your point of view, would all the rest, plus the introduction of a
 >>> global switch make it acceptable to you?
 >>
 >> The only thing I'm whining about is per-node switch + silently
 >> ignoring past accounting, so if those two are solved, I think I'm
 >> pretty happy with the rest.
 >
 > I think that per-group "switch" is not nice as well but if we make it
 > hierarchy specific (which I am proposing for quite some time) and do not
 > let enable accounting for a group with tasks then we get both
 > flexibility and reasonable semantic. A global switch sounds too coars to
 > me and it really not necessary.
 >
 > Would this work with you?
 >
 
 How exactly would that work? AFAIK, we have a single memcg root, we
 can't have multiple memcg hierarchies in a system. Am I missing something?
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48097 is a reply to message #48096] | Thu, 27 September 2012 12:40   |  
			| 
				
				
					|  Michal Hocko Messages: 109
 Registered: December 2011
 | Senior Member |  |  |  
	| On Thu 27-09-12 16:20:55, Glauber Costa wrote: > On 09/27/2012 04:15 PM, Michal Hocko wrote:
 > > On Wed 26-09-12 16:33:34, Tejun Heo wrote:
 > > [...]
 > >>>> So, this seems properly crazy to me at the similar level of
 > >>>> use_hierarchy fiasco.  I'm gonna NACK on this.
 > >>>
 > >>> As I said: all use cases I particularly care about are covered by a
 > >>> global switch.
 > >>>
 > >>> I am laying down my views because I really believe they make more sense.
 > >>> But at some point, of course, I'll shut up if I believe I am a lone voice.
 > >>>
 > >>> I believe it should still be good to hear from mhocko and kame, but from
 > >>> your point of view, would all the rest, plus the introduction of a
 > >>> global switch make it acceptable to you?
 > >>
 > >> The only thing I'm whining about is per-node switch + silently
 > >> ignoring past accounting, so if those two are solved, I think I'm
 > >> pretty happy with the rest.
 > >
 > > I think that per-group "switch" is not nice as well but if we make it
 > > hierarchy specific (which I am proposing for quite some time) and do not
 > > let enable accounting for a group with tasks then we get both
 > > flexibility and reasonable semantic. A global switch sounds too coars to
 > > me and it really not necessary.
 > >
 > > Would this work with you?
 > >
 >
 > How exactly would that work? AFAIK, we have a single memcg root, we
 > can't have multiple memcg hierarchies in a system. Am I missing something?
 
 Well root is so different that we could consider the first level as the
 real roots for hierarchies.
 --
 Michal Hocko
 SUSE Labs
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48098 is a reply to message #48097] | Thu, 27 September 2012 12:40   |  
			| 
				
				
					|  Glauber Costa Messages: 916
 Registered: October 2011
 | Senior Member |  |  |  
	| On 09/27/2012 04:40 PM, Michal Hocko wrote: > On Thu 27-09-12 16:20:55, Glauber Costa wrote:
 >> On 09/27/2012 04:15 PM, Michal Hocko wrote:
 >>> On Wed 26-09-12 16:33:34, Tejun Heo wrote:
 >>> [...]
 >>>>>> So, this seems properly crazy to me at the similar level of
 >>>>>> use_hierarchy fiasco.  I'm gonna NACK on this.
 >>>>>
 >>>>> As I said: all use cases I particularly care about are covered by a
 >>>>> global switch.
 >>>>>
 >>>>> I am laying down my views because I really believe they make more sense.
 >>>>> But at some point, of course, I'll shut up if I believe I am a lone voice.
 >>>>>
 >>>>> I believe it should still be good to hear from mhocko and kame, but from
 >>>>> your point of view, would all the rest, plus the introduction of a
 >>>>> global switch make it acceptable to you?
 >>>>
 >>>> The only thing I'm whining about is per-node switch + silently
 >>>> ignoring past accounting, so if those two are solved, I think I'm
 >>>> pretty happy with the rest.
 >>>
 >>> I think that per-group "switch" is not nice as well but if we make it
 >>> hierarchy specific (which I am proposing for quite some time) and do not
 >>> let enable accounting for a group with tasks then we get both
 >>> flexibility and reasonable semantic. A global switch sounds too coars to
 >>> me and it really not necessary.
 >>>
 >>> Would this work with you?
 >>>
 >>
 >> How exactly would that work? AFAIK, we have a single memcg root, we
 >> can't have multiple memcg hierarchies in a system. Am I missing something?
 >
 > Well root is so different that we could consider the first level as the
 > real roots for hierarchies.
 >
 So let's favor clarity: What you are proposing is that the first level
 can have a switch for that, and the first level only. Is that right ?
 
 At first, I just want to understand what exactly is your proposal. This
 is not an endorsement of lack thereof.
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48099 is a reply to message #48098] | Thu, 27 September 2012 12:54   |  
			| 
				
				
					|  Michal Hocko Messages: 109
 Registered: December 2011
 | Senior Member |  |  |  
	| On Thu 27-09-12 16:40:03, Glauber Costa wrote: > On 09/27/2012 04:40 PM, Michal Hocko wrote:
 > > On Thu 27-09-12 16:20:55, Glauber Costa wrote:
 > >> On 09/27/2012 04:15 PM, Michal Hocko wrote:
 > >>> On Wed 26-09-12 16:33:34, Tejun Heo wrote:
 > >>> [...]
 > >>>>>> So, this seems properly crazy to me at the similar level of
 > >>>>>> use_hierarchy fiasco.  I'm gonna NACK on this.
 > >>>>>
 > >>>>> As I said: all use cases I particularly care about are covered by a
 > >>>>> global switch.
 > >>>>>
 > >>>>> I am laying down my views because I really believe they make more sense.
 > >>>>> But at some point, of course, I'll shut up if I believe I am a lone voice.
 > >>>>>
 > >>>>> I believe it should still be good to hear from mhocko and kame, but from
 > >>>>> your point of view, would all the rest, plus the introduction of a
 > >>>>> global switch make it acceptable to you?
 > >>>>
 > >>>> The only thing I'm whining about is per-node switch + silently
 > >>>> ignoring past accounting, so if those two are solved, I think I'm
 > >>>> pretty happy with the rest.
 > >>>
 > >>> I think that per-group "switch" is not nice as well but if we make it
 > >>> hierarchy specific (which I am proposing for quite some time) and do not
 > >>> let enable accounting for a group with tasks then we get both
 > >>> flexibility and reasonable semantic. A global switch sounds too coars to
 > >>> me and it really not necessary.
 > >>>
 > >>> Would this work with you?
 > >>>
 > >>
 > >> How exactly would that work? AFAIK, we have a single memcg root, we
 > >> can't have multiple memcg hierarchies in a system. Am I missing something?
 > >
 > > Well root is so different that we could consider the first level as the
 > > real roots for hierarchies.
 > >
 > So let's favor clarity: What you are proposing is that the first level
 > can have a switch for that, and the first level only. Is that right ?
 
 I do not want any more switches. I am fine with your "set the limit and
 start accounting apprach" and then inherit the _internal_ flag down the
 hierarchy.
 If you are in a child and want to set the limit then you can do that
 only if your parent is accounted already (so that you can have your own
 limit). We will need the same thing for oom_controll and swappinness.
 --
 Michal Hocko
 SUSE Labs
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48108 is a reply to message #48082] | Thu, 27 September 2012 14:28   |  
			| 
				
				
					|  Mel Gorman Messages: 8
 Registered: August 2012
 | Junior Member |  |  |  
	| On Wed, Sep 26, 2012 at 04:08:07PM -0700, Tejun Heo wrote: > Hello, Glauber.
 >
 > On Thu, Sep 27, 2012 at 02:54:11AM +0400, Glauber Costa wrote:
 > > I don't. Much has been said in the past about the problem of sharing. A
 > > lot of the kernel objects are shared by nature, this is pretty much
 > > unavoidable. The answer we have been giving to this inquiry, is that the
 > > workloads (us) interested in kmem accounted tend to be quite local in
 > > their file accesses (and other kernel objects as well).
 > >
 > > It should be obvious that not all workloads are like this, and some of
 > > them would actually prefer to have their umem limited only.
 > >
 > > There is nothing unreasonable in tracking user memory only.
 > >
 > > If we have a global switch for "tracking all kernel memory", who would
 > > you account the objects that are heavily shared to? I solve this by not
 > > tracking kernel memory for cgroups in such workloads. What do you propose?
 >
 > One of the things wrong with that is that it exposes the limitation of
 > the current implementation as interface to userland, which is never a
 > good idea.
 
 I think the limitations have been fairly clearly explained and any admin
 using the interface is going to have *some* familiarity with the limitations.
 
 > In addition, how is userland supposed to know which
 > workload is shared kmem heavy or not?
 
 By using a bit of common sense.
 
 An application may not be able to figure this out but the administrator
 is going to be able to make a very educated guess. If processes running
 within two containers are not sharing a filesystem hierarchy for example
 then it'll be clear they are not sharing dentries.
 
 If there was a suspicion they were then it could be analysed with
 something like SystemTap probing when files are opened and see if files
 are being opened that are shared between containers.
 
 It's not super-easy but it's not impossible either and I fail to see why
 it's such a big deal for you.
 
 >Details like that are not even
 > inherent to workloads.  It's highly dependent on kernel implementation
 > which may change any day.  If we hit workloads like that the right
 > thing to do is improving kmemcg so that such problems don't occur, not
 > exposing another switch.
 >
 > If we can't make that work in reasonable (doesn't have to be perfect)
 > way, we might as well just give up on kmem controller.  If userland
 > has to second-guess kernel implementation details to make use of it,
 > it's useless.
 >
 > > > Well, that's really playing with words.  Limit is per cgroup and
 > > > before the limit is set for the first time, everything is accounted to
 > > > something else.  How is that keeping track?
 > > >
 > >
 > > Even after the limit is set, it is set only by workloads that want kmem
 > > to be tracked. If you want to track it during the whole lifetime of the
 > > cgroup, you switch it before you put tasks to it. What is so crazy about it?
 >
 > The fact that the numbers don't really mean what they apparently
 > should mean.
 >
 
 I think it is a reasonable limitation that only some kernel allocations are
 accounted for although I'll freely admit I'm not a cgroup or memcg user
 either.
 
 My understanding is that this comes down to cost -- accounting for the
 kernel memory usage is expensive so it is limited only to the allocations
 that are easy to abuse by an unprivileged process. Hence this is
 initially concerned with stack pages with dentries and TCP usage to
 follow in later patches.
 
 Further I would expect that an administrator would be aware of these
 limitations and set kmem_accounting at cgroup creation time before any
 processes start. Maybe that should be enforced but it's not a
 fundamental problem.
 
 Due to the cost of accounting, I can see why it would be desirable to
 enable kmem_accounting for some cgroup trees and not others. It is not
 unreasonable to expect that an administrator might want to account within
 one cgroup where processes are accessing millions of files without
 impairing the performance of another cgroup that is mostly using
 anonymous memory.
 
 > > > The proposed behavior seems really crazy to me.  Do people really
 > > > think this is a good idea?
 > >
 > > It is really sad that you lost the opportunity to say that in a room
 > > full of mm developers that could add to this discussion in real time,
 > > when after an explanation about this was given, Mel asked if anyone
 > > would have any objections to this.
 >
 > Sure, conferences are useful for building consensus but that's the
 > extent of it.  Sorry that I didn't realize the implications then but
 > conferences don't really add any finality to decisions.
 >
 > So, this seems properly crazy to me at the similar level of
 > use_hierarchy fiasco.  I'm gonna NACK on this.
 >
 
 I think you're over-reacting to say the very least :|
 
 --
 Mel Gorman
 SUSE Labs
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48109 is a reply to message #48091] | Thu, 27 September 2012 14:33   |  
			| 
				
				
					|  Tejun Heo Messages: 184
 Registered: November 2006
 | Senior Member |  |  |  
	| Hello, Michal. 
 On Thu, Sep 27, 2012 at 02:08:06PM +0200, Michal Hocko wrote:
 > Yes, because we have many users (basically almost all) who care only
 > about the user memory because that's what occupies the vast majority of
 > the memory. They usually want to isolate workload which would disrupt
 > the global memory otherwise (e.g. backup process vs. database). You
 > really do not want to pay an additional overhead for kmem accounting
 > here.
 
 I'm not too convinced.  First of all, the overhead added by kmemcg
 isn't big.  The hot path overhead is quite minimal - it doesn't do
 much more than indirecting one more time.  In terms of memory usage,
 it sure could lead to a bit more fragmentation but even if it gets to
 several megs per cgroup, I don't think that's something excessive.
 So, there is overhead but I don't believe it to be prohibitive.
 
 > > So your question for global vs local switch (that again, doesn't
 > > exist; only a local *limit* exists) should really be posed in the
 > > following way:  "Can two different use cases with different needs be
 > > hosted in the same box?"
 >
 > I think this is a good and a relevant question. I think this boils down
 > to whether you want to have trusted and untrusted workloads at the same
 > machine.
 > Trusted loads usually only need user memory accounting because kmem
 > consumption should be really negligible (unless kernel is doing
 > something really stupid and no kmem limit will help here).
 > On the other hand, untrusted workloads can do nasty things that
 > administrator has hard time to mitigate and setting a kmem limit can
 > help significantly.
 >
 > IMHO such a different loads exist on a single machine quite often (Web
 > server and a back up process as the most simplistic one). The per
 > hierarchy accounting, therefore, sounds like a good idea without too
 > much added complexity (actually the only added complexity is in the
 > proper kmem.limit_in_bytes handling which is a single place).
 
 The distinction between "trusted" and "untrusted" is something
 artificially created due to the assumed deficiency of kmemcg
 implementation.  Making things like this visible to userland is a bad
 idea because it locks us into a place where we can't or don't need to
 improve the said deficiencies and end up pushing the difficult
 problems to somewhere else where it will likely be implemented in a
 shabbier way.  There sure are cases when such approach simply cannot
 be avoided, but I really don't think that's the case here - the
 overhead already seems to be at an acceptable level and we're not
 taking away the escape switch.
 
 This is userland visible API.  We better err on the side of being
 conservative than going overboard with flexibility.  Even if we
 eventually need to make this switching fullly hierarchical, we really
 should be doing,
 
 1. Implement simple global switching and look for problem cases.
 
 2. Analyze them and see whether the problem case can't be solved in a
 better, more intelligent way.
 
 3. If the problem is something structurally inherent or reasonably too
 difficult to solve any other way, consider dumping the problem as
 config parameters to userland.
 
 We can always expand the flexibility.  Let's do the simple thing
 first.  As an added bonus, it would enable using static_keys for
 accounting branches too.
 
 Thanks.
 
 --
 tejun
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48110 is a reply to message #48109] | Thu, 27 September 2012 14:43   |  
			| 
				
				
					|  Mel Gorman Messages: 8
 Registered: August 2012
 | Junior Member |  |  |  
	| On Thu, Sep 27, 2012 at 07:33:00AM -0700, Tejun Heo wrote: > Hello, Michal.
 >
 > On Thu, Sep 27, 2012 at 02:08:06PM +0200, Michal Hocko wrote:
 > > Yes, because we have many users (basically almost all) who care only
 > > about the user memory because that's what occupies the vast majority of
 > > the memory. They usually want to isolate workload which would disrupt
 > > the global memory otherwise (e.g. backup process vs. database). You
 > > really do not want to pay an additional overhead for kmem accounting
 > > here.
 >
 > I'm not too convinced.  First of all, the overhead added by kmemcg
 > isn't big.
 
 Really?
 
 If kmemcg was globally accounted then every __GFP_KMEMCG allocation in
 the page allocator potentially ends up down in
 __memcg_kmem_newpage_charge which
 
 1. takes RCU read lock
 2. looks up cgroup from task
 3. takes a reference count
 4. memcg_charge_kmem -> __mem_cgroup_try_charge
 5. release reference count
 
 That's a *LOT* of work to incur for cgroups that do not care about kernel
 accounting. This is why I thought it was reasonable that the kmem accounting
 not be global.
 
 --
 Mel Gorman
 SUSE Labs
 |  
	|  |  |  
	| 
		
			| Re: [PATCH v3 04/13] kmem accounting basic infrastructure [message #48111 is a reply to message #48108] | Thu, 27 September 2012 14:49   |  
			| 
				
				
					|  Tejun Heo Messages: 184
 Registered: November 2006
 | Senior Member |  |  |  
	| Hello, Mel. 
 On Thu, Sep 27, 2012 at 03:28:22PM +0100, Mel Gorman wrote:
 > > In addition, how is userland supposed to know which
 > > workload is shared kmem heavy or not?
 >
 > By using a bit of common sense.
 >
 > An application may not be able to figure this out but the administrator
 > is going to be able to make a very educated guess. If processes running
 > within two containers are not sharing a filesystem hierarchy for example
 > then it'll be clear they are not sharing dentries.
 >
 > If there was a suspicion they were then it could be analysed with
 > something like SystemTap probing when files are opened and see if files
 > are being opened that are shared between containers.
 >
 > It's not super-easy but it's not impossible either and I fail to see why
 > it's such a big deal for you.
 
 Because we're not even trying to actually solve the problem but just
 dumping it to userland.  If dentry/inode usage is the only case we're
 being worried about, there can be better ways to solve it or at least
 we should strive for that.
 
 Also, the problem is not that it is impossible if you know and
 carefully plan for things beforehand (that would be one extremely
 competent admin) but that the problem is undiscoverable.  With kmemcg
 accounting disabled, there's no way to tell a looking cgroup the admin
 thinks running something which doesn'ft tax kmem much could be
 generating a ton without the admin ever noticing.
 
 > > The fact that the numbers don't really mean what they apparently
 > > should mean.
 >
 > I think it is a reasonable limitation that only some kernel allocations are
 > accounted for although I'll freely admit I'm not a cgroup or memcg user
 > either.
 >
 > My understanding is that this comes down to cost -- accounting for the
 > kernel memory usage is expensive so it is limited only to the allocations
 > that are easy to abuse by an unprivileged process. Hence this is
 > initially concerned with stack pages with dentries and TCP usage to
 > follow in later patches.
 
 I think the cost isn't too prohibitive considering it's already using
 memcg.  Charging / uncharging happens only as pages enter and leave
 slab caches and the hot path overhead is essentially single
 indirection.  Glauber's benchmark seemed pretty reasonable to me and I
 don't yet think that warrants exposing this subtle tree of
 configuration.
 
 > > Sure, conferences are useful for building consensus but that's the
 > > extent of it.  Sorry that I didn't realize the implications then but
 > > conferences don't really add any finality to decisions.
 > >
 > > So, this seems properly crazy to me at the similar level of
 > > use_hierarchy fiasco.  I'm gonna NACK on this.
 >
 > I think you're over-reacting to say the very least :|
 
 The part I nacked is enabling kmemcg on a populated cgroup and then
 starting accounting from then without any apparent indication that any
 past allocation hasn't been considered.  You end up with numbers which
 nobody can't tell what they really mean and there's no mechanism to
 guarantee any kind of ordering between populating the cgroup and
 configuring it and there's *no* way to find out what happened
 afterwards neither.  This is properly crazy and definitely deserves a
 nack.
 
 Thanks.
 
 --
 tejun
 |  
	|  |  |  
	|  | 
 
 
 Current Time: Sat Oct 25 12:34:14 GMT 2025 
 Total time taken to generate the page: 0.08782 seconds |