Re: [RFC 3/4] memcg: background reclaim [message #30511 is a reply to message #30478]
Tue, 27 May 2008 17:08
Balbir Singh
KAMEZAWA Hiroyuki wrote:
> Do background reclaim based on high-low watermarks.
> 
> This feature helps processes under a memcg run smoothly by reclaiming memory
> in a kernel thread. The number of limit failures at mem_cgroup_charge() will be
> dramatically reduced. But this also means a CPU is continuously used for
> reclaiming memory.
> 
> This one is very simple. Anyway, we need to update this when we add new
> complexity to memcg.
> 
> Major logic:
>   - add high-low watermark support to the memory resource controller.
>   - create a kernel thread for the cgroup when the high watermark is changed (once).
>   - stop the kernel thread at rmdir().
>   - start background reclaim if the res_counter is over the high watermark.
>   - stop background reclaim if the res_counter is below the low watermark.
>   - for reclaiming, just call try_to_free_mem_cgroup_pages().
>   - the reclaim kthread's priority is nice(0); the kthread default is (-5).
>     (is weaker better?)
>   - the reclaim kthread calls yield() on each loop iteration.
> 
> TODO:
>   - add an interface to start/stop the daemon?
>   - smarter NUMA support.
>   - a too-small "low watermark" target just consumes CPU. Should we warn?
>     And what are the best values for hwmark/lwmark in general?
> 
> Changelog: old one -> this (v1)
>   - start the kernel thread on a write to hwmark.
>   
> 
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> ---
>  mm/memcontrol.c |  147 +++++++++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 130 insertions(+), 17 deletions(-)
> 
> Index: mm-2.6.26-rc2-mm1/mm/memcontrol.c
> ===================================================================
> --- mm-2.6.26-rc2-mm1.orig/mm/memcontrol.c
> +++ mm-2.6.26-rc2-mm1/mm/memcontrol.c
> @@ -32,7 +32,8 @@
>  #include <linux/fs.h>
>  #include <linux/seq_file.h>
>  #include <linux/vmalloc.h>
> -
> +#include <linux/freezer.h>
> +#include <linux/kthread.h>
>  #include <asm/uaccess.h>
> 
>  struct cgroup_subsys mem_cgroup_subsys __read_mostly;
> @@ -119,10 +120,6 @@ struct mem_cgroup_lru_info {
>   * statistics based on the statistics developed by Rik Van Riel for clock-pro,
>   * to help the administrator determine what knobs to tune.
>   *
> - * TODO: Add a water mark for the memory controller. Reclaim will begin when
> - * we hit the water mark. May be even add a low water mark, such that
> - * no reclaim occurs from a cgroup at it's low water mark, this is
> - * a feature that will be implemented much later in the future.
>   */
>  struct mem_cgroup {
>  	struct cgroup_subsys_state css;
> @@ -131,6 +128,13 @@ struct mem_cgroup {
>  	 */
>  	struct res_counter res;
>  	/*
> +	 * background reclaim.
> +	 */
> +	struct {
> +		wait_queue_head_t waitq;
> +		struct task_struct *thread;
> +	} daemon;

Comments on each of the members would be nice.
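Something along these lines, perhaps (an untested sketch; the comments are my reading of the patch):

	struct {
		/* the daemon sleeps here while usage stays below the low watermark */
		wait_queue_head_t waitq;
		/* per-cgroup reclaim kthread; NULL until the high watermark is first set */
		struct task_struct *thread;
	} daemon;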

> +	/*
>  	 * Per cgroup active and inactive list, similar to the
>  	 * per zone LRU lists.
>  	 */
> @@ -143,6 +147,7 @@ struct mem_cgroup {
>  	struct mem_cgroup_stat stat;
>  };
>  static struct mem_cgroup init_mem_cgroup;
> +static DEFINE_MUTEX(memcont_daemon_lock);
> 
>  /*
>   * We use the lower bit of the page->page_cgroup pointer as a bit spin
> @@ -374,6 +379,15 @@ void mem_cgroup_move_lists(struct page *
>  	unlock_page_cgroup(page);
>  }
> 
> +static void mem_cgroup_schedule_reclaim(struct mem_cgroup *mem)
> +{
> +	if (!mem->daemon.thread)
> +		return;


I see we are using dedicated kernel threads here. Aren't workqueues better
suited than per-cgroup threads?
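For illustration, a workqueue-based variant might look like this (a rough,
untested sketch; mem_cgroup_reclaim_func and the reclaim_work member are
made-up names, not part of this patch):

static void mem_cgroup_reclaim_func(struct work_struct *work)
{
	struct mem_cgroup *mem = container_of(work, struct mem_cgroup,
					      reclaim_work);

	/* reclaim until usage drops back below the low watermark */
	while (res_counter_state(&mem->res) != RES_BELOW_LOW)
		if (!try_to_free_mem_cgroup_pages(mem, GFP_HIGHUSER_MOVABLE))
			break;
}

	/* at setup: INIT_WORK(&mem->reclaim_work, mem_cgroup_reclaim_func); */
	/* on a charge over the high watermark, instead of waking a kthread: */
	schedule_work(&mem->reclaim_work);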

> +	if (!waitqueue_active(&mem->daemon.waitq))
> +		return;
> +	wake_up_interruptible(&mem->daemon.waitq);
> +}
> +
>  /*
>   * Calculate mapped_ratio under memory controller. This will be used in
>   * vmscan.c for deteremining we have to reclaim mapped pages.
> @@ -532,6 +546,7 @@ static int mem_cgroup_charge_common(stru
>  	unsigned long flags;
>  	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
>  	struct mem_cgroup_per_zone *mz;
> +	enum res_state state;
> 
>  	if (mem_cgroup_subsys.disabled)
>  		return 0;
> @@ -558,23 +573,23 @@ static int mem_cgroup_charge_common(stru
>  		mem = memcg;
>  		css_get(&memcg->css);
>  	}
> -
> -	while (res_counter_charge(&mem->res, PAGE_SIZE) == RES_OVER_LIMIT) {
> +retry:
> +	state = res_counter_charge(&mem->res, PAGE_SIZE);
> +	if (state == RES_OVER_LIMIT) {
>  		if (!(gfp_mask & __GFP_WAIT))
>  			goto out;
> -
>  		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
> -			continue;
> -
> +			goto retry;
>  		/*
> -		 * try_to_free_mem_cgroup_pages() might not give us a full
> -		 * picture of reclaim. Some pages are reclaimed and might be
> -		 * moved to swap cache or just unmapped from the cgroup.
> -		 * Check the limit again to see if the reclaim reduced the
> -		 * current usage of the cgroup before giving up
> +		 * try_to_free_mem_cgroup_pages() might not give us a
> +		 * full picture of reclaim. Some pages are reclaimed
> +		 * and might be moved to swap cache or just unmapped
> +		 * from the cgroup. Check the limit again to see if
> +		 * the reclaim reduced the current usage of the cgroup
> +		 * before giving up
>  		 */
>  		if (res_counter_check_under_limit(&mem->res))
> -			continue;
> +			goto retry;
> 
>  		if (!nr_retries--) {
>  			mem_cgroup_out_of_memory(mem, gfp_mask);
> @@ -609,6 +624,9 @@ static int mem_cgroup_charge_common(stru
>  	spin_unlock_irqrestore(&mz->lru_lock, flags);
> 
>  	unlock_page_cgroup(page);
> +
> +	if (state > RES_BELOW_HIGH)
> +		mem_cgroup_schedule_reclaim(mem);

I really don't like state > RES_BELOW_HIGH sort of checks. Could we please
abstract them into functions, like

if (mem_res_above_high_watermark(state))
	....
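The helper itself could be trivial (untested sketch, using the name suggested
above):

static inline int mem_res_above_high_watermark(enum res_state state)
{
	return state > RES_BELOW_HIGH;
}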

>  done:
>  	return 0;
>  out:
> @@ -891,6 +909,74 @@ out:
>  	css_put(&mem->css);
>  	return ret;
>  }
> +/*
> + * background reclaim daemon.
> + */
> +static int mem_cgroup_reclaim_daemon(void *data)
> +{
> +	DEFINE_WAIT(wait);
> +	struct mem_cgroup *mem = data;
> +	int ret;
> +
> +	css_get(&mem->css);
> +	current->flags |= PF_SWAPWRITE;
> +	/* we don't want to use cpu too much. */
> + 	set_user_nice(current, 0);

Shouldn't this (0) be a #define? What if we would like to degrade our nice value
even further later?
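Something like this, say (MEMCG_RECLAIMD_NICE is a made-up name):

#define MEMCG_RECLAIMD_NICE	0	/* weaker than the -5 kthread default */
...
	set_user_nice(current, MEMCG_RECLAIMD_NICE);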

> +	set_freezable();
> +
> +	while (!kthread_should_stop()) {
> +		prepare_to_wait(&mem->daemon.waitq, &wait, TASK_INTERRUPTIBLE);
> +		if (res_counter_state(&mem->res) == RES_BELOW_LOW) {
> +			if (!kthread_should_stop()) {
> +				schedule();
> +				try_to_freeze();

I am afraid I fail to understand the code above.

> +			}
> +			finish_wait(&mem->daemon.waitq, &wait);
> +			continue;
> +		}
> +		finish_wait(&mem->daemon.waitq, &wait);
> +		/*
> +		 * memory resource controller doesn't see NUMA memory usage
> +		 * balancing, becasue we cannot know what balancing is good.

			      ^^^^^^^^ typo

> +		 * TODO: some annotation or heuristics to detect which node
> +		 * we should start reclaim from.
> +		 */
> +		ret = try_to_free_mem_cgroup_pages(mem, GFP_HIGHUSER_MOVABLE);
> +
> +		yield();

Why do we yield() here? Shouldn't our waitqueue handle the waiting?
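If the watermark check is meant to do the throttling, the loop could rely on
the waitqueue alone; a rough, untested sketch with the same identifiers as the
patch:

	while (!kthread_should_stop()) {
		prepare_to_wait(&mem->daemon.waitq, &wait, TASK_INTERRUPTIBLE);
		if (res_counter_state(&mem->res) == RES_BELOW_LOW) {
			if (!kthread_should_stop()) {
				schedule();	/* sleep until a charge crosses hwmark */
				try_to_freeze();
			}
			finish_wait(&mem->daemon.waitq, &wait);
			continue;
		}
		finish_wait(&mem->daemon.waitq, &wait);
		try_to_free_mem_cgroup_pages(mem, GFP_HIGHUSER_MOVABLE);
		/*
		 * no yield(): prepare_to_wait() above puts us back to sleep
		 * as soon as usage falls below the low watermark.
		 */
	}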

> +	}
> +	css_put(&mem->css);
> +	return 0;
> +}
> +
> +static int mem_cgroup_start_daemon(struct mem_cgroup *mem)
> +{
> +	int ret = 0;
> +	struct task_struct *thr;
> +
> +	mutex_lock(&memcont_daemon_lock);
> +	if (!mem->daemon.thread) {
> +		thr = kthread_run(mem_cgroup_reclaim_daemon, mem, "memcontd");
> +		if (IS_ERR(thr))
> +			ret = PTR_ERR(thr);
> +		else
> +			mem->daemon.thread = thr;
> +	}
> +	mutex_unlock(&memcont_daemon_lock);
> +	return ret;
> +}
> +
> +static void mem_cgroup_stop_daemon(struct mem_cgroup *mem)
> +{
> +	mutex_lock(&memcont_daemon_lock);
> +	if (mem->daemon.thread) {
> +		kthread_stop(mem->daemon.thread);
> +		mem->daemon.thread = NULL;
> +	}
> +	mutex_unlock(&memcont_daemon_lock);
> +	return;
> +}
> +
> 
>  static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
>  {
> @@ -915,6 +1001,19 @@ static ssize_t mem_cgroup_write(struct c
>  				struct file *file, const char __user *userbuf,
>  				size_t nbytes, loff_t *ppos)
>  {
> +	int ret;
> +	/*
> +	 * start daemon can fail. But we should start daemon always
> +	 * when changes to HWMARK is succeeded. So, we start daemon before
> +	 * changes to HWMARK. We don't stop this daemon even if
> +	 * res_counter_write fails. To do that, we need ugly codes and
> +	 * it's not so big problem.
> +	 */
> +	if (cft->private == RES_HWMARK) {
> +		ret = mem_cgroup_start_daemon(mem_cgroup_from_cont(cont));
> +		if (ret)
> +			return ret;
> +	}
>  	return res_counter_write(&mem_cgroup_from_cont(cont)->res,
>  				cft->private, userbuf, nbytes, ppos,
>  				mem_cgroup_write_strategy);
> @@ -1004,6 +1103,18 @@ static struct cftype mem_cgroup_files[] 
>  		.read_u64 = mem_cgroup_read,
>  	},
>  	{
> +		.name = "high_wmark_in_bytes",
> +		.private = RES_HWMARK,
> +		.write = mem_cgroup_write,
> +		.read_u64 = mem_cgroup_read,
> +	},
> +	{
> +		.name = "low_wmark_in_bytes",
> +		.private = RES_LWMARK,
> +		.write = mem_cgroup_write,
> +		.read_u64 = mem_cgroup_read,
> +	},
>
...

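For reference, since the daemon is started by the first successful write to the
high watermark file, usage would presumably look something like this (the
/cgroups mount point is hypothetical, and the "memory." prefix and "M" suffix
are assumed to behave as for the existing memory.limit_in_bytes file):

	echo 100M > /cgroups/0/memory.high_wmark_in_bytes
	echo 90M  > /cgroups/0/memory.low_wmark_in_bytes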
 