OpenVZ Forum


Home » Mailing lists » Devel » [RFC][PATCH] another swap controller for cgroup
Re: [RFC][PATCH] another swap controller for cgroup [message #29914 is a reply to message #28403] Tue, 29 April 2008 22:50 Go to previous messageGo to previous message
yamamoto is currently offline  yamamoto
Messages: 97
Registered: July 2007
Member
> > > - anonymous objects (shmem) are not accounted.
> > IMHO, shmem should be accounted.
> > I agree it's difficult in your implementation,
> > but are you going to support it?
> 
> it should be trivial to track how much swap an anonymous object is using.
> i'm not sure how it should be associated with cgroups, tho.
> 
> YAMAMOTO Takashi

i implemented shmem swap accounting.  see below.

YAMAMOTO Takashi


the following is another swap controller, which was designed and
implemented independently from nishimura-san's one.

some random differences from nishimura-san's one:
- counts and limits the number of ptes with swap entries instead of
  on-disk swap slots.  (except swap slots used by anonymous objects,
  which are counted differently.  see below.)
- no swapon-time memory allocation.
- precise wrt moving tasks between cgroups.
- [NEW] anonymous objects (shmem) are associated with a cgroup when they are
  created, and the number of on-disk swap slots used by them is counted for
  the cgroup.  the association is persistent except on cgroup removal,
  in which case its associated objects are moved to init_mm's cgroup.


Signed-off-by: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
---

--- linux-2.6.25-rc8-mm2/mm/swapcontrol.c.BACKUP	2008-04-21 12:06:44.000000000 +0900
+++ linux-2.6.25-rc8-mm2/mm/swapcontrol.c	2008-04-28 14:16:03.000000000 +0900
@@ -0,0 +1,447 @@
+
+/*
+ * swapcontrol.c COPYRIGHT FUJITSU LIMITED 2008
+ *
+ * Author: yamamoto@valinux.co.jp
+ */
+
+#include <linux/err.h>
+#include <linux/cgroup.h>
+#include <linux/hugetlb.h>
+#include <linux/res_counter.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/swapcontrol.h>
+#include <linux/swapops.h>
+#include <linux/shmem_fs.h>
+
+/*
+ * Per-cgroup state for the swap controller.
+ */
+struct swap_cgroup {
+	struct cgroup_subsys_state scg_css;	/* embedded css; cgroup core manages it */
+	struct res_counter scg_counter;		/* swap usage/limit accounting, in bytes */
+	struct list_head scg_shmem_list;	/* shmem objects charged to this group */
+};
+
+/* conversion helpers between task / cgroup / css and struct swap_cgroup */
+#define	task_to_css(task) task_subsys_state((task), swap_cgroup_subsys_id)
+#define	css_to_scg(css)	container_of((css), struct swap_cgroup, scg_css)
+#define	cg_to_css(cg)	cgroup_subsys_state((cg), swap_cgroup_subsys_id)
+#define	cg_to_scg(cg)	css_to_scg(cg_to_css(cg))
+
+/* serializes manipulation of every swap_cgroup's scg_shmem_list */
+static DEFINE_MUTEX(swap_cgroup_shmem_lock);
+
+/*
+ * Return the swap_cgroup that owns the page table page 'ptp', lazily
+ * binding the ptp to mm's current cgroup on first use.
+ *
+ * Called with the page table lock held (see callers), which makes the
+ * test-and-set of ptp->ptp_swap_cgroup safe against concurrent faults
+ * on the same page table page.
+ */
+static struct swap_cgroup *
+swap_cgroup_prepare_ptp(struct page *ptp, struct mm_struct *mm)
+{
+	struct swap_cgroup *scg = ptp->ptp_swap_cgroup;
+
+	BUG_ON(mm == NULL);
+	BUG_ON(mm->swap_cgroup == NULL);
+	if (scg == NULL) {
+		/*
+		 * ptp not yet bound; inherit mm's group.  the read barrier
+		 * pairs with swap_cgroup_attach (NOTE(review): the matching
+		 * write side is in the truncated attach path — confirm).
+		 */
+		smp_rmb();
+		scg = mm->swap_cgroup;
+		BUG_ON(scg == NULL);
+		ptp->ptp_swap_cgroup = scg;
+	}
+
+	return scg;
+}
+
+/*
+ * Charge one swap pte (PAGE_CACHE_SIZE bytes) to the cgroup owning the
+ * page table page 'ptp'.  Returns the res_counter_charge result, i.e.
+ * non-zero when the group is over its limit.
+ *
+ * called with page table locked.
+ */
+int
+swap_cgroup_charge(struct page *ptp, struct mm_struct *mm)
+{
+	struct swap_cgroup * const scg = swap_cgroup_prepare_ptp(ptp, mm);
+
+	return res_counter_charge(&scg->scg_counter, PAGE_CACHE_SIZE);
+}
+
+/*
+ * Undo swap_cgroup_charge for one swap pte in 'ptp'.  The ptp must have
+ * been bound to a group already (it was, by any preceding charge).
+ *
+ * called with page table locked.
+ */
+void
+swap_cgroup_uncharge(struct page *ptp)
+{
+	struct swap_cgroup * const scg = ptp->ptp_swap_cgroup;
+
+	BUG_ON(scg == NULL);
+	res_counter_uncharge(&scg->scg_counter, PAGE_CACHE_SIZE);
+}
+
+/*
+ * Move the charge for one swap pte from page table page 'oldptp' to
+ * 'newptp' (e.g. when a pte is relocated between page tables —
+ * NOTE(review): caller context not visible in this hunk; confirm).
+ *
+ * called with both page tables locked.
+ */
+void
+swap_cgroup_remap_charge(struct page *oldptp, struct page *newptp,
+    struct mm_struct *mm)
+{
+	struct swap_cgroup * const oldscg = oldptp->ptp_swap_cgroup;
+	struct swap_cgroup * const newscg = swap_cgroup_prepare_ptp(newptp, mm);
+
+	BUG_ON(oldscg == NULL);
+	BUG_ON(newscg == NULL);
+	/* common case: both ptps belong to the same group; nothing to move. */
+	if (oldscg == newscg) {
+		return;
+	}
+
+	/*
+	 * the groups differ, which means swap_cgroup_attach is in progress.
+	 * force the charge (ignore the new group's limit) because this move
+	 * must not fail; the old group is uncharged to keep totals balanced.
+	 */
+
+	res_counter_charge_force(&newscg->scg_counter, PAGE_CACHE_SIZE);
+	res_counter_uncharge(&oldscg->scg_counter, PAGE_CACHE_SIZE);
+}
+
+/*
+ * Associate a newly created mm with task 't's swap cgroup, taking a css
+ * reference that is dropped in swap_cgroup_exit_mm.
+ */
+void
+swap_cgroup_init_mm(struct mm_struct *mm, struct task_struct *t)
+{
+	struct swap_cgroup *scg;
+	struct cgroup *cg;
+
+	/* mm->swap_cgroup is not NULL in the case of dup_mm */
+	cg = task_cgroup(t, swap_cgroup_subsys_id);
+	BUG_ON(cg == NULL);
+	scg = cg_to_scg(cg);
+	BUG_ON(scg == NULL);
+	css_get(&scg->scg_css);
+	mm->swap_cgroup = scg;
+}
+
+/*
+ * Drop the css reference taken by swap_cgroup_init_mm when an mm is
+ * torn down.
+ */
+void
+swap_cgroup_exit_mm(struct mm_struct *mm)
+{
+	struct swap_cgroup * const scg = mm->swap_cgroup;
+
+	/*
+	 * note that swap_cgroup_attach does get_task_mm which
+	 * prevents us from being called.  we can assume mm->swap_cgroup
+	 * is stable.
+	 */
+
+	BUG_ON(scg == NULL);
+	mm->swap_cgroup = NULL; /* it isn't necessary.  just being tidy. */
+	css_put(&scg->scg_css);
+}
+
+/*
+ * ->read_u64 handler for the control files below; cft->private selects
+ * which res_counter member to report (RES_USAGE/RES_LIMIT/RES_FAILCNT).
+ */
+static u64
+swap_cgroup_read_u64(struct cgroup *cg, struct cftype *cft)
+{
+
+	return res_counter_read_u64(&cg_to_scg(cg)->scg_counter, cft->private);
+}
+
+/*
+ * ->write_u64 handler; only limit_in_bytes is writable.  Pokes the
+ * limit directly under the counter spinlock because no
+ * res_counter_write_u64 helper exists yet (see XXX).
+ */
+static int
+swap_cgroup_write_u64(struct cgroup *cg, struct cftype *cft, u64 val)
+{
+	struct res_counter *counter = &cg_to_scg(cg)->scg_counter;
+	unsigned long flags;
+
+	/* XXX res_counter_write_u64 */
+	BUG_ON(cft->private != RES_LIMIT);
+	spin_lock_irqsave(&counter->lock, flags);
+	counter->limit = val;
+	spin_unlock_irqrestore(&counter->lock, flags);
+	return 0;
+}
+
+/*
+ * Control files exported to userspace; registered by
+ * swap_cgroup_populate.  Only limit_in_bytes is writable.
+ */
+static const struct cftype files[] = {
+	{
+		.name = "usage_in_bytes",
+		.private = RES_USAGE,
+		.read_u64 = swap_cgroup_read_u64,
+	},
+	{
+		.name = "failcnt",
+		.private = RES_FAILCNT,
+		.read_u64 = swap_cgroup_read_u64,
+	},
+	{
+		.name = "limit_in_bytes",
+		.private = RES_LIMIT,
+		.read_u64 = swap_cgroup_read_u64,
+		.write_u64 = swap_cgroup_write_u64,
+	},
+};
+
+/*
+ * ->create callback: allocate and initialize a swap_cgroup.  The root
+ * group (cg->parent == NULL) is additionally wired up as init_mm's
+ * group, which swap_cgroup_destroy relies on as the fallback owner.
+ */
+static struct cgroup_subsys_state *
+swap_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cg)
+{
+	struct swap_cgroup *scg;
+
+	scg = kzalloc(sizeof(*scg), GFP_KERNEL);
+	if (scg == NULL) {
+		return ERR_PTR(-ENOMEM);
+	}
+	res_counter_init(&scg->scg_counter);
+	if (unlikely(cg->parent == NULL)) {
+		init_mm.swap_cgroup = scg;
+	}
+	INIT_LIST_HEAD(&scg->scg_shmem_list);
+	return &scg->scg_css;
+}
+
+/*
+ * ->destroy callback: before freeing the group, migrate every shmem
+ * object still associated with it to init_mm's group, transferring the
+ * corresponding swap-slot charges.  Afterwards the counter must read
+ * zero (all pte charges are gone once the group has no tasks).
+ */
+static void
+swap_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cg)
+{
+	struct swap_cgroup *oldscg = cg_to_scg(cg);
+	struct swap_cgroup *newscg;
+	struct list_head *pos;
+	struct list_head *next;
+
+	/*
+	 * move our anonymous objects to init_mm's group.
+	 */
+	newscg = init_mm.swap_cgroup;
+	BUG_ON(newscg == NULL);
+	BUG_ON(newscg == oldscg);
+
+	mutex_lock(&swap_cgroup_shmem_lock);
+	list_for_each_safe(pos, next, &oldscg->scg_shmem_list) {
+		struct shmem_inode_info * const info =
+		    list_entry(pos, struct shmem_inode_info, swap_cgroup_list);
+		long bytes;
+
+		/* info->lock stabilizes info->swapped and info->swap_cgroup */
+		spin_lock(&info->lock);
+		BUG_ON(info->swap_cgroup != oldscg);
+		bytes = info->swapped << PAGE_SHIFT;
+		info->swap_cgroup = newscg;
+		res_counter_uncharge(&oldscg->scg_counter, bytes);
+		/* forced: the destination's limit must not make destroy fail */
+		res_counter_charge_force(&newscg->scg_counter, bytes);
+		spin_unlock(&info->lock);
+		list_del_init(&info->swap_cgroup_list);
+		list_add(&info->swap_cgroup_list, &newscg->scg_shmem_list);
+	}
+	mutex_unlock(&swap_cgroup_shmem_lock);
+
+	BUG_ON(res_counter_read_u64(&oldscg->scg_counter, RES_USAGE) != 0);
+	kfree(oldscg);
+}
+
+/*
+ * ->populate callback: register this subsystem's control files for a
+ * newly created cgroup directory.
+ */
+static int
+swap_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cg)
+{
+
+	return cgroup_add_files(cg, ss, files, ARRAY_SIZE(files));
+}
+
+/*
+ * Arguments threaded through walk_page_range's private pointer to
+ * swap_cgroup_attach_mm_cb.
+ */
+struct swap_cgroup_attach_mm_cb_args {
+	struct vm_area_struct *vma;	/* vma currently being walked */
+	struct swap_cgroup *oldscg;	/* group charges are moved from */
+	struct swap_cgroup *newscg;	/* group charges are moved to */
+};
+
+/*
+ * pmd_entry callback for the attach walk: re-own one page table page.
+ * Counts the real swap ptes in the ptp and moves that many charges from
+ * oldscg to newscg, then rebinds the ptp.  Ptps already owned by newscg
+ * (or not yet bound) are left alone.
+ */
+static int
+swap_cgroup_attach_mm_cb(pmd_t *pmd, unsigned long startva, unsigned long endva,
+    void *private)
+{
+	const struct swap_cgroup_attach_mm_cb_args * const args = private;
+	struct vm_area_struct * const vma = args->vma;
+	struct swap_cgroup * const oldscg = args->oldscg;
+	struct swap_cgroup * const newscg = args->newscg;
+	struct page *ptp;
+	spinlock_t *ptl;
+	const pte_t *startpte;
+	const pte_t *pte;
+	unsigned long va;
+	int swslots;
+	int bytes;
+
+	/* the caller rounds the range to whole pmds */
+	BUG_ON((startva & ~PMD_MASK) != 0);
+	BUG_ON((endva & ~PMD_MASK) != 0);
+
+	startpte = pte_offset_map_lock(vma->vm_mm, pmd, startva, &ptl);
+	ptp = pmd_page(*pmd);
+	/* unbound or already transferred: nothing to account */
+	if (ptp->ptp_swap_cgroup == NULL || ptp->ptp_swap_cgroup == newscg) {
+		goto out;
+	}
+	BUG_ON(ptp->ptp_swap_cgroup != oldscg);
+
+	/*
+	 * count the number of swap entries in this page table page.
+	 */
+	swslots = 0;
+	for (va = startva, pte = startpte; va != endva;
+	    pte++, va += PAGE_SIZE) {
+		const pte_t pt_entry = *pte;
+
+		/* skip everything that is not a genuine swap entry: */
+		if (pte_present(pt_entry)) {
+			continue;
+		}
+		if (pte_none(pt_entry)) {
+			continue;
+		}
+		if (pte_file(pt_entry)) {
+			continue;
+		}
+		if (is_migration_entry(pte_to_swp_entry(pt_entry))) {
+			continue;
+		}
+		swslots++;
+	}
+
+	bytes = swslots * PAGE_CACHE_SIZE;
+	res_counter_uncharge(&oldscg->scg_counter, bytes);
+	/*
+	 * XXX ignore newscg's limit because cgroup ->attach method can't fail.
+	 */
+	res_counter_charge_force(&newscg->scg_counter, bytes);
+	ptp->ptp_swap_cgroup = newscg;
+out:
+	pte_unmap_unlock(startpte, ptl);
+
+	return 0;
+}
+
+/* walk descriptor for swap_cgroup_attach_mm; only pmd granularity needed */
+static const struct mm_walk swap_cgroup_attach_mm_walk = {
+	.pmd_entry = swap_cgroup_attach_mm_cb,
+};
+
+/*
+ * Walk every vma of 'mm' and move its swap-pte charges from oldscg to
+ * newscg (via swap_cgroup_attach_mm_cb).  hugetlb vmas are skipped:
+ * they have no swap ptes to account.  The range passed to
+ * walk_page_range is widened to pmd boundaries, as the callback asserts.
+ */
+static void
+swap_cgroup_attach_mm(struct mm_struct *mm, struct swap_cgroup *oldscg,
+    struct swap_cgroup *newscg)
+{
+	struct swap_cgroup_attach_mm_cb_args args;
+	struct vm_area_struct *vma;
+
+	args.oldscg = oldscg;
+	args.newscg = newscg;
+	down_read(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (is_vm_hugetlb_page(vma)) {
+			continue;
+		}
+		args.vma = vma;
+		walk_page_range(mm, vma->vm_start & PMD_MASK,
+		    (vma->vm_end + PMD_SIZE - 1) & PMD_MASK,
+		    &swap_cgroup_attach_mm_walk, &args);
+	}
+	up_read(&mm->mmap_sem);
+}
+
+static void
+swap_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *newcg,
+    struct cgroup *oldcg, struct task_struct *t)
+{
+	struct swap_cgroup *oldmmscg;
+	struct swap_cgroup *ol
...

 
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Read Message
Previous Topic: extradite oracle into a VE, can we?
Next Topic: [PATCH -mm] remove node_ prefix_from ns subsystem
Goto Forum:
  


Current Time: Fri Aug 16 10:48:04 GMT 2024

Total time taken to generate the page: 0.02837 seconds