Re: [RFC][PATCH] another swap controller for cgroup [message #30193 is a reply to message #30191]
Thu, 15 May 2008 08:56
yamamoto
> >  > If so, why is this better
> >  > than charging for actual swap usage?
> >
> >  its behaviour is more deterministic and it uses less memory.
> >  (than nishimura-san's one, which charges for actual swap usage.)
> >
> 
> Using less memory is good, but maybe not worth it if the result isn't so useful.
> 
> I'd say that it's less deterministic than nishimura-san's controller -
> with his you just need to know how much swap is in use (which you can
> tell by observing the app on a real system) but with yours you also
> have to know whether there are any processes sharing anon pages (but
> not mms).

deterministic in the sense that, even when two or more processes
from different cgroups are sharing a page, all of them, rather than
only the unlucky one, are always charged.
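
(a toy userspace sketch of the difference, for illustration only; the
cgroup names and the 4KB page size are made up, and this is of course
not the kernel code:)

/*
 * one anon page is shared by a process in cgroup A and a process in
 * cgroup B, and gets swapped out.
 */
#include <stdio.h>

#define PAGE_BYTES 4096L

int main(void)
{
	long charge_a, charge_b;

	/* per-PTE scheme (this controller): each mm has its own swap PTE
	 * for the page, so both cgroups are always charged. */
	charge_a = PAGE_BYTES;
	charge_b = PAGE_BYTES;
	printf("per-PTE:  A=%ld B=%ld\n", charge_a, charge_b);

	/* per-swap-slot scheme: there is a single swap slot, so only the
	 * cgroup whose process happened to trigger the swap-out ("the
	 * unlucky one") is charged. */
	charge_a = PAGE_BYTES;
	charge_b = 0;
	printf("per-slot: A=%ld B=%ld\n", charge_a, charge_b);
	return 0;
}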

another related advantage is that charges can be moved quite
precisely when a task is moved between cgroups.

> Now it's true that if all the apps you need to run do an execve()
> after forking, then the number of swap ptes really does track the
> amount of swap space in use pretty accurately, since there's not going
> to be any sharing of anon memory between mms. And it might be that
> people decide that the reduced memory overhead justifies this
> limitation. But I think it should be made explicit in the patch
> description and documentation that this controller achieves its
> reduced overhead at the cost of giving (IMO) bogus results on a rather
> ancient but still perfectly legitimate class of Unix application. (The
> apache httpd server used to work this way, for instance. It may still
> but I've not looked at it in a while).

fair enough.
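
(to make the limitation above concrete, a toy sketch with a made-up
worker count; an apache-style preforking server shares anon pages
between the parent and its children:)

#include <stdio.h>

int main(void)
{
	const int nworkers = 32;	/* hypothetical preforked children */
	const int swap_slots = 1;	/* one shared page actually on disk */
	/* each mm gets its own swap PTE for the same swap slot */
	const int swap_ptes = 1 + nworkers;

	printf("real swap used: %d page(s)\n", swap_slots);
	printf("per-PTE charge: %d page(s)\n", swap_ptes);
	return 0;
}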

> >  > - what will happen if someone creates non-NPTL threads, which share an
> >  > mm but not a thread group (so each of them is a thread group leader)?
> >
> >  the thread which is most recently assigned to a cgroup will "win".
> >
> 
> Doesn't that risk triggering the BUG_ON(mm->swap_cgroup != oldscg) in
> swap_cgroup_attach() ?

which version of the patch are you looking at?
the following is the latest copy.

YAMAMOTO Takashi


Signed-off-by: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
---

--- linux-2.6.25-rc8-mm2/mm/swapcontrol.c.BACKUP	2008-04-21 12:06:44.000000000 +0900
+++ linux-2.6.25-rc8-mm2/mm/swapcontrol.c	2008-05-15 13:48:00.000000000 +0900
@@ -0,0 +1,570 @@
+
+/*
+ * swapcontrol.c COPYRIGHT FUJITSU LIMITED 2008
+ *
+ * Author: yamamoto@valinux.co.jp
+ */
+
+/*
+ * there are two types of swap users.
+ * this controller handles both of them.
+ *
+ * - anonymous pages
+ *	eg. user stacks, MAP_PRIVATE pages
+ *
+ *	we track the number of PTEs with swap entries.
+ *	it's precise wrt moving tasks between cgroups.
+ *
+ * - anonymous objects (aka "shmem")
+ *	eg. tmpfs, sysvshm, MAP_SHARED anonymous mapping
+ *
+ *	anonymous objects are associated with a cgroup when they are
+ *	created, and the number of on-disk swap slots they use is counted
+ *	against the cgroup.  the association persists except across cgroup
+ *	removal, in which case the associated objects move to init_mm's cgroup.
+ */
+
+#include <linux/err.h>
+#include <linux/cgroup.h>
+#include <linux/res_counter.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/swapcontrol.h>
+#include <linux/swapops.h>
+#include <linux/shmem_fs.h>
+#include <linux/hugetlb.h>
+
+struct swap_cgroup {
+	struct cgroup_subsys_state scg_css;
+	struct res_counter scg_counter;
+	struct list_head scg_shmem_list;
+};
+
+static struct cgroup_subsys_state *
+task_to_css(struct task_struct *task)
+{
+
+	return task_subsys_state(task, swap_cgroup_subsys_id);
+}
+
+static struct swap_cgroup *
+css_to_scg(struct cgroup_subsys_state *css)
+{
+
+	return container_of(css, struct swap_cgroup, scg_css);
+}
+
+static struct cgroup_subsys_state *
+cg_to_css(struct cgroup *cg)
+{
+
+	return cgroup_subsys_state(cg, swap_cgroup_subsys_id);
+}
+
+static struct swap_cgroup *
+cg_to_scg(struct cgroup *cg)
+{
+
+	return css_to_scg(cg_to_css(cg));
+}
+
+/* ============================================================ */
+/*
+ * anonymous pages
+ */
+
+/*
+ * lazily initialize ptp->ptp_swap_cgroup.
+ *
+ * called with page table locked.
+ */
+static struct swap_cgroup *
+swap_cgroup_prepare_ptp(struct page *ptp, struct mm_struct *mm)
+{
+	struct swap_cgroup *scg = ptp->ptp_swap_cgroup;
+
+	BUG_ON(mm == NULL);
+	BUG_ON(mm->swap_cgroup == NULL);
+
+	/*
+	 * scg == NULL here means that it's the first time we access
+	 * this PTP.  in that case, initialize ptp->ptp_swap_cgroup
+	 * from mm->swap_cgroup.
+	 */
+	if (scg == NULL) {
+		/*
+		 * see swap_cgroup_attach.
+		 */
+		smp_rmb();
+		scg = mm->swap_cgroup;
+		BUG_ON(scg == NULL);
+		ptp->ptp_swap_cgroup = scg;
+	}
+
+	return scg;
+}
+
+/*
+ * called with page table locked.
+ */
+int
+swap_cgroup_charge(struct page *ptp, struct mm_struct *mm)
+{
+	struct swap_cgroup * const scg = swap_cgroup_prepare_ptp(ptp, mm);
+
+	return res_counter_charge(&scg->scg_counter, PAGE_CACHE_SIZE);
+}
+
+/*
+ * called with page table locked.
+ */
+void
+swap_cgroup_uncharge(struct page *ptp)
+{
+	struct swap_cgroup * const scg = ptp->ptp_swap_cgroup;
+
+	BUG_ON(scg == NULL);
+	res_counter_uncharge(&scg->scg_counter, PAGE_CACHE_SIZE);
+}
+
+/*
+ * special version of swap_cgroup_charge/swap_cgroup_uncharge for mremap.
+ *
+ * called with both page tables locked.
+ */
+void
+swap_cgroup_remap_charge(struct page *oldptp, struct page *newptp,
+    struct mm_struct *mm)
+{
+	struct swap_cgroup * const oldscg = oldptp->ptp_swap_cgroup;
+	struct swap_cgroup * const newscg = swap_cgroup_prepare_ptp(newptp, mm);
+
+	BUG_ON(oldscg == NULL);
+	BUG_ON(newscg == NULL);
+
+	/*
+	 * normally we have nothing to do as these PTPs belong to the same mm.
+	 */
+	if (oldscg == newscg)
+		return;
+
+	/*
+	 * swap_cgroup_attach is in progress.
+	 * it's an exceptional event.
+	 *
+	 * forcing charge here shouldn't matter much
+	 * as the condition is likely transitional.
+	 */
+
+	res_counter_charge_force(&newscg->scg_counter, PAGE_CACHE_SIZE);
+	res_counter_uncharge(&oldscg->scg_counter, PAGE_CACHE_SIZE);
+}
+
+struct swap_cgroup_attach_mm_cb_args {
+	struct vm_area_struct *vma;
+	struct swap_cgroup *oldscg;
+	struct swap_cgroup *newscg;
+};
+
+/*
+ * an mm_walk callback function.  used by swap_cgroup_attach_mm.
+ *
+ * investigate each PTE in the range and adjust the res_counters.
+ * note that the page table lock prevents concurrent charge/uncharge
+ * attempts.
+ */
+static int
+swap_cgroup_attach_mm_cb(pmd_t *pmd, unsigned long startva, unsigned long endva,
+    void *private)
+{
+	const struct swap_cgroup_attach_mm_cb_args * const args = private;
+	struct vm_area_struct * const vma = args->vma;
+	struct swap_cgroup * const oldscg = args->oldscg;
+	struct swap_cgroup * const newscg = args->newscg;
+	struct page *ptp;
+	spinlock_t *ptl;
+	const pte_t *startpte;
+	const pte_t *pte;
+	unsigned long va;
+	int swslots;
+	int bytes;
+
+	BUG_ON((startva & ~PMD_MASK) != 0);
+	BUG_ON((endva & ~PMD_MASK) != 0);
+
+	startpte = pte_offset_map_lock(vma->vm_mm, pmd, startva, &ptl);
+	ptp = pmd_page(*pmd);
+	/*
+	 * ptp->ptp_swap_cgroup == newscg here means this PTP is covered
+	 * by another VMA as well.
+	 */
+	if (ptp->ptp_swap_cgroup == NULL || ptp->ptp_swap_cgroup == newscg)
+		goto out;
+	BUG_ON(ptp->ptp_swap_cgroup != oldscg);
+
+	/*
+	 * count the number of swap entries in this page table page.
+	 */
+	swslots = 0;
+	for (va = startva, pte = startpte; va != endva;
+	    pte++, va += PAGE_SIZE) {
+		const pte_t pt_entry = *pte;
+
+		if (pte_present(pt_entry))
+			continue;
+		if (pte_none(pt_entry))
+			continue;
+		if (pte_file(pt_entry))
+			continue;
+		if (is_migration_entry(pte_to_swp_entry(pt_entry)))
+			continue;
+		swslots++;
+	}
+
+	bytes = swslots * PAGE_CACHE_SIZE;
+	res_counter_uncharge(&oldscg->scg_counter, bytes);
+	/*
+	 * XXX ignore newscg's limit because cgroup ->attach method can't fail.
+	 */
+	res_counter_charge_force(&newscg->scg_counter, bytes);
+	ptp->ptp_swap_cgroup = newscg;
+out:
+	pte_unmap_unlock(startpte, ptl);
+
+	return 0;
+}
+
+static const struct mm_walk swap_cgroup_attach_mm_walk = {
+	.pmd_entry = swap_cgroup_attach_mm_cb,
+};
+
+/*
+ * walk VMAs to adjust res_counters.
+ */
+static void
+swap_cgroup_attach_mm(struct mm_struct *mm, struct swap_cgroup *oldscg,
+    struct swap_cgroup *newscg)
+{
+	struct swap_cgroup_attach_mm_cb_args args;
+	struct vm_area_struct *vma;
+
+	args.oldscg = oldscg;
+	args.newscg = newscg;
+	down_read(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (is_vm_hugetlb_page(vma))
+			continue;
+		args.vma = vma;
+		walk_page_range(mm, vma->vm_start & PMD_MASK,
+		    (vma->vm_end + PMD_SIZE - 1) & PMD_MASK,
+		    &swap_cgroup_attach_mm_walk, &args);
+	}
+	up_read(&mm->mmap_sem);
+}
+
+static void
+swap_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *newcg,
+    struct cgroup *oldcg, struct task_struct *t)
+{
+	struct swap_cgroup *oldmmscg;
+	struct swap_cgroup *oldtaskscg;
+	struct swap_cgroup *newscg;
+	struct mm_struct *mm;
+
+	BUG_ON(oldcg == NULL);
+	BUG_ON(newcg == NULL);
+	BUG_ON(cg_to_css(oldcg) == NULL);
+	BUG_ON(cg_to_css(newcg) == NULL);
+	BUG_ON(oldcg == newcg);
+
+	if (!thread_group_leader(t))
+		return;
+	mm = get_task_mm(t);
+	if (mm == NULL)
+		return;
+	oldtaskscg = cg_to_scg(oldcg);
+	newscg = cg_to_scg(newcg);
+	BUG_ON(oldtaskscg == newscg);
+	/*
+	 * note that a task and its mm can belong to different cgroups
+	 * with the current implementation.
+	 */
+	oldmmscg = mm->swap_cgroup;
+	if (oldmmscg != newscg) {
+		css_get(&newscg->sc
...

 