One way of doing application memory checkpointing, suggested by
Dave Hansen, is to exploit the swapfile for storing memory and
just dump the relevant metadata.
To that end, this patch allows a set of tasks to be associated with
one specific swapfile. I called it 'swap namespaces', but it's
actually not a namespace at all, just a filter. The initial swap_ns
is null. In this swap_ns multiple swapfiles can be used. In a child
swap_ns, only one swapfile can be swapon'd. A swapfile can only be
active in one swap_ns. When you clone a new swap_ns, you begin with
no swapfile.
You can't unshare a swap_ns, you must use clone. You must also
pass CLONE_NEWIPC, and must not pass CLONE_VM.
This patch doesn't address the limited number of swapfiles per
system, which I imagine will become a tremendous shortcoming for
machines intended to run a large number of c/r jobs.
This patch is based on top of Cedric's clone64 patchset so as
to have a free CLONE_NEWSWAP flag, so you pass CLONE_NEWSWAP in
through cloneflags_hi.
One way to test is as follows:
login 1                                 login 2
=======================================================================
dd if=/dev/zero of=/s1 bs=1M \
        count=512
                                        dd if=/dev/zero of=/s2 bs=1M \
                                                count=512
                                        strings /s2
                                        (you see only the swap signature)
clone64(CLONE_NEWSWAP|CLONE_NEWIPC)
                                        clone64(CLONE_NEWSWAP|CLONE_NEWIPC)
swapon /s1
                                        swapon /s2
cd $LTPDIR
runltp -l out1 > txt1 2>&1
(let ltp run...)
exit
                                        exit
Now you can notice the following:
The output of strings /s2 contains no memory from the ltp run.
In ltp, the following tests failed in addition to those which also
fail on an unpatched kernel, or on a patched kernel outside a swap_ns:
swapon01
swapon02
swapoff01
swapoff02
mem01
The first four are because you cannot swapon a new swapfile,
since you already have /s1 swapped on.
The last one, if you look at your console, is because mem01 was
killed by oom.
As I often note with patchsets like these, I don't expect this
code or even the design to go upstream, although the patch appears
to work. Rather this posting is mainly to collect feedback and
discussion.
thanks,
-serge
>From b2a5b0cd72fd8adcb58343a6f3ef6b5340ea284a Mon Sep 17 00:00:00 2001
From: Serge E. Hallyn <serue@us.ibm.com>
Date: Fri, 18 Apr 2008 16:46:10 -0400
Subject: [RFC PATCH 1/1] swap namespaces: introduce basic, simple swap namespaces (v3)
By default processes are in the "init" swapns, which is NULL.
By default, all swapfiles are in the init swapns.
When a task calls clone(CLONE_NEWSWAP), it must also have
no vm in common with any tasks in another swap_namespace.
Its nsproxy->swap_namespace is then a structure containing
an 'int type', which defaults to -1.
A task in a non-init swapns can only have one swapfile.
After swapon, nsproxy->swap_namespace->type will be the
index into the swaplist. The corresponding swap_info_struct
will have swap_namespace pointing to the new swap_namespace.
(For any swapfiles which were swapon()d in the init swapns,
it is NULL).
get_swap_page now takes an argument, the page for which it
is finding a swap page. We use the anon_vma to find an mm
to which the page belongs, use the mm->owner to find the
owning task, and use that to find the swap_namespace. If
it exists, then we use its swapfile to find the swap page.
Note:
shmem is a problem at checkpoint - any swapped pages from
an mmap on a shmem file will be in the init swap ns. So
we'll need (1) to copy over any tmpfs files for which there
are open fds, and (2) find a way to detect mmaped tmpfs files
which no longer have an open fd.
Changelog:
Apr 24: small CONFIG_SWAP_NS=n fixes
Apr 23: fix wrongly inverted check for CLONE_VM
Apr 23: use u64 for clone flags (oops)
Apr 21: force CLONE_NEWIPC if clone(CLONE_NEWSWAP)
Apr 18: make sure to filter out private /proc/swaps
entries when viewing from the init namespace.
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
---
include/linux/nsproxy.h | 2 +
include/linux/rmap.h | 6 ++
include/linux/sched.h | 1 +
include/linux/swap.h | 6 +-
include/linux/swap_namespace.h | 68 ++++++++++++++++
init/Kconfig | 8 ++
kernel/fork.c | 21 +++++
kernel/nsproxy.c | 14 +++-
mm/Makefile | 2 +-
mm/rmap.c | 4 +-
mm/shmem.c | 2 +-
mm/swap_namespace.c | 84 +++++++++++++++++++
mm/swap_state.c | 2 +-
mm/swapfile.c | 174 +++++++++++++++++++++++++++++++++++++++-
14 files changed, 385 insertions(+), 9 deletions(-)
create mode 100644 include/linux/swap_namespace.h
create mode 100644 mm/swap_namespace.c
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 5395e8c..8e2490a 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -8,6 +8,7 @@ struct mnt_namespace;
struct uts_namespace;
struct ipc_namespace;
struct pid_namespace;
+struct swap_namespace;
/*
* A structure to contain pointers to all per-process
@@ -29,6 +30,7 @@ struct nsproxy {
struct pid_namespace *pid_ns;
struct user_namespace *user_ns;
struct net *net_ns;
+ struct swap_namespace *swap_ns;
};
extern struct nsproxy init_nsproxy;
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 1383692..48024db 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -31,6 +31,9 @@ struct anon_vma {
#ifdef CONFIG_MMU
+extern struct anon_vma *page_lock_anon_vma(struct page *page);
+extern void page_unlock_anon_vma(struct anon_vma *anon_vma);
+
extern struct kmem_cache *anon_vma_cachep;
static inline struct anon_vma *anon_vma_alloc(void)
@@ -111,6 +114,9 @@ int page_mkclean(struct page *);
#else /* !CONFIG_MMU */
+#define page_lock_anon_vma(p) (NULL)
+#define page_unlock_anon_vma(p) do {} while (0)
+
#define anon_vma_init() do {} while (0)
#define anon_vma_prepare(vma) (0)
#define anon_vma_link(vma) do {} while (0)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 90130b7..5d59c2f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -28,6 +28,7 @@
#define CLONE_NEWPID 0x20000000 /* New pid namespace */
#define CLONE_NEWNET 0x40000000 /* New network namespace */
#define CLONE_IO 0x80000000 /* Clone io context */
+#define CLONE_NEWSWAP 0x0000000100000000ULL
/*
* Scheduling policies
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0b33776..e0e8f3d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -7,6 +7,7 @@
#include <linux/list.h>
#include <linux/memcontrol.h>
#include <linux/sched.h>
+#include <linux/swap_namespace.h>
#include <asm/atomic.h>
#include <asm/page.h>
@@ -134,6 +135,7 @@ enum {
*/
struct swap_info_struct {
unsigned int flags;
+ struct swap_namespace *swap_namespace;
int prio; /* swap priority */
struct file *swap_file;
struct block_device *bdev;
@@ -239,7 +241,7 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t,
extern long total_swap_pages;
extern unsigned int nr_swapfiles;
extern void si_swapinfo(struct sysinfo *);
-extern swp_entry_t get_swap_page(void);
+extern swp_entry_t get_swap_page(struct page *page);
extern swp_entry_t get_swap_page_of_type(int);
extern int swap_duplicate(swp_entry_t);
extern int valid_swaphandles(swp_entry_t, unsigned long *);
@@ -342,7 +344,7 @@ static inline int remove_exclusive_swap_page(struct page *p)
return 0;
}
-static inline swp_entry_t get_swap_page(void)
+static inline swp_entry_t get_swap_page(struct page *page)
{
swp_entry_t entry;
entry.val = 0;
diff --git a/include/linux/swap_namespace.h b/include/linux/swap_namespace.h
new file mode 100644
index 0000000..3839d58
--- /dev/null
+++ b/include/linux/swap_namespace.h
@@ -0,0 +1,68 @@
+#ifndef _LINUX_SWAP_NS_H
+#define _LINUX_SWAP_NS_H
+
+#include <linux/list.h>
+#include <linux/kref.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <asm/page.h>
+
+/*
+ * Practically, we store both type and si because both are useful.
+ * During get_swap_page(), we use type==-1 to decide whether there
+ * is a swapfile.
+ * During swapon/swapoff, we use si==NULL. This is because there
+ * are certain times - especially during swapoff - where we drop the
+ * swap_lock after removing the swapfile, only to find we couldn't
+ * remove the swapfile and need to reactivate it. Using si==NULL
+ * for the check here allows us to prevent a racing swapon and
+ * swapoff during this window.
+ */
+struct swap_namespace {
+ struct kref kref;
+ int type; /* index into swap_list */
+ struct swap_info_struct *si;
+ /*
+ * ns->lock can only be held *under* swap_lock
+ */
+ spinlock_t lock;
+};
+
+#ifdef CONFIG_SWAP_NS
+extern void free_swap_ns(struct kref *kref);
+extern struct swap_namespace *copy_swap_ns(u64 flags,
+ struct swap_namespace *ns);
+extern struct swap_namespace *vma_page_to_swapns(struct page *page);
+#else
+static inline void free_swap_ns(struct kref *kref)
+{
+}
+
+static inline struct swap_namespace *copy_swap_ns(u64 flags,
+ struct swap_namespace *ns)
+{
+ /* CONFIG_SWAP_NS=n: reject CLONE_NEWSWAP, otherwise return NULL; no refcount is taken */
+ if (flags&CLONE_NEWSWAP)
+ return ERR_PTR(-EINVAL);
+ return NULL;
+}
+
+static inline struct swap_namespace *vma_page_to_swapns(struct page *page)
+{
+ return NULL;
+}
+#endif
+
+static inline struct swap_namespace *get_swap_ns(struct swap_namespace *swapns)
+{
+ if (swapns)
+ kref_get(&swapns->kref);
+ return swapns;
+}
+
+static inline void put_swap_ns(struct swap_namespace *swapns)
+{
+ if (swapns)
+ kref_put(&swapns->kref, free_swap_ns);
+}
+#endif
diff --git a/init/Kconfig b/init/Kconfig
index 27b1660..06e3d1e 100644
--- a/init/Kconfig
+++ b/init/Kc
...