OpenVZ Forum


Home » Mailing lists » Devel » [PATCH 0/6] linux-cr: make pids a proper shared object (v2)
[PATCH 0/6] linux-cr: make pids a proper shared object (v2) [message #41559] Mon, 07 February 2011 17:18 Go to next message
Oren Laadan is currently offline  Oren Laadan
Messages: 71
Registered: August 2007
Member
From: *parallels.com
Hi,

This patch-set changes the way pids are saved/restored to treat them
as proper shared objects (tracked in objhash) rather than simple pid
numbers. The patch-set applies on top of branch 'ckpt-v23-rc1'. It
also requires updates to user-cr (patchset posted separately).

Changelog[v2]:
- Wipe out many bugs - this version now passes all tests-cr tests.
- Rename 'depth' to 'vpids' as per Suka's suggestion

Oren.


_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
[PATCH 1/6] c/r: introduce ckpt_task_vnr(), ckpt_pid_vnr() [message #41560 is a reply to message #41559] Mon, 07 February 2011 17:18 Go to previous messageGo to next message
Oren Laadan is currently offline  Oren Laadan
Messages: 71
Registered: August 2007
Member
From: *parallels.com
This helper is useful to get the pid from the root task's (checkpoint
or restart) point of view.

Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
include/linux/checkpoint.h | 10 +++++++++-
kernel/checkpoint/checkpoint.c | 4 ++--
kernel/checkpoint/restart.c | 6 ++----
kernel/signal.c | 2 +-
4 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 6c0ccfd..21fc23e 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -114,8 +114,16 @@ extern int checkpoint_dump_page(struct ckpt_ctx *ctx, struct page *page);
extern int restore_read_page(struct ckpt_ctx *ctx, struct page *page);

/* pids */
-extern pid_t ckpt_pid_nr(struct ckpt_ctx *ctx, struct pid *pid);
extern struct pid *_ckpt_find_pgrp(struct ckpt_ctx *ctx, pid_t pgid);
+static inline pid_t ckpt_task_vnr(struct ckpt_ctx *ctx, struct task_struct *task)
+{
+ return task_pid_nr_ns(task, ctx->root_nsproxy->pid_ns);
+}
+static inline pid_t ckpt_pid_vnr(struct ckpt_ctx *ctx, struct pid *pid)
+{
+ return pid_nr_ns(pid, ctx->root_nsproxy->pid_ns);
+}
+extern int ckpt_lookup_pid(struct ckpt_ctx *ctx, struct pid *pid);

/* defined in objhash.c and also used in security/security.c */
extern void lsm_string_free(struct kref *kref);
diff --git a/kernel/checkpoint/checkpoint.c b/kernel/checkpoint/checkpoint.c
index 0f46acf..01653d7 100644
--- a/kernel/checkpoint/checkpoint.c
+++ b/kernel/checkpoint/checkpoint.c
@@ -243,10 +243,10 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
{
struct task_struct *root = ctx->root_task;
struct nsproxy *nsproxy;
- int ret = 0;
struct pid_namespace *pidns;
+ int ret = 0;

- ckpt_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
+ ckpt_debug("check %d\n", ckpt_task_vnr(ctx, t));

if (t->exit_state == EXIT_DEAD) {
_ckpt_err(ctx, -EBUSY, "%(T)Task state EXIT_DEAD\n");
diff --git a/kernel/checkpoint/restart.c b/kernel/checkpoint/restart.c
index 01da67f..66b6625 100644
--- a/kernel/checkpoint/restart.c
+++ b/kernel/checkpoint/restart.c
@@ -913,8 +913,7 @@ static int wait_task_active(struct ckpt_ctx *ctx)

static int wait_task_sync(struct ckpt_ctx *ctx)
{
- ckpt_debug("pid %d syncing\n",
- task_pid_nr_ns(current, task_active_pid_ns(ctx->root_task)));
+ ckpt_debug("pid %d syncing\n", ckpt_task_vnr(ctx, current));
wait_event_interruptible(ctx->waitq, ckpt_test_complete(ctx));
ckpt_debug("task sync done (errno %d)\n", ctx->errno);
if (ckpt_test_error(ctx))
@@ -1187,10 +1186,9 @@ static struct task_struct *choose_root_task(struct ckpt_ctx *ctx, pid_t pid)

read_lock(&tasklist_lock);
list_for_each_entry(task, &current->children, sibling) {
- if (task_pid_nr_ns(task, ctx->coord_pidns) == pid) {
+ if (task_pid_vnr(task) == pid) {
get_task_struct(task);
ctx->root_task = task;
- ctx->root_pid = pid;
break;
}
}
diff --git a/kernel/signal.c b/kernel/signal.c
index b1e6a31..dca40be 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3352,7 +3352,7 @@ static int restore_signal(struct ckpt_ctx *ctx)
* fail, so no need for explicit test
*/
ret = do_tiocspgrp(tty, tty_pair_get_tty(tty),
- h->tty_pgrp);
+ pid_vnr(pgrp));
if (ret < 0)
goto out;
}
--
1.7.1

_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
[PATCH 2/6] c/r: nit to avoid rcu lockdep complaint in restore_obj_sighand() [message #41573 is a reply to message #41559] Mon, 07 February 2011 17:18 Go to previous messageGo to next message
Oren Laadan is currently offline  Oren Laadan
Messages: 71
Registered: August 2007
Member
From: *parallels.com
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
kernel/signal.c | 7 +++++--
1 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index dca40be..3842f5d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2871,7 +2871,7 @@ static void *restore_sighand(struct ckpt_ctx *ctx)
int restore_obj_sighand(struct ckpt_ctx *ctx, int sighand_objref)
{
struct sighand_struct *sighand;
- struct sighand_struct *old_sighand;
+ struct sighand_struct *old_sighand = NULL;

sighand = ckpt_obj_fetch(ctx, sighand_objref, CKPT_OBJ_SIGHAND);
if (IS_ERR(sighand))
@@ -2884,11 +2884,14 @@ int restore_obj_sighand(struct ckpt_ctx *ctx, int sighand_objref)

/* manipulate tsk->sighand with tasklist lock write-held */
write_lock_irq(&tasklist_lock);
- old_sighand = rcu_dereference(current->sighand);
+ old_sighand = rcu_dereference_check(current->sighand,
+ rcu_read_lock_held() ||
+ lockdep_is_held(&tasklist_lock));
spin_lock(&old_sighand->siglock);
rcu_assign_pointer(current->sighand, sighand);
spin_unlock(&old_sighand->siglock);
write_unlock_irq(&tasklist_lock);
+
__cleanup_sighand(old_sighand);

return 0;
--
1.7.1

_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
[PATCH 3/6] c/r: introduce ckpt_obj_count() for the total number of objects [message #41574 is a reply to message #41559] Mon, 07 February 2011 17:18 Go to previous messageGo to next message
Oren Laadan is currently offline  Oren Laadan
Messages: 71
Registered: August 2007
Member
From: *parallels.com
This will be needed for pids-objects in container checkpoints. In
userspace we will rely on the pids objects being inserted into the
objhash sequentially. For subtree checkpoint, the pids objects are the
first to enter the objhash, therefore their tags will start counting
from 1. In container checkpoint, other objects are inserted first, so we
will need to know how many objects were inserted earlier - so we will
also save the count at that time.

Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
include/linux/checkpoint.h | 2 ++
kernel/checkpoint/objhash.c | 5 +++++
2 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 21fc23e..65b6477 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -211,6 +211,8 @@ extern int ckpt_obj_insert(struct ckpt_ctx *ctx, void *ptr, int objref,
extern int ckpt_obj_reserve(struct ckpt_ctx *ctx);
extern int ckpt_obj_delete(struct ckpt_ctx *ctx, int objref,
enum obj_type type);
+extern int ckpt_obj_count(struct ckpt_ctx *ctx);
+

extern struct ckpt_ctx *ckpt_ctx_get(struct ckpt_ctx *ctx);
extern void ckpt_ctx_put(struct ckpt_ctx *ctx);
diff --git a/kernel/checkpoint/objhash.c b/kernel/checkpoint/objhash.c
index 40c2e9b..cbace7a 100644
--- a/kernel/checkpoint/objhash.c
+++ b/kernel/checkpoint/objhash.c
@@ -158,6 +158,11 @@ static inline int obj_alloc_objref(struct ckpt_ctx *ctx)
return ctx->obj_hash->next_free_objref++;
}

+int ckpt_obj_count(struct ckpt_ctx *ctx)
+{
+ return ctx->obj_hash->next_free_objref;
+}
+
/**
* ckpt_obj_new - add an object to the obj_hash
* @ctx: checkpoint context
--
1.7.1

_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
[PATCH 6/6] c/r: use pids objects for the pgrp/old_pgdp of ttys [message #41575 is a reply to message #41559] Mon, 07 February 2011 17:18 Go to previous message
Oren Laadan is currently offline  Oren Laadan
Messages: 71
Registered: August 2007
Member
From: *parallels.com
Make tty_old_pgrp and tty_pgrp use the shared pid instead of saving
the actual pid number in {checkpoint,restore}_signal().

Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
kernel/signal.c | 42 ++++++++++++++++++++++++++++++++----------
1 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 718b940..5c174a9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3218,25 +3218,34 @@ static int checkpoint_signal(struct ckpt_ctx *ctx, struct task_struct *t)

/* tty */
if (signal->leader) {
- h->tty_old_pgrp = ckpt_pid_vnr(ctx, signal->tty_old_pgrp);
+ h->tty_old_pgrp = ckpt_lookup_pid(ctx, signal->tty_old_pgrp);
tty = tty_kref_get(signal->tty);
if (tty) {
/* irq is already disabled */
spin_lock(&tty->ctrl_lock);
- h->tty_pgrp = ckpt_pid_vnr(ctx, tty->pgrp);
+ h->tty_pgrp = ckpt_lookup_pid(ctx, tty->pgrp);
spin_unlock(&tty->ctrl_lock);
- tty_kref_put(tty);
}
}

unlock_task_sighand(t, &flags);

/*
+ * tty_pgrp must be in our namespace, since we are the
+ * session leader and could not have unshared pidns !
+ */
+ if (tty && h->tty_pgrp == 0) {
+ ckpt_err(ctx, -EBUSY, "%(T)tty_pgrp outside namespace\n");
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /*
* If the session is in an ancestor namespace, skip this tty
* and set tty_objref = 0. It will not be explicitly restored,
* but rather inherited from parent pid-ns at restart time.
*/
- if (tty && ckpt_pid_vnr(ctx, tty->session) > 0) {
+ if (tty && ckpt_lookup_pid(ctx, tty->session) > 0) {
h->tty_objref = checkpoint_obj(ctx, tty, CKPT_OBJ_TTY);
if (h->tty_objref < 0)
ret = h->tty_objref;
@@ -3255,6 +3264,7 @@ static int checkpoint_signal(struct ckpt_ctx *ctx, struct task_struct *t)
list_splice(&shared_pending.list, &signal->shared_pending.list);
unlock_task_sighand(t, &flags);
out:
+ tty_kref_put(tty);
ckpt_hdr_put(ctx, h);
return ret;
}
@@ -3339,6 +3349,10 @@ static int restore_signal(struct ckpt_ctx *ctx)

/* tty - session */
if (h->tty_objref) {
+ /*
+ * should only get here if we are session leader, but
+ * we don't explicitly check: forced by calls below
+ */
tty = ckpt_obj_fetch(ctx, h->tty_objref, CKPT_OBJ_TTY);
if (IS_ERR(tty)) {
ret = PTR_ERR(tty);
@@ -3354,6 +3368,11 @@ static int restore_signal(struct ckpt_ctx *ctx)
* If tty_pgrp == CKPT_PID_NULL, below will
* fail, so no need for explicit test
*/
+ pgrp = ckpt_obj_fetch(ctx, h->tty_pgrp, CKPT_OBJ_PID);
+ if (IS_ERR(pgrp)) {
+ ret = PTR_ERR(pgrp);
+ goto out;
+ }
ret = do_tiocspgrp(tty, tty_pair_get_tty(tty),
pid_vnr(pgrp));
if (ret < 0)
@@ -3382,13 +3401,16 @@ static int restore_signal(struct ckpt_ctx *ctx)
do_setitimer(ITIMER_VIRTUAL, &itimer, NULL);
do_setitimer(ITIMER_PROF, &itimer, NULL);

- /* tty - tty_old_pgrp */
- if (current->signal->leader && h->tty_old_pgrp != CKPT_PID_NULL) {
- rcu_read_lock();
- pgrp = NULL; /* temp until next patch */
- rcu_read_unlock();
- if (!pgrp)
+ /*
+ * tty - tty_old_pgrp: only for session leaders and for valid
+ * tty_old_pgrp, ie, within our namespace, and not CKPT_PID_NULL.
+ */
+ if (current->signal->leader && h->tty_old_pgrp > 0) {
+ pgrp = ckpt_obj_fetch(ctx, h->tty_old_pgrp, CKPT_OBJ_PID);
+ if (IS_ERR(pgrp)) {
+ ret = PTR_ERR(pgrp);
goto out;
+ }
}

spin_lock_irq(&current->sighand->siglock);
--
1.7.1

_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
[PATCH 4/6] c/r: introduce pids objects [message #41576 is a reply to message #41559] Mon, 07 February 2011 17:18 Go to previous message
Oren Laadan is currently offline  Oren Laadan
Messages: 71
Registered: August 2007
Member
From: *parallels.com
Add the interface to handle pids as proper c/r shared objects. This
is the first step in converting the c/r code to handle pid as pids
objects that are namespace aware. Also remove the unused field
@root_pid from struct ckpt_ctx.

Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
include/linux/checkpoint_hdr.h | 6 +++++-
include/linux/checkpoint_types.h | 1 -
kernel/checkpoint/checkpoint.c | 2 --
kernel/checkpoint/restart.c | 1 -
kernel/pid.c | 28 ++++++++++++++++++++++++++++
5 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index b12d586..922eff0 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -245,6 +245,8 @@ struct ckpt_hdr_objref {
enum obj_type {
CKPT_OBJ_IGNORE = 0,
#define CKPT_OBJ_IGNORE CKPT_OBJ_IGNORE
+ CKPT_OBJ_PID,
+#define CKPT_OBJ_PID CKPT_OBJ_PID
CKPT_OBJ_INODE,
#define CKPT_OBJ_INODE CKPT_OBJ_INODE
CKPT_OBJ_FILE_TABLE,
@@ -373,7 +375,9 @@ struct ckpt_pids {
} __attribute__((aligned(8)));

/* pids */
-#define CKPT_PID_NULL -1
+/* (negative but not valid error) */
+#define CKPT_PID_NULL (-4096) /* null pid pointer */
+#define CKPT_PID_ROOT (-4097) /* pid same as root task */

/* task data */
struct ckpt_hdr_task {
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index cf74d3e..87a569a 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -35,7 +35,6 @@ struct ckpt_ctx {
ktime_t ktime_begin; /* operation start time */

int root_init; /* [container] root init ? */
- pid_t root_pid; /* [container] root pid */
struct task_struct *root_task; /* [container] root task */
struct nsproxy *root_nsproxy; /* [container] root nsproxy */
struct task_struct *root_freezer; /* [container] root task */
diff --git a/kernel/checkpoint/checkpoint.c b/kernel/checkpoint/checkpoint.c
index 01653d7..a850423 100644
--- a/kernel/checkpoint/checkpoint.c
+++ b/kernel/checkpoint/checkpoint.c
@@ -628,8 +628,6 @@ static int init_checkpoint_ctx(struct ckpt_ctx *ctx, pid_t pid)
* occurs then ckpt_ctx_free() is eventually called.
*/

- ctx->root_pid = pid;
-
/* root task */
rcu_read_lock();
task = find_task_by_vpid(pid);
diff --git a/kernel/checkpoint/restart.c b/kernel/checkpoint/restart.c
index 66b6625..9aaab4f 100644
--- a/kernel/checkpoint/restart.c
+++ b/kernel/checkpoint/restart.c
@@ -1178,7 +1178,6 @@ static struct task_struct *choose_root_task(struct ckpt_ctx *ctx, pid_t pid)
struct task_struct *task;

if (ctx->uflags & RESTART_TASKSELF) {
- ctx->root_pid = pid;
ctx->root_task = current;
get_task_struct(current);
return current;
diff --git a/kernel/pid.c b/kernel/pid.c
index f08f40a..0269991 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -36,6 +36,7 @@
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
+#include <linux/checkpoint.h>

#define pid_hashfn(nr, ns) \
hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -571,6 +572,30 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
return pid;
}

+#ifdef CONFIG_CHECKPOINT
+
+/* pid-related checkpoint objects */
+
+static int obj_pid_grab(void *ptr)
+{
+ get_pid((struct pid *) ptr);
+ return 0;
+}
+
+static void obj_pid_drop(void *ptr, int lastref)
+{
+ put_pid((struct pid *) ptr);
+}
+
+static const struct ckpt_obj_ops ckpt_obj_pid_ops = {
+ .obj_name = "PID",
+ .obj_type = CKPT_OBJ_PID,
+ .ref_drop = obj_pid_drop,
+ .ref_grab = obj_pid_grab,
+};
+
+#endif /* CONFIG_CHECKPOINT */
+
/*
* The pid hash table is scaled according to the amount of memory in the
* machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
@@ -605,4 +630,7 @@ void __init pidmap_init(void)

init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+#ifdef CONFIG_CHECKPOINT
+ register_checkpoint_obj(&ckpt_obj_pid_ops);
+#endif
}
--
1.7.1

_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
[PATCH 5/6] c/r: checkpoint and restart pids objects [message #41577 is a reply to message #41559] Mon, 07 February 2011 17:18 Go to previous message
Oren Laadan is currently offline  Oren Laadan
Messages: 71
Registered: August 2007
Member
From: *parallels.com
Make use of (shared) pids objects instead of simply saving the pid_t
numbers in both checkpoint and restart.

The motivation for this change is twofold. First, since pid-ns came to
life pid's in the kernel _are_ shared objects and should be treated as
such. This is useful e.g. for tty handling and also file-ownership
(the latter waiting for this feature). Second, to properly support
nested namespaces we need to report with each pid the entire list of
pid numbers, not only a single pid. While currently we do that for all
"live" pids (those that belong to live tasks), we didn't do it for
"dead" pids (to be assigned to ghost restarting tasks).

Note, that ideally the list of vpids of a pid object should also
include the pid-ns to which each level belongs; however, in this patch
we don't yet handle that. So only linear pid-nesting works well and
not an arbitrary tree.

DISCLAIMER: this patch is big and intrusive! Here is a summary of the
changes that it makes:

CHECKPOINT:

1) Modified the data structures used to describe pids and tasks' pids:
struct ckpt_pids - for the data of a pids object (depth, numbers)
(pids objects are collected in the order found, and are assigned
tags sequentially, starting from 1)
struct ckpt_task_pids - for a task's pids, holding the _tag_ of the
corresponding pids object rather than their pid numbers themselves.

2) Accordingly, two arrays are used to hold this information:
ctx->pids_arr - array of 'struct ckpt_pids' collected from the tasks
and the pids they reference. Entries are of variable size depending
on the pid-ns nesting level.
ctx->tasks_arr - array of 'struct ckpt_task_pids' collected from the
tasks. Entries are of fixed size, and hold the objref tags to the
shared pids objects rather than actual pid numbers.
(the old vpids_arr is no longer needed, nor written separately).

3) We now first write the pids information, then tasks' pids.

4) checkpoint_pids() builds and writes the ctx->pids_arr:
checkpoint_pids_build() - iterates over the tasks and collects the
unique pids in a flex_array (also inserts them into the objhash)
checkpoint_pids_dumps() - dumps the data from the flex_array in
the format of ctx->tasks_arr

5) checkpoint_tree() dumps the tasks' pids information, by scanning
all the tasks and writing out tags of the pids they reference. If
a pgid/sid is zero, i.e. from an ancestor pid-ns, then the tag will
be zero.

6) In container checkpoint, pids out of our namespace are disallowed.
We don't do leak detection on pids objects (should we ?).

RESTART:

1) We first call prepare_descendants() to set the ->checkpoint_ctx
of the restarting tasks, and _then_ read the pids data followed by
the tasks' pids data. We validate both against existing tasks.

2) restore_read_pids() reads the pids data, validates that each pid
exists (*) and adds the pids to the objhash. Verify that the owner
task is within our restart context.

(*) We validate them from the root task's point of view, by seeing
that the task has the correct 'struct pid' pointer. NOTE: user-cr
does not support restart --no-pids when there are nested pid-ns,
because it is quite complicated to find out the pids of all tasks
at all nested levels from userspace.

3) restore_read_tasks() reads the tasks' pids data, validates each
task and adds it to the ctx->tasks_arr. Verify that the task is
within our restart context.

4) We track the array of restarting _tasks_ and the active _task_
instead of an array of restarting pids and the active pid. The helpers
to wake-up, sync, check active task etc were modified accordingly.
It improves and simplifies the logic, e.g. restore_activate_next().

5) There are two special values for pgid/sid tags:
0 - means that it is from an ancestor namespace, so we verify that
this is the case. For sid, user-cr should have created the task
properly; for pgid, use the coordinator's (or coordinator's parent)
pid if from different namespace, or fail.
CKPT_PID_ROOT - means that we want to reuse the root task's sid,
useful for when the root task is _not_ a container init (e.g. in
subtree c/r) and its session (like our pgrp) was inherited from
somewhere above.

6) Restoring of a task's pgid was moved to when task is validated,
as this should be part of the validation.

NOTE: the patch does not yet allow non-linear nesting of pid-ns.
This would require to make pid-ns a shared object and track it by
the 'struct ckpt_pids' on the kernel side, and in userspace we'll
need to update the logic of MakeForest algorithm to be pid-ns aware
(probably similarly to how sid constraints are handled).

Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
include/linux/checkpoint_hdr.h | 23 ++-
include/linux/checkpoint_types.h | 10 +-
kernel/checkpoint/checkpoint.c | 450 +++++++++++++++++++++---------
kernel/checkpoint/process.c | 108 +-------
kernel/checkpoint/restart.c | 568 +++++++++++++++++++++++++++++++-------
kernel/checkpoint/sys.c | 5 -
kernel/signal.c | 8 +-
7 files changed, 806 insertions(+), 366 deletions(-)

diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 922eff0..6f991c6 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -107,7 +107,9 @@ enum {
CKPT_HDR_SECURITY,
#define CKPT_HDR_SECURITY CKPT_HDR_SECURITY

- CKPT_HDR_TREE = 101,
+ CKPT_HDR_PIDS = 101,
+#define CKPT_HDR_PIDS CKPT_HDR_PIDS
+ CKPT_HDR_TREE,
#define CKPT_HDR_TREE CKPT_HDR_TREE
CKPT_HDR_TASK,
#define CKPT_HDR_TASK CKPT_HDR_TASK
@@ -358,20 +360,33 @@ struct ckpt_hdr_container {
*/
} __attribute__((aligned(8)));;

+/* pids array */
+struct ckpt_hdr_pids {
+ struct ckpt_hdr h;
+ __u32 nr_pids;
+ __u32 nr_vpids;
+ __u32 offset;
+} __attribute__((aligned(8)));
+
+struct ckpt_pids {
+ __u32 depth;
+ __s32 numbers[1];
+} __attribute__((aligned(8)));
+
/* task tree */
struct ckpt_hdr_tree {
struct ckpt_hdr h;
- __s32 nr_tasks;
+ __u32 nr_tasks;
} __attribute__((aligned(8)));

-struct ckpt_pids {
+struct ckpt_task_pids {
/* These pids are in the root_nsproxy's pid ns */
__s32 vpid;
__s32 vppid;
__s32 vtgid;
__s32 vpgid;
__s32 vsid;
- __s32 depth; /* pid namespace depth relative to container init */
+ __u32 depth;
} __attribute__((aligned(8)));

/* pids */
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index 87a569a..60c664f 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -68,16 +68,14 @@ struct ckpt_ctx {

int nr_vpids; /* total count of vpids */

- /* [checkpoint] */
- struct task_struct *tsk; /* current target task */
struct task_struct **tasks_arr; /* array of all tasks */
int nr_tasks; /* size of tasks array */

+ /* [checkpoint] */
+ struct task_struct *tsk; /* current target task */
+
/* [restart] */
- struct pid_namespace *coord_pidns;/* coordinator pid_ns */
- struct ckpt_pids *pids_arr; /* array of all pids [restart] */
- int nr_pids; /* size of pids array */
- int active_pid; /* (next) position in pids array */
+ int active_task; /* (next) position in pids array */
atomic_t nr_total; /* total tasks count */
struct completion complete; /* completion for container root */
wait_queue_head_t waitq; /* waitqueue for restarting tasks */
diff --git a/kernel/checkpoint/checkpoint.c b/kernel/checkpoint/checkpoint.c
index a850423..342590b 100644
--- a/kernel/checkpoint/checkpoint.c
+++ b/kernel/checkpoint/checkpoint.c
@@ -26,6 +26,7 @@
#include <linux/utsname.h>
#include <linux/magic.h>
#include <linux/hrtimer.h>
+#include <linux/flex_array.h>
#include <linux/deferqueue.h>
#include <linux/checkpoint.h>
#include <linux/pid_namespace.h>
@@ -312,133 +313,6 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
return ret;
}

-#define CKPT_HDR_PIDS_CHUNK 256
-
-/*
- * Write the pids in ctx->root_nsproxy->pidns. This info is
- * needed at restart to unambiguously dereference tasks.
- */
-static int checkpoint_pids(struct ckpt_ctx *ctx)
-{
- struct ckpt_pids *h;
- struct pid_namespace *root_pidns;
- struct task_struct *task;
- struct task_struct **tasks_arr;
- int nr_tasks, n, pos = 0, ret = 0;
-
- root_pidns = ctx->root_nsproxy->pid_ns;
- tasks_arr = ctx->tasks_arr;
- nr_tasks = ctx->nr_tasks;
- BUG_ON(nr_tasks <= 0);
-
- ret = ckpt_write_obj_type(ctx, NULL,
- sizeof(*h) * nr_tasks,
- CKPT_HDR_BUFFER);
- if (ret < 0)
- return ret;
-
- h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
- if (!h)
- return -ENOMEM;
-
- do {
- rcu_read_lock();
- for (n = 0; n < min(nr_tasks, CKPT_HDR_PIDS_CHUNK); n++) {
- struct pid_namespace *task_pidns;
- task = tasks_arr[pos];
-
- h[n].vpid = task_pid_nr_ns(task, root_pidns);
- h[n].vtgid = task_tgid_nr_ns(task, root_pidns);
- h[n].vpgid = task_pgrp_nr_ns(task, root_pidns);
- h[n].vsid = task_session_nr_ns(task, root_pidns);
- h[n].vppid = task_tgid_nr_ns(task->real_parent,
- root_pidns);
- task_pidns = task_active_pid_ns(task);
- h[n].depth = task_pidns->level - root_pidns->level;
-
- ckpt_debug("task[%d]: vpid %d vtgid %d parent %d\n",
- pos, h[n].vpid, h[n].vtgid, h[n].vppid);
- ctx->nr_vpids += h[n].depth;
- pos++;
- }
- rcu_read_unlock();
-
- n = min(nr_tasks, CKPT_HDR_PIDS_CHUNK);
- ret = ckpt_kwrite(ctx, h, n * sizeof(*h));
- if (ret < 0)
- break;
-
- nr_tasks -= n;
- } while (nr_tasks > 0);
-
- _ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
- return ret;
-}
-
-static int checkpoint_vpids(struct ckpt_ctx *ctx)
-{
- __s32 *h; /* vpid array */
- struct pid_namespace *root_pidns, *task_pidns = NULL, *active_pidns;
- struct task_struct *task;
- int ret, nr_tasks = ctx->nr_tasks;
- int tidx = 0; /* index into task array */
- int hidx = 0; /* pids wr
...

Previous Topic: [PATCH 1/2] c/r: Do not crash if socket has no peercred
Next Topic: Re: [PATCH, v3 2/2] cgroups: introduce timer slack subsystem
Goto Forum:
  


Current Time: Mon Dec 09 03:33:07 GMT 2019