OpenVZ Forum


Home » Mailing lists » Devel » [PATCH 0/11] user-cr: support for pids as shared objects (v2)
[PATCH 0/11] user-cr: support for pids as shared objects (v2) [message #41565] Mon, 07 February 2011 17:21 Go to next message
Oren Laadan is currently offline  Oren Laadan
Messages: 71
Registered: August 2007
Member
From: *parallels.com
This patchset adds the necessary support in user-cr related to
handling of pids as proper shared objets. You must use this if you use
the corresponding kernel-cr patchset recetly posted.

Changelog[v2]:
- Cleanups and many bug fixes: it now passes all tests in tests-cr

Oren.

_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containe rs
[PATCH 04/11] restart: improve success/failure reporting [message #41566 is a reply to message #41565] Mon, 07 February 2011 17:21 Go to previous messageGo to next message
Oren Laadan is currently offline  Oren Laadan
Messages: 71
Registered: August 2007
Member
From: *parallels.com
This patch improves the final status report (success/fail) of restart
by adding a new ctx->success variable. The motivation is twofold:

1) Before, there was a confusion between who reports an error when (in
--pidns) the root task isn't container init - in which case we add a
dummy init. For this, we also improve how the dummy init communicates
the status (success/fail) to the original restart task.

2) Even after restart succeeds, there may be errors (which may not
affect the restarted task, but should be nevertheless reported).

Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
restart.c | 81 ++++++++++++++++++++++++++++++++++--------------------------
1 files changed, 46 insertions(+), 35 deletions(-)

diff --git a/restart.c b/restart.c
index 9535543..a1af631 100644
--- a/restart.c
+++ b/restart.c
@@ -125,6 +125,7 @@ struct ckpt_ctx {
} whoami;

int error;
+ int success;

pid_t root_pid;
int pipe_in;
@@ -677,9 +678,21 @@ int cr_restart(struct cr_restart_args *args)
if (global_feeder_pid)
waitpid(global_feeder_pid, NULL, 0);

- if (ret < 0)
+ if (ctx.error)
errno = ctx.error;

+ if (ctx.success) {
+ ckpt_dbg("c/r succeeded\n");
+ ckpt_verbose("Restart succeeded\n");
+ if (ctx.error)
+ ckpt_verbose("Post restart error: %d\n", ctx.error);
+ } else {
+ ckpt_dbg("c/r failed ?\n");
+ ckpt_perror("restart");
+ ckpt_verbose("Restart failed\n");
+ ret = -1;
+ }
+
return ret;
}

@@ -827,7 +840,7 @@ static int ckpt_pretend_reaper(struct ckpt_ctx *ctx)
return ckpt_parse_status(global_child_status, 1, 0);
}

-static int ckpt_probe_child(pid_t pid, char *str)
+static int ckpt_probe_child(struct ckpt_ctx *ctx, pid_t pid, char *str)
{
int status, ret;

@@ -840,17 +853,16 @@ static int ckpt_probe_child(pid_t pid, char *str)
ret = waitpid(pid, &status, WNOHANG);
if (ret == pid) {
report_exit_status(status, str, 0);
- errno = ECHILD;
- return -1;
+ return ctx_ret_errno(ctx, ECHILD);
} else if (ret < 0 && errno == ECHILD) {
ckpt_err("WEIRD: %s exited without trace (%s)\n",
str, strerror(errno));
- return -1;
+ return ctx_set_errno(ctx);
} else if (ret != 0) {
ckpt_err("waitpid for %s (%s)", str, strerror(errno));
if (ret > 0)
errno = ECHILD;
- return -1;
+ return ctx_set_errno(ctx);
}
return 0;
}
@@ -900,30 +912,39 @@ static int __ckpt_coordinator(void *arg)
if (!ctx->args->wait)
close(ctx->pipe_coord[0]);

- return ckpt_coordinator(ctx);
+ /* set the exit status properly */
+ return ckpt_coordinator(ctx) >= 0 ? 0 : 1;
}

static int ckpt_coordinator_status(struct ckpt_ctx *ctx)
{
- int status = -1;
+ int status;
int ret;

close(ctx->pipe_coord[1]);
ctx->pipe_coord[1] = -1; /* mark unused */

ret = read(ctx->pipe_coord[0], &status, sizeof(status));
- if (ret < 0)
- ckpt_perror("read coordinator status");
- else if (ret == 0) {
- /* coordinator failed to report */
- ckpt_dbg("Coordinator failed to report status.");
- } else
- status = 0;

close(ctx->pipe_coord[0]);
ctx->pipe_coord[0] = -1; /* mark unused */

- return status;
+ if (ret < 0) {
+ ckpt_perror("read coordinator status");
+ return ctx_set_errno(ctx);
+ } else if (ret != sizeof(status)) {
+ /* coordinator failed to report */
+ ckpt_dbg("Coordinator failed to report status\n");
+ return ctx_ret_errno(ctx, EIO);
+ } else if (status != 0) {
+ /* coordinator reported failure */
+ ckpt_dbg("Coordinator reported error\n");
+ return ctx_ret_errno(ctx, status);
+ }
+
+ /* success ! */
+ ctx->success = 1;
+ return 0;
}

static int ckpt_coordinator_pidns(struct ckpt_ctx *ctx)
@@ -977,8 +998,8 @@ static int ckpt_coordinator_pidns(struct ckpt_ctx *ctx)
* The child (coordinator) may have already exited before the
* signal handler was plugged; verify that it's still there.
*/
- if (ckpt_probe_child(coord_pid, "coordinator") < 0)
- return ctx_set_errno(ctx);
+ if (ckpt_probe_child(ctx, coord_pid, "coordinator") < 0)
+ return -1;

ctx->args->copy_status = copy;

@@ -1017,8 +1038,8 @@ static int ckpt_coordinator(struct ckpt_ctx *ctx)
* The child (root_task) may have already exited before the
* signal handler was plugged; verify that it's still there.
*/
- if (ckpt_probe_child(root_pid, "root task") < 0)
- return ctx_set_errno(ctx);
+ if (ckpt_probe_child(ctx, root_pid, "root task") < 0)
+ return -1;

if (ctx->args->keep_frozen)
flags |= RESTART_FROZEN;
@@ -1028,20 +1049,15 @@ static int ckpt_coordinator(struct ckpt_ctx *ctx)
ret = restart(root_pid, ctx->args->infd,
flags, ctx->args->klogfd);

- if (ret < 0) {
- ckpt_perror("restart failed");
- ckpt_verbose("Failed\n");
- ckpt_dbg("restart failed ?\n");
- return ret;
+ if (ret >= 0) {
+ ctx->success = 1; /* restart succeeded ! */
+ ret = 0;
}

- ckpt_verbose("Success\n");
- ckpt_dbg("restart succeeded\n");
-
- ret = 0;
-
if (ctx->args->pidns && ctx->tasks_arr[0].pid != 1) {
/* Report success/failure to the parent */
+ if (ret < 0)
+ ret = ctx->error;
if (write(ctx->pipe_coord[1], &ret, sizeof(ret)) < 0) {
ckpt_perror("failed to report status");
return ctx_set_errno(ctx);
@@ -1068,11 +1084,6 @@ static int ckpt_coordinator(struct ckpt_ctx *ctx)
ret = ckpt_collect_child(ctx);
}

- if (ret < 0)
- ckpt_dbg("c/r failed ?\n");
- else
- ckpt_dbg("c/r succeeded\n");
-
return ret;
}

--
1.7.1

_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containe rs
[PATCH 09/11] ckptinfo: s/ckpt_pids/ckpt_task_pids/ after kerenl header update [message #41567 is a reply to message #41565] Mon, 07 February 2011 17:21 Go to previous messageGo to next message
Oren Laadan is currently offline  Oren Laadan
Messages: 71
Registered: August 2007
Member
From: *parallels.com
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
ckptinfo.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ckptinfo.c b/ckptinfo.c
index d73b38c..1361c21 100644
--- a/ckptinfo.c
+++ b/ckptinfo.c
@@ -254,7 +254,7 @@ static int image_parse(int fd, struct args *args)
static int image_parse_tree(struct ckpt_hdr *h, int fd, struct args *args)
{
struct ckpt_hdr_tree *hh;
- struct ckpt_pids *pp;
+ struct ckpt_task_pids *pp;
int nr_tasks;
int i, ret;

@@ -268,7 +268,7 @@ static int image_parse_tree(struct ckpt_hdr *h, int fd, struct args *args)
if (ret <= 0)
return -1;

- pp = (struct ckpt_pids *) h;
+ pp = (struct ckpt_task_pids *) h;

if (args->show_task_tree) {
for (i = 0; i < nr_tasks; i++) {
--
1.7.1

_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containe rs
[PATCH 05/11] restart: obtain pid_max from /proc/sys/kernel/pid_max [message #41568 is a reply to message #41565] Mon, 07 February 2011 17:21 Go to previous messageGo to next message
Oren Laadan is currently offline  Oren Laadan
Messages: 71
Registered: August 2007
Member
From: *parallels.com
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
restart.c | 19 +++++++++++++++++--
1 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/restart.c b/restart.c
index a1af631..05d101b 100644
--- a/restart.c
+++ b/restart.c
@@ -158,6 +158,9 @@ struct ckpt_ctx {
struct cr_restart_args *args;

char *freezer;
+
+ /* system limits */
+ pid_t pid_max;
};

struct pid_swap {
@@ -489,6 +492,9 @@ int process_args(struct cr_restart_args *args)

static void init_ctx(struct ckpt_ctx *ctx)
{
+ FILE *file;
+ char buf[1024];
+
memset(ctx, 0, sizeof(*ctx));

/* mark all fds as unused */
@@ -500,6 +506,15 @@ static void init_ctx(struct ckpt_ctx *ctx)
ctx->pipe_feed[1] = -1;
ctx->pipe_coord[0] = -1;
ctx->pipe_coord[1] = -1;
+
+ /* system limits */
+ ctx->pid_max = SHRT_MAX; /* default */
+ file = fopen("/proc/sys/kernel/pid_max", "r");
+ if (file) {
+ if (fgets(buf, 1024, file))
+ ctx->pid_max = atoi(buf);
+ fclose(file);
+ }
}

static void exit_ctx(struct ckpt_ctx *ctx)
@@ -1223,12 +1238,12 @@ static int ckpt_alloc_pid(struct ckpt_ctx *ctx)
* (this will become inefficient if pid-space is exhausted)
*/
do {
- if (ctx->tasks_pid == INT_MAX)
+ if (ctx->tasks_pid == ctx->pid_max)
ctx->tasks_pid = CKPT_RESERVED_PIDS;
else
ctx->tasks_pid++;

- if (n++ == INT_MAX) { /* ohhh... */
+ if (n++ == ctx->pid_max) { /* ohhh... */
ckpt_err("pid namsepace exhausted");
return -1;
}
--
1.7.1

_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containe rs
[PATCH 08/11] udpate kernel headers: support for pids objects [message #41569 is a reply to message #41565] Mon, 07 February 2011 17:21 Go to previous messageGo to next message
Oren Laadan is currently offline  Oren Laadan
Messages: 71
Registered: August 2007
Member
From: *parallels.com
---
include/linux/checkpoint_hdr.h | 29 ++++++++++++++++++++++++-----
1 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index f7c4d9a..227bfbe 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -94,7 +94,9 @@ enum {
CKPT_HDR_SECURITY,
#define CKPT_HDR_SECURITY CKPT_HDR_SECURITY

- CKPT_HDR_TREE = 101,
+ CKPT_HDR_PIDS = 101,
+#define CKPT_HDR_PIDS CKPT_HDR_PIDS
+ CKPT_HDR_TREE,
#define CKPT_HDR_TREE CKPT_HDR_TREE
CKPT_HDR_TASK,
#define CKPT_HDR_TASK CKPT_HDR_TASK
@@ -232,6 +234,8 @@ struct ckpt_hdr_objref {
enum obj_type {
CKPT_OBJ_IGNORE = 0,
#define CKPT_OBJ_IGNORE CKPT_OBJ_IGNORE
+ CKPT_OBJ_PID,
+#define CKPT_OBJ_PID CKPT_OBJ_PID
CKPT_OBJ_INODE,
#define CKPT_OBJ_INODE CKPT_OBJ_INODE
CKPT_OBJ_FILE_TABLE,
@@ -343,24 +347,39 @@ struct ckpt_hdr_container {
*/
} __attribute__((aligned(8)));;

+/* pids array */
+struct ckpt_hdr_pids {
+ struct ckpt_hdr h;
+ __u32 nr_pids;
+ __u32 nr_vpids;
+ __u32 offset;
+} __attribute__((aligned(8)));
+
+struct ckpt_pids {
+ __u32 depth;
+ __s32 numbers[1];
+} __attribute__((aligned(8)));
+
/* task tree */
struct ckpt_hdr_tree {
struct ckpt_hdr h;
- __s32 nr_tasks;
+ __u32 nr_tasks;
} __attribute__((aligned(8)));

-struct ckpt_pids {
+struct ckpt_task_pids {
/* These pids are in the root_nsproxy's pid ns */
__s32 vpid;
__s32 vppid;
__s32 vtgid;
__s32 vpgid;
__s32 vsid;
- __s32 depth; /* pid namespace depth relative to container init */
+ __u32 depth;
} __attribute__((aligned(8)));

/* pids */
-#define CKPT_PID_NULL -1
+/* (negative but not valid error) */
+#define CKPT_PID_NULL (-4096) /* null pid pointer */
+#define CKPT_PID_ROOT (-4097) /* pid same as root task */

/* task data */
struct ckpt_hdr_task {
--
1.7.1

_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containe rs
[PATCH 07/11] restart: explicitly disallow orphan tasks with --no-pidns [message #41570 is a reply to message #41565] Mon, 07 February 2011 17:21 Go to previous messageGo to next message
Oren Laadan is currently offline  Oren Laadan
Messages: 71
Registered: August 2007
Member
From: *parallels.com
Add a test to ckpt_build_tree() that, if --no-pidns is required, will
verify that the tasks data structure will not lead to an orphan task
during the restart (since this should not have been possible in the
original subtree checkpoint).

This is useful beyond a sanity check:

If --no-pidns is set, then every child task asks to be notified of the
parent's death via prctl (for better cleanup). This is ok because from
a subtree checkpoint there should never be orphan tasks. But if we do
have an orphan task (due to e.g. a bug or incorrectly modified image),
then the following may happen:

A ghost/dead task that has a child (should be impossilbe in subtree
image) may exit before that child; In that case, the child receives
the signal and dies prematurely. Restart may either fail if this
happens early enough, or simply hang since the coordinator will be
waiting forever for all the tasks to complete.

Besides the sanity test, we also modify the parent death signal to be
SIGUSR1 instead of SIGKILL, and we catch it - so if the race happens
while the restarting task is still in userspace, we can be verbose
about it. We also add a hanlder for SIGSEGV, so we'll get a message
about it too.

Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
restart.c | 50 +++++++++++++++++++++++++++++++++++++++++++++-----
1 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/restart.c b/restart.c
index 966f7a1..01566c2 100644
--- a/restart.c
+++ b/restart.c
@@ -306,6 +306,18 @@ static char *sig2str(int sig)
return "UNKNOWN SIGNAL";
}

+static void sigusr1_handler(int sig)
+{
+ ckpt_dbg("SIGUSER1: restarting process would become orphan.\n");
+ exit(1);
+}
+
+static void sigsegv_handler(int sig)
+{
+ ckpt_dbg("SIGSEGV: restarting process unexpected fault.\n");
+ exit(1);
+}
+
static void sigchld_handler(int sig)
{
int collected = 0;
@@ -573,6 +585,7 @@ int cr_restart(struct cr_restart_args *args)
ckpt_perror("unshare");
goto cleanup;
}
+
/* chroot ? */
if (args->root && chroot(args->root) < 0) {
ckpt_perror("chroot");
@@ -912,6 +925,9 @@ static int __ckpt_coordinator(void *arg)

/* none of this requires cleanup: we're forked ... */

+ /* we want to be vocal about sigsegv */
+ signal(SIGSEGV, sigsegv_handler);
+
/* chroot ? */
if (ctx->args->root && chroot(ctx->args->root) < 0) {
ckpt_perror("chroot");
@@ -1114,7 +1130,7 @@ static inline struct task *ckpt_init_task(struct ckpt_ctx *ctx)
static int ckpt_build_tree(struct ckpt_ctx *ctx)
{
struct task *task;
- int i;
+ int i, found;

/*
* Allow for additional tasks to be added on demand for
@@ -1167,6 +1183,25 @@ static int ckpt_build_tree(struct ckpt_ctx *ctx)
ckpt_dbg("............\n");
#endif

+ /*
+ * For --no-pidns verify that the resulting tree won't create
+ * orphan tasks (comment in ckpt_fork_stub() explain why).
+ */
+ if (!ctx->args->pidns) {
+ found = 0;
+ for (i = 0; i < ctx->tasks_nr; i++) {
+ task = &ctx->tasks[i];
+ if (!(task->flags & (TASK_GHOST | TASK_DEAD)))
+ continue;
+ if (!task->children)
+ continue;
+ ckpt_err("Bad orphan task (#%d) with --no-pidns\n", i);
+ found = 1;
+ }
+ if (found)
+ return ctx_ret_errno(ctx, EINVAL);
+ }
+
return 0;
}

@@ -1814,15 +1849,20 @@ int ckpt_fork_stub(void *data)
* death by asking to be killed then. When restart succeeds,
* it will have replaced this with the original value.
*
- * This works because in the --no-pids case, the hierarchy of
- * tasks does not contain zombies (else, there must also be a
- * container init, whose pid (==1) is clearly already taken).
+ * This works because in the --no-pidns case, the hierarchy of
+ * tasks does not contain zombies; else, there must also be a
+ * container init, whose pid (==1) is clearly already taken.
+ * This is verified in ckpt_build_tree().
*
* Thus, if a the parent of this task dies before this prctl()
* call, it suffices to test getppid() == task->parent_pid.
+ *
+ * We use SIGUSR1 so that we can catch it and issue an error
+ * or debug message when this happens.
*/
if (!ctx->args->pidns) {
- if (prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0) < 0) {
+ signal(SIGUSR1, sigusr1_handler);
+ if (prctl(PR_SET_PDEATHSIG, SIGUSR1, 0, 0, 0) < 0) {
ckpt_perror("prctl");
return ctx_set_errno(ctx);
}
--
1.7.1

_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containe rs
[PATCH 10/11] restart: fix support for nested pid namespaces [message #41571 is a reply to message #41565] Mon, 07 February 2011 17:21 Go to previous messageGo to next message
Oren Laadan is currently offline  Oren Laadan
Messages: 71
Registered: August 2007
Member
From: *parallels.com
Adapt restart code to the new pids handling in kernel-cr that handles
pids as a proper shared object.

DISCLAIMER Disclaimer: this patch is bug and intrusive ... Here is a
summary of the changes that it makes:

1) The main change is that we read the 'ckpt_pids' that hold the
actual pids numbers, and then everything else uses tags that refer to
these objects. Since the ctx->pids_arr is an array of variable-length
entries, it is inconvenient ot point to it with an index. So we use
another array, ctx->pids, that maps from a linear index to the offset
in the ctx->pids_arr where the data is found.

2) Now all pids other than those in 'ckpt_pids' are indices into that
array (more precisely, into ctx->pids array), the variables now have
a "_ind" suffix, e.g. "pid_ind" instead of "pid". There are helpers
to translate from index to pids structure.

3) Document the data structures used to track pids and tasks within
the restart code.

4) To support (linearly) nested-pids, the pids hash table was extended
to have depth, so that if we need to allocate a new (dummy) pid, we
can choose unique pids at all pid-ns levels, not just the top.

5) Accordingly, dummy pid allocation is done at all possible depths in
the hash.

6) Throw away ckpt_{read/write/assign}_vpids - it is no longer needed.
Instead, the seuqence of calls is now:
ckpt_read_pids()
ckpt_read_tree()
ckpt_build_pids()
ckpt_build_tree()

7) Disallow restart with --no-pids if there are nested pid-ns, because
because is it quite complicated to find ou the pids of all tasks at
all nested levels from userspace.

8) If the root task's is not a session leader (must be from a subtree
checkpoint), then it should now inherit its sid from the coordinator.
Furthermore, other tasks with sid/pgid inherited from above the root
task should also do the same. For this to work we use a special value
for their {sid,pgid}_ind: we can't use 0, because that already means
a pid from an ancestor pid-ns; instead we mark it with CKPT_PID_ROOT,
and the kernel code knows how to handle it.

NOTE: this is only necessary when the root task is not a session
leader. Otherwise, we can just add a placeholder task to accopmlish
the same effect (recall it's a subtree). But a placeholder cannot be
placed above the root task...

NOTE2: by doing this, we squash all the sids/pgids from above the
root task into a single common value at restart, even though they
may have been distinct at checkpoint. This is considered a feature
until someone really needs this to behave differently ...

9) Fix a subtle bug in the session-propagation logic, whereas we don't
need a placeholder if we reach the root task _and_ we are a child of
the root task (becaus we will inherit the sid from the root task).

10) In ckpt_fork_child() we can use the 'ckpt_pids' structure for the
pids rather than manually build one.

11) In adjust_pids() and --no-pids we only try to update the numbers[0]
of the pid; we don't support nested pid-ns for --no-pids.

Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
restart.c | 1192 ++++++++++++++++++++++++++++++++++++++++-------------------- -
1 files changed, 787 insertions(+), 405 deletions(-)

diff --git a/restart.c b/restart.c
index 01566c2..f64e508 100644
--- a/restart.c
+++ b/restart.c
@@ -45,6 +45,12 @@
#include "common.h"

/*
+ * To re-create the tasks tree in user space, 'restart' reads the
+ * header and tree data from the checkpoint image tree. It makes up
+ * for the data that was consumed by using a helper process that
+ * provides the data back to the restart syscall, followed by the rest
+ * of the checkpoint image stream.
+ *
* By default, 'restart' creates a new pid namespace in which the
* restart takes place, using the original pids from the time of the
* checkpoint. This requires that CLONE_NEWPID and eclone() be enabled.
@@ -54,19 +60,15 @@
* by default, 'restart' creates an equivalen tree without restoring
* the original pids, assuming that the application can tolerate this.
* For this, the 'ckpt_pids' array is transformed on-the-fly before it
- * is fed to the kernel.
+ * is fed to the kernel. This mode of operation is permitted only if
+ * all the restarting tasks belong to a single pid-namespace (i.e. no
+ * pid-namespace nesting).
*
- * By default, "--pids" implied "--pidns" and vice-versa. The user can
+ * By default, "--pids" implies "--pidns" and vice-versa. The user can
* use "--pids --no-pidns" for a restart in the currnet namespace -
* 'restart' will attempt to create the new tree with the original pids
* from the time of the checkpoint, if possible. This requires that
* eclone() be enabled.
- *
- * To re-create the tasks tree in user space, 'restart' reads the
- * header and tree data from the checkpoint image tree. It makes up
- * for the data that was consumed by using a helper process that
- * provides the data back to the restart syscall, followed by the rest
- * of the checkpoint image stream.
*/

struct hashent {
@@ -78,6 +80,75 @@ struct hashent {
struct task;
struct ckpt_ctx;

+/*
+ * The following data structres are used to track pids:
+ *
+ * ctx->pids_arr[]:
+ * Array of (variable sized) 'struct ckpt_pids' from the checkpoint
+ * image, each entry indicates the level (depth) relative to the
+ * root task, and the pids at each level. NOTE: the order of pids
+ * matches order of adding them to the objhash during checkpoint
+ * (hence their tags).
+ *
+ * ctx->pids_copy[]:
+ * Array used to hold a copy of pids_arr[] during --no-pids restart
+ * when converting the task's pids from the original values from
+ * the checkpoint image, to the real pids produced by forks.
+ *
+ * ctx->pids_new[]:
+ * Array of (variable sized) 'struct ckpt_pids' to hold new pids
+ * objects allocated by the MakeForst algorithm fo the restart.
+ *
+ * ctx->pids_index[]:
+ * Array of integers that provides mapping from a pid object (tag)
+ * to the byte offset inside ctx->pids_arr where that pid object
+ * is. It is useful since the entries in the latter are of variable
+ * size.
+ *
+ * ctx->tasks_arr[]:
+ * Array of 'struct ckpt_task_pids' from the checkpoint image, each
+ * entry indicates a task's pids (pid,tgid,pgid,sid,ppid) and the
+ * pid-namespace nesting level. NOTE: the pids store the tags of the
+ * corresponding pid objects (and thus their order in ctx->pids_arr)
+ * rather then the pid values themselves.
+ *
+ * ctx->tasks[]:
+ * Array of 'struct task' that holds information about all the tasks
+ * neede to be created in userespace (the input and output of the
+ * DumpForst and CreateForest algorithms). NOTE: the pids here also
+ * store the tags of the corresponding pid objects).
+ *
+ * When restart algorithm needs to create dead tasks or produce dummy
+ * tasks, it stores new 'ckpt_pids' objects in ctx->pids_new[], and
+ * extends ctx->pids[] and ctx->tasks[] to store index to new pids
+ * and new tasks, respectively.
+ *
+ * ctx->pids_nr: (original) size of ctx->pids_arr
+ * ctx->pids_cnt: current size of ctx->pids_index
+ * ctx->pids_max: maximum size of ctx->pids_index
+ * ctx->pids_off: current offset in ctx->pids_new[]
+ * ctx->pids_len: maximum offset in ctx->pids_new[]
+ *
+ * ctx->tasks_nr: size of ctx->pids_arr
+ * ctx->tasks_cnt: current size of ctx->tasks
+ * ctx->tasks_max: maximum size of ctx->tasks
+ *
+ * Given a byte offset in ctx->pids_arr, to get the 'ckpt_pids':
+ * pids = pid_at_index(ctx, @offset)
+ *
+ * Given a pid-index from ctx->tasks/ctx->tasks_arr, to get the byte
+ * offset of the matching 'ckpt_pids' in ctx->pids_arr:
+ * ctx->pids_index[@index]
+ *
+ * And to get the 'ckpt_pids' from an index:
+ * pids = pids_of_index(@index)
+ *
+ *
+ * ctx->tasks_pids[]:
+ * Array of pid values indicating the next hint for pid allocation
+ * at each nesting level of pid-namespace.
+ */
+
struct task {
int flags; /* state and (later) actions */

@@ -91,13 +162,12 @@ struct task {
int vidx; /* index into vpid array, -1 if none */
int piddepth;

- pid_t pid; /* process IDs, our bread-&-butter */
- pid_t ppid;
- pid_t tgid;
- pid_t sid;
+ /* Following are INDEX values into ctx->pids_index */
+ int pid_ind; /* process IDs, our bread-&-butter */
+ int ppid_ind;
+ int tgid_ind;
+ int sid_ind;

- pid_t rpid; /* [restart without vpids] actual (real) pid */
-
struct ckpt_ctx *ctx; /* points back to the c/r context */

pid_t real_parent; /* pid of task's real parent */
@@ -127,32 +197,45 @@ struct ckpt_ctx {
int error;
int success;

- pid_t root_pid;
int pipe_in;
int pipe_out;
- int pids_nr;
- int vpids_nr;

int pipe_child[2]; /* for children to report status */
int pipe_feed[2]; /* for feeder to provide input */
int pipe_coord[2]; /* for coord to report status (if needed) */

+ int root_pid;
+ int pid_offset;
+
struct ckpt_pids *pids_arr;
- struct ckpt_pids *copy_arr;
- __s32 *vpids_arr;
+ struct ckpt_pids *pids_new;
+ struct ckpt_pids *pids_copy;
+ int *pids_index;
+
+ int pids_nr;
+ int vpids_nr;
+ int pids_cnt;
+ int pids_max;
+ int pids_off;
+ int pids_len;

+ struct ckpt_task_pids *tasks_arr;
struct task *tasks;
+
int tasks_nr;
+ int tasks_cnt;
int tasks_max;
- int tasks_pid;

- struct hashent **hash_arr;
+ /* an array of pid hash-tables: one hash-table per pidns level */
+ struct hashent ***hash_arr;
+ int *hash_last_pid;
+ int hash_depth;

char header[BUFSIZE];
char header_arch[BUFSIZE];
char container[BUFSIZE];
char tree[BUFSIZE];
- char vpids[BUFSIZE];
+ char pids[BUFSIZE];
char buf[BUFSIZE];

struct cr_restart_args *args;
@@ -194,9 +277,9 @@ int global_send_sigint = -1;
static int c
...

[PATCH 11/11] restart: account for all ghost sids before ghost pgids [message #41572 is a reply to message #41565] Mon, 07 February 2011 17:21 Go to previous message
Oren Laadan is currently offline  Oren Laadan
Messages: 71
Registered: August 2007
Member
From: *parallels.com
This patch fixes the following two bugs:

1) Consider a subtree checkpoint of one task with pid != sid and also
pgid == sid. At restart, ckpt_init_tree() will add a ghost task that
accounts for the pgid, and ckpt_setup_task() will set its ppid and sid
to be the root task. But because the root task is _not_ a session
leader, ckpt_set_creator() will incorrecly add a dead task between the
root and the ghost.

We fix this by ensuring that in this case we set the sid of the ghost
task to be _the same_ as that of the parent task (and not the pid of
the parent task itself - which only works when the parent is indeed
the session leader).

2) Consider a checkpoint with 2 tasks: one has a pgid corresponding to
a dead process, the other has its sid corresponding to the same dead
process. If the former is processed by ckpt_setup_task() first, it
will produce a ghost task that is _not_ a session leader - which is
incorrect. Moreover, this will not be recrtified later when the latter
task is found.

We fix this by splitting the work into two loops: in the first we only
consider tasks' sids and add the corresponding ghosts. This ensures
that they are session leaders. In the second we consider the dead
pgids and the tgids.

The patch also refactors ckpt_set_task() into smaller pieces to
simplify its logic and improve readability.

Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
restart.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++--------------
1 files changed, 63 insertions(+), 20 deletions(-)

diff --git a/restart.c b/restart.c
index f64e508..e60335c 100644
--- a/restart.c
+++ b/restart.c
@@ -1456,22 +1456,12 @@ static int ckpt_build_tree(struct ckpt_ctx *ctx)
return 0;
}

-static int ckpt_setup_task(struct ckpt_ctx *ctx, int pid_ind, int ppid_ind)
+static int ckpt_ghost_task(struct ckpt_ctx *ctx, struct ckpt_pids *pids,
+ int pid_ind, int sid_ind, int ppid_ind)
{
struct task *task;
- struct ckpt_pids *pids;
int j;

- /* ignore if outside namespace */
- if (pid_ind == 0 || pid_ind == CKPT_PID_ROOT)
- return 0;
-
- pids = pids_of_index(ctx, pid_ind);
-
- /* skip if already handled */
- if (hash_lookup(ctx, pids->numbers[0]))
- return 0;
-
task = &ctx->tasks[ctx->tasks_cnt++];

task->flags = TASK_GHOST;
@@ -1479,7 +1469,7 @@ static int ckpt_setup_task(struct ckpt_ctx *ctx, int pid_ind, int ppid_ind)
task->pid_ind = pid_ind;
task->ppid_ind = ppid_ind;
task->tgid_ind = pid_ind;
- task->sid_ind = ppid_ind;
+ task->sid_ind = sid_ind;

task->children = NULL;
task->next_sib = NULL;
@@ -1500,6 +1490,47 @@ static int ckpt_setup_task(struct ckpt_ctx *ctx, int pid_ind, int ppid_ind)
return 0;
}

+static struct ckpt_pids *ckpt_consider_pid(struct ckpt_ctx *ctx, int pid_ind)
+{
+ struct ckpt_pids *pids;
+
+ /* ignore if outside namespace */
+ if (pid_ind == 0 || pid_ind == CKPT_PID_ROOT)
+ return NULL;
+
+ pids = pids_of_index(ctx, pid_ind);
+
+ /* skip if already handled */
+ if (hash_lookup(ctx, pids->numbers[0]))
+ return NULL;
+
+ return pids;
+}
+
+static int ckpt_set_session(struct ckpt_ctx *ctx, int pid_ind, int ppid_ind)
+{
+ struct ckpt_pids *pids;
+
+ pids = ckpt_consider_pid(ctx, pid_ind);
+ if (!pids)
+ return 0;
+
+ return ckpt_ghost_task(ctx, pids, pid_ind, pid_ind, ppid_ind);
+}
+
+static int ckpt_set_task(struct ckpt_ctx *ctx, int pid_ind, int ppid_ind)
+{
+ struct ckpt_pids *pids;
+ struct task *parent;
+
+ pids = ckpt_consider_pid(ctx, pid_ind);
+ if (!pids)
+ return 0;
+
+ parent = hash_lookup_ind(ctx, ppid_ind);
+ return ckpt_ghost_task(ctx, pids, pid_ind, parent->sid_ind, ppid_ind);
+}
+
static int _ckpt_valid_pid(struct ckpt_ctx *ctx, pid_t pid, char *which, int i)
{
if (pid < 0) {
@@ -1697,17 +1728,29 @@ static int ckpt_init_tree(struct ckpt_ctx *ctx)

ctx->tasks_cnt = ctx->tasks_nr;

- /* add pids unaccounted for (no tasks) */
+ /*
+ * Add sids unaccounted for (no tasks): find those orphan pids
+ * that are used as some task's sid, and create ghost session
+ * leaders for them. Since they should be session leaders, we
+ * must do this before adding other ghost tasks for tgid/pgid
+ * below: the latter is added a non-session leader.
+ */
for (i = 0; i < ctx->tasks_nr; i++) {
-
/*
* An unaccounted-for sid belongs to a task that was a
- * session leader and died. We can safe set its parent
+ * session leader and died. We can safely set its parent
* (and creator) to be the root task.
*/
- if (ckpt_setup_task(ctx, tasks_arr[i].vsid, root_pid_ind) < 0)
+
+ if (ckpt_set_session(ctx, tasks_arr[i].vsid, root_pid_ind) < 0)
return -1;
+ }

+ /*
+ * Add tgids/pgids unaccounted for (no tasks): find those
+ * orphan tgids/pgids and create ghost tasks for them.
+ */
+ for (i = 0; i < ctx->tasks_nr; i++) {
/*
* An sid == 0 means that the session was inherited an
* ancestor of root_task, and more specifically, via
@@ -1723,18 +1766,18 @@ static int ckpt_init_tree(struct ckpt_ctx *ctx)
* need to add it with the same sid as current (and
* other) threads.
*/
- if (ckpt_setup_task(ctx, tasks_arr[i].vtgid, ppid_ind) < 0)
+ if (ckpt_set_task(ctx, tasks_arr[i].vtgid, ppid_ind) < 0)
return -1;

/*
* If pgrp == sid, then the pgrp/sid will already have
* been hashed by now (e.g. by the call above) and the
- * ckpt_setup_task() will return promptly.
+ * ckpt_set_task() will return promptly.
* If pgrp != sid, then the pgrp 'owner' must have the
* same sid as us: all tasks with same pgrp must have
* their sid matching.
*/
- if (ckpt_setup_task(ctx, tasks_arr[i].vpgid, ppid_ind) < 0)
+ if (ckpt_set_task(ctx, tasks_arr[i].vpgid, ppid_ind) < 0)
return -1;
}

--
1.7.1

_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containe rs
Previous Topic: [PATCH 06/11] restart: rename 'ctx-&gt;tasks_arr' to 'ctx-&gt;tasks'
Next Topic: [PATCH v3] cgroup/freezer: add per freezer duty ratio control
Goto Forum:
  


Current Time: Mon Jun 17 05:36:18 GMT 2019