Re: [PATCH 2/2] Add support for in-kernel process creation during restart



Hi,

Andrey Mirkin wrote:
All work (process tree creation and process state restore) now can be
done in kernel.

Task structure in image file is extended with 2 fields to make in-kernel
process creation more easy.

Signed-off-by: Andrey Mirkin <major@xxxxxxxxxx>
---
checkpoint/checkpoint.c | 17 ++++
checkpoint/restart.c | 4 +-
checkpoint/rstr_process.c | 201 +++++++++++++++++++++++++++++++++++++++-
include/linux/checkpoint.h | 2 +
include/linux/checkpoint_hdr.h | 2 +
5 files changed, 223 insertions(+), 3 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index 04b0c4a..ae3326e 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -173,6 +173,21 @@ static int cr_write_tail(struct cr_ctx *ctx)
return ret;
}

+static int cr_count_children(struct cr_ctx *ctx, struct task_struct *tsk)
+{
+ int num = 0;
+ struct task_struct *child;
+
+ read_lock(&tasklist_lock);
+ list_for_each_entry(child, &tsk->children, sibling) {
+ if (child->parent != tsk)
+ continue;
+ num++;
+ }
+ read_unlock(&tasklist_lock);
+ return num;
+}
+

This information (and the pids of children) is already available in
the ctx->pids_arr.

/* dump the task_struct of a given task */
static int cr_write_task_struct(struct cr_ctx *ctx, struct task_struct *t)
{
@@ -189,6 +204,8 @@ static int cr_write_task_struct(struct cr_ctx *ctx, struct task_struct *t)
hh->exit_code = t->exit_code;
hh->exit_signal = t->exit_signal;

+ hh->vpid = task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns);
+ hh->children_nr = cr_count_children(ctx, t);
hh->task_comm_len = TASK_COMM_LEN;

Same here (hmm... well, if it weren't skipped below, actually...)


/* FIXME: save remaining relevant task_struct fields */
diff --git a/checkpoint/restart.c b/checkpoint/restart.c

[...]

+static int restart_thread(void *arg)
+{
+ struct thr_context *thr_ctx = arg;
+ struct cr_ctx *ctx;
+ struct cr_hdr_task *ht;
+ int ret;
+ int i;
+
+ current->state = TASK_UNINTERRUPTIBLE;

Why do you not want it to be interruptible ?

+
+ ctx = thr_ctx->ctx;
+ ht = thr_ctx->ht;
+
+ if (ht->vpid == 1) {
+ ctx->root_task = current;
+ ctx->root_nsproxy = current->nsproxy;
+
+ get_task_struct(ctx->root_task);
+ get_nsproxy(ctx->root_nsproxy);
+ }
+
+ ret = cr_rstr_task_struct(ctx, ht);
+ cr_debug("rstr_task_struct: ret %d\n", ret);
+ if (ret < 0)
+ goto out;
+ ret = cr_read_mm(ctx);
+ cr_debug("memory: ret %d\n", ret);
+ if (ret < 0)
+ goto out;
+ ret = cr_read_files(ctx);
+ cr_debug("files: ret %d\n", ret);
+ if (ret < 0)
+ goto out;
+ ret = cr_read_thread(ctx);
+ cr_debug("thread: ret %d\n", ret);
+ if (ret < 0)
+ goto out;
+ ret = cr_read_cpu(ctx);
+ cr_debug("cpu: ret %d\n", ret);
+
+ for (i = 0; i < ht->children_nr; i++) {
+ ret = cr_restart_process(ctx);
+ if (ret < 0)
+ break;
+ }

Here, eventually we'll need the pids of those processes; instead of
keeping 'ht->children_nr', you could loop through the pids array in
the ctx and create those who have their parent set to us.

+
+out:
+ thr_ctx->error = ret;
+ complete(&thr_ctx->complete);
+
+ if (!ret && (ht->state & (EXIT_ZOMBIE|EXIT_DEAD))) {
+ do_exit(ht->exit_code);
+ } else {
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ }
+ schedule();

Using TASK_UNINTERRUPTIBLE, may keep a task unkill-able for potentially
long time. Any reason not to use TASK_INTERRUPTIBLE ?

+
+ cr_debug("leaked %d/%d %p\n", task_pid_nr(current), task_pid_vnr(current), current->mm);
+
+ complete_and_exit(NULL, 0);
+ return ret;
+}
+
+static int cr_restart_process(struct cr_ctx *ctx)
+{
+ struct thr_context thr_ctx;
+ struct task_struct *tsk;
+ struct cr_hdr_task *ht = cr_hbuf_get(ctx, sizeof(*ht));
+ int pid, parent, ret = -EINVAL;
+
+ thr_ctx.ctx = ctx;
+ thr_ctx.error = 0;
+ init_completion(&thr_ctx.complete);
+
+ parent = cr_read_obj_type(ctx, ht, sizeof(*ht), CR_HDR_TASK);
+ if (parent < 0) {
+ ret = parent;
+ goto out;
+ } else if (parent != 0)
+ goto out;
+
+ thr_ctx.ht = ht;
+
+ if (ht->vpid == 1) {
+ /* We should also create container here */
+ pid = cr_kernel_thread(restart_thread, &thr_ctx,
+ CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+ CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET, 0);

why do you want to start the container in the kernel ?

instead, we can create a new containers in user space, and have the init task
call the restart syscall from inside there.


+ } else {
+ /* We should fork here a child with saved pid and
+ correct flags */
+ pid = cr_kernel_thread(restart_thread, &thr_ctx, 0, ht->vpid);
+ }
+ if (pid < 0) {
+ ret = pid;
+ goto out;
+ }
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_vpid(pid);
+ if (tsk)
+ get_task_struct(tsk);
+ read_unlock(&tasklist_lock);
+ if (tsk == NULL) {
+ ret = -ESRCH;
+ goto out;
+ }
+
+ wait_for_completion(&thr_ctx.complete);
+ wait_task_inactive(tsk, 0);
+ ret = thr_ctx.error;
+ put_task_struct(tsk);
+
+out:
+ cr_hbuf_put(ctx, sizeof(*ht));
+ return ret;
+}
+

int do_restart_in_kernel(struct cr_ctx *ctx)
{
- return -ENOSYS;
+ int ret, size, parent;
+ struct cr_hdr_tree *hh = cr_hbuf_get(ctx, sizeof(*hh));
+
+ ret = cr_read_head(ctx);
+ if (ret < 0)
+ goto out;
+
+ ret = -EINVAL;
+ parent = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_TREE);
+ if (parent < 0) {
+ ret = parent;
+ goto out;
+ } else if (parent != 0)
+ goto out;
+
+ size = sizeof(*ctx->pids_arr) * hh->tasks_nr;
+ if (size < 0)
+ goto out;
+ ctx->file->f_pos += size;

You can't do that - the file may be non-seekable (e.g. a socket); must read
the data in.
Besides, if instead of skipping this data you read it in, then you could use
it as noted above.

+
+ ret = cr_restart_process(ctx);
+ if (ret < 0)
+ goto out;
+
+ ret = cr_read_tail(ctx);

[...]

Thanks,

Oren.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/



Relevant Pages

  • Re: help with huge memory consumption??
    ... 'process creation' being much slower than 'thread creation' ... would be a strong hint at a deficiency in the kernel. ... was the reason to use threads. ... implemented with the same system mechanism (things the kernel sees as ...
    (comp.os.linux.development.system)
  • Re: [RFC][PATCH 04/20] pspace: Allow multiple instaces of the process id namespace
    ... and used that generate compile errors when someone did not use the ... Maintainable code is a delicate balancing act between things that ... I have hierarchical pids so nesting is possible. ... Other kernel developers gut feel is that ...
    (Linux-Kernel)
  • Re: Process Creation Notification
    ... I'm doing some linux development for sourceforge.net and ... but you can quite easilly poll the kernel. ... but not when one terminates. ... Alternatively you could use kill in a loop with all possible pids ...
    (comp.os.linux.development.apps)
  • Re: [PATCH -v2] ftrace: Documentation
    ... Steven Rostedt writes: ... basically only used tracing to see "what's happening in the kernel here?". ... I'm new to the namespace concept of pids. ...
    (Linux-Kernel)
  • Re: [Devel] Re: [PATCH 0/9] OpenVZ kernel based checkpointing/restart
    ... process creation. ... Pretty sure Dave prefers user-space. ... I'd say the thing to do is push the core API that supports single-thread ... kernel, and let the arguments for and against that stand on their own. ...
    (Linux-Kernel)