OpenVZ Forum


Home » Mailing lists » Devel » Re: checkpointing and restoring processes
Re: checkpointing and restoring processes [message #18774] Wed, 06 June 2007 12:48
Cedric Le Goater is currently offline  Cedric Le Goater
Messages: 443
Registered: February 2006
Senior Member
Mark Pflueger wrote:
> hi everyone!
> 
> i'm not subscribed to the list, so if you care to flame because of my noob 
> question, just do it to the list, otherwise please cc me.

You should subscribe to containers@lists.osdl.org and send your ideas to that
list. There's a BOF on that topic at OLS if you can attend.

cheers,

C.

> I'm trying to write a checkpoint/restore module for processes and have 
> a basic version going already. The problem is that when I restore the 
> process, one of three things happens at random: first, the restored 
> process segfaults; second, I get a kernel NULL pointer dereference; third, 
> I get a virtual address lookup error and a kernel crash. The backtrace 
> and the address are different every time.
> 
> The user-space process is as simple as I could make it (error checking 
> and debugging messages are left out):
> 
> 
> /*
>  * Ask the "child_take" helper program to checkpoint the calling process.
>  *
>  * Stores the caller's PID in the global chkptpid, formats it and the global
>  * checkpointnum as decimal strings, forks, and execs "child_take" in the
>  * child with those two strings as its argument list. The parent blocks in
>  * waitpid() until the helper has exited.
>  */
> void take_chkpt(void) {
>         pid_t pid;
>         char call_pid[10];
>         char call_num[10];
> 
>         chkptpid = getpid();
>         /* Bound by the real buffer size; snprintf always NUL-terminates. */
>         snprintf(call_pid, sizeof call_pid, "%d", chkptpid);
>         snprintf(call_num, sizeof call_num, "%d", checkpointnum);
> 
>         switch (pid = fork()) {
>         case -1:
>                 fprintf(stderr, "Fork failed.\n");
>                 return;
>         case  0:   /* child process */
>                 /*
>                  * NOTE(review): the first argument after the path becomes
>                  * argv[0] of the new program, so call_pid arrives as argv[0]
>                  * and call_num as argv[1] -- confirm child_take's main()
>                  * expects that layout.
>                  */
>                 execl("child_take", call_pid, call_num, (char *)0);
>                 /*
>                  * execl() returns only on failure (always -1), so getting
>                  * here is an error. Leave immediately: returning would let
>                  * the child keep running the parent's code as a duplicate.
>                  */
>                 perror("execl: ");
>                 _exit(1);
>         default:   /* parent process */
>                 waitpid(pid, NULL, 0);
>                 break;
>         }
> }
> 
> 
> /*
>  * Ask the "child_restore" helper program to restore a checkpoint over the
>  * calling process.
>  *
>  * Does nothing when the global restore_retry is already set. Otherwise it
>  * stores the caller's PID in the global chkptpid, formats it and the global
>  * checkpointnum as decimal strings, forks, and execs "child_restore" in the
>  * child. The parent sets restore_retry before blocking in waitpid().
>  */
> void restore_chkpts(void) {
>         pid_t pid;
>         char call_pid[10];
>         char call_num[10];
> 
>         ENTERFUN();
> 
>         /*
>          * NOTE(review): this early return skips LEAVEFUN(); confirm the
>          * ENTERFUN/LEAVEFUN macros do not need to be balanced.
>          */
>         if(restore_retry) // do nothing on second call to restore
>                 return;
> 
>         chkptpid = getpid();
>         /* Bound by the real buffer size; snprintf always NUL-terminates. */
>         snprintf(call_pid, sizeof call_pid, "%d", chkptpid);
>         snprintf(call_num, sizeof call_num, "%d", checkpointnum);
> 
>         switch (pid = fork()) {
>         case -1:
>                 fprintf(stderr, "MP: Fork failed.\n");
>                 return;
>         case  0:   /* child process */
>                 /*
>                  * NOTE(review): call_pid lands in argv[0] and call_num in
>                  * argv[1] of the new program -- confirm child_restore's
>                  * main() expects that layout.
>                  */
>                 execl("child_restore", call_pid, call_num, (char *)0);
>                 /*
>                  * execl() returns only on failure (always -1). Exit here so
>                  * the child does not continue as a second copy of the parent.
>                  */
>                 perror("execl: ");
>                 _exit(1);
>         default:   /* parent process */
>                 INF(("Parent Process"));
>                 /* Set before waiting so a second call returns immediately. */
>                 restore_retry=1;
>                 INF(("Wait for Child..."));
>                 waitpid(pid, NULL, 0);
>                 break;
>         }
> 
>         LEAVEFUN();
> }
> 
> /*
>  * Test driver: take one checkpoint, print a greeting, then request a
>  * restore. The command-line arguments are not used.
>  */
> int main(int argc, char* argv[]) {
>         (void)argc;
>         (void)argv;
> 
>         take_chkpt();
>         puts("Hello cruel world!");
>         restore_chkpts();
> 
>         return 0;
> }
> 
> where child_take and child_restore do the following:
> 
> 
> /*
>  * Helper-side half of a checkpoint: stop the target process, ask the
>  * checkpoint device to save it into /tmp/chkpt_<pid>_<num>, then resume it.
>  *
>  * chkptpid       PID of the process to checkpoint.
>  * checkpointnum  sequence number used in the checkpoint file name.
>  *
>  * Never returns; exits with status 0 on success and EXIT_FAILURE when the
>  * device or the checkpoint file cannot be opened or the ioctl fails.
>  */
> void child_take_chkpt(int chkptpid, int checkpointnum) {
>         struct chkpt_ioctl chkptio;
>         int dev_fd; // ioctl device file
>         char chkptname[30];
>         int status = 0;
> 
>         if ((dev_fd = open(CHKPT_DEVICE, O_RDWR)) < 0) {
>                 perror("MP: Open device file");
>                 exit(EXIT_FAILURE);
>         }
>         chkptio.pid = chkptpid;
>         snprintf(chkptname, 29, "/tmp/chkpt_%d_%d", chkptio.pid, checkpointnum);
>         chkptio.file = creat(chkptname, 00755);
>         if (chkptio.file < 0) {
>                 /* Do not stop the target or call the driver with a bad fd. */
>                 perror("MP: Create checkpoint file");
>                 close(dev_fd);
>                 exit(EXIT_FAILURE);
>         }
>         sleep(1); // to make sure the parent process is in waitpid -- ugly, but works
>         kill(chkptio.pid, SIGSTOP);
>         sleep(1);
>         if (ioctl(dev_fd, CHKPT_IOCTL_SAVE, (unsigned long)&chkptio) < 0) {
>                 perror("MP: CHKPT_IOCTL_SAVE");
>                 status = EXIT_FAILURE;
>         }
>         close(dev_fd);
>         close(chkptio.file);
>         /* Always resume the target, even when the save failed. */
>         kill(chkptio.pid, SIGCONT);
>         exit(status);
> }
> 
> /*
>  * Helper-side half of a restore: stop the target process, ask the
>  * checkpoint device to restore it from /tmp/chkpt_<pid>_<num-1>, then
>  * resume it.
>  *
>  * chkptpid       PID of the process to restore into.
>  * checkpointnum  one more than the sequence number of the checkpoint file
>  *                to read (the file name is built from checkpointnum-1).
>  *
>  * Never returns; exits with status 0 on success and EXIT_FAILURE when the
>  * checkpoint file or the device cannot be opened or the ioctl fails.
>  */
> void child_restore_chkpts(int chkptpid, int checkpointnum) {
>         struct chkpt_ioctl chkptio;
>         int dev_fd; // ioctl device file
>         char chkptname[30];
>         int status = 0;
> 
>         snprintf(chkptname, 29, "/tmp/chkpt_%d_%d", chkptpid, checkpointnum-1);
>         chkptio.file = open(chkptname, O_RDONLY);
>         if (chkptio.file < 0) {
>                 perror("MP: Open checkpoint file");
>                 exit(EXIT_FAILURE);
>         }
>         chkptio.pid = chkptpid;
>         dev_fd = open(CHKPT_DEVICE, O_RDWR);
>         if (dev_fd < 0) {
>                 /* Same message child_take_chkpt uses for this failure. */
>                 perror("MP: Open device file");
>                 close(chkptio.file);
>                 exit(EXIT_FAILURE);
>         }
>         sleep(1);
>         kill(chkptpid, SIGSTOP);
>         sleep(1);
>         if (ioctl(dev_fd, CHKPT_IOCTL_RESTORE, (unsigned long)&chkptio) < 0) {
>                 perror("MP: CHKPT_IOCTL_RESTORE");
>                 status = EXIT_FAILURE;
>         }
>         close(chkptio.file);
>         close(dev_fd);
>         /* Always resume the target, even when the restore failed. */
>         kill(chkptpid, SIGCONT);
>         exit(status);
> }
> 
> the header for the files is this:
> 
> 
> /*
>  * ioctl command codes for the checkpoint device. User space passes one of
>  * these as the request argument of ioctl(); chkpt_ioctl_handler() switches
>  * on it.
>  */
> enum {
>         CHKPT_IOCTL_SAVE,       /* write a checkpoint of the given process */
>         CHKPT_IOCTL_RESTORE     /* restore the given process from a checkpoint */
> };
> 
> /*
>  * Argument block for both ioctl commands. User space passes its address as
>  * the ioctl argument and the kernel side copies it in with copy_from_user().
>  */
> struct chkpt_ioctl {
>         pid_t pid; // for fork tests; PID of the process to save or restore
>         int file;  // file descriptor, open in the calling process, of the checkpoint file
> };
> 
> /*
>  * Header record that chkpt_save() writes first into the checkpoint file,
>  * ahead of the task and memory dumps. The three sizes give the byte lengths
>  * of the data, brk and stack regions that follow.
>  */
> struct chkpt {
>         pid_t pid; // for fork tests; PID of the checkpointed process
>         struct pt_regs regs;        // register set saved at checkpoint time
>         unsigned int datasize;      // mm->end_data - mm->start_data
>         unsigned int brksize;       // mm->brk - mm->start_brk
>         unsigned int stacksize;     // thread.esp0 - thread.esp
> };
> 
> 
> and finally the kernel module:
> 
> /*
>  * ioctl entry point of the checkpoint device.
>  *
>  * i, f  inode and file of the device node; not used here.
>  * cmd   CHKPT_IOCTL_SAVE or CHKPT_IOCTL_RESTORE.
>  * arg   user-space address of a struct chkpt_ioctl.
>  *
>  * Returns 0 on success, -EFAULT when the argument block cannot be copied
>  * from user space, -EIO when the save or restore itself fails, and -ENOTTY
>  * for an unknown command. (A bare -1 would reach user space as EPERM, which
>  * says nothing about what actually went wrong.)
>  */
> int chkpt_ioctl_handler(struct inode *i, struct file *f,
>                      unsigned int cmd, unsigned long arg) {
>         struct chkpt_ioctl pmio, *u_pmio;
>         int ret;
> 
>         u_pmio = (struct chkpt_ioctl *)arg;
> 
>         switch(cmd) {
>         case CHKPT_IOCTL_SAVE:
>                 if (copy_from_user(&pmio, u_pmio, sizeof(pmio))) {
>                         printk("...failed to copy from user\n");
>                         ret = -EFAULT;
>                         break;
>                 }
>                 if (chkpt_save(&pmio) < 0) {
>                         printk("...failed to save chkpt\n");
>                         ret = -EIO;
>                         break;
>                 }
>                 ret = 0;
>                 break;
>         case CHKPT_IOCTL_RESTORE:
>                 INFO(("CHKPT_IOCTL_RESTORE"));
>                 if (copy_from_user(&pmio, u_pmio, sizeof(pmio))) {
>                         printk("...failed to copy from user\n");
>                         ret = -EFAULT;
>                         break;
>                 }
>                 if (chkpt_restore(&pmio) < 0) {
>                         printk("...failed to restore chkpt\n");
>                         ret = -EIO;
>                         break;
>                 }
>                 ret = 0;
>                 break;
>         default:
>                 /* -ENOTTY is the conventional "unknown ioctl" error. */
>                 printk("...illegal ioctl cmd\n");
>                 ret = -ENOTTY;
>                 break;
>         }
>         return ret;
> }
> 
> /*
>  * Write a checkpoint of the task named by chkptio->pid into the file that
>  * chkptio->file refers to in the calling process.
>  *
>  * Layout written, in order: a struct chkpt header, THREAD_SIZE bytes
>  * starting at the task_struct, the data segment, the brk area and the
>  * region between thread.esp and thread.esp0.
>  *
>  * Returns 0 on success and -1 when the task does not exist, has no address
>  * space, the descriptor is not an open file, or pack_write() reports an
>  * error.
>  */
> static int chkpt_save(struct chkpt_ioctl *chkptio) {
>         struct task_struct *tsk;
>         struct chkpt chkpt;
>         unsigned int datasz, brksz, stacksz;
>         struct file *f;
>         int failed = 0;
> 
>         /*
>          * NOTE(review): the lookup is done without any visible locking or
>          * reference on tsk -- confirm the task cannot go away underneath us.
>          */
>         if (!(tsk = find_task_by_pid(chkptio->pid))) {
>                 printk("...task %d not found\n", chkptio->pid);
>                 return -1;
>         }
>         /* Tasks without a user address space have mm == NULL. */
>         if (!tsk->mm) {
>                 printk("...task %d has no address space\n", chkptio->pid);
>                 return -1;
>         }
> 
>         /*
>          * chkptio->file comes straight from user space. fget() (declared in
>          * <linux/file.h>) range-checks the descriptor and takes a reference,
>          * instead of indexing current->files->fd[] with an unchecked value.
>          */
>         f = fget(chkptio->file);
>         if (!f) {
>                 printk("...bad checkpoint file descriptor %d\n", chkptio->file);
>                 return -1;
>         }
> 
>         datasz = tsk->mm->end_data - tsk->mm->start_data; // data
>         brksz = tsk->mm->brk - tsk->mm->start_brk; // brk
>         stacksz = tsk->thread.esp0 - tsk->thread.esp; // stack
> 
>         /* saving most important information belonging to tsk */
>         /* NO FILES, SOCKETS, PIPES, SHARED MEMORY AND SEMAPHORES */
>         chkpt.pid = chkptio->pid;
>         /* REGISTERS */
>         memcpy(&chkpt.regs, REGS, sizeof(struct pt_regs));
>         if (in_syscall(tsk))
>                 intr_syscall(&chkpt.regs);
>         chkpt.datasize = datasz;
>         chkpt.brksize = brksz;
>         chkpt.stacksize = stacksz;
> 
>         /*
>          * Every record is still attempted and the final flush always runs,
>          * so pack_write()'s static buffer is reset even after an error; the
>          * failure is only remembered and reported at the end.
>          */
>         failed |= pack_write(f, (void *)&chkpt, sizeof(struct chkpt), 0) < 0;
>         /* TASK */
>         /*
>          * NOTE(review): this reads THREAD_SIZE bytes starting at the
>          * task_struct; that is only in bounds if the task_struct sits at the
>          * base of a THREAD_SIZE allocation -- confirm for the target kernel.
>          */
>         failed |= pack_write(f, (void*)tsk, THREAD_SIZE, 0) < 0;
>         /* MEMORY */
>         /*
>          * NOTE(review): start_data and start_brk are user-space addresses in
>          * tsk's address space, but this code runs in the context of the
>          * process that issued the ioctl ("current"), and pack_write() reads
>          * its buffer with a plain memcpy(). The copy therefore goes through
>          * the wrong page tables, which would explain the random oopses; a
>          * cross-process copy such as access_process_vm() looks necessary.
>          */
>         failed |= pack_write(f, (void *)tsk->mm->start_data, datasz, 0) < 0;
>         failed |= pack_write(f, (void *)tsk->mm->start_brk, brksz, 0) < 0;
>         /*
>          * NOTE(review): thread.esp..thread.esp0 presumably spans the kernel
>          * stack rather than the user stack -- confirm this is what the
>          * restore side expects.
>          */
>         failed |= pack_write(f, (void *)tsk->thread.esp, stacksz, 0) < 0;
> 
>         failed |= pack_write(f, NULL, 0, 1) < 0; /* last packet */
> 
>         fput(f);
> 
>         return failed ? -1 : 0;
> }
> 
> static int pack_write (struct file *f, char *buf, int len,
> 		       int last_pkt) {
>     static char *pack = NULL;
>     static long pos = 0;
>     int ret, to_copy, wrtn = 0;
>     
>     if (pack==NULL)
>     {
> 	pack=(char*)kmalloc(PACKET_SIZE, GFP_KERNEL);
> 	if (!pack)
> 	{
> 	    printk("pack_write: no mem!\n");
> 	    return -1;
> 	}
>     }
>     
>     while (len>0)
>     {
> 	to_copy = (len>(PACKET_SIZE-pos))?(PACKET_SIZE-pos):(len);
> 	
>         memcpy(&(pack[pos]), buf+wrtn, to_copy);
> 	
> 	pos += to_copy;
> 	len -= to_copy;
> 	wrtn +=to_copy; 
> 	
> 	if ( (pos==PACKET_SIZE) || (last_pkt) )
> 	{
> 	    mm_segment_t fs = get_fs();
> 	    
> 	    set_fs(KERNEL_DS);
> 	    ret = f->f_op->write(f, pack, pos, &(f->f_pos));	
> 	    set_fs(fs);
> 	    if (ret!=pos)
> 		return ret;
> 	    
> 	    pos = 0;
> 	    if (last_pkt)
> 	    {
> 		kfree(pack);
> 		pack = NULL;
> 	    }
> 	}
>     }
>     
>     if ( (last_pkt) && (pack!=NULL) )
>     {
> 	if (pos!=0)
> 	{
> 	    mm_segment_t fs = get_fs();
> 	    
> 	    set_fs(KERNEL
...

 
Read Message
Previous Topic: [patch 0/5][RFC - ipv4/udp checkpoint/restart] dumping/restoring the IPV4/UDP sockets
Next Topic: Re: checkpointing and restoring processes
Goto Forum:
  


Current Time: Mon Sep 08 22:39:24 GMT 2025

Total time taken to generate the page: 0.07423 seconds