public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Andrey Mirkin <major@openvz.org>
To: devel@openvz.org, Louis.Rilling@kerlabs.com
Cc: Pavel Emelyanov <xemul@openvz.org>,
	containers@lists.linux-foundation.org,
	linux-kernel@vger.kernel.org
Subject: Re: [Devel] Re: [PATCH 06/10] Introduce functions to dump mm
Date: Wed, 22 Oct 2008 12:58:43 +0400	[thread overview]
Message-ID: <200810221258.44447.major@openvz.org> (raw)
In-Reply-To: <20081020122514.GR15171@hawkmoon.kerlabs.com>

On Monday 20 October 2008 16:25 Louis Rilling wrote:
> On Sat, Oct 18, 2008 at 03:11:34AM +0400, Andrey Mirkin wrote:
> > Functions to dump mm struct, VMAs and mm context are added.
>
> Again, a few little comments.
>
> [...]
>
> > diff --git a/checkpoint/cpt_mm.c b/checkpoint/cpt_mm.c
> > new file mode 100644
> > index 0000000..8a22c48
> > --- /dev/null
> > +++ b/checkpoint/cpt_mm.c
> > @@ -0,0 +1,434 @@
> > +/*
> > + *  Copyright (C) 2008 Parallels, Inc.
> > + *
> > + *  Authors:	Andrey Mirkin <major@openvz.org>
> > + *
> > + *  This program is free software; you can redistribute it and/or
> > + *  modify it under the terms of the GNU General Public License as
> > + *  published by the Free Software Foundation, version 2 of the
> > + *  License.
> > + *
> > + */
> > +
> > +#include <linux/sched.h>
> > +#include <linux/slab.h>
> > +#include <linux/file.h>
> > +#include <linux/mm.h>
> > +#include <linux/errno.h>
> > +#include <linux/major.h>
> > +#include <linux/mman.h>
> > +#include <linux/mnt_namespace.h>
> > +#include <linux/mount.h>
> > +#include <linux/namei.h>
> > +#include <linux/pagemap.h>
> > +#include <linux/hugetlb.h>
> > +#include <asm/ldt.h>
> > +
> > +#include "checkpoint.h"
> > +#include "cpt_image.h"
> > +
> > +struct page_area
> > +{
> > +	int type;
> > +	unsigned long start;
> > +	unsigned long end;
> > +	pgoff_t pgoff;
> > +	loff_t mm;
> > +	__u64 list[16];
> > +};
> > +
> > +struct page_desc
> > +{
> > +	int	type;
> > +	pgoff_t	index;
> > +	loff_t	mm;
> > +	int	shared;
> > +};
> > +
> > +enum {
> > +	PD_ABSENT,
> > +	PD_COPY,
> > +	PD_FUNKEY,
> > +};
> > +
> > +/* 0: page can be obtained from backstore, or still not mapped anonymous
> >  page, +      or something else, which does not requre copy.
> > +   1: page requires copy
> > +   2: page requres copy but its content is zero. Quite useless.
> > +   3: wp page is shared after fork(). It is to be COWed when modified.
> > +   4: page is something unsupported... We copy it right now.
> > + */
> > +
> > +static void page_get_desc(struct vm_area_struct *vma, unsigned long
> > addr, +			  struct page_desc *pdesc, cpt_context_t * ctx)
> > +{
> > +	struct mm_struct *mm = vma->vm_mm;
> > +	pgd_t *pgd;
> > +	pud_t *pud;
> > +	pmd_t *pmd;
> > +	pte_t *ptep, pte;
> > +	spinlock_t *ptl;
> > +	struct page *pg = NULL;
> > +	pgoff_t linear_index = (addr - vma->vm_start)/PAGE_SIZE +
> > vma->vm_pgoff; +
> > +	pdesc->index = linear_index;
> > +	pdesc->shared = 0;
> > +	pdesc->mm = CPT_NULL;
> > +
> > +	if (vma->vm_flags & VM_IO) {
> > +		pdesc->type = PD_ABSENT;
> > +		return;
> > +	}
> > +
> > +	pgd = pgd_offset(mm, addr);
> > +	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
> > +		goto out_absent;
> > +	pud = pud_offset(pgd, addr);
> > +	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
> > +		goto out_absent;
> > +	pmd = pmd_offset(pud, addr);
> > +	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
> > +		goto out_absent;
> > +#ifdef CONFIG_X86
> > +	if (pmd_huge(*pmd)) {
> > +		eprintk("page_huge\n");
> > +		goto out_unsupported;
> > +	}
> > +#endif
> > +	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
> > +	pte = *ptep;
> > +	pte_unmap(ptep);
> > +
> > +	if (pte_none(pte))
> > +		goto out_absent_unlock;
> > +
> > +	if ((pg = vm_normal_page(vma, addr, pte)) == NULL) {
> > +		pdesc->type = PD_COPY;
> > +		goto out_unlock;
> > +	}
> > +
> > +	get_page(pg);
> > +	spin_unlock(ptl);
> > +
> > +	if (pg->mapping && !PageAnon(pg)) {
> > +		if (vma->vm_file == NULL) {
> > +			eprintk("pg->mapping!=NULL for fileless vma: %08lx\n", addr);
> > +			goto out_unsupported;
> > +		}
> > +		if (vma->vm_file->f_mapping != pg->mapping) {
> > +			eprintk("pg->mapping!=f_mapping: %08lx %p %p\n",
> > +				    addr, vma->vm_file->f_mapping, pg->mapping);
> > +			goto out_unsupported;
> > +		}
> > +		pdesc->index = (pg->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
> > +		/* Page is in backstore. For us it is like
> > +		 * it is not present.
> > +		 */
> > +		goto out_absent;
> > +	}
> > +
> > +	if (PageReserved(pg)) {
> > +		/* Special case: ZERO_PAGE is used, when an
> > +		 * anonymous page is accessed but not written. */
> > +		if (pg == ZERO_PAGE(addr)) {
> > +			if (pte_write(pte)) {
> > +				eprintk("not funny already, writable ZERO_PAGE\n");
> > +				goto out_unsupported;
> > +			}
> > +			/* Just copy it for now */
> > +			pdesc->type = PD_COPY;
> > +			goto out_put;
> > +		}
> > +		eprintk("reserved page %lu at %08lx\n", pg->index, addr);
> > +		goto out_unsupported;
> > +	}
> > +
> > +	if (!pg->mapping) {
> > +		eprintk("page without mapping at %08lx\n", addr);
> > +		goto out_unsupported;
> > +	}
> > +
> > +	pdesc->type = PD_COPY;
> > +
> > +out_put:
> > +	if (pg)
> > +		put_page(pg);
> > +	return;
> > +
> > +out_unlock:
> > +	spin_unlock(ptl);
> > +	goto out_put;
> > +
> > +out_absent_unlock:
> > +	spin_unlock(ptl);
> > +
> > +out_absent:
> > +	pdesc->type = PD_ABSENT;
> > +	goto out_put;
> > +
> > +out_unsupported:
> > +	pdesc->type = PD_FUNKEY;
> > +	goto out_put;
> > +}
> > +
> > +static int count_vma_pages(struct vm_area_struct *vma, struct
> > cpt_context *ctx) +{
> > +	unsigned long addr;
> > +	int page_num = 0;
> > +
> > +	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
> > +		struct page_desc pd;
> > +
> > +		page_get_desc(vma, addr, &pd, ctx);
> > +
> > +		if (pd.type != PD_COPY) {
> > +			return -EINVAL;
> > +		} else {
> > +			page_num += 1;
> > +		}
> > +
> > +	}
> > +	return page_num;
> > +}
> > +
> > +/* ATTN: We give "current" to get_user_pages(). This is wrong, but
> > get_user_pages() + * does not really need this thing. It just stores some
> > page fault stats there. + *
> > + * BUG: some archs (f.e. sparc64, but not Intel*) require flush cache
> > pages + * before accessing vma.
> > + */
> > +static int dump_pages(struct vm_area_struct *vma, unsigned long start,
> > +		unsigned long end, struct cpt_context *ctx)
> > +{
> > +#define MAX_PAGE_BATCH 16
> > +	struct page *pg[MAX_PAGE_BATCH];
> > +	int npages = (end - start)/PAGE_SIZE;
> > +	int count = 0;
> > +
> > +	while (count < npages) {
> > +		int copy = npages - count;
> > +		int n;
> > +
> > +		if (copy > MAX_PAGE_BATCH)
> > +			copy = MAX_PAGE_BATCH;
> > +		n = get_user_pages(current, vma->vm_mm, start, copy,
> > +				   0, 1, pg, NULL);
> > +		if (n == copy) {
> > +			int i;
> > +			for (i=0; i<n; i++) {
> > +				char *maddr = kmap(pg[i]);
> > +				ctx->write(maddr, PAGE_SIZE, ctx);
> > +				kunmap(pg[i]);
>
> There is no error handling in this inner loop. Should be fixed imho.

Yes, you are right. This is already fixed in the next version. I'll try to
send it out shortly.

>
> > +			}
> > +		} else {
> > +			eprintk("get_user_pages fault");
> > +			for ( ; n > 0; n--)
> > +				page_cache_release(pg[n-1]);
> > +			return -EFAULT;
> > +		}
> > +		start += n*PAGE_SIZE;
> > +		count += n;
> > +		for ( ; n > 0; n--)
> > +			page_cache_release(pg[n-1]);
> > +	}
> > +	return 0;
> > +}
> > +
> > +static int dump_page_block(struct vm_area_struct *vma,
> > +			   struct cpt_page_block *pgb,
> > +			   struct cpt_context *ctx)
> > +{
> > +	int err;
> > +	pgb->cpt_len = sizeof(*pgb) + pgb->cpt_end - pgb->cpt_start;
> > +	pgb->cpt_type = CPT_OBJ_PAGES;
> > +	pgb->cpt_hdrlen = sizeof(*pgb);
> > +	pgb->cpt_content = CPT_CONTENT_DATA;
> > +
> > +	err = ctx->write(pgb, sizeof(*pgb), ctx);
> > +	if (!err)
> > +		err = dump_pages(vma, pgb->cpt_start, pgb->cpt_end, ctx);
> > +
> > +	return err;
> > +}
> > +
> > +static int cpt_dump_dentry(struct path *p, cpt_context_t *ctx)
> > +{
> > +	int len;
> > +	char *path;
> > +	char *buf;
> > +	struct cpt_object_hdr o;
> > +
> > +	buf = (char *)__get_free_page(GFP_KERNEL);
> > +	if (!buf)
> > +		return -ENOMEM;
> > +
> > +	path = d_path(p, buf, PAGE_SIZE);
> > +
> > +	if (IS_ERR(path)) {
> > +		free_page((unsigned long)buf);
> > +		return PTR_ERR(path);
> > +	}
> > +
> > +	len = buf + PAGE_SIZE - 1 - path;
> > +	o.cpt_len = sizeof(o) + len + 1;
> > +	o.cpt_type = CPT_OBJ_NAME;
> > +	o.cpt_hdrlen = sizeof(o);
> > +	o.cpt_content = CPT_CONTENT_NAME;
> > +	path[len] = 0;
> > +
> > +	ctx->write(&o, sizeof(o), ctx);
> > +	ctx->write(path, len + 1, ctx);
>
> Error handling?
Will fix it, thanks.

>
> > +	free_page((unsigned long)buf);
> > +
> > +	return 0;
> > +}
> > +
> > +static int dump_one_vma(struct mm_struct *mm,
> > +			struct vm_area_struct *vma, struct cpt_context *ctx)
> > +{
> > +	struct cpt_vma_image *v;
> > +	unsigned long addr;
> > +	int page_num;
> > +	int err;
> > +
> > +	v = kzalloc(sizeof(*v), GFP_KERNEL);
> > +	if (!v)
> > +		return -ENOMEM;
> > +
> > +	v->cpt_len = sizeof(*v);
> > +	v->cpt_type = CPT_OBJ_VMA;
> > +	v->cpt_hdrlen = sizeof(*v);
> > +	v->cpt_content = CPT_CONTENT_ARRAY;
> > +
> > +	v->cpt_start = vma->vm_start;
> > +	v->cpt_end = vma->vm_end;
> > +	v->cpt_flags = vma->vm_flags;
> > +	if (vma->vm_flags & VM_HUGETLB) {
> > +		eprintk("huge TLB VMAs are still not supported\n");
> > +		kfree(v);
> > +		return -EINVAL;
> > +	}
> > +	v->cpt_pgprot = vma->vm_page_prot.pgprot;
> > +	v->cpt_pgoff = vma->vm_pgoff;
> > +	v->cpt_file = CPT_NULL;
> > +	v->cpt_vma_type = CPT_VMA_TYPE_0;
> > +
> > +	page_num = count_vma_pages(vma, ctx);
> > +	if (page_num < 0) {
> > +		kfree(v);
> > +		return -EINVAL;
> > +	}
>
> AFAICS, since count_vma_pages only supports pages with PD_COPY, and since
> page_get_desc() tags text segment pages (file-mapped and not anonymous
> since not written to) as PD_ABSENT, no executable is checkpointable. So,
> where is the trick? Am I completely missing something about page mapping?
Oh, that's my fault; I sent the wrong version. I will send a new patchset with
the correct page mapping today.

>
> > +	v->cpt_page_num = page_num;
> > +
> > +	if (vma->vm_file) {
> > +		v->cpt_file = 0;
> > +		v->cpt_vma_type = CPT_VMA_FILE;
> > +	}
> > +
> > +	ctx->write(v, sizeof(*v), ctx);
>
> Error handling?
Yes, will add it.

>
> > +	kfree(v);
> > +
> > +	if (vma->vm_file) {
> > +		err = cpt_dump_dentry(&vma->vm_file->f_path, ctx);
> > +		if (err < 0)
> > +			return err;
> > +	}
> > +
> > +	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
> > +		struct page_desc pd;
> > +		struct cpt_page_block pgb;
> > +
> > +		page_get_desc(vma, addr, &pd, ctx);
> > +
> > +		if (pd.type == PD_FUNKEY || pd.type == PD_ABSENT) {
> > +			eprintk("dump_one_vma: funkey page\n");
> > +			return -EINVAL;
> > +		}
> > +
> > +		pgb.cpt_start = addr;
> > +		pgb.cpt_end = addr + PAGE_SIZE;
> > +		dump_page_block(vma, &pgb, ctx);
>
> Error handling?
Yep, thanks.

>
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static int cpt_dump_mm_context(struct mm_struct *mm, struct cpt_context
> > *ctx) +{
> > +#ifdef CONFIG_X86
> > +	if (mm->context.size) {
> > +		struct cpt_obj_bits b;
> > +		int size;
> > +
> > +		mutex_lock(&mm->context.lock);
> > +
> > +		b.cpt_type = CPT_OBJ_BITS;
> > +		b.cpt_len = sizeof(b);
> > +		b.cpt_content = CPT_CONTENT_MM_CONTEXT;
> > +		b.cpt_size = mm->context.size * LDT_ENTRY_SIZE;
> > +
> > +		ctx->write(&b, sizeof(b), ctx);
> > +
> > +		size = mm->context.size * LDT_ENTRY_SIZE;
> > +
> > +		ctx->write(mm->context.ldt, size, ctx);
>
> Error handling?
Thanks again!

>
> > +
> > +		mutex_unlock(&mm->context.lock);
> > +	}
> > +#endif
> > +	return 0;
> > +}
> > +
> > +int cpt_dump_mm(struct task_struct *tsk, struct cpt_context *ctx)
> > +{
> > +	struct mm_struct *mm = tsk->mm;
> > +	struct cpt_mm_image *v;
> > +	struct vm_area_struct *vma;
> > +	int err;
> > +
> > +	v = kzalloc(sizeof(*v), GFP_KERNEL);
> > +	if (!v)
> > +		return -ENOMEM;
> > +
> > +	v->cpt_len = sizeof(*v);
> > +	v->cpt_type = CPT_OBJ_MM;
> > +	v->cpt_hdrlen = sizeof(*v);
> > +	v->cpt_content = CPT_CONTENT_ARRAY;
> > +
> > +	down_read(&mm->mmap_sem);
> > +	v->cpt_start_code = mm->start_code;
> > +	v->cpt_end_code = mm->end_code;
> > +	v->cpt_start_data = mm->start_data;
> > +	v->cpt_end_data = mm->end_data;
> > +	v->cpt_start_brk = mm->start_brk;
> > +	v->cpt_brk = mm->brk;
> > +	v->cpt_start_stack = mm->start_stack;
> > +	v->cpt_start_arg = mm->arg_start;
> > +	v->cpt_end_arg = mm->arg_end;
> > +	v->cpt_start_env = mm->env_start;
> > +	v->cpt_end_env = mm->env_end;
> > +	v->cpt_def_flags = mm->def_flags;
> > +	v->cpt_flags = mm->flags;
> > +	v->cpt_map_count = mm->map_count;
> > +
> > +	err = ctx->write(v, sizeof(*v), ctx);
> > +	kfree(v);
> > +
> > +	if (err) {
> > +		eprintk("error during writing mm\n");
> > +		goto err_up;
> > +	}
> > +
> > +	for (vma = mm->mmap; vma; vma = vma->vm_next) {
> > +		if ((err = dump_one_vma(mm, vma, ctx)) != 0)
> > +			goto err_up;
> > +	}
> > +
> > +	err = cpt_dump_mm_context(mm, ctx);
> > +
> > +err_up:
> > +	up_read(&mm->mmap_sem);
> > +
> > +	return err;
> > +}
> > +
>
> [...]
>
> Louis

  reply	other threads:[~2008-10-22  8:58 UTC|newest]

Thread overview: 49+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-10-17 23:11 [PATCH 0/10] OpenVZ kernel based checkpointing/restart (v2) Andrey Mirkin
2008-10-17 23:11 ` [PATCH 01/10] Introduce trivial sys_checkpoint and sys_restore system calls Andrey Mirkin
2008-10-17 23:11   ` [PATCH 02/10] Make checkpoint/restart functionality modular Andrey Mirkin
2008-10-17 23:11     ` [PATCH 03/10] Introduce context structure needed during checkpointing/restart Andrey Mirkin
2008-10-17 23:11       ` [PATCH 04/10] Introduce container dump function Andrey Mirkin
2008-10-17 23:11         ` [PATCH 05/10] Introduce function to dump process Andrey Mirkin
2008-10-17 23:11           ` [PATCH 06/10] Introduce functions to dump mm Andrey Mirkin
2008-10-17 23:11             ` [PATCH 07/10] Introduce function for restarting a container Andrey Mirkin
2008-10-17 23:11               ` [PATCH 08/10] Introduce functions to restart a process Andrey Mirkin
2008-10-17 23:11                 ` [PATCH 09/10] Introduce functions to restore mm Andrey Mirkin
2008-10-17 23:11                   ` [PATCH 10/10] Add support for multiple processes Andrey Mirkin
2008-10-27 15:58                     ` Oren Laadan
2008-10-30  4:55                       ` [Devel] " Andrey Mirkin
2008-10-20  9:23                 ` [PATCH 08/10] Introduce functions to restart a process Cedric Le Goater
2008-10-22  8:49                   ` [Devel] " Andrey Mirkin
2008-10-22  9:25                     ` Louis Rilling
2008-10-22 10:06                       ` Greg Kurz
2008-10-22 10:44                         ` Louis Rilling
2008-10-22 12:44                           ` Greg Kurz
2008-10-22 10:12                       ` Andrey Mirkin
2008-10-22 10:46                         ` Louis Rilling
2008-10-23  8:53                           ` Andrey Mirkin
2008-10-22 15:25                         ` Oren Laadan
2008-10-23  9:00                           ` Andrey Mirkin
2008-10-23 13:57                             ` Dave Hansen
2008-10-24  3:57                               ` Andrey Mirkin
2008-10-25 21:10                                 ` Oren Laadan
2008-10-29 14:52                                   ` Andrey Mirkin
2008-10-30 15:59                                     ` Oren Laadan
2008-10-22 12:47                     ` Cedric Le Goater
2008-10-23  9:54                       ` Andrey Mirkin
2008-10-23 13:49                         ` Dave Hansen
2008-10-24  4:04                           ` Andrey Mirkin
2008-10-20 13:25                 ` Louis Rilling
2008-10-23 10:56                   ` [Devel] " Andrey Mirkin
2008-10-20 12:25             ` [PATCH 06/10] Introduce functions to dump mm Louis Rilling
2008-10-22  8:58               ` Andrey Mirkin [this message]
2008-10-20 17:21             ` Dave Hansen
2008-10-23  8:43               ` [Devel] " Andrey Mirkin
2008-10-23 13:51                 ` Dave Hansen
2008-10-24  4:07                   ` Andrey Mirkin
2008-10-20 11:02           ` [PATCH 05/10] Introduce function to dump process Louis Rilling
2008-10-24  4:15             ` [Devel] " Andrey Mirkin
2008-10-20 17:48           ` Serge E. Hallyn
2008-10-24  4:40             ` [Devel] " Andrey Mirkin
2008-10-20 17:02       ` [PATCH 03/10] Introduce context structure needed during checkpointing/restart Dave Hansen
2008-10-29 15:30         ` [Devel] " Andrey Mirkin
2008-10-20 16:51     ` [PATCH 02/10] Make checkpoint/restart functionality modular Dave Hansen
2008-10-20 16:59     ` Serge E. Hallyn

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=200810221258.44447.major@openvz.org \
    --to=major@openvz.org \
    --cc=Louis.Rilling@kerlabs.com \
    --cc=containers@lists.linux-foundation.org \
    --cc=devel@openvz.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=xemul@openvz.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox