linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [RFC][PATCH] vmalloc: simplify vread()/vwrite()
@ 2010-01-07  1:24 Wu Fengguang
  2010-01-07  1:38 ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 9+ messages in thread
From: Wu Fengguang @ 2010-01-07  1:24 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, Tejun Heo, Ingo Molnar, Nick Piggin, Andi Kleen,
	Hugh Dickins, Christoph Lameter, Linux Memory Management List,
	LKML

vread()/vwrite() is only called from kcore/kmem to access one page at
a time.  So the logic can be vastly simplified.

The changes are:
- remove the vmlist walk and rely solely on vmalloc_to_page()
- replace the VM_IOREMAP check with (page && page_is_ram(pfn))

The VM_IOREMAP check is introduced in commit d0107eb07320b for per-cpu
alloc. Kame, would you double check if this change is OK for that
purpose?

The page_is_ram() check is necessary because kmap_atomic() is not
designed to work with non-RAM pages.

Even for a RAM page, we don't own the page, and cannot assume it's a
_PAGE_CACHE_WB page. So I wonder whether it's necessary to do another
patch to call reserve_memtype() before kmap_atomic() to ensure cache
consistency?

TODO: update comments accordingly

CC: Tejun Heo <tj@kernel.org>
CC: Ingo Molnar <mingo@elte.hu>
CC: Nick Piggin <npiggin@suse.de>
CC: Andi Kleen <andi@firstfloor.org> 
CC: Hugh Dickins <hugh.dickins@tiscali.co.uk>
CC: Christoph Lameter <cl@linux-foundation.org>
CC: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 mm/vmalloc.c |  196 ++++++++++---------------------------------------
 1 file changed, 40 insertions(+), 156 deletions(-)

--- linux-mm.orig/mm/vmalloc.c	2010-01-04 10:23:12.000000000 +0800
+++ linux-mm/mm/vmalloc.c	2010-01-05 12:42:40.000000000 +0800
@@ -1646,87 +1646,6 @@ void *vmalloc_32_user(unsigned long size
 }
 EXPORT_SYMBOL(vmalloc_32_user);
 
-/*
- * small helper routine , copy contents to buf from addr.
- * If the page is not present, fill zero.
- */
-
-static int aligned_vread(char *buf, char *addr, unsigned long count)
-{
-	struct page *p;
-	int copied = 0;
-
-	while (count) {
-		unsigned long offset, length;
-
-		offset = (unsigned long)addr & ~PAGE_MASK;
-		length = PAGE_SIZE - offset;
-		if (length > count)
-			length = count;
-		p = vmalloc_to_page(addr);
-		/*
-		 * To do safe access to this _mapped_ area, we need
-		 * lock. But adding lock here means that we need to add
-		 * overhead of vmalloc()/vfree() calles for this _debug_
-		 * interface, rarely used. Instead of that, we'll use
-		 * kmap() and get small overhead in this access function.
-		 */
-		if (p) {
-			/*
-			 * we can expect USER0 is not used (see vread/vwrite's
-			 * function description)
-			 */
-			void *map = kmap_atomic(p, KM_USER0);
-			memcpy(buf, map + offset, length);
-			kunmap_atomic(map, KM_USER0);
-		} else
-			memset(buf, 0, length);
-
-		addr += length;
-		buf += length;
-		copied += length;
-		count -= length;
-	}
-	return copied;
-}
-
-static int aligned_vwrite(char *buf, char *addr, unsigned long count)
-{
-	struct page *p;
-	int copied = 0;
-
-	while (count) {
-		unsigned long offset, length;
-
-		offset = (unsigned long)addr & ~PAGE_MASK;
-		length = PAGE_SIZE - offset;
-		if (length > count)
-			length = count;
-		p = vmalloc_to_page(addr);
-		/*
-		 * To do safe access to this _mapped_ area, we need
-		 * lock. But adding lock here means that we need to add
-		 * overhead of vmalloc()/vfree() calles for this _debug_
-		 * interface, rarely used. Instead of that, we'll use
-		 * kmap() and get small overhead in this access function.
-		 */
-		if (p) {
-			/*
-			 * we can expect USER0 is not used (see vread/vwrite's
-			 * function description)
-			 */
-			void *map = kmap_atomic(p, KM_USER0);
-			memcpy(map + offset, buf, length);
-			kunmap_atomic(map, KM_USER0);
-		}
-		addr += length;
-		buf += length;
-		copied += length;
-		count -= length;
-	}
-	return copied;
-}
-
 /**
  *	vread() -  read vmalloc area in a safe way.
  *	@buf:		buffer for reading data
@@ -1757,49 +1676,34 @@ static int aligned_vwrite(char *buf, cha
 
 long vread(char *buf, char *addr, unsigned long count)
 {
-	struct vm_struct *tmp;
-	char *vaddr, *buf_start = buf;
-	unsigned long buflen = count;
-	unsigned long n;
-
-	/* Don't allow overflow */
-	if ((unsigned long) addr + count < count)
-		count = -(unsigned long) addr;
+	struct page *p;
+	void *map;
+	int offset = (unsigned long)addr & (PAGE_SIZE - 1);
 
-	read_lock(&vmlist_lock);
-	for (tmp = vmlist; count && tmp; tmp = tmp->next) {
-		vaddr = (char *) tmp->addr;
-		if (addr >= vaddr + tmp->size - PAGE_SIZE)
-			continue;
-		while (addr < vaddr) {
-			if (count == 0)
-				goto finished;
-			*buf = '\0';
-			buf++;
-			addr++;
-			count--;
-		}
-		n = vaddr + tmp->size - PAGE_SIZE - addr;
-		if (n > count)
-			n = count;
-		if (!(tmp->flags & VM_IOREMAP))
-			aligned_vread(buf, addr, n);
-		else /* IOREMAP area is treated as memory hole */
-			memset(buf, 0, n);
-		buf += n;
-		addr += n;
-		count -= n;
-	}
-finished:
-	read_unlock(&vmlist_lock);
+	/* Assume subpage access */
+	BUG_ON(count > PAGE_SIZE - offset);
 
-	if (buf == buf_start)
+	p = vmalloc_to_page(addr);
+	if (!p || !page_is_ram(page_to_pfn(p))) {
+		memset(buf, 0, count);
 		return 0;
-	/* zero-fill memory holes */
-	if (buf != buf_start + buflen)
-		memset(buf, 0, buflen - (buf - buf_start));
+	}
 
-	return buflen;
+	/*
+	 * To do safe access to this _mapped_ area, we need
+	 * lock. But adding lock here means that we need to add
+	 * overhead of vmalloc()/vfree() calles for this _debug_
+	 * interface, rarely used. Instead of that, we'll use
+	 * kmap() and get small overhead in this access function.
+	 *
+	 * we can expect USER0 is not used (see vread/vwrite's
+	 * function description)
+	 */
+	map = kmap_atomic(p, KM_USER0);
+	memcpy(buf, map + offset, count);
+	kunmap_atomic(map, KM_USER0);
+
+	return count;
 }
 
 /**
@@ -1834,44 +1738,24 @@ finished:
 
 long vwrite(char *buf, char *addr, unsigned long count)
 {
-	struct vm_struct *tmp;
-	char *vaddr;
-	unsigned long n, buflen;
-	int copied = 0;
-
-	/* Don't allow overflow */
-	if ((unsigned long) addr + count < count)
-		count = -(unsigned long) addr;
-	buflen = count;
+	struct page *p;
+	void *map;
+	int offset = (unsigned long)addr & (PAGE_SIZE - 1);
 
-	read_lock(&vmlist_lock);
-	for (tmp = vmlist; count && tmp; tmp = tmp->next) {
-		vaddr = (char *) tmp->addr;
-		if (addr >= vaddr + tmp->size - PAGE_SIZE)
-			continue;
-		while (addr < vaddr) {
-			if (count == 0)
-				goto finished;
-			buf++;
-			addr++;
-			count--;
-		}
-		n = vaddr + tmp->size - PAGE_SIZE - addr;
-		if (n > count)
-			n = count;
-		if (!(tmp->flags & VM_IOREMAP)) {
-			aligned_vwrite(buf, addr, n);
-			copied++;
-		}
-		buf += n;
-		addr += n;
-		count -= n;
-	}
-finished:
-	read_unlock(&vmlist_lock);
-	if (!copied)
+	/* Assume subpage access */
+	BUG_ON(count > PAGE_SIZE - offset);
+
+	p = vmalloc_to_page(addr);
+	if (!p)
+		return 0;
+	if (!page_is_ram(page_to_pfn(p)))
 		return 0;
-	return buflen;
+
+	map = kmap_atomic(p, KM_USER0);
+	memcpy(map + offset, buf, count);
+	kunmap_atomic(map, KM_USER0);
+
+	return count;
 }
 
 /**

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC][PATCH] vmalloc: simplify vread()/vwrite()
  2010-01-07  1:24 [RFC][PATCH] vmalloc: simplify vread()/vwrite() Wu Fengguang
@ 2010-01-07  1:38 ` KAMEZAWA Hiroyuki
  2010-01-07  2:50   ` Wu Fengguang
  0 siblings, 1 reply; 9+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-07  1:38 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Andrew Morton, Tejun Heo, Ingo Molnar, Nick Piggin, Andi Kleen,
	Hugh Dickins, Christoph Lameter, Linux Memory Management List,
	LKML

On Thu, 7 Jan 2010 09:24:59 +0800
Wu Fengguang <fengguang.wu@intel.com> wrote:

> vread()/vwrite() is only called from kcore/kmem to access one page at
> a time.  So the logic can be vastly simplified.
> 
I recommend renaming the function, because the safety of the function has
changed, and the rename makes clear which callers are affected.


> The changes are:
> - remove the vmlist walk and rely solely on vmalloc_to_page()
> - replace the VM_IOREMAP check with (page && page_is_ram(pfn))
> 
> The VM_IOREMAP check is introduced in commit d0107eb07320b for per-cpu
> alloc. Kame, would you double check if this change is OK for that
> purpose?
> 
I think VM_IOREMAP is for avoiding access to device configuration areas and
unexpected breakage of devices. So, VM_IOREMAP areas should be skipped by
the caller. (My patch _just_ moves that avoidance from the callers into vread()/vwrite().)


> The page_is_ram() check is necessary because kmap_atomic() is not
> designed to work with non-RAM pages.
> 
I think page_is_ram() is not a complete method... on x86, it just checks
e820's memory ranges. Checking VM_IOREMAP is better, I think.

> Even for a RAM page, we don't own the page, and cannot assume it's a
> _PAGE_CACHE_WB page. So I wonder whether it's necessary to do another
> patch to call reserve_memtype() before kmap_atomic() to ensure cache
> consistency?
> 
> TODO: update comments accordingly
> 

BTW, the f->f_pos problem on 64bit machines still exists, so this patch is still
hard to test. I stopped working on that because nobody showed any interest.

I have no objection to your direction.

but please rewrite the function explanation as
"addr" should be page aligned and bufsize should be a multiple of page size."
and change the function names.

Thanks,
-Kame


> CC: Tejun Heo <tj@kernel.org>
> CC: Ingo Molnar <mingo@elte.hu>
> CC: Nick Piggin <npiggin@suse.de>
> CC: Andi Kleen <andi@firstfloor.org> 
> CC: Hugh Dickins <hugh.dickins@tiscali.co.uk>
> CC: Christoph Lameter <cl@linux-foundation.org>
> CC: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
> ---
>  mm/vmalloc.c |  196 ++++++++++---------------------------------------
>  1 file changed, 40 insertions(+), 156 deletions(-)
> 
> --- linux-mm.orig/mm/vmalloc.c	2010-01-04 10:23:12.000000000 +0800
> +++ linux-mm/mm/vmalloc.c	2010-01-05 12:42:40.000000000 +0800
> @@ -1646,87 +1646,6 @@ void *vmalloc_32_user(unsigned long size
>  }
>  EXPORT_SYMBOL(vmalloc_32_user);
>  
> -/*
> - * small helper routine , copy contents to buf from addr.
> - * If the page is not present, fill zero.
> - */
> -
> -static int aligned_vread(char *buf, char *addr, unsigned long count)
> -{
> -	struct page *p;
> -	int copied = 0;
> -
> -	while (count) {
> -		unsigned long offset, length;
> -
> -		offset = (unsigned long)addr & ~PAGE_MASK;
> -		length = PAGE_SIZE - offset;
> -		if (length > count)
> -			length = count;
> -		p = vmalloc_to_page(addr);
> -		/*
> -		 * To do safe access to this _mapped_ area, we need
> -		 * lock. But adding lock here means that we need to add
> -		 * overhead of vmalloc()/vfree() calles for this _debug_
> -		 * interface, rarely used. Instead of that, we'll use
> -		 * kmap() and get small overhead in this access function.
> -		 */
> -		if (p) {
> -			/*
> -			 * we can expect USER0 is not used (see vread/vwrite's
> -			 * function description)
> -			 */
> -			void *map = kmap_atomic(p, KM_USER0);
> -			memcpy(buf, map + offset, length);
> -			kunmap_atomic(map, KM_USER0);
> -		} else
> -			memset(buf, 0, length);
> -
> -		addr += length;
> -		buf += length;
> -		copied += length;
> -		count -= length;
> -	}
> -	return copied;
> -}
> -
> -static int aligned_vwrite(char *buf, char *addr, unsigned long count)
> -{
> -	struct page *p;
> -	int copied = 0;
> -
> -	while (count) {
> -		unsigned long offset, length;
> -
> -		offset = (unsigned long)addr & ~PAGE_MASK;
> -		length = PAGE_SIZE - offset;
> -		if (length > count)
> -			length = count;
> -		p = vmalloc_to_page(addr);
> -		/*
> -		 * To do safe access to this _mapped_ area, we need
> -		 * lock. But adding lock here means that we need to add
> -		 * overhead of vmalloc()/vfree() calles for this _debug_
> -		 * interface, rarely used. Instead of that, we'll use
> -		 * kmap() and get small overhead in this access function.
> -		 */
> -		if (p) {
> -			/*
> -			 * we can expect USER0 is not used (see vread/vwrite's
> -			 * function description)
> -			 */
> -			void *map = kmap_atomic(p, KM_USER0);
> -			memcpy(map + offset, buf, length);
> -			kunmap_atomic(map, KM_USER0);
> -		}
> -		addr += length;
> -		buf += length;
> -		copied += length;
> -		count -= length;
> -	}
> -	return copied;
> -}
> -
>  /**
>   *	vread() -  read vmalloc area in a safe way.
>   *	@buf:		buffer for reading data
> @@ -1757,49 +1676,34 @@ static int aligned_vwrite(char *buf, cha
>  
>  long vread(char *buf, char *addr, unsigned long count)
>  {
> -	struct vm_struct *tmp;
> -	char *vaddr, *buf_start = buf;
> -	unsigned long buflen = count;
> -	unsigned long n;
> -
> -	/* Don't allow overflow */
> -	if ((unsigned long) addr + count < count)
> -		count = -(unsigned long) addr;
> +	struct page *p;
> +	void *map;
> +	int offset = (unsigned long)addr & (PAGE_SIZE - 1);
>  
> -	read_lock(&vmlist_lock);
> -	for (tmp = vmlist; count && tmp; tmp = tmp->next) {
> -		vaddr = (char *) tmp->addr;
> -		if (addr >= vaddr + tmp->size - PAGE_SIZE)
> -			continue;
> -		while (addr < vaddr) {
> -			if (count == 0)
> -				goto finished;
> -			*buf = '\0';
> -			buf++;
> -			addr++;
> -			count--;
> -		}
> -		n = vaddr + tmp->size - PAGE_SIZE - addr;
> -		if (n > count)
> -			n = count;
> -		if (!(tmp->flags & VM_IOREMAP))
> -			aligned_vread(buf, addr, n);
> -		else /* IOREMAP area is treated as memory hole */
> -			memset(buf, 0, n);
> -		buf += n;
> -		addr += n;
> -		count -= n;
> -	}
> -finished:
> -	read_unlock(&vmlist_lock);
> +	/* Assume subpage access */
> +	BUG_ON(count > PAGE_SIZE - offset);
>  
> -	if (buf == buf_start)
> +	p = vmalloc_to_page(addr);
> +	if (!p || !page_is_ram(page_to_pfn(p))) {
> +		memset(buf, 0, count);
>  		return 0;
> -	/* zero-fill memory holes */
> -	if (buf != buf_start + buflen)
> -		memset(buf, 0, buflen - (buf - buf_start));
> +	}
>  
> -	return buflen;
> +	/*
> +	 * To do safe access to this _mapped_ area, we need
> +	 * lock. But adding lock here means that we need to add
> +	 * overhead of vmalloc()/vfree() calles for this _debug_
> +	 * interface, rarely used. Instead of that, we'll use
> +	 * kmap() and get small overhead in this access function.
> +	 *
> +	 * we can expect USER0 is not used (see vread/vwrite's
> +	 * function description)
> +	 */
> +	map = kmap_atomic(p, KM_USER0);
> +	memcpy(buf, map + offset, count);
> +	kunmap_atomic(map, KM_USER0);
> +
> +	return count;
>  }
>  
>  /**
> @@ -1834,44 +1738,24 @@ finished:
>  
>  long vwrite(char *buf, char *addr, unsigned long count)
>  {
> -	struct vm_struct *tmp;
> -	char *vaddr;
> -	unsigned long n, buflen;
> -	int copied = 0;
> -
> -	/* Don't allow overflow */
> -	if ((unsigned long) addr + count < count)
> -		count = -(unsigned long) addr;
> -	buflen = count;
> +	struct page *p;
> +	void *map;
> +	int offset = (unsigned long)addr & (PAGE_SIZE - 1);
>  
> -	read_lock(&vmlist_lock);
> -	for (tmp = vmlist; count && tmp; tmp = tmp->next) {
> -		vaddr = (char *) tmp->addr;
> -		if (addr >= vaddr + tmp->size - PAGE_SIZE)
> -			continue;
> -		while (addr < vaddr) {
> -			if (count == 0)
> -				goto finished;
> -			buf++;
> -			addr++;
> -			count--;
> -		}
> -		n = vaddr + tmp->size - PAGE_SIZE - addr;
> -		if (n > count)
> -			n = count;
> -		if (!(tmp->flags & VM_IOREMAP)) {
> -			aligned_vwrite(buf, addr, n);
> -			copied++;
> -		}
> -		buf += n;
> -		addr += n;
> -		count -= n;
> -	}
> -finished:
> -	read_unlock(&vmlist_lock);
> -	if (!copied)
> +	/* Assume subpage access */
> +	BUG_ON(count > PAGE_SIZE - offset);
> +
> +	p = vmalloc_to_page(addr);
> +	if (!p)
> +		return 0;
> +	if (!page_is_ram(page_to_pfn(p)))
>  		return 0;
> -	return buflen;
> +
> +	map = kmap_atomic(p, KM_USER0);
> +	memcpy(map + offset, buf, count);
> +	kunmap_atomic(map, KM_USER0);
> +
> +	return count;
>  }
>  
>  /**
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC][PATCH] vmalloc: simplify vread()/vwrite()
  2010-01-07  1:38 ` KAMEZAWA Hiroyuki
@ 2010-01-07  2:50   ` Wu Fengguang
  2010-01-07  2:57     ` KAMEZAWA Hiroyuki
  2010-01-07  3:15     ` Huang Ying
  0 siblings, 2 replies; 9+ messages in thread
From: Wu Fengguang @ 2010-01-07  2:50 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, Tejun Heo, Ingo Molnar, Nick Piggin, Andi Kleen,
	Hugh Dickins, Christoph Lameter, Linux Memory Management List,
	LKML, Huang, Ying

On Thu, Jan 07, 2010 at 09:38:25AM +0800, KAMEZAWA Hiroyuki wrote:
> On Thu, 7 Jan 2010 09:24:59 +0800
> Wu Fengguang <fengguang.wu@intel.com> wrote:
> 
> > vread()/vwrite() is only called from kcore/kmem to access one page at
> > a time.  So the logic can be vastly simplified.
> > 
> I recommend you to rename the function because safety of function is
> changed and you can show what callers are influenced.

OK.
 
> > The changes are:
> > - remove the vmlist walk and rely solely on vmalloc_to_page()
> > - replace the VM_IOREMAP check with (page && page_is_ram(pfn))
> > 
> > The VM_IOREMAP check is introduced in commit d0107eb07320b for per-cpu
> > alloc. Kame, would you double check if this change is OK for that
> > purpose?
> > 
> I think VM_IOREMAP is for avoiding access to device configuration area and
> unexpected breakage in device. Then, VM_IOREMAP are should be skipped by
> the caller. (My patch _just_ moves the avoidance of callers to vread()/vwrite())

"device configuration area" is not RAM, so testing of RAM would be
able to skip them?

> 
> > The page_is_ram() check is necessary because kmap_atomic() is not
> > designed to work with non-RAM pages.
> > 
> I think page_is_ram() is not a complete method...on x86, it just check
> e820's memory range. checking VM_IOREMAP is better, I think.

(double check) Not complete or not safe?

EFI seems to not update e820 table by default.  Ying, do you know why?

> > Even for a RAM page, we don't own the page, and cannot assume it's a
> > _PAGE_CACHE_WB page. So I wonder whether it's necessary to do another
> > patch to call reserve_memtype() before kmap_atomic() to ensure cache
> > consistency?
> > 
> > TODO: update comments accordingly
> > 
> 
> BTW, f->f_pos problem on 64bit machine still exists and this patch is still
> hard to test. I stopped that because anyone doesn't show any interests.

I'm using your patch :)

I feel least confident about this patch, so I submitted it as an RFC first.
I'll then submit a full patch series including your f_pos fix.

> I have no objection to your direction.
> 
> but please rewrite the function explanation as
> "addr" should be page alinged and bufsize should be multiple of page size."
> and change the function names.

OK, I'll rename it to vread_page().

Thanks,
Fengguang

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC][PATCH] vmalloc: simplify vread()/vwrite()
  2010-01-07  2:50   ` Wu Fengguang
@ 2010-01-07  2:57     ` KAMEZAWA Hiroyuki
  2010-01-07  3:21       ` Wu Fengguang
  2010-01-07  3:15     ` Huang Ying
  1 sibling, 1 reply; 9+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-07  2:57 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Andrew Morton, Tejun Heo, Ingo Molnar, Nick Piggin, Andi Kleen,
	Hugh Dickins, Christoph Lameter, Linux Memory Management List,
	LKML, Huang, Ying

On Thu, 7 Jan 2010 10:50:54 +0800
Wu Fengguang <fengguang.wu@intel.com> wrote:
 
> > > The changes are:
> > > - remove the vmlist walk and rely solely on vmalloc_to_page()
> > > - replace the VM_IOREMAP check with (page && page_is_ram(pfn))
> > > 
> > > The VM_IOREMAP check is introduced in commit d0107eb07320b for per-cpu
> > > alloc. Kame, would you double check if this change is OK for that
> > > purpose?
> > > 
> > I think VM_IOREMAP is for avoiding access to device configuration area and
> > unexpected breakage in device. Then, VM_IOREMAP are should be skipped by
> > the caller. (My patch _just_ moves the avoidance of callers to vread()/vwrite())
> 
> "device configuration area" is not RAM, so testing of RAM would be
> able to skip them?
>
Sorry, that's an area I'm not sure about.
But the page_is_ram() implementations other than x86's seem not very safe...
(And it seems that it's not defined on some archs.)

> > 
> > > The page_is_ram() check is necessary because kmap_atomic() is not
> > > designed to work with non-RAM pages.
> > > 
> > I think page_is_ram() is not a complete method...on x86, it just check
> > e820's memory range. checking VM_IOREMAP is better, I think.
> 
> (double check) Not complete or not safe?
> 
I think it's not safe, because e820 doesn't seem to be updated.

> EFI seems to not update e820 table by default.  Ying, do you know why?
> 

I hope all these kinds of checks can be fixed by kernel/resource.c in a generic way...
Right now, each arch has its own.

> > > Even for a RAM page, we don't own the page, and cannot assume it's a
> > > _PAGE_CACHE_WB page. So I wonder whether it's necessary to do another
> > > patch to call reserve_memtype() before kmap_atomic() to ensure cache
> > > consistency?
> > > 
> > > TODO: update comments accordingly
> > > 
> > 
> > BTW, f->f_pos problem on 64bit machine still exists and this patch is still
> > hard to test. I stopped that because anyone doesn't show any interests.
> 
> I'm using your patch :)
> 
> I feel most inconfident on this patch, so submitted it for RFC first.
> I'll then submit a full patch series including your f_pos fix.
> 
Thank you, it's helpful.

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC][PATCH] vmalloc: simplify vread()/vwrite()
  2010-01-07  2:50   ` Wu Fengguang
  2010-01-07  2:57     ` KAMEZAWA Hiroyuki
@ 2010-01-07  3:15     ` Huang Ying
  2010-01-07  3:23       ` KAMEZAWA Hiroyuki
  1 sibling, 1 reply; 9+ messages in thread
From: Huang Ying @ 2010-01-07  3:15 UTC (permalink / raw)
  To: Wu, Fengguang
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, Tejun Heo, Ingo Molnar,
	Nick Piggin, Andi Kleen, Hugh Dickins, Christoph Lameter,
	Linux Memory Management List, LKML

On Thu, 2010-01-07 at 10:50 +0800, Wu, Fengguang wrote:
> On Thu, Jan 07, 2010 at 09:38:25AM +0800, KAMEZAWA Hiroyuki wrote:
> > On Thu, 7 Jan 2010 09:24:59 +0800
> > Wu Fengguang <fengguang.wu@intel.com> wrote:
> > 
> > > vread()/vwrite() is only called from kcore/kmem to access one page at
> > > a time.  So the logic can be vastly simplified.
> > > 
> > I recommend you to rename the function because safety of function is
> > changed and you can show what callers are influenced.
> 
> OK.
>  
> > > The changes are:
> > > - remove the vmlist walk and rely solely on vmalloc_to_page()
> > > - replace the VM_IOREMAP check with (page && page_is_ram(pfn))
> > > 
> > > The VM_IOREMAP check is introduced in commit d0107eb07320b for per-cpu
> > > alloc. Kame, would you double check if this change is OK for that
> > > purpose?
> > > 
> > I think VM_IOREMAP is for avoiding access to device configuration area and
> > unexpected breakage in device. Then, VM_IOREMAP are should be skipped by
> > the caller. (My patch _just_ moves the avoidance of callers to vread()/vwrite())
> 
> "device configuration area" is not RAM, so testing of RAM would be
> able to skip them?
> 
> > 
> > > The page_is_ram() check is necessary because kmap_atomic() is not
> > > designed to work with non-RAM pages.
> > > 
> > I think page_is_ram() is not a complete method...on x86, it just check
> > e820's memory range. checking VM_IOREMAP is better, I think.
> 
> (double check) Not complete or not safe?
> 
> EFI seems to not update e820 table by default.  Ying, do you know why?

In EFI system, E820 table is constructed from EFI memory map in boot
loader, so I think you can rely on E820 table.

Best Regards,
Huang Ying


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC][PATCH] vmalloc: simplify vread()/vwrite()
  2010-01-07  2:57     ` KAMEZAWA Hiroyuki
@ 2010-01-07  3:21       ` Wu Fengguang
  0 siblings, 0 replies; 9+ messages in thread
From: Wu Fengguang @ 2010-01-07  3:21 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, Tejun Heo, Ingo Molnar, Nick Piggin, Andi Kleen,
	Hugh Dickins, Christoph Lameter, Linux Memory Management List,
	LKML, Huang, Ying

On Thu, Jan 07, 2010 at 10:57:36AM +0800, KAMEZAWA Hiroyuki wrote:
> On Thu, 7 Jan 2010 10:50:54 +0800
> Wu Fengguang <fengguang.wu@intel.com> wrote:
>  
> > > > The changes are:
> > > > - remove the vmlist walk and rely solely on vmalloc_to_page()
> > > > - replace the VM_IOREMAP check with (page && page_is_ram(pfn))
> > > > 
> > > > The VM_IOREMAP check is introduced in commit d0107eb07320b for per-cpu
> > > > alloc. Kame, would you double check if this change is OK for that
> > > > purpose?
> > > > 
> > > I think VM_IOREMAP is for avoiding access to device configuration area and
> > > unexpected breakage in device. Then, VM_IOREMAP are should be skipped by
> > > the caller. (My patch _just_ moves the avoidance of callers to vread()/vwrite())
> > 
> > "device configuration area" is not RAM, so testing of RAM would be
> > able to skip them?
> >
> Sorry, that's an area what I'm not sure. 

I believe the fundamental correctness requirement would be: to avoid
accessing _PAGE_CACHE_UC/WC pages by building a _PAGE_CACHE_WB mapping
to them (which kmap_atomic does), whether it be physical RAM or a device
address area.

For example, /dev/mem allows access to
- device areas, with proper _PAGE_CACHE_* via ioremap_cache() 
- _low_ physical RAM, which it directly reuse the 1:1 kernel mapping

> But, page_is_ram() implementation other than x86 seems not very safe...
> (And it seems that it's not defiend in some archs.)

Ah, I didn't know about archs other than x86.  And hotplug won't update e820
either...

> > > 
> > > > The page_is_ram() check is necessary because kmap_atomic() is not
> > > > designed to work with non-RAM pages.
> > > > 
> > > I think page_is_ram() is not a complete method...on x86, it just check
> > > e820's memory range. checking VM_IOREMAP is better, I think.
> > 
> > (double check) Not complete or not safe?
> > 
> I think not-safe because e820 doesn't seem to be updated.
> 
> > EFI seems to not update e820 table by default.  Ying, do you know why?

Ying just confirmed that the kernel doesn't update e820 from the EFI memmap
because the bootloader fakes an e820 table based on the EFI memmap,
in order to run legacy/unmodified kernels.

> 
> I hope all this kinds can be fixed by kernel/resource.c in generic way....
> Now, each archs have its own.
 
Agreed.

> > > > Even for a RAM page, we don't own the page, and cannot assume it's a
> > > > _PAGE_CACHE_WB page. So I wonder whether it's necessary to do another
> > > > patch to call reserve_memtype() before kmap_atomic() to ensure cache
> > > > consistency?
> > > > 
> > > > TODO: update comments accordingly
> > > > 
> > > 
> > > BTW, f->f_pos problem on 64bit machine still exists and this patch is still
> > > hard to test. I stopped that because anyone doesn't show any interests.
> > 
> > I'm using your patch :)
> > 
> > I feel most inconfident on this patch, so submitted it for RFC first.
> > I'll then submit a full patch series including your f_pos fix.
> > 
> Thank you, it's helpful.

Thanks,
Fengguang

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC][PATCH] vmalloc: simplify vread()/vwrite()
  2010-01-07  3:15     ` Huang Ying
@ 2010-01-07  3:23       ` KAMEZAWA Hiroyuki
  2010-01-07  5:24         ` Wu Fengguang
  0 siblings, 1 reply; 9+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-07  3:23 UTC (permalink / raw)
  To: Huang Ying
  Cc: Wu, Fengguang, Andrew Morton, Tejun Heo, Ingo Molnar, Nick Piggin,
	Andi Kleen, Hugh Dickins, Christoph Lameter,
	Linux Memory Management List, LKML

On Thu, 07 Jan 2010 11:15:41 +0800
Huang Ying <ying.huang@intel.com> wrote:

> > > 
> > > > The page_is_ram() check is necessary because kmap_atomic() is not
> > > > designed to work with non-RAM pages.
> > > > 
> > > I think page_is_ram() is not a complete method...on x86, it just check
> > > e820's memory range. checking VM_IOREMAP is better, I think.
> > 
> > (double check) Not complete or not safe?
> > 
> > EFI seems to not update e820 table by default.  Ying, do you know why?
> 
> In EFI system, E820 table is constructed from EFI memory map in boot
> loader, so I think you can rely on E820 table.
> 
Yes, we can rely on it. But the concern here is that we cannot get any
information about ioremap via the e820 map. 

But yes,
== ioremap()
 140         for (pfn = phys_addr >> PAGE_SHIFT;
 141                                 (pfn << PAGE_SHIFT) < (last_addr & PAGE_MASK);
 142                                 pfn++) {
 143 
 144                 int is_ram = page_is_ram(pfn);
 145 
 146                 if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
 147                         return NULL;
 148                 WARN_ON_ONCE(is_ram);
 149         }
==
you'll get warned before access if "ram" area is remapped...

But, regarding this patch, it seems that page_is_ram() is not free from architecture
dependency.


Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC][PATCH] vmalloc: simplify vread()/vwrite()
  2010-01-07  3:23       ` KAMEZAWA Hiroyuki
@ 2010-01-07  5:24         ` Wu Fengguang
  2010-01-07  5:39           ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 9+ messages in thread
From: Wu Fengguang @ 2010-01-07  5:24 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Huang, Ying, Andrew Morton, Tejun Heo, Ingo Molnar, Nick Piggin,
	Andi Kleen, Hugh Dickins, Christoph Lameter,
	Linux Memory Management List, LKML

On Thu, Jan 07, 2010 at 11:23:04AM +0800, KAMEZAWA Hiroyuki wrote:
> On Thu, 07 Jan 2010 11:15:41 +0800
> Huang Ying <ying.huang@intel.com> wrote:
> 
> > > > 
> > > > > The page_is_ram() check is necessary because kmap_atomic() is not
> > > > > designed to work with non-RAM pages.
> > > > > 
> > > > I think page_is_ram() is not a complete method...on x86, it just check
> > > > e820's memory range. checking VM_IOREMAP is better, I think.
> > > 
> > > (double check) Not complete or not safe?
> > > 
> > > EFI seems to not update e820 table by default.  Ying, do you know why?
> > 
> > In EFI system, E820 table is constructed from EFI memory map in boot
> > loader, so I think you can rely on E820 table.
> > 
> Yes, we can rely on. But concerns here is that we cannot get any
> information of ioremap via e820 map. 
> 
> But yes,
> == ioremap()
>  140         for (pfn = phys_addr >> PAGE_SHIFT;
>  141                                 (pfn << PAGE_SHIFT) < (last_addr & PAGE_MASK);
>  142                                 pfn++) {
>  143 
>  144                 int is_ram = page_is_ram(pfn);
>  145 
>  146                 if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
>  147                         return NULL;
>  148                 WARN_ON_ONCE(is_ram);
>  149         }
> ==
> you'll get warned before access if "ram" area is remapped...

Right.

> But, about this patch, it seems that page_is_ram() is not free from architecture
> dependency.

Yes this is a problem. We can provide a generic page_is_ram() as below.
And could further convert the existing x86 (and others) page_is_ram()
to be resource-based -- since at least for now the e820 table won't be
updated on memory hotplug.

Thanks,
Fengguang
---
 include/linux/ioport.h |    2 ++
 kernel/resource.c      |   18 ++++++++++++++++++
 2 files changed, 20 insertions(+)

--- linux-mm.orig/kernel/resource.c	2010-01-07 12:40:55.000000000 +0800
+++ linux-mm/kernel/resource.c	2010-01-07 13:13:46.000000000 +0800
@@ -297,6 +297,24 @@ int walk_system_ram_range(unsigned long 
 
 #endif
 
+static int __page_is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
+{
+	int *is_ram = arg;
+
+	*is_ram = 1;
+
+	return 1;
+}
+
+int __attribute__((weak)) page_is_ram(unsigned long pagenr)
+{
+	int is_ram = 0;
+
+	walk_system_ram_range(pagenr, 1, &is_ram, __page_is_ram);
+
+	return is_ram;
+}
+
 /*
  * Find empty slot in the resource tree given range and alignment.
  */
--- linux-mm.orig/include/linux/ioport.h	2010-01-07 13:11:43.000000000 +0800
+++ linux-mm/include/linux/ioport.h	2010-01-07 13:12:37.000000000 +0800
@@ -188,5 +188,7 @@ extern int
 walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
 		void *arg, int (*func)(unsigned long, unsigned long, void *));
 
+extern int page_is_ram(unsigned long pagenr);
+
 #endif /* __ASSEMBLY__ */
 #endif	/* _LINUX_IOPORT_H */

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC][PATCH] vmalloc: simplify vread()/vwrite()
  2010-01-07  5:24         ` Wu Fengguang
@ 2010-01-07  5:39           ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 9+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-07  5:39 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Huang, Ying, Andrew Morton, Tejun Heo, Ingo Molnar, Nick Piggin,
	Andi Kleen, Hugh Dickins, Christoph Lameter,
	Linux Memory Management List, LKML

On Thu, 7 Jan 2010 13:24:03 +0800
Wu Fengguang <fengguang.wu@intel.com> wrote:

> On Thu, Jan 07, 2010 at 11:23:04AM +0800, KAMEZAWA Hiroyuki wrote:
> > On Thu, 07 Jan 2010 11:15:41 +0800
> > Huang Ying <ying.huang@intel.com> wrote:
> > 
> > > > > 
> > > > > > The page_is_ram() check is necessary because kmap_atomic() is not
> > > > > > designed to work with non-RAM pages.
> > > > > > 
> > > > > I think page_is_ram() is not a complete method...on x86, it just check
> > > > > e820's memory range. checking VM_IOREMAP is better, I think.
> > > > 
> > > > (double check) Not complete or not safe?
> > > > 
> > > > EFI seems to not update e820 table by default.  Ying, do you know why?
> > > 
> > > In EFI system, E820 table is constructed from EFI memory map in boot
> > > loader, so I think you can rely on E820 table.
> > > 
> > Yes, we can rely on. But concerns here is that we cannot get any
> > information of ioremap via e820 map. 
> > 
> > But yes,
> > == ioremap()
> >  140         for (pfn = phys_addr >> PAGE_SHIFT;
> >  141                                 (pfn << PAGE_SHIFT) < (last_addr & PAGE_MASK);
> >  142                                 pfn++) {
> >  143 
> >  144                 int is_ram = page_is_ram(pfn);
> >  145 
> >  146                 if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
> >  147                         return NULL;
> >  148                 WARN_ON_ONCE(is_ram);
> >  149         }
> > ==
> > you'll get warned before access if "ram" area is remapped...
> 
> Right.
> 
> > But, about this patch, it seems that page_is_ram() is not free from architecture
> > dependency.
> 
> Yes this is a problem. We can provide a generic page_is_ram() as below.
> And could further convert the existing x86 (and others) page_is_ram()
> to be resource-based -- since at least for now the e820 table won't be
> updated on memory hotplug.
> 
> Thanks,
> Fengguang

seems nice :)

Thanks,
-Kame

> ---
>  include/linux/ioport.h |    2 ++
>  kernel/resource.c      |   18 ++++++++++++++++++
>  2 files changed, 20 insertions(+)
> 
> --- linux-mm.orig/kernel/resource.c	2010-01-07 12:40:55.000000000 +0800
> +++ linux-mm/kernel/resource.c	2010-01-07 13:13:46.000000000 +0800
> @@ -297,6 +297,24 @@ int walk_system_ram_range(unsigned long 
>  
>  #endif
>  
> +static int __page_is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
> +{
> +	int *is_ram = arg;
> +
> +	*is_ram = 1;
> +
> +	return 1;
> +}
> +
> +int __attribute__((weak)) page_is_ram(unsigned long pagenr)
> +{
> +	int is_ram = 0;
> +
> +	walk_system_ram_range(pagenr, 1, &is_ram, __page_is_ram);
> +
> +	return is_ram;
> +}
> +
>  /*
>   * Find empty slot in the resource tree given range and alignment.
>   */
> --- linux-mm.orig/include/linux/ioport.h	2010-01-07 13:11:43.000000000 +0800
> +++ linux-mm/include/linux/ioport.h	2010-01-07 13:12:37.000000000 +0800
> @@ -188,5 +188,7 @@ extern int
>  walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
>  		void *arg, int (*func)(unsigned long, unsigned long, void *));
>  
> +extern int page_is_ram(unsigned long pagenr);
> +
>  #endif /* __ASSEMBLY__ */
>  #endif	/* _LINUX_IOPORT_H */
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2010-01-07  5:42 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-01-07  1:24 [RFC][PATCH] vmalloc: simplify vread()/vwrite() Wu Fengguang
2010-01-07  1:38 ` KAMEZAWA Hiroyuki
2010-01-07  2:50   ` Wu Fengguang
2010-01-07  2:57     ` KAMEZAWA Hiroyuki
2010-01-07  3:21       ` Wu Fengguang
2010-01-07  3:15     ` Huang Ying
2010-01-07  3:23       ` KAMEZAWA Hiroyuki
2010-01-07  5:24         ` Wu Fengguang
2010-01-07  5:39           ` KAMEZAWA Hiroyuki

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).