public inbox for linux-arch@vger.kernel.org
 help / color / mirror / Atom feed
* mremap() BUG on virtually indexed caches
@ 2006-06-01 21:04 David Miller
  2006-06-01 22:33 ` David Miller
  0 siblings, 1 reply; 3+ messages in thread
From: David Miller @ 2006-06-01 21:04 UTC (permalink / raw)
  To: linux-arch


I just discovered a serious problem with mremap() on platforms that
use virtually indexed caches.  It results in illegal cache aliases and
corrupt data, even for anonymous mmap()'s.  The bug has essentially
been around forever.

The problem occurs when MREMAP_MAYMOVE is passed into mremap(), the
area is moved instead of simply expanded/shrunk, and the virtual
address chosen for the new area has a different "virtual color" from
the previous location.

To be honest, the "ZERO_PAGE() mips" stuff for move_pte() should have
been a big red flag.

I'm actually surprised this bug didn't get spotted earlier.  For
example, glibc uses mmap() regions for malloc() buffers if they are
>128K in size, or something like that.  If you realloc() on such a
buffer then mremap() with MREMAP_MAYMOVE is used by glibc to expand
the area.

Guess what uses realloc() a _lot_ with >128K buffers?  dpkg.  When
it's parsing and building package database files, it uses this
expanding buffer thing called varbuf, every time a varbuf needs to
expand it uses realloc().

So the bug reports I saw on sparc64 were cases of the Debian package
database 'status' and 'available' files getting corrupted on Debian
and Debian derived systems such as Ubuntu.  The corrupted pieces were
always some multiple of the cacheline size, and once I
strace'd dpkg and saw the voluminous mremap() calls it became clear
what might cause this.

There are several ways to deal with this bug, one example is attached
below which involved overriding move_pte() on the relevant platforms
in order to do the cache flush if necessary.  Another possible
solution is to make get_unmapped_area() smarter about this case, such
that it will choose the same virtual color as the existing mapping.
That would be desirable because it would eliminate all of the cache
flushing.

I'm working on a reproducible test case for this bug so that 1) I can
validate the bug and 2) other architecture maintainers can try the
test case on their platform.  This test case isn't easy to write
because you have to use carefully chosen guard mmap() pages in order
to both force a mremap() move during expansion, and also to force
get_unmapped_area() into choosing an area with a different virtual
color.  :-)

Anyways, I'll post that test case here once I have it working.

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 358e4d3..c2059a3 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -159,17 +159,8 @@ static inline void ptep_set_wrprotect(st
 #define lazy_mmu_prot_update(pte)	do { } while (0)
 #endif
 
-#ifndef __HAVE_ARCH_MULTIPLE_ZERO_PAGE
+#ifndef __HAVE_ARCH_MOVE_PTE
 #define move_pte(pte, prot, old_addr, new_addr)	(pte)
-#else
-#define move_pte(pte, prot, old_addr, new_addr)				\
-({									\
- 	pte_t newpte = (pte);						\
-	if (pte_present(pte) && pfn_valid(pte_pfn(pte)) &&		\
-			pte_page(pte) == ZERO_PAGE(old_addr))		\
-		newpte = mk_pte(ZERO_PAGE(new_addr), (prot));		\
-	newpte;								\
-})
 #endif
 
 /*
diff --git a/include/asm-mips/pgtable.h b/include/asm-mips/pgtable.h
index 702a28f..69cebbd 100644
--- a/include/asm-mips/pgtable.h
+++ b/include/asm-mips/pgtable.h
@@ -70,7 +70,15 @@ extern unsigned long zero_page_mask;
 #define ZERO_PAGE(vaddr) \
 	(virt_to_page(empty_zero_page + (((unsigned long)(vaddr)) & zero_page_mask)))
 
-#define __HAVE_ARCH_MULTIPLE_ZERO_PAGE
+#define __HAVE_ARCH_MOVE_PTE
+#define move_pte(pte, prot, old_addr, new_addr)				\
+({									\
+ 	pte_t newpte = (pte);						\
+	if (pte_present(pte) && pfn_valid(pte_pfn(pte)) &&		\
+			pte_page(pte) == ZERO_PAGE(old_addr))		\
+		newpte = mk_pte(ZERO_PAGE(new_addr), (prot));		\
+	newpte;								\
+})
 
 extern void paging_init(void);
 
diff --git a/include/asm-sparc64/pgtable.h b/include/asm-sparc64/pgtable.h
index c44e746..afe6b97 100644
--- a/include/asm-sparc64/pgtable.h
+++ b/include/asm-sparc64/pgtable.h
@@ -689,6 +689,23 @@ static inline void set_pte_at(struct mm_
 #define pte_clear(mm,addr,ptep)		\
 	set_pte_at((mm), (addr), (ptep), __pte(0UL))
 
+#ifdef DCACHE_ALIASING_POSSIBLE
+#define __HAVE_ARCH_MOVE_PTE
+#define move_pte(pte, prot, old_addr, new_addr)				\
+({									\
+ 	pte_t newpte = (pte);						\
+	if (tlb_type != hypervisor && pte_present(pte)) {		\
+		unsigned long __pfn = pte_pfn(pte);			\
+									\
+		if (pfn_valid(__pfn) &&					\
+		    (((old_addr) ^ (new_addr)) & (1 << 13)))		\
+			flush_dcache_page_all(current->mm,		\
+					      pfn_to_page(__pfn));	\
+	}								\
+	newpte;								\
+})
+#endif
+
 extern pgd_t swapper_pg_dir[2048];
 extern pmd_t swapper_low_pmd_dir[2048];
 

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: mremap() BUG on virtually indexed caches
  2006-06-01 21:04 mremap() BUG on virtually indexed caches David Miller
@ 2006-06-01 22:33 ` David Miller
  2006-06-02  0:42   ` David Miller
  0 siblings, 1 reply; 3+ messages in thread
From: David Miller @ 2006-06-01 22:33 UTC (permalink / raw)
  To: linux-arch


Ignore my test program, it's seriously buggy :)

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: mremap() BUG on virtually indexed caches
  2006-06-01 22:33 ` David Miller
@ 2006-06-02  0:42   ` David Miller
  0 siblings, 0 replies; 3+ messages in thread
From: David Miller @ 2006-06-02  0:42 UTC (permalink / raw)
  To: linux-arch

From: David Miller <davem@davemloft.net>
Date: Thu, 01 Jun 2006 15:33:45 -0700 (PDT)

> Ignore my test program, it's seriously buggy :)

Ok, this version of the test program below works for me and I verified
my fix as well (after fixing some local variable name clashes in my
move_pte() macro, notably "__pfn" which is also used by pfn_to_page()
which resulted in fun oopses :-)

The biggest pain is that the MREMAP_FIXED stuff is not available in
anything other than current glibc's, and the only way to get at the
extra mremap() argument is to do everything usually obtained from
sys/mman.h by hand :( I tried to get all the platform cases right, but
please double check before you try this test program.

Thanks!

/* mremap() stress tester for D-cache aliasing platforms.
 *
 * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
 *
 * It tries to exercise the case where mremap() with MREMAP_MAYMOVE
 * will actually place the area somewhere else and not just extend
 * or shrink at the existing mapping location.
 *
 * This can cause problems on virtually indexed cache platforms
 * if they do not implement move_pte() with logic to handle a
 * change of virtual color.  If the cache virtual color changes
 * when mremap() moves the mapping around, we can end up accessing
 * stale aliases in the cache on subsequent cpu accesses to the
 * new virtual addresses.
 *
 * This bug was first discovered as file corruption occurring occasionally
 * in 'dpkg'.  When 'dpkg' is building a 'status' or 'available' file it
 * uses an expanding allocator called 'varbuf' which uses realloc()
 * heavily to expand its internal buffer.  In glibc, for very large
 * malloc() buffer sizes, realloc() uses mremap() with MREMAP_MAYMOVE to
 * try and satisfy expansion requests.  Most of the time there is room
 * in the address space, but if we bump up against another mmap() region
 * it can move things around.  If this results in different D-cache coloring
 * for the region we can end up reading corrupt data from existing aliases
 * in the D-cache.
 *
 * It was very hard to reproduce, so this test case was written.
 */

#define _GNU_SOURCE
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>

/* XXX There is no easy way to get at mremap()'s MREMAP_FIXED functionality
 * XXX with older glibc versions, so the prototypes and flag values that
 * XXX sys/mman.h would normally provide are declared by hand below.
 */
extern void *mremap(void *old_address, size_t old_size, size_t new_size,
		    unsigned long flags, ...);
# define MREMAP_MAYMOVE	1
# define MREMAP_FIXED	2

extern void *mmap(void *start, size_t length, int prot, int flags, int fd,
		  off_t offset);

/* Same on all platforms... */
#define PROT_READ	0x1		/* Page can be read.  */
#define PROT_WRITE	0x2		/* Page can be written.  */

/* MAP_FIXED's value differs per architecture.  */
#if defined(__alpha__)
#define MAP_FIXED	0x100		/* Interpret addr exactly.  */
#else
#if defined(__parisc__)
#define MAP_FIXED	0x04		/* Interpret addr exactly.  */
#else
#define MAP_FIXED	0x10		/* Interpret addr exactly.  */
#endif
#endif

/* So does MAP_ANONYMOUS.  */
#if defined(__alpha__) || defined(__parisc__)
#define MAP_ANONYMOUS	0x10		/* Don't use a file.  */
#else
#if defined(__mips__) || defined(__xtensa__)
#define MAP_ANONYMOUS	0x800		/* Don't use a file.  */
#else
#define MAP_ANONYMOUS	0x20		/* Don't use a file.  */
#endif
#endif

#define MAP_PRIVATE	0x02		/* Changes are private.  */

#define MAP_FAILED	((void *) -1)

/* System page size, cached at startup (see main() and init_main_buf()). */
static int page_size;

/* Base of the reserved address range used as the target area for the
 * MREMAP_FIXED moves performed by remap_page(). */
static void *unmapped_region;

/* Number of page-sized slots in the reserved region that the buffer is
 * cycled through. */
#define MAX_OFFSET	8

/*
 * Set up the address-space layout for the test.
 *
 * A (MAX_OFFSET + 1)-page read-only anonymous region is reserved first,
 * giving us a known stretch of virtual addresses for later MREMAP_FIXED
 * targets.  The one-page working buffer is then mapped read-write over
 * the first page of that reservation using MAP_FIXED.
 *
 * On success, fills in *main_buf_p / *main_buf_size_p and returns 0;
 * on mmap() failure, prints an error and returns 1.
 */
static int init_main_buf(void **main_buf_p, int *main_buf_size_p)
{
	size_t region_bytes;

	page_size = getpagesize();
	*main_buf_size_p = page_size;

	region_bytes = (MAX_OFFSET + 1) * page_size;
	unmapped_region = mmap(NULL, region_bytes, PROT_READ,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (unmapped_region == (void *) MAP_FAILED) {
		perror("mmap() of unmapped_region");
		return 1;
	}
	fprintf(stdout, "unmapped region at %p\n", unmapped_region);
	fflush(stdout);

	*main_buf_p = mmap(unmapped_region, *main_buf_size_p,
			   PROT_READ | PROT_WRITE,
			   MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
			   -1, 0);
	if (*main_buf_p == (void *) MAP_FAILED) {
		perror("Initial 1-page mmap() of main_buf");
		return 1;
	}
	fprintf(stdout, "Initial main_buf at %p\n", *main_buf_p);
	fflush(stdout);

	return 0;
}

/* Tear down the working buffer's mapping; munmap() errors are ignored
 * since this only runs on the way out of the test. */
static void destroy_main_buf(void *buf, int buf_size)
{
	(void) munmap(buf, buf_size);
}

/* Pattern seed folded into every word written by fill_page(); bumped by
 * do_test() after each move/check cycle so that stale cached data from a
 * previous cycle can never accidentally match the expected values. */
static unsigned int key = 0x00000001;

/*
 * Fill the page at 'start' with a position-dependent pattern: word i
 * receives the value i + key.  Each word is first read through a
 * volatile pointer so the cache line is pulled in at the current
 * virtual address before the write, setting up a potential stale
 * alias if the kernel mishandles a later color-changing mremap().
 *
 * Fix: the original statement was
 *	*p++ = (unsigned int) (p - (unsigned int *) start) + key;
 * which both reads and modifies 'p' unsequenced within a single
 * expression -- undefined behavior (C11 6.5.16p3, CERT EXP30-C).
 * The increment is now a separate statement, pinning the intended
 * "index + key" value that check_page() expects.
 */
static void fill_page(void *start)
{
	unsigned int *base = start;
	unsigned int *p = base;
	unsigned int *end = base + page_size / sizeof(*base);

	while (p < end) {
		volatile unsigned int *xp = (volatile unsigned int *) p;

		/* Touch the word before writing it. */
		(void) *xp;

		*p = (unsigned int) (p - base) + key;
		p++;
	}
}

/*
 * Move the working buffer to slot 'off' of the reserved region with
 * mremap(MREMAP_FIXED | MREMAP_MAYMOVE), keeping its size unchanged.
 * Returns 0 on success; prints an error and returns 1 on failure.
 */
static int remap_page(void **main_buf_p, int *main_buf_size_p, int off)
{
	void *target = (char *) unmapped_region + (off * page_size);
	void *moved;

	moved = mremap(*main_buf_p, *main_buf_size_p, *main_buf_size_p,
		       MREMAP_FIXED | MREMAP_MAYMOVE, target);
	if (moved == (void *) MAP_FAILED) {
		perror("mremap() of main_buf");
		return 1;
	}

	*main_buf_p = moved;
	return 0;
}

/*
 * Verify that the page at 'start' still holds the pattern written by
 * fill_page() under the current 'key'.  Exits with status 99 on the
 * first mismatching word, which is how a detected cache-aliasing
 * corruption terminates the test.
 */
static void check_page(void *start)
{
	unsigned int *base = start;
	unsigned int *limit = (unsigned int *) ((char *) start + page_size);
	unsigned int *p;

	for (p = base; p < limit; p++) {
		unsigned int got = *p;
		unsigned int want = (unsigned int) (p - base) + key;

		if (got != want) {
			fprintf(stderr, "Bogus value %08x should be %08x\n",
				got, want);
			exit(99);
		}
	}
}

/*
 * Main stress loop: cycle the working buffer through each of the
 * MAX_OFFSET target slots forever, writing a fresh pattern before
 * every move and verifying it afterwards.  'key' changes on every
 * iteration so stale cached data can never accidentally match.
 *
 * Returns 1 on setup or mremap() failure; otherwise the loop only
 * ends via check_page()'s exit(99) on corruption or an external
 * signal (this is a stress tester, running forever is by design).
 */
static int do_test(void)
{
	void *main_buf;
	int main_buf_size;
	int err, i;

	err = init_main_buf(&main_buf, &main_buf_size);
	if (err)
		return 1;

	while (1) {
		for (i = 0; i < MAX_OFFSET; i++) {
			fill_page(main_buf);

			/* Fix: the return value used to be ignored.  On a
			 * failed mremap() the buffer has not moved, so
			 * check_page() would be testing the wrong thing;
			 * bail out cleanly instead. */
			if (remap_page(&main_buf, &main_buf_size, i + 1)) {
				destroy_main_buf(main_buf, main_buf_size);
				return 1;
			}

			check_page(main_buf);

			key++;
		}
	}

	destroy_main_buf(main_buf, main_buf_size);

	return 0;
}

/* Entry point: cache the system page size, then run the endless
 * mremap() stress test.  Command-line arguments are unused. */
int main(int argc, char **argv, char **envp)
{
	(void) argc;
	(void) argv;
	(void) envp;

	page_size = getpagesize();

	return do_test();
}

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2006-06-02  0:42 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-06-01 21:04 mremap() BUG on virtually indexed caches David Miller
2006-06-01 22:33 ` David Miller
2006-06-02  0:42   ` David Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox