public inbox for linux-ia64@vger.kernel.org
 help / color / mirror / Atom feed
From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
To: linux-ia64@vger.kernel.org
Subject: Kernel bug fix: hugepage_free_pgtables()
Date: Wed, 24 Dec 2003 23:46:25 +0000	[thread overview]
Message-ID: <marc-linux-ia64-107230960711204@msgid-missing> (raw)

[-- Attachment #1: Type: text/plain, Size: 1612 bytes --]

We recently discovered a bug on ia64 when unmapping an address space
that belongs to the huge page region.  The generic code unmap_region()
calls free_pgtables() to free any pages that are used for page tables.
However, it does not differentiate whether that region is mapped with
normal pages or huge tlb pages.  The problem arises when free_pgtables()
calculates the PGDIR-aligned area based on the default page size, where
in the huge page case it should really be using the huge tlb page size
instead.  The pgd_index calculation should also be adjusted accordingly.

So we need architecture-specific code to handle the huge tlb cases.  It
also requires changes in the generic part of the kernel.  The generic
kernel changes have already made it into Andrew Morton's 2.6.0-mm1.
Here is the ia64 part of the patch to adopt the new free page table
semantics.

A bit more detail on the kernel bug:  When there are two huge page
mappings, like the two in the example below, the first one ending at a
PGDIR_SIZE boundary and the second one starting at the next PGDIR_SIZE
boundary (64GB with 16K page size):
8000000ff0000000-8000001000000000 rw-s
8000001000000000-8000001010000000 rw-s
Unmapping the first vma tricks free_pgtables() into thinking it can
remove one set of pgds indexed at 0x400, and it goes ahead and purges
the entire pmd/pte set that is still in use by the second mapping.  Any
subsequent access to the pmd/pte for the second, still-active mapping
then triggers the bug.  We've seen hard kernel hangs on some platforms,
while other platforms generate an MCA, plus all kinds of other
unpleasant results.

David, please apply for 2.6.

- Ken

[-- Attachment #2: free_pgt.ia64.patch --]
[-- Type: application/octet-stream, Size: 4118 bytes --]

diff -Nur linux-2.6.0/arch/ia64/mm/hugetlbpage.c linux-2.6.0-ken/arch/ia64/mm/hugetlbpage.c
--- linux-2.6.0/arch/ia64/mm/hugetlbpage.c	2003-12-17 18:58:56.000000000 -0800
+++ linux-2.6.0-ken/arch/ia64/mm/hugetlbpage.c	2003-12-23 13:48:23.000000000 -0800
@@ -144,17 +144,6 @@
 
 	return 0;
 }
-/* This function checks if the address and address+len falls out of HugeTLB region.  It
- * return -EINVAL if any part of address range falls in HugeTLB region.
- */
-int  check_valid_hugepage_range(unsigned long addr, unsigned long len)
-{
-	if (REGION_NUMBER(addr) == REGION_HPAGE)
-		return -EINVAL;
-	if (REGION_NUMBER(addr+len) == REGION_HPAGE)
-		return -EINVAL;
-	return 0;
-}
 
 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			struct vm_area_struct *vma)
@@ -272,6 +261,59 @@
 	free_huge_page(page);
 }
 
+/*
+ * Same as generic free_pgtables(), except constant PGDIR_* and pgd_offset
+ * are hugetlb region specific.
+ */
+void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
+	unsigned long start, unsigned long end)
+{
+	unsigned long first = start & HUGETLB_PGDIR_MASK;
+	unsigned long last = end + HUGETLB_PGDIR_SIZE - 1;
+	unsigned long start_index, end_index;
+	struct mm_struct *mm = tlb->mm;
+
+	if (!prev) {
+		prev = mm->mmap;
+		if (!prev)
+			goto no_mmaps;
+		if (prev->vm_end > start) {
+			if (last > prev->vm_start)
+				last = prev->vm_start;
+			goto no_mmaps;
+		}
+	}
+	for (;;) {
+		struct vm_area_struct *next = prev->vm_next;
+
+		if (next) {
+			if (next->vm_start < start) {
+				prev = next;
+				continue;
+			}
+			if (last > next->vm_start)
+				last = next->vm_start;
+		}
+		if (prev->vm_end > first)
+			first = prev->vm_end + HUGETLB_PGDIR_SIZE - 1;
+		break;
+	}
+no_mmaps:
+	if (last < first)	/* for arches with discontiguous pgd indices */
+		return;
+	/*
+	 * If the PGD bits are not consecutive in the virtual address, the
+	 * old method of shifting the VA >> by PGDIR_SHIFT doesn't work.
+	 */
+
+	start_index = pgd_index(htlbpage_to_page(first));
+	end_index = pgd_index(htlbpage_to_page(last));
+
+	if (end_index > start_index) {
+		clear_page_tables(tlb, start_index, end_index - start_index);
+	}
+}
+
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end)
 {
 	struct mm_struct *mm = vma->vm_mm;
diff -Nur linux-2.6.0/include/asm-ia64/page.h linux-2.6.0-ken/include/asm-ia64/page.h
--- linux-2.6.0/include/asm-ia64/page.h	2003-12-17 18:58:15.000000000 -0800
+++ linux-2.6.0-ken/include/asm-ia64/page.h	2003-12-23 11:28:48.000000000 -0800
@@ -63,7 +63,7 @@
 # define HPAGE_SIZE	(__IA64_UL_CONST(1) << HPAGE_SHIFT)
 # define HPAGE_MASK	(~(HPAGE_SIZE - 1))
 # define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
-# define ARCH_HAS_VALID_HUGEPAGE_RANGE
+# define ARCH_HAS_HUGEPAGE_ONLY_RANGE
 #endif /* CONFIG_HUGETLB_PAGE */
 
 #ifdef __ASSEMBLY__
@@ -137,7 +137,9 @@
 # define htlbpage_to_page(x)	((REGION_NUMBER(x) << 61)				\
 				 | (REGION_OFFSET(x) >> (HPAGE_SHIFT-PAGE_SHIFT)))
 # define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
-extern int  check_valid_hugepage_range(unsigned long addr, unsigned long len);
+# define is_hugepage_only_range(addr, len)		\
+	 (REGION_NUMBER(addr) == REGION_HPAGE &&	\
+	  REGION_NUMBER((addr)+(len)) == REGION_HPAGE)
 #endif
 
 static __inline__ int
diff -Nur linux-2.6.0/include/asm-ia64/pgtable.h linux-2.6.0-ken/include/asm-ia64/pgtable.h
--- linux-2.6.0/include/asm-ia64/pgtable.h	2003-12-17 18:58:39.000000000 -0800
+++ linux-2.6.0-ken/include/asm-ia64/pgtable.h	2003-12-23 13:46:14.000000000 -0800
@@ -455,6 +455,15 @@
 /* We provide our own get_unmapped_area to cope with VA holes for userland */
 #define HAVE_ARCH_UNMAPPED_AREA
 
+#ifdef CONFIG_HUGETLB_PAGE
+#define HUGETLB_PGDIR_SHIFT	(HPAGE_SHIFT + 2*(PAGE_SHIFT-3))
+#define HUGETLB_PGDIR_SIZE	(__IA64_UL(1) << HUGETLB_PGDIR_SHIFT)
+#define HUGETLB_PGDIR_MASK	(~(HUGETLB_PGDIR_SIZE-1))
+struct mmu_gather;
+extern void hugetlb_free_pgtables(struct mmu_gather *tlb,
+	struct vm_area_struct * prev, unsigned long start, unsigned long end);
+#endif
+
 typedef pte_t *pte_addr_t;
 
 /*

                 reply	other threads:[~2003-12-24 23:46 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=marc-linux-ia64-107230960711204@msgid-missing \
    --to=kenneth.w.chen@intel.com \
    --cc=linux-ia64@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox