From: Andrew Morton <akpm@digeo.com>
To: davem@redhat.com, rohit.seth@intel.com, davidm@napali.hpl.hp.com,
anton@samba.org, wli@holomorphy.com, linux-mm@kvack.org
Subject: Re: hugepage patches
Date: Fri, 31 Jan 2003 15:18:04 -0800 [thread overview]
Message-ID: <20030131151804.68b9c1ce.akpm@digeo.com> (raw)
In-Reply-To: <20030131151501.7273a9bf.akpm@digeo.com>
3/4
We currently have a problem when things like ptrace, futexes and direct-io
try to pin user pages. If the user's address is in a huge page we're
elevting the refcount of a constituent 4k page, not the head page of the
high-order allocation unit.
To solve this, a generic way of handling higher-order pages has been
implemented:
- A higher-order page is called a "compound page". Chose this because
"huge page", "large page", "super page", etc all seem to mean different
things to different people.
- The first (controlling) 4k page of a compound page is referred to as the
"head" page.
- The remaining pages are tail pages.
All pages have PG_compound set. All pages have their lru.next pointing at
the head page (even the head page has this).
The head page's lru.prev, if non-zero, holds the address of the compound
page's put_page() function.
The order of the allocation is stored in the first tail page's lru.prev.
This is only for debug at present. This usage means that zero-order pages
may not be compound.
The above relationships are established for _all_ higher-order pages in the
page allocator. Which has some cost, but not much - another atomic op during
fork(), mainly.
This functionality is only enabled if CONFIG_HUGETLB_PAGE, although it could
be turned on permanently. There's a little extra cost in get_page/put_page.
linux/mm.h | 35 ++++++++++++++++++++++++++--
linux/page-flags.h | 7 ++++-
page_alloc.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 105 insertions(+), 3 deletions(-)
diff -puN include/linux/page-flags.h~compound-pages include/linux/page-flags.h
--- 25/include/linux/page-flags.h~compound-pages 2003-01-30 23:43:18.000000000 -0800
+++ 25-akpm/include/linux/page-flags.h 2003-01-30 23:43:18.000000000 -0800
@@ -72,7 +72,8 @@
#define PG_direct 16 /* ->pte_chain points directly at pte */
#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */
-#define PG_reclaim 18 /* To be recalimed asap */
+#define PG_reclaim 18 /* To be reclaimed asap */
+#define PG_compound 19 /* Part of a compound page */
/*
* Global page accounting. One instance per CPU. Only unsigned longs are
@@ -251,6 +252,10 @@ extern void get_full_page_state(struct p
#define ClearPageReclaim(page) clear_bit(PG_reclaim, &(page)->flags)
#define TestClearPageReclaim(page) test_and_clear_bit(PG_reclaim, &(page)->flags)
+#define PageCompound(page) test_bit(PG_compound, &(page)->flags)
+#define SetPageCompound(page) set_bit(PG_compound, &(page)->flags)
+#define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags)
+
/*
* The PageSwapCache predicate doesn't use a PG_flag at this time,
* but it may again do so one day.
diff -puN mm/page_alloc.c~compound-pages mm/page_alloc.c
--- 25/mm/page_alloc.c~compound-pages 2003-01-30 23:43:18.000000000 -0800
+++ 25-akpm/mm/page_alloc.c 2003-01-31 01:47:02.000000000 -0800
@@ -85,6 +85,62 @@ static void bad_page(const char *functio
page->mapping = NULL;
}
+#ifndef CONFIG_HUGETLB_PAGE
+#define prep_compound_page(page, order) do { } while (0)
+#define destroy_compound_page(page, order) do { } while (0)
+#else
+/*
+ * Higher-order pages are called "compound pages". They are structured thusly:
+ *
+ * The first PAGE_SIZE page is called the "head page".
+ *
+ * The remaining PAGE_SIZE pages are called "tail pages".
+ *
+ * All pages have PG_compound set. All pages have their lru.next pointing at
+ * the head page (even the head page has this).
+ *
+ * The head page's lru.prev, if non-zero, holds the address of the compound
+ * page's put_page() function.
+ *
+ * The order of the allocation is stored in the first tail page's lru.prev.
+ * This is only for debug at present. This usage means that zero-order pages
+ * may not be compound.
+ */
+static void prep_compound_page(struct page *page, int order)
+{
+ int i;
+ int nr_pages = 1 << order;
+
+ page->lru.prev = NULL;
+ page[1].lru.prev = (void *)order;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *p = page + i;
+
+ SetPageCompound(p);
+ p->lru.next = (void *)page;
+ }
+}
+
+static void destroy_compound_page(struct page *page, int order)
+{
+ int i;
+ int nr_pages = 1 << order;
+
+ if (page[1].lru.prev != (void *)order)
+ bad_page(__FUNCTION__, page);
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *p = page + i;
+
+ if (!PageCompound(p))
+ bad_page(__FUNCTION__, page);
+ if (p->lru.next != (void *)page)
+ bad_page(__FUNCTION__, page);
+ ClearPageCompound(p);
+ }
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
/*
* Freeing function for a buddy system allocator.
*
@@ -114,6 +170,8 @@ static inline void __free_pages_bulk (st
{
unsigned long page_idx, index;
+ if (order)
+ destroy_compound_page(page, order);
page_idx = page - base;
if (page_idx & ~mask)
BUG();
@@ -409,6 +467,12 @@ void free_cold_page(struct page *page)
free_hot_cold_page(page, 1);
}
+/*
+ * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
+ * we cheat by calling it from here, in the order > 0 path. Saves a branch
+ * or two.
+ */
+
static struct page *buffered_rmqueue(struct zone *zone, int order, int cold)
{
unsigned long flags;
@@ -435,6 +499,8 @@ static struct page *buffered_rmqueue(str
spin_lock_irqsave(&zone->lock, flags);
page = __rmqueue(zone, order);
spin_unlock_irqrestore(&zone->lock, flags);
+ if (order && page)
+ prep_compound_page(page, order);
}
if (page != NULL) {
diff -puN include/linux/mm.h~compound-pages include/linux/mm.h
--- 25/include/linux/mm.h~compound-pages 2003-01-30 23:43:18.000000000 -0800
+++ 25-akpm/include/linux/mm.h 2003-01-30 23:43:18.000000000 -0800
@@ -208,24 +208,55 @@ struct page {
* Also, many kernel routines increase the page count before a critical
* routine so they can be sure the page doesn't go away from under them.
*/
-#define get_page(p) atomic_inc(&(p)->count)
-#define __put_page(p) atomic_dec(&(p)->count)
#define put_page_testzero(p) \
({ \
BUG_ON(page_count(page) == 0); \
atomic_dec_and_test(&(p)->count); \
})
+
#define page_count(p) atomic_read(&(p)->count)
#define set_page_count(p,v) atomic_set(&(p)->count, v)
+#define __put_page(p) atomic_dec(&(p)->count)
extern void FASTCALL(__page_cache_release(struct page *));
+#ifdef CONFIG_HUGETLB_PAGE
+
+static inline void get_page(struct page *page)
+{
+ if (PageCompound(page))
+ page = (struct page *)page->lru.next;
+ atomic_inc(&page->count);
+}
+
static inline void put_page(struct page *page)
{
+ if (PageCompound(page)) {
+ page = (struct page *)page->lru.next;
+ if (page->lru.prev) { /* destructor? */
+ (*(void (*)(struct page *))page->lru.prev)(page);
+ return;
+ }
+ }
if (!PageReserved(page) && put_page_testzero(page))
__page_cache_release(page);
}
+#else /* CONFIG_HUGETLB_PAGE */
+
+static inline void get_page(struct page *page)
+{
+ atomic_inc(&page->count);
+}
+
+static inline void put_page(struct page *page)
+{
+ if (!PageReserved(page) && put_page_testzero(page))
+ __page_cache_release(page);
+}
+
+#endif /* CONFIG_HUGETLB_PAGE */
+
/*
* Multiple processes may "see" the same page. E.g. for untouched
* mappings of /dev/null, all processes see the same page full of
_
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/
next prev parent reply other threads:[~2003-01-31 23:15 UTC|newest]
Thread overview: 48+ messages / expand[flat|nested] mbox.gz Atom feed top
2003-01-31 23:15 hugepage patches Andrew Morton
2003-01-31 23:13 ` David S. Miller
2003-01-31 23:36 ` Andrew Morton
2003-01-31 23:23 ` David S. Miller
2003-01-31 23:45 ` Andrew Morton
2003-01-31 23:48 ` David S. Miller
2003-01-31 23:16 ` Andrew Morton
2003-01-31 23:17 ` Andrew Morton
2003-01-31 23:18 ` Andrew Morton [this message]
2003-01-31 23:18 ` Andrew Morton
2003-02-01 8:58 ` Ingo Oeser
2003-02-01 9:31 ` Andrew Morton
2003-02-01 10:00 ` William Lee Irwin III
2003-02-01 10:14 ` Andrew Morton
2003-02-02 10:55 ` Andrew Morton
2003-02-02 10:55 ` Andrew Morton
2003-02-02 19:59 ` William Lee Irwin III
2003-02-02 20:49 ` Andrew Morton
2003-02-03 15:09 ` Eric W. Biederman
2003-02-03 21:29 ` Andrew Morton
2003-02-04 5:37 ` Eric W. Biederman
2003-02-04 5:50 ` William Lee Irwin III
2003-02-04 7:06 ` Eric W. Biederman
2003-02-04 7:16 ` Martin J. Bligh
2003-02-04 12:40 ` Eric W. Biederman
2003-02-04 15:55 ` Martin J. Bligh
2003-02-05 12:18 ` Eric W. Biederman
2003-02-04 21:12 ` Andrew Morton
2003-02-05 12:25 ` Eric W. Biederman
2003-02-05 19:57 ` Andrew Morton
2003-02-05 20:00 ` Andrew Morton
2003-02-02 10:55 ` Andrew Morton
2003-02-02 10:56 ` Andrew Morton
2003-02-02 20:06 ` William Lee Irwin III
2003-02-02 10:56 ` Andrew Morton
2003-02-02 10:56 ` Andrew Morton
2003-02-02 10:57 ` Andrew Morton
2003-02-02 10:57 ` Andrew Morton
2003-02-02 20:17 ` William Lee Irwin III
2003-02-02 10:57 ` Andrew Morton
-- strict thread matches above, loose matches on Subject: below --
2003-02-07 21:49 Seth, Rohit
2003-02-07 22:00 ` Andrew Morton
2003-02-07 22:02 Seth, Rohit
2003-02-07 22:24 ` Andrew Morton
2003-02-08 1:47 Seth, Rohit
2003-02-08 2:02 ` Andrew Morton
2003-02-08 3:05 Seth, Rohit
2003-02-08 8:48 ` Andrew Morton
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20030131151804.68b9c1ce.akpm@digeo.com \
--to=akpm@digeo.com \
--cc=anton@samba.org \
--cc=davem@redhat.com \
--cc=davidm@napali.hpl.hp.com \
--cc=linux-mm@kvack.org \
--cc=rohit.seth@intel.com \
--cc=wli@holomorphy.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.