All of lore.kernel.org
 help / color / mirror / Atom feed
From: Nick Piggin <nickpiggin@yahoo.com.au>
To: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: linux-kernel <linux-kernel@vger.kernel.org>,
	Linux Memory Management <linux-mm@kvack.org>
Subject: [patch 5] mm: lockless pagecache lookups
Date: Mon, 27 Jun 2005 16:34:43 +1000	[thread overview]
Message-ID: <42BF9E03.9050507@yahoo.com.au> (raw)
In-Reply-To: <42BF9DE5.6010701@yahoo.com.au>

[-- Attachment #1: Type: text/plain, Size: 28 bytes --]


-- 
SUSE Labs, Novell Inc.

[-- Attachment #2: mm-lockless-pagecache-lookups.patch --]
[-- Type: text/plain, Size: 11976 bytes --]

Use the speculative get_page and the lockless radix tree lookups
to introduce lockless page cache lookups (i.e. no mapping->tree_lock).

The only atomicity change this should introduce is the use of a
non-atomic pagevec lookup for truncate; however, whatever atomicity
guarantees existed before were probably not very useful anyway.

Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -378,18 +378,25 @@ int add_to_page_cache(struct page *page,
 	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 
 	if (error == 0) {
+		page_cache_get(page);
+		__SetPageLocked(page);
+		page->mapping = mapping;
+		page->index = offset;
+
 		write_lock_irq(&mapping->tree_lock);
 		error = radix_tree_insert(&mapping->page_tree, offset, page);
 		if (!error) {
-			page_cache_get(page);
-			SetPageLocked(page);
-			page->mapping = mapping;
-			page->index = offset;
 			mapping->nrpages++;
 			pagecache_acct(1);
 		}
 		write_unlock_irq(&mapping->tree_lock);
 		radix_tree_preload_end();
+
+		if (error) {
+			page->mapping = NULL;
+			__put_page(page);
+			__ClearPageLocked(page);
+		}
 	}
 	return error;
 }
@@ -499,13 +506,13 @@ EXPORT_SYMBOL(__lock_page);
  */
 struct page * find_get_page(struct address_space *mapping, unsigned long offset)
 {
-	struct page *page;
+	struct page **pagep;
+	struct page *page = NULL;
 
-	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
-	if (page)
-		page_cache_get(page);
-	read_unlock_irq(&mapping->tree_lock);
+	pagep = (struct page **)radix_tree_lookup_slot(&mapping->page_tree,
+									offset);
+	if (pagep)
+		page = page_cache_get_speculative(pagep);
 	return page;
 }
 
@@ -518,12 +525,24 @@ struct page *find_trylock_page(struct ad
 {
 	struct page *page;
 
-	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
-	if (page && TestSetPageLocked(page))
-		page = NULL;
-	read_unlock_irq(&mapping->tree_lock);
-	return page;
+	page = find_get_page(mapping, offset);
+	if (page) {
+		if (TestSetPageLocked(page))
+			goto out_failed;
+		/* Has the page been truncated before being locked? */
+		if (page->mapping != mapping || page->index != offset) {
+			unlock_page(page);
+			goto out_failed;
+		}
+
+		/* Silly interface requires us to drop the refcount */
+		__put_page(page);
+		return page;
+
+out_failed:
+		page_cache_release(page);
+	}
+	return NULL;
 }
 
 EXPORT_SYMBOL(find_trylock_page);
@@ -544,25 +563,17 @@ struct page *find_lock_page(struct addre
 {
 	struct page *page;
 
-	read_lock_irq(&mapping->tree_lock);
 repeat:
-	page = radix_tree_lookup(&mapping->page_tree, offset);
+	page = find_get_page(mapping, offset);
 	if (page) {
-		page_cache_get(page);
-		if (TestSetPageLocked(page)) {
-			read_unlock_irq(&mapping->tree_lock);
-			lock_page(page);
-			read_lock_irq(&mapping->tree_lock);
-
-			/* Has the page been truncated while we slept? */
-			if (page->mapping != mapping || page->index != offset) {
-				unlock_page(page);
-				page_cache_release(page);
-				goto repeat;
-			}
+		lock_page(page);
+		/* Has the page been truncated before being locked? */
+		if (page->mapping != mapping || page->index != offset) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto repeat;
 		}
 	}
-	read_unlock_irq(&mapping->tree_lock);
 	return page;
 }
 
@@ -645,6 +656,30 @@ unsigned find_get_pages(struct address_s
 	return ret;
 }
 
+unsigned find_get_pages_nonatomic(struct address_space *mapping, pgoff_t start,
+			    unsigned int nr_pages, struct page **pages)
+{
+	unsigned int i;
+	unsigned int ret;
+	unsigned int ret2;
+
+	/*
+	 * We do some unsightly casting to use the array first for storing
+	 * pointers to the page pointers, and then for the pointers to
+	 * the pages themselves that the caller wants.
+	 */
+	ret = radix_tree_gang_lookup_slot(&mapping->page_tree,
+				(void ***)pages, start, nr_pages);
+	ret2 = 0;
+	for (i = 0; i < ret; i++) {
+		struct page *page;
+		page = page_cache_get_speculative(((struct page ***)pages)[i]);
+		if (page)
+			pages[ret2++] = page;
+	}
+	return ret2;
+}
+
 /*
  * Like find_get_pages, except we only return pages which are tagged with
  * `tag'.   We update *index to index the next page for the traversal.
Index: linux-2.6/mm/readahead.c
===================================================================
--- linux-2.6.orig/mm/readahead.c
+++ linux-2.6/mm/readahead.c
@@ -272,27 +272,24 @@ __do_page_cache_readahead(struct address
 	/*
 	 * Preallocate as many pages as we will need.
 	 */
-	read_lock_irq(&mapping->tree_lock);
 	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
 		unsigned long page_offset = offset + page_idx;
 		
 		if (page_offset > end_index)
 			break;
 
+		/* Don't need mapping->tree_lock - lookup can be racy */
 		page = radix_tree_lookup(&mapping->page_tree, page_offset);
 		if (page)
 			continue;
 
-		read_unlock_irq(&mapping->tree_lock);
 		page = page_cache_alloc_cold(mapping);
-		read_lock_irq(&mapping->tree_lock);
 		if (!page)
 			break;
 		page->index = page_offset;
 		list_add(&page->lru, &page_pool);
 		ret++;
 	}
-	read_unlock_irq(&mapping->tree_lock);
 
 	/*
 	 * Now start the IO.  We ignore I/O errors - if the page is not
Index: linux-2.6/mm/swap_state.c
===================================================================
--- linux-2.6.orig/mm/swap_state.c
+++ linux-2.6/mm/swap_state.c
@@ -76,19 +76,26 @@ static int __add_to_swap_cache(struct pa
 	BUG_ON(PagePrivate(page));
 	error = radix_tree_preload(gfp_mask);
 	if (!error) {
+		page_cache_get(page);
+		SetPageLocked(page);
+		SetPageSwapCache(page);
+		page->private = entry.val;
+
 		write_lock_irq(&swapper_space.tree_lock);
 		error = radix_tree_insert(&swapper_space.page_tree,
 						entry.val, page);
 		if (!error) {
-			page_cache_get(page);
-			SetPageLocked(page);
-			SetPageSwapCache(page);
-			page->private = entry.val;
 			total_swapcache_pages++;
 			pagecache_acct(1);
 		}
 		write_unlock_irq(&swapper_space.tree_lock);
 		radix_tree_preload_end();
+
+		if (error) {
+			__put_page(page);
+			ClearPageLocked(page);
+			ClearPageSwapCache(page);
+		}
 	}
 	return error;
 }
Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h
+++ linux-2.6/include/linux/page-flags.h
@@ -167,16 +167,13 @@ extern void __mod_page_state(unsigned lo
 /*
  * Manipulation of page state flags
  */
-#define PageLocked(page)		\
-		test_bit(PG_locked, &(page)->flags)
-#define SetPageLocked(page)		\
-		set_bit(PG_locked, &(page)->flags)
-#define TestSetPageLocked(page)		\
-		test_and_set_bit(PG_locked, &(page)->flags)
-#define ClearPageLocked(page)		\
-		clear_bit(PG_locked, &(page)->flags)
-#define TestClearPageLocked(page)	\
-		test_and_clear_bit(PG_locked, &(page)->flags)
+#define PageLocked(page)	test_bit(PG_locked, &(page)->flags)
+#define SetPageLocked(page)	set_bit(PG_locked, &(page)->flags)
+#define __SetPageLocked(page)	__set_bit(PG_locked, &(page)->flags)
+#define TestSetPageLocked(page)	test_and_set_bit(PG_locked, &(page)->flags)
+#define ClearPageLocked(page)	clear_bit(PG_locked, &(page)->flags)
+#define __ClearPageLocked(page)	__clear_bit(PG_locked, &(page)->flags)
+#define TestClearPageLocked(page) test_and_clear_bit(PG_locked, &(page)->flags)
 
 #define PageError(page)		test_bit(PG_error, &(page)->flags)
 #define SetPageError(page)	set_bit(PG_error, &(page)->flags)
Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h
+++ linux-2.6/include/linux/pagemap.h
@@ -108,6 +108,8 @@ extern struct page * find_or_create_page
 				unsigned long index, unsigned int gfp_mask);
 unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 			unsigned int nr_pages, struct page **pages);
+unsigned find_get_pages_nonatomic(struct address_space *mapping, pgoff_t start,
+			unsigned int nr_pages, struct page **pages);
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 			int tag, unsigned int nr_pages, struct page **pages);
 
Index: linux-2.6/include/linux/pagevec.h
===================================================================
--- linux-2.6.orig/include/linux/pagevec.h
+++ linux-2.6/include/linux/pagevec.h
@@ -25,6 +25,8 @@ void __pagevec_lru_add_active(struct pag
 void pagevec_strip(struct pagevec *pvec);
 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
 		pgoff_t start, unsigned nr_pages);
+unsigned pagevec_lookup_nonatomic(struct pagevec *pvec,
+	struct address_space *mapping, pgoff_t start, unsigned nr_pages);
 unsigned pagevec_lookup_tag(struct pagevec *pvec,
 		struct address_space *mapping, pgoff_t *index, int tag,
 		unsigned nr_pages);
Index: linux-2.6/mm/swap.c
===================================================================
--- linux-2.6.orig/mm/swap.c
+++ linux-2.6/mm/swap.c
@@ -380,6 +380,19 @@ unsigned pagevec_lookup(struct pagevec *
 	return pagevec_count(pvec);
 }
 
+/**
+ * pagevec_lookup_nonatomic - non atomic pagevec_lookup
+ *
+ * This routine is non-atomic: it looks pages up without mapping->tree_lock.
+ */
+unsigned pagevec_lookup_nonatomic(struct pagevec *pvec,
+		struct address_space *mapping, pgoff_t start, unsigned nr_pages)
+{
+	pvec->nr = find_get_pages_nonatomic(mapping, start,
+					nr_pages, pvec->pages);
+	return pagevec_count(pvec);
+}
+
 unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
 		pgoff_t *index, int tag, unsigned nr_pages)
 {
Index: linux-2.6/mm/truncate.c
===================================================================
--- linux-2.6.orig/mm/truncate.c
+++ linux-2.6/mm/truncate.c
@@ -126,7 +126,7 @@ void truncate_inode_pages(struct address
 
 	pagevec_init(&pvec, 0);
 	next = start;
-	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+	while (pagevec_lookup_nonatomic(&pvec, mapping, next, PAGEVEC_SIZE)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t page_index = page->index;
@@ -160,7 +160,7 @@ void truncate_inode_pages(struct address
 	next = start;
 	for ( ; ; ) {
 		cond_resched();
-		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		if (!pagevec_lookup_nonatomic(&pvec, mapping, next, PAGEVEC_SIZE)) {
 			if (next == start)
 				break;
 			next = start;
@@ -206,7 +206,7 @@ unsigned long invalidate_mapping_pages(s
 
 	pagevec_init(&pvec, 0);
 	while (next <= end &&
-			pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		pagevec_lookup_nonatomic(&pvec, mapping, next, PAGEVEC_SIZE)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c
+++ linux-2.6/mm/page-writeback.c
@@ -811,6 +811,7 @@ int mapping_tagged(struct address_space 
 	unsigned long flags;
 	int ret;
 
+	/* XXX: radix_tree_tagged is safe to run without the lock */
 	read_lock_irqsave(&mapping->tree_lock, flags);
 	ret = radix_tree_tagged(&mapping->page_tree, tag);
 	read_unlock_irqrestore(&mapping->tree_lock, flags);
Index: linux-2.6/mm/swapfile.c
===================================================================
--- linux-2.6.orig/mm/swapfile.c
+++ linux-2.6/mm/swapfile.c
@@ -338,6 +338,7 @@ int remove_exclusive_swap_page(struct pa
 	retval = 0;
 	if (p->swap_map[swp_offset(entry)] == 1) {
 		/* Recheck the page count with the swapcache lock held.. */
+		SetPageFreeing(page);
 		write_lock_irq(&swapper_space.tree_lock);
 		if ((page_count(page) == 2) && !PageWriteback(page)) {
 			__delete_from_swap_cache(page);
@@ -345,6 +346,7 @@ int remove_exclusive_swap_page(struct pa
 			retval = 1;
 		}
 		write_unlock_irq(&swapper_space.tree_lock);
+		ClearPageFreeing(page);
 	}
 	swap_info_put(p);
 

  reply	other threads:[~2005-06-27  6:42 UTC|newest]

Thread overview: 106+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-06-27  6:29 [rfc] lockless pagecache Nick Piggin
2005-06-27  6:29 ` Nick Piggin
2005-06-27  6:32 ` [patch 1] mm: PG_free flag Nick Piggin
2005-06-27  6:32   ` [patch 2] mm: speculative get_page Nick Piggin
2005-06-27  6:33     ` [patch 3] radix tree: lookup_slot Nick Piggin
2005-06-27  6:34       ` [patch 4] radix tree: lockless readside Nick Piggin
2005-06-27  6:34         ` Nick Piggin [this message]
2005-06-27  6:35           ` [patch 6] mm: spinlock tree_lock Nick Piggin
2005-06-27 14:12     ` [patch 2] mm: speculative get_page William Lee Irwin III
2005-06-27 14:12       ` William Lee Irwin III
2005-06-28  0:03       ` Nick Piggin
2005-06-28  0:03         ` Nick Piggin
2005-06-28  0:56         ` Nick Piggin
2005-06-28  0:56           ` Nick Piggin
2005-06-28  1:22         ` William Lee Irwin III
2005-06-28  1:22           ` William Lee Irwin III
2005-06-28  1:42           ` Nick Piggin
2005-06-28  1:42             ` Nick Piggin
2005-06-28  4:06             ` William Lee Irwin III
2005-06-28  4:06               ` William Lee Irwin III
2005-06-28  4:50               ` Nick Piggin
2005-06-28  4:50                 ` Nick Piggin
2005-06-28  5:08                 ` David S. Miller
2005-06-28  5:08                   ` [patch 2] mm: speculative get_page, " David S. Miller, Nick Piggin
2005-06-28  5:34                   ` Nick Piggin
2005-06-28  5:34                     ` Nick Piggin
2005-06-28 14:19                   ` William Lee Irwin III
2005-06-28 14:19                     ` William Lee Irwin III
2005-06-28 15:43                     ` Nick Piggin
2005-06-28 15:43                       ` Nick Piggin
2005-06-28 17:01                       ` Christoph Lameter
2005-06-28 17:01                         ` Christoph Lameter
2005-06-28 23:10                         ` Nick Piggin
2005-06-28 23:10                           ` Nick Piggin
2005-06-28 21:32                   ` Jesse Barnes
2005-06-28 21:32                     ` Jesse Barnes
2005-06-28 22:17                     ` Christoph Lameter
2005-06-28 22:17                       ` Christoph Lameter
2005-06-28 12:45     ` Andy Whitcroft
2005-06-28 12:45       ` Andy Whitcroft
2005-06-28 13:16       ` Nick Piggin
2005-06-28 13:16         ` Nick Piggin
2005-06-28 16:02         ` Dave Hansen
2005-06-28 16:02           ` Dave Hansen
2005-06-29 16:31           ` Pavel Machek
2005-06-29 16:31             ` Pavel Machek
2005-06-29 18:43             ` Dave Hansen
2005-06-29 18:43               ` Dave Hansen
2005-06-29 21:22               ` Pavel Machek
2005-06-29 21:22                 ` Pavel Machek
2005-06-29 16:31         ` Pavel Machek
2005-06-29 16:31           ` Pavel Machek
2005-06-27  6:43 ` VFS scalability (was: [rfc] lockless pagecache) Nick Piggin
2005-06-27  6:43   ` Nick Piggin
2005-06-27  7:13   ` Andi Kleen
2005-06-27  7:13     ` Andi Kleen
2005-06-27  7:33     ` VFS scalability Nick Piggin
2005-06-27  7:33       ` Nick Piggin
2005-06-27  7:44       ` Andi Kleen
2005-06-27  7:44         ` Andi Kleen
2005-06-27  8:03         ` Nick Piggin
2005-06-27  8:03           ` Nick Piggin
2005-06-27  7:46 ` [rfc] lockless pagecache Andrew Morton
2005-06-27  7:46   ` Andrew Morton
2005-06-27  8:02   ` Nick Piggin
2005-06-27  8:02     ` Nick Piggin
2005-06-27  8:15     ` Andrew Morton
2005-06-27  8:15       ` Andrew Morton
2005-06-27  8:28       ` Nick Piggin
2005-06-27  8:28         ` Nick Piggin
2005-06-27  8:56     ` Lincoln Dale
2005-06-27  8:56       ` Lincoln Dale
2005-06-27  9:04       ` Nick Piggin
2005-06-27  9:04         ` Nick Piggin
2005-06-27 18:14         ` Chen, Kenneth W
2005-06-27 18:14           ` Chen, Kenneth W
2005-06-27 18:50           ` Badari Pulavarty
2005-06-27 18:50             ` Badari Pulavarty
2005-06-27 19:05             ` Chen, Kenneth W
2005-06-27 19:05               ` Chen, Kenneth W
2005-06-27 19:22               ` Christoph Lameter
2005-06-27 19:22                 ` Christoph Lameter
2005-06-27 19:42                 ` Chen, Kenneth W
2005-06-27 19:42                   ` Chen, Kenneth W
2005-07-05 15:11                   ` Sonny Rao
2005-07-05 15:11                     ` Sonny Rao
2005-07-05 15:31                     ` Martin J. Bligh
2005-07-05 15:31                       ` Martin J. Bligh
2005-07-05 15:37                       ` Sonny Rao
2005-07-05 15:37                         ` Sonny Rao
2005-06-27 13:17     ` Benjamin LaHaise
2005-06-27 13:17       ` Benjamin LaHaise
2005-06-28  0:32       ` Nick Piggin
2005-06-28  0:32         ` Nick Piggin
2005-06-28  1:26         ` William Lee Irwin III
2005-06-28  1:26           ` William Lee Irwin III
2005-06-27 14:08   ` Martin J. Bligh
2005-06-27 14:08     ` Martin J. Bligh
2005-06-27 17:49   ` Christoph Lameter
2005-06-27 17:49     ` Christoph Lameter
2005-06-29 10:49 ` Hirokazu Takahashi
2005-06-29 10:49   ` Hirokazu Takahashi
2005-06-29 11:38   ` Nick Piggin
2005-06-29 11:38     ` Nick Piggin
2005-06-30  3:32     ` Hirokazu Takahashi
2005-06-30  3:32       ` Hirokazu Takahashi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=42BF9E03.9050507@yahoo.com.au \
    --to=nickpiggin@yahoo.com.au \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.