From: "Paul E. McKenney" <paulmck@us.ibm.com>
To: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Paul McKenney <paul.mckenney@us.ibm.com>,
Dipankar Sarma <dipankar@in.ibm.com>, Ingo Molnar <mingo@elte.hu>,
linux-kernel <linux-kernel@vger.kernel.org>
Subject: Re: [patch 6/7] mm: lockless pagecache
Date: Thu, 11 Aug 2005 18:49:24 -0700 [thread overview]
Message-ID: <20050812014924.GQ1300@us.ibm.com> (raw)
In-Reply-To: <42FB4454.2010601@yahoo.com.au>
On Thu, Aug 11, 2005 at 10:28:04PM +1000, Nick Piggin wrote:
> 6/7
>
> --
> SUSE Labs, Novell Inc.
>
> Use the speculative get_page and the lockless radix tree lookups
> to introduce lockless page cache lookups (ie. no mapping->tree_lock).
>
> The only atomicity changes this should introduce is the use of a
> non atomic pagevec lookup for truncate, however what atomicity
> guarantees there were are probably not too useful anyway.
I don't understand the placement of the rcu_read_lock() and
rcu_read_unlock() calls. Again, possibly because I don't understand
the overall algorithm yet. And again, search for blank lines.
> Index: linux-2.6/mm/filemap.c
> ===================================================================
> --- linux-2.6.orig/mm/filemap.c
> +++ linux-2.6/mm/filemap.c
> @@ -379,18 +379,25 @@ int add_to_page_cache(struct page *page,
> int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
>
> if (error == 0) {
> + page_cache_get(page);
> + __SetPageLocked(page);
> + page->mapping = mapping;
> + page->index = offset;
> +
> write_lock_irq(&mapping->tree_lock);
> error = radix_tree_insert(&mapping->page_tree, offset, page);
> if (!error) {
> - page_cache_get(page);
> - SetPageLocked(page);
> - page->mapping = mapping;
> - page->index = offset;
> mapping->nrpages++;
> pagecache_acct(1);
> }
> write_unlock_irq(&mapping->tree_lock);
> radix_tree_preload_end();
> +
> + if (error) {
> + page->mapping = NULL;
> + __put_page(page);
> + __ClearPageLocked(page);
> + }
> }
> return error;
> }
> @@ -500,13 +507,15 @@ EXPORT_SYMBOL(__lock_page);
> */
> struct page * find_get_page(struct address_space *mapping, unsigned long offset)
> {
> - struct page *page;
> + struct page **pagep;
> + struct page *page = NULL;
>
> - read_lock_irq(&mapping->tree_lock);
> - page = radix_tree_lookup(&mapping->page_tree, offset);
> - if (page)
> - page_cache_get(page);
> - read_unlock_irq(&mapping->tree_lock);
> + rcu_read_lock();
> + pagep = (struct page **)radix_tree_lookup_slot(&mapping->page_tree,
> + offset);
> + if (pagep)
> + page = page_cache_get_speculative(pagep);
The data structures accessed by page_cache_get_speculative() don't
seem to be freed via RCU. My guess is that this is because these
data structures (struct page) never really go away. However, they
do get re-used, and this re-use must be protected against. It looks
to me that this protection takes the form of atomic instructions
in get_page_testone() and the like. If this is the case, then it
should be possible to push the rcu_read_lock() and rcu_read_unlock()
down into the radix_tree_lookup_slot().
> + rcu_read_unlock();
> return page;
> }
>
> @@ -519,12 +528,24 @@ struct page *find_trylock_page(struct ad
> {
> struct page *page;
>
> - read_lock_irq(&mapping->tree_lock);
> - page = radix_tree_lookup(&mapping->page_tree, offset);
> - if (page && TestSetPageLocked(page))
> - page = NULL;
> - read_unlock_irq(&mapping->tree_lock);
> - return page;
> + page = find_get_page(mapping, offset);
> + if (page) {
> + if (TestSetPageLocked(page))
> + goto out_failed;
> + /* Has the page been truncated before being locked? */
> + if (page->mapping != mapping || page->index != offset) {
> + unlock_page(page);
> + goto out_failed;
> + }
> +
> + /* Silly interface requires us to drop the refcount */
> + __put_page(page);
> + return page;
> +
> +out_failed:
> + page_cache_release(page);
> + }
> + return NULL;
> }
>
> EXPORT_SYMBOL(find_trylock_page);
> @@ -545,25 +566,17 @@ struct page *find_lock_page(struct addre
> {
> struct page *page;
>
> - read_lock_irq(&mapping->tree_lock);
> repeat:
> - page = radix_tree_lookup(&mapping->page_tree, offset);
> + page = find_get_page(mapping, offset);
> if (page) {
> - page_cache_get(page);
> - if (TestSetPageLocked(page)) {
> - read_unlock_irq(&mapping->tree_lock);
> - lock_page(page);
> - read_lock_irq(&mapping->tree_lock);
> -
> - /* Has the page been truncated while we slept? */
> - if (page->mapping != mapping || page->index != offset) {
> - unlock_page(page);
> - page_cache_release(page);
> - goto repeat;
> - }
> + lock_page(page);
> + /* Has the page been truncated before being locked? */
> + if (page->mapping != mapping || page->index != offset) {
> + unlock_page(page);
> + page_cache_release(page);
> + goto repeat;
> }
> }
> - read_unlock_irq(&mapping->tree_lock);
> return page;
> }
>
> @@ -646,6 +659,32 @@ unsigned find_get_pages(struct address_s
> return ret;
> }
>
> +unsigned find_get_pages_nonatomic(struct address_space *mapping, pgoff_t start,
> + unsigned int nr_pages, struct page **pages)
> +{
> + unsigned int i;
> + unsigned int ret;
> + unsigned int ret2;
> +
> + /*
> + * We do some unsightly casting to use the array first for storing
> + * pointers to the page pointers, and then for the pointers to
> + * the pages themselves that the caller wants.
> + */
> + rcu_read_lock();
> + ret = radix_tree_gang_lookup_slot(&mapping->page_tree,
> + (void ***)pages, start, nr_pages);
> + ret2 = 0;
> + for (i = 0; i < ret; i++) {
> + struct page *page;
> + page = page_cache_get_speculative(((struct page ***)pages)[i]);
> + if (page)
> + pages[ret2++] = page;
> + }
Same thing here -- I don't see why the rcu_read_lock() and rcu_read_unlock()
should not be pushed down into radix_tree_gang_lookup_slot().
> + rcu_read_unlock();
> + return ret2;
> +}
> +
> /*
> * Like find_get_pages, except we only return pages which are tagged with
> * `tag'. We update *index to index the next page for the traversal.
> Index: linux-2.6/mm/readahead.c
> ===================================================================
> --- linux-2.6.orig/mm/readahead.c
> +++ linux-2.6/mm/readahead.c
> @@ -272,27 +272,26 @@ __do_page_cache_readahead(struct address
> /*
> * Preallocate as many pages as we will need.
> */
> - read_lock_irq(&mapping->tree_lock);
> for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
> unsigned long page_offset = offset + page_idx;
>
> if (page_offset > end_index)
> break;
>
> + /* Don't need mapping->tree_lock - lookup can be racy */
> + rcu_read_lock();
> page = radix_tree_lookup(&mapping->page_tree, page_offset);
> + rcu_read_unlock();
Here, they could clearly be pushed down. ;-)
> if (page)
> continue;
>
> - read_unlock_irq(&mapping->tree_lock);
> page = page_cache_alloc_cold(mapping);
> - read_lock_irq(&mapping->tree_lock);
> if (!page)
> break;
> page->index = page_offset;
> list_add(&page->lru, &page_pool);
> ret++;
> }
> - read_unlock_irq(&mapping->tree_lock);
>
> /*
> * Now start the IO. We ignore I/O errors - if the page is not
> Index: linux-2.6/mm/swap_state.c
> ===================================================================
> --- linux-2.6.orig/mm/swap_state.c
> +++ linux-2.6/mm/swap_state.c
> @@ -76,19 +76,26 @@ static int __add_to_swap_cache(struct pa
> BUG_ON(PagePrivate(page));
> error = radix_tree_preload(gfp_mask);
> if (!error) {
> + page_cache_get(page);
> + SetPageLocked(page);
> + SetPageSwapCache(page);
> + page->private = entry.val;
> +
> write_lock_irq(&swapper_space.tree_lock);
> error = radix_tree_insert(&swapper_space.page_tree,
> entry.val, page);
> if (!error) {
> - page_cache_get(page);
> - SetPageLocked(page);
> - SetPageSwapCache(page);
> - page->private = entry.val;
> total_swapcache_pages++;
> pagecache_acct(1);
> }
> write_unlock_irq(&swapper_space.tree_lock);
> radix_tree_preload_end();
> +
> + if (error) {
> + __put_page(page);
> + ClearPageLocked(page);
> + ClearPageSwapCache(page);
> + }
> }
> return error;
> }
> Index: linux-2.6/include/linux/page-flags.h
> ===================================================================
> --- linux-2.6.orig/include/linux/page-flags.h
> +++ linux-2.6/include/linux/page-flags.h
> @@ -167,16 +167,13 @@ extern void __mod_page_state(unsigned lo
> /*
> * Manipulation of page state flags
> */
> -#define PageLocked(page) \
> - test_bit(PG_locked, &(page)->flags)
> -#define SetPageLocked(page) \
> - set_bit(PG_locked, &(page)->flags)
> -#define TestSetPageLocked(page) \
> - test_and_set_bit(PG_locked, &(page)->flags)
> -#define ClearPageLocked(page) \
> - clear_bit(PG_locked, &(page)->flags)
> -#define TestClearPageLocked(page) \
> - test_and_clear_bit(PG_locked, &(page)->flags)
> +#define PageLocked(page) test_bit(PG_locked, &(page)->flags)
> +#define SetPageLocked(page) set_bit(PG_locked, &(page)->flags)
> +#define __SetPageLocked(page) __set_bit(PG_locked, &(page)->flags)
> +#define TestSetPageLocked(page) test_and_set_bit(PG_locked, &(page)->flags)
> +#define ClearPageLocked(page) clear_bit(PG_locked, &(page)->flags)
> +#define __ClearPageLocked(page) __clear_bit(PG_locked, &(page)->flags)
> +#define TestClearPageLocked(page) test_and_clear_bit(PG_locked, &(page)->flags)
>
> #define PageError(page) test_bit(PG_error, &(page)->flags)
> #define SetPageError(page) set_bit(PG_error, &(page)->flags)
> Index: linux-2.6/include/linux/pagemap.h
> ===================================================================
> --- linux-2.6.orig/include/linux/pagemap.h
> +++ linux-2.6/include/linux/pagemap.h
> @@ -130,6 +130,8 @@ extern struct page * find_or_create_page
> unsigned long index, unsigned int gfp_mask);
> unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
> unsigned int nr_pages, struct page **pages);
> +unsigned find_get_pages_nonatomic(struct address_space *mapping, pgoff_t start,
> + unsigned int nr_pages, struct page **pages);
> unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
> int tag, unsigned int nr_pages, struct page **pages);
>
> Index: linux-2.6/include/linux/pagevec.h
> ===================================================================
> --- linux-2.6.orig/include/linux/pagevec.h
> +++ linux-2.6/include/linux/pagevec.h
> @@ -25,6 +25,8 @@ void __pagevec_lru_add_active(struct pag
> void pagevec_strip(struct pagevec *pvec);
> unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
> pgoff_t start, unsigned nr_pages);
> +unsigned pagevec_lookup_nonatomic(struct pagevec *pvec,
> + struct address_space *mapping, pgoff_t start, unsigned nr_pages);
> unsigned pagevec_lookup_tag(struct pagevec *pvec,
> struct address_space *mapping, pgoff_t *index, int tag,
> unsigned nr_pages);
> Index: linux-2.6/mm/swap.c
> ===================================================================
> --- linux-2.6.orig/mm/swap.c
> +++ linux-2.6/mm/swap.c
> @@ -380,6 +380,19 @@ unsigned pagevec_lookup(struct pagevec *
> return pagevec_count(pvec);
> }
>
> +/**
> + * pagevec_lookup_nonatomic - non atomic pagevec_lookup
> + *
> + * This routine is non-atomic in that it may return blah.
> + */
> +unsigned pagevec_lookup_nonatomic(struct pagevec *pvec,
> + struct address_space *mapping, pgoff_t start, unsigned nr_pages)
> +{
> + pvec->nr = find_get_pages_nonatomic(mapping, start,
> + nr_pages, pvec->pages);
> + return pagevec_count(pvec);
> +}
> +
> unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
> pgoff_t *index, int tag, unsigned nr_pages)
> {
> Index: linux-2.6/mm/truncate.c
> ===================================================================
> --- linux-2.6.orig/mm/truncate.c
> +++ linux-2.6/mm/truncate.c
> @@ -126,7 +126,7 @@ void truncate_inode_pages(struct address
>
> pagevec_init(&pvec, 0);
> next = start;
> - while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
> + while (pagevec_lookup_nonatomic(&pvec, mapping, next, PAGEVEC_SIZE)) {
> for (i = 0; i < pagevec_count(&pvec); i++) {
> struct page *page = pvec.pages[i];
> pgoff_t page_index = page->index;
> @@ -160,7 +160,7 @@ void truncate_inode_pages(struct address
> next = start;
> for ( ; ; ) {
> cond_resched();
> - if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
> + if (!pagevec_lookup_nonatomic(&pvec, mapping, next, PAGEVEC_SIZE)) {
> if (next == start)
> break;
> next = start;
> @@ -206,7 +206,7 @@ unsigned long invalidate_mapping_pages(s
>
> pagevec_init(&pvec, 0);
> while (next <= end &&
> - pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
> + pagevec_lookup_nonatomic(&pvec, mapping, next, PAGEVEC_SIZE)) {
> for (i = 0; i < pagevec_count(&pvec); i++) {
> struct page *page = pvec.pages[i];
>
> Index: linux-2.6/mm/page-writeback.c
> ===================================================================
> --- linux-2.6.orig/mm/page-writeback.c
> +++ linux-2.6/mm/page-writeback.c
> @@ -811,6 +811,7 @@ int mapping_tagged(struct address_space
> unsigned long flags;
> int ret;
>
> + /* XXX: radix_tree_tagged is safe to run without the lock? */
> read_lock_irqsave(&mapping->tree_lock, flags);
> ret = radix_tree_tagged(&mapping->page_tree, tag);
> read_unlock_irqrestore(&mapping->tree_lock, flags);
next prev parent reply other threads:[~2005-08-12 1:49 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2005-08-11 12:18 [patch 0/7] lockless pagecache 2 Nick Piggin
2005-08-11 12:21 ` [patch 1/7] mm: remove PageReserved rollup Nick Piggin
2005-08-11 12:22 ` [patch 2/7] mm: PG_free flag Nick Piggin
2005-08-11 12:22 ` [patch 3/7] mm: speculative get_page Nick Piggin
2005-08-11 12:25 ` [patch 4/7] radix-tree: lookup_slot Nick Piggin
2005-08-11 12:25 ` [patch 5/7] radix-tree: lockless readside Nick Piggin
2005-08-11 12:28 ` [patch 6/7] mm: lockless pagecache Nick Piggin
2005-08-11 12:28 ` [patch 7/7] mm: spinlock tree_lock Nick Piggin
2005-08-11 13:58 ` [patch 6/7] mm: lockless pagecache Pekka Enberg
2005-08-11 14:06 ` Nick Piggin
2005-08-12 1:49 ` Paul E. McKenney [this message]
2005-08-12 4:04 ` Nick Piggin
2005-08-12 1:37 ` [patch 5/7] radix-tree: lockless readside Paul E. McKenney
2005-08-12 3:59 ` Nick Piggin
2005-08-12 4:38 ` Nick Piggin
2005-08-12 7:53 ` Nick Piggin
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20050812014924.GQ1300@us.ibm.com \
--to=paulmck@us.ibm.com \
--cc=dipankar@in.ibm.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=nickpiggin@yahoo.com.au \
--cc=paul.mckenney@us.ibm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.