linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Nick Piggin <npiggin@suse.de>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Linux Filesystems <linux-fsdevel@vger.kernel.org>,
	Mark Fasheh <mark.fasheh@oracle.com>,
	Linux Memory Management <linux-mm@kvack.org>
Subject: [patch 07/44] mm: buffered write cleanup
Date: Tue, 24 Apr 2007 11:23:53 +1000	[thread overview]
Message-ID: <20070424013433.015170000@suse.de> (raw)
In-Reply-To: 20070424012346.696840000@suse.de

[-- Attachment #1: mm-buffered-write-cleanup.patch --]
[-- Type: text/plain, Size: 11225 bytes --]


Quite a bit of code is used in maintaining these "cached pages" that are
probably pretty unlikely to get used. It would require a narrow race where
the page is inserted concurrently while this process is allocating a page
in order to create the spare page, followed by a multi-page write into an
uncached part of the file to make use of it.

Next, the buffered write path (and others) uses its own LRU pagevec when it
should just be using the per-CPU LRU pagevec (which will cut down on both the
data and code cacheline footprint). Also, these private LRU pagevecs are
emptied after just a very short time, in contrast with the per-CPU pagevecs
that are persistent. Net result: 7.3 times fewer lru_lock acquisitions required
to add the pages to pagecache for a bulk write (in 4K chunks).

[This gets rid of some cond_resched() calls in readahead.c and mpage.c due
 to clashes in -mm. What put them there, and why?]

Cc: Linux Memory Management <linux-mm@kvack.org>
Cc: Linux Filesystems <linux-fsdevel@vger.kernel.org>
Signed-off-by: Nick Piggin <npiggin@suse.de>

 fs/mpage.c     |   12 ----
 mm/filemap.c   |  144 ++++++++++++++++++++++-----------------------------------
 mm/readahead.c |   28 +++--------
 3 files changed, 66 insertions(+), 118 deletions(-)

Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -689,26 +689,22 @@ EXPORT_SYMBOL(probe_page);
 struct page *find_or_create_page(struct address_space *mapping,
 		unsigned long index, gfp_t gfp_mask)
 {
-	struct page *page, *cached_page = NULL;
+	struct page *page;
 	int err;
 repeat:
 	page = find_lock_page(mapping, index);
 	if (!page) {
-		if (!cached_page) {
-			cached_page = alloc_page(gfp_mask);
-			if (!cached_page)
-				return NULL;
-		}
-		err = add_to_page_cache_lru(cached_page, mapping,
-					index, gfp_mask);
-		if (!err) {
-			page = cached_page;
-			cached_page = NULL;
-		} else if (err == -EEXIST)
-			goto repeat;
+		page = alloc_page(gfp_mask);
+		if (!page)
+			return NULL;
+		err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
+		if (unlikely(err)) {
+			page_cache_release(page);
+			page = NULL;
+			if (err == -EEXIST)
+				goto repeat;
+		}
 	}
-	if (cached_page)
-		page_cache_release(cached_page);
 	return page;
 }
 EXPORT_SYMBOL(find_or_create_page);
@@ -903,11 +899,9 @@ void do_generic_mapping_read(struct addr
 	unsigned long next_index;
 	unsigned long prev_index;
 	loff_t isize;
-	struct page *cached_page;
 	int error;
 	struct file_ra_state ra = *_ra;
 
-	cached_page = NULL;
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	next_index = index;
 	prev_index = ra.prev_page;
@@ -1084,23 +1078,20 @@ no_cached_page:
 		 * Ok, it wasn't cached, so we need to create a new
 		 * page..
 		 */
-		if (!cached_page) {
-			cached_page = page_cache_alloc_cold(mapping);
-			if (!cached_page) {
-				desc->error = -ENOMEM;
-				goto out;
-			}
+		page = page_cache_alloc_cold(mapping);
+		if (!page) {
+			desc->error = -ENOMEM;
+			goto out;
 		}
-		error = add_to_page_cache_lru(cached_page, mapping,
+		error = add_to_page_cache_lru(page, mapping,
 						index, GFP_KERNEL);
 		if (error) {
+			page_cache_release(page);
 			if (error == -EEXIST)
 				goto find_page;
 			desc->error = error;
 			goto out;
 		}
-		page = cached_page;
-		cached_page = NULL;
 		goto readpage;
 	}
 
@@ -1110,8 +1101,6 @@ out:
 		_ra->prev_page = prev_index;
 
 	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
-	if (cached_page)
-		page_cache_release(cached_page);
 	if (filp)
 		file_accessed(filp);
 }
@@ -1605,35 +1594,28 @@ static struct page *__read_cache_page(st
 				int (*filler)(void *,struct page*),
 				void *data)
 {
-	struct page *page, *cached_page = NULL;
+	struct page *page;
 	int err;
 repeat:
 	page = find_get_page(mapping, index);
 	if (!page) {
-		if (!cached_page) {
-			cached_page = page_cache_alloc_cold(mapping);
-			if (!cached_page)
-				return ERR_PTR(-ENOMEM);
-		}
-		err = add_to_page_cache_lru(cached_page, mapping,
-					index, GFP_KERNEL);
-		if (err == -EEXIST)
-			goto repeat;
-		if (err < 0) {
+		page = page_cache_alloc_cold(mapping);
+		if (!page)
+			return ERR_PTR(-ENOMEM);
+		err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
+		if (unlikely(err)) {
+			page_cache_release(page);
+			if (err == -EEXIST)
+				goto repeat;
 			/* Presumably ENOMEM for radix tree node */
-			page_cache_release(cached_page);
 			return ERR_PTR(err);
 		}
-		page = cached_page;
-		cached_page = NULL;
 		err = filler(data, page);
 		if (err < 0) {
 			page_cache_release(page);
 			page = ERR_PTR(err);
 		}
 	}
-	if (cached_page)
-		page_cache_release(cached_page);
 	return page;
 }
 
@@ -1711,40 +1693,6 @@ struct page *read_cache_page(struct addr
 EXPORT_SYMBOL(read_cache_page);
 
 /*
- * If the page was newly created, increment its refcount and add it to the
- * caller's lru-buffering pagevec.  This function is specifically for
- * generic_file_write().
- */
-static inline struct page *
-__grab_cache_page(struct address_space *mapping, unsigned long index,
-			struct page **cached_page, struct pagevec *lru_pvec)
-{
-	int err;
-	struct page *page;
-repeat:
-	page = find_lock_page(mapping, index);
-	if (!page) {
-		if (!*cached_page) {
-			*cached_page = page_cache_alloc(mapping);
-			if (!*cached_page)
-				return NULL;
-		}
-		err = add_to_page_cache(*cached_page, mapping,
-					index, GFP_KERNEL);
-		if (err == -EEXIST)
-			goto repeat;
-		if (err == 0) {
-			page = *cached_page;
-			page_cache_get(page);
-			if (!pagevec_add(lru_pvec, page))
-				__pagevec_lru_add(lru_pvec);
-			*cached_page = NULL;
-		}
-	}
-	return page;
-}
-
-/*
  * The logic we want is
  *
  *	if suid or (sgid and xgrp)
@@ -1938,6 +1886,33 @@ generic_file_direct_write(struct kiocb *
 }
 EXPORT_SYMBOL(generic_file_direct_write);
 
+/*
+ * Find or create a page at the given pagecache position. Return the locked
+ * page. This function is specifically for buffered writes.
+ */
+static struct page *__grab_cache_page(struct address_space *mapping,
+							pgoff_t index)
+{
+	int status;
+	struct page *page;
+repeat:
+	page = find_lock_page(mapping, index);
+	if (likely(page))
+		return page;
+
+	page = page_cache_alloc(mapping);
+	if (!page)
+		return NULL;
+	status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
+	if (unlikely(status)) {
+		page_cache_release(page);
+		if (status == -EEXIST)
+			goto repeat;
+		return NULL;
+	}
+	return page;
+}
+
 ssize_t
 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t pos, loff_t *ppos,
@@ -1948,15 +1923,10 @@ generic_file_buffered_write(struct kiocb
 	const struct address_space_operations *a_ops = mapping->a_ops;
 	struct inode 	*inode = mapping->host;
 	long		status = 0;
-	struct page	*page;
-	struct page	*cached_page = NULL;
-	struct pagevec	lru_pvec;
 	const struct iovec *cur_iov = iov; /* current iovec */
 	size_t		iov_offset = 0;	   /* offset in the current iovec */
 	char __user	*buf;
 
-	pagevec_init(&lru_pvec, 0);
-
 	/*
 	 * handle partial DIO write.  Adjust cur_iov if needed.
 	 */
@@ -1968,6 +1938,7 @@ generic_file_buffered_write(struct kiocb
 	}
 
 	do {
+		struct page *page;
 		pgoff_t index;		/* Pagecache index for current page */
 		unsigned long offset;	/* Offset into pagecache page */
 		unsigned long maxlen;	/* Bytes remaining in current iovec */
@@ -1994,7 +1965,8 @@ generic_file_buffered_write(struct kiocb
 		fault_in_pages_readable(buf, maxlen);
 #endif
 
-		page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
+
+		page = __grab_cache_page(mapping, index);
 		if (!page) {
 			status = -ENOMEM;
 			break;
@@ -2062,9 +2034,6 @@ fs_write_aop_error:
 	} while (count);
 	*ppos = pos;
 
-	if (cached_page)
-		page_cache_release(cached_page);
-
 	/*
 	 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
 	 */
@@ -2084,7 +2053,6 @@ fs_write_aop_error:
 	if (unlikely(file->f_flags & O_DIRECT) && written)
 		status = filemap_write_and_wait(mapping);
 
-	pagevec_lru_add(&lru_pvec);
 	return written ? written : status;
 }
 EXPORT_SYMBOL(generic_file_buffered_write);
Index: linux-2.6/fs/mpage.c
===================================================================
--- linux-2.6.orig/fs/mpage.c
+++ linux-2.6/fs/mpage.c
@@ -389,33 +389,25 @@ mpage_readpages(struct address_space *ma
 	struct bio *bio = NULL;
 	unsigned page_idx;
 	sector_t last_block_in_bio = 0;
-	struct pagevec lru_pvec;
 	struct buffer_head map_bh;
 	unsigned long first_logical_block = 0;
 
 	clear_buffer_mapped(&map_bh);
-	pagevec_init(&lru_pvec, 0);
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
 		struct page *page = list_entry(pages->prev, struct page, lru);
 
 		prefetchw(&page->flags);
 		list_del(&page->lru);
-		if (!add_to_page_cache(page, mapping,
+		if (!add_to_page_cache_lru(page, mapping,
 					page->index, GFP_KERNEL)) {
 			bio = do_mpage_readpage(bio, page,
 					nr_pages - page_idx,
 					&last_block_in_bio, &map_bh,
 					&first_logical_block,
 					get_block);
-			if (!pagevec_add(&lru_pvec, page)) {
-				cond_resched();
-				__pagevec_lru_add(&lru_pvec);
-			}
-		} else {
-			page_cache_release(page);
 		}
+		page_cache_release(page);
 	}
-	pagevec_lru_add(&lru_pvec);
 	BUG_ON(!list_empty(pages));
 	if (bio)
 		mpage_bio_submit(READ, bio);
Index: linux-2.6/mm/readahead.c
===================================================================
--- linux-2.6.orig/mm/readahead.c
+++ linux-2.6/mm/readahead.c
@@ -230,30 +230,25 @@ int read_cache_pages(struct address_spac
 			int (*filler)(void *, struct page *), void *data)
 {
 	struct page *page;
-	struct pagevec lru_pvec;
 	int ret = 0;
 
-	pagevec_init(&lru_pvec, 0);
-
 	while (!list_empty(pages)) {
 		page = list_to_page(pages);
 		list_del(&page->lru);
-		if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
+		if (add_to_page_cache_lru(page, mapping,
+					page->index, GFP_KERNEL)) {
 			page_cache_release(page);
 			continue;
 		}
+		page_cache_release(page);
+
 		ret = filler(data, page);
-		if (!pagevec_add(&lru_pvec, page)) {
-			cond_resched();
-			__pagevec_lru_add(&lru_pvec);
-		}
-		if (ret) {
+		if (unlikely(ret)) {
 			put_pages_list(pages);
 			break;
 		}
 		task_io_account_read(PAGE_CACHE_SIZE);
 	}
-	pagevec_lru_add(&lru_pvec);
 	return ret;
 }
 
@@ -263,7 +258,6 @@ static int read_pages(struct address_spa
 		struct list_head *pages, unsigned nr_pages)
 {
 	unsigned page_idx;
-	struct pagevec lru_pvec;
 	int ret;
 
 	if (mapping->a_ops->readpages) {
@@ -273,21 +267,15 @@ static int read_pages(struct address_spa
 		goto out;
 	}
 
-	pagevec_init(&lru_pvec, 0);
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
 		struct page *page = list_to_page(pages);
 		list_del(&page->lru);
-		if (!add_to_page_cache(page, mapping,
+		if (!add_to_page_cache_lru(page, mapping,
 					page->index, GFP_KERNEL)) {
 			mapping->a_ops->readpage(filp, page);
-			if (!pagevec_add(&lru_pvec, page)) {
-				cond_resched();
-				__pagevec_lru_add(&lru_pvec);
-			}
-		} else
-			page_cache_release(page);
+		}
+		page_cache_release(page);
 	}
-	pagevec_lru_add(&lru_pvec);
 	ret = 0;
 out:
 	return ret;

-- 


  parent reply	other threads:[~2007-04-24  5:20 UTC|newest]

Thread overview: 61+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-04-24  1:23 [patch 00/44] Buffered write deadlock fix and new aops for 2.6.21-rc6-mm1 Nick Piggin
2007-04-24  1:23 ` [patch 01/44] mm: revert KERNEL_DS buffered write optimisation Nick Piggin
2007-04-24  1:23 ` [patch 02/44] Revert 81b0c8713385ce1b1b9058e916edcf9561ad76d6 Nick Piggin
2007-04-24  1:23 ` [patch 03/44] Revert 6527c2bdf1f833cc18e8f42bd97973d583e4aa83 Nick Piggin
2007-04-24  1:23 ` [patch 04/44] mm: clean up buffered write code Nick Piggin
2007-04-24  1:23 ` [patch 05/44] mm: debug write deadlocks Nick Piggin
2007-04-24  1:23 ` [patch 06/44] mm: trim more holes Nick Piggin
2007-04-24  6:07   ` Neil Brown
2007-04-24  6:17     ` Nick Piggin
2007-04-24  1:23 ` Nick Piggin [this message]
2007-04-24  1:23 ` [patch 08/44] mm: write iovec cleanup Nick Piggin
2007-04-24  1:23 ` [patch 09/44] mm: fix pagecache write deadlocks Nick Piggin
2007-04-24  1:23 ` [patch 10/44] mm: buffered write iterator Nick Piggin
2007-04-24  1:23 ` [patch 11/44] fs: fix data-loss on error Nick Piggin
2007-04-24  1:23 ` [patch 12/44] fs: introduce write_begin, write_end, and perform_write aops Nick Piggin
2007-04-24  6:59   ` Neil Brown
2007-04-24  7:23     ` Nick Piggin
2007-04-24  7:49       ` Neil Brown
2007-04-24 10:37         ` Nick Piggin
2007-04-24  1:23 ` [patch 13/44] mm: restore KERNEL_DS optimisations Nick Piggin
2007-04-24 10:43   ` Christoph Hellwig
2007-04-24 11:03     ` Nick Piggin
2007-04-24  1:24 ` [patch 14/44] implement simple fs aops Nick Piggin
2007-04-24  1:24 ` [patch 15/44] block_dev convert to new aops Nick Piggin
2007-04-24  1:24 ` [patch 16/44] rd " Nick Piggin
2007-04-24 10:46   ` Christoph Hellwig
2007-04-24 11:05     ` Nick Piggin
2007-04-24 11:11       ` Christoph Hellwig
2007-04-24 11:16         ` Nick Piggin
2007-04-24 11:18           ` Christoph Hellwig
2007-04-24 11:20             ` Nick Piggin
2007-04-24 11:42           ` Neil Brown
2007-04-24  1:24 ` [patch 17/44] ext2 " Nick Piggin
2007-04-24  1:24 ` [patch 18/44] ext3 " Nick Piggin
2007-04-24  1:24 ` [patch 19/44] ext4 " Nick Piggin
2007-04-24  1:24 ` [patch 20/44] xfs " Nick Piggin
2007-04-24  1:24 ` [patch 21/44] fs: new cont helpers Nick Piggin
2007-04-24  1:24 ` [patch 22/44] fat convert to new aops Nick Piggin
2007-04-24  1:24 ` [patch 23/44] adfs " Nick Piggin
2007-04-24  1:24 ` [patch 24/44] affs " Nick Piggin
2007-04-24  1:24 ` [patch 25/44] hfs " Nick Piggin
2007-04-24  1:24 ` [patch 26/44] hfsplus " Nick Piggin
2007-04-24  1:24 ` [patch 27/44] hpfs " Nick Piggin
2007-04-24  1:24 ` [patch 28/44] bfs " Nick Piggin
2007-04-24  1:24 ` [patch 29/44] qnx4 " Nick Piggin
2007-04-24  1:24 ` [patch 30/44] nfs " Nick Piggin
2007-04-24  1:24 ` [patch 31/44] smb " Nick Piggin
2007-04-24  1:24 ` [patch 32/44] ocfs2: " Nick Piggin
2007-04-24  1:24 ` [patch 33/44] gfs2 " Nick Piggin
2007-04-24  1:24 ` [patch 34/44] fs: no AOP_TRUNCATED_PAGE for writes Nick Piggin
2007-04-24  1:24 ` [patch 35/44] ecryptfs convert to new aops Nick Piggin
2007-04-24  1:24 ` [patch 36/44] fuse " Nick Piggin
2007-04-24  1:24 ` [patch 37/44] hostfs " Nick Piggin
2007-04-27 16:11   ` Jeff Dike
2007-04-24  1:24 ` [patch 38/44] jffs2 " Nick Piggin
2007-04-24  1:24 ` [patch 39/44] cifs " Nick Piggin
2007-04-24  1:24 ` [patch 40/44] ufs " Nick Piggin
2007-04-24  1:24 ` [patch 41/44] udf " Nick Piggin
2007-04-24  1:24 ` [patch 42/44] sysv " Nick Piggin
2007-04-24  1:24 ` [patch 43/44] minix " Nick Piggin
2007-04-24  1:24 ` [patch 44/44] jfs " Nick Piggin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070424013433.015170000@suse.de \
    --to=npiggin@suse.de \
    --cc=akpm@linux-foundation.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mark.fasheh@oracle.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).