All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/5] Concurrent O_SYNC write speedups using radix-tree walks
@ 2004-08-01  7:45 Suparna Bhattacharya
  2004-08-01  7:47 ` [PATCH 1/5] Writeback page range hint Suparna Bhattacharya
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: Suparna Bhattacharya @ 2004-08-01  7:45 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel


The attached patches (generated against 2.6.8-rc2) enable concurrent 
O_SYNC writers to different parts of the same file by avoiding 
serialising on i_sem across the wait for IO completion.

This is mostly your work, ported to the tagged radix tree VFS changes
and a few fixes. I have been carrying these patches for some time now;
they can be merged upstream. Please apply.

Regards
Suparna

-- 
Suparna Bhattacharya (suparna@in.ibm.com)
Linux Technology Center
IBM Software Lab, India


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/5] Writeback page range hint
  2004-08-01  7:45 [PATCH 0/5] Concurrent O_SYNC write speedups using radix-tree walks Suparna Bhattacharya
@ 2004-08-01  7:47 ` Suparna Bhattacharya
  2004-08-01  7:49 ` [PATCH 2/5] Fix writeback page range to use exact limits Suparna Bhattacharya
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Suparna Bhattacharya @ 2004-08-01  7:47 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel

On Sun, Aug 01, 2004 at 01:15:18PM +0530, Suparna Bhattacharya wrote:
> 
> The attached patches (generated against 2.6.8-rc2) enable concurrent 
> O_SYNC writers to different parts of the same file by avoiding 
> serialising on i_sem across the wait for IO completion.
> 
> This is mostly your work, ported to the tagged radix tree VFS changes
> and a few fixes. I have been carrying these patches for sometime now; 
> they can be the merged upstream. Please apply.
> 

[1] writepages-range.patch

Regards
Suparna

-- 
Suparna Bhattacharya (suparna@in.ibm.com)
Linux Technology Center
IBM Software Lab, India

------------------------------------------------------

From: Andrew Morton <akpm@osdl.org>

Modify mpage_writepages to optionally only write back dirty pages within
a specified range in a file (as in the case of O_SYNC). Cheat a
little to avoid changes to prototypes of aops - just put the
<start, end> hint into the writeback_control struct instead.
If <start, end> are not set, then default to writing back all
the mapping's dirty pages.

Signed-off-by: Suparna Bhattacharya <suparna@in.ibm.com>

 linux-2.6.8-rc2-suparna/fs/mpage.c                |   27 +++++++++++++++++++---
 linux-2.6.8-rc2-suparna/include/linux/writeback.h |   21 +++++++++++++----
 2 files changed, 40 insertions(+), 8 deletions(-)

diff -puN fs/mpage.c~writepages-range fs/mpage.c
--- linux-2.6.8-rc2/fs/mpage.c~writepages-range	2004-08-01 12:30:15.000000000 +0530
+++ linux-2.6.8-rc2-suparna/fs/mpage.c	2004-08-01 12:30:15.000000000 +0530
@@ -622,7 +622,9 @@ mpage_writepages(struct address_space *m
 	struct pagevec pvec;
 	int nr_pages;
 	pgoff_t index;
+	pgoff_t end = -1;		/* Inclusive */
 	int scanned = 0;
+	int is_range = 0;
 
 	if (wbc->nonblocking && bdi_write_congested(bdi)) {
 		wbc->encountered_congestion = 1;
@@ -640,9 +642,16 @@ mpage_writepages(struct address_space *m
 		index = 0;			  /* whole-file sweep */
 		scanned = 1;
 	}
+	if (wbc->start || wbc->end) {
+		index = wbc->start >> PAGE_CACHE_SHIFT;
+		end = wbc->end >> PAGE_CACHE_SHIFT;
+		is_range = 1;
+		scanned = 1;
+	}
 retry:
 	while (!done && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-					PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) {
+			PAGECACHE_TAG_DIRTY,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
 		unsigned i;
 
 		scanned = 1;
@@ -659,10 +668,21 @@ retry:
 
 			lock_page(page);
 
+			if (unlikely(page->mapping != mapping)) {
+				unlock_page(page);
+				continue;
+			}
+
+			if (unlikely(is_range) && page->index > end) {
+				done = 1;
+				unlock_page(page);
+				continue;
+			}
+
 			if (wbc->sync_mode != WB_SYNC_NONE)
 				wait_on_page_writeback(page);
 
-			if (page->mapping != mapping || PageWriteback(page) ||
+			if (PageWriteback(page) ||
 					!clear_page_dirty_for_io(page)) {
 				unlock_page(page);
 				continue;
@@ -701,7 +721,8 @@ retry:
 		index = 0;
 		goto retry;
 	}
-	mapping->writeback_index = index;
+	if (!is_range)
+		mapping->writeback_index = index;
 	if (bio)
 		mpage_bio_submit(WRITE, bio);
 	return ret;
diff -puN include/linux/writeback.h~writepages-range include/linux/writeback.h
--- linux-2.6.8-rc2/include/linux/writeback.h~writepages-range	2004-08-01 12:30:15.000000000 +0530
+++ linux-2.6.8-rc2-suparna/include/linux/writeback.h	2004-08-01 12:30:15.000000000 +0530
@@ -29,7 +29,9 @@ enum writeback_sync_modes {
 };
 
 /*
- * A control structure which tells the writeback code what to do
+ * A control structure which tells the writeback code what to do.  These are
+ * always on the stack, and hence need no locking.  They are always initialised
+ * in a manner such that unspecified fields are set to zero.
  */
 struct writeback_control {
 	struct backing_dev_info *bdi;	/* If !NULL, only write back this
@@ -40,10 +42,19 @@ struct writeback_control {
 	long nr_to_write;		/* Write this many pages, and decrement
 					   this for each page written */
 	long pages_skipped;		/* Pages which were not written */
-	int nonblocking;		/* Don't get stuck on request queues */
-	int encountered_congestion;	/* An output: a queue is full */
-	int for_kupdate;		/* A kupdate writeback */
-	int for_reclaim;		/* Invoked from the page allocator */
+
+	/*
+	 * For a_ops->writepages(): if start or end are non-zero then this is
+	 * a hint that the filesystem need only write out the pages inside that
+	 * byterange.  The byte at `end' is included in the writeout request.
+	 */
+	loff_t start;
+	loff_t end;
+
+	int nonblocking:1;		/* Don't get stuck on request queues */
+	int encountered_congestion:1;	/* An output: a queue is full */
+	int for_kupdate:1;		/* A kupdate writeback */
+	int for_reclaim:1;		/* Invoked from the page allocator */
 };
 
 /*

_

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 2/5] Fix writeback page range to use exact limits
  2004-08-01  7:45 [PATCH 0/5] Concurrent O_SYNC write speedups using radix-tree walks Suparna Bhattacharya
  2004-08-01  7:47 ` [PATCH 1/5] Writeback page range hint Suparna Bhattacharya
@ 2004-08-01  7:49 ` Suparna Bhattacharya
  2004-08-01  7:50 ` [PATCH 3/5] mpage writepages range limit fix Suparna Bhattacharya
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Suparna Bhattacharya @ 2004-08-01  7:49 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel

On Sun, Aug 01, 2004 at 01:15:18PM +0530, Suparna Bhattacharya wrote:
> 
> The attached patches (generated against 2.6.8-rc2) enable concurrent 
> O_SYNC writers to different parts of the same file by avoiding 
> serialising on i_sem across the wait for IO completion.
> 
> This is mostly your work, ported to the tagged radix tree VFS changes
> and a few fixes. I have been carrying these patches for sometime now; 
> they can be the merged upstream. Please apply.
> 

[2] fix-writeback-range.patch

Regards
Suparna

-- 
Suparna Bhattacharya (suparna@in.ibm.com)
Linux Technology Center
IBM Software Lab, India

---------------------------------------------------------------


wait_on_page_writeback_range shouldn't wait for pages beyond the
specified range. Ideally, the radix-tree-lookup could accept an
end_index parameter so that it doesn't return the extra pages
in the first place, but for now we just add a few extra checks
to skip such pages.

Signed-off-by: Suparna Bhattacharya <suparna@in.ibm.com>

 linux-2.6.8-rc2-suparna/mm/filemap.c |    7 ++++++-
 1 files changed, 6 insertions(+), 1 deletion(-)

diff -puN mm/filemap.c~fix-writeback-range mm/filemap.c
--- linux-2.6.8-rc2/mm/filemap.c~fix-writeback-range	2004-08-01 12:32:04.000000000 +0530
+++ linux-2.6.8-rc2-suparna/mm/filemap.c	2004-08-01 12:32:04.000000000 +0530
@@ -198,7 +198,8 @@ static int wait_on_page_writeback_range(
 
 	pagevec_init(&pvec, 0);
 	index = start;
-	while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+	while ((index <= end) &&
+			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
 			PAGECACHE_TAG_WRITEBACK,
 			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
 		unsigned i;
@@ -206,6 +207,10 @@ static int wait_on_page_writeback_range(
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 
+			/* until radix tree lookup accepts end_index */
+			if (page->index > end) {
+				continue;
+			}
 			wait_on_page_writeback(page);
 			if (PageError(page))
 				ret = -EIO;

_

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 3/5] mpage writepages range limit fix
  2004-08-01  7:45 [PATCH 0/5] Concurrent O_SYNC write speedups using radix-tree walks Suparna Bhattacharya
  2004-08-01  7:47 ` [PATCH 1/5] Writeback page range hint Suparna Bhattacharya
  2004-08-01  7:49 ` [PATCH 2/5] Fix writeback page range to use exact limits Suparna Bhattacharya
@ 2004-08-01  7:50 ` Suparna Bhattacharya
  2004-08-01  7:52 ` [PATCH 4/5] filemap_fdatawrite range interface Suparna Bhattacharya
  2004-08-01  7:53 ` [PATCH 5/5] Concurrent O_SYNC write support Suparna Bhattacharya
  4 siblings, 0 replies; 6+ messages in thread
From: Suparna Bhattacharya @ 2004-08-01  7:50 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel

On Sun, Aug 01, 2004 at 01:15:18PM +0530, Suparna Bhattacharya wrote:
> 
> The attached patches (generated against 2.6.8-rc2) enable concurrent 
> O_SYNC writers to different parts of the same file by avoiding 
> serialising on i_sem across the wait for IO completion.
> 
> This is mostly your work, ported to the tagged radix tree VFS changes
> and a few fixes. I have been carrying these patches for sometime now; 
> they can be the merged upstream. Please apply.
> 

[3] fix-writepages-range.patch

Regards
Suparna

-- 
Suparna Bhattacharya (suparna@in.ibm.com)
Linux Technology Center
IBM Software Lab, India

---------------------------------------------------------------

Safeguard to make sure we break out of pagevec_lookup_tag loop if we
are beyond the specified range.

Signed-off-by: Suparna Bhattacharya <suparna@in.ibm.com>

 linux-2.6.8-rc2-suparna/fs/mpage.c      |    3 ++-
 linux-2.6.8-rc2-suparna/fs/mpage.c.orig |    3 +--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff -puN fs/mpage.c~fix-writepages-range fs/mpage.c
--- linux-2.6.8-rc2/fs/mpage.c~fix-writepages-range	2004-08-01 12:33:10.000000000 +0530
+++ linux-2.6.8-rc2-suparna/fs/mpage.c	2004-08-01 12:33:10.000000000 +0530
@@ -649,7 +649,8 @@ mpage_writepages(struct address_space *m
 		scanned = 1;
 	}
 retry:
-	while (!done && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+	while (!done && (index <= end) &&
+			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
 			PAGECACHE_TAG_DIRTY,
 			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
 		unsigned i;
diff -puN fs/mpage.c.orig~fix-writepages-range fs/mpage.c.orig
--- linux-2.6.8-rc2/fs/mpage.c.orig~fix-writepages-range	2004-08-01 12:33:10.000000000 +0530
+++ linux-2.6.8-rc2-suparna/fs/mpage.c.orig	2004-08-01 12:32:43.000000000 +0530
@@ -649,8 +649,7 @@ mpage_writepages(struct address_space *m
 		scanned = 1;
 	}
 retry:
-	while (!done && (index <= end) && 
-			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+	while (!done && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
 			PAGECACHE_TAG_DIRTY,
 			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
 		unsigned i;

_

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 4/5] filemap_fdatawrite range interface
  2004-08-01  7:45 [PATCH 0/5] Concurrent O_SYNC write speedups using radix-tree walks Suparna Bhattacharya
                   ` (2 preceding siblings ...)
  2004-08-01  7:50 ` [PATCH 3/5] mpage writepages range limit fix Suparna Bhattacharya
@ 2004-08-01  7:52 ` Suparna Bhattacharya
  2004-08-01  7:53 ` [PATCH 5/5] Concurrent O_SYNC write support Suparna Bhattacharya
  4 siblings, 0 replies; 6+ messages in thread
From: Suparna Bhattacharya @ 2004-08-01  7:52 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel

On Sun, Aug 01, 2004 at 01:15:18PM +0530, Suparna Bhattacharya wrote:
> 
> The attached patches (generated against 2.6.8-rc2) enable concurrent 
> O_SYNC writers to different parts of the same file by avoiding 
> serialising on i_sem across the wait for IO completion.
> 
> This is mostly your work, ported to the tagged radix tree VFS changes
> and a few fixes. I have been carrying these patches for sometime now; 
> they can be the merged upstream. Please apply.
> 

[4] fdatawrite-range.patch

Regards
Suparna

-- 
Suparna Bhattacharya (suparna@in.ibm.com)
Linux Technology Center
IBM Software Lab, India

--------------------------------------------------------

Range based equivalent of filemap_fdatawrite for O_SYNC writers (to go
with writepages range support added to mpage_writepages).
If both <start> and <end> are zero, then it defaults to writing
back all of the mapping's dirty pages.

Signed-off-by: Suparna Bhattacharya <suparna@in.ibm.com>

 linux-2.6.8-rc2-suparna/mm/filemap.c |   23 +++++++++++++++++++++--
 1 files changed, 21 insertions(+), 2 deletions(-)

diff -puN mm/filemap.c~fdatawrite-range mm/filemap.c
--- linux-2.6.8-rc2/mm/filemap.c~fdatawrite-range	2004-08-01 12:34:34.000000000 +0530
+++ linux-2.6.8-rc2-suparna/mm/filemap.c	2004-08-01 12:34:34.000000000 +0530
@@ -142,20 +142,26 @@ static inline int sync_page(struct page 
 }
 
 /**
- * filemap_fdatawrite - start writeback against all of a mapping's dirty pages
+ * filemap_fdatawrite_range - start writeback against all of a mapping's
+ * dirty pages that lie within the byte offsets <start, end>
  * @mapping: address space structure to write
+ * @start: offset in bytes where the range starts
+ * @end : offset in bytes where the range ends
  *
  * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
  * opposed to a regular memory * cleansing writeback.  The difference between
  * these two operations is that if a dirty page/buffer is encountered, it must
  * be waited upon, and not just skipped over.
  */
-static int __filemap_fdatawrite(struct address_space *mapping, int sync_mode)
+static int __filemap_fdatawrite_range(struct address_space *mapping,
+	loff_t start, loff_t end, int sync_mode)
 {
 	int ret;
 	struct writeback_control wbc = {
 		.sync_mode = sync_mode,
 		.nr_to_write = mapping->nrpages * 2,
+		.start = start,
+		.end = end,
 	};
 
 	if (mapping->backing_dev_info->memory_backed)
@@ -165,12 +171,25 @@ static int __filemap_fdatawrite(struct a
 	return ret;
 }
 
+static inline int __filemap_fdatawrite(struct address_space *mapping,
+	int sync_mode)
+{
+	return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode);
+}
+
 int filemap_fdatawrite(struct address_space *mapping)
 {
 	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
 }
 EXPORT_SYMBOL(filemap_fdatawrite);
 
+int filemap_fdatawrite_range(struct address_space *mapping,
+	loff_t start, loff_t end)
+{
+	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
+}
+EXPORT_SYMBOL(filemap_fdatawrite_range);
+
 /*
  * This is a mostly non-blocking flush.  Not suitable for data-integrity
  * purposes - I/O may not be started against all dirty pages.

_

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 5/5] Concurrent O_SYNC write support
  2004-08-01  7:45 [PATCH 0/5] Concurrent O_SYNC write speedups using radix-tree walks Suparna Bhattacharya
                   ` (3 preceding siblings ...)
  2004-08-01  7:52 ` [PATCH 4/5] filemap_fdatawrite range interface Suparna Bhattacharya
@ 2004-08-01  7:53 ` Suparna Bhattacharya
  4 siblings, 0 replies; 6+ messages in thread
From: Suparna Bhattacharya @ 2004-08-01  7:53 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel

On Sun, Aug 01, 2004 at 01:15:18PM +0530, Suparna Bhattacharya wrote:
> 
> The attached patches (generated against 2.6.8-rc2) enable concurrent 
> O_SYNC writers to different parts of the same file by avoiding 
> serialising on i_sem across the wait for IO completion.
> 
> This is mostly your work, ported to the tagged radix tree VFS changes
> and a few fixes. I have been carrying these patches for sometime now; 
> they can be the merged upstream. Please apply.
> 

[5] O_SYNC-speedup.patch

Regards
Suparna

-- 
Suparna Bhattacharya (suparna@in.ibm.com)
Linux Technology Center
IBM Software Lab, India

--------------------------------------------------------------


From: Andrew Morton <akpm@osdl.org>

In databases it is common to have multiple threads or processes performing
O_SYNC writes against different parts of the same file.

Our performance at this is poor, because each writer blocks access to the
file by waiting on I/O completion while holding i_sem: everything is
serialised.

The patch improves things by moving the writing and waiting outside i_sem.
So other threads can get in and submit their I/O and permit the disk
scheduler to optimise the IO patterns better.

Also, the O_SYNC writer only writes and waits on the pages which he wrote,
rather than writing and waiting on all dirty pages in the file.

The reason we haven't been able to do this before is that the required walk
of the address_space page lists is easily livelockable without the i_sem
serialisation.  But in this patch we perform the waiting via a radix-tree
walk of the affected pages.  This cannot be livelocked.

The sync of the inode's metadata is still performed inside i_sem.  This is
because it is list-based and is hence still livelockable.  However it is
usually the case that databases are overwriting existing file blocks and
there will be no dirty buffers attached to the address_space anyway.

The code is careful to ensure that the IO for the pages and the IO for the
metadata are nonblockingly scheduled at the same time.  This is an improvement
over the current code, which will issue two separate write-and-wait cycles:
one for metadata, one for pages.

Note from Suparna:
Reworked to use the tagged radix-tree based writeback infrastructure.

Signed-off-by: Suparna Bhattacharya <suparna@in.ibm.com>

 linux-2.6.8-rc2-suparna/include/linux/buffer_head.h |    6 -
 linux-2.6.8-rc2-suparna/include/linux/fs.h          |    5 +
 linux-2.6.8-rc2-suparna/include/linux/writeback.h   |    2 
 linux-2.6.8-rc2-suparna/mm/filemap.c                |   93 +++++++++++++++-----
 4 files changed, 81 insertions(+), 25 deletions(-)

diff -puN include/linux/buffer_head.h~O_SYNC-speedup include/linux/buffer_head.h
--- linux-2.6.8-rc2/include/linux/buffer_head.h~O_SYNC-speedup	2004-08-01 12:34:44.000000000 +0530
+++ linux-2.6.8-rc2-suparna/include/linux/buffer_head.h	2004-08-01 12:34:45.000000000 +0530
@@ -202,12 +202,6 @@ int nobh_prepare_write(struct page*, uns
 int nobh_commit_write(struct file *, struct page *, unsigned, unsigned);
 int nobh_truncate_page(struct address_space *, loff_t);
 
-#define OSYNC_METADATA	(1<<0)
-#define OSYNC_DATA	(1<<1)
-#define OSYNC_INODE	(1<<2)
-int generic_osync_inode(struct inode *, struct address_space *, int);
-
-
 /*
  * inline definitions
  */
diff -puN include/linux/fs.h~O_SYNC-speedup include/linux/fs.h
--- linux-2.6.8-rc2/include/linux/fs.h~O_SYNC-speedup	2004-08-01 12:34:44.000000000 +0530
+++ linux-2.6.8-rc2-suparna/include/linux/fs.h	2004-08-01 12:34:45.000000000 +0530
@@ -823,6 +823,11 @@ extern int vfs_rename(struct inode *, st
 #define DT_SOCK		12
 #define DT_WHT		14
 
+#define OSYNC_METADATA	(1<<0)
+#define OSYNC_DATA	(1<<1)
+#define OSYNC_INODE	(1<<2)
+int generic_osync_inode(struct inode *, struct address_space *, int);
+
 /*
  * This is the "filldir" function type, used by readdir() to let
  * the kernel specify what kind of dirent layout it wants to have.
diff -puN include/linux/writeback.h~O_SYNC-speedup include/linux/writeback.h
--- linux-2.6.8-rc2/include/linux/writeback.h~O_SYNC-speedup	2004-08-01 12:34:45.000000000 +0530
+++ linux-2.6.8-rc2-suparna/include/linux/writeback.h	2004-08-01 12:34:45.000000000 +0530
@@ -103,6 +103,8 @@ void page_writeback_init(void);
 void balance_dirty_pages_ratelimited(struct address_space *mapping);
 int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
+int sync_page_range(struct inode *inode, struct address_space *mapping,
+			loff_t pos, size_t count);
 
 /* pdflush.c */
 extern int nr_pdflush_threads;	/* Global so it can be exported to sysctl
diff -puN mm/filemap.c~O_SYNC-speedup mm/filemap.c
--- linux-2.6.8-rc2/mm/filemap.c~O_SYNC-speedup	2004-08-01 12:34:45.000000000 +0530
+++ linux-2.6.8-rc2-suparna/mm/filemap.c	2004-08-01 12:34:45.000000000 +0530
@@ -247,6 +247,34 @@ static int wait_on_page_writeback_range(
 	return ret;
 }
 
+
+/*
+ * Write and wait upon all the pages in the passed range.  This is a "data
+ * integrity" operation.  It waits upon in-flight writeout before starting and
+ * waiting upon new writeout.  If there was an IO error, return it.
+ *
+ * We need to re-take i_sem during the generic_osync_inode list walk because
+ * it is otherwise livelockable.
+ */
+int sync_page_range(struct inode *inode, struct address_space *mapping,
+			loff_t pos, size_t count)
+{
+	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
+	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
+	int ret;
+
+	if (mapping->backing_dev_info->memory_backed || !count)
+		return 0;
+	ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
+	if (ret == 0) {
+		down(&inode->i_sem);
+		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
+		up(&inode->i_sem);
+	}
+	if (ret == 0)
+		ret = wait_on_page_writeback_range(mapping, start, end);
+	return ret;
+}
 /**
  * filemap_fdatawait - walk the list of under-writeback pages of the given
  *     address space and wait for all of them.
@@ -2026,11 +2054,13 @@ generic_file_aio_write_nolock(struct kio
 	/*
 	 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
 	 */
-	if (status >= 0) {
-		if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
-			status = generic_osync_inode(inode, mapping,
-					OSYNC_METADATA|OSYNC_DATA);
-	}
+	if (likely(status >= 0)) {
+		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+			if (!a_ops->writepage || !is_sync_kiocb(iocb))
+				status = generic_osync_inode(inode, mapping,
+						OSYNC_METADATA|OSYNC_DATA);
+		}
+  	}
 	
 	/*
 	 * If we get here for O_DIRECT writes then we must have fallen through
@@ -2070,36 +2100,52 @@ ssize_t generic_file_aio_write(struct ki
 			       size_t count, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-	ssize_t err;
-	struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count };
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t ret;
+	struct iovec local_iov = { .iov_base = (void __user *)buf,
+					.iov_len = count };
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	down(&inode->i_sem);
-	err = generic_file_aio_write_nolock(iocb, &local_iov, 1, 
+	ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
 						&iocb->ki_pos);
 	up(&inode->i_sem);
 
-	return err;
-}
+	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+		ssize_t err;
 
+		err = sync_page_range(inode, mapping, pos, ret);
+		if (err < 0)
+			ret = err;
+	}
+	return ret;
+}
 EXPORT_SYMBOL(generic_file_aio_write);
 
 ssize_t generic_file_write(struct file *file, const char __user *buf,
 			   size_t count, loff_t *ppos)
 {
-	struct inode	*inode = file->f_mapping->host;
-	ssize_t		err;
-	struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count };
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t	ret;
+	struct iovec local_iov = { .iov_base = (void __user *)buf,
+					.iov_len = count };
 
 	down(&inode->i_sem);
-	err = generic_file_write_nolock(file, &local_iov, 1, ppos);
+	ret = generic_file_write_nolock(file, &local_iov, 1, ppos);
 	up(&inode->i_sem);
 
-	return err;
-}
+	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+		ssize_t err;
 
+		err = sync_page_range(inode, mapping, *ppos - ret, ret);
+		if (err < 0)
+			ret = err;
+	}
+	return ret;
+}
 EXPORT_SYMBOL(generic_file_write);
 
 ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
@@ -2118,14 +2164,23 @@ ssize_t generic_file_readv(struct file *
 EXPORT_SYMBOL(generic_file_readv);
 
 ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
-			unsigned long nr_segs, loff_t * ppos) 
+			unsigned long nr_segs, loff_t *ppos)
 {
-	struct inode *inode = file->f_mapping->host;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
 	ssize_t ret;
 
 	down(&inode->i_sem);
 	ret = generic_file_write_nolock(file, iov, nr_segs, ppos);
 	up(&inode->i_sem);
+
+	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+		int err;
+
+		err = sync_page_range(inode, mapping, *ppos - ret, ret);
+		if (err < 0)
+			ret = err;
+	}
 	return ret;
 }
 

_

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2004-08-01  7:47 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2004-08-01  7:45 [PATCH 0/5] Concurrent O_SYNC write speedups using radix-tree walks Suparna Bhattacharya
2004-08-01  7:47 ` [PATCH 1/5] Writeback page range hint Suparna Bhattacharya
2004-08-01  7:49 ` [PATCH 2/5] Fix writeback page range to use exact limits Suparna Bhattacharya
2004-08-01  7:50 ` [PATCH 3/5] mpage writepages range limit fix Suparna Bhattacharya
2004-08-01  7:52 ` [PATCH 4/5] filemap_fdatawrite range interface Suparna Bhattacharya
2004-08-01  7:53 ` [PATCH 5/5] Concurrent O_SYNC write support Suparna Bhattacharya

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.