All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch] pdflush congestion avoidance
@ 2002-09-11  8:30 Andrew Morton
  0 siblings, 0 replies; only message in thread
From: Andrew Morton @ 2002-09-11  8:30 UTC (permalink / raw)
  To: lkml



- Add the `nonblocking' flag to struct writeback_control, and teach
  the writeback paths to honour it.

- Add the `encountered_congestion' flag to struct writeback_control
  and teach the writeback paths to set it.


So as soon as a mapping's backing_dev_info indicates that it is getting
congested, bail out of writeback.  And don't even start writeback
against filesystems whose queues are congested.

- Convert pdflush's background_writeout() function to use
  nonblocking writeback.

This way, a single pdflush thread will circulate around all the
dirty queues, keeping them filled.

- Convert the pdflush `kupdate' function to do the same thing.


This solves the problem of pdflush thread pool exhaustion.

It solves the problem of pdflush startup latency.

It solves the (minor) problem wherein `kupdate' writeback only writes
back a single disk at a time (it was getting blocked on each queue in
turn).

It probably means that we only ever need a single pdflush thread.




 fs/fs-writeback.c         |   40 ++++++++++++++++++++++------------------
 fs/mpage.c                |    7 +++++++
 include/linux/writeback.h |    2 ++
 mm/page-writeback.c       |   37 +++++++++++++++++++++++++++++--------
 4 files changed, 60 insertions(+), 26 deletions(-)

--- 2.5.34/fs/mpage.c~nonblocking-pdflush	Tue Sep 10 00:00:20 2002
+++ 2.5.34-akpm/fs/mpage.c	Tue Sep 10 00:00:20 2002
@@ -20,6 +20,7 @@
 #include <linux/prefetch.h>
 #include <linux/mpage.h>
 #include <linux/writeback.h>
+#include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 
 /*
@@ -530,6 +531,7 @@ int
 mpage_writepages(struct address_space *mapping,
 		struct writeback_control *wbc, get_block_t get_block)
 {
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	struct bio *bio = NULL;
 	sector_t last_block_in_bio = 0;
 	int ret = 0;
@@ -593,6 +595,11 @@ mpage_writepages(struct address_space *m
 			}
 			if (ret || (--(wbc->nr_to_write) <= 0))
 				done = 1;
+			if (wbc->nonblocking && bdi_write_congested(bdi)) {
+				blk_run_queues();
+				wbc->encountered_congestion = 1;
+				done = 1;
+			}
 		} else {
 			unlock_page(page);
 		}
--- 2.5.34/include/linux/writeback.h~nonblocking-pdflush	Tue Sep 10 00:00:20 2002
+++ 2.5.34-akpm/include/linux/writeback.h	Tue Sep 10 00:00:20 2002
@@ -43,6 +43,8 @@ struct writeback_control {
 					   older than this */
 	long nr_to_write;		/* Write this many pages, and decrement
 					   this for each page written */
+	int nonblocking;		/* Don't get stuck on request queues */
+	int encountered_congestion;	/* An output: a queue is full */
 };
 	
 void writeback_inodes(struct writeback_control *wbc);
--- 2.5.34/mm/page-writeback.c~nonblocking-pdflush	Tue Sep 10 00:00:20 2002
+++ 2.5.34-akpm/mm/page-writeback.c	Tue Sep 10 00:00:20 2002
@@ -21,6 +21,7 @@
 #include <linux/init.h>
 #include <linux/sysrq.h>
 #include <linux/backing-dev.h>
+#include <linux/blkdev.h>
 #include <linux/mpage.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
@@ -172,21 +173,30 @@ static void background_writeout(unsigned
 		.sync_mode	= WB_SYNC_NONE,
 		.older_than_this = NULL,
 		.nr_to_write	= 0,
+		.nonblocking	= 1,
 	};
 
 	CHECK_EMERGENCY_SYNC
 
 	background_thresh = (dirty_background_ratio * total_pages) / 100;
-
-	do {
+	for ( ; ; ) {
 		struct page_state ps;
+
 		get_page_state(&ps);
 		if (ps.nr_dirty < background_thresh && min_pages <= 0)
 			break;
+		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		writeback_inodes(&wbc);
 		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-	} while (wbc.nr_to_write <= 0);
+		if (wbc.nr_to_write == MAX_WRITEBACK_PAGES) {
+			/* Wrote nothing */
+			if (wbc.encountered_congestion)
+				blk_congestion_wait(WRITE, HZ/10);
+			else
+				break;
+		}
+	}
 	blk_run_queues();
 }
 
@@ -223,25 +233,36 @@ static void wb_kupdate(unsigned long arg
 	unsigned long oldest_jif;
 	unsigned long start_jif;
 	unsigned long next_jif;
+	long nr_to_write;
 	struct page_state ps;
 	struct writeback_control wbc = {
 		.bdi		= NULL,
 		.sync_mode	= WB_SYNC_NONE,
 		.older_than_this = &oldest_jif,
 		.nr_to_write	= 0,
+		.nonblocking	= 1,
 	};
 
 	sync_supers();
-	get_page_state(&ps);
 
+	get_page_state(&ps);
 	oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
 	start_jif = jiffies;
 	next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
-	wbc.nr_to_write = ps.nr_dirty;
-	writeback_inodes(&wbc);
+	nr_to_write = ps.nr_dirty;
+	while (nr_to_write > 0) {
+		wbc.encountered_congestion = 0;
+		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+		writeback_inodes(&wbc);
+		if (wbc.nr_to_write == MAX_WRITEBACK_PAGES) {
+			if (wbc.encountered_congestion)
+				blk_congestion_wait(WRITE, HZ);
+			else
+				break;	/* All the old data is written */
+		}
+		nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+	}
 	blk_run_queues();
-	yield();
-
 	if (time_before(next_jif, jiffies + HZ))
 		next_jif = jiffies + HZ;
 	mod_timer(&wb_timer, next_jif);
--- 2.5.34/fs/fs-writeback.c~nonblocking-pdflush	Tue Sep 10 00:00:20 2002
+++ 2.5.34-akpm/fs/fs-writeback.c	Tue Sep 10 00:00:20 2002
@@ -220,44 +220,52 @@ __writeback_single_inode(struct inode *i
  *
  * FIXME: this linear search could get expensive with many fileystems.  But
  * how to fix?  We need to go from an address_space to all inodes which share
- * a queue with that address_space.
+ * a queue with that address_space.  (Easy: have a global "dirty superblocks"
+ * list).
  *
  * The inodes to be written are parked on sb->s_io.  They are moved back onto
  * sb->s_dirty as they are selected for writing.  This way, none can be missed
  * on the writer throttling path, and we get decent balancing between many
- * thrlttled threads: we don't want them all piling up on __wait_on_inode.
+ * throttled threads: we don't want them all piling up on __wait_on_inode.
  */
 static void
 sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 {
-	struct list_head *tmp;
-	struct list_head *head;
 	const unsigned long start = jiffies;	/* livelock avoidance */
 
 	list_splice_init(&sb->s_dirty, &sb->s_io);
-	head = &sb->s_io;
-	while ((tmp = head->prev) != head) {
-		struct inode *inode = list_entry(tmp, struct inode, i_list);
+	while (!list_empty(&sb->s_io)) {
+		struct inode *inode = list_entry(sb->s_io.prev,
+						struct inode, i_list);
 		struct address_space *mapping = inode->i_mapping;
-		struct backing_dev_info *bdi;
+		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		int really_sync;
 
-		if (wbc->bdi && mapping->backing_dev_info != wbc->bdi) {
+		if (wbc->nonblocking && bdi_write_congested(bdi)) {
+			wbc->encountered_congestion = 1;
 			if (sb != blockdev_superblock)
-				break;		/* inappropriate superblock */
+				break;		/* Skip a congested fs */
 			list_move(&inode->i_list, &sb->s_dirty);
-			continue;		/* not this blockdev */
+			continue;		/* Skip a congested blockdev */
+		}
+
+		if (wbc->bdi && bdi != wbc->bdi) {
+			if (sb != blockdev_superblock)
+				break;		/* fs has the wrong queue */
+			list_move(&inode->i_list, &sb->s_dirty);
+			continue;		/* blockdev has wrong queue */
 		}
 
 		/* Was this inode dirtied after sync_sb_inodes was called? */
 		if (time_after(mapping->dirtied_when, start))
 			break;
 
+		/* Was this inode dirtied too recently? */
 		if (wbc->older_than_this && time_after(mapping->dirtied_when,
 						*wbc->older_than_this))
-			goto out;
+			break;
 
-		bdi = mapping->backing_dev_info;
+		/* Is another pdflush already flushing this queue? */
 		if (current_is_pdflush() && !writeback_acquire(bdi))
 			break;
 
@@ -278,11 +286,7 @@ sync_sb_inodes(struct super_block *sb, s
 		if (wbc->nr_to_write <= 0)
 			break;
 	}
-out:
-	/*
-	 * Leave any unwritten inodes on s_io.
-	 */
-	return;
+	return;		/* Leave any unwritten inodes on s_io */
 }
 
 /*

.

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2002-09-11  8:12 UTC | newest]

Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2002-09-11  8:30 [patch] pdflush congestion avoidance Andrew Morton

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.