* [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
@ 2010-06-17 18:04 Jan Kara
2010-06-18 6:09 ` Dave Chinner
` (3 more replies)
0 siblings, 4 replies; 43+ messages in thread
From: Jan Kara @ 2010-06-17 18:04 UTC (permalink / raw)
To: linux-fsdevel; +Cc: linux-mm, Jan Kara, hch, akpm, peterz, wfg
This patch changes balance_dirty_pages() throttling so that the function does
not submit writes on its own but rather waits for the flusher thread to do
enough writes. This has the advantage that we have a single source of IO,
allowing for better writeback locality. Also we do not have to reenter
filesystems from a non-trivial context.
The waiting is implemented as follows: Each BDI has a FIFO of wait requests.
When balance_dirty_pages() finds it needs to throttle the writer, it adds a
request for writing write_chunk pages to the FIFO and waits until the
request is fulfilled or we drop below the dirty limits. The flusher thread
tracks the number of pages it has written, and when enough pages have been
written since the first request in the FIFO became first, it wakes up the
waiter and removes the request from the FIFO (thus another request becomes
first and the flusher thread starts writing out on behalf of this request).
CC: hch@infradead.org
CC: akpm@linux-foundation.org
CC: peterz@infradead.org
CC: wfg@mail.ustc.edu.cn
Signed-off-by: Jan Kara <jack@suse.cz>
---
include/linux/backing-dev.h | 18 +++++
include/linux/writeback.h | 1 +
mm/backing-dev.c | 119 +++++++++++++++++++++++++++++++
mm/page-writeback.c | 164 +++++++++++++++----------------------------
4 files changed, 195 insertions(+), 107 deletions(-)
This is mostly a proof-of-concept patch. It's mostly untested and probably
doesn't completely work. I was just thinking about this for some time already
and would like to get some feedback on the approach...
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index aee5f6c..e8b5be8 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -39,6 +39,7 @@ typedef int (congested_fn)(void *, int);
enum bdi_stat_item {
BDI_RECLAIMABLE,
BDI_WRITEBACK,
+ BDI_WRITTEN,
NR_BDI_STAT_ITEMS
};
@@ -58,6 +59,11 @@ struct bdi_writeback {
struct list_head b_more_io; /* parked for more writeback */
};
+struct bdi_written_count {
+ struct list_head list;
+ u64 written;
+};
+
struct backing_dev_info {
struct list_head bdi_list;
struct rcu_head rcu_head;
@@ -85,6 +91,16 @@ struct backing_dev_info {
unsigned long wb_mask; /* bitmask of registered tasks */
unsigned int wb_cnt; /* number of registered tasks */
+ /*
+ * We abuse wb_written_wait.lock to also guard consistency of
+ * wb_written_wait together with wb_written_list and wb_written_head
+ */
+ wait_queue_head_t wb_written_wait; /* waitqueue of waiters for writeout */
+ struct list_head wb_written_list; /* list with numbers of writes to
+ * wait for */
+ u64 wb_written_head; /* value of BDI_WRITTEN when we should check
+ * the first waiter */
+
struct list_head work_list;
struct device *dev;
@@ -110,6 +126,8 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
int bdi_writeback_task(struct bdi_writeback *wb);
int bdi_has_dirty_io(struct backing_dev_info *bdi);
void bdi_arm_supers_timer(void);
+void bdi_wakeup_writers(struct backing_dev_info *bdi);
+void bdi_wait_written(struct backing_dev_info *bdi, long write_chunk);
extern spinlock_t bdi_lock;
extern struct list_head bdi_list;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index d63ef8f..a822159 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -129,6 +129,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int,
void get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
unsigned long *pbdi_dirty, struct backing_dev_info *bdi);
+int dirty_limits_exceeded(struct backing_dev_info *bdi);
void page_writeback_init(void);
void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 660a87a..e2d0e1a 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -390,6 +390,122 @@ static void sync_supers_timer_fn(unsigned long unused)
bdi_arm_supers_timer();
}
+/*
+ * Remove writer from the list, update wb_written_head as needed
+ *
+ * Needs wb_written_wait.lock held
+ */
+static void bdi_remove_writer(struct backing_dev_info *bdi,
+ struct bdi_written_count *wc)
+{
+ int first = 0;
+
+ if (bdi->wb_written_list.next == &wc->list)
+ first = 1;
+ /* Initialize list so that entry owner knows it's removed */
+ list_del_init(&wc->list);
+ if (first) {
+ if (list_empty(&bdi->wb_written_list)) {
+ bdi->wb_written_head = ~(u64)0;
+ return;
+ }
+ wc = list_entry(bdi->wb_written_list.next,
+ struct bdi_written_count,
+ list);
+ bdi->wb_written_head = bdi_stat(bdi, BDI_WRITTEN) + wc->written;
+ }
+}
+
+void bdi_wakeup_writers(struct backing_dev_info *bdi)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&bdi->wb_written_wait.lock, flags);
+ while (!list_empty(&bdi->wb_written_list)) {
+ struct bdi_written_count *wc =
+ list_entry(bdi->wb_written_list.next,
+ struct bdi_written_count,
+ list);
+ if (bdi_stat(bdi, BDI_WRITTEN) >= bdi->wb_written_head) {
+ bdi_remove_writer(bdi, wc);
+ /*
+ * Now we can wakeup the writer which frees wc entry
+ * The barrier is here so that woken task sees the
+ * modification of wc.
+ */
+ smp_wmb();
+ __wake_up_locked(&bdi->wb_written_wait, TASK_NORMAL);
+ } else
+ break;
+ }
+ spin_unlock_irqrestore(&bdi->wb_written_wait.lock, flags);
+}
+
+/* Adds wc structure and the task to the tail of list of waiters */
+static void bdi_add_writer(struct backing_dev_info *bdi,
+ struct bdi_written_count *wc,
+ wait_queue_t *wait)
+{
+ spin_lock_irq(&bdi->wb_written_wait.lock);
+ if (list_empty(&bdi->wb_written_list))
+ bdi->wb_written_head = bdi_stat(bdi, BDI_WRITTEN) + wc->written;
+ list_add_tail(&wc->list, &bdi->wb_written_list);
+ __add_wait_queue_tail_exclusive(&bdi->wb_written_wait, wait);
+ spin_unlock_irq(&bdi->wb_written_wait.lock);
+}
+
+/* Wait until write_chunk is written or we get below dirty limits */
+void bdi_wait_written(struct backing_dev_info *bdi, long write_chunk)
+{
+ struct bdi_written_count wc = {
+ .list = LIST_HEAD_INIT(wc.list),
+ .written = write_chunk,
+ };
+ DECLARE_WAITQUEUE(wait, current);
+ int pause = 1;
+
+ bdi_add_writer(bdi, &wc, &wait);
+ for (;;) {
+ if (signal_pending_state(TASK_KILLABLE, current))
+ break;
+
+ /*
+ * Make the task just killable so that tasks cannot circumvent
+ * throttling by sending themselves non-fatal signals...
+ */
+ __set_current_state(TASK_KILLABLE);
+ io_schedule_timeout(pause);
+
+ /*
+ * The following check is safe without wb_written_wait.lock
+ * because once bdi_remove_writer removes us from the list
+ * no one will touch us and it's impossible for the list_empty
+ * check to trigger as a false positive. The barrier is there to
+ * avoid missing the wakeup when we are removed from the list.
+ */
+ smp_rmb();
+ if (list_empty(&wc.list))
+ break;
+
+ if (!dirty_limits_exceeded(bdi))
+ break;
+
+ /*
+ * Increase the delay for each loop, up to our previous
+ * default of taking a 100ms nap.
+ */
+ pause <<= 1;
+ if (pause > HZ / 10)
+ pause = HZ / 10;
+ }
+
+ spin_lock_irq(&bdi->wb_written_wait.lock);
+ __remove_wait_queue(&bdi->wb_written_wait, &wait);
+ if (!list_empty(&wc.list))
+ bdi_remove_writer(bdi, &wc);
+ spin_unlock_irq(&bdi->wb_written_wait.lock);
+}
+
static int bdi_forker_task(void *ptr)
{
struct bdi_writeback *me = ptr;
@@ -688,6 +804,9 @@ int bdi_init(struct backing_dev_info *bdi)
}
bdi->dirty_exceeded = 0;
+ bdi->wb_written_head = ~(u64)0;
+ init_waitqueue_head(&bdi->wb_written_wait);
+ INIT_LIST_HEAD(&bdi->wb_written_list);
err = prop_local_init_percpu(&bdi->completions);
if (err) {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bbd396a..3a86a61 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -473,131 +473,78 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
}
}
-/*
- * balance_dirty_pages() must be called by processes which are generating dirty
- * data. It looks at the number of dirty pages in the machine and will force
- * the caller to perform writeback if the system is over `vm_dirty_ratio'.
- * If we're over `background_thresh' then the writeback threads are woken to
- * perform some writeout.
- */
-static void balance_dirty_pages(struct address_space *mapping,
- unsigned long write_chunk)
+int dirty_limits_exceeded(struct backing_dev_info *bdi)
{
long nr_reclaimable, bdi_nr_reclaimable;
long nr_writeback, bdi_nr_writeback;
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
- unsigned long pages_written = 0;
- unsigned long pause = 1;
-
- struct backing_dev_info *bdi = mapping->backing_dev_info;
-
- for (;;) {
- struct writeback_control wbc = {
- .bdi = bdi,
- .sync_mode = WB_SYNC_NONE,
- .older_than_this = NULL,
- .nr_to_write = write_chunk,
- .range_cyclic = 1,
- };
-
- get_dirty_limits(&background_thresh, &dirty_thresh,
- &bdi_thresh, bdi);
-
- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- nr_writeback = global_page_state(NR_WRITEBACK);
+ get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
+ /*
+ * In order to avoid the stacked BDI deadlock we need to ensure we
+ * accurately count the 'dirty' pages when the threshold is low.
+ *
+ * Otherwise it would be possible to get thresh+n pages reported dirty,
+ * even though there are thresh-m pages actually dirty; with m+n
+ * sitting in the percpu deltas.
+ */
+ if (bdi_thresh < 2*bdi_stat_error(bdi)) {
+ bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+ bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
+ } else {
bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+ }
- if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
- break;
-
- /*
- * Throttle it only when the background writeback cannot
- * catch-up. This avoids (excessively) small writeouts
- * when the bdi limits are ramping up.
- */
- if (nr_reclaimable + nr_writeback <
- (background_thresh + dirty_thresh) / 2)
- break;
-
- if (!bdi->dirty_exceeded)
- bdi->dirty_exceeded = 1;
-
- /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
- * Unstable writes are a feature of certain networked
- * filesystems (i.e. NFS) in which data may have been
- * written to the server's write cache, but has not yet
- * been flushed to permanent storage.
- * Only move pages to writeback if this bdi is over its
- * threshold otherwise wait until the disk writes catch
- * up.
- */
- if (bdi_nr_reclaimable > bdi_thresh) {
- writeback_inodes_wbc(&wbc);
- pages_written += write_chunk - wbc.nr_to_write;
- get_dirty_limits(&background_thresh, &dirty_thresh,
- &bdi_thresh, bdi);
- }
-
- /*
- * In order to avoid the stacked BDI deadlock we need
- * to ensure we accurately count the 'dirty' pages when
- * the threshold is low.
- *
- * Otherwise it would be possible to get thresh+n pages
- * reported dirty, even though there are thresh-m pages
- * actually dirty; with m+n sitting in the percpu
- * deltas.
- */
- if (bdi_thresh < 2*bdi_stat_error(bdi)) {
- bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
- bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
- } else if (bdi_nr_reclaimable) {
- bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
- bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
- }
+ if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) {
+ if (bdi->dirty_exceeded)
+ bdi->dirty_exceeded = 0;
+ return 0;
+ }
- if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
- break;
- if (pages_written >= write_chunk)
- break; /* We've done our duty */
+ nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS);
+ nr_writeback = global_page_state(NR_WRITEBACK);
+ /*
+ * Throttle it only when the background writeback cannot
+ * catch-up. This avoids (excessively) small writeouts
+ * when the bdi limits are ramping up.
+ */
+ if (nr_reclaimable + nr_writeback <
+ (background_thresh + dirty_thresh) / 2) {
+ if (bdi->dirty_exceeded)
+ bdi->dirty_exceeded = 0;
+ return 0;
+ }
- __set_current_state(TASK_INTERRUPTIBLE);
- io_schedule_timeout(pause);
+ if (!bdi->dirty_exceeded)
+ bdi->dirty_exceeded = 1;
- /*
- * Increase the delay for each loop, up to our previous
- * default of taking a 100ms nap.
- */
- pause <<= 1;
- if (pause > HZ / 10)
- pause = HZ / 10;
- }
+ return 1;
+}
- if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
- bdi->dirty_exceeded)
- bdi->dirty_exceeded = 0;
+/*
+ * balance_dirty_pages() must be called by processes which are generating dirty
+ * data. It looks at the number of dirty pages in the machine and will force
+ * the caller to perform writeback if the system is over `vm_dirty_ratio'.
+ * If we're over `background_thresh' then the writeback threads are woken to
+ * perform some writeout.
+ */
+static void balance_dirty_pages(struct address_space *mapping,
+ unsigned long write_chunk)
+{
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
- if (writeback_in_progress(bdi))
+ if (!dirty_limits_exceeded(bdi))
return;
- /*
- * In laptop mode, we wait until hitting the higher threshold before
- * starting background writeout, and then write out all the way down
- * to the lower threshold. So slow writers cause minimal disk activity.
- *
- * In normal mode, we start background writeout at the lower
- * background_thresh, to keep the amount of dirty memory low.
- */
- if ((laptop_mode && pages_written) ||
- (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
- + global_page_state(NR_UNSTABLE_NFS))
- > background_thresh)))
+ /* FIXME: We could start writeback for all bdis to get rid of dirty
+ * data there ASAP and use all the parallelism we can get... */
+ if (!writeback_in_progress(bdi))
bdi_start_writeback(bdi, NULL, 0);
+ bdi_wait_written(bdi, write_chunk);
}
void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -1295,10 +1242,13 @@ int test_clear_page_writeback(struct page *page)
PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi)) {
__dec_bdi_stat(bdi, BDI_WRITEBACK);
+ __inc_bdi_stat(bdi, BDI_WRITTEN);
__bdi_writeout_inc(bdi);
}
}
spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ if (bdi_stat(bdi, BDI_WRITTEN) >= bdi->wb_written_head)
+ bdi_wakeup_writers(bdi);
} else {
ret = TestClearPageWriteback(page);
}
--
1.6.4.2
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-17 18:04 [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread Jan Kara
@ 2010-06-18 6:09 ` Dave Chinner
2010-06-18 9:11 ` Peter Zijlstra
2010-06-21 23:36 ` Jan Kara
2010-06-18 10:21 ` Peter Zijlstra
` (2 subsequent siblings)
3 siblings, 2 replies; 43+ messages in thread
From: Dave Chinner @ 2010-06-18 6:09 UTC (permalink / raw)
To: Jan Kara; +Cc: linux-fsdevel, linux-mm, hch, akpm, peterz, wfg
On Thu, Jun 17, 2010 at 08:04:38PM +0200, Jan Kara wrote:
> This patch changes balance_dirty_pages() throttling so that the function does
> not submit writes on its own but rather waits for flusher thread to do enough
> writes. This has an advantage that we have a single source of IO allowing for
> better writeback locality. Also we do not have to reenter filesystems from a
> non-trivial context.
>
> The waiting is implemented as follows: Each BDI has a FIFO of wait requests.
> When balance_dirty_pages() finds it needs to throttle the writer, it adds a
> request for writing write_chunk of pages to the FIFO and waits until the
> request is fulfilled or we drop below dirty limits. A flusher thread tracks
> amount of pages it has written and when enough pages are written since the
> first request in the FIFO became first, it wakes up the waiter and removes
> request from the FIFO (thus another request becomes first and flusher thread
> starts writing out on behalf of this request).
>
> CC: hch@infradead.org
> CC: akpm@linux-foundation.org
> CC: peterz@infradead.org
> CC: wfg@mail.ustc.edu.cn
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
> include/linux/backing-dev.h | 18 +++++
> include/linux/writeback.h | 1 +
> mm/backing-dev.c | 119 +++++++++++++++++++++++++++++++
> mm/page-writeback.c | 164 +++++++++++++++----------------------------
> 4 files changed, 195 insertions(+), 107 deletions(-)
>
> This is mostly a proof-of-concept patch. It's mostly untested and probably
> doesn't completely work. I was just thinking about this for some time already
> and would like to get some feedback on the approach...
It matches the way I'd like to see writeback throttling work.
I think this approach is likely to have less jitter and latency
because throttling is not tied to a specific set of pages or IOs to
complete.
>
> +++ b/mm/backing-dev.c
> @@ -390,6 +390,122 @@ static void sync_supers_timer_fn(unsigned long unused)
> bdi_arm_supers_timer();
> }
>
> +/*
> + * Remove writer from the list, update wb_written_head as needed
> + *
> + * Needs wb_written_wait.lock held
> + */
> +static void bdi_remove_writer(struct backing_dev_info *bdi,
> + struct bdi_written_count *wc)
> +{
> + int first = 0;
> +
> + if (bdi->wb_written_list.next == &wc->list)
> + first = 1;
> + /* Initialize list so that entry owner knows it's removed */
> + list_del_init(&wc->list);
> + if (first) {
> + if (list_empty(&bdi->wb_written_list)) {
> + bdi->wb_written_head = ~(u64)0;
> + return;
> + }
> + wc = list_entry(bdi->wb_written_list.next,
> + struct bdi_written_count,
> + list);
list_first_entry()?
> + bdi->wb_written_head = bdi_stat(bdi, BDI_WRITTEN) + wc->written;
The resolution of the percpu counters is an issue here, I think.
percpu counters update in batches of 32 counts per CPU. wc->written
is going to have a value of roughly 8 or 32 depending on whether
bdi->dirty_exceeded is set or not. I note that you take this into
account when checking the dirty threshold limits, but it doesn't appear
to be taken into account here.
That means that for a ratelimit of 8 pages on writes and only one CPU doing
IO completion, wakeups are only going to occur in completion batches
of 32 pages. We'll end up with something like:
waiter, written count at head count when woken
A, 8 n n + 32
B, 8 n + 32 n + 64
C, 8 n + 64 n + 96
.....
And so on. This isn't necessarily bad - we'll throttle for longer
than we strictly need to - but the cumulative counter resolution
error gets worse as the number of CPUs doing IO completion grows.
Worst case ends up at (num cpus * 31) + 1 pages of writeback for
just the first waiter. For an arbitrary FIFO queue of depth d, the
worst case is more like d * (num cpus * 31 + 1).
Hence I think that instead of calculating the next wakeup threshold
when the head changes the wakeup threshold needs to be a function of
the FIFO depth. That is, it is calculated at queueing time from the
current tail of the queue.
e.g. something like this when queuing:
	if (list_empty(&bdi->wb_written_list)) {
		wc->wakeup_at = bdi_stat(bdi, BDI_WRITTEN) + written;
		bdi->wb_written_head = wc->wakeup_at;
	} else {
		tail = list_last_entry(&bdi->wb_written_list,
				       struct bdi_written_count, list);
		wc->wakeup_at = tail->wakeup_at + written;
	}
	list_add_tail(&wc->list, &bdi->wb_written_list);
And this when the wakeup threshold is tripped:
	written = bdi_stat(bdi, BDI_WRITTEN);
	while (!list_empty(&bdi->wb_written_list)) {
		wc = list_first_entry(&bdi->wb_written_list,
				      struct bdi_written_count, list);
		if (wc->wakeup_at > written)
			break;
		list_del_init(&wc->list);
		/* wake the task that queued wc */
		wakeup(wc);
	}
	if (!list_empty(&bdi->wb_written_list)) {
		wc = list_first_entry(&bdi->wb_written_list,
				      struct bdi_written_count, list);
		bdi->wb_written_head = wc->wakeup_at;
	} else
		bdi->wb_written_head = ~0ULL;
This takes the counter resolution completely out of the picture - if
the counter resolution is 32, and there are 4 waiters on the fifo
each waiting for 8 pages, then a single tick of the counter will
wake them all up.
Other than that, it seems like a good approach to me...
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-18 6:09 ` Dave Chinner
@ 2010-06-18 9:11 ` Peter Zijlstra
2010-06-18 23:29 ` Dave Chinner
2010-06-21 23:36 ` Jan Kara
1 sibling, 1 reply; 43+ messages in thread
From: Peter Zijlstra @ 2010-06-18 9:11 UTC (permalink / raw)
To: Dave Chinner; +Cc: Jan Kara, linux-fsdevel, linux-mm, hch, akpm, wfg
On Fri, 2010-06-18 at 16:09 +1000, Dave Chinner wrote:
> > + bdi->wb_written_head = bdi_stat(bdi, BDI_WRITTEN) + wc->written;
>
> The resolution of the percpu counters is an issue here, I think.
> percpu counters update in batches of 32 counts per CPU. wc->written
> is going to have a value of roughly 8 or 32 depending on whether
> bdi->dirty_exceeded is set or not. I note that you take this into
> account when checking dirty threshold limits, but it doesn't appear
> to be taken in to here.
The BDI stuff uses a custom batch-size, see bdi_stat_error() and
related. The total error is in the order of O(n log n) where n is the
number of CPUs.
But yeah, the whole dirty_exceeded thing makes life more interesting.
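For reference, the definitions in question look roughly like this (quoted
from memory from include/linux/backing-dev.h of this era, so treat the
exact form as approximate):
	#define BDI_STAT_BATCH	(8*(1+ilog2(nr_cpu_ids)))
	static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi)
	{
	#ifdef CONFIG_SMP
		return nr_cpu_ids * BDI_STAT_BATCH;
	#else
		return 1;
	#endif
	}
so the worst-case drift is nr_cpu_ids * 8 * (1 + ilog2(nr_cpu_ids)) pages,
which is where the O(n log n) bound comes from.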
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-17 18:04 [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread Jan Kara
2010-06-18 6:09 ` Dave Chinner
@ 2010-06-18 10:21 ` Peter Zijlstra
2010-06-21 13:31 ` Jan Kara
2010-06-18 10:21 ` Peter Zijlstra
2010-06-18 10:21 ` Peter Zijlstra
3 siblings, 1 reply; 43+ messages in thread
From: Peter Zijlstra @ 2010-06-18 10:21 UTC (permalink / raw)
To: Jan Kara; +Cc: linux-fsdevel, linux-mm, hch, akpm, wfg
On Thu, 2010-06-17 at 20:04 +0200, Jan Kara wrote:
> + /*
> + * Now we can wakeup the writer which frees wc entry
> + * The barrier is here so that woken task sees the
> + * modification of wc.
> + */
> + smp_wmb();
> + __wake_up_locked(&bdi->wb_written_wait, TASK_NORMAL);
wakeups imply a wmb.
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-17 18:04 [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread Jan Kara
2010-06-18 6:09 ` Dave Chinner
2010-06-18 10:21 ` Peter Zijlstra
@ 2010-06-18 10:21 ` Peter Zijlstra
2010-06-21 14:02 ` Jan Kara
2010-06-18 10:21 ` Peter Zijlstra
3 siblings, 1 reply; 43+ messages in thread
From: Peter Zijlstra @ 2010-06-18 10:21 UTC (permalink / raw)
To: Jan Kara; +Cc: linux-fsdevel, linux-mm, hch, akpm, wfg
On Thu, 2010-06-17 at 20:04 +0200, Jan Kara wrote:
> +/* Wait until write_chunk is written or we get below dirty limits */
> +void bdi_wait_written(struct backing_dev_info *bdi, long write_chunk)
> +{
> + struct bdi_written_count wc = {
> + .list = LIST_HEAD_INIT(wc.list),
> + .written = write_chunk,
> + };
> + DECLARE_WAITQUEUE(wait, current);
> + int pause = 1;
> +
> + bdi_add_writer(bdi, &wc, &wait);
> + for (;;) {
> + if (signal_pending_state(TASK_KILLABLE, current))
> + break;
> +
> + /*
> + * Make the task just killable so that tasks cannot circumvent
> + * throttling by sending themselves non-fatal signals...
> + */
> + __set_current_state(TASK_KILLABLE);
> + io_schedule_timeout(pause);
> +
> + /*
> + * The following check is save without wb_written_wait.lock
> + * because once bdi_remove_writer removes us from the list
> + * noone will touch us and it's impossible for list_empty check
> + * to trigger as false positive. The barrier is there to avoid
> + * missing the wakeup when we are removed from the list.
> + */
> + smp_rmb();
> + if (list_empty(&wc.list))
> + break;
> +
> + if (!dirty_limits_exceeded(bdi))
> + break;
> +
> + /*
> + * Increase the delay for each loop, up to our previous
> + * default of taking a 100ms nap.
> + */
> + pause <<= 1;
> + if (pause > HZ / 10)
> + pause = HZ / 10;
> + }
> +
> + spin_lock_irq(&bdi->wb_written_wait.lock);
> + __remove_wait_queue(&bdi->wb_written_wait, &wait);
> + if (!list_empty(&wc.list))
> + bdi_remove_writer(bdi, &wc);
> + spin_unlock_irq(&bdi->wb_written_wait.lock);
> +}
OK, so the whole pause thing is simply because we don't get a wakeup
when we drop below the limit, right?
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-17 18:04 [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread Jan Kara
` (2 preceding siblings ...)
2010-06-18 10:21 ` Peter Zijlstra
@ 2010-06-18 10:21 ` Peter Zijlstra
2010-06-21 13:42 ` Jan Kara
3 siblings, 1 reply; 43+ messages in thread
From: Peter Zijlstra @ 2010-06-18 10:21 UTC (permalink / raw)
To: Jan Kara; +Cc: linux-fsdevel, linux-mm, hch, akpm, wfg
On Thu, 2010-06-17 at 20:04 +0200, Jan Kara wrote:
> + if (bdi_stat(bdi, BDI_WRITTEN) >= bdi->wb_written_head)
> + bdi_wakeup_writers(bdi);
For the paranoid amongst us you could make wb_written_head s64 and write
the above as:
if (bdi_stat(bdi, BDI_WRITTEN) - bdi->wb_written_head > 0)
Which, if you assume both are monotonic and wb_written_head is always
within 2^63 of the actual bdi_stat() value, should give the same end
result and deal with wrap-around.
For when we manage to create a device that can write 2^64 pages in our
uptime :-)
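(To spell the wrap-around trick out with small numbers: suppose
wb_written_head was last set just below the wrap point, with a bit pattern
that reads as -16 when interpreted as s64, and BDI_WRITTEN has since
wrapped around to 16. The unsigned test 16 >= wb_written_head is false, so
we would never wake anyone up, while the signed difference
16 - (-16) = 32 > 0 still fires correctly.)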
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-18 9:11 ` Peter Zijlstra
@ 2010-06-18 23:29 ` Dave Chinner
0 siblings, 0 replies; 43+ messages in thread
From: Dave Chinner @ 2010-06-18 23:29 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Jan Kara, linux-fsdevel, linux-mm, hch, akpm, wfg
On Fri, Jun 18, 2010 at 11:11:00AM +0200, Peter Zijlstra wrote:
> On Fri, 2010-06-18 at 16:09 +1000, Dave Chinner wrote:
> > > + bdi->wb_written_head = bdi_stat(bdi, BDI_WRITTEN) + wc->written;
> >
> > The resolution of the percpu counters is an issue here, I think.
> > percpu counters update in batches of 32 counts per CPU. wc->written
> > is going to have a value of roughly 8 or 32 depending on whether
> > bdi->dirty_exceeded is set or not. I note that you take this into
> > account when checking dirty threshold limits, but it doesn't appear
> > to be taken in to here.
>
> The BDI stuff uses a custom batch-size, see bdi_stat_error() and
> related. The total error is in the order of O(n log n) where n is the
> number of CPUs.
#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
A dual socket server I have here:
[ 0.000000] NR_CPUS:512 nr_cpumask_bits:512 nr_cpu_ids:32 nr_node_ids:2
Which means that the bdi per-cpu counter batch size on it would be 48
and the inaccuracy would be even bigger than I described. ;)
> But yeah, the whole dirty_exceeded thing makes life more
> interesting.
I suspect the 8 vs 32 pages could go away without too much impact
with Jan's mechanism...
Another thing to consider is the impact of multipage writes on the
incoming value to balance_dirty_pages - if we get 1024 page writes
running, then that may impact on the way throttling behaves, too.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-18 10:21 ` Peter Zijlstra
@ 2010-06-21 13:31 ` Jan Kara
0 siblings, 0 replies; 43+ messages in thread
From: Jan Kara @ 2010-06-21 13:31 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Jan Kara, linux-fsdevel, linux-mm, hch, akpm, wfg
On Fri 18-06-10 12:21:35, Peter Zijlstra wrote:
> On Thu, 2010-06-17 at 20:04 +0200, Jan Kara wrote:
> > + /*
> > + * Now we can wakeup the writer which frees wc entry
> > + * The barrier is here so that woken task sees the
> > + * modification of wc.
> > + */
> > + smp_wmb();
> > + __wake_up_locked(&bdi->wb_written_wait, TASK_NORMAL);
>
> wakeups imply a wmb.
Thanks. Removed smp_wmb and updated comment.
Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-18 10:21 ` Peter Zijlstra
@ 2010-06-21 13:42 ` Jan Kara
2010-06-22 4:07 ` Wu Fengguang
0 siblings, 1 reply; 43+ messages in thread
From: Jan Kara @ 2010-06-21 13:42 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Jan Kara, linux-fsdevel, linux-mm, hch, akpm, wfg
On Fri 18-06-10 12:21:37, Peter Zijlstra wrote:
> On Thu, 2010-06-17 at 20:04 +0200, Jan Kara wrote:
> > + if (bdi_stat(bdi, BDI_WRITTEN) >= bdi->wb_written_head)
> > + bdi_wakeup_writers(bdi);
>
> For the paranoid amongst us you could make wb_written_head s64 and write
> the above as:
>
> if (bdi_stat(bdi, BDI_WRITTEN) - bdi->wb_written_head > 0)
>
> Which, if you assume both are monotonic and wb_written_head is always
> within 2^63 of the actual bdi_stat() value, should give the same end
> result and deal with wrap-around.
>
> For when we manage to create a device that can write 2^64 pages in our
> uptime :-)
OK, the fix is simple enough so I've changed it, although I'm not
paranoid enough ;) (I actually did the math before writing that test).
Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-18 10:21 ` Peter Zijlstra
@ 2010-06-21 14:02 ` Jan Kara
2010-06-21 14:10 ` Jan Kara
0 siblings, 1 reply; 43+ messages in thread
From: Jan Kara @ 2010-06-21 14:02 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Jan Kara, linux-fsdevel, linux-mm, hch, akpm, wfg
On Fri 18-06-10 12:21:36, Peter Zijlstra wrote:
> On Thu, 2010-06-17 at 20:04 +0200, Jan Kara wrote:
> > +/* Wait until write_chunk is written or we get below dirty limits */
> > +void bdi_wait_written(struct backing_dev_info *bdi, long write_chunk)
> > +{
> > + struct bdi_written_count wc = {
> > + .list = LIST_HEAD_INIT(wc.list),
> > + .written = write_chunk,
> > + };
> > + DECLARE_WAITQUEUE(wait, current);
> > + int pause = 1;
> > +
> > + bdi_add_writer(bdi, &wc, &wait);
> > + for (;;) {
> > + if (signal_pending_state(TASK_KILLABLE, current))
> > + break;
> > +
> > + /*
> > + * Make the task just killable so that tasks cannot circumvent
> > + * throttling by sending themselves non-fatal signals...
> > + */
> > + __set_current_state(TASK_KILLABLE);
> > + io_schedule_timeout(pause);
> > +
> > + /*
> > + * The following check is save without wb_written_wait.lock
> > + * because once bdi_remove_writer removes us from the list
> > + * noone will touch us and it's impossible for list_empty check
> > + * to trigger as false positive. The barrier is there to avoid
> > + * missing the wakeup when we are removed from the list.
> > + */
> > + smp_rmb();
> > + if (list_empty(&wc.list))
> > + break;
> > +
> > + if (!dirty_limits_exceeded(bdi))
> > + break;
> > +
> > + /*
> > + * Increase the delay for each loop, up to our previous
> > + * default of taking a 100ms nap.
> > + */
> > + pause <<= 1;
> > + if (pause > HZ / 10)
> > + pause = HZ / 10;
> > + }
> > +
> > + spin_lock_irq(&bdi->wb_written_wait.lock);
> > + __remove_wait_queue(&bdi->wb_written_wait, &wait);
> > + if (!list_empty(&wc.list))
> > + bdi_remove_writer(bdi, &wc);
> > + spin_unlock_irq(&bdi->wb_written_wait.lock);
> > +}
>
> OK, so the whole pause thing is simply because we don't get a wakeup
> when we drop below the limit, right?
Yes. I will write a comment about it before the loop. I was also thinking
about sending a wakeup when we get below limits but then all threads would
start thundering the device at the same time and likely cause congestion
again. This way we might get a smoother start. But I'll have to measure
whether we aren't too unfair with this approach...
Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-21 14:02 ` Jan Kara
@ 2010-06-21 14:10 ` Jan Kara
2010-06-21 14:12 ` Peter Zijlstra
0 siblings, 1 reply; 43+ messages in thread
From: Jan Kara @ 2010-06-21 14:10 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Jan Kara, linux-fsdevel, linux-mm, hch, akpm, wfg
On Mon 21-06-10 16:02:37, Jan Kara wrote:
> On Fri 18-06-10 12:21:36, Peter Zijlstra wrote:
> > On Thu, 2010-06-17 at 20:04 +0200, Jan Kara wrote:
> > > +/* Wait until write_chunk is written or we get below dirty limits */
> > > +void bdi_wait_written(struct backing_dev_info *bdi, long write_chunk)
> > > +{
> > > + struct bdi_written_count wc = {
> > > + .list = LIST_HEAD_INIT(wc.list),
> > > + .written = write_chunk,
> > > + };
> > > + DECLARE_WAITQUEUE(wait, current);
> > > + int pause = 1;
> > > +
> > > + bdi_add_writer(bdi, &wc, &wait);
> > > + for (;;) {
> > > + if (signal_pending_state(TASK_KILLABLE, current))
> > > + break;
> > > +
> > > + /*
> > > + * Make the task just killable so that tasks cannot circumvent
> > > + * throttling by sending themselves non-fatal signals...
> > > + */
> > > + __set_current_state(TASK_KILLABLE);
> > > + io_schedule_timeout(pause);
> > > +
> > > + /*
> > > + * The following check is save without wb_written_wait.lock
> > > + * because once bdi_remove_writer removes us from the list
> > > + * noone will touch us and it's impossible for list_empty check
> > > + * to trigger as false positive. The barrier is there to avoid
> > > + * missing the wakeup when we are removed from the list.
> > > + */
> > > + smp_rmb();
> > > + if (list_empty(&wc.list))
> > > + break;
> > > +
> > > + if (!dirty_limits_exceeded(bdi))
> > > + break;
> > > +
> > > + /*
> > > + * Increase the delay for each loop, up to our previous
> > > + * default of taking a 100ms nap.
> > > + */
> > > + pause <<= 1;
> > > + if (pause > HZ / 10)
> > > + pause = HZ / 10;
> > > + }
> > > +
> > > + spin_lock_irq(&bdi->wb_written_wait.lock);
> > > + __remove_wait_queue(&bdi->wb_written_wait, &wait);
> > > + if (!list_empty(&wc.list))
> > > + bdi_remove_writer(bdi, &wc);
> > > + spin_unlock_irq(&bdi->wb_written_wait.lock);
> > > +}
> >
> > OK, so the whole pause thing is simply because we don't get a wakeup
> > when we drop below the limit, right?
> Yes. I will write a comment about it before the loop. I was also thinking
> about sending a wakeup when we get below limits but then all threads would
> start thundering the device at the same time and likely cause a congestion
> again. This way we might get a smoother start. But I'll have to measure
> whether we aren't too unfair with this approach...
I just got an idea - if the sleeping is too unfair (as threads at the end
of the FIFO are likely to have a smaller 'pause' and thus could find out
earlier that the system is below dirty limits), we could share 'pause'
among all threads waiting for that BDI. That way threads would wake up
in a FIFO order...
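A rough sketch of what I mean - the field name wb_pause is made up purely
for illustration, and it would of course have to be reset to 1 somewhere
once the bdi drops below its limits:
	/* read and bump the shared back-off under the existing lock */
	spin_lock_irq(&bdi->wb_written_wait.lock);
	pause = bdi->wb_pause;
	bdi->wb_pause = min(pause << 1, HZ / 10);
	spin_unlock_irq(&bdi->wb_written_wait.lock);
	__set_current_state(TASK_KILLABLE);
	io_schedule_timeout(pause);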
Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-21 14:10 ` Jan Kara
@ 2010-06-21 14:12 ` Peter Zijlstra
0 siblings, 0 replies; 43+ messages in thread
From: Peter Zijlstra @ 2010-06-21 14:12 UTC (permalink / raw)
To: Jan Kara; +Cc: linux-fsdevel, linux-mm, hch, akpm, wfg
On Mon, 2010-06-21 at 16:10 +0200, Jan Kara wrote:
> I just got an idea - if the sleeping is too unfair (as threads at the end
> of the FIFO are likely to have 'pause' smaller and thus could find out
> earlier that the system is below dirty limits), we could share 'pause'
> among all threads waiting for that BDI. That way threads would wake up
> in a FIFO order...
Sounds sensible.
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-18 6:09 ` Dave Chinner
2010-06-18 9:11 ` Peter Zijlstra
@ 2010-06-21 23:36 ` Jan Kara
2010-06-22 5:44 ` Dave Chinner
1 sibling, 1 reply; 43+ messages in thread
From: Jan Kara @ 2010-06-21 23:36 UTC (permalink / raw)
To: Dave Chinner; +Cc: Jan Kara, linux-fsdevel, linux-mm, hch, akpm, peterz, wfg
On Fri 18-06-10 16:09:01, Dave Chinner wrote:
> On Thu, Jun 17, 2010 at 08:04:38PM +0200, Jan Kara wrote:
> > This patch changes balance_dirty_pages() throttling so that the function does
> > not submit writes on its own but rather waits for flusher thread to do enough
> > writes. This has an advantage that we have a single source of IO allowing for
> > better writeback locality. Also we do not have to reenter filesystems from a
> > non-trivial context.
> >
> > The waiting is implemented as follows: Each BDI has a FIFO of wait requests.
> > When balance_dirty_pages() finds it needs to throttle the writer, it adds a
> > request for writing write_chunk of pages to the FIFO and waits until the
> > request is fulfilled or we drop below dirty limits. A flusher thread tracks
> > amount of pages it has written and when enough pages are written since the
> > first request in the FIFO became first, it wakes up the waiter and removes
> > request from the FIFO (thus another request becomes first and flusher thread
> > starts writing out on behalf of this request).
> >
> > CC: hch@infradead.org
> > CC: akpm@linux-foundation.org
> > CC: peterz@infradead.org
> > CC: wfg@mail.ustc.edu.cn
> > Signed-off-by: Jan Kara <jack@suse.cz>
...
> > + wc = list_entry(bdi->wb_written_list.next,
> > + struct bdi_written_count,
> > + list);
>
> list_first_entry()?
Done, thanks.
>
> > + bdi->wb_written_head = bdi_stat(bdi, BDI_WRITTEN) + wc->written;
>
> The resolution of the percpu counters is an issue here, I think.
> percpu counters update in batches of 32 counts per CPU. wc->written
> is going to have a value of roughly 8 or 32 depending on whether
> bdi->dirty_exceeded is set or not. I note that you take this into
> account when checking dirty threshold limits, but it doesn't appear
> to be taken in to here.
Hmm, are you sure about the number of pages? I think that the ratelimits
you speak about influence only how often we *check* the limits. Looking at
sync_writeback_pages() I see:
static inline long sync_writeback_pages(unsigned long dirtied)
{
if (dirtied < ratelimit_pages)
dirtied = ratelimit_pages;
return dirtied + dirtied / 2;
}
So it returns at least ratelimit_pages * 3/2. Now ratelimit_pages is
computed in writeback_set_ratelimit() and for most machines I expect the
math there to end up with ratelimit_pages == 1024. So we enter
balance_dirty_pages() with a number like 1536... That being said, the error of
the percpu counters can still be significant - with 16 CPUs doing completions
the average error is 384 (on your max_cpus == 512 machine - our distro
kernels have that too).
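(For concreteness: with nr_cpu_ids == 32 the per-cpu batch is
8 * (1 + ilog2(32)) = 48 pages, so 16 CPUs doing completions can hide up to
16 * 48 = 768 pages in the per-cpu deltas, and on average roughly half of
that - hence the 384.)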
> That means for ratelimits of 8 pages on writes and only one CPU doing
> IO completion wakeups are only going to occur in completion batches
> of 32 pages. we'll end up with something like:
>
> waiter, written count at head count when woken
> A, 8 n n + 32
> B, 8 n + 32 n + 64
> C, 8 n + 64 n + 96
> .....
>
> And so on. This isn't necessarily bad - we'll throttle for longer
> than we strictly need to - but the cumulative counter resolution
> error gets worse as the number of CPUs doing IO completion grows.
> Worst case ends up at for (num cpus * 31) + 1 pages of writeback for
> just the first waiter. For an arbitrary FIFO queue of depth d, the
> worst case is more like d * (num cpus * 31 + 1).
Hmm, I don't see how the error would depend on the FIFO depth. I'd rather
think that the number of written pages after which a thread is woken is a
random variable uniformly distributed in the range
<wanted_nr - max_counter_err, wanted_nr + max_counter_err>. And over time
(or with more threads in the queue) these errors cancel out... Now as I
wrote above, the error for a single waiter might be just too large to accept,
but I just wanted to argue here that I don't see how it would be getting
worse with more waiters.
> Hence I think that instead of calculating the next wakeup threshold
> when the head changes the wakeup threshold needs to be a function of
> the FIFO depth. That is, it is calculated at queueing time from the
> current tail of the queue.
>
> e.g. something like this when queuing:
>
> if (list_empty(&bdi->wb_written_list))
> wc->wakeup_at = bdi_stat(bdi, BDI_WRITTEN) + written;
> bdi->wb_written_head = wc->wakeup_at;
> else {
> tail = list_last_entry(&bdi->wb_written_list);
> wc->wakeup_at = tail->wakeup_at + written;
> }
> list_add_tail(&wc->list, &bdi->wb_written_list);
I specifically wanted to avoid computing the wake up time while queueing
because if some thread quits the queue (a signal is currently probably the
only good reason; the system dropping below the dirty limits for a short
while so that only a few threads notice it is an arguable one), you would
need to recompute the thresholds or live with waiting unnecessarily long.
But we could compute next wake up time from wb_written_head (when valid)
which would make more sense. I like that.
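Something like this in bdi_remove_writer() - just a sketch, and it does not
try to be clever about the case where the head waiter leaves the queue
early:
	if (first) {
		if (list_empty(&bdi->wb_written_list)) {
			bdi->wb_written_head = ~(u64)0;
			return;
		}
		wc = list_first_entry(&bdi->wb_written_list,
				      struct bdi_written_count, list);
		/*
		 * Base the new threshold on the old one rather than on the
		 * (coarse-grained) current counter value, so the per-cpu
		 * batching error does not accumulate along the queue.
		 */
		bdi->wb_written_head += wc->written;
	}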
> And this when the wakeup threshold is tripped:
>
> written = bdi_stat(bdi, BDI_WRITTEN);
> while (!list_empty(&bdi->wb_written_list)) {
> wc = list_first_entry();
>
> if (wc->wakeup_at > written)
> break;
>
> list_del_init(wc)
> wakeup(wc)
> }
>
> if (!list_empty(&bdi->wb_written_list)) {
> wc = list_first_entry();
> bdi->wb_written_head = wc->wakeup_at;
> } else
> bdi->wb_written_head = ~0ULL;
>
> This takes the counter resolution completely out of the picture - if
> the counter resolution is 32, and there are 4 waiters on the fifo
> each waiting for 8 pages, then a single tick of the counter will
> wake them all up.
Yeah, this would work with what I suggest above too. One just has to
write the loop carefully enough ;). Thanks for great suggestions.
Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-21 13:42 ` Jan Kara
@ 2010-06-22 4:07 ` Wu Fengguang
2010-06-22 13:27 ` Jan Kara
0 siblings, 1 reply; 43+ messages in thread
From: Wu Fengguang @ 2010-06-22 4:07 UTC (permalink / raw)
To: Jan Kara; +Cc: Peter Zijlstra, linux-fsdevel, linux-mm, hch, akpm
On Mon, Jun 21, 2010 at 03:42:39PM +0200, Jan Kara wrote:
> On Fri 18-06-10 12:21:37, Peter Zijlstra wrote:
> > On Thu, 2010-06-17 at 20:04 +0200, Jan Kara wrote:
> > > + if (bdi_stat(bdi, BDI_WRITTEN) >= bdi->wb_written_head)
> > > + bdi_wakeup_writers(bdi);
> >
> > For the paranoid amongst us you could make wb_written_head s64 and write
> > the above as:
> >
> > if (bdi_stat(bdi, BDI_WRITTEN) - bdi->wb_written_head > 0)
> >
> > Which, if you assume both are monotonic and wb_written_head is always
> > within 2^63 of the actual bdi_stat() value, should give the same end
> > result and deal with wrap-around.
> >
> > For when we manage to create a device that can write 2^64 pages in our
> > uptime :-)
> OK, the fix is simple enough so I've changed it, although I'm not
> paranoic enough ;) (I actually did the math before writing that test).
a bit more change :)
type:
- u64 wb_written_head
+ s64 wb_written_head
resetting:
- bdi->wb_written_head = ~(u64)0;
+ bdi->wb_written_head = 0;
setting:
bdi->wb_written_head = bdi_stat(bdi, BDI_WRITTEN) + wc->written;
+ bdi->wb_written_head |= 1;
testing:
if (bdi->wb_written_head &&
bdi_stat(bdi, BDI_WRITTEN) - bdi->wb_written_head > 0)
This avoids calling into bdi_wakeup_writers() pointlessly when no one
is being throttled (which is the normal case).
Thanks,
Fengguang
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-21 23:36 ` Jan Kara
@ 2010-06-22 5:44 ` Dave Chinner
2010-06-22 6:14 ` Andrew Morton
2010-06-22 11:19 ` Jan Kara
0 siblings, 2 replies; 43+ messages in thread
From: Dave Chinner @ 2010-06-22 5:44 UTC (permalink / raw)
To: Jan Kara; +Cc: linux-fsdevel, linux-mm, hch, akpm, peterz, wfg
On Tue, Jun 22, 2010 at 01:36:29AM +0200, Jan Kara wrote:
> On Fri 18-06-10 16:09:01, Dave Chinner wrote:
> > On Thu, Jun 17, 2010 at 08:04:38PM +0200, Jan Kara wrote:
> > > This patch changes balance_dirty_pages() throttling so that the function does
> > > not submit writes on its own but rather waits for flusher thread to do enough
> > > writes. This has an advantage that we have a single source of IO allowing for
> > > better writeback locality. Also we do not have to reenter filesystems from a
> > > non-trivial context.
> > >
> > > The waiting is implemented as follows: Each BDI has a FIFO of wait requests.
> > > When balance_dirty_pages() finds it needs to throttle the writer, it adds a
> > > request for writing write_chunk of pages to the FIFO and waits until the
> > > request is fulfilled or we drop below dirty limits. A flusher thread tracks
> > > amount of pages it has written and when enough pages are written since the
> > > first request in the FIFO became first, it wakes up the waiter and removes
> > > request from the FIFO (thus another request becomes first and flusher thread
> > > starts writing out on behalf of this request).
> > >
> > > CC: hch@infradead.org
> > > CC: akpm@linux-foundation.org
> > > CC: peterz@infradead.org
> > > CC: wfg@mail.ustc.edu.cn
> > > Signed-off-by: Jan Kara <jack@suse.cz>
> ...
> > > + wc = list_entry(bdi->wb_written_list.next,
> > > + struct bdi_written_count,
> > > + list);
> >
> > list_first_entry()?
> Done, thanks.
>
> >
> > > + bdi->wb_written_head = bdi_stat(bdi, BDI_WRITTEN) + wc->written;
> >
> > The resolution of the percpu counters is an issue here, I think.
> > percpu counters update in batches of 32 counts per CPU. wc->written
> > is going to have a value of roughly 8 or 32 depending on whether
> > bdi->dirty_exceeded is set or not. I note that you take this into
> > account when checking dirty threshold limits, but it doesn't appear
> > to be taken in to here.
> Hmm, are you sure about the number of pages? I think that the ratelimits
> you speak about influence only how often we *check* the limits. Looking at
> sync_writeback_pages() I see:
> static inline long sync_writeback_pages(unsigned long dirtied)
> {
> if (dirtied < ratelimit_pages)
> dirtied = ratelimit_pages;
>
> return dirtied + dirtied / 2;
> }
It's declared as:
/*
 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
 * will look to see if it needs to force writeback or throttling.
 */
static long ratelimit_pages = 32;
> So it returns at least ratelimit_pages * 3/2. Now ratelimit_pages is
> computed in writeback_set_ratelimit() and for most machines I expect the
> math there to end up with ratelimit_pages == 1024. So we enter
> balance_dirty_pages with number 1536...
But I missed this. That means we typically will check every 1024
pages until dirty_exceeded is set, then we'll check every 8 pages.
> That being said, the error of
> percpu counters can still be significant - with 16 CPUs doing completion
> the average error is 384 (on your max_cpus == 512 machine - our distro
> kernels have that too).
The error gets larger as the number of CPUs goes up - as Peter pointed
out, because:
#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
So for a system reporting nr_cpu_ids = 32 (as my 8 core nehalem
server currently reports), the error could be 16 * 48 = 768.
> > That means for ratelimits of 8 pages on writes and only one CPU doing
> > IO completion wakeups are only going to occur in completion batches
> > of 32 pages. we'll end up with something like:
> >
> > waiter, written count at head count when woken
> > A, 8 n n + 32
> > B, 8 n + 32 n + 64
> > C, 8 n + 64 n + 96
> > .....
> >
> > And so on. This isn't necessarily bad - we'll throttle for longer
> > than we strictly need to - but the cumulative counter resolution
> > error gets worse as the number of CPUs doing IO completion grows.
> > Worst case ends up at for (num cpus * 31) + 1 pages of writeback for
> > just the first waiter. For an arbitrary FIFO queue of depth d, the
> > worst case is more like d * (num cpus * 31 + 1).
> Hmm, I don't see how the error would depend on the FIFO depth.
It's the cumulative error that depends on the FIFO depth, not the
error seen by a single waiter.
> I'd rather
> think that the number of written pages after which a thread is woken is a
> random variable equally distributed in a range
> <wanted_nr - max_counter_err, wanted_nr + max_counter_err>.
Right, that's the error range seen by a single waiter, i.e. after setting
bdi->wb_written_head = bdi_stat(written) + wc->pages_to_wait_for;
If we only want to wait for, say, 10 pages from a counter value of
N, the per-cpu counter increments in batches of BDI_STAT_BATCH.
Hence the first time we see bdi_stat(written) pass
bdi->wb_written_head will be when it reaches (N + BDI_STAT_BATCH).
We will have waited (BDI_STAT_BATCH - 10) pages more than we needed
to, i.e. the wait time is a minimum of roundup(pages_to_wait_for,
BDI_STAT_BATCH).
> And over time
> (or with more threads in the queue) these errors cancel out... Now as I
> wrote above the error for single waiter might be just too large to accept
> but I just wanted to argue here that I don't see how it would be getting
> worse with more waiters.
That's where I think you go wrong - the error occurs when we remove
the waiter at the head of the queue, so each subsequent waiter has a
resolution error introduced. These will continue to aggregate as
long as there are more waiters on the queue.
e.g. if we've got 3 waiters of 10 pages each, they should all be
woken after 30 pages have been cleared. Instead, the first is only
woken after the counter ticks (BDI_STAT_BATCH), then the second is
queued, and it doesn't get woken for another counter tick. Similarly
for the third waiter. When the third waiter is woken, it has waited
for 3 * BDI_STAT_BATCH pages. In reality, what we really wanted was
all three waiters to be woken by the time 30 pages had been written
back, i.e. the last waiter has waited (3 * (BDI_STAT_BATCH - 10)) pages too long,
which in general terms gives a wait overshoot for any given waiter of
roughly (queue depth when queued * BDI_STAT_BATCH * nr_cpu_ids) pages.
However, if we calculate the wakeup time when we queue, each waiter
ends up with a wakeup time of N + 10, N + 20, and N + 30, and the
counter tick resolution is taken out of the picture. i.e. if the
tick is 8, then the first wakeup occurs after 2 counter ticks, the
second is woken after 3 counter ticks and the last is woken after 4
counter ticks. If the counter tick is >= 32, then all three will be
woken on the first counter tick. In this case, the worst case wait
overshoot for any waiter regardless of queue depth is
(BDI_STAT_BATCH * nr_cpu_ids) pages.
> > Hence I think that instead of calculating the next wakeup threshold
> > when the head changes the wakeup threshold needs to be a function of
> > the FIFO depth. That is, it is calculated at queueing time from the
> > current tail of the queue.
> >
> > e.g. something like this when queuing:
> >
> > if (list_empty(&bdi->wb_written_list))
> > wc->wakeup_at = bdi_stat(bdi, BDI_WRITTEN) + written;
> > bdi->wb_written_head = wc->wakeup_at;
> > else {
> > tail = list_last_entry(&bdi->wb_written_list);
> > wc->wakeup_at = tail->wakeup_at + written;
> > }
> > list_add_tail(&wc->list, &bdi->wb_written_list);
> I specifically wanted to avoid computing the wake up time while queueing
> because if some thread quits the queue (a signal is currently probably the
> only good reason;
Fair point.
> if a system gets below dirty limits for a short while so
> that only a few threads notice is an arguable reason), you would need to
> recompute them or live with waiting unnecessarily long.
I'd suggest that we shouldn't be waking waiters prematurely if we've
dropped below the dirty threshold - we need some hysteresis in the
system to prevent oscillating rapidly around the threshold. If we
are not issuing IO, then if we don't damp the wakeup frequency we
could easily thrash around the threshold.
I'd prefer to strip down the mechanism to the simplest, most
accurate form before trying to add "interesting" tweaks like faster
wakeup semantics at dirty thresholds....
> But we could compute next wake up time from wb_written_head (when valid)
> which would make more sense. I like that.
Yes, I think that would work because everything would then be delta
values from a constant base - it just delays the wakeup threshold
calculation until the waiter hits the head of the queue.....
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 5:44 ` Dave Chinner
@ 2010-06-22 6:14 ` Andrew Morton
2010-06-22 7:45 ` Peter Zijlstra
2010-06-22 10:09 ` Dave Chinner
2010-06-22 11:19 ` Jan Kara
1 sibling, 2 replies; 43+ messages in thread
From: Andrew Morton @ 2010-06-22 6:14 UTC (permalink / raw)
To: Dave Chinner; +Cc: Jan Kara, linux-fsdevel, linux-mm, hch, peterz, wfg
On Tue, 22 Jun 2010 15:44:09 +1000 Dave Chinner <david@fromorbit.com> wrote:
> > > And so on. This isn't necessarily bad - we'll throttle for longer
> > > than we strictly need to - but the cumulative counter resolution
> > > error gets worse as the number of CPUs doing IO completion grows.
> > > Worst case ends up at for (num cpus * 31) + 1 pages of writeback for
> > > just the first waiter. For an arbitrary FIFO queue of depth d, the
> > > worst case is more like d * (num cpus * 31 + 1).
> > Hmm, I don't see how the error would depend on the FIFO depth.
>
> It's the cumulative error that depends on the FIFO depth, not the
> error seen by a single waiter.
Could use the below to basically eliminate the inaccuracies.
Obviously things might get a bit expensive in certain threshold cases
but with some hysteresis that should be manageable.
From: Tim Chen <tim.c.chen@linux.intel.com>
Add percpu_counter_compare that allows for a quick but accurate comparison
of percpu_counter with a given value.
A rough count is provided by the count field in percpu_counter structure,
without accounting for the other values stored in individual cpu counters.
The actual count is the sum of count and the per-cpu counters. However, the
count field never differs from the actual value by more than
batch*num_online_cpus(). So if count differs from the given value by more
than this margin, the comparison can be decided from the rough count alone,
without summing up all the per-cpu counters.
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/percpu_counter.h | 11 +++++++++++
lib/percpu_counter.c | 27 +++++++++++++++++++++++++++
2 files changed, 38 insertions(+)
diff -puN include/linux/percpu_counter.h~tmpfs-add-accurate-compare-function-to-percpu_counter-library include/linux/percpu_counter.h
--- a/include/linux/percpu_counter.h~tmpfs-add-accurate-compare-function-to-percpu_counter-library
+++ a/include/linux/percpu_counter.h
@@ -40,6 +40,7 @@ void percpu_counter_destroy(struct percp
void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
s64 __percpu_counter_sum(struct percpu_counter *fbc);
+int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs);
static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
@@ -98,6 +99,16 @@ static inline void percpu_counter_set(st
fbc->count = amount;
}
+static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
+{
+ if (fbc->count > rhs)
+ return 1;
+ else if (fbc->count < rhs)
+ return -1;
+ else
+ return 0;
+}
+
static inline void
percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
diff -puN lib/percpu_counter.c~tmpfs-add-accurate-compare-function-to-percpu_counter-library lib/percpu_counter.c
--- a/lib/percpu_counter.c~tmpfs-add-accurate-compare-function-to-percpu_counter-library
+++ a/lib/percpu_counter.c
@@ -138,6 +138,33 @@ static int __cpuinit percpu_counter_hotc
return NOTIFY_OK;
}
+/*
+ * Compare counter against given value.
+ * Return 1 if greater, 0 if equal and -1 if less
+ */
+int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
+{
+ s64 count;
+
+ count = percpu_counter_read(fbc);
+ /* Check to see if rough count will be sufficient for comparison */
+ if (abs(count - rhs) > (percpu_counter_batch*num_online_cpus())) {
+ if (count > rhs)
+ return 1;
+ else
+ return -1;
+ }
+ /* Need to use precise count */
+ count = percpu_counter_sum(fbc);
+ if (count > rhs)
+ return 1;
+ else if (count < rhs)
+ return -1;
+ else
+ return 0;
+}
+EXPORT_SYMBOL(percpu_counter_compare);
+
static int __init percpu_counter_startup(void)
{
compute_batch_value();
_
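For the throttling case being discussed, the wakeup test in the write
completion path could then presumably become something like this (sketch
only; assumes the bdi percpu counters are exposed as bdi->bdi_stat[] and
that a suitable batch-aware variant is used for them):

	/* Wake throttled writers once enough pages have completed */
	if (percpu_counter_compare(&bdi->bdi_stat[BDI_WRITTEN],
				   bdi->wb_written_head) >= 0)
		bdi_wakeup_writers(bdi);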
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 6:14 ` Andrew Morton
@ 2010-06-22 7:45 ` Peter Zijlstra
2010-06-22 8:24 ` Andrew Morton
2010-06-22 10:09 ` Dave Chinner
1 sibling, 1 reply; 43+ messages in thread
From: Peter Zijlstra @ 2010-06-22 7:45 UTC (permalink / raw)
To: Andrew Morton; +Cc: Dave Chinner, Jan Kara, linux-fsdevel, linux-mm, hch, wfg
On Mon, 2010-06-21 at 23:14 -0700, Andrew Morton wrote:
> +/*
> + * Compare counter against given value.
> + * Return 1 if greater, 0 if equal and -1 if less
> + */
> +int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
> +{
> + s64 count;
> +
> + count = percpu_counter_read(fbc);
> + /* Check to see if rough count will be sufficient for comparison */
> + if (abs(count - rhs) > (percpu_counter_batch*num_online_cpus())) {
> + if (count > rhs)
> + return 1;
> + else
> + return -1;
> + }
> + /* Need to use precise count */
> + count = percpu_counter_sum(fbc);
> + if (count > rhs)
> + return 1;
> + else if (count < rhs)
> + return -1;
> + else
> + return 0;
> +}
> +EXPORT_SYMBOL(percpu_counter_compare);
That won't quite work as advertised for the bdi stuff since we use a
custom batch size.
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 7:45 ` Peter Zijlstra
@ 2010-06-22 8:24 ` Andrew Morton
2010-06-22 8:52 ` Peter Zijlstra
0 siblings, 1 reply; 43+ messages in thread
From: Andrew Morton @ 2010-06-22 8:24 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Dave Chinner, Jan Kara, linux-fsdevel, linux-mm, hch, wfg
On Tue, 22 Jun 2010 09:45:22 +0200 Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, 2010-06-21 at 23:14 -0700, Andrew Morton wrote:
> > +/*
> > + * Compare counter against given value.
> > + * Return 1 if greater, 0 if equal and -1 if less
> > + */
> > +int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
> > +{
> > + s64 count;
> > +
> > + count = percpu_counter_read(fbc);
> > + /* Check to see if rough count will be sufficient for comparison */
> > + if (abs(count - rhs) > (percpu_counter_batch*num_online_cpus())) {
> > + if (count > rhs)
> > + return 1;
> > + else
> > + return -1;
> > + }
> > + /* Need to use precise count */
> > + count = percpu_counter_sum(fbc);
> > + if (count > rhs)
> > + return 1;
> > + else if (count < rhs)
> > + return -1;
> > + else
> > + return 0;
> > +}
> > +EXPORT_SYMBOL(percpu_counter_compare);
>
> That won't quite work as advertised for the bdi stuff since we use a
> custom batch size.
Oh come on, of course it will. It just needs
__percpu_counter_compare() as I mentioned when merging it.
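i.e. the same function with the batch passed in (a sketch, mirroring
percpu_counter_compare() above; not something that exists yet):

int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
{
	s64 count;

	count = percpu_counter_read(fbc);
	/* Check to see if rough count will be sufficient for comparison */
	if (abs(count - rhs) > (batch * num_online_cpus())) {
		if (count > rhs)
			return 1;
		else
			return -1;
	}
	/* Need to use precise count */
	count = percpu_counter_sum(fbc);
	if (count > rhs)
		return 1;
	else if (count < rhs)
		return -1;
	else
		return 0;
}

percpu_counter_compare() then just wraps it with percpu_counter_batch,
and the bdi code would call the __ variant with BDI_STAT_BATCH.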
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 8:24 ` Andrew Morton
@ 2010-06-22 8:52 ` Peter Zijlstra
0 siblings, 0 replies; 43+ messages in thread
From: Peter Zijlstra @ 2010-06-22 8:52 UTC (permalink / raw)
To: Andrew Morton; +Cc: Dave Chinner, Jan Kara, linux-fsdevel, linux-mm, hch, wfg
On Tue, 2010-06-22 at 01:24 -0700, Andrew Morton wrote:
>
> Oh come on, of course it will. It just needs
> __percpu_counter_compare() as I mentioned when merging it.
Sure, all I was saying is that something like that needs doing..
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 6:14 ` Andrew Morton
2010-06-22 7:45 ` Peter Zijlstra
@ 2010-06-22 10:09 ` Dave Chinner
2010-06-22 13:17 ` Jan Kara
1 sibling, 1 reply; 43+ messages in thread
From: Dave Chinner @ 2010-06-22 10:09 UTC (permalink / raw)
To: Andrew Morton; +Cc: Jan Kara, linux-fsdevel, linux-mm, hch, peterz, wfg
On Mon, Jun 21, 2010 at 11:14:16PM -0700, Andrew Morton wrote:
> On Tue, 22 Jun 2010 15:44:09 +1000 Dave Chinner <david@fromorbit.com> wrote:
>
> > > > And so on. This isn't necessarily bad - we'll throttle for longer
> > > > than we strictly need to - but the cumulative counter resolution
> > > > error gets worse as the number of CPUs doing IO completion grows.
> > > > Worst case ends up at (num cpus * 31) + 1 pages of writeback for
> > > > just the first waiter. For an arbitrary FIFO queue of depth d, the
> > > > worst case is more like d * (num cpus * 31 + 1).
> > > Hmm, I don't see how the error would depend on the FIFO depth.
> >
> > It's the cumulative error that depends on the FIFO depth, not the
> > error seen by a single waiter.
>
> Could use the below to basically eliminate the inaccuracies.
>
> Obviously things might get a bit expensive in certain threshold cases
> but with some hysteresis that should be manageable.
That seems a lot more... unpredictable than modifying the accounting
to avoid cumulative errors.
> + /* Check to see if rough count will be sufficient for comparison */
> + if (abs(count - rhs) > (percpu_counter_batch*num_online_cpus())) {
Also, that's a big margin when we are doing equality matches for
every page IO completion. If we have a large CPU count machine where
per-cpu counters actually improve performance (say 16p) then we're
going to be hitting the slow path for the last 512 pages of every
waiter. Hence I think the counter sum is compared too often to scale
with this method of comparison.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 5:44 ` Dave Chinner
2010-06-22 6:14 ` Andrew Morton
@ 2010-06-22 11:19 ` Jan Kara
1 sibling, 0 replies; 43+ messages in thread
From: Jan Kara @ 2010-06-22 11:19 UTC (permalink / raw)
To: Dave Chinner; +Cc: Jan Kara, linux-fsdevel, linux-mm, hch, akpm, peterz, wfg
On Tue 22-06-10 15:44:09, Dave Chinner wrote:
> On Tue, Jun 22, 2010 at 01:36:29AM +0200, Jan Kara wrote:
> > On Fri 18-06-10 16:09:01, Dave Chinner wrote:
> > > On Thu, Jun 17, 2010 at 08:04:38PM +0200, Jan Kara wrote:
> > > > This patch changes balance_dirty_pages() throttling so that the function does
> > > > not submit writes on its own but rather waits for flusher thread to do enough
> > > > writes. This has an advantage that we have a single source of IO allowing for
> > > > better writeback locality. Also we do not have to reenter filesystems from a
> > > > non-trivial context.
> > > >
> > > > The waiting is implemented as follows: Each BDI has a FIFO of wait requests.
> > > > When balance_dirty_pages() finds it needs to throttle the writer, it adds a
> > > > request for writing write_chunk of pages to the FIFO and waits until the
> > > > request is fulfilled or we drop below dirty limits. A flusher thread tracks
> > > > amount of pages it has written and when enough pages are written since the
> > > > first request in the FIFO became first, it wakes up the waiter and removes
> > > > request from the FIFO (thus another request becomes first and flusher thread
> > > > starts writing out on behalf of this request).
> > > >
> > > > CC: hch@infradead.org
> > > > CC: akpm@linux-foundation.org
> > > > CC: peterz@infradead.org
> > > > CC: wfg@mail.ustc.edu.cn
> > > > Signed-off-by: Jan Kara <jack@suse.cz>
...
> > > > + bdi->wb_written_head = bdi_stat(bdi, BDI_WRITTEN) + wc->written;
> > >
> > > The resolution of the percpu counters is an issue here, I think.
> > > percpu counters update in batches of 32 counts per CPU. wc->written
> > > is going to have a value of roughly 8 or 32 depending on whether
> > > bdi->dirty_exceeded is set or not. I note that you take this into
> > > account when checking dirty threshold limits, but it doesn't appear
> > > to be taken in to here.
> > Hmm, are you sure about the number of pages? I think that the ratelimits
> > you speak about influence only how often we *check* the limits. Looking at
> > sync_writeback_pages() I see:
> > static inline long sync_writeback_pages(unsigned long dirtied)
> > {
> > if (dirtied < ratelimit_pages)
> > dirtied = ratelimit_pages;
> >
> > return dirtied + dirtied / 2;
> > }
>
> It's declared as:
>
> 39 /*
> 40 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
> 41 * will look to see if it needs to force writeback or throttling.
> 42 */
> 43 static long ratelimit_pages = 32;
Yeah, I know, it's a bit confusing...
> > So it returns at least ratelimit_pages * 3/2. Now ratelimit_pages is
> > computed in writeback_set_ratelimit() and for most machines I expect the
> > math there to end up with ratelimit_pages == 1024. So we enter
> > balance_dirty_pages with number 1536...
>
> But I missed this. That means we typically will check every 1024
> pages until dirty_exceeded is set, then we'll check every 8 pages.
Yes, with dirty_exceeded we check every 8 pages but still require 1536
pages written.
> > That being said, the error of
> > percpu counters can still be significant - with 16 CPUs doing completion
> > the average error is 384 (on your max_cpus == 512 machine - our distro
> > kernels have that too).
>
> The error gets larger as the number of CPUs goes up - Peter pointed
> that out because:
>
> #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
>
> So for a system reporting nr_cpu_ids = 32 (as my 8 core nehalem
> server currently reports), the error could be 16 * 48 = 768.
>
> > > That means for ratelimits of 8 pages on writes and only one CPU doing
> > > IO completion wakeups are only going to occur in completion batches
> > > of 32 pages. we'll end up with something like:
> > >
> > > waiter, written count at head count when woken
> > > A, 8 n n + 32
> > > B, 8 n + 32 n + 64
> > > C, 8 n + 64 n + 96
> > > .....
> > >
> > > And so on. This isn't necessarily bad - we'll throttle for longer
> > > than we strictly need to - but the cumulative counter resolution
> > > error gets worse as the number of CPUs doing IO completion grows.
> > > Worst case ends up at (num cpus * 31) + 1 pages of writeback for
> > > just the first waiter. For an arbitrary FIFO queue of depth d, the
> > > worst case is more like d * (num cpus * 31 + 1).
> > Hmm, I don't see how the error would depend on the FIFO depth.
>
> It's the cumulative error that depends on the FIFO depth, not the
> error seen by a single waiter.
>
> > I'd rather
> > think that the number of written pages after which a thread is woken is a
> > random variable equally distributed in a range
> > <wanted_nr - max_counter_err, wanted_nr + max_counter_err>.
>
> Right, that the error range seenby a single waiter. i.e. between
>
> bdi->wb_written_head = bdi_stat(written) + wc->pages_to_wait_for;
>
> If we only want to wait for, say, 10 pages from a counter value of
> N, the per-cpu counter increments in batches of BDI_STAT_BATCH.
> Hence the first time we see bdi_stat(written) pass
> bdi->wb_written_head will be when it reaches (N + BDI_STAT_BATCH).
> We will have waited (BDI_STAT_BATCH - 10) pages more than we needed
> to. i.e the wait time is a minimum of roundup(pages_to_wait_for,
> BDI_STAT_BATCH).
>
> > And over time
> > (or with more threads in the queue) these errors cancel out... Now as I
> > wrote above the error for single waiter might be just too large to accept
> > but I just wanted to argue here that I don't see how it would be getting
> > worse with more waiters.
>
> That's where I think you go wrong - the error occurs when we remove
> the waiter at the head of the queue, so each subsequent waiter has a
> resolution error introduced. These will continue to aggregate as
> long as there are more waiters on the queue.
>
> e.g if we've got 3 waiters of 10 pages each, they should all be
> woken after 30 pages have been cleared. Instead, the first is only
> woken after the counter ticks (BDI_STAT_BATCH), then the second is
> queued, and it doesn't get woken for another counter tick. Similarly
> for the third waiter. When the third waiter is woken, it has waited
> for 3 * BDI_STAT_BATCH pages. In reality, what we really wanted was
> all three waiters to be woken by the time 30 pages had been written
> back. i.e. the last waiter waited for (3 * (BDI_STAT_BATCH - 10)),
> which in general terms gives a wait overshoot for any given waiter of
> roughly (queue depth when queued * BDI_STAT_BATCH * nr_cpu_ids) pages.
Ah, you are right... Thanks for the correction.
> However, if we calculate the wakeup time when we queue, each waiter
> ends up with a wakeup time of N + 10, N + 20, and N + 30, and the
> counter tick resolution is taken out of the picture. i.e. if the
> tick is 8, then the first wakeup occurs after 2 counter ticks, the
> second is woken after 3 counter ticks and the last is woken after 4
> counter ticks. If the counter tick is >= 32, then all three will be
> woken on the first counter tick. In this case, the worst case wait
> overshoot for any waiter regardless of queue depth is
> (BDI_STAT_BATCH * nr_cpu_ids) pages.
Yes.
> > if a system gets below dirty limits for a short while so
> > that only a few threads notice is an arguable reason), you would need to
> > recompute them or live with waiting unnecessarily long.
>
> I'd suggest that we shouldn't be waking waiters prematurely if we've
> dropped below the dirty threshold - we need some hysteresis in the
> system to prevent oscillating rapidly around the threshold. If we
> are not issuing IO, then if we don't damp the wakeup frequency we
> could easily thrash around the threshold.
The current logic is that we do not wake up threads when the system
gets below the limit. But threads wake up after a certain time passes, recheck
dirty limits, and quit waiting if limits are not exceeded. This is quite
similar to the old behavior and should provide a reasonably smooth start.
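The waiting loop is intended to look roughly like this (a sketch of the
logic only; written_request_done(), over_dirty_limits() and the timeout
value are placeholders, not the names actually used in the patch):

	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&bdi->wb_written_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		/* Did the flusher write enough on our behalf? */
		if (written_request_done(wc))
			break;
		/* Backup wakeup: sleep a while, then recheck the limits */
		io_schedule_timeout(HZ / 10);
		if (!over_dirty_limits(bdi))
			break;
	}
	finish_wait(&bdi->wb_written_wait, &wait);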
> I'd prefer to strip down the mechanism to the simplest, most
> accurate form before trying to add "interesting" tweaks like faster
> wakeup semantics at dirty thresholds....
I am a bit reluctant to keep threads waiting when the system is below the
dirty limits, for two reasons:
a) it's a bit more unfair because the thread has to wait longer (until
enough is written) while other threads happily perform writes because
the system isn't above dirty limits anymore.
b) we require 1536 pages written regardless of how many pages were
dirtied. Now in theory, if there are say 8 threads writing, the threads
would require 12288 pages written in total. But when a system has just
32768 pages, the dirty limit is set at 6553 pages, so there cannot even
be that many dirty pages in the system.
Of course, we could trim down the number of pages we wait for, since we
aren't doing actual IO in the new balance_dirty_pages() (which would reduce
the scope of the above problem). But we still shouldn't set it too low,
because all that checking, sleeping, and waking costs something, so longer
sleeps are beneficial from this POV. So all in all I would prefer to keep some
'backup' waking mechanism in place...
Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 10:09 ` Dave Chinner
@ 2010-06-22 13:17 ` Jan Kara
2010-06-22 13:52 ` Wu Fengguang
0 siblings, 1 reply; 43+ messages in thread
From: Jan Kara @ 2010-06-22 13:17 UTC (permalink / raw)
To: Dave Chinner
Cc: Andrew Morton, Jan Kara, linux-fsdevel, linux-mm, hch, peterz,
wfg
On Tue 22-06-10 20:09:24, Dave Chinner wrote:
> On Mon, Jun 21, 2010 at 11:14:16PM -0700, Andrew Morton wrote:
> > On Tue, 22 Jun 2010 15:44:09 +1000 Dave Chinner <david@fromorbit.com> wrote:
> >
> > > > > And so on. This isn't necessarily bad - we'll throttle for longer
> > > > > than we strictly need to - but the cumulative counter resolution
> > > > > error gets worse as the number of CPUs doing IO completion grows.
> > > > > Worst case ends up at (num cpus * 31) + 1 pages of writeback for
> > > > > just the first waiter. For an arbitrary FIFO queue of depth d, the
> > > > > worst case is more like d * (num cpus * 31 + 1).
> > > > Hmm, I don't see how the error would depend on the FIFO depth.
> > >
> > > It's the cumulative error that depends on the FIFO depth, not the
> > > error seen by a single waiter.
> >
> > Could use the below to basically eliminate the inaccuracies.
> >
> > Obviously things might get a bit expensive in certain threshold cases
> > but with some hysteresis that should be manageable.
>
> That seems a lot more... unpredictable than modifying the accounting
> to avoid cumulative errors.
>
> > + /* Check to see if rough count will be sufficient for comparison */
> > + if (abs(count - rhs) > (percpu_counter_batch*num_online_cpus())) {
>
> Also, that's a big margin when we are doing equality matches for
> every page IO completion. If we have a large CPU count machine where
> per-cpu counters actually improve performance (say 16p) then we're
> going to be hitting the slow path for the last 512 pages of every
> waiter. Hence I think the counter sum is compared too often to scale
> with this method of comparison.
On the other hand I think we will have to come up with something
more clever than what I do now because for some huge machines with
nr_cpu_ids == 256, the error of the counter is 256*9*8 = 18432 so that's
already unacceptable given the amounts we want to check (like 1536) -
already for nr_cpu_ids == 32, the error is the same as the difference we
want to check. I think we'll have to come up with some scheme whose error
is not dependent on the number of cpus or if it is dependent, it's only a
weak dependency (like a logarithm or so).
Or we could rely on the fact that IO completions for a bdi won't happen on
all CPUs and thus the error would be much more bounded. But I'm not sure
how much that is true or not.
Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 4:07 ` Wu Fengguang
@ 2010-06-22 13:27 ` Jan Kara
2010-06-22 13:33 ` Wu Fengguang
0 siblings, 1 reply; 43+ messages in thread
From: Jan Kara @ 2010-06-22 13:27 UTC (permalink / raw)
To: Wu Fengguang; +Cc: Jan Kara, Peter Zijlstra, linux-fsdevel, linux-mm, hch, akpm
On Tue 22-06-10 12:07:27, Wu Fengguang wrote:
> On Mon, Jun 21, 2010 at 03:42:39PM +0200, Jan Kara wrote:
> > On Fri 18-06-10 12:21:37, Peter Zijlstra wrote:
> > > On Thu, 2010-06-17 at 20:04 +0200, Jan Kara wrote:
> > > > + if (bdi_stat(bdi, BDI_WRITTEN) >= bdi->wb_written_head)
> > > > + bdi_wakeup_writers(bdi);
> > >
> > > For the paranoid amongst us you could make wb_written_head s64 and write
> > > the above as:
> > >
> > > if (bdi_stat(bdi, BDI_WRITTEN) - bdi->wb_written_head > 0)
> > >
> > > Which, if you assume both are monotonic and wb_written_head is always
> > > within 2^63 of the actual bdi_stat() value, should give the same end
> > > result and deal with wrap-around.
> > >
> > > For when we manage to create a device that can write 2^64 pages in our
> > > uptime :-)
> > OK, the fix is simple enough so I've changed it, although I'm not
> > paranoid enough ;) (I actually did the math before writing that test).
>
> a bit more change :)
>
> type:
>
> - u64 wb_written_head
> + s64 wb_written_head
>
> resetting:
>
> - bdi->wb_written_head = ~(u64)0;
> + bdi->wb_written_head = 0;
>
> setting:
>
> bdi->wb_written_head = bdi_stat(bdi, BDI_WRITTEN) + wc->written;
> + bdi->wb_written_head |= 1;
>
> testing:
>
> if (bdi->wb_written_head &&
> bdi_stat(bdi, BDI_WRITTEN) - bdi->wb_written_head > 0)
>
> This avoids calling into bdi_wakeup_writers() pointlessly when no one
> is being throttled (which is the normal case).
Actually, I've already changed wb_written_head to s64. I kept setting
wb_written_head to s64 maximum. That also avoids calling into
bdi_wakeup_writers() unnecessarily...
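i.e. something like (illustrative only):

	/* no waiters queued: park the threshold at the s64 maximum */
	bdi->wb_written_head = LLONG_MAX;

	/* queueing a waiter: the threshold drops below the sentinel */
	bdi->wb_written_head = bdi_stat(bdi, BDI_WRITTEN) + wc->written;

	/*
	 * completion path: never true while the sentinel is in place,
	 * and the signed subtraction copes with wrap-around
	 */
	if (bdi_stat(bdi, BDI_WRITTEN) - bdi->wb_written_head > 0)
		bdi_wakeup_writers(bdi);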
Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 13:27 ` Jan Kara
@ 2010-06-22 13:33 ` Wu Fengguang
0 siblings, 0 replies; 43+ messages in thread
From: Wu Fengguang @ 2010-06-22 13:33 UTC (permalink / raw)
To: Jan Kara
Cc: Peter Zijlstra, linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
hch@infradead.org, akpm@linux-foundation.org
On Tue, Jun 22, 2010 at 09:27:35PM +0800, Jan Kara wrote:
> On Tue 22-06-10 12:07:27, Wu Fengguang wrote:
> > On Mon, Jun 21, 2010 at 03:42:39PM +0200, Jan Kara wrote:
> > > On Fri 18-06-10 12:21:37, Peter Zijlstra wrote:
> > > > On Thu, 2010-06-17 at 20:04 +0200, Jan Kara wrote:
> > > > > + if (bdi_stat(bdi, BDI_WRITTEN) >= bdi->wb_written_head)
> > > > > + bdi_wakeup_writers(bdi);
> > > >
> > > > For the paranoid amongst us you could make wb_written_head s64 and write
> > > > the above as:
> > > >
> > > > if (bdi_stat(bdi, BDI_WRITTEN) - bdi->wb_written_head > 0)
> > > >
> > > > Which, if you assume both are monotonic and wb_written_head is always
> > > > within 2^63 of the actual bdi_stat() value, should give the same end
> > > > result and deal with wrap-around.
> > > >
> > > > For when we manage to create a device that can write 2^64 pages in our
> > > > uptime :-)
> > > OK, the fix is simple enough so I've changed it, although I'm not
> > > paranoid enough ;) (I actually did the math before writing that test).
> >
> > a bit more change :)
> >
> > type:
> >
> > - u64 wb_written_head
> > + s64 wb_written_head
> >
> > resetting:
> >
> > - bdi->wb_written_head = ~(u64)0;
> > + bdi->wb_written_head = 0;
> >
> > setting:
> >
> > bdi->wb_written_head = bdi_stat(bdi, BDI_WRITTEN) + wc->written;
> > + bdi->wb_written_head |= 1;
> >
> > testing:
> >
> > if (bdi->wb_written_head &&
> > bdi_stat(bdi, BDI_WRITTEN) - bdi->wb_written_head > 0)
> >
> > This avoids calling into bdi_wakeup_writers() pointlessly when no one
> > is being throttled (which is the normal case).
> Actually, I've already changed wb_written_head to s64. I kept setting
> wb_written_head to s64 maximum. That also avoids calling into
> bdi_wakeup_writers() unnecessarily...
Ah OK, I forgot bdi_stat() calls percpu_counter_read_positive() which
is always in range [0, s64 max].
Thanks,
Fengguang
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 13:17 ` Jan Kara
@ 2010-06-22 13:52 ` Wu Fengguang
2010-06-22 13:59 ` Peter Zijlstra
` (3 more replies)
0 siblings, 4 replies; 43+ messages in thread
From: Wu Fengguang @ 2010-06-22 13:52 UTC (permalink / raw)
To: Jan Kara
Cc: Dave Chinner, Andrew Morton, linux-fsdevel, linux-mm, hch, peterz
> On the other hand I think we will have to come up with something
> more clever than what I do now because for some huge machines with
> nr_cpu_ids == 256, the error of the counter is 256*9*8 = 18432 so that's
> already unacceptable given the amounts we want to check (like 1536) -
> already for nr_cpu_ids == 32, the error is the same as the difference we
> want to check. I think we'll have to come up with some scheme whose error
> is not dependent on the number of cpus or if it is dependent, it's only a
> weak dependency (like a logarithm or so).
> Or we could rely on the fact that IO completions for a bdi won't happen on
> all CPUs and thus the error would be much more bounded. But I'm not sure
> how much that is true or not.
Yes the per CPU counter seems tricky. How about plain atomic operations?
This test shows that atomic_dec_and_test() is about 4.5 times slower
than plain i-- in a 4-core CPU. Not bad.
Note that
1) we can avoid the atomic operations when there are no active waiters
2) most writeback will be submitted by one per-bdi-flusher, so no worry
of cache bouncing (this also means the per CPU counter error is
normally bounded by the batch size)
3) the cost of atomic inc/dec will be weakly related to core numbers
but never socket numbers (based on 2), so won't scale too bad
Thanks,
Fengguang
---
$ perf stat ./atomic
Performance counter stats for './atomic':
903.875304 task-clock-msecs # 0.998 CPUs
76 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
98 page-faults # 0.000 M/sec
3011186459 cycles # 3331.418 M/sec
1608926490 instructions # 0.534 IPC
301481656 branches # 333.543 M/sec
94932 branch-misses # 0.031 %
88687 cache-references # 0.098 M/sec
1286 cache-misses # 0.001 M/sec
0.905576197 seconds time elapsed
$ perf stat ./non-atomic
Performance counter stats for './non-atomic':
215.315814 task-clock-msecs # 0.996 CPUs
18 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
99 page-faults # 0.000 M/sec
704358635 cycles # 3271.281 M/sec
303445790 instructions # 0.431 IPC
100574889 branches # 467.104 M/sec
39323 branch-misses # 0.039 %
36064 cache-references # 0.167 M/sec
850 cache-misses # 0.004 M/sec
0.216175521 seconds time elapsed
--------------------------------------------------------------------------------
$ cat atomic.c
#include <stdio.h>
typedef struct {
int counter;
} atomic_t;
static inline int atomic_dec_and_test(atomic_t *v)
{
unsigned char c;
asm volatile("lock; decl %0; sete %1"
: "+m" (v->counter), "=qm" (c)
: : "memory");
return c != 0;
}
int main(void)
{
atomic_t i;
i.counter = 100000000;
for (; !atomic_dec_and_test(&i);)
;
return 0;
}
--------------------------------------------------------------------------------
$ cat non-atomic.c
#include <stdio.h>
int main(void)
{
int i;
for (i = 100000000; i; i--)
;
return 0;
}
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 13:52 ` Wu Fengguang
@ 2010-06-22 13:59 ` Peter Zijlstra
2010-06-22 14:00 ` Peter Zijlstra
` (2 subsequent siblings)
3 siblings, 0 replies; 43+ messages in thread
From: Peter Zijlstra @ 2010-06-22 13:59 UTC (permalink / raw)
To: Wu Fengguang
Cc: Jan Kara, Dave Chinner, Andrew Morton, linux-fsdevel, linux-mm,
hch
On Tue, 2010-06-22 at 21:52 +0800, Wu Fengguang wrote:
>
> This test shows that atomic_dec_and_test() is about 4.5 times slower
> than plain i-- in a 4-core CPU. Not bad.
It gets worse - much worse - on larger machines.
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 13:52 ` Wu Fengguang
2010-06-22 13:59 ` Peter Zijlstra
@ 2010-06-22 14:00 ` Peter Zijlstra
2010-06-22 14:36 ` Wu Fengguang
2010-06-22 14:02 ` Jan Kara
2010-06-22 14:31 ` Christoph Hellwig
3 siblings, 1 reply; 43+ messages in thread
From: Peter Zijlstra @ 2010-06-22 14:00 UTC (permalink / raw)
To: Wu Fengguang
Cc: Jan Kara, Dave Chinner, Andrew Morton, linux-fsdevel, linux-mm,
hch
On Tue, 2010-06-22 at 21:52 +0800, Wu Fengguang wrote:
> #include <stdio.h>
>
> typedef struct {
> int counter;
> } atomic_t;
>
> static inline int atomic_dec_and_test(atomic_t *v)
> {
> unsigned char c;
>
> asm volatile("lock; decl %0; sete %1"
> : "+m" (v->counter), "=qm" (c)
> : : "memory");
> return c != 0;
> }
>
> int main(void)
> {
> atomic_t i;
>
> i.counter = 100000000;
>
> for (; !atomic_dec_and_test(&i);)
> ;
>
> return 0;
> }
This test utterly fails to stress the concurrency, you want to create
nr_cpus threads and then pound the global variable. Then compare it
against the per-cpu-counter variant.
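Something like this, say (userspace sketch; the per-cpu-counter variant
would give each thread a private counter that is folded into the shared
one in batches):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define ITERS 100000000L

static long nthreads;
static int counter;			/* shared, deliberately contended */

static void *worker(void *arg)
{
	long i;

	for (i = 0; i < ITERS / nthreads; i++)
		__sync_fetch_and_add(&counter, 1);	/* lock xadd */
	return NULL;
}

int main(void)
{
	pthread_t *tid;
	long i;

	nthreads = sysconf(_SC_NPROCESSORS_ONLN);
	tid = calloc(nthreads, sizeof(*tid));

	for (i = 0; i < nthreads; i++)
		pthread_create(&tid[i], NULL, worker, NULL);
	for (i = 0; i < nthreads; i++)
		pthread_join(tid[i], NULL);

	printf("%d\n", counter);
	return 0;
}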
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 13:52 ` Wu Fengguang
2010-06-22 13:59 ` Peter Zijlstra
2010-06-22 14:00 ` Peter Zijlstra
@ 2010-06-22 14:02 ` Jan Kara
2010-06-22 14:24 ` Wu Fengguang
2010-06-22 22:29 ` Dave Chinner
2010-06-22 14:31 ` Christoph Hellwig
3 siblings, 2 replies; 43+ messages in thread
From: Jan Kara @ 2010-06-22 14:02 UTC (permalink / raw)
To: Wu Fengguang
Cc: Jan Kara, Dave Chinner, Andrew Morton, linux-fsdevel, linux-mm,
hch, peterz
On Tue 22-06-10 21:52:34, Wu Fengguang wrote:
> > On the other hand I think we will have to come up with something
> > more clever than what I do now because for some huge machines with
> > nr_cpu_ids == 256, the error of the counter is 256*9*8 = 18432 so that's
> > already unacceptable given the amounts we want to check (like 1536) -
> > already for nr_cpu_ids == 32, the error is the same as the difference we
> > want to check. I think we'll have to come up with some scheme whose error
> > is not dependent on the number of cpus or if it is dependent, it's only a
> > weak dependency (like a logarithm or so).
> > Or we could rely on the fact that IO completions for a bdi won't happen on
> > all CPUs and thus the error would be much more bounded. But I'm not sure
> > how much that is true or not.
>
> Yes the per CPU counter seems tricky. How about plain atomic operations?
>
> This test shows that atomic_dec_and_test() is about 4.5 times slower
> than plain i-- in a 4-core CPU. Not bad.
>
> Note that
> 1) we can avoid the atomic operations when there are no active waiters
> 2) most writeback will be submitted by one per-bdi-flusher, so no worry
> of cache bouncing (this also means the per CPU counter error is
> normally bounded by the batch size)
Yes, writeback will be submitted by one flusher thread but the question
is rather where the writeback will be completed. And that depends on which
CPU that particular irq is handled. As far as my weak knowledge of HW goes,
this very much depends on the system configuration (i.e., irq affinity and
other things).
> 3) the cost of atomic inc/dec will be weakly related to core numbers
> but never socket numbers (based on 2), so won't scale too bad
Honza
> ---
> $ perf stat ./atomic
>
> Performance counter stats for './atomic':
>
> 903.875304 task-clock-msecs # 0.998 CPUs
> 76 context-switches # 0.000 M/sec
> 0 CPU-migrations # 0.000 M/sec
> 98 page-faults # 0.000 M/sec
> 3011186459 cycles # 3331.418 M/sec
> 1608926490 instructions # 0.534 IPC
> 301481656 branches # 333.543 M/sec
> 94932 branch-misses # 0.031 %
> 88687 cache-references # 0.098 M/sec
> 1286 cache-misses # 0.001 M/sec
>
> 0.905576197 seconds time elapsed
>
> $ perf stat ./non-atomic
>
> Performance counter stats for './non-atomic':
>
> 215.315814 task-clock-msecs # 0.996 CPUs
> 18 context-switches # 0.000 M/sec
> 0 CPU-migrations # 0.000 M/sec
> 99 page-faults # 0.000 M/sec
> 704358635 cycles # 3271.281 M/sec
> 303445790 instructions # 0.431 IPC
> 100574889 branches # 467.104 M/sec
> 39323 branch-misses # 0.039 %
> 36064 cache-references # 0.167 M/sec
> 850 cache-misses # 0.004 M/sec
>
> 0.216175521 seconds time elapsed
>
>
> --------------------------------------------------------------------------------
> $ cat atomic.c
> #include <stdio.h>
>
> typedef struct {
> int counter;
> } atomic_t;
>
> static inline int atomic_dec_and_test(atomic_t *v)
> {
> unsigned char c;
>
> asm volatile("lock; decl %0; sete %1"
> : "+m" (v->counter), "=qm" (c)
> : : "memory");
> return c != 0;
> }
>
> int main(void)
> {
> atomic_t i;
>
> i.counter = 100000000;
>
> for (; !atomic_dec_and_test(&i);)
> ;
>
> return 0;
> }
>
> --------------------------------------------------------------------------------
> $ cat non-atomic.c
> #include <stdio.h>
>
> int main(void)
> {
> int i;
>
> for (i = 100000000; i; i--)
> ;
>
> return 0;
> }
>
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 14:02 ` Jan Kara
@ 2010-06-22 14:24 ` Wu Fengguang
2010-06-22 22:29 ` Dave Chinner
1 sibling, 0 replies; 43+ messages in thread
From: Wu Fengguang @ 2010-06-22 14:24 UTC (permalink / raw)
To: Jan Kara
Cc: Dave Chinner, Andrew Morton, linux-fsdevel@vger.kernel.org,
linux-mm@kvack.org, hch@infradead.org, peterz@infradead.org
On Tue, Jun 22, 2010 at 10:02:59PM +0800, Jan Kara wrote:
> On Tue 22-06-10 21:52:34, Wu Fengguang wrote:
> > > On the other hand I think we will have to come up with something
> > > more clever than what I do now because for some huge machines with
> > > nr_cpu_ids == 256, the error of the counter is 256*9*8 = 18432 so that's
> > > already unacceptable given the amounts we want to check (like 1536) -
> > > already for nr_cpu_ids == 32, the error is the same as the difference we
> > > want to check. I think we'll have to come up with some scheme whose error
> > > is not dependent on the number of cpus or if it is dependent, it's only a
> > > weak dependency (like a logarithm or so).
> > > Or we could rely on the fact that IO completions for a bdi won't happen on
> > > all CPUs and thus the error would be much more bounded. But I'm not sure
> > > how much that is true or not.
> >
> > Yes the per CPU counter seems tricky. How about plain atomic operations?
> >
> > This test shows that atomic_dec_and_test() is about 4.5 times slower
> > than plain i-- in a 4-core CPU. Not bad.
> >
> > Note that
> > 1) we can avoid the atomic operations when there are no active waiters
> > 2) most writeback will be submitted by one per-bdi-flusher, so no worry
> > of cache bouncing (this also means the per CPU counter error is
> > normally bounded by the batch size)
> Yes, writeback will be submitted by one flusher thread but the question
> is rather where the writeback will be completed. And that depends on which
> CPU that particular irq is handled. As far as my weak knowledge of HW goes,
> this very much depends on the system configuration (i.e., irq affinity and
> other things).
Either the irq goes to the IO submit CPU, or to some fixed CPU (somehow
determined by the bdi?), I guess? My wild guess is that it may be bad for
the irq to go to some random CPU...
Thanks,
Fengguang
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 13:52 ` Wu Fengguang
` (2 preceding siblings ...)
2010-06-22 14:02 ` Jan Kara
@ 2010-06-22 14:31 ` Christoph Hellwig
2010-06-22 14:38 ` Jan Kara
2010-06-22 14:41 ` Wu Fengguang
3 siblings, 2 replies; 43+ messages in thread
From: Christoph Hellwig @ 2010-06-22 14:31 UTC (permalink / raw)
To: Wu Fengguang
Cc: Jan Kara, Dave Chinner, Andrew Morton, linux-fsdevel, linux-mm,
hch, peterz
On Tue, Jun 22, 2010 at 09:52:34PM +0800, Wu Fengguang wrote:
> 2) most writeback will be submitted by one per-bdi-flusher, so no worry
> of cache bouncing (this also means the per CPU counter error is
> normally bounded by the batch size)
What counter are we talking about exactly? Once balance_dirty_pages
stops submitting I/O the per-bdi flusher thread will in fact be
the only thing submitting writeback, unless you count direct invocations
of writeback_single_inode.
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 14:00 ` Peter Zijlstra
@ 2010-06-22 14:36 ` Wu Fengguang
0 siblings, 0 replies; 43+ messages in thread
From: Wu Fengguang @ 2010-06-22 14:36 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Jan Kara, Dave Chinner, Andrew Morton,
linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
hch@infradead.org
On Tue, Jun 22, 2010 at 10:00:54PM +0800, Peter Zijlstra wrote:
> On Tue, 2010-06-22 at 21:52 +0800, Wu Fengguang wrote:
> > #include <stdio.h>
> >
> > typedef struct {
> > int counter;
> > } atomic_t;
> >
> > static inline int atomic_dec_and_test(atomic_t *v)
> > {
> > unsigned char c;
> >
> > asm volatile("lock; decl %0; sete %1"
> > : "+m" (v->counter), "=qm" (c)
> > : : "memory");
> > return c != 0;
> > }
> >
> > int main(void)
> > {
> > atomic_t i;
> >
> > i.counter = 100000000;
> >
> > for (; !atomic_dec_and_test(&i);)
> > ;
> >
> > return 0;
> > }
>
> This test utterly fails to stress the concurrency, you want to create
> nr_cpus threads and then pound the global variable. Then compare it
> against the per-cpu-counter variant.
I mean to test an atomic value that is mainly touched by one single CPU.
It does not sound reasonable for the IO completion IRQs to land randomly
on every CPU in the system... when the IOs are submitted mostly by a
dedicated thread and to one single BDI (but yes, the BDI may be some
"compound" device).
Thanks,
Fengguang
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 14:31 ` Christoph Hellwig
@ 2010-06-22 14:38 ` Jan Kara
2010-06-22 22:45 ` Dave Chinner
2010-06-22 14:41 ` Wu Fengguang
1 sibling, 1 reply; 43+ messages in thread
From: Jan Kara @ 2010-06-22 14:38 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Wu Fengguang, Jan Kara, Dave Chinner, Andrew Morton,
linux-fsdevel, linux-mm, peterz
On Tue 22-06-10 10:31:24, Christoph Hellwig wrote:
> On Tue, Jun 22, 2010 at 09:52:34PM +0800, Wu Fengguang wrote:
> > 2) most writeback will be submitted by one per-bdi-flusher, so no worry
> > of cache bouncing (this also means the per CPU counter error is
> > normally bounded by the batch size)
>
> What counter are we talking about exactly? Once balance_dirty_pages
The new per-bdi counter I'd like to introduce.
> stops submitting I/O the per-bdi flusher thread will in fact be
> the only thing submitting writeback, unless you count direct invocations
> of writeback_single_inode.
Yes, I agree that the per-bdi flusher thread should be the only thread
submitting lots of IO (there is direct reclaim or kswapd if we change
direct reclaim but those should be negligible). So does this mean that
also I/O completions will be local to the CPU running per-bdi flusher
thread? Because the counter is incremented from the I/O completion
callback.
Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 14:31 ` Christoph Hellwig
2010-06-22 14:38 ` Jan Kara
@ 2010-06-22 14:41 ` Wu Fengguang
1 sibling, 0 replies; 43+ messages in thread
From: Wu Fengguang @ 2010-06-22 14:41 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Jan Kara, Dave Chinner, Andrew Morton,
linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
peterz@infradead.org
On Tue, Jun 22, 2010 at 10:31:24PM +0800, Christoph Hellwig wrote:
> On Tue, Jun 22, 2010 at 09:52:34PM +0800, Wu Fengguang wrote:
> > 2) most writeback will be submitted by one per-bdi-flusher, so no worry
> > of cache bouncing (this also means the per CPU counter error is
> > normally bounded by the batch size)
>
> What counter are we talking about exactly? Once balance_dirty_pages
bdi_stat(bdi, BDI_WRITTEN) introduced in this patch.
> stops submitting I/O the per-bdi flusher thread will in fact be
> the only thing submitting writeback, unless you count direct invocations
> of writeback_single_inode.
Right.
Thanks,
Fengguang
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 14:02 ` Jan Kara
2010-06-22 14:24 ` Wu Fengguang
@ 2010-06-22 22:29 ` Dave Chinner
2010-06-23 13:15 ` Jan Kara
1 sibling, 1 reply; 43+ messages in thread
From: Dave Chinner @ 2010-06-22 22:29 UTC (permalink / raw)
To: Jan Kara
Cc: Wu Fengguang, Andrew Morton, linux-fsdevel, linux-mm, hch, peterz
On Tue, Jun 22, 2010 at 04:02:59PM +0200, Jan Kara wrote:
> On Tue 22-06-10 21:52:34, Wu Fengguang wrote:
> > > On the other hand I think we will have to come up with something
> > > more clever than what I do now because for some huge machines with
> > > nr_cpu_ids == 256, the error of the counter is 256*9*8 = 18432 so that's
> > > already unacceptable given the amounts we want to check (like 1536) -
> > > already for nr_cpu_ids == 32, the error is the same as the difference we
> > > want to check. I think we'll have to come up with some scheme whose error
> > > is not dependent on the number of cpus or if it is dependent, it's only a
> > > weak dependency (like a logarithm or so).
> > > Or we could rely on the fact that IO completions for a bdi won't happen on
> > > all CPUs and thus the error would be much more bounded. But I'm not sure
> > > how much that is true or not.
> >
> > Yes the per CPU counter seems tricky. How about plain atomic operations?
> >
> > This test shows that atomic_dec_and_test() is about 4.5 times slower
> > than plain i-- in a 4-core CPU. Not bad.
It's not how fast an uncontended operation runs that matters - it's
what happens when it is contended by lots of CPUs. In my experience,
atomics in writeback paths scale to medium sized machines (say
16-32p) but bottleneck on larger configurations due to the increased
cost of cacheline propagation on larger machines.
> > Note that
> > 1) we can avoid the atomic operations when there are no active waiters
Under heavy IO load there will always be waiters.
> > 2) most writeback will be submitted by one per-bdi-flusher, so no worry
> > of cache bouncing (this also means the per CPU counter error is
> > normally bounded by the batch size)
> Yes, writeback will be submitted by one flusher thread but the question
> is rather where the writeback will be completed. And that depends on which
> CPU that particular irq is handled. As far as my weak knowledge of HW goes,
> this very much depends on the system configuration (i.e., irq affinity and
> other things).
And how many paths to the storage you are using, how threaded the
underlying driver is, whether it is using MSI to direct interrupts to
multiple CPUs instead of just one, etc.
As we scale up we're more likely to see multiple CPUs doing IO
completion for the same BDI because the storage configs are more
complex in high end machines. Hence IMO preventing cacheline
bouncing between submission and completion is a significant
scalability concern.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 14:38 ` Jan Kara
@ 2010-06-22 22:45 ` Dave Chinner
2010-06-23 1:34 ` Wu Fengguang
0 siblings, 1 reply; 43+ messages in thread
From: Dave Chinner @ 2010-06-22 22:45 UTC (permalink / raw)
To: Jan Kara
Cc: Christoph Hellwig, Wu Fengguang, Andrew Morton, linux-fsdevel,
linux-mm, peterz
On Tue, Jun 22, 2010 at 04:38:56PM +0200, Jan Kara wrote:
> On Tue 22-06-10 10:31:24, Christoph Hellwig wrote:
> > On Tue, Jun 22, 2010 at 09:52:34PM +0800, Wu Fengguang wrote:
> > > 2) most writeback will be submitted by one per-bdi-flusher, so no worry
> > > of cache bouncing (this also means the per CPU counter error is
> > > normally bounded by the batch size)
> >
> > What counter are we talking about exactly? Once balance_dirty_pages
> The new per-bdi counter I'd like to introduce.
>
> > stops submitting I/O the per-bdi flusher thread will in fact be
> > the only thing submitting writeback, unless you count direct invocations
> > of writeback_single_inode.
> Yes, I agree that the per-bdi flusher thread should be the only thread
> submitting lots of IO (there is direct reclaim or kswapd if we change
> direct reclaim but those should be negligible). So does this mean that
> also I/O completions will be local to the CPU running per-bdi flusher
> thread? Because the counter is incremented from the I/O completion
> callback.
By default we set QUEUE_FLAG_SAME_COMP, which means we hand
completions back to the submitter CPU during blk_complete_request().
Completion processing is then handled by a softirq on the CPU
selected for completion processing.
This was done, IIRC, because it provided some OLTP benchmark 1-2%
better results. It can, however, be turned off via
/sys/block/<foo>/queue/rq_affinity, and there's no guarantee that
the completion processing doesn't get handed off to some other CPU
(e.g. via a workqueue) so we cannot rely on this completion
behaviour to avoid cacheline bouncing.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 22:45 ` Dave Chinner
@ 2010-06-23 1:34 ` Wu Fengguang
2010-06-23 3:06 ` Dave Chinner
0 siblings, 1 reply; 43+ messages in thread
From: Wu Fengguang @ 2010-06-23 1:34 UTC (permalink / raw)
To: Dave Chinner
Cc: Jan Kara, Christoph Hellwig, Andrew Morton,
linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
peterz@infradead.org
On Wed, Jun 23, 2010 at 06:45:51AM +0800, Dave Chinner wrote:
> On Tue, Jun 22, 2010 at 04:38:56PM +0200, Jan Kara wrote:
> > On Tue 22-06-10 10:31:24, Christoph Hellwig wrote:
> > > On Tue, Jun 22, 2010 at 09:52:34PM +0800, Wu Fengguang wrote:
> > > > 2) most writeback will be submitted by one per-bdi-flusher, so no worry
> > > > of cache bouncing (this also means the per CPU counter error is
> > > > normally bounded by the batch size)
> > >
> > > What counter are we talking about exactly? Once balance_dirty_pages
> > The new per-bdi counter I'd like to introduce.
> >
> > > stops submitting I/O the per-bdi flusher thread will in fact be
> > > the only thing submitting writeback, unless you count direct invocations
> > > of writeback_single_inode.
> > Yes, I agree that the per-bdi flusher thread should be the only thread
> > submitting lots of IO (there is direct reclaim or kswapd if we change
> > direct reclaim but those should be negligible). So does this mean that
> > also I/O completions will be local to the CPU running per-bdi flusher
> > thread? Because the counter is incremented from the I/O completion
> > callback.
>
> By default we set QUEUE_FLAG_SAME_COMP, which means we hand
> completions back to the submitter CPU during blk_complete_request().
> Completion processing is then handled by a softirq on the CPU
> selected for completion processing.
Good to know about that, thanks!
> This was done, IIRC, because it provided some OLTP benchmark 1-2%
> better results. It can, however, be turned off via
> /sys/block/<foo>/queue/rq_affinity, and there's no guarantee that
> the completion processing doesn't get handled off to some other CPU
> (e.g. via a workqueue) so we cannot rely on this completion
> behaviour to avoid cacheline bouncing.
If rq_affinity does not work reliably somewhere in the IO completion
path, why not try to fix it? Otherwise all the page/mapping/zone
cachelines covered by test_set_page_writeback()/test_clear_page_writeback()
(and other functions as well) will also be bounced.
Another option is to put atomic accounting into test_set_page_writeback()
ie. the IO submission path. This actually matches the current
balance_dirty_pages() behavior. It may then block on get_request().
The down side is, get_request() blocks until queue depth goes down
from nr_congestion_on to nr_congestion_off, which is not as smooth as
the IO completion path. As a result balance_dirty_pages() may get
delayed much more than necessary when there is only 1 waiter, and
wake up multiple waiters in bursts.
Thanks,
Fengguang
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-23 1:34 ` Wu Fengguang
@ 2010-06-23 3:06 ` Dave Chinner
2010-06-23 3:22 ` Wu Fengguang
0 siblings, 1 reply; 43+ messages in thread
From: Dave Chinner @ 2010-06-23 3:06 UTC (permalink / raw)
To: Wu Fengguang
Cc: Jan Kara, Christoph Hellwig, Andrew Morton,
linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
peterz@infradead.org
On Wed, Jun 23, 2010 at 09:34:26AM +0800, Wu Fengguang wrote:
> On Wed, Jun 23, 2010 at 06:45:51AM +0800, Dave Chinner wrote:
> > On Tue, Jun 22, 2010 at 04:38:56PM +0200, Jan Kara wrote:
> > > On Tue 22-06-10 10:31:24, Christoph Hellwig wrote:
> > > > On Tue, Jun 22, 2010 at 09:52:34PM +0800, Wu Fengguang wrote:
> > > > > 2) most writeback will be submitted by one per-bdi-flusher, so no worry
> > > > > of cache bouncing (this also means the per CPU counter error is
> > > > > normally bounded by the batch size)
> > > >
> > > > What counter are we talking about exactly? Once balance_dirty_pages
> > > The new per-bdi counter I'd like to introduce.
> > >
> > > > stops submitting I/O the per-bdi flusher thread will in fact be
> > > > the only thing submitting writeback, unless you count direct invocations
> > > > of writeback_single_inode.
> > > Yes, I agree that the per-bdi flusher thread should be the only thread
> > > submitting lots of IO (there is direct reclaim or kswapd if we change
> > > direct reclaim but those should be negligible). So does this mean that
> > > also I/O completions will be local to the CPU running per-bdi flusher
> > > thread? Because the counter is incremented from the I/O completion
> > > callback.
> >
> > By default we set QUEUE_FLAG_SAME_COMP, which means we hand
> > completions back to the submitter CPU during blk_complete_request().
> > Completion processing is then handled by a softirq on the CPU
> > selected for completion processing.
>
> Good to know about that, thanks!
>
> > This was done, IIRC, because it provided some OLTP benchmark 1-2%
> > better results. It can, however, be turned off via
> > /sys/block/<foo>/queue/rq_affinity, and there's no guarantee that
> > the completion processing doesn't get handled off to some other CPU
> > (e.g. via a workqueue) so we cannot rely on this completion
> > behaviour to avoid cacheline bouncing.
>
> If rq_affinity does not work reliably somewhere in the IO completion
> path, why not try to fix it?
Because completion on the submitter CPU is not ideal for high
bandwidth buffered IO.
> Otherwise all the page/mapping/zone
> cachelines covered by test_set_page_writeback()/test_clear_page_writeback()
> (and other functions) will also be bounced.
Yes, but when the flusher thread is approaching being CPU bound for
high throughput IO, bouncing cachelines to another CPU during
completion costs far less in terms of throughput compared to
reducing the amount of time available to issue IO on that CPU.
> Another option is to put atomic accounting into test_set_page_writeback(),
> i.e. the IO submission path. This actually matches the current
> balance_dirty_pages() behavior. It may then block on get_request().
> The downside is that get_request() blocks until the queue depth goes down
> from nr_congestion_on to nr_congestion_off, which is not as smooth as
> the IO completion path. As a result balance_dirty_pages() may get
> delayed much more than necessary when there is only 1 waiter, and
> may wake up multiple waiters in bursts.
Being reliant on the block layer queuing behaviour for VM congestion
control is exactly the problem we are trying to avoid...
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-23 3:06 ` Dave Chinner
@ 2010-06-23 3:22 ` Wu Fengguang
2010-06-23 6:03 ` Dave Chinner
0 siblings, 1 reply; 43+ messages in thread
From: Wu Fengguang @ 2010-06-23 3:22 UTC (permalink / raw)
To: Dave Chinner
Cc: Jan Kara, Christoph Hellwig, Andrew Morton,
linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
peterz@infradead.org
On Wed, Jun 23, 2010 at 11:06:04AM +0800, Dave Chinner wrote:
> On Wed, Jun 23, 2010 at 09:34:26AM +0800, Wu Fengguang wrote:
> > On Wed, Jun 23, 2010 at 06:45:51AM +0800, Dave Chinner wrote:
> > > On Tue, Jun 22, 2010 at 04:38:56PM +0200, Jan Kara wrote:
> > > > On Tue 22-06-10 10:31:24, Christoph Hellwig wrote:
> > > > > On Tue, Jun 22, 2010 at 09:52:34PM +0800, Wu Fengguang wrote:
> > > > > > 2) most writeback will be submitted by one per-bdi-flusher, so no worry
> > > > > > of cache bouncing (this also means the per CPU counter error is
> > > > > > normally bounded by the batch size)
> > > > >
> > > > > What counter are we talking about exactly? Once balanance_dirty_pages
> > > > The new per-bdi counter I'd like to introduce.
> > > >
> > > > > stops submitting I/O the per-bdi flusher thread will in fact be
> > > > > the only thing submitting writeback, unless you count direct invocations
> > > > > of writeback_single_inode.
> > > > Yes, I agree that the per-bdi flusher thread should be the only thread
> > > > submitting lots of IO (there is direct reclaim or kswapd if we change
> > > > direct reclaim but those should be negligible). So does this mean that
> > > > also I/O completions will be local to the CPU running per-bdi flusher
> > > > thread? Because the counter is incremented from the I/O completion
> > > > callback.
> > >
> > > By default we set QUEUE_FLAG_SAME_COMP, which means we hand
> > > completions back to the submitter CPU during blk_complete_request().
> > > Completion processing is then handled by a softirq on the CPU
> > > selected for completion processing.
> >
> > Good to know about that, thanks!
> >
> > > This was done, IIRC, because it provided some OLTP benchmark 1-2%
> > > better results. It can, however, be turned off via
> > > /sys/block/<foo>/queue/rq_affinity, and there's no guarantee that
> > > the completion processing doesn't get handled off to some other CPU
> > > (e.g. via a workqueue) so we cannot rely on this completion
> > > behaviour to avoid cacheline bouncing.
> >
> > If rq_affinity does not work reliably somewhere in the IO completion
> > path, why not trying to fix it?
>
> Because completion on the submitter CPU is not ideal for high
> bandwidth buffered IO.
Yes there may be heavy post-processing for read data, however for writes
it is mainly the pre-processing that costs CPU? So perfect rq_affinity
should always benefit write IO?
> > Otherwise all the page/mapping/zone
> > cachelines covered by test_set_page_writeback()/test_clear_page_writeback()
> > (and more other functions) will also be bounced.
>
> Yes, but when the flusher thread is approaching being CPU bound for
> high throughput IO, bouncing cachelines to another CPU during
> completion costs far less in terms of throughput compared to
> reducing the amount of time available to issue IO on that CPU.
Yes, reasonable for reads.
> > Another option is to put atomic accounting into test_set_page_writeback()
> > ie. the IO submission path. This actually matches the current
> > balanance_dirty_pages() behavior. It may then block on get_request().
> > The down side is, get_request() blocks until queue depth goes down
> > from nr_congestion_on to nr_congestion_off, which is not as smooth as
> > the IO completion path. As a result balanance_dirty_pages() may get
> > delayed much more than necessary when there is only 1 waiter, and
> > wake up multiple waiters in bursts.
>
> Being reliant on the block layer queuing behaviour for VM congestion
> control is exactly the problem we are trying to avoid...
Yes, this is not a good option. The paragraph reads more like a statement of
a potential benefit of the proposed patch :)
Thanks,
Fengguang
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-23 3:22 ` Wu Fengguang
@ 2010-06-23 6:03 ` Dave Chinner
2010-06-23 6:25 ` Wu Fengguang
0 siblings, 1 reply; 43+ messages in thread
From: Dave Chinner @ 2010-06-23 6:03 UTC (permalink / raw)
To: Wu Fengguang
Cc: Jan Kara, Christoph Hellwig, Andrew Morton,
linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
peterz@infradead.org
On Wed, Jun 23, 2010 at 11:22:13AM +0800, Wu Fengguang wrote:
> On Wed, Jun 23, 2010 at 11:06:04AM +0800, Dave Chinner wrote:
> > On Wed, Jun 23, 2010 at 09:34:26AM +0800, Wu Fengguang wrote:
> > > On Wed, Jun 23, 2010 at 06:45:51AM +0800, Dave Chinner wrote:
> > > > On Tue, Jun 22, 2010 at 04:38:56PM +0200, Jan Kara wrote:
> > > > > On Tue 22-06-10 10:31:24, Christoph Hellwig wrote:
> > > > > > On Tue, Jun 22, 2010 at 09:52:34PM +0800, Wu Fengguang wrote:
> > > > > > > 2) most writeback will be submitted by one per-bdi-flusher, so no worry
> > > > > > > of cache bouncing (this also means the per CPU counter error is
> > > > > > > normally bounded by the batch size)
> > > > > >
> > > > > > What counter are we talking about exactly? Once balanance_dirty_pages
> > > > > The new per-bdi counter I'd like to introduce.
> > > > >
> > > > > > stops submitting I/O the per-bdi flusher thread will in fact be
> > > > > > the only thing submitting writeback, unless you count direct invocations
> > > > > > of writeback_single_inode.
> > > > > Yes, I agree that the per-bdi flusher thread should be the only thread
> > > > > submitting lots of IO (there is direct reclaim or kswapd if we change
> > > > > direct reclaim but those should be negligible). So does this mean that
> > > > > also I/O completions will be local to the CPU running per-bdi flusher
> > > > > thread? Because the counter is incremented from the I/O completion
> > > > > callback.
> > > >
> > > > By default we set QUEUE_FLAG_SAME_COMP, which means we hand
> > > > completions back to the submitter CPU during blk_complete_request().
> > > > Completion processing is then handled by a softirq on the CPU
> > > > selected for completion processing.
> > >
> > > Good to know about that, thanks!
> > >
> > > > This was done, IIRC, because it provided some OLTP benchmark 1-2%
> > > > better results. It can, however, be turned off via
> > > > /sys/block/<foo>/queue/rq_affinity, and there's no guarantee that
> > > > the completion processing doesn't get handled off to some other CPU
> > > > (e.g. via a workqueue) so we cannot rely on this completion
> > > > behaviour to avoid cacheline bouncing.
> > >
> > > If rq_affinity does not work reliably somewhere in the IO completion
> > > path, why not trying to fix it?
> >
> > Because completion on the submitter CPU is not ideal for high
> > bandwidth buffered IO.
>
> Yes there may be heavy post-processing for read data, however for writes
> it is mainly the pre-processing that costs CPU?
Could be either - delayed allocation requires significant pre-processing
for allocation. Avoiding this by using preallocation just
moves the processing load to IO completion which needs to issue
transactions to mark the region written.
> So perfect rq_affinity
> should always benefit write IO?
No, because the flusher thread gets to be CPU bound just writing
pages, allocating blocks and submitting IO. It might take 5-10GB/s
to get there (say a million dirty pages a second being processed by
a single CPU), but that's the sort of storage subsystem XFS is
capable of driving. IO completion time for such a workload is
significant, too, so putting that on the same CPU as the flusher
thread will slow things down by far more than we gain from avoiding
cacheline bouncing.
> > > Otherwise all the page/mapping/zone
> > > cachelines covered by test_set_page_writeback()/test_clear_page_writeback()
> > > (and more other functions) will also be bounced.
> >
> > Yes, but when the flusher thread is approaching being CPU bound for
> > high throughput IO, bouncing cachelines to another CPU during
> > completion costs far less in terms of throughput compared to
> > reducing the amount of time available to issue IO on that CPU.
>
> Yes, reasonable for reads.
I was talking about writes - the flusher threads don't do any reading ;)
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-23 6:03 ` Dave Chinner
@ 2010-06-23 6:25 ` Wu Fengguang
2010-06-23 23:42 ` Dave Chinner
0 siblings, 1 reply; 43+ messages in thread
From: Wu Fengguang @ 2010-06-23 6:25 UTC (permalink / raw)
To: Dave Chinner
Cc: Jan Kara, Christoph Hellwig, Andrew Morton,
linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
peterz@infradead.org
On Wed, Jun 23, 2010 at 02:03:19PM +0800, Dave Chinner wrote:
> On Wed, Jun 23, 2010 at 11:22:13AM +0800, Wu Fengguang wrote:
> > On Wed, Jun 23, 2010 at 11:06:04AM +0800, Dave Chinner wrote:
> > > On Wed, Jun 23, 2010 at 09:34:26AM +0800, Wu Fengguang wrote:
> > > > On Wed, Jun 23, 2010 at 06:45:51AM +0800, Dave Chinner wrote:
> > > > > On Tue, Jun 22, 2010 at 04:38:56PM +0200, Jan Kara wrote:
> > > > > > On Tue 22-06-10 10:31:24, Christoph Hellwig wrote:
> > > > > > > On Tue, Jun 22, 2010 at 09:52:34PM +0800, Wu Fengguang wrote:
> > > > > > > > 2) most writeback will be submitted by one per-bdi-flusher, so no worry
> > > > > > > > of cache bouncing (this also means the per CPU counter error is
> > > > > > > > normally bounded by the batch size)
> > > > > > >
> > > > > > > What counter are we talking about exactly? Once balanance_dirty_pages
> > > > > > The new per-bdi counter I'd like to introduce.
> > > > > >
> > > > > > > stops submitting I/O the per-bdi flusher thread will in fact be
> > > > > > > the only thing submitting writeback, unless you count direct invocations
> > > > > > > of writeback_single_inode.
> > > > > > Yes, I agree that the per-bdi flusher thread should be the only thread
> > > > > > submitting lots of IO (there is direct reclaim or kswapd if we change
> > > > > > direct reclaim but those should be negligible). So does this mean that
> > > > > > also I/O completions will be local to the CPU running per-bdi flusher
> > > > > > thread? Because the counter is incremented from the I/O completion
> > > > > > callback.
> > > > >
> > > > > By default we set QUEUE_FLAG_SAME_COMP, which means we hand
> > > > > completions back to the submitter CPU during blk_complete_request().
> > > > > Completion processing is then handled by a softirq on the CPU
> > > > > selected for completion processing.
> > > >
> > > > Good to know about that, thanks!
> > > >
> > > > > This was done, IIRC, because it provided some OLTP benchmark 1-2%
> > > > > better results. It can, however, be turned off via
> > > > > /sys/block/<foo>/queue/rq_affinity, and there's no guarantee that
> > > > > the completion processing doesn't get handled off to some other CPU
> > > > > (e.g. via a workqueue) so we cannot rely on this completion
> > > > > behaviour to avoid cacheline bouncing.
> > > >
> > > > If rq_affinity does not work reliably somewhere in the IO completion
> > > > path, why not trying to fix it?
> > >
> > > Because completion on the submitter CPU is not ideal for high
> > > bandwidth buffered IO.
> >
> > Yes there may be heavy post-processing for read data, however for writes
> > it is mainly the pre-processing that costs CPU?
>
> Could be either - delayed allocation requires significant pre-processing
> for allocation. Avoiding this by using preallocation just
> moves the processing load to IO completion which needs to issue
> transactions to mark the region written.
Good point, thanks.
> > So perfect rq_affinity
> > should always benefit write IO?
>
> No, because the flusher thread gets to be CPU bound just writing
> pages, allocating blocks and submitting IO. It might take 5-10GB/s
> to get there (say a million dirty pages a second being processed by
> a single CPU), but that's the sort of storage subsystem XFS is
> capable of driving. IO completion time for such a workload is
> significant, too, so putting that on the same CPU as the flusher
> thread will slow things down by far more than we gain from avoiding
> cacheline bouncing.
So super fast storage is going to demand multiple flushers per bdi.
And once we run multiple flushers for one bdi, it will again be
beneficial to schedule IO completion to the flusher CPU :)
However, in this case the per-bdi atomic value (and some others) will be
bounced among the flusher CPUs...
> > > > Otherwise all the page/mapping/zone
> > > > cachelines covered by test_set_page_writeback()/test_clear_page_writeback()
> > > > (and more other functions) will also be bounced.
> > >
> > > Yes, but when the flusher thread is approaching being CPU bound for
> > > high throughput IO, bouncing cachelines to another CPU during
> > > completion costs far less in terms of throughput compared to
> > > reducing the amount of time available to issue IO on that CPU.
> >
> > Yes, reasonable for reads.
>
> I was talking about writes - the flusher threads don't do any reading ;)
Ah, sorry for not catching the words "flusher thread" :)
Thanks,
Fengguang
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-22 22:29 ` Dave Chinner
@ 2010-06-23 13:15 ` Jan Kara
2010-06-23 23:06 ` Dave Chinner
0 siblings, 1 reply; 43+ messages in thread
From: Jan Kara @ 2010-06-23 13:15 UTC (permalink / raw)
To: Dave Chinner
Cc: Jan Kara, Wu Fengguang, Andrew Morton, linux-fsdevel, linux-mm,
hch, peterz
On Wed 23-06-10 08:29:32, Dave Chinner wrote:
> On Tue, Jun 22, 2010 at 04:02:59PM +0200, Jan Kara wrote:
> > > 2) most writeback will be submitted by one per-bdi-flusher, so no worry
> > > of cache bouncing (this also means the per CPU counter error is
> > > normally bounded by the batch size)
> > Yes, writeback will be submitted by one flusher thread but the question
> > is rather where the writeback will be completed. And that depends on which
> > CPU that particular irq is handled. As far as my weak knowledge of HW goes,
> > this very much depends on the system configuration (i.e., irq affinity and
> > other things).
>
> And how many paths to the storage you are using, how threaded the
> underlying driver is, whether it is using MSI to direct interrupts to
> multiple CPUs instead of just one, etc.
>
> As we scale up we're more likely to see multiple CPUs doing IO
> completion for the same BDI because the storage configs are more
> complex in high end machines. Hence IMO preventing cacheline
> bouncing between submission and completion is a significant
> scalability concern.
Thanks for the details. I'm wondering whether we could assume that although
IO completion can run on several CPUs, it will still be a fairly limited
number of CPUs. If this is the case, we could then implement a per-cpu
counter that would additionally track the number of CPUs modifying the counter
(the number of CPUs would get zeroed in ???_counter_sum). This way the
number of atomic operations won't be much higher (only one atomic inc when
a CPU updates the counter for the first time), and if only a few CPUs
modify the counter, we would be able to bound the error much better.
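To make the shape of such a counter concrete, here is a rough sketch (all
names hypothetical, not an existing kernel API) that remembers the modifying
CPUs in a cpumask, so a reader can bound the error by the number of set bits
times the batch size:

#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>

struct bounded_percpu_counter {
	spinlock_t	lock;
	s64		count;		/* globally folded part */
	s32 __percpu	*counters;	/* per-cpu deltas */
	cpumask_t	changed;	/* CPUs with a delta since the last sum */
};

static void bpc_add(struct bounded_percpu_counter *fbc, s64 amount, s32 batch)
{
	s32 *pcount;
	s64 count;
	int cpu = get_cpu();

	pcount = per_cpu_ptr(fbc->counters, cpu);
	count = *pcount + amount;
	/* One atomic bit op per CPU per sum period, as described above. */
	if (!cpumask_test_cpu(cpu, &fbc->changed))
		cpumask_set_cpu(cpu, &fbc->changed);
	if (count >= batch || count <= -batch) {
		spin_lock(&fbc->lock);
		fbc->count += count;
		*pcount = 0;
		spin_unlock(&fbc->lock);
	} else {
		*pcount = count;
	}
	put_cpu();
}

The maximum read error is then roughly cpumask_weight(&fbc->changed) * batch,
which stays small as long as only a few CPUs complete IO against the bdi.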
Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-23 13:15 ` Jan Kara
@ 2010-06-23 23:06 ` Dave Chinner
0 siblings, 0 replies; 43+ messages in thread
From: Dave Chinner @ 2010-06-23 23:06 UTC (permalink / raw)
To: Jan Kara
Cc: Wu Fengguang, Andrew Morton, linux-fsdevel, linux-mm, hch, peterz
On Wed, Jun 23, 2010 at 03:15:57PM +0200, Jan Kara wrote:
> On Wed 23-06-10 08:29:32, Dave Chinner wrote:
> > On Tue, Jun 22, 2010 at 04:02:59PM +0200, Jan Kara wrote:
> > > > 2) most writeback will be submitted by one per-bdi-flusher, so no worry
> > > > of cache bouncing (this also means the per CPU counter error is
> > > > normally bounded by the batch size)
> > > Yes, writeback will be submitted by one flusher thread but the question
> > > is rather where the writeback will be completed. And that depends on which
> > > CPU that particular irq is handled. As far as my weak knowledge of HW goes,
> > > this very much depends on the system configuration (i.e., irq affinity and
> > > other things).
> >
> > And how many paths to the storage you are using, how threaded the
> > underlying driver is, whether it is using MSI to direct interrupts to
> > multiple CPUs instead of just one, etc.
> >
> > As we scale up we're more likely to see multiple CPUs doing IO
> > completion for the same BDI because the storage configs are more
> > complex in high end machines. Hence IMO preventing cacheline
> > bouncing between submission and completion is a significant
> > scalability concern.
> Thanks for the details. I'm wondering whether we could assume that although
> IO completion can run on several CPUs, it will still be a fairly limited
> number of CPUs.
I don't think we can. For example, the 10GB/s test bed I used in
2006 had 32 dual port FC HBAs in it and only 24 CPUs. The result was
that every single CPU was taking HBA interrupts and hence doing IO
completion processing.
> If this is the case, we could then implement a per-cpu counter
> that would additionally track the number of CPUs modifying the counter
> (the number of CPUs would get zeroed in ???_counter_sum). This way
> the number of atomic operations won't be much higher (only one
> atomic inc when a CPU updates the counter for the first time), and
> if only a few CPUs modify the counter, we would be able to bound
> the error much better.
Effectively track the number of changed counters? That might work on
several levels. Firstly, the number of changed counters could be a
trigger for accurate summing during counter reads, and secondly it
could be used to optimise the summing to lock and sum only the changed
CPUs, which should also speed that operation up...
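A companion sketch of such a sum (using the same hypothetical
bounded_percpu_counter structure sketched earlier in the thread): only the
CPUs recorded in the mask are visited, their deltas are folded into the
global count, and the mask is cleared so the error bound starts again from
zero:

static s64 bpc_sum(struct bounded_percpu_counter *fbc)
{
	s64 ret;
	int cpu;

	spin_lock(&fbc->lock);
	ret = fbc->count;
	for_each_cpu(cpu, &fbc->changed) {
		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);

		ret += *pcount;
		*pcount = 0;		/* fold the delta into ->count below */
	}
	fbc->count = ret;
	cpumask_clear(&fbc->changed);
	spin_unlock(&fbc->lock);
	/*
	 * This races with concurrent bpc_add() on other CPUs, much like
	 * percpu_counter_sum() itself; good enough for throttling
	 * heuristics, not for exact accounting.
	 */
	return ret;
}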
I also had a thought for removing the error completely from the
default complete-on-submitter-cpu behaviour - all the percpu
accounting in that case is likely to be on the local counter, so the
counter read could return (fbc->count + local cpu count) rather than
just fbc->count. This gives us finer-grained visibility of local
completions at no extra cost.
This only falls down when completions are on another CPU, but if we
can track the maximum counter error bounds across those other CPUs
like you've suggested, then we should be able to make this work...
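A sketch of that cheap read against the existing struct percpu_counter (the
helper itself is hypothetical, not a current API): fold in only the local
CPU's delta, which is where most of the error lives when submission and
completion both run on the flusher's CPU:

#include <linux/percpu_counter.h>

static s64 percpu_counter_read_local(struct percpu_counter *fbc)
{
	s64 ret;
	int cpu = get_cpu();

	/* fbc->count plus this CPU's not-yet-folded delta */
	ret = fbc->count + *per_cpu_ptr(fbc->counters, cpu);
	put_cpu();
	return ret;
}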
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
* Re: [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread
2010-06-23 6:25 ` Wu Fengguang
@ 2010-06-23 23:42 ` Dave Chinner
0 siblings, 0 replies; 43+ messages in thread
From: Dave Chinner @ 2010-06-23 23:42 UTC (permalink / raw)
To: Wu Fengguang
Cc: Jan Kara, Christoph Hellwig, Andrew Morton,
linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
peterz@infradead.org
On Wed, Jun 23, 2010 at 02:25:40PM +0800, Wu Fengguang wrote:
> On Wed, Jun 23, 2010 at 02:03:19PM +0800, Dave Chinner wrote:
> > On Wed, Jun 23, 2010 at 11:22:13AM +0800, Wu Fengguang wrote:
> > > On Wed, Jun 23, 2010 at 11:06:04AM +0800, Dave Chinner wrote:
> > > > On Wed, Jun 23, 2010 at 09:34:26AM +0800, Wu Fengguang wrote:
> > > > > On Wed, Jun 23, 2010 at 06:45:51AM +0800, Dave Chinner wrote:
> > > > > > By default we set QUEUE_FLAG_SAME_COMP, which means we hand
> > > > > > completions back to the submitter CPU during blk_complete_request().
> > > > > > Completion processing is then handled by a softirq on the CPU
> > > > > > selected for completion processing.
> > > > >
> > > > > Good to know about that, thanks!
> > > > >
> > > > > > This was done, IIRC, because it provided some OLTP benchmark 1-2%
> > > > > > better results. It can, however, be turned off via
> > > > > > /sys/block/<foo>/queue/rq_affinity, and there's no guarantee that
> > > > > > the completion processing doesn't get handled off to some other CPU
> > > > > > (e.g. via a workqueue) so we cannot rely on this completion
> > > > > > behaviour to avoid cacheline bouncing.
> > > > >
> > > > > If rq_affinity does not work reliably somewhere in the IO completion
> > > > > path, why not trying to fix it?
> > > >
> > > > Because completion on the submitter CPU is not ideal for high
> > > > bandwidth buffered IO.
> > >
> > > Yes there may be heavy post-processing for read data, however for writes
> > > it is mainly the pre-processing that costs CPU?
> >
> > Could be either - delayed allocation requires significant pre-processing
> > for allocation. Avoiding this by using preallocation just
> > moves the processing load to IO completion which needs to issue
> > transactions to mark the region written.
>
> Good point, thanks.
>
> > > So perfect rq_affinity
> > > should always benefit write IO?
> >
> > No, because the flusher thread gets to be CPU bound just writing
> > pages, allocating blocks and submitting IO. It might take 5-10GB/s
> > to get there (say a million dirty pages a second being processed by
> > a single CPU), but that's the sort of storage subsystem XFS is
> > capable of driving. IO completion time for such a workload is
> > significant, too, so putting that on the same CPU as the flusher
> > thread will slow things down by far more than gain from avoiding
> > cacheline bouncing.
>
> So super fast storage is going to demand multiple flushers per bdi.
> And once we run multiple flushers for one bdi, it will again be
> beneficial to schedule IO completion to the flusher CPU :)
Yes - that is where we want to get to with XFS. But we don't have
multiple bdi-flusher thread support yet for any filesystem, so
I think it will be a while before we can ignore this issue...
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
End of thread (newest message: 2010-06-23 23:42 UTC)
Thread overview: 43+ messages
2010-06-17 18:04 [PATCH RFC] mm: Implement balance_dirty_pages() through waiting for flusher thread Jan Kara
2010-06-18 6:09 ` Dave Chinner
2010-06-18 9:11 ` Peter Zijlstra
2010-06-18 23:29 ` Dave Chinner
2010-06-21 23:36 ` Jan Kara
2010-06-22 5:44 ` Dave Chinner
2010-06-22 6:14 ` Andrew Morton
2010-06-22 7:45 ` Peter Zijlstra
2010-06-22 8:24 ` Andrew Morton
2010-06-22 8:52 ` Peter Zijlstra
2010-06-22 10:09 ` Dave Chinner
2010-06-22 13:17 ` Jan Kara
2010-06-22 13:52 ` Wu Fengguang
2010-06-22 13:59 ` Peter Zijlstra
2010-06-22 14:00 ` Peter Zijlstra
2010-06-22 14:36 ` Wu Fengguang
2010-06-22 14:02 ` Jan Kara
2010-06-22 14:24 ` Wu Fengguang
2010-06-22 22:29 ` Dave Chinner
2010-06-23 13:15 ` Jan Kara
2010-06-23 23:06 ` Dave Chinner
2010-06-22 14:31 ` Christoph Hellwig
2010-06-22 14:38 ` Jan Kara
2010-06-22 22:45 ` Dave Chinner
2010-06-23 1:34 ` Wu Fengguang
2010-06-23 3:06 ` Dave Chinner
2010-06-23 3:22 ` Wu Fengguang
2010-06-23 6:03 ` Dave Chinner
2010-06-23 6:25 ` Wu Fengguang
2010-06-23 23:42 ` Dave Chinner
2010-06-22 14:41 ` Wu Fengguang
2010-06-22 11:19 ` Jan Kara
2010-06-18 10:21 ` Peter Zijlstra
2010-06-21 13:31 ` Jan Kara
2010-06-18 10:21 ` Peter Zijlstra
2010-06-21 14:02 ` Jan Kara
2010-06-21 14:10 ` Jan Kara
2010-06-21 14:12 ` Peter Zijlstra
2010-06-18 10:21 ` Peter Zijlstra
2010-06-21 13:42 ` Jan Kara
2010-06-22 4:07 ` Wu Fengguang
2010-06-22 13:27 ` Jan Kara
2010-06-22 13:33 ` Wu Fengguang