* [PATCH 1/3] radix-tree: Implement function radix_tree_gang_tag_if_tagged
2010-02-11 23:06 [RFC] [PATCH 0/3] Writeback livelock avoidance using page tags (v2) Jan Kara
@ 2010-02-11 23:06 ` Jan Kara
2010-02-15 16:01 ` Nick Piggin
2010-02-11 23:06 ` [PATCH 2/3] mm: Implement writeback livelock avoidance using page tagging Jan Kara
2010-02-11 23:06 ` [PATCH 3/3] mm: Debugging of new livelock avoidance Jan Kara
2 siblings, 1 reply; 12+ messages in thread
From: Jan Kara @ 2010-02-11 23:06 UTC (permalink / raw)
To: LKML; +Cc: Andrew Morton, npiggin, fengguang.wu, Jan Kara
Implement function for setting one tag if another tag is set
for each item in given range.
Signed-off-by: Jan Kara <jack@suse.cz>
---
include/linux/radix-tree.h | 3 ++
lib/radix-tree.c | 82 ++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 85 insertions(+), 0 deletions(-)
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index c5da749..41fa087 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -185,6 +185,9 @@ unsigned int
radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
unsigned long first_index, unsigned int max_items,
unsigned int tag);
+unsigned long radix_tree_gang_tag_if_tagged(struct radix_tree_root *root,
+ unsigned long first_index, unsigned long last_index,
+ unsigned int fromtag, unsigned int totag);
int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
static inline void radix_tree_preload_end(void)
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 92cdd99..d821a17 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -610,6 +610,88 @@ int radix_tree_tag_get(struct radix_tree_root *root,
EXPORT_SYMBOL(radix_tree_tag_get);
/**
+ * radix_tree_gang_tag_if_tagged - for each item in given range set given
+ * tag if item has another tag set
+ * @root: radix tree root
+ * @first_index: starting index of a range to scan
+ * @last_index: last index of a range to scan
+ * @iftag: tag index to test
+ * @settag: tag index to set if tested tag is set
+ *
+ * This function scans range of radix tree from first_index to last_index.
+ * For each item in the range if iftag is set, the function sets also
+ * settag.
+ *
+ * The function returns number of leaves where the tag was set.
+ */
+unsigned long radix_tree_gang_tag_if_tagged(struct radix_tree_root *root,
+ unsigned long first_index, unsigned long last_index,
+ unsigned int iftag, unsigned int settag)
+{
+ unsigned int height = root->height, shift;
+ unsigned long tagged = 0, index = first_index;
+ struct radix_tree_node *open_slots[height], *slot;
+
+ last_index = min(last_index, radix_tree_maxindex(height));
+ if (first_index > last_index)
+ return 0;
+ if (!root_tag_get(root, iftag))
+ return 0;
+ if (height == 0) {
+ root_tag_set(root, settag);
+ return 1;
+ }
+
+ shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+ slot = radix_tree_indirect_to_ptr(root->rnode);
+
+ for (;;) {
+ int offset;
+
+ offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+ if (!slot->slots[offset])
+ goto next;
+ if (!tag_get(slot, iftag, offset))
+ goto next;
+ tag_set(slot, settag, offset);
+ if (height == 1) {
+ tagged++;
+ goto next;
+ }
+ /* Go down one level */
+ height--;
+ shift -= RADIX_TREE_MAP_SHIFT;
+ open_slots[height] = slot;
+ slot = slot->slots[offset];
+ continue;
+next:
+ /* Go to next item at level determined by 'shift' */
+ index = ((index >> shift) + 1) << shift;
+ if (index > last_index)
+ break;
+ while (((index >> shift) & RADIX_TREE_MAP_MASK) == 0) {
+ /*
+ * We've fully scanned this node. Go up. Because
+ * last_index is guaranteed to be in the tree, what
+ * we do below cannot wander astray.
+ */
+ slot = open_slots[height];
+ height++;
+ shift += RADIX_TREE_MAP_SHIFT;
+ }
+ }
+ /*
+ * The iftag must have been set somewhere because otherwise
+ * we would return immediated at the beginning of the function
+ */
+ root_tag_set(root, settag);
+
+ return tagged;
+}
+EXPORT_SYMBOL(radix_tree_gang_tag_if_tagged);
+
+
+/**
* radix_tree_next_hole - find the next hole (not-present entry)
* @root: tree root
* @index: index key
--
1.6.4.2
^ permalink raw reply related [flat|nested] 12+ messages in thread* Re: [PATCH 1/3] radix-tree: Implement function radix_tree_gang_tag_if_tagged
2010-02-11 23:06 ` [PATCH 1/3] radix-tree: Implement function radix_tree_gang_tag_if_tagged Jan Kara
@ 2010-02-15 16:01 ` Nick Piggin
0 siblings, 0 replies; 12+ messages in thread
From: Nick Piggin @ 2010-02-15 16:01 UTC (permalink / raw)
To: Jan Kara; +Cc: LKML, Andrew Morton, fengguang.wu
On Fri, Feb 12, 2010 at 12:06:22AM +0100, Jan Kara wrote:
> Implement function for setting one tag if another tag is set
> for each item in given range.
>
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
> include/linux/radix-tree.h | 3 ++
> lib/radix-tree.c | 82 ++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 85 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
> index c5da749..41fa087 100644
> --- a/include/linux/radix-tree.h
> +++ b/include/linux/radix-tree.h
> @@ -185,6 +185,9 @@ unsigned int
> radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
> unsigned long first_index, unsigned int max_items,
> unsigned int tag);
> +unsigned long radix_tree_gang_tag_if_tagged(struct radix_tree_root *root,
> + unsigned long first_index, unsigned long last_index,
> + unsigned int fromtag, unsigned int totag);
Really minor thing here, I think I called my equivalent something
like radix_tree_tag_set_if_tagged. We don't use tag/untag naming
but tag_set/tag_clear.
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH 2/3] mm: Implement writeback livelock avoidance using page tagging
2010-02-11 23:06 [RFC] [PATCH 0/3] Writeback livelock avoidance using page tags (v2) Jan Kara
2010-02-11 23:06 ` [PATCH 1/3] radix-tree: Implement function radix_tree_gang_tag_if_tagged Jan Kara
@ 2010-02-11 23:06 ` Jan Kara
2010-02-12 19:39 ` Andrew Morton
2010-02-11 23:06 ` [PATCH 3/3] mm: Debugging of new livelock avoidance Jan Kara
2 siblings, 1 reply; 12+ messages in thread
From: Jan Kara @ 2010-02-11 23:06 UTC (permalink / raw)
To: LKML; +Cc: Andrew Morton, npiggin, fengguang.wu, Jan Kara
We go though all sorts of hoops to avoid livelocking when we do
writeback on a mapping and someone steadily creates dirty pages
in that mapping. The motivation of this patch is to implement
a simple way to avoid livelocks (an thus we can rip out all the
methods we used previously).
The idea is simple: Tag all pages that should be written back
with a special tag (TOWRITE) in the radix tree. This can be done
rather quickly and thus livelocks should not happen in practice.
Then we start doing the hard work of locking pages and sending
them to disk only for those pages that have TOWRITE tag set.
Signed-off-by: Jan Kara <jack@suse.cz>
---
include/linux/fs.h | 1 +
include/linux/radix-tree.h | 2 +-
mm/page-writeback.c | 37 ++++++++++++++++++++++++++++++++++++-
3 files changed, 38 insertions(+), 2 deletions(-)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b1bcb27..da425f5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -681,6 +681,7 @@ struct block_device {
*/
#define PAGECACHE_TAG_DIRTY 0
#define PAGECACHE_TAG_WRITEBACK 1
+#define PAGECACHE_TAG_TOWRITE 2
int mapping_tagged(struct address_space *mapping, int tag);
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 41fa087..c3f99b5 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -55,7 +55,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
/*** radix-tree API starts here ***/
-#define RADIX_TREE_MAX_TAGS 2
+#define RADIX_TREE_MAX_TAGS 3
/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
struct radix_tree_root {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b19943..f37cbc2 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -803,6 +803,30 @@ void __init page_writeback_init(void)
}
/**
+ * tag_pages_for_writeback - tag pages to be written by write_cache_pages
+ * @mapping: address space structure to write
+ * @start: starting page index
+ * @end: ending page index (inclusive)
+ *
+ * This function scans the page range from @start to @end and tags all pages
+ * that have DIRTY tag set with a special TOWRITE tag. The idea is that
+ * write_cache_pages (or whoever calls this function) will then use TOWRITE tag
+ * to identify pages eligible for writeback. This mechanism is used to avoid
+ * livelocking of writeback by a process steadily creating new dirty pages in
+ * the file (thus it is important for this function to be damn quick so that it
+ * can tag pages faster than a dirtying process can create them).
+ */
+void tag_pages_for_writeback(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+ spin_lock_irq(&mapping->tree_lock);
+ radix_tree_gang_tag_if_tagged(&mapping->page_tree, start, end,
+ PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
+ spin_unlock_irq(&mapping->tree_lock);
+}
+EXPORT_SYMBOL(tag_pages_for_writeback);
+
+/**
* write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
* @mapping: address space structure to write
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
@@ -816,6 +840,13 @@ void __init page_writeback_init(void)
* the call was made get new I/O started against them. If wbc->sync_mode is
* WB_SYNC_ALL then we were called for data integrity and we must wait for
* existing IO to complete.
+ *
+ * To avoid livelocks (when other process dirties new pages), we first tag
+ * pages which should be written back with TOWRITE tag and only then start
+ * writing them. For data-integrity sync we have to be careful so that we do
+ * not miss some pages (e.g., because some other process has cleared TOWRITE
+ * tag we set). The rule we follow is that TOWRITE tag can be cleared only
+ * by the process clearing the DIRTY tag (and submitting the page for IO).
*/
int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
@@ -850,12 +881,13 @@ int write_cache_pages(struct address_space *mapping,
cycled = 1; /* ignore range_cyclic tests */
}
retry:
+ tag_pages_for_writeback(mapping, index, end);
done_index = index;
while (!done && (index <= end)) {
int i;
nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY,
+ PAGECACHE_TAG_TOWRITE,
min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
if (nr_pages == 0)
break;
@@ -1315,6 +1347,9 @@ int test_set_page_writeback(struct page *page)
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_DIRTY);
+ radix_tree_tag_clear(&mapping->page_tree,
+ page_index(page),
+ PAGECACHE_TAG_TOWRITE);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestSetPageWriteback(page);
--
1.6.4.2
^ permalink raw reply related [flat|nested] 12+ messages in thread* Re: [PATCH 2/3] mm: Implement writeback livelock avoidance using page tagging
2010-02-11 23:06 ` [PATCH 2/3] mm: Implement writeback livelock avoidance using page tagging Jan Kara
@ 2010-02-12 19:39 ` Andrew Morton
2010-02-15 15:47 ` Jan Kara
0 siblings, 1 reply; 12+ messages in thread
From: Andrew Morton @ 2010-02-12 19:39 UTC (permalink / raw)
To: Jan Kara; +Cc: LKML, npiggin, fengguang.wu
On Fri, 12 Feb 2010 00:06:23 +0100
Jan Kara <jack@suse.cz> wrote:
> The idea is simple: Tag all pages that should be written back
> with a special tag (TOWRITE) in the radix tree. This can be done
> rather quickly and thus livelocks should not happen in practice.
> Then we start doing the hard work of locking pages and sending
> them to disk only for those pages that have TOWRITE tag set.
Adding a second pass across all the pages sounds expensive?
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/3] mm: Implement writeback livelock avoidance using page tagging
2010-02-12 19:39 ` Andrew Morton
@ 2010-02-15 15:47 ` Jan Kara
2010-02-15 16:21 ` Nick Piggin
0 siblings, 1 reply; 12+ messages in thread
From: Jan Kara @ 2010-02-15 15:47 UTC (permalink / raw)
To: Andrew Morton; +Cc: Jan Kara, LKML, npiggin, fengguang.wu
On Fri 12-02-10 11:39:55, Andrew Morton wrote:
> On Fri, 12 Feb 2010 00:06:23 +0100
> Jan Kara <jack@suse.cz> wrote:
>
> > The idea is simple: Tag all pages that should be written back
> > with a special tag (TOWRITE) in the radix tree. This can be done
> > rather quickly and thus livelocks should not happen in practice.
> > Then we start doing the hard work of locking pages and sending
> > them to disk only for those pages that have TOWRITE tag set.
>
> Adding a second pass across all the pages sounds expensive?
Strictly speaking it's just through the radix tree and only through
branches with DIRTY_TAG set. But yes, there is some additional CPU cost.
I just thought that given the total cost of submitting a page it is
an acceptable increase and the simplification is worth it.
Would some numbers make you happier? Any suggestion for measurements?
Because I think that even for writes to tmpfs the change will be lost
in the noise...
Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/3] mm: Implement writeback livelock avoidance using page tagging
2010-02-15 15:47 ` Jan Kara
@ 2010-02-15 16:21 ` Nick Piggin
2010-02-15 16:24 ` Christoph Hellwig
2010-02-15 19:57 ` Jan Kara
0 siblings, 2 replies; 12+ messages in thread
From: Nick Piggin @ 2010-02-15 16:21 UTC (permalink / raw)
To: Jan Kara; +Cc: Andrew Morton, LKML, fengguang.wu
On Mon, Feb 15, 2010 at 04:47:51PM +0100, Jan Kara wrote:
> On Fri 12-02-10 11:39:55, Andrew Morton wrote:
> > On Fri, 12 Feb 2010 00:06:23 +0100
> > Jan Kara <jack@suse.cz> wrote:
> >
> > > The idea is simple: Tag all pages that should be written back
> > > with a special tag (TOWRITE) in the radix tree. This can be done
> > > rather quickly and thus livelocks should not happen in practice.
> > > Then we start doing the hard work of locking pages and sending
> > > them to disk only for those pages that have TOWRITE tag set.
> >
> > Adding a second pass across all the pages sounds expensive?
> Strictly speaking it's just through the radix tree and only through
> branches with DIRTY_TAG set. But yes, there is some additional CPU cost.
> I just thought that given the total cost of submitting a page it is
> an acceptable increase and the simplification is worth it.
> Would some numbers make you happier? Any suggestion for measurements?
> Because I think that even for writes to tmpfs the change will be lost
> in the noise...
Although hmm, if it is a very large file with *lots* of dirty pages
then it might become a noticable proportion of the cost.
Dave Chinner would probably tell you he's seen files with many
gigabytes dirty, and what is nr_to_write set to? 1024 is it? So you
might be tagging hundreds or thousands of radix tree entries per
page you write.
Also, I wonder what you think about leaving the tags dangling when
the loop bails out early? I have a *slight* concern about this
because previously we never have a tag set when radix_tree_delete
is called. I actually had a bug in that code in earlier versions
of rcu radix tree that only got found by the user test harness.
And another slight concern that it is just a bit ugly to leave the
tag. But I can accept that lower CPU overhead trumps ugliness :)
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/3] mm: Implement writeback livelock avoidance using page tagging
2010-02-15 16:21 ` Nick Piggin
@ 2010-02-15 16:24 ` Christoph Hellwig
2010-02-15 16:37 ` Nick Piggin
2010-02-15 19:57 ` Jan Kara
1 sibling, 1 reply; 12+ messages in thread
From: Christoph Hellwig @ 2010-02-15 16:24 UTC (permalink / raw)
To: Nick Piggin; +Cc: Jan Kara, Andrew Morton, LKML, fengguang.wu
On Tue, Feb 16, 2010 at 03:21:27AM +1100, Nick Piggin wrote:
> Also, I wonder what you think about leaving the tags dangling when
> the loop bails out early? I have a *slight* concern about this
> because previously we never have a tag set when radix_tree_delete
> is called. I actually had a bug in that code in earlier versions
> of rcu radix tree that only got found by the user test harness.
> And another slight concern that it is just a bit ugly to leave the
> tag. But I can accept that lower CPU overhead trumps ugliness :)
The XFS inode cache calls radix_tree_delete with a tag set, and
interestingly enough we're trying to catch a very weird bug in that
area currently, which seems more or less directly related to the use
of tags. But we also had a real locking bug related to tags, so I'm
not yet sure what the issue is back until I hear back from the bug
reported and reproducer.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/3] mm: Implement writeback livelock avoidance using page tagging
2010-02-15 16:24 ` Christoph Hellwig
@ 2010-02-15 16:37 ` Nick Piggin
0 siblings, 0 replies; 12+ messages in thread
From: Nick Piggin @ 2010-02-15 16:37 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Jan Kara, Andrew Morton, LKML, fengguang.wu
On Mon, Feb 15, 2010 at 11:24:53AM -0500, Christoph Hellwig wrote:
> On Tue, Feb 16, 2010 at 03:21:27AM +1100, Nick Piggin wrote:
> > Also, I wonder what you think about leaving the tags dangling when
> > the loop bails out early? I have a *slight* concern about this
> > because previously we never have a tag set when radix_tree_delete
> > is called. I actually had a bug in that code in earlier versions
> > of rcu radix tree that only got found by the user test harness.
> > And another slight concern that it is just a bit ugly to leave the
> > tag. But I can accept that lower CPU overhead trumps ugliness :)
>
> The XFS inode cache calls radix_tree_delete with a tag set, and
> interestingly enough we're trying to catch a very weird bug in that
> area currently, which seems more or less directly related to the use
> of tags. But we also had a real locking bug related to tags, so I'm
> not yet sure what the issue is back until I hear back from the bug
> reported and reproducer.
Upon looking at the radix_tree_delete code again, the tag clearing
there seems rather simple so perhaps I misremembered some detail of
that bug I had.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/3] mm: Implement writeback livelock avoidance using page tagging
2010-02-15 16:21 ` Nick Piggin
2010-02-15 16:24 ` Christoph Hellwig
@ 2010-02-15 19:57 ` Jan Kara
2010-02-16 14:19 ` Nick Piggin
1 sibling, 1 reply; 12+ messages in thread
From: Jan Kara @ 2010-02-15 19:57 UTC (permalink / raw)
To: Nick Piggin; +Cc: Jan Kara, Andrew Morton, LKML, fengguang.wu
On Tue 16-02-10 03:21:27, Nick Piggin wrote:
> On Mon, Feb 15, 2010 at 04:47:51PM +0100, Jan Kara wrote:
> > On Fri 12-02-10 11:39:55, Andrew Morton wrote:
> > > On Fri, 12 Feb 2010 00:06:23 +0100
> > > Jan Kara <jack@suse.cz> wrote:
> > >
> > > > The idea is simple: Tag all pages that should be written back
> > > > with a special tag (TOWRITE) in the radix tree. This can be done
> > > > rather quickly and thus livelocks should not happen in practice.
> > > > Then we start doing the hard work of locking pages and sending
> > > > them to disk only for those pages that have TOWRITE tag set.
> > >
> > > Adding a second pass across all the pages sounds expensive?
> > Strictly speaking it's just through the radix tree and only through
> > branches with DIRTY_TAG set. But yes, there is some additional CPU cost.
> > I just thought that given the total cost of submitting a page it is
> > an acceptable increase and the simplification is worth it.
> > Would some numbers make you happier? Any suggestion for measurements?
> > Because I think that even for writes to tmpfs the change will be lost
> > in the noise...
>
> Although hmm, if it is a very large file with *lots* of dirty pages
> then it might become a noticable proportion of the cost.
>
> Dave Chinner would probably tell you he's seen files with many
> gigabytes dirty, and what is nr_to_write set to? 1024 is it? So you
> might be tagging hundreds or thousands of radix tree entries per
> page you write.
Hmm, this is a good point. My final aim is to remove the nr_to_write
logic so then the only case when we break out of the loop in
write_cache_pages early would be when ->writepage returns an error.
But until then what you describe can happen quite regularly.
An obvious workaround is to limit the amount of tags transferred but that
might have to repeat retagging and it just gets ugly. Another solution
would be to remove the DIRTY tag as we set the TOWRITE tag. That might
actually work quite nicely what do you think?
Finally, I could complement this series with patches substituting
nr_to_write logic but that would make the series considerably more
complicated and controverse so I rather wanted to do it in small steps...
> Also, I wonder what you think about leaving the tags dangling when
> the loop bails out early? I have a *slight* concern about this
> because previously we never have a tag set when radix_tree_delete
> is called. I actually had a bug in that code in earlier versions
> of rcu radix tree that only got found by the user test harness.
> And another slight concern that it is just a bit ugly to leave the
> tag. But I can accept that lower CPU overhead trumps ugliness :)
I specifically checked __remove_from_page_cache -> radix_tree_delete
path to verify what happens and everything seems to handle that just
fine. BTW, note that cancel_dirty_page does not clear the radix tree
dirty tag either so during truncation radix_tree_delete will be called
with tagged pages as well.
Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/3] mm: Implement writeback livelock avoidance using page tagging
2010-02-15 19:57 ` Jan Kara
@ 2010-02-16 14:19 ` Nick Piggin
0 siblings, 0 replies; 12+ messages in thread
From: Nick Piggin @ 2010-02-16 14:19 UTC (permalink / raw)
To: Jan Kara; +Cc: Andrew Morton, LKML, fengguang.wu
On Mon, Feb 15, 2010 at 08:57:13PM +0100, Jan Kara wrote:
> On Tue 16-02-10 03:21:27, Nick Piggin wrote:
> > On Mon, Feb 15, 2010 at 04:47:51PM +0100, Jan Kara wrote:
> > > On Fri 12-02-10 11:39:55, Andrew Morton wrote:
> > > > On Fri, 12 Feb 2010 00:06:23 +0100
> > > > Jan Kara <jack@suse.cz> wrote:
> > > >
> > > > > The idea is simple: Tag all pages that should be written back
> > > > > with a special tag (TOWRITE) in the radix tree. This can be done
> > > > > rather quickly and thus livelocks should not happen in practice.
> > > > > Then we start doing the hard work of locking pages and sending
> > > > > them to disk only for those pages that have TOWRITE tag set.
> > > >
> > > > Adding a second pass across all the pages sounds expensive?
> > > Strictly speaking it's just through the radix tree and only through
> > > branches with DIRTY_TAG set. But yes, there is some additional CPU cost.
> > > I just thought that given the total cost of submitting a page it is
> > > an acceptable increase and the simplification is worth it.
> > > Would some numbers make you happier? Any suggestion for measurements?
> > > Because I think that even for writes to tmpfs the change will be lost
> > > in the noise...
> >
> > Although hmm, if it is a very large file with *lots* of dirty pages
> > then it might become a noticable proportion of the cost.
> >
> > Dave Chinner would probably tell you he's seen files with many
> > gigabytes dirty, and what is nr_to_write set to? 1024 is it? So you
> > might be tagging hundreds or thousands of radix tree entries per
> > page you write.
> Hmm, this is a good point. My final aim is to remove the nr_to_write
> logic so then the only case when we break out of the loop in
> write_cache_pages early would be when ->writepage returns an error.
> But until then what you describe can happen quite regularly.
> An obvious workaround is to limit the amount of tags transferred but that
> might have to repeat retagging and it just gets ugly. Another solution
Yes I quickly thought about that but it gets hard.
> would be to remove the DIRTY tag as we set the TOWRITE tag. That might
> actually work quite nicely what do you think?
Hmm, would there be a risk of races with other code looking up
the dirty tag? It might work but it would be a bit scary.
How about only re-tagging the TOWRITE tags when there are none
left? (or at the point where a sync is requested).
> Finally, I could complement this series with patches substituting
> nr_to_write logic but that would make the series considerably more
> complicated and controverse so I rather wanted to do it in small steps...
Well if one of the justifications for this patch is that following
series then I'd like to see it. If it is simplifying other parts then
it will be good to see how it interacts with the TOWRITE stuff.
> > Also, I wonder what you think about leaving the tags dangling when
> > the loop bails out early? I have a *slight* concern about this
> > because previously we never have a tag set when radix_tree_delete
> > is called. I actually had a bug in that code in earlier versions
> > of rcu radix tree that only got found by the user test harness.
> > And another slight concern that it is just a bit ugly to leave the
> > tag. But I can accept that lower CPU overhead trumps ugliness :)
> I specifically checked __remove_from_page_cache -> radix_tree_delete
> path to verify what happens and everything seems to handle that just
> fine. BTW, note that cancel_dirty_page does not clear the radix tree
> dirty tag either so during truncation radix_tree_delete will be called
> with tagged pages as well.
Yeah, I must have been wrong about that. But dangling tags are ugly!
If we keep them around as writeback markers as above (refill only
when run out of tags or sync required), then they become conceptually
less ugly maybe.
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH 3/3] mm: Debugging of new livelock avoidance
2010-02-11 23:06 [RFC] [PATCH 0/3] Writeback livelock avoidance using page tags (v2) Jan Kara
2010-02-11 23:06 ` [PATCH 1/3] radix-tree: Implement function radix_tree_gang_tag_if_tagged Jan Kara
2010-02-11 23:06 ` [PATCH 2/3] mm: Implement writeback livelock avoidance using page tagging Jan Kara
@ 2010-02-11 23:06 ` Jan Kara
2 siblings, 0 replies; 12+ messages in thread
From: Jan Kara @ 2010-02-11 23:06 UTC (permalink / raw)
To: LKML; +Cc: Andrew Morton, npiggin, fengguang.wu, Jan Kara
Make some paranoic checks to verify that code does what it's expected to do.
Signed-off-by: Jan Kara <jack@suse.cz>
---
mm/page-writeback.c | 39 +++++++++++++++++++++++++++++++++++++++
1 files changed, 39 insertions(+), 0 deletions(-)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index f37cbc2..a389feb 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -822,6 +822,45 @@ void tag_pages_for_writeback(struct address_space *mapping,
spin_lock_irq(&mapping->tree_lock);
radix_tree_gang_tag_if_tagged(&mapping->page_tree, start, end,
PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
+ /* Debugging aid */
+ {
+ int i;
+ pgoff_t index;
+ unsigned int nr_pages;
+ struct pagevec pvec;
+
+ index = start;
+ do {
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ for (i = 0; i < nr_pages; i++) {
+ if (pvec.pages[i]->index > end) {
+ nr_pages = 0;
+ break;
+ }
+ if (!radix_tree_tag_get(&mapping->page_tree, pvec.pages[i]->index, PAGECACHE_TAG_TOWRITE))
+ printk("Seeing DIRTY !TOWRITE page %lu, state %lu\n", pvec.pages[i]->index, pvec.pages[i]->flags);
+ }
+ pagevec_release(&pvec);
+ } while (nr_pages);
+
+ index = start;
+ do {
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_TOWRITE,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ for (i = 0; i < nr_pages; i++) {
+ if (pvec.pages[i]->index > end) {
+ nr_pages = 0;
+ break;
+ }
+ if (!radix_tree_tag_get(&mapping->page_tree, pvec.pages[i]->index, PAGECACHE_TAG_DIRTY))
+ printk("Seeing !DIRTY TOWRITE page %lu, state %lu\n", pvec.pages[i]->index, pvec.pages[i]->flags);
+ }
+ pagevec_release(&pvec);
+ } while (nr_pages);
+ }
spin_unlock_irq(&mapping->tree_lock);
}
EXPORT_SYMBOL(tag_pages_for_writeback);
--
1.6.4.2
^ permalink raw reply related [flat|nested] 12+ messages in thread