* [Cluster-devel] [PATCH 1/4] gfs2: add IO submission trace points
2010-02-05 5:45 [Cluster-devel] (no subject) Dave Chinner
@ 2010-02-05 5:45 ` Dave Chinner
2010-02-05 9:49 ` Steven Whitehouse
2010-02-05 5:45 ` [Cluster-devel] [PATCH 2/4] gfs2: ordered writes are backwards Dave Chinner
` (2 subsequent siblings)
3 siblings, 1 reply; 12+ messages in thread
From: Dave Chinner @ 2010-02-05 5:45 UTC (permalink / raw)
To: cluster-devel.redhat.com
Useful for tracking down where specific IOs are being issued
from.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
fs/gfs2/log.c | 6 ++++++
fs/gfs2/lops.c | 6 ++++++
fs/gfs2/trace_gfs2.h | 41 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 53 insertions(+), 0 deletions(-)
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 4511b08..bd26dff 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -121,6 +121,7 @@ __acquires(&sdp->sd_log_lock)
lock_buffer(bh);
if (test_clear_buffer_dirty(bh)) {
bh->b_end_io = end_buffer_write_sync;
+ trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, __func__);
submit_bh(WRITE_SYNC_PLUG, bh);
} else {
unlock_buffer(bh);
@@ -610,6 +611,8 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
goto skip_barrier;
get_bh(bh);
+ trace_gfs2_submit_bh(bh, WRITE_SYNC | (1 << BIO_RW_BARRIER) | (1 <<
+ BIO_RW_META), __func__);
submit_bh(WRITE_SYNC | (1 << BIO_RW_BARRIER) | (1 << BIO_RW_META), bh);
wait_on_buffer(bh);
if (buffer_eopnotsupp(bh)) {
@@ -619,6 +622,8 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
lock_buffer(bh);
skip_barrier:
get_bh(bh);
+ trace_gfs2_submit_bh(bh, WRITE_SYNC | (1 << BIO_RW_META),
+ __func__);
submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh);
wait_on_buffer(bh);
}
@@ -670,6 +675,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
lock_buffer(bh);
if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
bh->b_end_io = end_buffer_write_sync;
+ trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, __func__);
submit_bh(WRITE_SYNC_PLUG, bh);
} else {
unlock_buffer(bh);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index de97632..5708edf 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -198,6 +198,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
}
gfs2_log_unlock(sdp);
+ trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, __func__);
submit_bh(WRITE_SYNC_PLUG, bh);
gfs2_log_lock(sdp);
@@ -208,6 +209,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
gfs2_log_unlock(sdp);
lock_buffer(bd2->bd_bh);
bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
+ trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, __func__);
submit_bh(WRITE_SYNC_PLUG, bh);
gfs2_log_lock(sdp);
if (++n >= num)
@@ -350,6 +352,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
sdp->sd_log_num_revoke--;
if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
+ trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, __func__);
submit_bh(WRITE_SYNC_PLUG, bh);
bh = gfs2_log_get_buf(sdp);
@@ -367,6 +370,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
}
gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
+ trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, __func__);
submit_bh(WRITE_SYNC_PLUG, bh);
}
@@ -569,6 +573,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
ptr = bh_log_ptr(bh);
get_bh(bh);
+ trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, __func__);
submit_bh(WRITE_SYNC_PLUG, bh);
gfs2_log_lock(sdp);
while(!list_empty(list)) {
@@ -595,6 +600,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
} else {
bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh);
}
+ trace_gfs2_submit_bh(bh1, WRITE_SYNC_PLUG, __func__);
submit_bh(WRITE_SYNC_PLUG, bh1);
gfs2_log_lock(sdp);
ptr += 2;
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 148d55c..e9a231d 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -397,6 +397,47 @@ TRACE_EVENT(gfs2_block_alloc,
block_state_name(__entry->block_state))
);
+#include <linux/fs.h>
+#include <linux/bio.h>
+
+#define BH_IO_TYPE \
+ { READ, "read" }, \
+ { WRITE, "write" }, \
+ { READA, "readahead" }, \
+ { SWRITE, "swrite" }, \
+ { READ_SYNC, "read sync" }, \
+ { READ_META, "read meta" }, \
+ { WRITE_SYNC, "write sync" }, \
+ { WRITE_SYNC_PLUG, "write sync plug" }, \
+ { WRITE_META, "write meta" }, \
+ { WRITE_BARRIER, "write barrier" }
+
+TRACE_EVENT(gfs2_submit_bh,
+ TP_PROTO(struct buffer_head *bh, int rw, const char * func),
+ TP_ARGS(bh, rw, func),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long long, bno)
+ __field(unsigned long long, len)
+ __field(int, rw)
+ __field(const char *, func)
+ ),
+ TP_fast_assign(
+ __entry->dev = bh->b_bdev->bd_dev;
+ __entry->bno = bh->b_blocknr;
+ __entry->len = bh->b_size;
+ __entry->rw = rw;
+ __entry->func = func;
+ ),
+ TP_printk("dev %d:%d %s bno %llu len %llu caller %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->rw, BH_IO_TYPE),
+ __entry->bno,
+ __entry->len,
+ __entry->func)
+);
+
+
#endif /* _TRACE_GFS2_H */
/* This part must be outside protection */
--
1.6.5
* [Cluster-devel] [PATCH 1/4] gfs2: add IO submission trace points
2010-02-05 5:45 ` [Cluster-devel] [PATCH 1/4] gfs2: add IO submission trace points Dave Chinner
@ 2010-02-05 9:49 ` Steven Whitehouse
2010-02-05 9:48 ` Christoph Hellwig
0 siblings, 1 reply; 12+ messages in thread
From: Steven Whitehouse @ 2010-02-05 9:49 UTC (permalink / raw)
To: cluster-devel.redhat.com
Hi,
On Fri, 2010-02-05 at 16:45 +1100, Dave Chinner wrote:
> Useful for tracking down where specific IOs are being issued
> from.
>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
> fs/gfs2/log.c | 6 ++++++
> fs/gfs2/lops.c | 6 ++++++
> fs/gfs2/trace_gfs2.h | 41 +++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 53 insertions(+), 0 deletions(-)
>
> diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
> index 4511b08..bd26dff 100644
> --- a/fs/gfs2/log.c
> +++ b/fs/gfs2/log.c
> @@ -121,6 +121,7 @@ __acquires(&sdp->sd_log_lock)
> lock_buffer(bh);
> if (test_clear_buffer_dirty(bh)) {
> bh->b_end_io = end_buffer_write_sync;
> + trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, __func__);
> submit_bh(WRITE_SYNC_PLUG, bh);
This looks like it could be a generically useful function, I wonder if
it would be possible to do this directly in submit_bh, since we should
be able to use __builtin_return_address(0) to find out the origin of the
call?
Steve.
* [Cluster-devel] [PATCH 1/4] gfs2: add IO submission trace points
2010-02-05 9:49 ` Steven Whitehouse
@ 2010-02-05 9:48 ` Christoph Hellwig
0 siblings, 0 replies; 12+ messages in thread
From: Christoph Hellwig @ 2010-02-05 9:48 UTC (permalink / raw)
To: cluster-devel.redhat.com
On Fri, Feb 05, 2010 at 09:49:54AM +0000, Steven Whitehouse wrote:
> Hi,
>
> On Fri, 2010-02-05 at 16:45 +1100, Dave Chinner wrote:
> > Useful for tracking down where specific IOs are being issued
> > from.
> >
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> > fs/gfs2/log.c | 6 ++++++
> > fs/gfs2/lops.c | 6 ++++++
> > fs/gfs2/trace_gfs2.h | 41 +++++++++++++++++++++++++++++++++++++++++
> > 3 files changed, 53 insertions(+), 0 deletions(-)
> >
> > diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
> > index 4511b08..bd26dff 100644
> > --- a/fs/gfs2/log.c
> > +++ b/fs/gfs2/log.c
> > @@ -121,6 +121,7 @@ __acquires(&sdp->sd_log_lock)
> > lock_buffer(bh);
> > if (test_clear_buffer_dirty(bh)) {
> > bh->b_end_io = end_buffer_write_sync;
> > + trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, __func__);
> > submit_bh(WRITE_SYNC_PLUG, bh);
> This looks like it could be a generically useful function, I wonder if
> it would be possible to do this directly in submit_bh, since we should
> be able to use __builtin_return_address(0) to find out the origin of the
> call?
Yes, we could. The tracing code normally uses the _RET_IP_ wrapper
for __builtin_return_address(0) for some reason, though.
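For illustration, a rough sketch of what such a generic trace point in submit_bh() might look like, recording the caller with _RET_IP_ as discussed above. The event name, field layout and output format are assumptions, not something posted in this thread:

/*
 * Sketch only (not from the posted patches): a generic trace point
 * that submit_bh() itself could emit, recording the caller with
 * _RET_IP_ instead of passing __func__ from every call site.
 */
TRACE_EVENT(block_submit_bh,
	TP_PROTO(struct buffer_head *bh, int rw, unsigned long ip),
	TP_ARGS(bh, rw, ip),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(unsigned long long, bno)
		__field(unsigned long long, len)
		__field(int, rw)
		__field(unsigned long, ip)
	),
	TP_fast_assign(
		__entry->dev = bh->b_bdev->bd_dev;
		__entry->bno = bh->b_blocknr;
		__entry->len = bh->b_size;
		__entry->rw = rw;
		__entry->ip = ip;
	),
	TP_printk("dev %d:%d rw %d bno %llu len %llu caller %pS",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->rw, __entry->bno, __entry->len,
		  (void *)__entry->ip)
);

/* submit_bh() would then open with something like:
 *	trace_block_submit_bh(bh, rw, _RET_IP_);
 */

With a block-layer event along these lines, the gfs2-specific event in patch 1 could either go away or remain as a thin filesystem-level wrapper.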
* [Cluster-devel] [PATCH 2/4] gfs2: ordered writes are backwards
2010-02-05 5:45 [Cluster-devel] (no subject) Dave Chinner
2010-02-05 5:45 ` [Cluster-devel] [PATCH 1/4] gfs2: add IO submission trace points Dave Chinner
@ 2010-02-05 5:45 ` Dave Chinner
2010-02-05 10:02 ` Steven Whitehouse
2010-02-05 5:45 ` [Cluster-devel] [PATCH 3/4] gfs2: ordered buffer writes are not sync Dave Chinner
2010-02-05 5:45 ` [Cluster-devel] [PATCH 4/4] gfs2: introduce AIL lock Dave Chinner
3 siblings, 1 reply; 12+ messages in thread
From: Dave Chinner @ 2010-02-05 5:45 UTC (permalink / raw)
To: cluster-devel.redhat.com
When we queue data buffers for ordered write, the buffers are added
to the head of the ordered write list. When the log needs to push
these buffers to disk, it also walks the list from the head. The
result is that the ordered buffers are submitted to disk in
reverse order.
For large writes, this means that whenever the log flushes, large
streams of reverse sequential order buffers are pushed down into the
block layer.
IO rates tend to be significantly lower than if the IO was issued in
ascending block order.
Queue new ordered buffers to the tail of the ordered buffer list to
ensure that IO is dispatched in the order it was submitted. This
should significantly improve large sequential write speeds. On a
disk capable of 85MB/s, speeds increase from 50MB/s to 65MB/s for
noop and from 38MB/s to 50MB/s for cfq.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
fs/gfs2/lops.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 5708edf..7278cf0 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -532,9 +532,9 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
gfs2_pin(sdp, bd->bd_bh);
tr->tr_num_databuf_new++;
sdp->sd_log_num_databuf++;
- list_add(&le->le_list, &sdp->sd_log_le_databuf);
+ list_add_tail(&le->le_list, &sdp->sd_log_le_databuf);
} else {
- list_add(&le->le_list, &sdp->sd_log_le_ordered);
+ list_add_tail(&le->le_list, &sdp->sd_log_le_ordered);
}
out:
gfs2_log_unlock(sdp);
--
1.6.5
* [Cluster-devel] [PATCH 2/4] gfs2: ordered writes are backwards
2010-02-05 5:45 ` [Cluster-devel] [PATCH 2/4] gfs2: ordered writes are backwards Dave Chinner
@ 2010-02-05 10:02 ` Steven Whitehouse
2010-02-05 10:34 ` Dave Chinner
0 siblings, 1 reply; 12+ messages in thread
From: Steven Whitehouse @ 2010-02-05 10:02 UTC (permalink / raw)
To: cluster-devel.redhat.com
Hi,
This looks good. There is an argument for trying to sort the buffers as
we write them (in case the application writes them out of order) but
this seems a sensible change to catch 90% of cases. I'm just about to
give this a quick test and I'll push this one in straight away if it
looks good on my test,
Steve.
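A minimal sketch of the sort-on-flush idea raised above, assuming the generic list_sort() helper is available and reusing the gfs2_bufdata/sd_log_le_ordered names that appear in the patches; this is illustrative only, not a posted change:

#include <linux/list_sort.h>

/* Sketch only: order queued ordered-write buffers by block number
 * before they are submitted, so the elevator sees ascending IO even
 * when the application writes out of order. Names follow the gfs2
 * code quoted in this thread; nothing here was posted as a patch. */
static int bd_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct gfs2_bufdata *bda, *bdb;

	bda = list_entry(a, struct gfs2_bufdata, bd_le.le_list);
	bdb = list_entry(b, struct gfs2_bufdata, bd_le.le_list);

	if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr)
		return -1;
	if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr)
		return 1;
	return 0;
}

/* and, under the log lock, just before the ordered list is walked
 * for submission:
 *	list_sort(NULL, &sdp->sd_log_le_ordered, bd_cmp);
 */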
On Fri, 2010-02-05 at 16:45 +1100, Dave Chinner wrote:
> When we queue data buffers for ordered write, the buffers are added
> to the head of the ordered write list. When the log needs to push
> these buffers to disk, it also walks the list from the head. The
> result is that the ordered buffers are submitted to disk in
> reverse order.
>
> For large writes, this means that whenever the log flushes, large
> streams of reverse sequential order buffers are pushed down into the
> block layer.
> IO rates tend to be significantly lower than if the IO was issued in
> ascending block order.
>
> Queue new ordered buffers to the tail of the ordered buffer list to
> ensure that IO is dispatched in the order it was submitted. This
> should significantly improve large sequential write speeds. On a
> disk capable of 85MB/s, speeds increase from 50MB/s to 65MB/s for
> noop and from 38MB/s to 50MB/s for cfq.
>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
> fs/gfs2/lops.c | 4 ++--
> 1 files changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
> index 5708edf..7278cf0 100644
> --- a/fs/gfs2/lops.c
> +++ b/fs/gfs2/lops.c
> @@ -532,9 +532,9 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
> gfs2_pin(sdp, bd->bd_bh);
> tr->tr_num_databuf_new++;
> sdp->sd_log_num_databuf++;
> - list_add(&le->le_list, &sdp->sd_log_le_databuf);
> + list_add_tail(&le->le_list, &sdp->sd_log_le_databuf);
> } else {
> - list_add(&le->le_list, &sdp->sd_log_le_ordered);
> + list_add_tail(&le->le_list, &sdp->sd_log_le_ordered);
> }
> out:
> gfs2_log_unlock(sdp);
* [Cluster-devel] [PATCH 2/4] gfs2: ordered writes are backwards
2010-02-05 10:02 ` Steven Whitehouse
@ 2010-02-05 10:34 ` Dave Chinner
0 siblings, 0 replies; 12+ messages in thread
From: Dave Chinner @ 2010-02-05 10:34 UTC (permalink / raw)
To: cluster-devel.redhat.com
On Fri, Feb 05, 2010 at 10:02:15AM +0000, Steven Whitehouse wrote:
> Hi,
>
> This looks good. There is an argument for trying to sort the buffers as
> we write them (in case the application writes them out of order) but
> this seems a sensible change to catch 90% of cases.
Agreed. I'm taking small steps first, though. ;)
Cheers,
Dave.
--
Dave Chinner
dchinner at redhat.com
* [Cluster-devel] [PATCH 3/4] gfs2: ordered buffer writes are not sync
2010-02-05 5:45 [Cluster-devel] (no subject) Dave Chinner
2010-02-05 5:45 ` [Cluster-devel] [PATCH 1/4] gfs2: add IO submission trace points Dave Chinner
2010-02-05 5:45 ` [Cluster-devel] [PATCH 2/4] gfs2: ordered writes are backwards Dave Chinner
@ 2010-02-05 5:45 ` Dave Chinner
2010-02-05 10:58 ` Steven Whitehouse
2010-02-05 5:45 ` [Cluster-devel] [PATCH 4/4] gfs2: introduce AIL lock Dave Chinner
3 siblings, 1 reply; 12+ messages in thread
From: Dave Chinner @ 2010-02-05 5:45 UTC (permalink / raw)
To: cluster-devel.redhat.com
Currently gfs2 ordered buffer writes use WRITE_SYNC_PLUG as the IO
type being dispatched. They aren't sync writes; we issue all the IO
pending, then wait for it all. IOWs, this is async IO with a bulk
wait on the end.
We should use normal WRITE tagging for this, and before we start
waiting, make sure that all the IO is issued by unplugging the
device. The use of normal WRITEs for these buffers should
significantly reduce the overhead of processing in the cfq elevator
and enable the disk subsystem to get much closer to disk bandwidth
for large sequential writes.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
fs/gfs2/aops.c | 3 +++
fs/gfs2/log.c | 11 +++++++----
fs/gfs2/lops.c | 18 ++++++++++--------
3 files changed, 20 insertions(+), 12 deletions(-)
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 7b8da94..b75784c 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -20,6 +20,7 @@
#include <linux/swap.h>
#include <linux/gfs2_ondisk.h>
#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
#include "gfs2.h"
#include "incore.h"
@@ -34,6 +35,7 @@
#include "super.h"
#include "util.h"
#include "glops.h"
+#include "trace_gfs2.h"
static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
@@ -52,6 +54,7 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
if (gfs2_is_jdata(ip))
set_buffer_uptodate(bh);
gfs2_trans_add_bh(ip->i_gl, bh, 0);
+ trace_gfs2_submit_bh(bh, WRITE, __func__);
}
}
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index bd26dff..a9797be 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -18,6 +18,7 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/bio.h>
+#include <linux/blkdev.h>
#include "gfs2.h"
#include "incore.h"
@@ -121,8 +122,8 @@ __acquires(&sdp->sd_log_lock)
lock_buffer(bh);
if (test_clear_buffer_dirty(bh)) {
bh->b_end_io = end_buffer_write_sync;
- trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, __func__);
- submit_bh(WRITE_SYNC_PLUG, bh);
+ trace_gfs2_submit_bh(bh, WRITE, __func__);
+ submit_bh(WRITE, bh);
} else {
unlock_buffer(bh);
brelse(bh);
@@ -675,8 +676,8 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
lock_buffer(bh);
if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
bh->b_end_io = end_buffer_write_sync;
- trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, __func__);
- submit_bh(WRITE_SYNC_PLUG, bh);
+ trace_gfs2_submit_bh(bh, WRITE, __func__);
+ submit_bh(WRITE, bh);
} else {
unlock_buffer(bh);
brelse(bh);
@@ -692,6 +693,8 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
struct gfs2_bufdata *bd;
struct buffer_head *bh;
+ blk_run_backing_dev(blk_get_backing_dev_info(sdp->sd_vfs->s_bdev), NULL);
+
gfs2_log_lock(sdp);
while (!list_empty(&sdp->sd_log_le_ordered)) {
bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_le.le_list);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 7278cf0..0fe2f3c 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -15,6 +15,7 @@
#include <linux/gfs2_ondisk.h>
#include <linux/bio.h>
#include <linux/fs.h>
+#include <linux/blkdev.h>
#include "gfs2.h"
#include "incore.h"
@@ -198,8 +199,8 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
}
gfs2_log_unlock(sdp);
- trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, __func__);
- submit_bh(WRITE_SYNC_PLUG, bh);
+ trace_gfs2_submit_bh(bh, WRITE, __func__);
+ submit_bh(WRITE, bh);
gfs2_log_lock(sdp);
n = 0;
@@ -209,8 +210,8 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
gfs2_log_unlock(sdp);
lock_buffer(bd2->bd_bh);
bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
- trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, __func__);
- submit_bh(WRITE_SYNC_PLUG, bh);
+ trace_gfs2_submit_bh(bh, WRITE, __func__);
+ submit_bh(WRITE, bh);
gfs2_log_lock(sdp);
if (++n >= num)
break;
@@ -220,6 +221,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
total -= num;
}
gfs2_log_unlock(sdp);
+ blk_run_backing_dev(blk_get_backing_dev_info(sdp->sd_vfs->s_bdev), NULL);
}
static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
@@ -573,8 +575,8 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
ptr = bh_log_ptr(bh);
get_bh(bh);
- trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, __func__);
- submit_bh(WRITE_SYNC_PLUG, bh);
+ trace_gfs2_submit_bh(bh, WRITE, __func__);
+ submit_bh(WRITE, bh);
gfs2_log_lock(sdp);
while(!list_empty(list)) {
bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list);
@@ -600,8 +602,8 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
} else {
bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh);
}
- trace_gfs2_submit_bh(bh1, WRITE_SYNC_PLUG, __func__);
- submit_bh(WRITE_SYNC_PLUG, bh1);
+ trace_gfs2_submit_bh(bh1, WRITE, __func__);
+ submit_bh(WRITE, bh1);
gfs2_log_lock(sdp);
ptr += 2;
}
--
1.6.5
* [Cluster-devel] [PATCH 3/4] gfs2: ordered buffer writes are not sync
2010-02-05 5:45 ` [Cluster-devel] [PATCH 3/4] gfs2: ordered buffer writes are not sync Dave Chinner
@ 2010-02-05 10:58 ` Steven Whitehouse
0 siblings, 0 replies; 12+ messages in thread
From: Steven Whitehouse @ 2010-02-05 10:58 UTC (permalink / raw)
To: cluster-devel.redhat.com
Hi,
On Fri, 2010-02-05 at 16:45 +1100, Dave Chinner wrote:
> Currently gfs2 ordered buffer writes use WRITE_SYNC_PLUG as the IO
> type being dispatched. They aren't sync writes; we issue all the IO
> pending, then wait for it all. IOWs, this is async IO with a bulk
> wait on the end.
>
> We should use normal WRITE tagging for this, and before we start
> waiting, make sure that all the IO is issued by unplugging the
> device. The use of normal WRITEs for these buffers should
> significantly reduce the overhead of processing in the cfq elevator
> and enable the disk subsystem to get much closer to disk bandwidth
> for large sequential writes.
>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
That sounds reasonable. With respect to the new trace point, I'd raise
the same question as per the initial patch in the series. Also I'm
wondering about the calls to blk_run_backing_dev() as I'd thought that
this would happen automatically when we get to wait for the I/O.
Bearing in mind that your tests show no particular increase in
performance for this change, I'm tempted to be a bit more cautious about
applying it for now,
Steve.
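For context on the unplug question above: in kernels of this era the per-buffer wait path already kicked the queue, roughly as sketched below. This is reconstructed from memory of fs/buffer.c, so treat the details as an assumption rather than a quotation; the point of patch 3 is to unplug once up front instead of relying on this per-buffer kick.

/* Approximate shape of the buffer wait path at the time (fs/buffer.c):
 * __wait_on_buffer() used this as its wait action, so every wait ran
 * the backing device. Reconstructed from memory; an assumption, not
 * quoted from this thread. */
static int sync_buffer(void *word)
{
	struct block_device *bd;
	struct buffer_head *bh
		= container_of(word, struct buffer_head, b_state);

	smp_mb();
	bd = bh->b_bdev;
	if (bd)
		blk_run_address_space(bd->bd_inode->i_mapping);
	io_schedule();
	return 0;
}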
* [Cluster-devel] [PATCH 4/4] gfs2: introduce AIL lock
2010-02-05 5:45 [Cluster-devel] (no subject) Dave Chinner
` (2 preceding siblings ...)
2010-02-05 5:45 ` [Cluster-devel] [PATCH 3/4] gfs2: ordered buffer writes are not sync Dave Chinner
@ 2010-02-05 5:45 ` Dave Chinner
2010-02-05 11:11 ` Steven Whitehouse
3 siblings, 1 reply; 12+ messages in thread
From: Dave Chinner @ 2010-02-05 5:45 UTC (permalink / raw)
To: cluster-devel.redhat.com
The log lock is currently used to protect the AIL lists and
the movements of buffers into and out of them. The lists
are self contained and no log specific items outside the
lists are accessed when starting or emptying the AIL lists.
Hence the operation of the AIL does not require the protection
of the log lock so split them out into a new AIL specific lock
to reduce the amount of traffic on the log lock. This will
also reduce the amount of serialisation that occurs when
the gfs2_logd pushes on the AIL to move it forward.
This reduces the impact of log pushing on sequential write
throughput. On no-op scheduler on a disk that can do 85MB/s,
this increases the write rate from 65MB/s with the ordering
fixes to 75MB/s.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
fs/gfs2/glops.c | 10 ++++++++--
fs/gfs2/incore.h | 1 +
fs/gfs2/log.c | 32 +++++++++++++++++---------------
fs/gfs2/log.h | 22 ++++++++++++++++++++++
fs/gfs2/lops.c | 5 ++++-
fs/gfs2/ops_fstype.c | 1 +
6 files changed, 53 insertions(+), 18 deletions(-)
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 78554ac..65048f9 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -57,20 +57,26 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
BUG_ON(current->journal_info);
current->journal_info = &tr;
- gfs2_log_lock(sdp);
+ gfs2_ail_lock(sdp);
while (!list_empty(head)) {
bd = list_entry(head->next, struct gfs2_bufdata,
bd_ail_gl_list);
bh = bd->bd_bh;
gfs2_remove_from_ail(bd);
+ gfs2_ail_unlock(sdp);
+
bd->bd_bh = NULL;
bh->b_private = NULL;
bd->bd_blkno = bh->b_blocknr;
+ gfs2_log_lock(sdp);
gfs2_assert_withdraw(sdp, !buffer_busy(bh));
gfs2_trans_add_revoke(sdp, bd);
+ gfs2_log_unlock(sdp);
+
+ gfs2_ail_lock(sdp);
}
gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
- gfs2_log_unlock(sdp);
+ gfs2_ail_unlock(sdp);
gfs2_trans_end(sdp);
gfs2_log_flush(sdp, NULL);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 4792200..00d610f 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -644,6 +644,7 @@ struct gfs2_sbd {
unsigned int sd_log_flush_head;
u64 sd_log_flush_wrapped;
+ spinlock_t sd_ail_lock;
struct list_head sd_ail1_list;
struct list_head sd_ail2_list;
u64 sd_ail_sync_gen;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index a9797be..f2ee615 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -89,8 +89,8 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
*/
static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
-__releases(&sdp->sd_log_lock)
-__acquires(&sdp->sd_log_lock)
+__releases(&sdp->sd_ail_lock)
+__acquires(&sdp->sd_ail_lock)
{
struct gfs2_bufdata *bd, *s;
struct buffer_head *bh;
@@ -118,7 +118,7 @@ __acquires(&sdp->sd_log_lock)
list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
get_bh(bh);
- gfs2_log_unlock(sdp);
+ gfs2_ail_unlock(sdp);
lock_buffer(bh);
if (test_clear_buffer_dirty(bh)) {
bh->b_end_io = end_buffer_write_sync;
@@ -128,7 +128,7 @@ __acquires(&sdp->sd_log_lock)
unlock_buffer(bh);
brelse(bh);
}
- gfs2_log_lock(sdp);
+ gfs2_ail_lock(sdp);
retry = 1;
break;
@@ -178,10 +178,10 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
struct gfs2_ail *first_ai, *ai, *tmp;
int done = 0;
- gfs2_log_lock(sdp);
+ gfs2_ail_lock(sdp);
head = &sdp->sd_ail1_list;
if (list_empty(head)) {
- gfs2_log_unlock(sdp);
+ gfs2_ail_unlock(sdp);
return;
}
sync_gen = sdp->sd_ail_sync_gen++;
@@ -189,7 +189,7 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
first = head->prev;
first_ai = list_entry(first, struct gfs2_ail, ai_list);
first_ai->ai_sync_gen = sync_gen;
- gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */
+ gfs2_ail1_start_one(sdp, first_ai); /* This may drop ail lock */
if (flags & DIO_ALL)
first = NULL;
@@ -204,13 +204,13 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
if (ai->ai_sync_gen >= sync_gen)
continue;
ai->ai_sync_gen = sync_gen;
- gfs2_ail1_start_one(sdp, ai); /* This may drop log lock */
+ gfs2_ail1_start_one(sdp, ai); /* This may drop ail lock */
done = 0;
break;
}
}
- gfs2_log_unlock(sdp);
+ gfs2_ail_unlock(sdp);
}
static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
@@ -218,7 +218,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
struct gfs2_ail *ai, *s;
int ret;
- gfs2_log_lock(sdp);
+ gfs2_ail_lock(sdp);
list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
if (gfs2_ail1_empty_one(sdp, ai, flags))
@@ -229,7 +229,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
ret = list_empty(&sdp->sd_ail1_list);
- gfs2_log_unlock(sdp);
+ gfs2_ail_unlock(sdp);
return ret;
}
@@ -262,7 +262,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
int wrap = (new_tail < old_tail);
int a, b, rm;
- gfs2_log_lock(sdp);
+ gfs2_ail_lock(sdp);
list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
a = (old_tail <= ai->ai_first);
@@ -278,7 +278,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
kfree(ai);
}
- gfs2_log_unlock(sdp);
+ gfs2_ail_unlock(sdp);
}
/**
@@ -437,7 +437,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
struct gfs2_ail *ai;
unsigned int tail;
- gfs2_log_lock(sdp);
+ gfs2_ail_lock(sdp);
if (list_empty(&sdp->sd_ail1_list)) {
tail = sdp->sd_log_head;
@@ -446,7 +446,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
tail = ai->ai_first;
}
- gfs2_log_unlock(sdp);
+ gfs2_ail_unlock(sdp);
return tail;
}
@@ -775,10 +775,12 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
sdp->sd_log_commited_databuf = 0;
sdp->sd_log_commited_revoke = 0;
+ gfs2_ail_lock(sdp);
if (!list_empty(&ai->ai_ail1_list)) {
list_add(&ai->ai_list, &sdp->sd_ail1_list);
ai = NULL;
}
+ gfs2_ail_unlock(sdp);
gfs2_log_unlock(sdp);
trace_gfs2_log_flush(sdp, 0);
up_write(&sdp->sd_log_flush_lock);
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 7c64510..62c110e 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -38,6 +38,28 @@ __releases(&sdp->sd_log_lock)
spin_unlock(&sdp->sd_log_lock);
}
+/**
+ * gfs2_ail_lock - acquire the right to mess with the AIL
+ * @sdp: the filesystem
+ *
+ */
+static inline void gfs2_ail_lock(struct gfs2_sbd *sdp)
+__acquires(&sdp->sd_ail_lock)
+{
+ spin_lock(&sdp->sd_ail_lock);
+}
+
+/**
+ * gfs2_ail_unlock - release the right to mess with the AIL
+ * @sdp: the filesystem
+ *
+ */
+static inline void gfs2_ail_unlock(struct gfs2_sbd *sdp)
+__releases(&sdp->sd_ail_lock)
+{
+ spin_unlock(&sdp->sd_ail_lock);
+}
+
static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
unsigned int value)
{
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 0fe2f3c..342d65e 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -80,7 +80,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
mark_buffer_dirty(bh);
clear_buffer_pinned(bh);
- gfs2_log_lock(sdp);
+ gfs2_ail_lock(sdp);
if (bd->bd_ail) {
list_del(&bd->bd_ail_st_list);
brelse(bh);
@@ -91,6 +91,9 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
}
bd->bd_ail = ai;
list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
+ gfs2_ail_unlock(sdp);
+
+ gfs2_log_lock(sdp);
clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
trace_gfs2_pin(bd, 0);
gfs2_log_unlock(sdp);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index edfee24..14ecb2b 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -108,6 +108,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
mutex_init(&sdp->sd_log_reserve_mutex);
+ spin_lock_init(&sdp->sd_ail_lock);
INIT_LIST_HEAD(&sdp->sd_ail1_list);
INIT_LIST_HEAD(&sdp->sd_ail2_list);
--
1.6.5
* [Cluster-devel] [PATCH 4/4] gfs2: introduce AIL lock
2010-02-05 5:45 ` [Cluster-devel] [PATCH 4/4] gfs2: introduce AIL lock Dave Chinner
@ 2010-02-05 11:11 ` Steven Whitehouse
2010-02-06 2:34 ` Dave Chinner
0 siblings, 1 reply; 12+ messages in thread
From: Steven Whitehouse @ 2010-02-05 11:11 UTC (permalink / raw)
To: cluster-devel.redhat.com
Hi,
On Fri, 2010-02-05 at 16:45 +1100, Dave Chinner wrote:
> The log lock is currently used to protect the AIL lists and
> the movements of buffers into and out of them. The lists
> are self contained and no log specific items outside the
> lists are accessed when starting or emptying the AIL lists.
>
> Hence the operation of the AIL does not require the protection
> of the log lock so split them out into a new AIL specific lock
> to reduce the amount of traffic on the log lock. This will
> also reduce the amount of serialisation that occurs when
> the gfs2_logd pushes on the AIL to move it forward.
>
> This reduces the impact of log pushing on sequential write
> throughput. On no-op scheduler on a disk that can do 85MB/s,
> this increases the write rate from 65MB/s with the ordering
> fixes to 75MB/s.
>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
This looks good, but a couple of comments:
> ---
> fs/gfs2/glops.c | 10 ++++++++--
> fs/gfs2/incore.h | 1 +
> fs/gfs2/log.c | 32 +++++++++++++++++---------------
> fs/gfs2/log.h | 22 ++++++++++++++++++++++
> fs/gfs2/lops.c | 5 ++++-
> fs/gfs2/ops_fstype.c | 1 +
> 6 files changed, 53 insertions(+), 18 deletions(-)
>
> diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
> index 78554ac..65048f9 100644
> --- a/fs/gfs2/glops.c
> +++ b/fs/gfs2/glops.c
> @@ -57,20 +57,26 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
> BUG_ON(current->journal_info);
> current->journal_info = &tr;
>
> - gfs2_log_lock(sdp);
> + gfs2_ail_lock(sdp);
^^^^ this abstraction of a spinlock is left over from the old
gfs1 code. I'd prefer when adding new locks just to use spin_lock(&....)
directly, rather than abstracting it out like this. That way we don't
have to think about what kind of lock it is.
[snip]
> diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
> index 0fe2f3c..342d65e 100644
> --- a/fs/gfs2/lops.c
> +++ b/fs/gfs2/lops.c
> @@ -80,7 +80,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
> mark_buffer_dirty(bh);
> clear_buffer_pinned(bh);
>
> - gfs2_log_lock(sdp);
> + gfs2_ail_lock(sdp);
> if (bd->bd_ail) {
> list_del(&bd->bd_ail_st_list);
> brelse(bh);
> @@ -91,6 +91,9 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
> }
> bd->bd_ail = ai;
> list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
> + gfs2_ail_unlock(sdp);
> +
> + gfs2_log_lock(sdp);
> clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
> trace_gfs2_pin(bd, 0);
> gfs2_log_unlock(sdp);
I don't think the gfs2_log_lock() is actually required at this point.
The LFLUSH bit is protected by the sd_log_flush_lock rwsem
and the tracing doesn't need the log lock either,
Steve.
* [Cluster-devel] [PATCH 4/4] gfs2: introduce AIL lock
2010-02-05 11:11 ` Steven Whitehouse
@ 2010-02-06 2:34 ` Dave Chinner
0 siblings, 0 replies; 12+ messages in thread
From: Dave Chinner @ 2010-02-06 2:34 UTC (permalink / raw)
To: cluster-devel.redhat.com
On Fri, Feb 05, 2010 at 11:11:48AM +0000, Steven Whitehouse wrote:
> Hi,
>
> On Fri, 2010-02-05 at 16:45 +1100, Dave Chinner wrote:
> > The log lock is currently used to protect the AIL lists and
> > the movements of buffers into and out of them. The lists
> > are self contained and no log specific items outside the
> > lists are accessed when starting or emptying the AIL lists.
> >
> > Hence the operation of the AIL does not require the protection
> > of the log lock so split them out into a new AIL specific lock
> > to reduce the amount of traffic on the log lock. This will
> > also reduce the amount of serialisation that occurs when
> > the gfs2_logd pushes on the AIL to move it forward.
> >
> > This reduces the impact of log pushing on sequential write
> > throughput. On no-op scheduler on a disk that can do 85MB/s,
> > this increases the write rate from 65MB/s with the ordering
> > fixes to 75MB/s.
> >
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
>
> This looks good, but a couple of comments:
>
> > diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
> > index 78554ac..65048f9 100644
> > --- a/fs/gfs2/glops.c
> > +++ b/fs/gfs2/glops.c
> > @@ -57,20 +57,26 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
> > BUG_ON(current->journal_info);
> > current->journal_info = &tr;
> >
> > - gfs2_log_lock(sdp);
> > + gfs2_ail_lock(sdp);
> ^^^^ this abstraction of a spinlock is left over from the old
> gfs1 code. I'd prefer when adding new locks just to use spin_lock(&....)
> directly, rather than abstracting it out like this. That way we don't
> have to think about what kind of lock it is.
Cool. I wondered about that - I was going to just ignore the
wrappers and put direct calls in, but I thought it might be better
to just start by following the existing convention. I'll respin this
without the wrappers.
> [snip]
> > @@ -91,6 +91,9 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
> > }
> > bd->bd_ail = ai;
> > list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
> > + gfs2_ail_unlock(sdp);
> > +
> > + gfs2_log_lock(sdp);
> > clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
> > trace_gfs2_pin(bd, 0);
> > gfs2_log_unlock(sdp);
> I don't think the gfs2_log_lock() is actually required at this point.
> the LFLUSH bit is protected by the sd_log_flush_lock rwsem
> and the tracing doesn't need the log lock either,
Ok, I wasn't sure how that bit was protected, so I left it with the
same protection as it had before. I'll kill the log lock from there
entirely.
Cheers,
Dave.
--
Dave Chinner
dchinner at redhat.com
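Putting the two review points together, the respun hunk would presumably end up looking something like the fragment below: a bare spin_lock() on sd_ail_lock and no log lock around the GLF_LFLUSH clear. This is an assumption about the respin based on the diff and review above, not a posted patch:

	/* Sketch of the reworked gfs2_unpin() tail (assumption, not a
	 * posted patch): take the AIL spinlock directly and drop the
	 * log lock that previously covered the GLF_LFLUSH clear, which
	 * the review above says is protected by sd_log_flush_lock. */
	spin_lock(&sdp->sd_ail_lock);
	/* ... AIL list manipulation exactly as in the hunks above ... */
	bd->bd_ail = ai;
	list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
	spin_unlock(&sdp->sd_ail_lock);

	clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
	trace_gfs2_pin(bd, 0);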