From: Benny Halevy <bhalevy@panasas.com>
To: Fred Isaman <iisaman@netapp.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>,
Boaz Harrosh <bharrosh@panasas.com>,
linux-nfs@vger.kernel.org
Subject: Re: [PATCH v2 03/29] pnfs: Use byte-range for layoutget
Date: Thu, 12 May 2011 19:46:36 -0400 [thread overview]
Message-ID: <4DCC715C.1090101@panasas.com> (raw)
In-Reply-To: <BANLkTimaVzS94FtZrY_EpXB5Od6QGjeLhg@mail.gmail.com>
On 2011-05-12 11:18, Fred Isaman wrote:
> On Mon, May 9, 2011 at 1:06 PM, Benny Halevy <bhalevy@panasas.com> wrote:
>> Add offset and count parameters to pnfs_update_layout and use them to get
>> the layout in the pageio path.
>>
>> Test byte range against the layout segment in use in pnfs_{read,write}_pg_test
>> so not to coalesce pages not using the same layout segment.
>>
>> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
>> ---
>> fs/nfs/pnfs.c | 149 +++++++++++++++++++++++++++++++++++++++++++++-----------
>> fs/nfs/pnfs.h | 4 +-
>> fs/nfs/read.c | 10 +++-
>> fs/nfs/write.c | 8 ++-
>> 4 files changed, 136 insertions(+), 35 deletions(-)
>>
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index d9ab972..e689bdf 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -261,6 +261,65 @@ put_lseg(struct pnfs_layout_segment *lseg)
>> }
>> EXPORT_SYMBOL_GPL(put_lseg);
>>
>> +static inline u64
>> +end_offset(u64 start, u64 len)
>> +{
>> + u64 end;
>> +
>> + end = start + len;
>> + return end >= start ? end : NFS4_MAX_UINT64;
>> +}
>> +
>> +/* last octet in a range */
>> +static inline u64
>> +last_byte_offset(u64 start, u64 len)
>> +{
>> + u64 end;
>> +
>> + BUG_ON(!len);
>> + end = start + len;
>> + return end > start ? end - 1 : NFS4_MAX_UINT64;
>> +}
>> +
>> +/*
>> + * is l2 fully contained in l1?
>> + * start1 end1
>> + * [----------------------------------)
>> + * start2 end2
>> + * [----------------)
>> + */
>> +static inline int
>> +lo_seg_contained(struct pnfs_layout_range *l1,
>> + struct pnfs_layout_range *l2)
>> +{
>> + u64 start1 = l1->offset;
>> + u64 end1 = end_offset(start1, l1->length);
>> + u64 start2 = l2->offset;
>> + u64 end2 = end_offset(start2, l2->length);
>> +
>> + return (start1 <= start2) && (end1 >= end2);
>> +}
>> +
>> +/*
>> + * is l1 and l2 intersecting?
>> + * start1 end1
>> + * [----------------------------------)
>> + * start2 end2
>> + * [----------------)
>> + */
>> +static inline int
>> +lo_seg_intersecting(struct pnfs_layout_range *l1,
>> + struct pnfs_layout_range *l2)
>> +{
>> + u64 start1 = l1->offset;
>> + u64 end1 = end_offset(start1, l1->length);
>> + u64 start2 = l2->offset;
>> + u64 end2 = end_offset(start2, l2->length);
>> +
>> + return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
>> + (end2 == NFS4_MAX_UINT64 || end2 > start1);
>> +}
>> +
>> static bool
>> should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
>> {
>> @@ -466,7 +525,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
>> static struct pnfs_layout_segment *
>> send_layoutget(struct pnfs_layout_hdr *lo,
>> struct nfs_open_context *ctx,
>> - u32 iomode)
>> + struct pnfs_layout_range *range)
>> {
>> struct inode *ino = lo->plh_inode;
>> struct nfs_server *server = NFS_SERVER(ino);
>> @@ -497,11 +556,11 @@ send_layoutget(struct pnfs_layout_hdr *lo,
>> goto out_err_free;
>> }
>>
>> - lgp->args.minlength = NFS4_MAX_UINT64;
>> + lgp->args.minlength = PAGE_CACHE_SIZE;
>> + if (lgp->args.minlength > range->length)
>> + lgp->args.minlength = range->length;
>> lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
>> - lgp->args.range.iomode = iomode;
>> - lgp->args.range.offset = 0;
>> - lgp->args.range.length = NFS4_MAX_UINT64;
>> + lgp->args.range = *range;
>
> Do you want to align offet to page boundary?
>
>
Yeah, that makes sense.
>> lgp->args.type = server->pnfs_curr_ld->id;
>> lgp->args.inode = ino;
>> lgp->args.ctx = get_nfs_open_context(ctx);
>> @@ -515,7 +574,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
>> nfs4_proc_layoutget(lgp);
>> if (!lseg) {
>> /* remember that LAYOUTGET failed and suspend trying */
>> - set_bit(lo_fail_bit(iomode), &lo->plh_flags);
>> + set_bit(lo_fail_bit(range->iomode), &lo->plh_flags);
>> }
>>
>> /* free xdr pages */
>> @@ -622,10 +681,24 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
>> * are seen first.
>> */
>> static s64
>> -cmp_layout(u32 iomode1, u32 iomode2)
>> +cmp_layout(struct pnfs_layout_range *l1,
>> + struct pnfs_layout_range *l2)
>> {
>> + s64 d;
>> +
>> + /* higher offset > lower offset */
>> + d = l1->offset - l2->offset;
>> + if (d)
>> + return d;
>> +
>> + /* longer length > shorter length */
>> + d = l1->length - l2->length;
>> + if (d)
>> + return d;
>> +
>
> Assuming iomodes the same, don't we prefer seeing the longer to the shorter?
>
yup. good catch!
> Wouldn't we prefer a short rw to a long ro?
>
hmm, it might matter in the blocks layout world if there's
un-layoutcommit-ted data accessible only through the RW layout
but that means we dropped the data out of our cache without
completing layoutcommit. not good.
Otherwise, using the longer RO layout rather the shorter RW
is helpful to coalesce longer read requests using the same
layout segment, so I'd still do it.
>> /* read > read/write */
>> - return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ);
>> + return (int)(l2->iomode == IOMODE_READ) -
>> + (int)(l1->iomode == IOMODE_READ);
>> }
>>
>> static void
>> @@ -639,7 +712,7 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
>>
>> assert_spin_locked(&lo->plh_inode->i_lock);
>> list_for_each_entry(lp, &lo->plh_segs, pls_list) {
>> - if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
>> + if (cmp_layout(&lp->pls_range, &lseg->pls_range) > 0)
>> continue;
>> list_add_tail(&lseg->pls_list, &lp->pls_list);
>> dprintk("%s: inserted lseg %p "
>> @@ -718,16 +791,28 @@ pnfs_find_alloc_layout(struct inode *ino)
>> * READ RW true
>> */
>> static int
>> -is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
>> +is_matching_lseg(struct pnfs_layout_segment *lseg,
>> + struct pnfs_layout_range *range)
>
> arguments to this should probably both be of struct pnfs_layout_range
>
ok, this seems to be cleaner indeed.
>
>> {
>> - return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW);
>> + struct pnfs_layout_range range1;
>> +
>> + if ((range->iomode == IOMODE_RW &&
>> + lseg->pls_range.iomode != IOMODE_RW) ||
>> + !lo_seg_intersecting(&lseg->pls_range, range))
>> + return 0;
>> +
>> + /* range1 covers only the first byte in the range */
>> + range1 = *range;
>> + range1.length = 1;
>> + return lo_seg_contained(&lseg->pls_range, &range1);
>> }
>>
>> /*
>> * lookup range in layout
>> */
>> static struct pnfs_layout_segment *
>> -pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
>> +pnfs_find_lseg(struct pnfs_layout_hdr *lo,
>> + struct pnfs_layout_range *range)
>> {
>> struct pnfs_layout_segment *lseg, *ret = NULL;
>>
>> @@ -736,11 +821,11 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
>> assert_spin_locked(&lo->plh_inode->i_lock);
>> list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
>> if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
>> - is_matching_lseg(lseg, iomode)) {
>> + is_matching_lseg(lseg, range)) {
>> ret = get_lseg(lseg);
>> break;
>> }
>> - if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
>> + if (cmp_layout(range, &lseg->pls_range) > 0)
>> break;
>> }
>>
>> @@ -756,8 +841,15 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
>> struct pnfs_layout_segment *
>> pnfs_update_layout(struct inode *ino,
>> struct nfs_open_context *ctx,
>> + loff_t pos,
>> + u64 count,
>> enum pnfs_iomode iomode)
>> {
>> + struct pnfs_layout_range arg = {
>> + .iomode = iomode,
>> + .offset = pos,
>> + .length = count,
>> + };
>> struct nfs_inode *nfsi = NFS_I(ino);
>> struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
>> struct pnfs_layout_hdr *lo;
>> @@ -785,7 +877,7 @@ pnfs_update_layout(struct inode *ino,
>> goto out_unlock;
>>
>> /* Check to see if the layout for the given range already exists */
>> - lseg = pnfs_find_lseg(lo, iomode);
>> + lseg = pnfs_find_lseg(lo, &arg);
>> if (lseg)
>> goto out_unlock;
>>
>> @@ -807,7 +899,7 @@ pnfs_update_layout(struct inode *ino,
>> spin_unlock(&clp->cl_lock);
>> }
>>
>> - lseg = send_layoutget(lo, ctx, iomode);
>> + lseg = send_layoutget(lo, ctx, &arg);
>> if (!lseg && first) {
>> spin_lock(&clp->cl_lock);
>> list_del_init(&lo->plh_layouts);
>> @@ -834,17 +926,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
>> struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
>> int status = 0;
>>
>> - /* Verify we got what we asked for.
>> - * Note that because the xdr parsing only accepts a single
>> - * element array, this can fail even if the server is behaving
>> - * correctly.
>> - */
>> - if (lgp->args.range.iomode > res->range.iomode ||
>> - res->range.offset != 0 ||
>> - res->range.length != NFS4_MAX_UINT64) {
>> - status = -EINVAL;
>> - goto out;
>> - }
>> /* Inject layout blob into I/O device driver */
>> lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
>> if (!lseg || IS_ERR(lseg)) {
>> @@ -899,8 +980,13 @@ static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio,
>> /* This is first coelesce call for a series of nfs_pages */
>> pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
>> prev->wb_context,
>> + req_offset(req),
>> + pgio->pg_count,
>> IOMODE_READ);
>> - }
>> + } else if (pgio->pg_lseg &&
>> + req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset,
>> + pgio->pg_lseg->pls_range.length))
>> + return 0;
>> return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
>> }
>>
>> @@ -921,8 +1007,13 @@ static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
>> /* This is first coelesce call for a series of nfs_pages */
>> pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
>> prev->wb_context,
>> + req_offset(req),
>> + pgio->pg_count,
>> IOMODE_RW);
>> - }
>> + } else if (pgio->pg_lseg &&
>> + req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset,
>> + pgio->pg_lseg->pls_range.length))
>> + return 0;
>> return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
>> }
>>
>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
>> index 4cb0a0d..14a2af9 100644
>> --- a/fs/nfs/pnfs.h
>> +++ b/fs/nfs/pnfs.h
>> @@ -129,7 +129,7 @@ void get_layout_hdr(struct pnfs_layout_hdr *lo);
>> void put_lseg(struct pnfs_layout_segment *lseg);
>> struct pnfs_layout_segment *
>> pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
>> - enum pnfs_iomode access_type);
>> + loff_t pos, u64 count, enum pnfs_iomode access_type);
>> void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
>> void unset_pnfs_layoutdriver(struct nfs_server *);
>> enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
>> @@ -248,7 +248,7 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg)
>>
>> static inline struct pnfs_layout_segment *
>> pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
>> - enum pnfs_iomode access_type)
>> + loff_t pos, u64 count, enum pnfs_iomode access_type)
>> {
>> return NULL;
>> }
>> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
>> index 7cded2b..10eff1c 100644
>> --- a/fs/nfs/read.c
>> +++ b/fs/nfs/read.c
>> @@ -288,13 +288,17 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
>> atomic_set(&req->wb_complete, requests);
>>
>> BUG_ON(desc->pg_lseg != NULL);
>> - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
>> + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
>> + req_offset(req), desc->pg_count,
>> + IOMODE_READ);
>> ClearPageError(page);
>> offset = 0;
>> nbytes = desc->pg_count;
>> do {
>> int ret2;
>>
>> + /* FIXME: need a new layout segment? */
>> +
>
> No need for FIXME, since we assume strongly throughout that the
> layouts are page aligned.
OK. I'll remove it.
Thanks!
Benny
>
> Fred
>
>> data = list_entry(list.next, struct nfs_read_data, pages);
>> list_del_init(&data->pages);
>>
>> @@ -351,7 +355,9 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
>> }
>> req = nfs_list_entry(data->pages.next);
>> if ((!lseg) && list_is_singular(&data->pages))
>> - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
>> + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
>> + req_offset(req), desc->pg_count,
>> + IOMODE_READ);
>>
>> ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count,
>> 0, lseg);
>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>> index e4cbc11..318e0a3 100644
>> --- a/fs/nfs/write.c
>> +++ b/fs/nfs/write.c
>> @@ -940,7 +940,9 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
>> atomic_set(&req->wb_complete, requests);
>>
>> BUG_ON(desc->pg_lseg);
>> - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
>> + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
>> + req_offset(req), desc->pg_count,
>> + IOMODE_RW);
>> ClearPageError(page);
>> offset = 0;
>> nbytes = desc->pg_count;
>> @@ -1014,7 +1016,9 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
>> }
>> req = nfs_list_entry(data->pages.next);
>> if ((!lseg) && list_is_singular(&data->pages))
>> - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
>> + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
>> + req_offset(req), desc->pg_count,
>> + IOMODE_RW);
>>
>> if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
>> (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
>> --
>> 1.7.3.4
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
next prev parent reply other threads:[~2011-05-12 23:46 UTC|newest]
Thread overview: 43+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-05-09 17:04 [PATCH v2 0/29] pnfs for 2.6.40 Benny Halevy
2011-05-09 17:06 ` [PATCH v2 01/29] pnfs: CB_NOTIFY_DEVICEID Benny Halevy
2011-05-12 14:35 ` Fred Isaman
2011-05-13 0:00 ` Benny Halevy
2011-05-09 17:06 ` [PATCH v2 02/29] pnfs: direct i/o Benny Halevy
2011-05-12 14:41 ` Fred Isaman
2011-05-12 23:54 ` Benny Halevy
2011-05-09 17:06 ` [PATCH v2 03/29] pnfs: Use byte-range for layoutget Benny Halevy
2011-05-12 15:18 ` Fred Isaman
2011-05-12 23:46 ` Benny Halevy [this message]
2011-05-16 13:59 ` [PATCH 1/4] pnfs: align layoutget requests on page boundaries Benny Halevy
2011-05-16 13:59 ` [PATCH 2/4] SQUASHME: pnfs: fix lseg ordering Benny Halevy
2011-05-16 13:59 ` [PATCH 3/4] SQUASHME: pnfs: clean up pnfs_find_lseg lseg arg Benny Halevy
2011-05-16 13:59 ` [PATCH 4/4] SQUASHME: remove unnecessary FIXME Benny Halevy
2011-05-09 17:07 ` [PATCH v2 04/29] pnfs: Use byte-range for cb_layoutrecall Benny Halevy
2011-05-09 17:07 ` [PATCH v2 05/29] pnfs: client stats Benny Halevy
2011-05-09 17:07 ` [PATCH v2 06/29] pnfs: resolve header dependency in pnfs.h Benny Halevy
2011-05-09 17:07 ` [PATCH v2 07/29] pnfs-obj: objlayoutdriver module skeleton Benny Halevy
2011-05-09 17:07 ` [PATCH v2 08/29] NFSD: introduce exp_xdr.h Benny Halevy
2011-05-09 17:08 ` [PATCH v2 09/29] pnfs-obj: pnfs_osd XDR definitions Benny Halevy
2011-05-09 17:08 ` [PATCH v2 10/29] exofs: pnfs-tree: Remove pnfs-osd private definitions Benny Halevy
2011-05-09 17:08 ` [PATCH v2 11/29] pnfs-obj: pnfs_osd XDR client implementation Benny Halevy
2011-05-09 17:08 ` [PATCH v2 12/29] pnfs-obj: decode layout, alloc/free lseg Benny Halevy
2011-05-09 17:08 ` [PATCH v2 13/29] pnfs: per mount layout driver private data Benny Halevy
2011-05-09 17:08 ` [PATCH v2 14/29] pnfs-obj: objio_osd device information retrieval and caching Benny Halevy
2011-05-09 17:09 ` [PATCH v2 15/29] pnfs: set/unset layoutdriver Benny Halevy
2011-05-09 17:09 ` [PATCH v2 16/29] pnfs-obj: objlayout set/unset layout driver methods Benny Halevy
2011-05-09 17:09 ` [PATCH v2 17/29] pnfs: alloc and free layout_hdr layoutdriver methods Benny Halevy
2011-05-09 17:09 ` [PATCH v2 18/29] pnfs: support for non-rpc layout drivers Benny Halevy
2011-05-12 16:07 ` Fred Isaman
2011-05-12 23:48 ` Benny Halevy
2011-05-16 14:29 ` [PATCH] SQUASHME: revert useless change in nfs4_write_done_cb Benny Halevy
2011-05-09 17:09 ` [PATCH v2 19/29] pnfs-obj: read/write implementation Benny Halevy
2011-05-09 17:10 ` [PATCH v2 20/29] pnfs: layoutreturn Benny Halevy
2011-05-09 17:10 ` [PATCH v2 21/29] pnfs: layoutret_on_setattr Benny Halevy
2011-05-09 17:10 ` [PATCH v2 22/29] pnfs: encode_layoutreturn Benny Halevy
2011-05-09 17:10 ` [PATCH v2 23/29] sunrpc: xdr_rewind_stream() Benny Halevy
2011-05-09 17:10 ` [PATCH v2 24/29] pnfs-obj: objlayout_encode_layoutreturn Implementation Benny Halevy
2011-05-09 17:11 ` [PATCH v2 25/29] pnfs-obj: objio_osd report osd_errors for layoutreturn Benny Halevy
2011-05-09 17:11 ` [PATCH v2 26/29] pnfs: encode_layoutcommit Benny Halevy
2011-05-09 17:11 ` [PATCH v2 27/29] pnfs-obj: objlayout_encode_layoutcommit implementation Benny Halevy
2011-05-09 17:11 ` [PATCH v2 28/29] pnfs-obj: objio_osd: RAID0 support Benny Halevy
2011-05-09 17:11 ` [PATCH v2 29/29] pnfs-obj: objio_osd: groups support Benny Halevy
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4DCC715C.1090101@panasas.com \
--to=bhalevy@panasas.com \
--cc=Trond.Myklebust@netapp.com \
--cc=bharrosh@panasas.com \
--cc=iisaman@netapp.com \
--cc=linux-nfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).