From: "J. Bruce Fields" <bfields@fieldses.org>
To: Christoph Hellwig <hch@lst.de>
Cc: trond.myklebust@primarydata.com, linux-nfs@vger.kernel.org
Subject: Re: [PATCH 2/4] nfs/blocklayout: add SCSI layout support
Date: Tue, 8 Mar 2016 17:07:37 -0500 [thread overview]
Message-ID: <20160308220737.GA27006@fieldses.org> (raw)
In-Reply-To: <1457120777-30687-3-git-send-email-hch@lst.de>
Trond, OK if I take this through the nfsd tree?
Or I'm OK with doing this however you prefer.
--b.
On Fri, Mar 04, 2016 at 08:46:15PM +0100, Christoph Hellwig wrote:
> This is a trivial extension to the block layout driver to support the
> new SCSI layouts draft. There are three changes:
>
> - device identification through the SCSI VPD page. This allows us to
> directly use the udev generated persistent device names instead of
> requiring an expensive lookup by crawling every block device node
> in /dev and reading a signature for it.
> - use of SCSI persistent reservations to protect device access and
> allow for robust fencing. On the client side this just means
> registering and unregistering a server supplied key.
> - an optimized LAYOUTCOMMIT payload that doesn't send unnecessary
> fields to the server.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/nfs/blocklayout/blocklayout.c | 59 ++++++++++++++--
> fs/nfs/blocklayout/blocklayout.h | 14 +++-
> fs/nfs/blocklayout/dev.c | 144 ++++++++++++++++++++++++++++++++++++++-
> fs/nfs/blocklayout/extent_tree.c | 44 ++++++++----
> fs/nfs/blocklayout/rpc_pipefs.c | 2 +-
> 5 files changed, 238 insertions(+), 25 deletions(-)
>
> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
> index ddd0138..b27c409 100644
> --- a/fs/nfs/blocklayout/blocklayout.c
> +++ b/fs/nfs/blocklayout/blocklayout.c
> @@ -446,8 +446,8 @@ static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
> kfree(bl);
> }
>
> -static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
> - gfp_t gfp_flags)
> +static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode,
> + gfp_t gfp_flags, bool is_scsi_layout)
> {
> struct pnfs_block_layout *bl;
>
> @@ -460,9 +460,22 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
> bl->bl_ext_ro = RB_ROOT;
> spin_lock_init(&bl->bl_ext_lock);
>
> + bl->bl_scsi_layout = is_scsi_layout;
> return &bl->bl_layout;
> }
>
> +static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
> + gfp_t gfp_flags)
> +{
> + return __bl_alloc_layout_hdr(inode, gfp_flags, false);
> +}
> +
> +static struct pnfs_layout_hdr *sl_alloc_layout_hdr(struct inode *inode,
> + gfp_t gfp_flags)
> +{
> + return __bl_alloc_layout_hdr(inode, gfp_flags, true);
> +}
> +
> static void bl_free_lseg(struct pnfs_layout_segment *lseg)
> {
> dprintk("%s enter\n", __func__);
> @@ -888,22 +901,53 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
> .sync = pnfs_generic_sync,
> };
>
> +static struct pnfs_layoutdriver_type scsilayout_type = {
> + .id = LAYOUT_SCSI,
> + .name = "LAYOUT_SCSI",
> + .owner = THIS_MODULE,
> + .flags = PNFS_LAYOUTRET_ON_SETATTR |
> + PNFS_READ_WHOLE_PAGE,
> + .read_pagelist = bl_read_pagelist,
> + .write_pagelist = bl_write_pagelist,
> + .alloc_layout_hdr = sl_alloc_layout_hdr,
> + .free_layout_hdr = bl_free_layout_hdr,
> + .alloc_lseg = bl_alloc_lseg,
> + .free_lseg = bl_free_lseg,
> + .return_range = bl_return_range,
> + .prepare_layoutcommit = bl_prepare_layoutcommit,
> + .cleanup_layoutcommit = bl_cleanup_layoutcommit,
> + .set_layoutdriver = bl_set_layoutdriver,
> + .alloc_deviceid_node = bl_alloc_deviceid_node,
> + .free_deviceid_node = bl_free_deviceid_node,
> + .pg_read_ops = &bl_pg_read_ops,
> + .pg_write_ops = &bl_pg_write_ops,
> + .sync = pnfs_generic_sync,
> +};
> +
> +
> static int __init nfs4blocklayout_init(void)
> {
> int ret;
>
> dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
>
> - ret = pnfs_register_layoutdriver(&blocklayout_type);
> + ret = bl_init_pipefs();
> if (ret)
> goto out;
> - ret = bl_init_pipefs();
> +
> + ret = pnfs_register_layoutdriver(&blocklayout_type);
> if (ret)
> - goto out_unregister;
> + goto out_cleanup_pipe;
> +
> + ret = pnfs_register_layoutdriver(&scsilayout_type);
> + if (ret)
> + goto out_unregister_block;
> return 0;
>
> -out_unregister:
> +out_unregister_block:
> pnfs_unregister_layoutdriver(&blocklayout_type);
> +out_cleanup_pipe:
> + bl_cleanup_pipefs();
> out:
> return ret;
> }
> @@ -913,8 +957,9 @@ static void __exit nfs4blocklayout_exit(void)
> dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
> __func__);
>
> - bl_cleanup_pipefs();
> + pnfs_unregister_layoutdriver(&scsilayout_type);
> pnfs_unregister_layoutdriver(&blocklayout_type);
> + bl_cleanup_pipefs();
> }
>
> MODULE_ALIAS("nfs-layouttype4-3");
> diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
> index c556640..bc21205 100644
> --- a/fs/nfs/blocklayout/blocklayout.h
> +++ b/fs/nfs/blocklayout/blocklayout.h
> @@ -55,7 +55,6 @@ struct pnfs_block_dev;
> */
> #define PNFS_BLOCK_UUID_LEN 128
>
> -
> struct pnfs_block_volume {
> enum pnfs_block_volume_type type;
> union {
> @@ -82,6 +81,13 @@ struct pnfs_block_volume {
> u32 volumes_count;
> u32 volumes[PNFS_BLOCK_MAX_DEVICES];
> } stripe;
> + struct {
> + enum scsi_code_set code_set;
> + enum scsi_designator_type designator_type;
> + int designator_len;
> + u8 designator[256];
> + u64 pr_key;
> + } scsi;
> };
> };
>
> @@ -106,6 +112,9 @@ struct pnfs_block_dev {
> struct block_device *bdev;
> u64 disk_offset;
>
> + u64 pr_key;
> + bool pr_registered;
> +
> bool (*map)(struct pnfs_block_dev *dev, u64 offset,
> struct pnfs_block_dev_map *map);
> };
> @@ -131,6 +140,7 @@ struct pnfs_block_layout {
> struct rb_root bl_ext_rw;
> struct rb_root bl_ext_ro;
> spinlock_t bl_ext_lock; /* Protects list manipulation */
> + bool bl_scsi_layout;
> };
>
> static inline struct pnfs_block_layout *
> @@ -182,6 +192,6 @@ void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
> dev_t bl_resolve_deviceid(struct nfs_server *server,
> struct pnfs_block_volume *b, gfp_t gfp_mask);
> int __init bl_init_pipefs(void);
> -void __exit bl_cleanup_pipefs(void);
> +void bl_cleanup_pipefs(void);
>
> #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
> diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
> index a861bbd..31b0f6b 100644
> --- a/fs/nfs/blocklayout/dev.c
> +++ b/fs/nfs/blocklayout/dev.c
> @@ -1,11 +1,12 @@
> /*
> - * Copyright (c) 2014 Christoph Hellwig.
> + * Copyright (c) 2014-2016 Christoph Hellwig.
> */
> #include <linux/sunrpc/svc.h>
> #include <linux/blkdev.h>
> #include <linux/nfs4.h>
> #include <linux/nfs_fs.h>
> #include <linux/nfs_xdr.h>
> +#include <linux/pr.h>
>
> #include "blocklayout.h"
>
> @@ -21,6 +22,17 @@ bl_free_device(struct pnfs_block_dev *dev)
> bl_free_device(&dev->children[i]);
> kfree(dev->children);
> } else {
> + if (dev->pr_registered) {
> + const struct pr_ops *ops =
> + dev->bdev->bd_disk->fops->pr_ops;
> + int error;
> +
> + error = ops->pr_register(dev->bdev, dev->pr_key, 0,
> + false);
> + if (error)
> + pr_err("failed to unregister PR key.\n");
> + }
> +
> if (dev->bdev)
> blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
> }
> @@ -113,6 +125,24 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
> for (i = 0; i < b->stripe.volumes_count; i++)
> b->stripe.volumes[i] = be32_to_cpup(p++);
> break;
> + case PNFS_BLOCK_VOLUME_SCSI:
> + p = xdr_inline_decode(xdr, 4 + 4 + 4);
> + if (!p)
> + return -EIO;
> + b->scsi.code_set = be32_to_cpup(p++);
> + b->scsi.designator_type = be32_to_cpup(p++);
> + b->scsi.designator_len = be32_to_cpup(p++);
> + p = xdr_inline_decode(xdr, b->scsi.designator_len);
> + if (!p)
> + return -EIO;
> + if (b->scsi.designator_len > 256)
> + return -EIO;
> + memcpy(&b->scsi.designator, p, b->scsi.designator_len);
> + p = xdr_inline_decode(xdr, 8);
> + if (!p)
> + return -EIO;
> + p = xdr_decode_hyper(p, &b->scsi.pr_key);
> + break;
> default:
> dprintk("unknown volume type!\n");
> return -EIO;
> @@ -216,6 +246,116 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
> return 0;
> }
>
> +static bool
> +bl_validate_designator(struct pnfs_block_volume *v)
> +{
> + switch (v->scsi.designator_type) {
> + case PS_DESIGNATOR_EUI64:
> + if (v->scsi.code_set != PS_CODE_SET_BINARY)
> + return false;
> +
> + if (v->scsi.designator_len != 8 &&
> + v->scsi.designator_len != 10 &&
> + v->scsi.designator_len != 16)
> + return false;
> +
> + return true;
> + case PS_DESIGNATOR_NAA:
> + if (v->scsi.code_set != PS_CODE_SET_BINARY)
> + return false;
> +
> + if (v->scsi.designator_len != 8 &&
> + v->scsi.designator_len != 16)
> + return false;
> +
> + return true;
> + case PS_DESIGNATOR_T10:
> + case PS_DESIGNATOR_NAME:
> + pr_err("pNFS: unsupported designator "
> + "(code set %d, type %d, len %d.\n",
> + v->scsi.code_set,
> + v->scsi.designator_type,
> + v->scsi.designator_len);
> + return false;
> + default:
> + pr_err("pNFS: invalid designator "
> + "(code set %d, type %d, len %d.\n",
> + v->scsi.code_set,
> + v->scsi.designator_type,
> + v->scsi.designator_len);
> + return false;
> + }
> +}
> +
> +static int
> +bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
> + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
> +{
> + struct pnfs_block_volume *v = &volumes[idx];
> + const struct pr_ops *ops;
> + const char *devname;
> + int error;
> +
> + if (!bl_validate_designator(v))
> + return -EINVAL;
> +
> + switch (v->scsi.designator_len) {
> + case 8:
> + devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN",
> + v->scsi.designator);
> + break;
> + case 12:
> + devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN",
> + v->scsi.designator);
> + break;
> + case 16:
> + devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN",
> + v->scsi.designator);
> + break;
> + default:
> + return -EINVAL;
> + }
> +
> + d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL);
> + if (IS_ERR(d->bdev)) {
> + pr_warn("pNFS: failed to open device %s (%ld)\n",
> + devname, PTR_ERR(d->bdev));
> + kfree(devname);
> + return PTR_ERR(d->bdev);
> + }
> +
> + kfree(devname);
> +
> + d->len = i_size_read(d->bdev->bd_inode);
> + d->map = bl_map_simple;
> + d->pr_key = v->scsi.pr_key;
> +
> + pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
> + d->bdev->bd_disk->disk_name, d->pr_key);
> +
> + ops = d->bdev->bd_disk->fops->pr_ops;
> + if (!ops) {
> + pr_err("pNFS: block device %s does not support reservations.",
> + d->bdev->bd_disk->disk_name);
> + error = -EINVAL;
> + goto out_blkdev_put;
> + }
> +
> + error = ops->pr_register(d->bdev, 0, d->pr_key, true);
> + if (error) {
> + pr_err("pNFS: failed to register key for block device %s.",
> + d->bdev->bd_disk->disk_name);
> + goto out_blkdev_put;
> + }
> +
> + d->pr_registered = true;
> + return 0;
> +
> +out_blkdev_put:
> + blkdev_put(d->bdev, FMODE_READ);
> + return error;
> +}
> +
> static int
> bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
> struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
> @@ -303,6 +443,8 @@ bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
> return bl_parse_concat(server, d, volumes, idx, gfp_mask);
> case PNFS_BLOCK_VOLUME_STRIPE:
> return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
> + case PNFS_BLOCK_VOLUME_SCSI:
> + return bl_parse_scsi(server, d, volumes, idx, gfp_mask);
> default:
> dprintk("unsupported volume type: %d\n", volumes[idx].type);
> return -EIO;
> diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
> index c59a59c..df366fc 100644
> --- a/fs/nfs/blocklayout/extent_tree.c
> +++ b/fs/nfs/blocklayout/extent_tree.c
> @@ -1,5 +1,5 @@
> /*
> - * Copyright (c) 2014 Christoph Hellwig.
> + * Copyright (c) 2014-2016 Christoph Hellwig.
> */
>
> #include <linux/vmalloc.h>
> @@ -462,10 +462,12 @@ out:
> return err;
> }
>
> -static size_t ext_tree_layoutupdate_size(size_t count)
> +static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
> {
> - return sizeof(__be32) /* number of entries */ +
> - PNFS_BLOCK_EXTENT_SIZE * count;
> + if (bl->bl_scsi_layout)
> + return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
> + else
> + return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
> }
>
> static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
> @@ -482,6 +484,23 @@ static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
> }
> }
>
> +static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
> +{
> + p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
> + NFS4_DEVICEID4_SIZE);
> + p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
> + p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
> + p = xdr_encode_hyper(p, 0LL);
> + *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
> + return p;
> +}
> +
> +static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
> +{
> + p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
> + return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
> +}
> +
> static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
> size_t buffer_size, size_t *count)
> {
> @@ -495,19 +514,16 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
> continue;
>
> (*count)++;
> - if (ext_tree_layoutupdate_size(*count) > buffer_size) {
> + if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
> /* keep counting.. */
> ret = -ENOSPC;
> continue;
> }
>
> - p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
> - NFS4_DEVICEID4_SIZE);
> - p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
> - p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
> - p = xdr_encode_hyper(p, 0LL);
> - *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
> -
> + if (bl->bl_scsi_layout)
> + p = encode_scsi_range(be, p);
> + else
> + p = encode_block_extent(be, p);
> be->be_tag = EXTENT_COMMITTING;
> }
> spin_unlock(&bl->bl_ext_lock);
> @@ -536,7 +552,7 @@ retry:
> if (unlikely(ret)) {
> ext_tree_free_commitdata(arg, buffer_size);
>
> - buffer_size = ext_tree_layoutupdate_size(count);
> + buffer_size = ext_tree_layoutupdate_size(bl, count);
> count = 0;
>
> arg->layoutupdate_pages =
> @@ -555,7 +571,7 @@ retry:
> }
>
> *start_p = cpu_to_be32(count);
> - arg->layoutupdate_len = ext_tree_layoutupdate_size(count);
> + arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);
>
> if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
> void *p = start_p, *end = p + arg->layoutupdate_len;
> diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
> index dbe5839..9fb067a6 100644
> --- a/fs/nfs/blocklayout/rpc_pipefs.c
> +++ b/fs/nfs/blocklayout/rpc_pipefs.c
> @@ -281,7 +281,7 @@ out:
> return ret;
> }
>
> -void __exit bl_cleanup_pipefs(void)
> +void bl_cleanup_pipefs(void)
> {
> rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
> unregister_pernet_subsys(&nfs4blocklayout_net_ops);
> --
> 2.1.4
next prev parent reply other threads:[~2016-03-08 22:07 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-03-04 19:46 pNFS SCSI layout support V2 Christoph Hellwig
2016-03-04 19:46 ` [PATCH 1/4] nfs4.h: add SCSI layout defintions Christoph Hellwig
2016-03-04 19:46 ` [PATCH 2/4] nfs/blocklayout: add SCSI layout support Christoph Hellwig
2016-03-08 22:07 ` J. Bruce Fields [this message]
2016-03-08 22:42 ` Trond Myklebust
2016-03-17 21:01 ` J. Bruce Fields
2016-03-18 14:18 ` Trond Myklebust
2016-03-08 22:09 ` J. Bruce Fields
2016-03-04 19:46 ` [PATCH 3/4] nfsd: add a new config option for the block layout driver Christoph Hellwig
2016-03-04 19:46 ` [PATCH 4/4] nfsd: add SCSI layout support Christoph Hellwig
2016-03-08 22:15 ` J. Bruce Fields
2016-03-09 14:48 ` Christoph Hellwig
2016-03-10 22:26 ` J. Bruce Fields
2016-03-11 9:24 ` Christoph Hellwig
2016-03-11 22:52 ` J. Bruce Fields
2016-03-17 21:52 ` J. Bruce Fields
2016-03-22 18:56 ` J. Bruce Fields
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20160308220737.GA27006@fieldses.org \
--to=bfields@fieldses.org \
--cc=hch@lst.de \
--cc=linux-nfs@vger.kernel.org \
--cc=trond.myklebust@primarydata.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).