All of lore.kernel.org
 help / color / mirror / Atom feed
From: Mike Snitzer <snitzer@kernel.org>
To: Christoph Hellwig <hch@infradead.org>
Cc: NeilBrown <neil@brown.name>, Chuck Lever <cel@kernel.org>,
	Jeff Layton <jlayton@kernel.org>,
	Olga Kornievskaia <okorniev@redhat.com>,
	Dai Ngo <dai.ngo@oracle.com>, Tom Talpey <tom@talpey.com>,
	linux-nfs@vger.kernel.org, Chuck Lever <chuck.lever@oracle.com>
Subject: Re: [PATCH v10 4/5] NFSD: Implement NFSD_IO_DIRECT for NFS WRITE
Date: Thu, 6 Nov 2025 11:48:53 -0500	[thread overview]
Message-ID: <aQzRdTcyc2nhTWqj@kernel.org> (raw)
In-Reply-To: <aQyn-_GSL_z3a9to@infradead.org>

On Thu, Nov 06, 2025 at 05:51:55AM -0800, Christoph Hellwig wrote:
> On Thu, Nov 06, 2025 at 05:15:45AM -0800, Christoph Hellwig wrote:
> > On Thu, Nov 06, 2025 at 09:11:51PM +1100, NeilBrown wrote:
> > > > +struct nfsd_write_dio_seg {
> > > > +	struct iov_iter			iter;
> > > > +	bool				use_dio;
> > > 
> > > This is only used to choose which flags to use.
> > > I think it would be neater the have 'flags' here explicitly.
> > 
> > Actually, looking at the grand unified patch now (thanks, this is so
> > much easier to review!), we can just do away with the struct entirely.
> > Just have nfsd_write_dio_iters_init return if direct I/O is possible
> > or not, and do a single vfs_iocb_iter_write on the origin kiocb/iter
> > if not.
> 
> That didn't work out too well, and indeed having flags here seems
> saner.
> 
> Chuck, below is an untested incremental patch I did while reviewing
> it.  Besides this flags thing, it adds the actual NFSD_IO_DIRECT
> definition that was missing, but otherwise mostly just folds things
> so that we don't need the extra args structure and thus simplifies
> things quite a bit.
> 
> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> index ea87b42894dd..bdb60ee1f1a4 100644
> --- a/fs/nfsd/nfsd.h
> +++ b/fs/nfsd/nfsd.h
> @@ -157,6 +157,7 @@ enum {
>  	/* Any new NFSD_IO enum value must be added at the end */
>  	NFSD_IO_BUFFERED,
>  	NFSD_IO_DONTCACHE,
> +	NFSD_IO_DIRECT,
>  };
>  
>  extern u64 nfsd_io_cache_read __read_mostly;
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index bb94da333d50..8038d8d038f3 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -1170,56 +1170,38 @@ static int wait_for_concurrent_writes(struct file *file)
>  
>  struct nfsd_write_dio_seg {
>  	struct iov_iter			iter;
> -	bool				use_dio;
> +	int				flags;
>  };
>  
> -struct nfsd_write_dio_args {
> -	struct nfsd_file		*nf;
> -	int				flags_buffered;
> -	int				flags_direct;
> -	unsigned int			nsegs;
> -	struct nfsd_write_dio_seg	segment[3];
> -};
> -
> -/*
> - * Check if the bvec iterator is aligned for direct I/O.
> - *
> - * bvecs generated from RPC receive buffers are contiguous: After the first
> - * bvec, all subsequent bvecs start at bv_offset zero (page-aligned).
> - * Therefore, only the first bvec is checked.
> - */
> -static bool
> -nfsd_iov_iter_aligned_bvec(const struct nfsd_file *nf, const struct iov_iter *i)
> +static unsigned long iov_iter_bvec_offset(const struct iov_iter *iter)
>  {
> -	unsigned int addr_mask = nf->nf_dio_mem_align - 1;
> -	const struct bio_vec *bvec = i->bvec;
> -
> -	return ((unsigned long)(bvec->bv_offset + i->iov_offset) & addr_mask) == 0;
> +	return (unsigned long)(iter->bvec->bv_offset + iter->iov_offset);
>  }
>  

This ^ being factored out and documented like was before is better
than the result of this patch, which then spreads the associated
documentation into the caller.

>  static void
>  nfsd_write_dio_seg_init(struct nfsd_write_dio_seg *segment,
>  			struct bio_vec *bvec, unsigned int nvecs,
> -			unsigned long total, size_t start, size_t len)
> +			unsigned long total, size_t start, size_t len,
> +			struct kiocb *iocb)
>  {
>  	iov_iter_bvec(&segment->iter, ITER_SOURCE, bvec, nvecs, total);
>  	if (start)
>  		iov_iter_advance(&segment->iter, start);
>  	iov_iter_truncate(&segment->iter, len);
> -	segment->use_dio = false;
> +	segment->flags = iocb->ki_flags;
>  }
>  
> -static void
> -nfsd_write_dio_iters_init(struct bio_vec *bvec, unsigned int nvecs,
> -			  loff_t offset, unsigned long total,
> -			  struct nfsd_write_dio_args *args)
> +static unsigned int
> +nfsd_write_dio_iters_init(struct nfsd_file *nf, struct bio_vec *bvec,
> +		unsigned int nvecs, struct kiocb *iocb, unsigned long total,
> +		struct nfsd_write_dio_seg segments[3])
>  {
> -	u32 offset_align = args->nf->nf_dio_offset_align;
> -	u32 mem_align = args->nf->nf_dio_mem_align;
> +	u32 offset_align = nf->nf_dio_offset_align;
> +	u32 mem_align = nf->nf_dio_mem_align;
> +	loff_t offset = iocb->ki_pos;
>  	loff_t prefix_end, orig_end, middle_end;
>  	size_t prefix, middle, suffix;
> -
> -	args->nsegs = 0;
> +	unsigned int nsegs = 0;
>  
>  	/*
>  	 * Check if direct I/O is feasible for this write request.
> @@ -1242,87 +1224,80 @@ nfsd_write_dio_iters_init(struct bio_vec *bvec, unsigned int nvecs,
>  	if (!middle)
>  		goto no_dio;
>  
> -	if (prefix) {
> -		nfsd_write_dio_seg_init(&args->segment[args->nsegs], bvec,
> -					nvecs, total, 0, prefix);
> -		++args->nsegs;
> -	}
> +	if (prefix)
> +		nfsd_write_dio_seg_init(&segments[nsegs++], bvec,
> +					nvecs, total, 0, prefix, iocb);
>  
> -	nfsd_write_dio_seg_init(&args->segment[args->nsegs], bvec, nvecs,
> -				total, prefix, middle);
> -	if (!nfsd_iov_iter_aligned_bvec(args->nf,
> -					&args->segment[args->nsegs].iter))
> +	nfsd_write_dio_seg_init(&segments[nsegs], bvec, nvecs,
> +				total, prefix, middle, iocb);
> +
> +	/*
> +	 * Check if the bvec iterator is aligned for direct I/O.
> +	 *
> +	 * bvecs generated from RPC receive buffers are contiguous: After the
> +	 * first bvec, all subsequent bvecs start at bv_offset zero
> +	 * (page-aligned).
> +	 * Therefore, only the first bvec is checked.
> +	 */
> +	if (iov_iter_bvec_offset(&segments[nsegs].iter) & (mem_align - 1))
>  		goto no_dio;
> -	args->segment[args->nsegs].use_dio = true;
> -	++args->nsegs;
> +	segments[nsegs].flags |= IOCB_DIRECT;
> +	nsegs++;
>  
> -	if (suffix) {
> -		nfsd_write_dio_seg_init(&args->segment[args->nsegs], bvec,
> -					nvecs, total, prefix + middle, suffix);
> -		++args->nsegs;
> -	}
> +	if (suffix)
> +		nfsd_write_dio_seg_init(&segments[nsegs++], bvec,
> +					nvecs, total, prefix + middle, suffix,
> +					iocb);
>  
> -	return;
> +	return nsegs;
>  
>  no_dio:
>  	/*
>  	 * No DIO alignment possible - pack into single non-DIO segment.
> -	 * IOCB_DONTCACHE preserves the intent of NFSD_IO_DIRECT.
>  	 */
> -	if (args->nf->nf_file->f_op->fop_flags & FOP_DONTCACHE)
> -		args->flags_buffered |= IOCB_DONTCACHE;
> -	nfsd_write_dio_seg_init(&args->segment[0], bvec, nvecs, total,
> -				0, total);
> -	args->nsegs = 1;
> +	nfsd_write_dio_seg_init(&segments[0], bvec, nvecs, total,
> +				0, total, iocb);
> +	return 1;
>  }

Selectively pushing the flag twiddling out to nfsd_direct_write()
ignores that we don't want to use DONTCACHE for the unaligned
prefix/suffix. Chuck and I discussed this a fair bit 1-2 days ago.

> -static int
> -nfsd_issue_dio_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> -		     struct kiocb *kiocb, unsigned int nvecs,
> -		     unsigned long *cnt, struct nfsd_write_dio_args *args)
> +static noinline_for_stack int
> +nfsd_direct_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> +		  struct nfsd_file *nf, unsigned int nvecs,
> +		  unsigned long *cnt, struct kiocb *kiocb)
>  {
> -	struct file *file = args->nf->nf_file;
> +	struct file *file = nf->nf_file;
> +	struct nfsd_write_dio_seg segments[3];
> +	unsigned int nsegs = 0, i;
>  	ssize_t host_err;
> -	unsigned int i;
>  
> -	nfsd_write_dio_iters_init(rqstp->rq_bvec, nvecs, kiocb->ki_pos,
> -				  *cnt, args);
> +	nsegs = nfsd_write_dio_iters_init(nf, rqstp->rq_bvec, nvecs,
> +			kiocb, *cnt, segments);
>  
>  	*cnt = 0;
> -	for (i = 0; i < args->nsegs; i++) {
> -		if (args->segment[i].use_dio) {
> -			kiocb->ki_flags = args->flags_direct;
> +	for (i = 0; i < nsegs; i++) {
> +		kiocb->ki_flags = segments[i].flags;
> +		if (kiocb->ki_flags & IOCB_DIRECT) {
>  			trace_nfsd_write_direct(rqstp, fhp, kiocb->ki_pos,
> -						args->segment[i].iter.count);
> -		} else
> -			kiocb->ki_flags = args->flags_buffered;
> +						segments[i].iter.count);
> +		} else if (nf->nf_file->f_op->fop_flags & FOP_DONTCACHE) {
> +			/*
> +			 * IOCB_DONTCACHE preserves the intent of
> +			 * NFSD_IO_DIRECT.
> +			 */
> +			kiocb->ki_flags |= IOCB_DONTCACHE;
> +		}

So this FOP_DONTCACHE branch needs to stay in
nfsd_write_dio_iters_init() no_dio: section.

>  
> -		host_err = vfs_iocb_iter_write(file, kiocb,
> -					       &args->segment[i].iter);
> +		host_err = vfs_iocb_iter_write(file, kiocb, &segments[i].iter);
>  		if (host_err < 0)
>  			return host_err;
>  		*cnt += host_err;
> -		if (host_err < args->segment[i].iter.count)
> +		if (host_err < segments[i].iter.count)
>  			break;	/* partial write */
>  	}
>  
>  	return 0;
>  }
>  
> -static noinline_for_stack int
> -nfsd_direct_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> -		  struct nfsd_file *nf, unsigned int nvecs,
> -		  unsigned long *cnt, struct kiocb *kiocb)
> -{
> -	struct nfsd_write_dio_args args;
> -
> -	args.flags_direct = kiocb->ki_flags | IOCB_DIRECT;
> -	args.flags_buffered = kiocb->ki_flags;
> -	args.nf = nf;
> -
> -	return nfsd_issue_dio_write(rqstp, fhp, kiocb, nvecs, cnt, &args);
> -}
> -
>  /**
>   * nfsd_vfs_write - write data to an already-open file
>   * @rqstp: RPC execution context

Combining nfsd_direct_write and nfsd_issue_dio_write is good.  And I
like the intent of removing args but this first attempt has issues
that can be resolved by keeping the flags setup in
nfsd_write_dio_iters_init().

Mike

  parent reply	other threads:[~2025-11-06 16:48 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-11-05 19:28 [PATCH v10 0/5] NFSD: Implement NFSD_IO_DIRECT for NFS WRITE Chuck Lever
2025-11-05 19:28 ` [PATCH v10 1/5] NFSD: don't start nfsd if sv_permsocks is empty Chuck Lever
2025-11-05 19:31   ` Chuck Lever
2025-11-05 19:28 ` [PATCH v10 2/5] NFSD: Make FILE_SYNC WRITEs comply with spec Chuck Lever
2025-11-06  0:55   ` NeilBrown
2025-11-06 13:05   ` Christoph Hellwig
2025-11-05 19:28 ` [PATCH v10 3/5] NFSD: Enable return of an updated stable_how to NFS clients Chuck Lever
2025-11-06 13:07   ` Christoph Hellwig
2025-11-06 16:30     ` Chuck Lever
2025-11-05 19:28 ` [PATCH v10 4/5] NFSD: Implement NFSD_IO_DIRECT for NFS WRITE Chuck Lever
2025-11-06 10:11   ` NeilBrown
2025-11-06 13:15     ` Christoph Hellwig
2025-11-06 13:51       ` Christoph Hellwig
2025-11-06 14:45         ` Chuck Lever
2025-11-06 14:49           ` Christoph Hellwig
2025-11-06 16:48         ` Mike Snitzer [this message]
2025-11-06 18:10           ` Chuck Lever
2025-11-06 19:02             ` Mike Snitzer
2025-11-07 13:24               ` Christoph Hellwig
2025-11-07 14:38                 ` Chuck Lever
2025-11-07 15:24                   ` Christoph Hellwig
2025-11-07 15:26                     ` Chuck Lever
2025-11-10 13:26   ` kernel test robot
2025-11-10 19:00   ` kernel test robot
2025-11-05 19:28 ` [PATCH v10 5/5] NFSD: add Documentation/filesystems/nfs/nfsd-io-modes.rst Chuck Lever
2025-11-06 10:24   ` NeilBrown
2025-11-06 15:46     ` Mike Snitzer
2025-11-06 15:52       ` Chuck Lever

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aQzRdTcyc2nhTWqj@kernel.org \
    --to=snitzer@kernel.org \
    --cc=cel@kernel.org \
    --cc=chuck.lever@oracle.com \
    --cc=dai.ngo@oracle.com \
    --cc=hch@infradead.org \
    --cc=jlayton@kernel.org \
    --cc=linux-nfs@vger.kernel.org \
    --cc=neil@brown.name \
    --cc=okorniev@redhat.com \
    --cc=tom@talpey.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.