From: 乱石 <zhangliguang@linux.alibaba.com>
To: linux-nfs@vger.kernel.org
Cc: trond.myklebust@hammerspace.com, anna.schumaker@netapp.com
Subject: Re: [PATCH] NFS: readdirplus optimization by cache mechanism
Date: Fri, 4 Jan 2019 13:46:56 +0800
Message-ID: <b84bdb4d-4422-6f46-a5f8-e5cf0496173e@linux.alibaba.com>
In-Reply-To: <1546580095-125450-1-git-send-email-zhangliguang@linux.alibaba.com>

Sorry for the noise; I will send RFC v1 instead.

On 2019/1/4 13:34, zhangliguang wrote:
> When listing very large directories via NFS, the client may take a long
> time to complete the listing. Three factors are involved:
>
> First of all, ls and practically every other method of listing a
> directory, including Python's os.listdir and find, relies on libc
> readdir(). However, readdir() only reads 32K of directory entries at a
> time, which means that reading all the entries of a directory with a
> large number of files takes a very long time.
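
For reference, here is a minimal user-space sketch of what libc
readdir() does under the hood: it refills a fixed-size buffer (assumed
to be 32K here; glibc's real buffer size and logic may differ) via
getdents64(2), so a large directory needs many refills.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char buf[32 * 1024];	/* one readdir() refill's worth */
	int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY | O_DIRECTORY);
	long n, refills = 0;

	if (fd < 0)
		return 1;
	/* each refill is one syscall; on NFS each refill fans out into RPCs */
	while ((n = syscall(SYS_getdents64, fd, buf, sizeof(buf))) > 0)
		refills++;
	printf("getdents64 refills: %ld\n", refills);
	close(fd);
	return 0;
}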
>
> Secondly, since libc readdir() reads 32K of directory entries at a
> time, that 32K buffer is split into 8 pages in kernel space, and one
> NFS readdirplus RPC is issued per page, which results in many
> readdirplus RPC calls.
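
Concretely: with 4K pages, a 32K buffer spans 32K / 4K = 8 pages, so a
single libc readdir() refill can cost up to 8 readdirplus RPC round
trips rather than one.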
>
> Lastly, one NFS readdirplus RPC asks for 32K of data (filled by
> nfs_dentry) to fill one page (filled by dentry); we found that nearly
> one third of the data was wasted.
>
> To solve the above problems, a page cache mechanism is introduced. One
> NFS readdirplus RPC now asks for a large amount of data (more than
> 32K), enough to fill more than one page, and the cached pages can be
> used by subsequent readdir calls. This reduces the number of
> readdirplus RPC calls and improves readdirplus performance.
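
The heart of the patch is the hit/miss test in nfs_readdir_filler(),
where pvec holds the pages decoded from the last large reply. A
simplified sketch of that logic (error handling and the readahead
refill omitted):

	if (desc->page_index >= desc->pvec.index &&
	    desc->page_index < desc->pvec.index + desc->pvec.nr) {
		/* hit: copy the already-decoded page out of the readahead cache */
		void *dst = kmap(page);
		void *src = kmap(desc->pvec.pages[desc->page_index - desc->pvec.index]);

		memcpy(dst, src, PAGE_SIZE);
		kunmap(desc->pvec.pages[desc->page_index - desc->pvec.index]);
		kunmap(page);
		ret = 0;
	} else {
		/* miss: issue a fresh readdirplus RPC and refill the cache */
		ret = nfs_readdir_xdr_to_array(desc, page, inode);
	}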
>
> TESTING:
> When listing very large directories (containing 300 thousand files) via NFS:
>
> time ls -l /nfs_mount | wc -l
>
> without the patch:
> 300001
> real    1m53.524s
> user    0m2.314s
> sys     0m2.599s
>
> with the patch:
> 300001
> real    0m23.487s
> user    0m2.305s
> sys     0m2.558s
>
> Performance improved by: 79.6%
> readdirplus RPC calls decreased by: 85%
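
For what it's worth, the arithmetic roughly checks out: the wall-clock
reduction is 1 - 23.487/113.524 ≈ 79.3%, matching the quoted 79.6% up
to run-to-run variation, and serving 8 pages (NFS_MAX_READDIR_RAPAGES)
per large reply instead of one page per RPC predicts a 1 - 1/8 = 87.5%
drop in calls, close to the measured 85%.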
>
> Signed-off-by: Liguang Zhang <zhangliguang@linux.alibaba.com>
> ---
>   fs/nfs/dir.c      | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++---
>   fs/nfs/internal.h |   3 ++
>   2 files changed, 111 insertions(+), 6 deletions(-)
>
> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> index 6bf4471..8b80c02 100644
> --- a/fs/nfs/dir.c
> +++ b/fs/nfs/dir.c
> @@ -139,12 +139,19 @@ struct nfs_cache_array {
>   	struct nfs_cache_array_entry array[0];
>   };
>   
> +struct readdirvec {
> +	unsigned long nr;
> +	unsigned long index;
> +	struct page *pages[NFS_MAX_READDIR_RAPAGES];
> +};
> +
>   typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, bool);
>   typedef struct {
>   	struct file	*file;
>   	struct page	*page;
>   	struct dir_context *ctx;
>   	unsigned long	page_index;
> +	struct readdirvec pvec;
>   	u64		*dir_cookie;
>   	u64		last_cookie;
>   	loff_t		current_index;
> @@ -524,6 +531,11 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
>   	struct nfs_cache_array *array;
>   	unsigned int count = 0;
>   	int status;
> +	int max_rapages = NFS_MAX_READDIR_RAPAGES;
> +	void *src, *dst;
> +
> +	desc->pvec.index = desc->page_index;
> +	desc->pvec.nr = 0;
>   
>   	scratch = alloc_page(GFP_KERNEL);
>   	if (scratch == NULL)
> @@ -548,20 +560,45 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
>   		if (desc->plus)
>   			nfs_prime_dcache(file_dentry(desc->file), entry);
>   
> -		status = nfs_readdir_add_to_array(entry, page);
> +		status = nfs_readdir_add_to_array(entry, desc->pvec.pages[desc->pvec.nr]);
> +		if (status == -ENOSPC) {
> +			desc->pvec.nr++;
> +			if (desc->pvec.nr == max_rapages)
> +				break;
> +			status = nfs_readdir_add_to_array(entry, desc->pvec.pages[desc->pvec.nr]);
> +		}
> +
>   		if (status != 0)
>   			break;
>   	} while (!entry->eof);
>   
> +	/*
> +	 * page and desc->pvec.pages[0] are known to be valid; no need
> +	 * to check them for NULL.
> +	 */
> +	dst = kmap(page);
> +	src = kmap(desc->pvec.pages[0]);
> +	memcpy(dst, src, PAGE_SIZE);
> +	kunmap(page);
> +	kunmap(desc->pvec.pages[0]);
> +
>   out_nopages:
>   	if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
> -		array = kmap(page);
> +		array = kmap(desc->pvec.pages[desc->pvec.nr]);
>   		array->eof_index = array->size;
>   		status = 0;
> -		kunmap(page);
> +		kunmap(desc->pvec.pages[desc->pvec.nr]);
>   	}
>   
>   	put_page(scratch);
> +
> +	/*
> +	 * desc->pvec.nr > 0 means at least one page was completely
> +	 * filled, so return -ENOSPC here; otherwise
> +	 * nfs_readdir_xdr_to_array() would enter an infinite loop.
> +	 */
> +	if (desc->pvec.nr > 0)
> +		return -ENOSPC;
>   	return status;
>   }
>   
> @@ -604,6 +641,30 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
>   	struct nfs_cache_array *array;
>   	int status = -ENOMEM;
>   	unsigned int array_size = ARRAY_SIZE(pages);
> +	int max_rapages = NFS_MAX_READDIR_RAPAGES;
> +	int page_index;
> +
> +	/*
> +	 * We got here on a readdir cache miss, so the preallocated
> +	 * readahead pages are stale; release them first, then allocate
> +	 * fresh pages for the next readdir.
> +	 */
> +	nfs_readdir_free_pages(desc->pvec.pages, max_rapages);
> +
> +	status = nfs_readdir_alloc_pages(desc->pvec.pages, max_rapages);
> +	if (status < 0)
> +		return -ENOMEM;
> +
> +	for (page_index = 0; page_index < max_rapages; page_index++) {
> +		array = kmap(desc->pvec.pages[page_index]);
> +		if (IS_ERR(array)) {
> +			status = PTR_ERR(array);
> +			return status;
> +		}
> +		memset(array, 0, sizeof(struct nfs_cache_array));
> +		array->eof_index = -1;
> +		kunmap(desc->pvec.pages[page_index]);
> +	}
>   
>   	entry.prev_cookie = 0;
>   	entry.cookie = desc->last_cookie;
> @@ -663,10 +724,30 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
>   {
>   	struct inode	*inode = file_inode(desc->file);
>   	int ret;
> +	void *dst, *src;
> +	unsigned long end_index = desc->pvec.index + desc->pvec.nr;
> +
> +	/*
> +	 * If desc->page_index lies in the range [desc->pvec.index,
> +	 * desc->pvec.index + desc->pvec.nr), we have a readdir cache hit.
> +	 */
> +	if ((desc->page_index >= desc->pvec.index) && (desc->page_index < end_index)) {
> +		/*
> +		 * page and desc->pvec.pages[x] are known to be valid;
> +		 * no need to check them for NULL.
> +		 */
> +		dst = kmap(page);
> +		src = kmap(desc->pvec.pages[desc->page_index - desc->pvec.index]);
> +		memcpy(dst, src, PAGE_SIZE);
> +		kunmap(page);
> +		kunmap(desc->pvec.pages[desc->page_index - desc->pvec.index]);
> +		ret = 0;
> +	} else {
> +		ret = nfs_readdir_xdr_to_array(desc, page, inode);
> +		if (ret < 0)
> +			goto error;
> +	}
>   
> -	ret = nfs_readdir_xdr_to_array(desc, page, inode);
> -	if (ret < 0)
> -		goto error;
>   	SetPageUptodate(page);
>   
>   	if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
> @@ -831,6 +912,10 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
>   			*desc = &my_desc;
>   	struct nfs_open_dir_context *dir_ctx = file->private_data;
>   	int res = 0;
> +	struct nfs_cache_array *array;
> +	int max_rapages = NFS_MAX_READDIR_RAPAGES;
> +	int status;
> +	int page_index;
>   
>   	dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
>   			file, (long long)ctx->pos);
> @@ -850,6 +935,21 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
>   	desc->decode = NFS_PROTO(inode)->decode_dirent;
>   	desc->plus = nfs_use_readdirplus(inode, ctx);
>   
> +	status = nfs_readdir_alloc_pages(desc->pvec.pages, max_rapages);
> +	if (status < 0)
> +		return -ENOMEM;
> +
> +	for (page_index = 0; page_index < max_rapages; page_index++) {
> +		array = kmap(desc->pvec.pages[page_index]);
> +		if (IS_ERR(array)) {
> +			status = PTR_ERR(array);
> +			goto out_pages_free;
> +		}
> +		memset(array, 0, sizeof(struct nfs_cache_array));
> +		array->eof_index = -1;
> +		kunmap(desc->pvec.pages[page_index]);
> +	}
> +
>   	if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
>   		res = nfs_revalidate_mapping(inode, file->f_mapping);
>   	if (res < 0)
> @@ -884,6 +984,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
>   		if (res < 0)
>   			break;
>   	} while (!desc->eof);
> +out_pages_free:
> +	nfs_readdir_free_pages(desc->pvec.pages, max_rapages);
>   out:
>   	if (res > 0)
>   		res = 0;
> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> index 7f80f03..132ffc7 100644
> --- a/fs/nfs/internal.h
> +++ b/fs/nfs/internal.h
> @@ -71,6 +71,9 @@ struct nfs_clone_mount {
>    */
>   #define NFS_MAX_READDIR_PAGES 8
>   
> +/* Maximum number of pages that readdir can read ahead. */
> +#define NFS_MAX_READDIR_RAPAGES 8
> +
>   struct nfs_client_initdata {
>   	unsigned long init_flags;
>   	const char *hostname;			/* Hostname of the server */
