linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Martin Brandenburg <martin@omnibond.com>
To: hubcap@omnibond.com, linux-fsdevel@vger.kernel.org,
	linux-kernel@vger.kernel.org
Cc: Martin Brandenburg <martin@omnibond.com>
Subject: [PATCH 07/10] orangefs: support very large directories
Date: Fri,  7 Apr 2017 17:17:09 -0400	[thread overview]
Message-ID: <1491599832-17773-8-git-send-email-martin@omnibond.com> (raw)
In-Reply-To: <1491599832-17773-1-git-send-email-martin@omnibond.com>

This works by maintaining a linked list of pages which the directory
has been read into rather than one giant fixed-size buffer.

This replaces code which limits the total directory size to the total
amount that could be returned in one server request.  Since filenames
are usually considerably shorter than the maximum, the old code could
usually handle several server requests before running out of space.

Signed-off-by: Martin Brandenburg <martin@omnibond.com>
---
 fs/orangefs/dir.c | 246 ++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 172 insertions(+), 74 deletions(-)

diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
index c48220f..7afde1b 100644
--- a/fs/orangefs/dir.c
+++ b/fs/orangefs/dir.c
@@ -6,18 +6,15 @@
 #include "orangefs-kernel.h"
 #include "orangefs-bufmap.h"
 
-/*
- * There can be up to 512 directory entries.  Each entry contains a four byte
- * string size, a string, and a 16 byte khandle.  The string can be up to 256
- * bytes and is encoded with a trailing zero and four byte padding for the
- * khandle.
- */
-#define MAX_DIRECTORY ((4 + 257 + 3 + 16)*512)
+struct orangefs_dir_page {
+	struct orangefs_dir_page *next;
+	size_t len;
+};
 
 struct orangefs_dir {
 	__u64 token;
-	void *directory;
-	size_t len;
+	struct orangefs_dir_page *page;
+	loff_t end;
 	int error;
 };
 
@@ -32,20 +29,12 @@ struct orangefs_dir {
  * be skipped to get to the directory data.
  */
 
-static int orangefs_dir_more(struct orangefs_inode_s *oi,
-    struct orangefs_dir *od, struct dentry *dentry)
+static int do_readdir(struct orangefs_inode_s *oi, struct orangefs_dir *od,
+    struct dentry *dentry, struct orangefs_kernel_op_s *op)
 {
-	const size_t offset = sizeof(struct orangefs_readdir_response_s);
 	struct orangefs_readdir_response_s *resp;
-	struct orangefs_kernel_op_s *op;
 	int bufi, r;
 
-	op = op_alloc(ORANGEFS_VFS_OP_READDIR);
-	if (!op) {
-		od->error = -ENOMEM;
-		return -ENOMEM;
-	}
-
 	/*
 	 * Despite the badly named field, readdir does not use shared memory.
 	 * However, there are a limited number of readdir slots, which must be
@@ -61,7 +50,6 @@ static int orangefs_dir_more(struct orangefs_inode_s *oi,
 again:
 	bufi = orangefs_readdir_index_get();
 	if (bufi < 0) {
-		op_release(op);
 		od->error = bufi;
 		return bufi;
 	}
@@ -79,7 +67,6 @@ static int orangefs_dir_more(struct orangefs_inode_s *oi,
 			goto again;
 		} else if (r == -EIO) {
 			vfree(op->downcall.trailer_buf);
-			op_release(op);
 			od->error = r;
 			return r;
 		}
@@ -87,75 +74,188 @@ static int orangefs_dir_more(struct orangefs_inode_s *oi,
 
 	if (r < 0) {
 		vfree(op->downcall.trailer_buf);
-		op_release(op);
 		od->error = r;
 		return r;
 	} else if (op->downcall.status) {
 		vfree(op->downcall.trailer_buf);
-		op_release(op);
 		od->error = op->downcall.status;
 		return op->downcall.status;
 	}
 
 	resp = (struct orangefs_readdir_response_s *)op->downcall.trailer_buf;
 	od->token = resp->token;
+	return 0;
+}
 
-	if (od->len + op->downcall.trailer_size - offset <= MAX_DIRECTORY) {
-		memcpy(od->directory + od->len,
-		    op->downcall.trailer_buf + offset,
-		    op->downcall.trailer_size - offset);
-		od->len += op->downcall.trailer_size - offset;
-	} else {
-		/* This limit was chosen based on protocol limits. */
-		gossip_err("orangefs_dir_more: userspace sent too much data\n");
-		vfree(op->downcall.trailer_buf);
-		op_release(op);
-		od->error = -EIO;
-		return -EIO;
+static int parse_readdir(struct orangefs_dir *od,
+    struct orangefs_kernel_op_s *op)
+{
+	const size_t offset = sizeof(struct orangefs_readdir_response_s);
+	size_t last, i = offset, count = 1;
+	struct orangefs_dir_page *page;
+
+	page = od->page;
+	while (page->next) {
+		page = page->next;
+		count++;
+	}
+
+	while (i < op->downcall.trailer_size) {
+		__u32 *len, padlen;
+		/* Calculate how much will fit into the current page. */
+		last = i;
+		while (i < op->downcall.trailer_size) {
+			len = (void *)op->downcall.trailer_buf + i;
+			padlen = (4 + *len + 1 + 16) +
+			    (8 - (4 + *len + 1 + 16)%8)%8;
+			if (i + padlen > op->downcall.trailer_size) {
+				gossip_err(
+				    "orangefs_dir_more: userspace corrupt");
+				return -EIO;
+			}
+			if (page->len + i + padlen - last >
+			    PAGE_SIZE - sizeof *page)
+				break;
+			i += padlen;
+		}
+
+		memcpy((void *)page + sizeof *page + page->len,
+		    op->downcall.trailer_buf + last, i - last);
+		page->len += i - last;
+		/*
+		 * Must add 2.  Once from the current page (count++ below has
+		 * not been done here) and once to move past the final page.
+		 * The iteration code moves to the beginning of the next page
+		 * when it runs out, so the end pointer must point to the
+		 * beginning of the page after the last page.
+		 */
+		od->end = (count + 2) << PAGE_SHIFT;
+		BUG_ON(page->len > PAGE_SIZE - sizeof *page);
+
+		/* New page for next bit. */
+
+		/*
+		 * Should this wait if there's more room in the page?  Sounds
+		 * hard and error-prone to attempt to measure and at best saves
+		 * only a few kilobytes.
+		 */
+
+		page->next = (void *)__get_free_page(GFP_KERNEL);
+		if (!page->next)
+			return -ENOMEM;
+		page = page->next;
+		count++;
+		page->next = NULL;
+		page->len = 0;
+
+		/* Must make progress. */
+		BUG_ON(last == i);
+	}
+
+	return 0;
+}
+
+static int orangefs_dir_more(struct orangefs_inode_s *oi,
+    struct orangefs_dir *od, struct dentry *dentry)
+{
+	struct orangefs_kernel_op_s *op;
+	int r;
+
+	op = op_alloc(ORANGEFS_VFS_OP_READDIR);
+	if (!op) {
+		od->error = -ENOMEM;
+		return -ENOMEM;
+	}
+	r = do_readdir(oi, od, dentry, op);
+	if (r) {
+		od->error = r;
+		goto out;
+	}
+	r = parse_readdir(od, op);
+	if (r) {
+		od->error = r;
+		goto out;
 	}
 
+	od->error = 0;
 	vfree(op->downcall.trailer_buf);
+out:
 	op_release(op);
-	return 0;
+	return od->error;
 }
 
-static int orangefs_dir_fill(struct orangefs_inode_s *oi,
-    struct orangefs_dir *od, struct dentry *dentry, struct dir_context *ctx)
+static int fill_from_page(struct orangefs_dir_page *page,
+    struct dir_context *ctx)
 {
 	struct orangefs_khandle *khandle;
 	__u32 *len, padlen;
 	loff_t i;
 	char *s;
-	i = ctx->pos - 2;
-	while (i < od->len) {
-		if (od->len < i + sizeof *len)
-			goto eio;
-		len = od->directory + i;
+	i = ctx->pos & ~PAGE_MASK;
+
+	/* The file offset from userspace is too large. */
+	if (i > page->len)
+		return -EIO;
+
+	while (i < page->len) {
+		if (page->len < i + sizeof *len)
+			return -EIO;
+		len = (void *)page + sizeof *page + i;
 		/* len does not include trailing zero but padlen does. */
 		padlen = (*len + 1) + (4 - (*len + 1)%4)%4;
-		if (od->len < i + sizeof *len + padlen + sizeof *khandle)
-			goto eio;
-		s = od->directory + i + sizeof *len;
+		if (page->len < i + sizeof *len + padlen + sizeof *khandle)
+			return -EIO;
+		s = (void *)page + sizeof *page + i + sizeof *len;
 		if (s[*len] != 0)
-			goto eio;
-		khandle = od->directory + i + sizeof *len + padlen;
-
+			return -EIO;
+		khandle = (void *)page + sizeof *page +
+		    i + sizeof *len + padlen;
 		if (!dir_emit(ctx, s, *len, orangefs_khandle_to_ino(khandle),
 		    DT_UNKNOWN))
 			return 0;
 		i += sizeof *len + padlen + sizeof *khandle;
 		i = i + (8 - i%8)%8;
-		ctx->pos = i + 2;
+		BUG_ON(i > page->len);
+		ctx->pos = (ctx->pos & PAGE_MASK) | i;
+	}
+	return 1;
+}
+
+static int orangefs_dir_fill(struct orangefs_inode_s *oi,
+    struct orangefs_dir *od, struct dentry *dentry, struct dir_context *ctx)
+{
+	struct orangefs_dir_page *page;
+	size_t count;
+
+	count = ((ctx->pos & PAGE_MASK) >> PAGE_SHIFT) - 1;
+
+	page = od->page;
+	while (page->next && count) {
+		count--;
+		page = page->next;
+	}
+	/* This means the userspace file offset is invalid. */
+	if (count) {
+		od->error = -EIO;
+		return -EIO;
+	}
+
+	while (page && page->len) {
+		int r;
+		r = fill_from_page(page, ctx);
+		if (r < 0) {
+			od->error = r;
+			return r;
+		} else if (r == 0) {
+			/* Userspace buffer is full. */
+			break;
+		} else {
+			/* The page ran out of data.  Move to the next page. */
+			ctx->pos = (ctx->pos & PAGE_MASK) + (1 << PAGE_SHIFT);
+			page = page->next;
+		}
 	}
-	BUG_ON(i > od->len);
 	return 0;
-eio:
-	/*
-	 * Here either data from userspace is corrupt or the application has
-	 * sought to an invalid location.
-	 */
-	od->error = -EIO;
-	return -EIO;
 }
 
 static int orangefs_dir_iterate(struct file *file, struct dir_context *ctx)
@@ -180,7 +280,7 @@ static int orangefs_dir_iterate(struct file *file, struct dir_context *ctx)
 	if (ctx->pos == 1) {
 		if (!dir_emit_dotdot(file, ctx))
 			return 0;
-		ctx->pos++;
+		ctx->pos = 1 << PAGE_SHIFT;
 	}
 
 	r = 0;
@@ -189,17 +289,16 @@ static int orangefs_dir_iterate(struct file *file, struct dir_context *ctx)
 	 * Must read more if the user has sought past what has been read so far.
 	 * Stop a user who has sought past the end.
 	 */
-	while (od->token != ORANGEFS_READDIR_END && ctx->pos - 2 > od->len) {
+	while (od->token != ORANGEFS_READDIR_END && ctx->pos > od->end) {
 		r = orangefs_dir_more(oi, od, dentry);
 		if (r)
 			return r;
 	}
-	if (od->token == ORANGEFS_READDIR_END && ctx->pos - 2 > od->len) {
+	if (od->token == ORANGEFS_READDIR_END && ctx->pos > od->end)
 		return -EIO;
-	}
 
 	/* Then try to fill if there's any left in the buffer. */
-	if (ctx->pos - 2 < od->len) {
+	if (ctx->pos < od->end) {
 		r = orangefs_dir_fill(oi, od, dentry, ctx);
 		if (r)
 			return r;
@@ -224,16 +323,10 @@ static int orangefs_dir_open(struct inode *inode, struct file *file)
 		return -ENOMEM;
 	od = file->private_data;
 	od->token = ORANGEFS_READDIR_START;
-	/*
-	 * XXX: It seems wasteful to allocate such a large buffer for
-	 * each request.  Most will be much smaller.
-	 */
-	od->directory = alloc_pages_exact(MAX_DIRECTORY, GFP_KERNEL);
-	if (!od->directory) {
-		kfree(file->private_data);
-		return -ENOMEM;
-	}
-	od->len = 0;
+	od->page = (void *)__get_free_page(GFP_KERNEL);
+	od->page->next = NULL;
+	od->page->len = 0;
+	od->end = 1 << PAGE_SHIFT;
 	od->error = 0;
 	return 0;
 }
@@ -241,8 +334,13 @@ static int orangefs_dir_open(struct inode *inode, struct file *file)
 static int orangefs_dir_release(struct inode *inode, struct file *file)
 {
 	struct orangefs_dir *od = file->private_data;
+	struct orangefs_dir_page *page = od->page;
 	orangefs_flush_inode(inode);
-	free_pages_exact(od->directory, MAX_DIRECTORY);
+	do {
+		struct orangefs_dir_page *next = page->next;
+		free_page((unsigned long)page);
+		page = next;
+	} while (page);
 	kfree(od);
 	return 0;
 }
-- 
2.1.4

  parent reply	other threads:[~2017-04-07 21:17 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-04-07 21:17 [PATCH 00/10] orangefs changes for 4.12 Martin Brandenburg
2017-04-07 21:17 ` [PATCH 01/10] orangefs: remove unused get_fsid_from_ino Martin Brandenburg
2017-04-07 21:17 ` [PATCH 02/10] orangefs: fix bounds check for listxattr Martin Brandenburg
2017-04-07 21:17 ` [PATCH 03/10] orangefs: clean up oversize xattr validation Martin Brandenburg
2017-04-07 21:17 ` [PATCH 04/10] orangefs: do not set getattr_time on orangefs_lookup Martin Brandenburg
2017-04-07 21:17 ` [PATCH 05/10] orangefs: rewrite readdir to fix several bugs Martin Brandenburg
2017-04-07 21:17 ` [PATCH 06/10] orangefs: support llseek on directories Martin Brandenburg
2017-04-07 21:17 ` Martin Brandenburg [this message]
2017-04-07 21:17 ` [PATCH 08/10] orangefs: remove ORANGEFS_READDIR macros Martin Brandenburg
2017-04-07 21:17 ` [PATCH 09/10] orangefs: implement statx Martin Brandenburg
2017-04-07 21:17 ` [PATCH 10/10] orangefs: do not check possibly stale size on truncate Martin Brandenburg

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1491599832-17773-8-git-send-email-martin@omnibond.com \
    --to=martin@omnibond.com \
    --cc=hubcap@omnibond.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).