[Cluster-devel] [PATCH 0/2] positional readdir cookies

cluster-devel.redhat.com archive mirror
 help / color / mirror / Atom feed

* [Cluster-devel] [PATCH 0/2] positional readdir cookies
@ 2015-12-01  6:10 Benjamin Marzinski
  2015-12-01  6:10 ` [Cluster-devel] [PATCH 1/2] gfs2: keep offset when splitting dir leaf blocks Benjamin Marzinski
  2015-12-01  6:10 ` [Cluster-devel] [PATCH 2/2] gfs2: change gfs2 readdir cookie Benjamin Marzinski
  0 siblings, 2 replies; 5+ messages in thread
From: Benjamin Marzinski @ 2015-12-01  6:10 UTC (permalink / raw)
  To: cluster-devel.redhat.com

These two patches implement positional readdir cookies. The first one simply
changes how splitting leaf blocks works to allow for this method to work. The
second one does the meat of the work.

Like I mention in the second patch, this adds a new parameter to the dirent
structure that is never saved to disk.  This is simply to make use of the
memory to store the computed location based cookie. Avoiding this has a
noticeable performance impact. However, I'm open to any ideas on how to make
this look less strange (or, any other ways of getting space to store these
values that doesn't involve allocating it, which caused the performance hit).

Benjamin Marzinski (2):
  gfs2: keep offset when splitting dir leaf blocks
  gfs2: change gfs2 readdir cookie

 fs/gfs2/dir.c                    | 160 ++++++++++++++++++++++++++++++---------
 fs/gfs2/incore.h                 |   3 +
 fs/gfs2/ops_fstype.c             |   3 +
 fs/gfs2/super.c                  |  12 +++
 include/uapi/linux/gfs2_ondisk.h |   9 ++-
 5 files changed, 148 insertions(+), 39 deletions(-)

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Cluster-devel] [PATCH 1/2] gfs2: keep offset when splitting dir leaf blocks
  2015-12-01  6:10 [Cluster-devel] [PATCH 0/2] positional readdir cookies Benjamin Marzinski
@ 2015-12-01  6:10 ` Benjamin Marzinski
  2015-12-01  6:10 ` [Cluster-devel] [PATCH 2/2] gfs2: change gfs2 readdir cookie Benjamin Marzinski
  1 sibling, 0 replies; 5+ messages in thread
From: Benjamin Marzinski @ 2015-12-01  6:10 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Currently, when gfs2 splits a directory leaf block, the dirents that
need to be copied to the new leaf block are packed into the start of it.
This is good for space efficiency. However, if gfs2 were to copy those
dirents into the exact same offset in the new leaf block as they had in
the old block, it would be able to generate a readdir cookie based on
the dirent location, that would be guaranteed to be unique up well past
where the current code is statistically almost guaranteed to have
collisions. So, gfs2 now keeps the dirent's offset in the block the
same when it copies it to the new leaf block.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
---
 fs/gfs2/dir.c | 69 +++++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 53 insertions(+), 16 deletions(-)

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c248659..4ee008c 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -443,6 +443,27 @@ static int gfs2_dirent_last(const struct gfs2_dirent *dent,
 	return 0;
 }
 
+/* Look for the dirent that contains the offset specified in data. Once we
+ * find that dirent, there must be space available there for the new dirent */
+static int gfs2_dirent_find_offset(const struct gfs2_dirent *dent,
+				  const struct qstr *name,
+				  void *ptr)
+{
+	unsigned required = GFS2_DIRENT_SIZE(name->len);
+	unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
+	unsigned totlen = be16_to_cpu(dent->de_rec_len);
+
+	if (ptr < (void *)dent || ptr >= (void *)dent + totlen)
+		return 0;
+	if (gfs2_dirent_sentinel(dent))
+		actual = 0;
+	if (ptr < (void *)dent + actual)
+		return -1;
+	if ((void *)dent + totlen >= ptr + required)
+		return 1;
+	return -1;
+}
+
 static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
 				  const struct qstr *name,
 				  void *opaque)
@@ -682,6 +703,27 @@ static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
 	prev->de_rec_len = cpu_to_be16(prev_rec_len);
 }
 
+
+static struct gfs2_dirent *do_init_dirent(struct inode *inode,
+					  struct gfs2_dirent *dent,
+					  const struct qstr *name,
+					  struct buffer_head *bh,
+					  unsigned offset)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_dirent *ndent;
+	unsigned totlen;
+
+	totlen = be16_to_cpu(dent->de_rec_len);
+	BUG_ON(offset + name->len > totlen);
+	gfs2_trans_add_meta(ip->i_gl, bh);
+	ndent = (struct gfs2_dirent *)((char *)dent + offset);
+	dent->de_rec_len = cpu_to_be16(offset);
+	gfs2_qstr2dirent(name, totlen - offset, ndent);
+	return ndent;
+}
+
+
 /*
  * Takes a dent from which to grab space as an argument. Returns the
  * newly created dent.
@@ -691,31 +733,25 @@ static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
 					    const struct qstr *name,
 					    struct buffer_head *bh)
 {
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_dirent *ndent;
-	unsigned offset = 0, totlen;
+	unsigned offset = 0;
 
 	if (!gfs2_dirent_sentinel(dent))
 		offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
-	totlen = be16_to_cpu(dent->de_rec_len);
-	BUG_ON(offset + name->len > totlen);
-	gfs2_trans_add_meta(ip->i_gl, bh);
-	ndent = (struct gfs2_dirent *)((char *)dent + offset);
-	dent->de_rec_len = cpu_to_be16(offset);
-	gfs2_qstr2dirent(name, totlen - offset, ndent);
-	return ndent;
+	return do_init_dirent(inode, dent, name, bh, offset);
 }
 
-static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode,
-					     struct buffer_head *bh,
-					     const struct qstr *name)
+static struct gfs2_dirent *gfs2_dirent_split_alloc(struct inode *inode,
+						   struct buffer_head *bh,
+						   const struct qstr *name,
+						   void *ptr)
 {
 	struct gfs2_dirent *dent;
 	dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
-				gfs2_dirent_find_space, name, NULL);
+				gfs2_dirent_find_offset, name, ptr);
 	if (!dent || IS_ERR(dent))
 		return dent;
-	return gfs2_init_dirent(inode, dent, name, bh);
+	return do_init_dirent(inode, dent, name, bh,
+			      (unsigned)(ptr - (void *)dent));
 }
 
 static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
@@ -1051,10 +1087,11 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 		if (!gfs2_dirent_sentinel(dent) &&
 		    be32_to_cpu(dent->de_hash) < divider) {
 			struct qstr str;
+			void *ptr = ((char *)dent - obh->b_data) + nbh->b_data;
 			str.name = (char*)(dent+1);
 			str.len = be16_to_cpu(dent->de_name_len);
 			str.hash = be32_to_cpu(dent->de_hash);
-			new = gfs2_dirent_alloc(inode, nbh, &str);
+			new = gfs2_dirent_split_alloc(inode, nbh, &str, ptr);
 			if (IS_ERR(new)) {
 				error = PTR_ERR(new);
 				break;
-- 
1.8.3.1



^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [Cluster-devel] [PATCH 2/2] gfs2: change gfs2 readdir cookie
  2015-12-01  6:10 [Cluster-devel] [PATCH 0/2] positional readdir cookies Benjamin Marzinski
  2015-12-01  6:10 ` [Cluster-devel] [PATCH 1/2] gfs2: keep offset when splitting dir leaf blocks Benjamin Marzinski
@ 2015-12-01  6:10 ` Benjamin Marzinski
  2015-12-01 15:01   ` Bob Peterson
  1 sibling, 1 reply; 5+ messages in thread
From: Benjamin Marzinski @ 2015-12-01  6:10 UTC (permalink / raw)
  To: cluster-devel.redhat.com

gfs2 currently returns 31 bits of filename hash as a cookie that readdir
uses for an offset into the directory.  When there are a large number of
directory entries, the likelihood of a collision goes up way too
quickly.  GFS2 will now return cookies that are guaranteed unique for a
while, and then fail back to using 30 bits of filename hash.
Specifically, the directory leaf blocks are divided up into chunks based
on the minimum size of a gfs2 directory entry (48 bytes). Each entry's
cookie is based off the chunk where it starts, in the linked list of
leaf blocks that it hashes to (there are 131072 hash buckets). Directory
entries will have unique names until they reach chunk 8192.
Assuming the largest filenames possible, and the least efficient spacing
possible, this new method will still be able to return unique names when
the previous method has statistically more than a 99% chance of a
collision.  The non-unique names it fails back to are guaranteed to not
collide with the unique names.

unique cookies will be in this format:
- 1 bit "0" to make sure the returned cookie is positive
- 17 bits for the hash table index
- 1 bit for the mode "0"
- 13 bits for the offset

non-unique cookies will be in this format:
- 1 bit "0" to make sure the returned cookie is positive
- 17 bits for the hash table index
- 1 bit for the mode "1"
- 13 more bits of the name hash

Another benefit of location based cookies, is that once a directory's
exhash table is fully extended (so that multiple hash table indexes do
not use the same leaf blocks), gfs2 can skip sorting the directory
entries until it reaches the non-unique ones, and then it only needs to
sort these. This provides a significant speed up for directory reads of
very large directories.

The only issue is that for these cookies to continue to point to the
correct entry as files are added and removed from the directory, gfs2
must keep the entries at the same offset in the leaf block when they are
split (see my previous patch). This means that until all the nodes in a
cluster are running with code that will split the directory leaf blocks
this way, none of the nodes can use the new cookie code. To deal with
this, gfs2 now has the mount option loccookie, which, if set, will make
it return these new location based cookies.  This option must not be set
until all nodes in the cluster are at least running this version of the
kernel code, and you have guaranteed that there are no outstanding
cookies required by other software, such as NFS.

gfs2 uses some of the extra space at the end of the gfs2_dirent
structure to store the calculated readdir cookies. This keeps us from
needing to allocate a separate array to hold these values.  gfs2
recomputes the cookie stored in de_cookie for every readdir call.  The
time it takes to do so is small, and if gfs2 expected this value to be
saved on disk, the new code wouldn't work correctly on filesystems
created with an earlier version of gfs2.

One issue with adding de_cookie to the union in the gfs2_dirent
structure is that it caused the union to align itself to a 4 byte
boundary, instead of its previous 2 byte boundary. This changed the
offset of de_rahead. To solve that, I pulled de_rahead out of the union,
since it does not need to be there.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
---
 fs/gfs2/dir.c                    | 91 +++++++++++++++++++++++++++++++---------
 fs/gfs2/incore.h                 |  3 ++
 fs/gfs2/ops_fstype.c             |  3 ++
 fs/gfs2/super.c                  | 12 ++++++
 include/uapi/linux/gfs2_ondisk.h |  9 ++--
 5 files changed, 95 insertions(+), 23 deletions(-)

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 4ee008c..6a92592 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -82,6 +82,8 @@
 
 #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
 #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
+#define GFS2_HASH_INDEX_MASK 0xffffc000
+#define GFS2_USE_HASH_FLAG 0x2000
 
 struct qstr gfs2_qdot __read_mostly;
 struct qstr gfs2_qdotdot __read_mostly;
@@ -1223,10 +1225,10 @@ static int compare_dents(const void *a, const void *b)
 	int ret = 0;
 
 	dent_a = *(const struct gfs2_dirent **)a;
-	hash_a = be32_to_cpu(dent_a->de_hash);
+	hash_a = dent_a->de_cookie;
 
 	dent_b = *(const struct gfs2_dirent **)b;
-	hash_b = be32_to_cpu(dent_b->de_hash);
+	hash_b = dent_b->de_cookie;
 
 	if (hash_a > hash_b)
 		ret = 1;
@@ -1264,19 +1266,20 @@ static int compare_dents(const void *a, const void *b)
  */
 
 static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
-			   const struct gfs2_dirent **darr, u32 entries,
-			   int *copied)
+			   struct gfs2_dirent **darr, u32 entries,
+			   u32 sort_start, int *copied)
 {
 	const struct gfs2_dirent *dent, *dent_next;
 	u64 off, off_next;
 	unsigned int x, y;
 	int run = 0;
 
-	sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
+	if (sort_start < entries)
+		sort(&darr[sort_start], entries - sort_start,
+		     sizeof(struct gfs2_dirent *), compare_dents, NULL);
 
 	dent_next = darr[0];
-	off_next = be32_to_cpu(dent_next->de_hash);
-	off_next = gfs2_disk_hash2offset(off_next);
+	off_next = dent_next->de_cookie;
 
 	for (x = 0, y = 1; x < entries; x++, y++) {
 		dent = dent_next;
@@ -1284,8 +1287,7 @@ static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
 
 		if (y < entries) {
 			dent_next = darr[y];
-			off_next = be32_to_cpu(dent_next->de_hash);
-			off_next = gfs2_disk_hash2offset(off_next);
+			off_next = dent_next->de_cookie;
 
 			if (off < ctx->pos)
 				continue;
@@ -1332,6 +1334,40 @@ static void *gfs2_alloc_sort_buffer(unsigned size)
 	return ptr;
 }
 
+
+static int gfs2_set_cookies(struct gfs2_sbd *sdp, struct buffer_head *bh,
+			    unsigned leaf_nr, struct gfs2_dirent **darr,
+			    unsigned entries)
+{
+	int sort_id = -1;
+	int i;
+	
+	for (i = 0; i < entries; i++) {
+		unsigned offset;
+
+		darr[i]->de_cookie = be32_to_cpu(darr[i]->de_hash);
+		darr[i]->de_cookie = gfs2_disk_hash2offset(darr[i]->de_cookie);
+
+		if (!sdp->sd_args.ar_loccookie)
+			continue;
+		offset = (char *)(darr[i]) -
+			 (bh->b_data + gfs2_dirent_offset(bh->b_data));
+		offset /= GFS2_MIN_DIRENT_SIZE;
+		offset += leaf_nr * sdp->sd_max_dents_per_leaf;
+		if (offset >= GFS2_USE_HASH_FLAG ||
+		    leaf_nr >= GFS2_USE_HASH_FLAG) {
+			darr[i]->de_cookie |= GFS2_USE_HASH_FLAG;
+			if (sort_id < 0)
+				sort_id = i;
+			continue;
+		}
+		darr[i]->de_cookie &= GFS2_HASH_INDEX_MASK;
+		darr[i]->de_cookie |= offset;
+	}
+	return sort_id;
+}	
+
+
 static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 			      int *copied, unsigned *depth,
 			      u64 leaf_no)
@@ -1341,12 +1377,11 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 	struct buffer_head *bh;
 	struct gfs2_leaf *lf;
 	unsigned entries = 0, entries2 = 0;
-	unsigned leaves = 0;
-	const struct gfs2_dirent **darr, *dent;
+	unsigned leaves = 0, leaf = 0, offset, sort_offset;
+	struct gfs2_dirent **darr, *dent;
 	struct dirent_gather g;
 	struct buffer_head **larr;
-	int leaf = 0;
-	int error, i;
+	int error, i, need_sort = 0, sort_id;
 	u64 lfn = leaf_no;
 
 	do {
@@ -1362,6 +1397,11 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 		brelse(bh);
 	} while(lfn);
 
+	if (*depth < GFS2_DIR_MAX_DEPTH || !sdp->sd_args.ar_loccookie) {
+		need_sort = 1;
+		sort_offset = 0;
+	}
+
 	if (!entries)
 		return 0;
 
@@ -1375,8 +1415,8 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 	larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *));
 	if (!larr)
 		goto out;
-	darr = (const struct gfs2_dirent **)(larr + leaves);
-	g.pdent = darr;
+	darr = (struct gfs2_dirent **)(larr + leaves);
+	g.pdent = (const struct gfs2_dirent **)darr;
 	g.offset = 0;
 	lfn = leaf_no;
 
@@ -1387,6 +1427,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 		lf = (struct gfs2_leaf *)bh->b_data;
 		lfn = be64_to_cpu(lf->lf_next);
 		if (lf->lf_entries) {
+			offset = g.offset;
 			entries2 += be16_to_cpu(lf->lf_entries);
 			dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
 						gfs2_dirent_gather, NULL, &g);
@@ -1404,17 +1445,26 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 				goto out_free;
 			}
 			error = 0;
+			sort_id = gfs2_set_cookies(sdp, bh, leaf, &darr[offset],
+						   be16_to_cpu(lf->lf_entries));
+			if (!need_sort && sort_id >= 0) {
+				need_sort = 1;
+				sort_offset = offset + sort_id;
+			}
 			larr[leaf++] = bh;
 		} else {
+			larr[leaf++] = NULL;
 			brelse(bh);
 		}
 	} while(lfn);
 
 	BUG_ON(entries2 != entries);
-	error = do_filldir_main(ip, ctx, darr, entries, copied);
+	error = do_filldir_main(ip, ctx, darr, entries, need_sort ?
+				sort_offset : entries, copied);
 out_free:
 	for(i = 0; i < leaf; i++)
-		brelse(larr[i]);
+		if (larr[i])
+			brelse(larr[i]);
 	kvfree(larr);
 out:
 	return error;
@@ -1520,7 +1570,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
 	struct gfs2_inode *dip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct dirent_gather g;
-	const struct gfs2_dirent **darr, *dent;
+	struct gfs2_dirent **darr, *dent;
 	struct buffer_head *dibh;
 	int copied = 0;
 	int error;
@@ -1544,7 +1594,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
 	/* 96 is max number of dirents which can be stuffed into an inode */
 	darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS);
 	if (darr) {
-		g.pdent = darr;
+		g.pdent = (const struct gfs2_dirent **)darr;
 		g.offset = 0;
 		dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
 					gfs2_dirent_gather, NULL, &g);
@@ -1561,8 +1611,9 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
 			error = -EIO;
 			goto out;
 		}
+		gfs2_set_cookies(sdp, dibh, 0, darr, dip->i_entries);
 		error = do_filldir_main(dip, ctx, darr,
-					dip->i_entries, &copied);
+					dip->i_entries, 0, &copied);
 out:
 		kfree(darr);
 	}
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 921304e..e8eaf71 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -562,6 +562,8 @@ struct gfs2_args {
 	unsigned int ar_errors:2;               /* errors=withdraw | panic */
 	unsigned int ar_nobarrier:1;            /* do not send barriers */
 	unsigned int ar_rgrplvb:1;		/* use lvbs for rgrp info */
+	unsigned int ar_loccookie;		/* use location based readdir
+						   cookies */
 	int ar_commit;				/* Commit interval */
 	int ar_statfs_quantum;			/* The fast statfs interval */
 	int ar_quota_quantum;			/* The quota interval */
@@ -689,6 +691,7 @@ struct gfs2_sbd {
 	u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1];
 	u32 sd_max_jheight; /* Max height of journaled file's meta tree */
 	u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1];
+	u32 sd_max_dents_per_leaf; /* Max number of dirents in a leaf block */
 
 	struct gfs2_args sd_args;	/* Mount arguments */
 	struct gfs2_tune sd_tune;	/* Filesystem tuning structure */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1f9de17..7aacdf2 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -352,6 +352,9 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
 	sdp->sd_jheightsize[x] = ~0;
 	gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
 
+	sdp->sd_max_dents_per_leaf = (sdp->sd_sb.sb_bsize -
+				      sizeof(struct gfs2_leaf)) /
+				     GFS2_MIN_DIRENT_SIZE;
 	return 0;
 }
 
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 03fa155..0f3d646 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -83,6 +83,8 @@ enum {
 	Opt_nobarrier,
 	Opt_rgrplvb,
 	Opt_norgrplvb,
+	Opt_loccookie,
+	Opt_noloccookie,
 	Opt_error,
 };
 
@@ -122,6 +124,8 @@ static const match_table_t tokens = {
 	{Opt_nobarrier, "nobarrier"},
 	{Opt_rgrplvb, "rgrplvb"},
 	{Opt_norgrplvb, "norgrplvb"},
+	{Opt_loccookie, "loccookie"},
+	{Opt_noloccookie, "noloccookie"},
 	{Opt_error, NULL}
 };
 
@@ -278,6 +282,12 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
 		case Opt_norgrplvb:
 			args->ar_rgrplvb = 0;
 			break;
+		case Opt_loccookie:
+			args->ar_loccookie = 1;
+			break;
+		case Opt_noloccookie:
+			args->ar_loccookie = 0;
+			break;
 		case Opt_error:
 		default:
 			pr_warn("invalid mount option: %s\n", o);
@@ -1418,6 +1428,8 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
 		seq_puts(s, ",demote_interface_used");
 	if (args->ar_rgrplvb)
 		seq_puts(s, ",rgrplvb");
+	if (args->ar_loccookie)
+		seq_puts(s, ",loccookie");
 	return 0;
 }
 
diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h
index 1a763ea..7c4be77 100644
--- a/include/uapi/linux/gfs2_ondisk.h
+++ b/include/uapi/linux/gfs2_ondisk.h
@@ -297,6 +297,8 @@ struct gfs2_dinode {
 
 #define GFS2_FNAMESIZE		255
 #define GFS2_DIRENT_SIZE(name_len) ((sizeof(struct gfs2_dirent) + (name_len) + 7) & ~7)
+#define GFS2_MIN_DIRENT_SIZE (GFS2_DIRENT_SIZE(1))
+
 
 struct gfs2_dirent {
 	struct gfs2_inum de_inum;
@@ -304,11 +306,12 @@ struct gfs2_dirent {
 	__be16 de_rec_len;
 	__be16 de_name_len;
 	__be16 de_type;
+	__be16 de_rahead;
 	union {
-		__u8 __pad[14];
+		__u8 __pad[12];
 		struct {
-			__be16 de_rahead;
-			__u8 pad2[12];
+			__u32 de_cookie; /* ondisk value not used */
+			__u8 pad3[8];
 		};
 	};
 };
-- 
1.8.3.1



^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [Cluster-devel] [PATCH 2/2] gfs2: change gfs2 readdir cookie
  2015-12-01  6:10 ` [Cluster-devel] [PATCH 2/2] gfs2: change gfs2 readdir cookie Benjamin Marzinski
@ 2015-12-01 15:01   ` Bob Peterson
  0 siblings, 0 replies; 5+ messages in thread
From: Bob Peterson @ 2015-12-01 15:01 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi Ben,

----- Original Message -----
(snip)

> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index 921304e..e8eaf71 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -562,6 +562,8 @@ struct gfs2_args {
>  	unsigned int ar_errors:2;               /* errors=withdraw | panic */
>  	unsigned int ar_nobarrier:1;            /* do not send barriers */
>  	unsigned int ar_rgrplvb:1;		/* use lvbs for rgrp info */
> +	unsigned int ar_loccookie;		/* use location based readdir
> +						   cookies */
>  	int ar_commit;				/* Commit interval */
>  	int ar_statfs_quantum;			/* The fast statfs interval */
>  	int ar_quota_quantum;			/* The quota interval */

Just a nit: Is it okay if we specify ar_loccookie:1 for consistency?

Otherwise, the patches look good, and I've tested a RHEL6 port of them
using specsfs which uses NFS and gives this code a good workout.

Regards,

Bob Peterson
Red Hat File Systems



^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Cluster-devel] [PATCH 0/2] readdir cookie patches
@ 2015-07-18  4:40 Benjamin Marzinski
  2015-07-18  4:40 ` [Cluster-devel] [PATCH 2/2] gfs2: change gfs2 readdir cookie Benjamin Marzinski
  0 siblings, 1 reply; 5+ messages in thread
From: Benjamin Marzinski @ 2015-07-18  4:40 UTC (permalink / raw)
  To: cluster-devel.redhat.com

So, these two patches change the readdir cookies to a format that should last a
lot longer before there are collisions. When I did performance testing, the
results depended on how large the directory was.  For small directories, the
code performed slightly worse, with it becoming more noticeable as the
directories got larger until a point. At around 100000 entries it seemed the
worst, with a "ls -f" time of 0.095s in the new code vs 0.084s in the old code.
After that you start reaching the point where hash indexes reach the maximum
depth, and the new code stops needing to sort them, and performance of the new
code quickly surpasses the old code. For instance, when I contrived a situation
where there were 1000 dirents with the same hash index, the new code "ls -f"
time was less than a tenth of the old code, 0.003s vs 0.036s. However, this
is a pretty unrealistic size, since with 131072 hash buckets, you shouldn't
expect this many dirents per average bucket until you have around 130 million
files in a directory.

The only other real issue with the new code is that since we have to compute
and save the cookie when we first process the dirent in the read_dir_code,
instead of at the moment of sorting, we need to double the space used to
save the dirents for sorting. We could avoid this by using part of the
dirent padding as a scratch space to store the computed cookie. 

Benjamin Marzinski (2):
  gfs2: keep offset when splitting dir leaf blocks
  gfs2: change gfs2 readdir cookie

 fs/gfs2/dir.c                    | 189 ++++++++++++++++++++++++++++++---------
 fs/gfs2/incore.h                 |   3 +
 fs/gfs2/ops_fstype.c             |   3 +
 fs/gfs2/super.c                  |  12 +++
 include/uapi/linux/gfs2_ondisk.h |   2 +
 5 files changed, 167 insertions(+), 42 deletions(-)

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Cluster-devel] [PATCH 2/2] gfs2: change gfs2 readdir cookie
  2015-07-18  4:40 [Cluster-devel] [PATCH 0/2] readdir cookie patches Benjamin Marzinski
@ 2015-07-18  4:40 ` Benjamin Marzinski
  0 siblings, 0 replies; 5+ messages in thread
From: Benjamin Marzinski @ 2015-07-18  4:40 UTC (permalink / raw)
  To: cluster-devel.redhat.com

gfs2 currently returns 31 bits of filename hash as a cookie that readdir
uses for an offset into the directory.  When there are a large number of
directory entries, the likelihood of a collision goes up way too
quickly.  GFS2 will now return cookies that are guaranteed unique for a
while, and then fail back to using 30 bits of filename hash.
Specifically, the directory leaf blocks are divided up into chunks based
on the minimum size of a gfs2 directory entry (48 bytes). Each entry's
cookie is based off the chunk where it starts, in the linked list of
leaf blocks that it hashes to (there are 131072 hash buckets). Directory
entries will have unique names until they reach chunk 8192.
Assuming the largest filenames possible, and the least efficient spacing
possible, this new method will still be able to return unique names when
the previous method has statistically more than a 99% chance of a
collision.  The non-unique names it fails back to are guaranteed to not
collide with the unique names.

unique cookies will be in this format:
- 1 bit "0" to make sure the returned cookie is positive
- 17 bits for the hash table index
- 1 bit for the mode "0"
- 13 bits for the offset

non-unique cookies will be in this format:
- 1 bit "0" to make sure the returned cookie is positive
- 17 bits for the hash table index
- 1 bit for the mode "1"
- 13 more bits of the name hash

Another benefit of location based cookies, is that once a directory's
exhash table is fully extended, so that multiple hash table indexes do
not share the same leaf blocks, gfs2 no longer needs to sort the
directory entries until it reaches the non-unique ones, and then it only
needs to sort these. This provides a significant speed up for directory
reads of very large directories.

The only issue is that for these cookies to continue to point to the
correct entry as files are added and removed from the directory, gfs2
must keep the entries at the same offset in the leaf block when they are
split (see my previous patch). This means that until all the nodes in a
cluster are running with code that will split the directory leaf blocks
this way, none of the nodes can use the new cookie code. To deal with
this, gfs2 now has the mount option loccookie, which, if set, will make
it return these new location based cookies.  This option must not be set
until all nodes in the cluster are at least running this version of the
kernel code, and you have guaranteed that there are no outstanding
cookies required by other software, such as NFS.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
---
 fs/gfs2/dir.c                    | 120 ++++++++++++++++++++++++++++++---------
 fs/gfs2/incore.h                 |   3 +
 fs/gfs2/ops_fstype.c             |   3 +
 fs/gfs2/super.c                  |  12 ++++
 include/uapi/linux/gfs2_ondisk.h |   2 +
 5 files changed, 114 insertions(+), 26 deletions(-)

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index a894557..7c2ccca 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -82,6 +82,8 @@
 
 #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
 #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
+#define GFS2_HASH_INDEX_MASK 0xffffc000
+#define GFS2_USE_HASH_FLAG 0x2000
 
 struct qstr gfs2_qdot __read_mostly;
 struct qstr gfs2_qdotdot __read_mostly;
@@ -474,8 +476,13 @@ static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
 	return 0;
 }
 
+struct dirent_cookie {
+	const struct gfs2_dirent *dent;
+	u32 cookie;
+};
+
 struct dirent_gather {
-	const struct gfs2_dirent **pdent;
+	struct dirent_cookie *pdent;
 	unsigned offset;
 };
 
@@ -485,7 +492,7 @@ static int gfs2_dirent_gather(const struct gfs2_dirent *dent,
 {
 	struct dirent_gather *g = opaque;
 	if (!gfs2_dirent_sentinel(dent)) {
-		g->pdent[g->offset++] = dent;
+		g->pdent[g->offset++].dent = dent;
 	}
 	return 0;
 }
@@ -1217,11 +1224,11 @@ static int compare_dents(const void *a, const void *b)
 	u32 hash_a, hash_b;
 	int ret = 0;
 
-	dent_a = *(const struct gfs2_dirent **)a;
-	hash_a = be32_to_cpu(dent_a->de_hash);
+	dent_a = ((const struct dirent_cookie *)a)->dent;
+	hash_a = ((const struct dirent_cookie *)a)->cookie;
 
-	dent_b = *(const struct gfs2_dirent **)b;
-	hash_b = be32_to_cpu(dent_b->de_hash);
+	dent_b = ((const struct dirent_cookie *)b)->dent;
+	hash_b = ((const struct dirent_cookie *)b)->cookie;
 
 	if (hash_a > hash_b)
 		ret = 1;
@@ -1259,28 +1266,28 @@ static int compare_dents(const void *a, const void *b)
  */
 
 static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
-			   const struct gfs2_dirent **darr, u32 entries,
-			   int *copied)
+			   struct dirent_cookie *darr, u32 entries,
+			   u32 sort_start, int *copied)
 {
 	const struct gfs2_dirent *dent, *dent_next;
 	u64 off, off_next;
 	unsigned int x, y;
 	int run = 0;
 
-	sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
+	if (sort_start < entries)
+		sort(&darr[sort_start], entries - sort_start, sizeof(darr[0]),
+		     compare_dents, NULL);
 
-	dent_next = darr[0];
-	off_next = be32_to_cpu(dent_next->de_hash);
-	off_next = gfs2_disk_hash2offset(off_next);
+	dent_next = darr[0].dent;
+	off_next = darr[0].cookie;
 
 	for (x = 0, y = 1; x < entries; x++, y++) {
 		dent = dent_next;
 		off = off_next;
 
 		if (y < entries) {
-			dent_next = darr[y];
-			off_next = be32_to_cpu(dent_next->de_hash);
-			off_next = gfs2_disk_hash2offset(off_next);
+			dent_next = darr[y].dent;
+			off_next = darr[y].cookie;
 
 			if (off < ctx->pos)
 				continue;
@@ -1327,6 +1334,36 @@ static void *gfs2_alloc_sort_buffer(unsigned size)
 	return ptr;
 }
 
+
+static void gfs2_set_cookies(struct gfs2_sbd *sdp, struct buffer_head *bh,
+			     unsigned leaf_nr, struct dirent_cookie *darr,
+			     unsigned entries)
+{
+	int i;
+	
+	for (i = 0; i < entries; i++) {
+		unsigned offset;
+
+		darr[i].cookie = be32_to_cpu(darr[i].dent->de_hash);
+		darr[i].cookie = gfs2_disk_hash2offset(darr[i].cookie);
+
+		if (!sdp->sd_args.ar_loccookie)
+			continue;
+		offset = (char *)(darr[i].dent) -
+			 (bh->b_data + gfs2_dirent_offset(bh->b_data));
+		offset = offset / GFS2_MIN_DIRENT_SIZE;
+		offset += leaf_nr * sdp->sd_max_dents_per_leaf;
+		if (offset >= GFS2_USE_HASH_FLAG ||
+		    leaf_nr >= GFS2_USE_HASH_FLAG) {
+			darr[i].cookie |= GFS2_USE_HASH_FLAG;
+			continue;
+		}
+		darr[i].cookie &= GFS2_HASH_INDEX_MASK;
+		darr[i].cookie |= offset;
+	}
+}	
+
+
 static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 			      int *copied, unsigned *depth,
 			      u64 leaf_no)
@@ -1336,12 +1373,12 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 	struct buffer_head *bh;
 	struct gfs2_leaf *lf;
 	unsigned entries = 0, entries2 = 0;
-	unsigned leaves = 0;
-	const struct gfs2_dirent **darr, *dent;
+	unsigned leaves = 0, leaf = 0, offset, sort_offset;
+	struct dirent_cookie *darr;
+	const struct gfs2_dirent *dent;
 	struct dirent_gather g;
 	struct buffer_head **larr;
-	int leaf = 0;
-	int error, i;
+	int error, i, need_sort = 0;
 	u64 lfn = leaf_no;
 
 	do {
@@ -1357,6 +1394,11 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 		brelse(bh);
 	} while(lfn);
 
+	if (*depth < GFS2_DIR_MAX_DEPTH || !sdp->sd_args.ar_loccookie) {
+		need_sort = 1;
+		sort_offset = 0;
+	}
+
 	if (!entries)
 		return 0;
 
@@ -1367,10 +1409,13 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 	 * 99 is the maximum number of entries that can fit in a single
 	 * leaf block.
 	 */
-	larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *));
+	larr = gfs2_alloc_sort_buffer(leaves * sizeof(void *));
 	if (!larr)
 		goto out;
-	darr = (const struct gfs2_dirent **)(larr + leaves);
+	darr = gfs2_alloc_sort_buffer((entries + 99) *
+				      sizeof(struct dirent_cookie));
+	if (!darr)
+		goto out_larr;
 	g.pdent = darr;
 	g.offset = 0;
 	lfn = leaf_no;
@@ -1382,6 +1427,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 		lf = (struct gfs2_leaf *)bh->b_data;
 		lfn = be64_to_cpu(lf->lf_next);
 		if (lf->lf_entries) {
+			offset = g.offset;
 			entries2 += be16_to_cpu(lf->lf_entries);
 			dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
 						gfs2_dirent_gather, NULL, &g);
@@ -1399,17 +1445,37 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 				goto out_free;
 			}
 			error = 0;
+			gfs2_set_cookies(sdp, bh, leaf, &darr[offset],
+					 be16_to_cpu(lf->lf_entries));
+			if (!need_sort &&
+			    (darr[entries2 - 1].cookie & GFS2_USE_HASH_FLAG)) {
+				need_sort = 1;
+				for (i = offset; i < entries2; i++) {
+					if (darr[i].cookie & GFS2_USE_HASH_FLAG)
+						break;
+				}
+				sort_offset = i;
+			}
 			larr[leaf++] = bh;
 		} else {
+			larr[leaf++] = NULL;
 			brelse(bh);
 		}
 	} while(lfn);
 
 	BUG_ON(entries2 != entries);
-	error = do_filldir_main(ip, ctx, darr, entries, copied);
+	if (!need_sort)
+		error = do_filldir_main(ip, ctx, darr, entries, entries,
+					copied);
+	else
+		error = do_filldir_main(ip, ctx, darr, entries, sort_offset,
+					copied);
 out_free:
 	for(i = 0; i < leaf; i++)
-		brelse(larr[i]);
+		if (larr[i])
+			brelse(larr[i]);
+	kvfree(darr);
+out_larr:
 	kvfree(larr);
 out:
 	return error;
@@ -1515,7 +1581,8 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
 	struct gfs2_inode *dip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct dirent_gather g;
-	const struct gfs2_dirent **darr, *dent;
+	struct dirent_cookie *darr;
+	const struct gfs2_dirent *dent;
 	struct buffer_head *dibh;
 	int copied = 0;
 	int error;
@@ -1537,7 +1604,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
 
 	error = -ENOMEM;
 	/* 96 is max number of dirents which can be stuffed into an inode */
-	darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS);
+	darr = kmalloc(96 * sizeof(struct dirent_cookie), GFP_NOFS);
 	if (darr) {
 		g.pdent = darr;
 		g.offset = 0;
@@ -1556,8 +1623,9 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
 			error = -EIO;
 			goto out;
 		}
+		gfs2_set_cookies(sdp, dibh, 0, darr, dip->i_entries);
 		error = do_filldir_main(dip, ctx, darr,
-					dip->i_entries, &copied);
+					dip->i_entries, 0, &copied);
 out:
 		kfree(darr);
 	}
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index e300f74..25cadee 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -559,6 +559,8 @@ struct gfs2_args {
 	unsigned int ar_errors:2;               /* errors=withdraw | panic */
 	unsigned int ar_nobarrier:1;            /* do not send barriers */
 	unsigned int ar_rgrplvb:1;		/* use lvbs for rgrp info */
+	unsigned int ar_loccookie;		/* use location based readdir
+						   cookies */
 	int ar_commit;				/* Commit interval */
 	int ar_statfs_quantum;			/* The fast statfs interval */
 	int ar_quota_quantum;			/* The quota interval */
@@ -686,6 +688,7 @@ struct gfs2_sbd {
 	u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1];
 	u32 sd_max_jheight; /* Max height of journaled file's meta tree */
 	u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1];
+	u32 sd_max_dents_per_leaf; /* Max number of dirents in a leaf block */
 
 	struct gfs2_args sd_args;	/* Mount arguments */
 	struct gfs2_tune sd_tune;	/* Filesystem tuning structure */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1e3a93f..638c6f5 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -352,6 +352,9 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
 	sdp->sd_jheightsize[x] = ~0;
 	gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
 
+	sdp->sd_max_dents_per_leaf = (sdp->sd_sb.sb_bsize -
+				      sizeof(struct gfs2_leaf)) /
+				     GFS2_MIN_DIRENT_SIZE;
 	return 0;
 }
 
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2982445..e194b2b 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -83,6 +83,8 @@ enum {
 	Opt_nobarrier,
 	Opt_rgrplvb,
 	Opt_norgrplvb,
+	Opt_loccookie,
+	Opt_noloccookie,
 	Opt_error,
 };
 
@@ -122,6 +124,8 @@ static const match_table_t tokens = {
 	{Opt_nobarrier, "nobarrier"},
 	{Opt_rgrplvb, "rgrplvb"},
 	{Opt_norgrplvb, "norgrplvb"},
+	{Opt_loccookie, "loccookie"},
+	{Opt_noloccookie, "noloccookie"},
 	{Opt_error, NULL}
 };
 
@@ -278,6 +282,12 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
 		case Opt_norgrplvb:
 			args->ar_rgrplvb = 0;
 			break;
+		case Opt_loccookie:
+			args->ar_loccookie = 1;
+			break;
+		case Opt_noloccookie:
+			args->ar_loccookie = 0;
+			break;
 		case Opt_error:
 		default:
 			pr_warn("invalid mount option: %s\n", o);
@@ -1419,6 +1429,8 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
 		seq_puts(s, ",demote_interface_used");
 	if (args->ar_rgrplvb)
 		seq_puts(s, ",rgrplvb");
+	if (args->ar_loccookie)
+		seq_puts(s, ",loccookie");
 	return 0;
 }
 
diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h
index 1a763ea..54f0025 100644
--- a/include/uapi/linux/gfs2_ondisk.h
+++ b/include/uapi/linux/gfs2_ondisk.h
@@ -297,6 +297,8 @@ struct gfs2_dinode {
 
 #define GFS2_FNAMESIZE		255
 #define GFS2_DIRENT_SIZE(name_len) ((sizeof(struct gfs2_dirent) + (name_len) + 7) & ~7)
+#define GFS2_MIN_DIRENT_SIZE (GFS2_DIRENT_SIZE(1))
+
 
 struct gfs2_dirent {
 	struct gfs2_inum de_inum;
-- 
1.8.3.1



^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2015-12-01 15:01 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-12-01  6:10 [Cluster-devel] [PATCH 0/2] positional readdir cookies Benjamin Marzinski
2015-12-01  6:10 ` [Cluster-devel] [PATCH 1/2] gfs2: keep offset when splitting dir leaf blocks Benjamin Marzinski
2015-12-01  6:10 ` [Cluster-devel] [PATCH 2/2] gfs2: change gfs2 readdir cookie Benjamin Marzinski
2015-12-01 15:01   ` Bob Peterson
  -- strict thread matches above, loose matches on Subject: below --
2015-07-18  4:40 [Cluster-devel] [PATCH 0/2] readdir cookie patches Benjamin Marzinski
2015-07-18  4:40 ` [Cluster-devel] [PATCH 2/2] gfs2: change gfs2 readdir cookie Benjamin Marzinski

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).