From: Bernd Schubert <bernd.schubert@itwm.fraunhofer.de>
To: linux-nfs@vger.kernel.org, linux-ext4@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org, Fan Yong <yong.fan@whamcloud.com>,
bfields@redhat.com, sandeen@redhat.com,
Andreas Dilger <adilger@whamcloud.com>
Subject: [PATCH 5 2/4] Return 32/64-bit dir name hash according to usage type
Date: Mon, 09 Jan 2012 14:21:48 +0100 [thread overview]
Message-ID: <20120109132148.2616029.68798.stgit@localhost.localdomain> (raw)
In-Reply-To: <20120109132137.2616029.76288.stgit@localhost.localdomain>
From: Fan Yong <yong.fan@whamcloud.com>
Traditionally ext2/3/4 has returned a 32-bit hash value from llseek()
to appease NFSv2, which can only handle a 32-bit cookie for seekdir()
and telldir(). However, this causes problems if there are 32-bit hash
collisions, since the NFSv2 server can get stuck resending the same
entries from the directory repeatedly.
Allow ext4 to return a full 64-bit hash (both major and minor) for
telldir to decrease the chance of hash collisions. This still needs
integration on the NFS side.
Patch-updated-by: Bernd Schubert <bernd.schubert@itwm.fraunhofer.de>
(blame me if something is not correct)
Signed-off-by: Fan Yong <yong.fan@whamcloud.com>
Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Signed-off-by: Bernd Schubert <bernd.schubert@itwm.fraunhofer.de>
---
fs/ext4/dir.c | 185 ++++++++++++++++++++++++++++++++++++++++++++------------
fs/ext4/ext4.h | 6 ++
fs/ext4/hash.c | 4 +
3 files changed, 154 insertions(+), 41 deletions(-)
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 164c560..cee09f2 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -32,24 +32,8 @@ static unsigned char ext4_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};
-static int ext4_readdir(struct file *, void *, filldir_t);
static int ext4_dx_readdir(struct file *filp,
void *dirent, filldir_t filldir);
-static int ext4_release_dir(struct inode *inode,
- struct file *filp);
-
-const struct file_operations ext4_dir_operations = {
- .llseek = ext4_llseek,
- .read = generic_read_dir,
- .readdir = ext4_readdir, /* we take BKL. needed?*/
- .unlocked_ioctl = ext4_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = ext4_compat_ioctl,
-#endif
- .fsync = ext4_sync_file,
- .release = ext4_release_dir,
-};
-
static unsigned char get_dtype(struct super_block *sb, int filetype)
{
@@ -254,22 +238,134 @@ out:
return ret;
}
+/*
+ * Non-zero when directory offsets have to fit a 32-bit API. With
+ * CONFIG_COMPAT the answer is whatever is_compat_task() reports;
+ * without it the answer is fixed at build time by BITS_PER_LONG.
+ */
+static inline int is_32bit_api(void)
+{
+#ifndef CONFIG_COMPAT
+	return (BITS_PER_LONG == 32);
+#else
+	return is_compat_task();
+#endif
+}
+
/*
* These functions convert from the major/minor hash to an f_pos
- * value.
+ * value for dx directories
*
- * Currently we only use major hash numer. This is unfortunate, but
- * on 32-bit machines, the same VFS interface is used for lseek and
- * llseek, so if we use the 64 bit offset, then the 32-bit versions of
- * lseek/telldir/seekdir will blow out spectacularly, and from within
- * the ext2 low-level routine, we don't know if we're being called by
- * a 64-bit version of the system call or the 32-bit version of the
- * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
- * cookie. Sigh.
+ * Upper layers (for example NFS) should set FMODE_32BITHASH or
+ * FMODE_64BITHASH in filp->f_mode explicitly. ext4 can also be used
+ * directly on both 32-bit and 64-bit nodes; in that case neither flag is
+ * set and the hash width follows the caller (see is_32bit_api()).
*/
-#define hash2pos(major, minor) (major >> 1)
-#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
-#define pos2min_hash(pos) (0)
+/*
+ * Encode a (major, minor) hash pair as an f_pos value.
+ *
+ * For a 32-bit cookie only the major hash is used, shifted right by one
+ * bit so the result is at most 0x7fffffff. Otherwise the shifted major
+ * hash is placed in the upper 32 bits and the minor hash in the lower
+ * 32 bits.
+ *
+ * The FMODE_32BITHASH/FMODE_64BITHASH bits are tested in filp->f_mode;
+ * filp->f_flags carries the O_* open flags and must not be used here.
+ */
+static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
+{
+	if ((filp->f_mode & FMODE_32BITHASH) ||
+	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+		return major >> 1;
+	else
+		return ((__u64)(major >> 1) << 32) | (__u64)minor;
+}
+
+/*
+ * Extract the major hash from an f_pos value.
+ *
+ * For a 32-bit cookie the whole position is the major hash shifted right
+ * by one, so it is shifted back left. Otherwise the major hash is taken
+ * from the upper 32 bits of @pos and shifted back left by one.
+ *
+ * Both flag tests use filp->f_mode, where the FMODE_* bits are kept;
+ * filp->f_flags holds the O_* open flags.
+ */
+static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
+{
+	if ((filp->f_mode & FMODE_32BITHASH) ||
+	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+		return (pos << 1) & 0xffffffff;
+	else
+		return ((pos >> 32) << 1) & 0xffffffff;
+}
+
+/*
+ * Extract the minor hash from an f_pos value.
+ *
+ * A 32-bit cookie has no room for the minor hash, so 0 is returned.
+ * Otherwise the minor hash is the lower 32 bits of @pos.
+ *
+ * The FMODE_* hash bits are tested in filp->f_mode; filp->f_flags holds
+ * the O_* open flags.
+ */
+static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
+{
+	if ((filp->f_mode & FMODE_32BITHASH) ||
+	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+		return 0;
+	else
+		return pos & 0xffffffff;
+}
+
+/*
+ * End-of-directory f_pos value for dx directories.
+ *
+ * The 64-bit marker is used unless FMODE_32BITHASH is set in f_mode, or
+ * neither hash flag is set and is_32bit_api() says the caller is 32-bit.
+ */
+static inline loff_t ext4_get_htree_eof(struct file *filp)
+{
+	if (!(filp->f_mode & FMODE_32BITHASH) &&
+	    ((filp->f_mode & FMODE_64BITHASH) || !is_32bit_api()))
+		return EXT4_HTREE_EOF_64BIT;
+
+	return EXT4_HTREE_EOF_32BIT;
+}
+
+
+/*
+ * ext4_dir_llseek() is based on generic_file_llseek() and handles both
+ * non-htree and htree directories; for the latter the "offset" is in
+ * terms of the filename hash value instead of the byte offset.
+ *
+ * @file:   open directory being repositioned
+ * @offset: requested position, interpreted according to @origin
+ * @origin: SEEK_END and SEEK_CUR are handled explicitly; for any other
+ *          value @offset is used unchanged as the new position
+ *
+ * Returns the resulting position, or -EINVAL when SEEK_END is given a
+ * positive offset, when the result would be negative, or when it exceeds
+ * the limit (s_maxbytes for non-htree directories, the value of
+ * ext4_get_htree_eof() for dx directories).
+ *
+ * The checks and the f_pos update run under inode->i_mutex; the dx flag
+ * itself is sampled before the mutex is taken.
+ *
+ * NOTE: offsets obtained *before* ext4_set_inode_flag(dir, EXT4_INODE_INDEX)
+ * will be invalid once the directory has been converted into a dx directory
+ */
+loff_t ext4_dir_llseek(struct file *file, loff_t offset, int origin)
+{
+ struct inode *inode = file->f_mapping->host;
+ loff_t ret = -EINVAL; /* returned unchanged on every out_err path */
+ int is_dx_dir = ext4_test_inode_flag(inode, EXT4_INODE_INDEX);
+
+ mutex_lock(&inode->i_mutex);
+
+ /* NOTE: relative offsets with dx directories might not work
+ * as expected, as it is difficult to figure out the
+ * correct offset between dx hashes */
+
+ switch (origin) {
+ case SEEK_END:
+ if (unlikely(offset > 0))
+ goto out_err; /* not supported for directories */
+
+ /* so only negative offsets are left, does that have a
+ * meaning for directories at all?
+ * The base is the htree EOF marker for dx directories and
+ * i_size otherwise. */
+ if (is_dx_dir)
+ offset += ext4_get_htree_eof(file);
+ else
+ offset += inode->i_size;
+ break;
+ case SEEK_CUR:
+ /*
+ * Here we special-case the lseek(fd, 0, SEEK_CUR)
+ * position-querying operation. Avoid rewriting the "same"
+ * f_pos value back to the file because a concurrent read(),
+ * write() or lseek() might have altered it
+ */
+ if (offset == 0) {
+ offset = file->f_pos;
+ goto out_ok; /* skips the range checks and the f_pos store */
+ }
+
+ offset += file->f_pos;
+ break;
+ }
+
+ if (unlikely(offset < 0))
+ goto out_err;
+
+ if (!is_dx_dir) {
+ if (offset > inode->i_sb->s_maxbytes)
+ goto out_err;
+ } else if (offset > ext4_get_htree_eof(file))
+ goto out_err;
+
+ /* Special lock needed here?
+ * f_pos and f_version are only written when the position changes. */
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ file->f_version = 0;
+ }
+
+out_ok:
+ ret = offset;
+out_err:
+ mutex_unlock(&inode->i_mutex);
+
+ return ret;
+}
/*
* This structure holds the nodes of the red-black tree used to store
@@ -330,15 +426,16 @@ static void free_rb_tree_fname(struct rb_root *root)
}
-static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
+static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
+ loff_t pos)
{
struct dir_private_info *p;
p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
if (!p)
return NULL;
- p->curr_hash = pos2maj_hash(pos);
- p->curr_minor_hash = pos2min_hash(pos);
+ p->curr_hash = pos2maj_hash(filp, pos);
+ p->curr_minor_hash = pos2min_hash(filp, pos);
return p;
}
@@ -429,7 +526,7 @@ static int call_filldir(struct file *filp, void *dirent,
"null fname?!?\n");
return 0;
}
- curr_pos = hash2pos(fname->hash, fname->minor_hash);
+ curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
while (fname) {
error = filldir(dirent, fname->name,
fname->name_len, curr_pos,
@@ -454,13 +551,13 @@ static int ext4_dx_readdir(struct file *filp,
int ret;
if (!info) {
- info = ext4_htree_create_dir_info(filp->f_pos);
+ info = ext4_htree_create_dir_info(filp, filp->f_pos);
if (!info)
return -ENOMEM;
filp->private_data = info;
}
- if (filp->f_pos == EXT4_HTREE_EOF)
+ if (filp->f_pos == ext4_get_htree_eof(filp))
return 0; /* EOF */
/* Some one has messed with f_pos; reset the world */
@@ -468,8 +565,8 @@ static int ext4_dx_readdir(struct file *filp,
free_rb_tree_fname(&info->root);
info->curr_node = NULL;
info->extra_fname = NULL;
- info->curr_hash = pos2maj_hash(filp->f_pos);
- info->curr_minor_hash = pos2min_hash(filp->f_pos);
+ info->curr_hash = pos2maj_hash(filp, filp->f_pos);
+ info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
}
/*
@@ -501,7 +598,7 @@ static int ext4_dx_readdir(struct file *filp,
if (ret < 0)
return ret;
if (ret == 0) {
- filp->f_pos = EXT4_HTREE_EOF;
+ filp->f_pos = ext4_get_htree_eof(filp);
break;
}
info->curr_node = rb_first(&info->root);
@@ -521,7 +618,7 @@ static int ext4_dx_readdir(struct file *filp,
info->curr_minor_hash = fname->minor_hash;
} else {
if (info->next_hash == ~0) {
- filp->f_pos = EXT4_HTREE_EOF;
+ filp->f_pos = ext4_get_htree_eof(filp);
break;
}
info->curr_hash = info->next_hash;
@@ -540,3 +637,15 @@ static int ext4_release_dir(struct inode *inode, struct file *filp)
return 0;
}
+
+/*
+ * File operations for ext4 directories. Seeking goes through
+ * ext4_dir_llseek() rather than ext4_llseek().
+ */
+const struct file_operations ext4_dir_operations = {
+	.llseek		= ext4_dir_llseek,
+	.read		= generic_read_dir,
+	.readdir	= ext4_readdir,
+	.unlocked_ioctl	= ext4_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ext4_compat_ioctl,
+#endif
+	.fsync		= ext4_sync_file,
+	.release	= ext4_release_dir,
+};
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1554b15..d3fe1ea 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1599,7 +1599,11 @@ struct dx_hash_info
u32 *seed;
};
-#define EXT4_HTREE_EOF 0x7fffffff
+
+/*
+ * 32 and 64 bit signed EOF for dx directories: the largest positive
+ * value of a signed 32-bit offset (2^31 - 1) and of a signed 64-bit
+ * offset (2^63 - 1). ext4_get_htree_eof() in dir.c selects one of them
+ * per open file.
+ */
+#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1)
+#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1)
+
/*
* Control parameters used by ext4_htree_next_block
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index ac8f168..fa8e491 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -200,8 +200,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
return -1;
}
hash = hash & ~1;
- if (hash == (EXT4_HTREE_EOF << 1))
- hash = (EXT4_HTREE_EOF-1) << 1;
+ if (hash == (EXT4_HTREE_EOF_32BIT << 1))
+ hash = (EXT4_HTREE_EOF_32BIT - 1) << 1;
hinfo->hash = hash;
hinfo->minor_hash = minor_hash;
return 0;
next prev parent reply other threads:[~2012-01-09 13:21 UTC|newest]
Thread overview: 47+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-01-09 13:21 [PATCH 0/4] [RESEND] 32/64 bit llseek hashes (v5) Bernd Schubert
2012-01-09 13:21 ` [PATCH 5 1/4] Add new FMODE flags: FMODE_32bithash and FMODE_64bithash Bernd Schubert
2012-01-09 13:21 ` Bernd Schubert [this message]
2012-03-05 15:59 ` [PATCH 5 2/4] Return 32/64-bit dir name hash according to usage type Ted Ts'o
[not found] ` <20120305155939.GE21356-AKGzg7BKzIDYtjvyW6yDsg@public.gmane.org>
2012-03-06 0:40 ` Bernd Schubert
2012-03-06 2:28 ` Ted Ts'o
[not found] ` <20120306022838.GA24323-AKGzg7BKzIDYtjvyW6yDsg@public.gmane.org>
2012-03-06 9:59 ` Bernd Schubert
[not found] ` <4F55E01B.3060105-mPn0NPGs4xGatNDF+KUbs4QuADTiUCJX@public.gmane.org>
2012-03-06 15:15 ` Ted Ts'o
[not found] ` <20120306151543.GA32282-AKGzg7BKzIDYtjvyW6yDsg@public.gmane.org>
2012-03-07 9:01 ` Bernd Schubert
[not found] ` <20120109132148.2616029.68798.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2012-04-20 20:04 ` Eric Sandeen
[not found] ` <4F91C15B.6070200-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2012-04-22 12:51 ` Bernd Schubert
2012-04-23 20:37 ` Eric Sandeen
[not found] ` <4F95BD72.6090200-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2012-04-23 20:52 ` Bernd Schubert
2012-04-23 21:22 ` Eric Sandeen
[not found] ` <4F95C109.1030401-mPn0NPGs4xGatNDF+KUbs4QuADTiUCJX@public.gmane.org>
2012-04-23 22:23 ` Eric Sandeen
2012-04-23 22:42 ` Andreas Dilger
[not found] ` <A754D23B-B946-4E80-ACEA-0E2C2E6FAA2E-KloliPT79xf2eFz/2MeuCQ@public.gmane.org>
2012-04-24 16:10 ` Bernd Schubert
2012-01-09 13:21 ` [PATCH 5 3/4] nfsd_open(): rename 'int access' to 'int may_flags' in nfsd_open() Bernd Schubert
2012-01-09 13:21 ` [PATCH 5 4/4] nfsd: vfs_llseek() with 32 or 64 bit offsets (hashes) Bernd Schubert
[not found] ` <20120109132158.2616029.30467.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
[not found] ` <20120109132153.2616029.26302.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2012-03-06 0:08 ` [PATCH 5 3/4] nfsd_open(): rename 'int access' to 'int may_flags' in nfsd_open() Ted Ts'o
[not found] ` <20120306000837.GA17164-AKGzg7BKzIDYtjvyW6yDsg@public.gmane.org>
2012-03-06 2:08 ` J. Bruce Fields
2012-03-06 15:18 ` Ted Ts'o
2012-03-06 15:28 ` J. Bruce Fields
2012-03-09 20:51 ` Ted Ts'o
[not found] ` <20120309205148.GB5635-AKGzg7BKzIDYtjvyW6yDsg@public.gmane.org>
2012-03-12 15:09 ` Ted Ts'o
[not found] ` <20120312150912.GB12440-AKGzg7BKzIDYtjvyW6yDsg@public.gmane.org>
2012-03-12 15:49 ` J. Bruce Fields
2012-03-12 22:22 ` J. Bruce Fields
2012-03-13 20:01 ` J. Bruce Fields
[not found] ` <20120313200117.GA21991-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>
2012-03-13 20:03 ` Bernd Schubert
[not found] ` <4F5FA827.8020606-mPn0NPGs4xGatNDF+KUbs4QuADTiUCJX@public.gmane.org>
2012-03-13 20:34 ` J. Bruce Fields
2012-03-13 21:09 ` Bernd Schubert
2012-03-13 21:29 ` J. Bruce Fields
[not found] ` <20120313212947.GK31995-spRCxval1Z7TsXDwO4sDpg@public.gmane.org>
2012-03-14 14:32 ` Bernd Schubert
[not found] ` <4F60AC0D.9020204-mPn0NPGs4xGatNDF+KUbs4QuADTiUCJX@public.gmane.org>
2012-03-14 16:05 ` J. Bruce Fields
[not found] ` <20120314160529.GB31194-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>
2012-03-16 21:22 ` Bernd Schubert
2012-03-19 2:54 ` Ted Ts'o
[not found] ` <20120319025455.GD31682-AKGzg7BKzIDYtjvyW6yDsg@public.gmane.org>
2012-03-19 20:00 ` J. Bruce Fields
[not found] ` <20120319200041.GA25161-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>
2012-03-20 0:10 ` Ted Ts'o
2012-04-12 20:49 ` J. Bruce Fields
[not found] ` <20120412204948.GE6667-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>
2012-04-12 21:22 ` Bernd Schubert
[not found] ` <4F8747A1.8060800-97jfqw80gc6171pxa8y+qA@public.gmane.org>
2012-04-12 21:25 ` J. Bruce Fields
[not found] ` <20120313203446.GB21991-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>
2012-03-13 21:10 ` Ted Ts'o
[not found] ` <20120313211009.GA11969-AKGzg7BKzIDYtjvyW6yDsg@public.gmane.org>
2012-03-13 21:27 ` J. Bruce Fields
2012-01-10 11:27 ` [PATCH 0/4] [RESEND] 32/64 bit llseek hashes (v5) Andreas Dilger
2012-01-11 14:48 ` J. Bruce Fields
[not found] ` <20120111144827.GA32381-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>
2012-01-11 15:31 ` Ted Ts'o
2012-03-05 12:23 ` Bernd Schubert
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20120109132148.2616029.68798.stgit@localhost.localdomain \
--to=bernd.schubert@itwm.fraunhofer.de \
--cc=adilger@whamcloud.com \
--cc=bfields@redhat.com \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-nfs@vger.kernel.org \
--cc=sandeen@redhat.com \
--cc=yong.fan@whamcloud.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).