linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 09/35] whiteout: Split of ext2_append_link() from ext2_add_link()
       [not found]               ` <1271372682-21225-9-git-send-email-vaurora@redhat.com>
@ 2010-04-15 23:04                 ` Valerie Aurora
  2010-04-15 23:04                   ` [PATCH 10/35] whiteout: ext2 whiteout support Valerie Aurora
  0 siblings, 1 reply; 22+ messages in thread
From: Valerie Aurora @ 2010-04-15 23:04 UTC (permalink / raw)
  To: Alexander Viro
  Cc: linux-kernel, linux-fsdevel, Jan Blunck, Valerie Aurora,
	Theodore Tso, linux-ext4

From: Jan Blunck <jblunck@suse.de>

The ext2_append_link() is later used to find or append a directory
entry to whiteout.

Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
Cc: Theodore Tso <tytso@mit.edu>
Cc: linux-ext4@vger.kernel.org
---
 fs/ext2/dir.c |   70 ++++++++++++++++++++++++++++++++++++++++----------------
 1 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 7516957..57207a9 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -472,9 +472,10 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
 }
 
 /*
- *	Parent is locked.
+ * Find or append a given dentry to the parent directory
  */
-int ext2_add_link (struct dentry *dentry, struct inode *inode)
+static ext2_dirent * ext2_append_entry(struct dentry * dentry,
+				       struct page ** page)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
 	const char *name = dentry->d_name.name;
@@ -482,13 +483,10 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
 	unsigned chunk_size = ext2_chunk_size(dir);
 	unsigned reclen = EXT2_DIR_REC_LEN(namelen);
 	unsigned short rec_len, name_len;
-	struct page *page = NULL;
-	ext2_dirent * de;
+	ext2_dirent * de = NULL;
 	unsigned long npages = dir_pages(dir);
 	unsigned long n;
 	char *kaddr;
-	loff_t pos;
-	int err;
 
 	/*
 	 * We take care of directory expansion in the same loop.
@@ -498,20 +496,19 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
 	for (n = 0; n <= npages; n++) {
 		char *dir_end;
 
-		page = ext2_get_page(dir, n, 0);
-		err = PTR_ERR(page);
-		if (IS_ERR(page))
+		*page = ext2_get_page(dir, n, 0);
+		de = ERR_PTR(PTR_ERR(*page));
+		if (IS_ERR(*page))
 			goto out;
-		lock_page(page);
-		kaddr = page_address(page);
+		lock_page(*page);
+		kaddr = page_address(*page);
 		dir_end = kaddr + ext2_last_byte(dir, n);
 		de = (ext2_dirent *)kaddr;
 		kaddr += PAGE_CACHE_SIZE - reclen;
 		while ((char *)de <= kaddr) {
 			if ((char *)de == dir_end) {
 				/* We hit i_size */
-				name_len = 0;
-				rec_len = chunk_size;
+				de->name_len = 0;
 				de->rec_len = ext2_rec_len_to_disk(chunk_size);
 				de->inode = 0;
 				goto got_it;
@@ -519,12 +516,11 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
 			if (de->rec_len == 0) {
 				ext2_error(dir->i_sb, __func__,
 					"zero-length directory entry");
-				err = -EIO;
+				de = ERR_PTR(-EIO);
 				goto out_unlock;
 			}
-			err = -EEXIST;
 			if (ext2_match (namelen, name, de))
-				goto out_unlock;
+				goto got_it;
 			name_len = EXT2_DIR_REC_LEN(de->name_len);
 			rec_len = ext2_rec_len_from_disk(de->rec_len);
 			if (!de->inode && rec_len >= reclen)
@@ -533,13 +529,48 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
 				goto got_it;
 			de = (ext2_dirent *) ((char *) de + rec_len);
 		}
-		unlock_page(page);
-		ext2_put_page(page);
+		unlock_page(*page);
+		ext2_put_page(*page);
 	}
+
 	BUG();
-	return -EINVAL;
 
 got_it:
+	return de;
+	/* OFFSET_CACHE */
+out_unlock:
+	unlock_page(*page);
+	ext2_put_page(*page);
+out:
+	return de;
+}
+
+/*
+ *	Parent is locked.
+ */
+int ext2_add_link (struct dentry *dentry, struct inode *inode)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	unsigned short rec_len, name_len;
+	ext2_dirent * de;
+	struct page *page;
+	loff_t pos;
+	int err;
+
+	de = ext2_append_entry(dentry, &page);
+	if (IS_ERR(de))
+		return PTR_ERR(de);
+
+	err = -EEXIST;
+	if (ext2_match (namelen, name, de))
+		goto out_unlock;
+
+got_it:
+	name_len = EXT2_DIR_REC_LEN(de->name_len);
+	rec_len = ext2_rec_len_from_disk(de->rec_len);
+
 	pos = page_offset(page) +
 		(char*)de - (char*)page_address(page);
 	err = __ext2_write_begin(NULL, page->mapping, pos, rec_len, 0,
@@ -563,7 +594,6 @@ got_it:
 	/* OFFSET_CACHE */
 out_put:
 	ext2_put_page(page);
-out:
 	return err;
 out_unlock:
 	unlock_page(page);
-- 
1.6.3.3

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 10/35] whiteout: ext2 whiteout support
  2010-04-15 23:04                 ` [PATCH 09/35] whiteout: Split of ext2_append_link() from ext2_add_link() Valerie Aurora
@ 2010-04-15 23:04                   ` Valerie Aurora
       [not found]                     ` <1271372682-21225-12-git-send-email-vaurora@redhat.com>
  0 siblings, 1 reply; 22+ messages in thread
From: Valerie Aurora @ 2010-04-15 23:04 UTC (permalink / raw)
  To: Alexander Viro
  Cc: linux-kernel, linux-fsdevel, Jan Blunck, Valerie Aurora,
	Theodore Tso, linux-ext4

From: Jan Blunck <jblunck@suse.de>

This patch adds whiteout support to EXT2. A whiteout is an empty directory
entry (inode == 0) with the file type set to EXT2_FT_WHT. Therefore it
allocates space in directories. Due to being implemented as a filetype it is
necessary to have the EXT2_FEATURE_INCOMPAT_FILETYPE flag set.

XXX - Whiteouts could be implemented as special symbolic links

Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
Cc: Theodore Tso <tytso@mit.edu>
Cc: linux-ext4@vger.kernel.org
---
 fs/ext2/dir.c           |   96 +++++++++++++++++++++++++++++++++++++++++++++--
 fs/ext2/ext2.h          |    3 +
 fs/ext2/inode.c         |   11 ++++-
 fs/ext2/namei.c         |   67 +++++++++++++++++++++++++++++++-
 fs/ext2/super.c         |    6 +++
 include/linux/ext2_fs.h |    4 ++
 6 files changed, 177 insertions(+), 10 deletions(-)

diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 57207a9..030bd46 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -219,7 +219,7 @@ static inline int ext2_match (int len, const char * const name,
 {
 	if (len != de->name_len)
 		return 0;
-	if (!de->inode)
+	if (!de->inode && (de->file_type != EXT2_FT_WHT))
 		return 0;
 	return !memcmp(name, de->name, len);
 }
@@ -255,6 +255,7 @@ static unsigned char ext2_filetype_table[EXT2_FT_MAX] = {
 	[EXT2_FT_FIFO]		= DT_FIFO,
 	[EXT2_FT_SOCK]		= DT_SOCK,
 	[EXT2_FT_SYMLINK]	= DT_LNK,
+	[EXT2_FT_WHT]		= DT_WHT,
 };
 
 #define S_SHIFT 12
@@ -448,6 +449,26 @@ ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child)
 	return res;
 }
 
+/* Special version for filetype based whiteout support */
+ino_t ext2_inode_by_dentry(struct inode *dir, struct dentry *dentry)
+{
+	ino_t res = 0;
+	struct ext2_dir_entry_2 *de;
+	struct page *page;
+
+	de = ext2_find_entry (dir, &dentry->d_name, &page);
+	if (de) {
+		res = le32_to_cpu(de->inode);
+		if (!res && de->file_type == EXT2_FT_WHT) {
+			spin_lock(&dentry->d_lock);
+			dentry->d_flags |= DCACHE_WHITEOUT;
+			spin_unlock(&dentry->d_lock);
+		}
+		ext2_put_page(page);
+	}
+	return res;
+}
+
 /* Releases the page */
 void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
 		   struct page *page, struct inode *inode, int update_times)
@@ -523,7 +544,8 @@ static ext2_dirent * ext2_append_entry(struct dentry * dentry,
 				goto got_it;
 			name_len = EXT2_DIR_REC_LEN(de->name_len);
 			rec_len = ext2_rec_len_from_disk(de->rec_len);
-			if (!de->inode && rec_len >= reclen)
+			if (!de->inode && (de->file_type != EXT2_FT_WHT) &&
+			    (rec_len >= reclen))
 				goto got_it;
 			if (rec_len >= name_len + reclen)
 				goto got_it;
@@ -564,8 +586,11 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
 		return PTR_ERR(de);
 
 	err = -EEXIST;
-	if (ext2_match (namelen, name, de))
+	if (ext2_match (namelen, name, de)) {
+		if (de->file_type == EXT2_FT_WHT)
+			goto got_it;
 		goto out_unlock;
+	}
 
 got_it:
 	name_len = EXT2_DIR_REC_LEN(de->name_len);
@@ -577,7 +602,8 @@ got_it:
 							&page, NULL);
 	if (err)
 		goto out_unlock;
-	if (de->inode) {
+	if (de->inode || ((de->file_type == EXT2_FT_WHT) &&
+			  !ext2_match (namelen, name, de))) {
 		ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len);
 		de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len);
 		de->rec_len = ext2_rec_len_to_disk(name_len);
@@ -646,6 +672,68 @@ out:
 	return err;
 }
 
+int ext2_whiteout_entry (struct inode * dir, struct dentry * dentry,
+			 struct ext2_dir_entry_2 * de, struct page * page)
+{
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	unsigned short rec_len, name_len;
+	loff_t pos;
+	int err;
+
+	if (!de) {
+		de = ext2_append_entry(dentry, &page);
+		BUG_ON(!de);
+	}
+
+	err = -EEXIST;
+	if (ext2_match (namelen, name, de) &&
+	    (de->file_type == EXT2_FT_WHT)) {
+		ext2_error(dir->i_sb, __func__,
+			   "entry is already a whiteout in directory #%lu",
+			   dir->i_ino);
+		goto out_unlock;
+	}
+
+	name_len = EXT2_DIR_REC_LEN(de->name_len);
+	rec_len = ext2_rec_len_from_disk(de->rec_len);
+
+	pos = page_offset(page) +
+		(char*)de - (char*)page_address(page);
+	err = __ext2_write_begin(NULL, page->mapping, pos, rec_len, 0,
+							&page, NULL);
+	if (err)
+		goto out_unlock;
+	/*
+	 * We whiteout an existing entry. Do what ext2_delete_entry() would do,
+	 * except that we don't need to merge with the previous entry since
+	 * we are going to reuse it.
+	 */
+	if (ext2_match (namelen, name, de))
+		de->inode = 0;
+	if (de->inode || (de->file_type == EXT2_FT_WHT)) {
+		ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len);
+		de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len);
+		de->rec_len = ext2_rec_len_to_disk(name_len);
+		de = de1;
+	}
+	de->name_len = namelen;
+	memcpy(de->name, name, namelen);
+	de->inode = 0;
+	de->file_type = EXT2_FT_WHT;
+	err = ext2_commit_chunk(page, pos, rec_len);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
+	mark_inode_dirty(dir);
+	/* OFFSET_CACHE */
+out_put:
+	ext2_put_page(page);
+	return err;
+out_unlock:
+	unlock_page(page);
+	goto out_put;
+}
+
 /*
  * Set the first fragment of directory.
  */
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 0b038e4..44d190c 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -102,9 +102,12 @@ extern void ext2_rsv_window_add(struct super_block *sb, struct ext2_reserve_wind
 /* dir.c */
 extern int ext2_add_link (struct dentry *, struct inode *);
 extern ino_t ext2_inode_by_name(struct inode *, struct qstr *);
+extern ino_t ext2_inode_by_dentry(struct inode *, struct dentry *);
 extern int ext2_make_empty(struct inode *, struct inode *);
 extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct qstr *, struct page **);
 extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *);
+extern int ext2_whiteout_entry (struct inode *, struct dentry *,
+				struct ext2_dir_entry_2 *, struct page *);
 extern int ext2_empty_dir (struct inode *);
 extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
 extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index fc13cc1..5ad2cbb 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1184,7 +1184,8 @@ void ext2_set_inode_flags(struct inode *inode)
 {
 	unsigned int flags = EXT2_I(inode)->i_flags;
 
-	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|
+			    S_OPAQUE);
 	if (flags & EXT2_SYNC_FL)
 		inode->i_flags |= S_SYNC;
 	if (flags & EXT2_APPEND_FL)
@@ -1195,6 +1196,8 @@ void ext2_set_inode_flags(struct inode *inode)
 		inode->i_flags |= S_NOATIME;
 	if (flags & EXT2_DIRSYNC_FL)
 		inode->i_flags |= S_DIRSYNC;
+	if (flags & EXT2_OPAQUE_FL)
+		inode->i_flags |= S_OPAQUE;
 }
 
 /* Propagate flags from i_flags to EXT2_I(inode)->i_flags */
@@ -1202,8 +1205,8 @@ void ext2_get_inode_flags(struct ext2_inode_info *ei)
 {
 	unsigned int flags = ei->vfs_inode.i_flags;
 
-	ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL|
-			EXT2_IMMUTABLE_FL|EXT2_NOATIME_FL|EXT2_DIRSYNC_FL);
+	ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL|EXT2_IMMUTABLE_FL|
+			 EXT2_NOATIME_FL|EXT2_DIRSYNC_FL|EXT2_OPAQUE_FL);
 	if (flags & S_SYNC)
 		ei->i_flags |= EXT2_SYNC_FL;
 	if (flags & S_APPEND)
@@ -1214,6 +1217,8 @@ void ext2_get_inode_flags(struct ext2_inode_info *ei)
 		ei->i_flags |= EXT2_NOATIME_FL;
 	if (flags & S_DIRSYNC)
 		ei->i_flags |= EXT2_DIRSYNC_FL;
+	if (flags & S_OPAQUE)
+		ei->i_flags |= EXT2_OPAQUE_FL;
 }
 
 struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 71efb0e..12195a5 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -55,15 +55,16 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
  * Methods themselves.
  */
 
-static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
+static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry,
+				  struct nameidata *nd)
 {
 	struct inode * inode;
 	ino_t ino;
-	
+
 	if (dentry->d_name.len > EXT2_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	ino = ext2_inode_by_name(dir, &dentry->d_name);
+	ino = ext2_inode_by_dentry(dir, dentry);
 	inode = NULL;
 	if (ino) {
 		inode = ext2_iget(dir->i_sb, ino);
@@ -242,6 +243,10 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 	else
 		inode->i_mapping->a_ops = &ext2_aops;
 
+	/* if we call mkdir on a whiteout create an opaque directory */
+	if (dentry->d_flags & DCACHE_WHITEOUT)
+		inode->i_flags |= S_OPAQUE;
+
 	inode_inc_link_count(inode);
 
 	err = ext2_make_empty(inode, dir);
@@ -307,6 +312,61 @@ static int ext2_rmdir (struct inode * dir, struct dentry *dentry)
 	return err;
 }
 
+/*
+ * Create a whiteout for the dentry
+ */
+static int ext2_whiteout(struct inode *dir, struct dentry *dentry,
+			 struct dentry *new_dentry)
+{
+	struct inode * inode = dentry->d_inode;
+	struct ext2_dir_entry_2 * de = NULL;
+	struct page * page;
+	int err = -ENOTEMPTY;
+
+	if (!EXT2_HAS_INCOMPAT_FEATURE(dir->i_sb,
+				       EXT2_FEATURE_INCOMPAT_FILETYPE)) {
+		ext2_error (dir->i_sb, "ext2_whiteout",
+			    "can't set whiteout filetype");
+		err = -EPERM;
+		goto out;
+	}
+
+	dquot_initialize(dir);
+
+	if (inode) {
+		if (S_ISDIR(inode->i_mode) && !ext2_empty_dir(inode))
+			goto out;
+
+		err = -ENOENT;
+		de = ext2_find_entry (dir, &dentry->d_name, &page);
+		if (!de)
+			goto out;
+		lock_page(page);
+	}
+
+	err = ext2_whiteout_entry (dir, dentry, de, page);
+	if (err)
+		goto out;
+
+	spin_lock(&new_dentry->d_lock);
+	new_dentry->d_flags |= DCACHE_WHITEOUT;
+	spin_unlock(&new_dentry->d_lock);
+	d_add(new_dentry, NULL);
+
+	if (inode) {
+		inode->i_ctime = dir->i_ctime;
+		inode_dec_link_count(inode);
+		if (S_ISDIR(inode->i_mode)) {
+			inode->i_size = 0;
+			inode_dec_link_count(inode);
+			inode_dec_link_count(dir);
+		}
+	}
+	err = 0;
+out:
+	return err;
+}
+
 static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 	struct inode * new_dir,	struct dentry * new_dentry )
 {
@@ -409,6 +469,7 @@ const struct inode_operations ext2_dir_inode_operations = {
 	.mkdir		= ext2_mkdir,
 	.rmdir		= ext2_rmdir,
 	.mknod		= ext2_mknod,
+	.whiteout	= ext2_whiteout,
 	.rename		= ext2_rename,
 #ifdef CONFIG_EXT2_FS_XATTR
 	.setxattr	= generic_setxattr,
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 42e4a30..000ee17 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1079,6 +1079,12 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
 		ext2_msg(sb, KERN_WARNING,
 			"warning: mounting ext3 filesystem as ext2");
+	/*
+	 * Whiteouts (and fallthrus) require explicit whiteout support.
+	 */
+	if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_WHITEOUT))
+		sb->s_flags |= MS_WHITEOUT;
+
 	ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY);
 	return 0;
 
diff --git a/include/linux/ext2_fs.h b/include/linux/ext2_fs.h
index 2dfa707..20468bd 100644
--- a/include/linux/ext2_fs.h
+++ b/include/linux/ext2_fs.h
@@ -189,6 +189,7 @@ struct ext2_group_desc
 #define EXT2_NOTAIL_FL			FS_NOTAIL_FL	/* file tail should not be merged */
 #define EXT2_DIRSYNC_FL			FS_DIRSYNC_FL	/* dirsync behaviour (directories only) */
 #define EXT2_TOPDIR_FL			FS_TOPDIR_FL	/* Top of directory hierarchies*/
+#define EXT2_OPAQUE_FL			0x00040000
 #define EXT2_RESERVED_FL		FS_RESERVED_FL	/* reserved for ext2 lib */
 
 #define EXT2_FL_USER_VISIBLE		FS_FL_USER_VISIBLE	/* User visible flags */
@@ -503,10 +504,12 @@ struct ext2_super_block {
 #define EXT3_FEATURE_INCOMPAT_RECOVER		0x0004
 #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV	0x0008
 #define EXT2_FEATURE_INCOMPAT_META_BG		0x0010
+#define EXT2_FEATURE_INCOMPAT_WHITEOUT		0x0020
 #define EXT2_FEATURE_INCOMPAT_ANY		0xffffffff
 
 #define EXT2_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
 #define EXT2_FEATURE_INCOMPAT_SUPP	(EXT2_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT2_FEATURE_INCOMPAT_WHITEOUT| \
 					 EXT2_FEATURE_INCOMPAT_META_BG)
 #define EXT2_FEATURE_RO_COMPAT_SUPP	(EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \
@@ -573,6 +576,7 @@ enum {
 	EXT2_FT_FIFO		= 5,
 	EXT2_FT_SOCK		= 6,
 	EXT2_FT_SYMLINK		= 7,
+	EXT2_FT_WHT		= 8,
 	EXT2_FT_MAX
 };
 
-- 
1.6.3.3

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 13/35] fallthru: ext2 fallthru support
       [not found]                       ` <1271372682-21225-13-git-send-email-vaurora@redhat.com>
@ 2010-04-15 23:04                         ` Valerie Aurora
  2010-04-19 12:40                           ` Jan Blunck
  0 siblings, 1 reply; 22+ messages in thread
From: Valerie Aurora @ 2010-04-15 23:04 UTC (permalink / raw)
  To: Alexander Viro
  Cc: linux-kernel, linux-fsdevel, Valerie Aurora, Theodore Tso,
	linux-ext4, Jan Blunck

Add support for fallthru directory entries to ext2.

XXX - Makes up inode number for fallthru entry
XXX - Might be better implemented as special symlinks

Cc: Theodore Tso <tytso@mit.edu>
Cc: linux-ext4@vger.kernel.org
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
Signed-off-by: Jan Blunck <jblunck@suse.de>
---
 fs/ext2/dir.c           |   92 ++++++++++++++++++++++++++++++++++++++++++++--
 fs/ext2/ext2.h          |    1 +
 fs/ext2/namei.c         |   22 +++++++++++
 include/linux/ext2_fs.h |    1 +
 4 files changed, 112 insertions(+), 4 deletions(-)

diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 030bd46..f3b4aff 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -219,7 +219,8 @@ static inline int ext2_match (int len, const char * const name,
 {
 	if (len != de->name_len)
 		return 0;
-	if (!de->inode && (de->file_type != EXT2_FT_WHT))
+	if (!de->inode && ((de->file_type != EXT2_FT_WHT) &&
+			   (de->file_type != EXT2_FT_FALLTHRU)))
 		return 0;
 	return !memcmp(name, de->name, len);
 }
@@ -256,6 +257,7 @@ static unsigned char ext2_filetype_table[EXT2_FT_MAX] = {
 	[EXT2_FT_SOCK]		= DT_SOCK,
 	[EXT2_FT_SYMLINK]	= DT_LNK,
 	[EXT2_FT_WHT]		= DT_WHT,
+	[EXT2_FT_FALLTHRU]	= DT_UNKNOWN,
 };
 
 #define S_SHIFT 12
@@ -342,6 +344,24 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
 					ext2_put_page(page);
 					return 0;
 				}
+			} else if (de->file_type == EXT2_FT_FALLTHRU) {
+				int over;
+				unsigned char d_type = DT_UNKNOWN;
+
+				offset = (char *)de - kaddr;
+				/* XXX We don't know the inode number
+				 * of the directory entry in the
+				 * underlying file system.  Should
+				 * look it up, either on fallthru
+				 * creation at first readdir or now at
+				 * filldir time. */
+				over = filldir(dirent, de->name, de->name_len,
+					       (n<<PAGE_CACHE_SHIFT) | offset,
+					       123 /* Made up ino */, d_type);
+				if (over) {
+					ext2_put_page(page);
+					return 0;
+				}
 			}
 			filp->f_pos += ext2_rec_len_from_disk(de->rec_len);
 		}
@@ -463,6 +483,10 @@ ino_t ext2_inode_by_dentry(struct inode *dir, struct dentry *dentry)
 			spin_lock(&dentry->d_lock);
 			dentry->d_flags |= DCACHE_WHITEOUT;
 			spin_unlock(&dentry->d_lock);
+		} else if(!res && de->file_type == EXT2_FT_FALLTHRU) {
+			spin_lock(&dentry->d_lock);
+			dentry->d_flags |= DCACHE_FALLTHRU;
+			spin_unlock(&dentry->d_lock);
 		}
 		ext2_put_page(page);
 	}
@@ -532,6 +556,7 @@ static ext2_dirent * ext2_append_entry(struct dentry * dentry,
 				de->name_len = 0;
 				de->rec_len = ext2_rec_len_to_disk(chunk_size);
 				de->inode = 0;
+				de->file_type = 0;
 				goto got_it;
 			}
 			if (de->rec_len == 0) {
@@ -545,6 +570,7 @@ static ext2_dirent * ext2_append_entry(struct dentry * dentry,
 			name_len = EXT2_DIR_REC_LEN(de->name_len);
 			rec_len = ext2_rec_len_from_disk(de->rec_len);
 			if (!de->inode && (de->file_type != EXT2_FT_WHT) &&
+			    (de->file_type != EXT2_FT_FALLTHRU) &&
 			    (rec_len >= reclen))
 				goto got_it;
 			if (rec_len >= name_len + reclen)
@@ -587,7 +613,8 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
 
 	err = -EEXIST;
 	if (ext2_match (namelen, name, de)) {
-		if (de->file_type == EXT2_FT_WHT)
+		if ((de->file_type == EXT2_FT_WHT) ||
+		    (de->file_type == EXT2_FT_FALLTHRU))
 			goto got_it;
 		goto out_unlock;
 	}
@@ -602,7 +629,8 @@ got_it:
 							&page, NULL);
 	if (err)
 		goto out_unlock;
-	if (de->inode || ((de->file_type == EXT2_FT_WHT) &&
+	if (de->inode || (((de->file_type == EXT2_FT_WHT) ||
+			   (de->file_type == EXT2_FT_FALLTHRU)) &&
 			  !ext2_match (namelen, name, de))) {
 		ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len);
 		de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len);
@@ -627,6 +655,60 @@ out_unlock:
 }
 
 /*
+ * Create a fallthru entry.
+ */
+int ext2_fallthru_entry (struct inode *dir, struct dentry *dentry)
+{
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	unsigned short rec_len, name_len;
+	ext2_dirent * de;
+	struct page *page;
+	loff_t pos;
+	int err;
+
+	de = ext2_append_entry(dentry, &page);
+	if (IS_ERR(de))
+		return PTR_ERR(de);
+
+	err = -EEXIST;
+	if (ext2_match (namelen, name, de))
+		goto out_unlock;
+
+	name_len = EXT2_DIR_REC_LEN(de->name_len);
+	rec_len = ext2_rec_len_from_disk(de->rec_len);
+
+	pos = page_offset(page) +
+		(char*)de - (char*)page_address(page);
+	err = __ext2_write_begin(NULL, page->mapping, pos, rec_len, 0,
+							&page, NULL);
+	if (err)
+		goto out_unlock;
+	if (de->inode || (de->file_type == EXT2_FT_WHT) ||
+	    (de->file_type == EXT2_FT_FALLTHRU)) {
+		ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len);
+		de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len);
+		de->rec_len = ext2_rec_len_to_disk(name_len);
+		de = de1;
+	}
+	de->name_len = namelen;
+	memcpy(de->name, name, namelen);
+	de->inode = 0;
+	de->file_type = EXT2_FT_FALLTHRU;
+	err = ext2_commit_chunk(page, pos, rec_len);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
+	mark_inode_dirty(dir);
+	/* OFFSET_CACHE */
+out_put:
+	ext2_put_page(page);
+	return err;
+out_unlock:
+	unlock_page(page);
+	goto out_put;
+}
+
+/*
  * ext2_delete_entry deletes a directory entry by merging it with the
  * previous entry. Page is up-to-date. Releases the page.
  */
@@ -711,7 +793,9 @@ int ext2_whiteout_entry (struct inode * dir, struct dentry * dentry,
 	 */
 	if (ext2_match (namelen, name, de))
 		de->inode = 0;
-	if (de->inode || (de->file_type == EXT2_FT_WHT)) {
+	if (de->inode || (((de->file_type == EXT2_FT_WHT) ||
+			   (de->file_type == EXT2_FT_FALLTHRU)) &&
+			  !ext2_match (namelen, name, de))) {
 		ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len);
 		de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len);
 		de->rec_len = ext2_rec_len_to_disk(name_len);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 44d190c..2fa32b3 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -108,6 +108,7 @@ extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct qstr *,
 extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *);
 extern int ext2_whiteout_entry (struct inode *, struct dentry *,
 				struct ext2_dir_entry_2 *, struct page *);
+extern int ext2_fallthru_entry (struct inode *, struct dentry *);
 extern int ext2_empty_dir (struct inode *);
 extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
 extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 12195a5..f28154c 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -349,6 +349,7 @@ static int ext2_whiteout(struct inode *dir, struct dentry *dentry,
 		goto out;
 
 	spin_lock(&new_dentry->d_lock);
+	new_dentry->d_flags &= ~DCACHE_FALLTHRU;
 	new_dentry->d_flags |= DCACHE_WHITEOUT;
 	spin_unlock(&new_dentry->d_lock);
 	d_add(new_dentry, NULL);
@@ -367,6 +368,26 @@ out:
 	return err;
 }
 
+/*
+ * Create a fallthru entry.
+ */
+static int ext2_fallthru (struct inode *dir, struct dentry *dentry)
+{
+	int err;
+
+	dquot_initialize(dir);
+
+	err = ext2_fallthru_entry(dir, dentry);
+	if (err)
+		return err;
+
+	d_instantiate(dentry, NULL);
+	spin_lock(&dentry->d_lock);
+	dentry->d_flags |= DCACHE_FALLTHRU;
+	spin_unlock(&dentry->d_lock);
+	return 0;
+}
+
 static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 	struct inode * new_dir,	struct dentry * new_dentry )
 {
@@ -470,6 +491,7 @@ const struct inode_operations ext2_dir_inode_operations = {
 	.rmdir		= ext2_rmdir,
 	.mknod		= ext2_mknod,
 	.whiteout	= ext2_whiteout,
+	.fallthru	= ext2_fallthru,
 	.rename		= ext2_rename,
 #ifdef CONFIG_EXT2_FS_XATTR
 	.setxattr	= generic_setxattr,
diff --git a/include/linux/ext2_fs.h b/include/linux/ext2_fs.h
index 20468bd..cb3d400 100644
--- a/include/linux/ext2_fs.h
+++ b/include/linux/ext2_fs.h
@@ -577,6 +577,7 @@ enum {
 	EXT2_FT_SOCK		= 6,
 	EXT2_FT_SYMLINK		= 7,
 	EXT2_FT_WHT		= 8,
+	EXT2_FT_FALLTHRU	= 9,
 	EXT2_FT_MAX
 };
 
-- 
1.6.3.3


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH 13/35] fallthru: ext2 fallthru support
  2010-04-15 23:04                         ` [PATCH 13/35] fallthru: ext2 fallthru support Valerie Aurora
@ 2010-04-19 12:40                           ` Jan Blunck
  2010-04-19 13:02                             ` David Woodhouse
  0 siblings, 1 reply; 22+ messages in thread
From: Jan Blunck @ 2010-04-19 12:40 UTC (permalink / raw)
  To: Valerie Aurora
  Cc: Alexander Viro, linux-kernel, linux-fsdevel, Theodore Tso,
	linux-ext4, David Woodhouse

On Thu, Apr 15, Valerie Aurora wrote:

> Add support for fallthru directory entries to ext2.
> 
> XXX - Makes up inode number for fallthru entry
> XXX - Might be better implemented as special symlinks

Better not. David Woodhouse actually convinced me of moving away from the
special symlink approach. The whiteouts have been implemented as special
symlinks before.

What makes you think that it would be beneficial to do so?

Thanks,
Jan

> Cc: Theodore Tso <tytso@mit.edu>
> Cc: linux-ext4@vger.kernel.org
> Signed-off-by: Valerie Aurora <vaurora@redhat.com>
> Signed-off-by: Jan Blunck <jblunck@suse.de>
> ---
>  fs/ext2/dir.c           |   92 ++++++++++++++++++++++++++++++++++++++++++++--
>  fs/ext2/ext2.h          |    1 +
>  fs/ext2/namei.c         |   22 +++++++++++
>  include/linux/ext2_fs.h |    1 +
>  4 files changed, 112 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
> index 030bd46..f3b4aff 100644
> --- a/fs/ext2/dir.c
> +++ b/fs/ext2/dir.c
> @@ -219,7 +219,8 @@ static inline int ext2_match (int len, const char * const name,
>  {
>  	if (len != de->name_len)
>  		return 0;
> -	if (!de->inode && (de->file_type != EXT2_FT_WHT))
> +	if (!de->inode && ((de->file_type != EXT2_FT_WHT) &&
> +			   (de->file_type != EXT2_FT_FALLTHRU)))
>  		return 0;
>  	return !memcmp(name, de->name, len);
>  }
> @@ -256,6 +257,7 @@ static unsigned char ext2_filetype_table[EXT2_FT_MAX] = {
>  	[EXT2_FT_SOCK]		= DT_SOCK,
>  	[EXT2_FT_SYMLINK]	= DT_LNK,
>  	[EXT2_FT_WHT]		= DT_WHT,
> +	[EXT2_FT_FALLTHRU]	= DT_UNKNOWN,
>  };
>  
>  #define S_SHIFT 12
> @@ -342,6 +344,24 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
>  					ext2_put_page(page);
>  					return 0;
>  				}
> +			} else if (de->file_type == EXT2_FT_FALLTHRU) {
> +				int over;
> +				unsigned char d_type = DT_UNKNOWN;
> +
> +				offset = (char *)de - kaddr;
> +				/* XXX We don't know the inode number
> +				 * of the directory entry in the
> +				 * underlying file system.  Should
> +				 * look it up, either on fallthru
> +				 * creation at first readdir or now at
> +				 * filldir time. */
> +				over = filldir(dirent, de->name, de->name_len,
> +					       (n<<PAGE_CACHE_SHIFT) | offset,
> +					       123 /* Made up ino */, d_type);
> +				if (over) {
> +					ext2_put_page(page);
> +					return 0;
> +				}
>  			}
>  			filp->f_pos += ext2_rec_len_from_disk(de->rec_len);
>  		}
> @@ -463,6 +483,10 @@ ino_t ext2_inode_by_dentry(struct inode *dir, struct dentry *dentry)
>  			spin_lock(&dentry->d_lock);
>  			dentry->d_flags |= DCACHE_WHITEOUT;
>  			spin_unlock(&dentry->d_lock);
> +		} else if(!res && de->file_type == EXT2_FT_FALLTHRU) {
> +			spin_lock(&dentry->d_lock);
> +			dentry->d_flags |= DCACHE_FALLTHRU;
> +			spin_unlock(&dentry->d_lock);
>  		}
>  		ext2_put_page(page);
>  	}
> @@ -532,6 +556,7 @@ static ext2_dirent * ext2_append_entry(struct dentry * dentry,
>  				de->name_len = 0;
>  				de->rec_len = ext2_rec_len_to_disk(chunk_size);
>  				de->inode = 0;
> +				de->file_type = 0;
>  				goto got_it;
>  			}
>  			if (de->rec_len == 0) {
> @@ -545,6 +570,7 @@ static ext2_dirent * ext2_append_entry(struct dentry * dentry,
>  			name_len = EXT2_DIR_REC_LEN(de->name_len);
>  			rec_len = ext2_rec_len_from_disk(de->rec_len);
>  			if (!de->inode && (de->file_type != EXT2_FT_WHT) &&
> +			    (de->file_type != EXT2_FT_FALLTHRU) &&
>  			    (rec_len >= reclen))
>  				goto got_it;
>  			if (rec_len >= name_len + reclen)
> @@ -587,7 +613,8 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
>  
>  	err = -EEXIST;
>  	if (ext2_match (namelen, name, de)) {
> -		if (de->file_type == EXT2_FT_WHT)
> +		if ((de->file_type == EXT2_FT_WHT) ||
> +		    (de->file_type == EXT2_FT_FALLTHRU))
>  			goto got_it;
>  		goto out_unlock;
>  	}
> @@ -602,7 +629,8 @@ got_it:
>  							&page, NULL);
>  	if (err)
>  		goto out_unlock;
> -	if (de->inode || ((de->file_type == EXT2_FT_WHT) &&
> +	if (de->inode || (((de->file_type == EXT2_FT_WHT) ||
> +			   (de->file_type == EXT2_FT_FALLTHRU)) &&
>  			  !ext2_match (namelen, name, de))) {
>  		ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len);
>  		de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len);
> @@ -627,6 +655,60 @@ out_unlock:
>  }
>  
>  /*
> + * Create a fallthru entry.
> + */
> +int ext2_fallthru_entry (struct inode *dir, struct dentry *dentry)
> +{
> +	const char *name = dentry->d_name.name;
> +	int namelen = dentry->d_name.len;
> +	unsigned short rec_len, name_len;
> +	ext2_dirent * de;
> +	struct page *page;
> +	loff_t pos;
> +	int err;
> +
> +	de = ext2_append_entry(dentry, &page);
> +	if (IS_ERR(de))
> +		return PTR_ERR(de);
> +
> +	err = -EEXIST;
> +	if (ext2_match (namelen, name, de))
> +		goto out_unlock;
> +
> +	name_len = EXT2_DIR_REC_LEN(de->name_len);
> +	rec_len = ext2_rec_len_from_disk(de->rec_len);
> +
> +	pos = page_offset(page) +
> +		(char*)de - (char*)page_address(page);
> +	err = __ext2_write_begin(NULL, page->mapping, pos, rec_len, 0,
> +							&page, NULL);
> +	if (err)
> +		goto out_unlock;
> +	if (de->inode || (de->file_type == EXT2_FT_WHT) ||
> +	    (de->file_type == EXT2_FT_FALLTHRU)) {
> +		ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len);
> +		de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len);
> +		de->rec_len = ext2_rec_len_to_disk(name_len);
> +		de = de1;
> +	}
> +	de->name_len = namelen;
> +	memcpy(de->name, name, namelen);
> +	de->inode = 0;
> +	de->file_type = EXT2_FT_FALLTHRU;
> +	err = ext2_commit_chunk(page, pos, rec_len);
> +	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
> +	EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
> +	mark_inode_dirty(dir);
> +	/* OFFSET_CACHE */
> +out_put:
> +	ext2_put_page(page);
> +	return err;
> +out_unlock:
> +	unlock_page(page);
> +	goto out_put;
> +}
> +
> +/*
>   * ext2_delete_entry deletes a directory entry by merging it with the
>   * previous entry. Page is up-to-date. Releases the page.
>   */
> @@ -711,7 +793,9 @@ int ext2_whiteout_entry (struct inode * dir, struct dentry * dentry,
>  	 */
>  	if (ext2_match (namelen, name, de))
>  		de->inode = 0;
> -	if (de->inode || (de->file_type == EXT2_FT_WHT)) {
> +	if (de->inode || (((de->file_type == EXT2_FT_WHT) ||
> +			   (de->file_type == EXT2_FT_FALLTHRU)) &&
> +			  !ext2_match (namelen, name, de))) {
>  		ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len);
>  		de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len);
>  		de->rec_len = ext2_rec_len_to_disk(name_len);
> diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
> index 44d190c..2fa32b3 100644
> --- a/fs/ext2/ext2.h
> +++ b/fs/ext2/ext2.h
> @@ -108,6 +108,7 @@ extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct qstr *,
>  extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *);
>  extern int ext2_whiteout_entry (struct inode *, struct dentry *,
>  				struct ext2_dir_entry_2 *, struct page *);
> +extern int ext2_fallthru_entry (struct inode *, struct dentry *);
>  extern int ext2_empty_dir (struct inode *);
>  extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
>  extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int);
> diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
> index 12195a5..f28154c 100644
> --- a/fs/ext2/namei.c
> +++ b/fs/ext2/namei.c
> @@ -349,6 +349,7 @@ static int ext2_whiteout(struct inode *dir, struct dentry *dentry,
>  		goto out;
>  
>  	spin_lock(&new_dentry->d_lock);
> +	new_dentry->d_flags &= ~DCACHE_FALLTHRU;
>  	new_dentry->d_flags |= DCACHE_WHITEOUT;
>  	spin_unlock(&new_dentry->d_lock);
>  	d_add(new_dentry, NULL);
> @@ -367,6 +368,26 @@ out:
>  	return err;
>  }
>  
> +/*
> + * Create a fallthru entry.
> + */
> +static int ext2_fallthru (struct inode *dir, struct dentry *dentry)
> +{
> +	int err;
> +
> +	dquot_initialize(dir);
> +
> +	err = ext2_fallthru_entry(dir, dentry);
> +	if (err)
> +		return err;
> +
> +	d_instantiate(dentry, NULL);
> +	spin_lock(&dentry->d_lock);
> +	dentry->d_flags |= DCACHE_FALLTHRU;
> +	spin_unlock(&dentry->d_lock);
> +	return 0;
> +}
> +
>  static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
>  	struct inode * new_dir,	struct dentry * new_dentry )
>  {
> @@ -470,6 +491,7 @@ const struct inode_operations ext2_dir_inode_operations = {
>  	.rmdir		= ext2_rmdir,
>  	.mknod		= ext2_mknod,
>  	.whiteout	= ext2_whiteout,
> +	.fallthru	= ext2_fallthru,
>  	.rename		= ext2_rename,
>  #ifdef CONFIG_EXT2_FS_XATTR
>  	.setxattr	= generic_setxattr,
> diff --git a/include/linux/ext2_fs.h b/include/linux/ext2_fs.h
> index 20468bd..cb3d400 100644
> --- a/include/linux/ext2_fs.h
> +++ b/include/linux/ext2_fs.h
> @@ -577,6 +577,7 @@ enum {
>  	EXT2_FT_SOCK		= 6,
>  	EXT2_FT_SYMLINK		= 7,
>  	EXT2_FT_WHT		= 8,
> +	EXT2_FT_FALLTHRU	= 9,
>  	EXT2_FT_MAX
>  };
>  
> -- 
> 1.6.3.3
> 
Regards,
	Jan

-- 
Jan Blunck <jblunck@suse.de>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 13/35] fallthru: ext2 fallthru support
  2010-04-19 12:40                           ` Jan Blunck
@ 2010-04-19 13:02                             ` David Woodhouse
  2010-04-19 13:23                               ` Jan Blunck
  0 siblings, 1 reply; 22+ messages in thread
From: David Woodhouse @ 2010-04-19 13:02 UTC (permalink / raw)
  To: Jan Blunck
  Cc: Valerie Aurora, Alexander Viro, linux-kernel, linux-fsdevel,
	Theodore Tso, linux-ext4

On Mon, 2010-04-19 at 14:40 +0200, Jan Blunck wrote:
> On Thu, Apr 15, Valerie Aurora wrote:
> 
> > Add support for fallthru directory entries to ext2.
> > 
> > XXX - Makes up inode number for fallthru entry
> > XXX - Might be better implemented as special symlinks
> 
> Better not. David Woodhouse actually convinced me of moving away from the
> special symlink approach. The whiteouts have been implemented as special
> symlinks before.

I certainly asked whether you really need a real 'struct inode' for
whiteouts, and suggested that they should be represented _purely_ as a
dentry with type DT_WHT.

I don't much like the manifestation of that in this patch though,
especially with the made-up inode number. (ISTR I had other
jffs2-specific objections too, which I'll dig out and forward).

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 13/35] fallthru: ext2 fallthru support
  2010-04-19 13:02                             ` David Woodhouse
@ 2010-04-19 13:23                               ` Jan Blunck
  2010-04-19 13:30                                 ` Jamie Lokier
  0 siblings, 1 reply; 22+ messages in thread
From: Jan Blunck @ 2010-04-19 13:23 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Valerie Aurora, Alexander Viro, linux-kernel, linux-fsdevel,
	Theodore Tso, linux-ext4

On Mon, Apr 19, David Woodhouse wrote:

> On Mon, 2010-04-19 at 14:40 +0200, Jan Blunck wrote:
> > On Thu, Apr 15, Valerie Aurora wrote:
> > 
> > > Add support for fallthru directory entries to ext2.
> > > 
> > > XXX - Makes up inode number for fallthru entry
> > > XXX - Might be better implemented as special symlinks
> > 
> > Better not. David Woodhouse actually convinced me of moving away from the
> > special symlink approach. The whiteouts have been implemented as special
> > symlinks before.
> 
> I certainly asked whether you really need a real 'struct inode' for
> whiteouts, and suggested that they should be represented _purely_ as a
> dentry with type DT_WHT.
> 
> I don't much like the manifestation of that in this patch though,
> especially with the made-up inode number. (ISTR I had other
> jffs2-specific objections too, which I'll dig out and forward).

Yes, this patches still have issues that Val and me are aware off. I can't
remember anything jffs2-specific though.

We return that inode number because we don't want to lookup the name on the
other filesystem during readdir. Therefore returning DT_UNKNOWN to let the
userspace decide if it needs to stat the file was the easiest workaround. I
know that POSIX requires d_ino and d_name but on the other hand it does not
require anything more on how long d_ino is valid.

If somebody has an idea how to make this cleaner please speak up.

Regards,
	Jan

-- 
Jan Blunck <jblunck@suse.de>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 13/35] fallthru: ext2 fallthru support
  2010-04-19 13:23                               ` Jan Blunck
@ 2010-04-19 13:30                                 ` Jamie Lokier
  2010-04-19 14:12                                   ` Jan Blunck
  0 siblings, 1 reply; 22+ messages in thread
From: Jamie Lokier @ 2010-04-19 13:30 UTC (permalink / raw)
  To: Jan Blunck
  Cc: David Woodhouse, Valerie Aurora, Alexander Viro, linux-kernel,
	linux-fsdevel, Theodore Tso, linux-ext4

Jan Blunck wrote:
> On Mon, Apr 19, David Woodhouse wrote:
> 
> > On Mon, 2010-04-19 at 14:40 +0200, Jan Blunck wrote:
> > > On Thu, Apr 15, Valerie Aurora wrote:
> > > 
> > > > Add support for fallthru directory entries to ext2.
> > > > 
> > > > XXX - Makes up inode number for fallthru entry
> > > > XXX - Might be better implemented as special symlinks
> > > 
> > > Better not. David Woodhouse actually convinced me of moving away from the
> > > special symlink approach. The whiteouts have been implemented as special
> > > symlinks before.
> > 
> > I certainly asked whether you really need a real 'struct inode' for
> > whiteouts, and suggested that they should be represented _purely_ as a
> > dentry with type DT_WHT.
> > 
> > I don't much like the manifestation of that in this patch though,
> > especially with the made-up inode number. (ISTR I had other
> > jffs2-specific objections too, which I'll dig out and forward).
> 
> Yes, this patches still have issues that Val and me are aware off. I can't
> remember anything jffs2-specific though.
> 
> We return that inode number because we don't want to lookup the name on the
> other filesystem during readdir. Therefore returning DT_UNKNOWN to let the
> userspace decide if it needs to stat the file was the easiest workaround. I
> know that POSIX requires d_ino and d_name but on the other hand it does not
> require anything more on how long d_ino is valid.

Although the lifetime of d_ino might very, I know some programs (not
public) that will break if they see a d_ino which is wrongly matching
the st_ino of another file somewhere on the same st_dev.  They will
assume the name is a hard link to the other file, without calling
stat(), which I think is a reasonable assumption and a useful optimisation.

So the made-up d_ino should at least be careful to not match an inode
number of another file which has a stable st_ino.

Why not zero for d_ino?

-- Jamie


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 13/35] fallthru: ext2 fallthru support
  2010-04-19 13:30                                 ` Jamie Lokier
@ 2010-04-19 14:12                                   ` Jan Blunck
  2010-04-19 14:23                                     ` Valerie Aurora
  0 siblings, 1 reply; 22+ messages in thread
From: Jan Blunck @ 2010-04-19 14:12 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: David Woodhouse, Valerie Aurora, Alexander Viro, linux-kernel,
	linux-fsdevel, Theodore Tso, linux-ext4

On Mon, Apr 19, Jamie Lokier wrote:

> Jan Blunck wrote:
> > On Mon, Apr 19, David Woodhouse wrote:
> > 
> > > On Mon, 2010-04-19 at 14:40 +0200, Jan Blunck wrote:
> > > > On Thu, Apr 15, Valerie Aurora wrote:
> > > > 
> > > > > Add support for fallthru directory entries to ext2.
> > > > > 
> > > > > XXX - Makes up inode number for fallthru entry
> > > > > XXX - Might be better implemented as special symlinks
> > > > 
> > > > Better not. David Woodhouse actually convinced me of moving away from the
> > > > special symlink approach. The whiteouts have been implemented as special
> > > > symlinks before.
> > > 
> > > I certainly asked whether you really need a real 'struct inode' for
> > > whiteouts, and suggested that they should be represented _purely_ as a
> > > dentry with type DT_WHT.
> > > 
> > > I don't much like the manifestation of that in this patch though,
> > > especially with the made-up inode number. (ISTR I had other
> > > jffs2-specific objections too, which I'll dig out and forward).
> > 
> > Yes, this patches still have issues that Val and me are aware off. I can't
> > remember anything jffs2-specific though.
> > 
> > We return that inode number because we don't want to lookup the name on the
> > other filesystem during readdir. Therefore returning DT_UNKNOWN to let the
> > userspace decide if it needs to stat the file was the easiest workaround. I
> > know that POSIX requires d_ino and d_name but on the other hand it does not
> > require anything more on how long d_ino is valid.
> 
> Although the lifetime of d_ino might very, I know some programs (not
> public) that will break if they see a d_ino which is wrongly matching
> the st_ino of another file somewhere on the same st_dev.  They will
> assume the name is a hard link to the other file, without calling
> stat(), which I think is a reasonable assumption and a useful optimisation.
> 
> So the made-up d_ino should at least be careful to not match an inode
> number of another file which has a stable st_ino.
> 
> Why not zero for d_ino?
> 

Hmm, why not. Or even the ino of the directory we are reading from ...

Regards,
	Jan

-- 
Jan Blunck <jblunck@suse.de>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 13/35] fallthru: ext2 fallthru support
  2010-04-19 14:12                                   ` Jan Blunck
@ 2010-04-19 14:23                                     ` Valerie Aurora
  2010-04-19 14:53                                       ` Miklos Szeredi
  2010-04-20 21:40                                       ` Jamie Lokier
  0 siblings, 2 replies; 22+ messages in thread
From: Valerie Aurora @ 2010-04-19 14:23 UTC (permalink / raw)
  To: Jan Blunck
  Cc: Jamie Lokier, David Woodhouse, Alexander Viro, linux-kernel,
	linux-fsdevel, Theodore Tso, linux-ext4

On Mon, Apr 19, 2010 at 04:12:48PM +0200, Jan Blunck wrote:
> On Mon, Apr 19, Jamie Lokier wrote:
> 
> > Jan Blunck wrote:
> > > On Mon, Apr 19, David Woodhouse wrote:
> > > 
> > > > On Mon, 2010-04-19 at 14:40 +0200, Jan Blunck wrote:
> > > > > On Thu, Apr 15, Valerie Aurora wrote:
> > > > > 
> > > > > > Add support for fallthru directory entries to ext2.
> > > > > > 
> > > > > > XXX - Makes up inode number for fallthru entry
> > > > > > XXX - Might be better implemented as special symlinks
> > > > > 
> > > > > Better not. David Woodhouse actually convinced me of moving away from the
> > > > > special symlink approach. The whiteouts have been implemented as special
> > > > > symlinks before.
> > > > 
> > > > I certainly asked whether you really need a real 'struct inode' for
> > > > whiteouts, and suggested that they should be represented _purely_ as a
> > > > dentry with type DT_WHT.
> > > > 
> > > > I don't much like the manifestation of that in this patch though,
> > > > especially with the made-up inode number. (ISTR I had other
> > > > jffs2-specific objections too, which I'll dig out and forward).
> > > 
> > > Yes, this patches still have issues that Val and me are aware off. I can't
> > > remember anything jffs2-specific though.
> > > 
> > > We return that inode number because we don't want to lookup the name on the
> > > other filesystem during readdir. Therefore returning DT_UNKNOWN to let the
> > > userspace decide if it needs to stat the file was the easiest workaround. I
> > > know that POSIX requires d_ino and d_name but on the other hand it does not
> > > require anything more on how long d_ino is valid.
> > 
> > Although the lifetime of d_ino might very, I know some programs (not
> > public) that will break if they see a d_ino which is wrongly matching
> > the st_ino of another file somewhere on the same st_dev.  They will
> > assume the name is a hard link to the other file, without calling
> > stat(), which I think is a reasonable assumption and a useful optimisation.
> > 
> > So the made-up d_ino should at least be careful to not match an inode
> > number of another file which has a stable st_ino.
> > 
> > Why not zero for d_ino?
> > 
> 
> Hmm, why not. Or even the ino of the directory we are reading from ...

I don't recall there being any technical reason not to look up the
real inode number.  I just wrote it that we because I was lazy.  So I
like returning the directory's d_ino better than a single magic
number, but I'd at least like to try returning the real inode number
too.

-VAL

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 13/35] fallthru: ext2 fallthru support
  2010-04-19 14:23                                     ` Valerie Aurora
@ 2010-04-19 14:53                                       ` Miklos Szeredi
  2010-04-20 21:34                                         ` Jamie Lokier
  2010-04-20 21:40                                       ` Jamie Lokier
  1 sibling, 1 reply; 22+ messages in thread
From: Miklos Szeredi @ 2010-04-19 14:53 UTC (permalink / raw)
  To: Valerie Aurora
  Cc: jblunck, jamie, dwmw2, viro, linux-kernel, linux-fsdevel, tytso,
	linux-ext4

On Mon, 19 Apr 2010, Valerie Aurora wrote:
> I don't recall there being any technical reason not to look up the
> real inode number.  I just wrote it that we because I was lazy.  So I
> like returning the directory's d_ino better than a single magic
> number, but I'd at least like to try returning the real inode number
> too.

Note, "struct dirent" doesn't have d_dev, so you really can't return
the "real" inode number, that's on a different filesystem and just a
random number in the context of the the readdir in question.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 13/35] fallthru: ext2 fallthru support
  2010-04-19 14:53                                       ` Miklos Szeredi
@ 2010-04-20 21:34                                         ` Jamie Lokier
  2010-04-21  8:42                                           ` Jan Blunck
  0 siblings, 1 reply; 22+ messages in thread
From: Jamie Lokier @ 2010-04-20 21:34 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Valerie Aurora, jblunck, dwmw2, viro, linux-kernel, linux-fsdevel,
	tytso, linux-ext4

Miklos Szeredi wrote:
> On Mon, 19 Apr 2010, Valerie Aurora wrote:
> > I don't recall there being any technical reason not to look up the
> > real inode number.  I just wrote it that we because I was lazy.  So I
> > like returning the directory's d_ino better than a single magic
> > number, but I'd at least like to try returning the real inode number
> > too.
> 
> Note, "struct dirent" doesn't have d_dev, so you really can't return
> the "real" inode number, that's on a different filesystem and just a
> random number in the context of the the readdir in question.

Agree.  Does this inappropriate inode number for the union mount's
st_dev happen with stat() on the actual files too?  That could be bad.

-- Jamie

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 13/35] fallthru: ext2 fallthru support
  2010-04-19 14:23                                     ` Valerie Aurora
  2010-04-19 14:53                                       ` Miklos Szeredi
@ 2010-04-20 21:40                                       ` Jamie Lokier
  1 sibling, 0 replies; 22+ messages in thread
From: Jamie Lokier @ 2010-04-20 21:40 UTC (permalink / raw)
  To: Valerie Aurora
  Cc: Jan Blunck, David Woodhouse, Alexander Viro, linux-kernel,
	linux-fsdevel, Theodore Tso, linux-ext4

Valerie Aurora wrote:
> On Mon, Apr 19, 2010 at 04:12:48PM +0200, Jan Blunck wrote:
> > On Mon, Apr 19, Jamie Lokier wrote:
> > 
> > > Jan Blunck wrote:
> > > > On Mon, Apr 19, David Woodhouse wrote:
> > > > 
> > > > > On Mon, 2010-04-19 at 14:40 +0200, Jan Blunck wrote:
> > > > > > On Thu, Apr 15, Valerie Aurora wrote:
> > > > > > 
> > > > > > > Add support for fallthru directory entries to ext2.
> > > > > > > 
> > > > > > > XXX - Makes up inode number for fallthru entry
> > > > > > > XXX - Might be better implemented as special symlinks
> > > > > > 
> > > > > > Better not. David Woodhouse actually convinced me of moving away from the
> > > > > > special symlink approach. The whiteouts have been implemented as special
> > > > > > symlinks before.
> > > > > 
> > > > > I certainly asked whether you really need a real 'struct inode' for
> > > > > whiteouts, and suggested that they should be represented _purely_ as a
> > > > > dentry with type DT_WHT.
> > > > > 
> > > > > I don't much like the manifestation of that in this patch though,
> > > > > especially with the made-up inode number. (ISTR I had other
> > > > > jffs2-specific objections too, which I'll dig out and forward).
> > > > 
> > > > Yes, this patches still have issues that Val and me are aware off. I can't
> > > > remember anything jffs2-specific though.
> > > > 
> > > > We return that inode number because we don't want to lookup the name on the
> > > > other filesystem during readdir. Therefore returning DT_UNKNOWN to let the
> > > > userspace decide if it needs to stat the file was the easiest workaround. I
> > > > know that POSIX requires d_ino and d_name but on the other hand it does not
> > > > require anything more on how long d_ino is valid.
> > > 
> > > Although the lifetime of d_ino might very, I know some programs (not
> > > public) that will break if they see a d_ino which is wrongly matching
> > > the st_ino of another file somewhere on the same st_dev.  They will
> > > assume the name is a hard link to the other file, without calling
> > > stat(), which I think is a reasonable assumption and a useful optimisation.
> > > 
> > > So the made-up d_ino should at least be careful to not match an inode
> > > number of another file which has a stable st_ino.
> > > 
> > > Why not zero for d_ino?
> > > 
> > 
> > Hmm, why not. Or even the ino of the directory we are reading from ...
> 
> I don't recall there being any technical reason not to look up the
> real inode number.  I just wrote it that we because I was lazy.  So I
> like returning the directory's d_ino better than a single magic
> number, but I'd at least like to try returning the real inode number
> too.

I thought of zero because Bash and GNU Readline both check d_ino != 0
to decide if an entry is valid.

On reflection, that is why zero must _not_ be used :-)

-- Jamie

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 13/35] fallthru: ext2 fallthru support
  2010-04-20 21:34                                         ` Jamie Lokier
@ 2010-04-21  8:42                                           ` Jan Blunck
  2010-04-21  9:22                                             ` Jamie Lokier
  0 siblings, 1 reply; 22+ messages in thread
From: Jan Blunck @ 2010-04-21  8:42 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Miklos Szeredi, Valerie Aurora, dwmw2, viro, linux-kernel,
	linux-fsdevel, tytso, linux-ext4

On Tue, Apr 20, Jamie Lokier wrote:

> Miklos Szeredi wrote:
> > On Mon, 19 Apr 2010, Valerie Aurora wrote:
> > > I don't recall there being any technical reason not to look up the
> > > real inode number.  I just wrote it that we because I was lazy.  So I
> > > like returning the directory's d_ino better than a single magic
> > > number, but I'd at least like to try returning the real inode number
> > > too.
> > 
> > Note, "struct dirent" doesn't have d_dev, so you really can't return
> > the "real" inode number, that's on a different filesystem and just a
> > random number in the context of the the readdir in question.
> 
> Agree.  Does this inappropriate inode number for the union mount's
> st_dev happen with stat() on the actual files too?  That could be bad.

No, for stat() you do a lookup and that is returning the correct dentry/inode
for the filesystem the name is on.

We just return the the fallthru directory entries to give userspace an offset
that they can seekdir() to.

Regards,
	Jan

-- 
Jan Blunck <jblunck@suse.de>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 13/35] fallthru: ext2 fallthru support
  2010-04-21  8:42                                           ` Jan Blunck
@ 2010-04-21  9:22                                             ` Jamie Lokier
  2010-04-21  9:34                                               ` Miklos Szeredi
  2010-04-22 10:30                                               ` J. R. Okajima
  0 siblings, 2 replies; 22+ messages in thread
From: Jamie Lokier @ 2010-04-21  9:22 UTC (permalink / raw)
  To: Jan Blunck
  Cc: Miklos Szeredi, Valerie Aurora, dwmw2, viro, linux-kernel,
	linux-fsdevel, tytso, linux-ext4

Jan Blunck wrote:
> On Tue, Apr 20, Jamie Lokier wrote:
> 
> > Miklos Szeredi wrote:
> > > On Mon, 19 Apr 2010, Valerie Aurora wrote:
> > > > I don't recall there being any technical reason not to look up the
> > > > real inode number.  I just wrote it that we because I was lazy.  So I
> > > > like returning the directory's d_ino better than a single magic
> > > > number, but I'd at least like to try returning the real inode number
> > > > too.
> > > 
> > > Note, "struct dirent" doesn't have d_dev, so you really can't return
> > > the "real" inode number, that's on a different filesystem and just a
> > > random number in the context of the the readdir in question.
> > 
> > Agree.  Does this inappropriate inode number for the union mount's
> > st_dev happen with stat() on the actual files too?  That could be bad.
> 
> No, for stat() you do a lookup and that is returning the correct
> dentry/inode for the filesystem the name is on.

Hmm.  I smell potential confusion for some otherwise POSIX-friendly
userspaces.

When I open /path/to/foo, call fstat (st_dev=2, st_ino=5678), and then
keep the file open, then later do a readdir which includes foo
(dir.st_dev=1, d_ino=1234), I'm going to immediately assume a rename
or unlink happened, close the file, abort streaming from it, refresh
the GUI windows, refresh application caches for that name entry, etc.

Because in the POSIX world I think open files have stable inode
numbers (as long as they are open), and I don't think that an open
file can have it's name's d_ino not match the inode number unless it's
a mount point, which my program would know about.

This plays into inotify, where you have to know if you are monitoring
every directory that contains a link to a file, to know if you need to
monitor the file itself directly instead.

Now I think it's fair enough that a union mount doesn't play all the
traditional rules :-)  C'est la vie.

This mismatch of (dir.st_dev,d_ino) and st_ino strongly resembles a
file-bind-mount.  Like bind mounts, it's quite annoying for programs
that like to assume they've seen all of a file's links when they've
seen i_nlink of them.

Bind mounts can be detected by looking in /proc/mounts.  st_dev
changing doesn't work because it can be a binding of the same
filesystem.

How would I go about detecting when a union mount's directory entry
has similar behaviour, without calling stat() on each entry?  Is it
just a matter of recognising a particular filesystem name in
/proc/mounts, or something more?

Thanks,
-- Jamie

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 13/35] fallthru: ext2 fallthru support
  2010-04-21  9:22                                             ` Jamie Lokier
@ 2010-04-21  9:34                                               ` Miklos Szeredi
  2010-04-21  9:52                                                 ` Jamie Lokier
  2010-04-21 21:38                                                 ` Valerie Aurora
  2010-04-22 10:30                                               ` J. R. Okajima
  1 sibling, 2 replies; 22+ messages in thread
From: Miklos Szeredi @ 2010-04-21  9:34 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: jblunck, miklos, vaurora, dwmw2, viro, linux-kernel,
	linux-fsdevel, tytso, linux-ext4

On Wed, 21 Apr 2010, Jamie Lokier wrote:
> Hmm.  I smell potential confusion for some otherwise POSIX-friendly
> userspaces.
> 
> When I open /path/to/foo, call fstat (st_dev=2, st_ino=5678), and then
> keep the file open, then later do a readdir which includes foo
> (dir.st_dev=1, d_ino=1234), I'm going to immediately assume a rename
> or unlink happened, close the file, abort streaming from it, refresh
> the GUI windows, refresh application caches for that name entry, etc.
> 
> Because in the POSIX world I think open files have stable inode
> numbers (as long as they are open), and I don't think that an open
> file can have it's name's d_ino not match the inode number unless it's
> a mount point, which my program would know about.
> 
> This plays into inotify, where you have to know if you are monitoring
> every directory that contains a link to a file, to know if you need to
> monitor the file itself directly instead.
> 
> Now I think it's fair enough that a union mount doesn't play all the
> traditional rules :-)  C'est la vie.
> 
> This mismatch of (dir.st_dev,d_ino) and st_ino strongly resembles a
> file-bind-mount.  Like bind mounts, it's quite annoying for programs
> that like to assume they've seen all of a file's links when they've
> seen i_nlink of them.
> 
> Bind mounts can be detected by looking in /proc/mounts.  st_dev
> changing doesn't work because it can be a binding of the same
> filesystem.
> 
> How would I go about detecting when a union mount's directory entry
> has similar behaviour, without calling stat() on each entry?  Is it
> just a matter of recognising a particular filesystem name in
> /proc/mounts, or something more?

Detecting mount points is best done by comparing st_dev for the parent
directory with st_dev of the child.  This is much simpler than parsing
/proc/mounts and will work for bind mounts as well as union mounts.

I think there's no question that union mounts might break apps (POSIX
or not).  But I think there's hope that they are few and can easily be
fixed.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 13/35] fallthru: ext2 fallthru support
  2010-04-21  9:34                                               ` Miklos Szeredi
@ 2010-04-21  9:52                                                 ` Jamie Lokier
  2010-04-21 10:17                                                   ` Miklos Szeredi
  2010-04-21 21:34                                                   ` Valerie Aurora
  2010-04-21 21:38                                                 ` Valerie Aurora
  1 sibling, 2 replies; 22+ messages in thread
From: Jamie Lokier @ 2010-04-21  9:52 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: jblunck, vaurora, dwmw2, viro, linux-kernel, linux-fsdevel, tytso,
	linux-ext4

Miklos Szeredi wrote:
> On Wed, 21 Apr 2010, Jamie Lokier wrote:
> > Hmm.  I smell potential confusion for some otherwise POSIX-friendly
> > userspaces.
> > 
> > When I open /path/to/foo, call fstat (st_dev=2, st_ino=5678), and then
> > keep the file open, then later do a readdir which includes foo
> > (dir.st_dev=1, d_ino=1234), I'm going to immediately assume a rename
> > or unlink happened, close the file, abort streaming from it, refresh
> > the GUI windows, refresh application caches for that name entry, etc.
> > 
> > Because in the POSIX world I think open files have stable inode
> > numbers (as long as they are open), and I don't think that an open
> > file can have it's name's d_ino not match the inode number unless it's
> > a mount point, which my program would know about.
> > 
> > This plays into inotify, where you have to know if you are monitoring
> > every directory that contains a link to a file, to know if you need to
> > monitor the file itself directly instead.
> > 
> > Now I think it's fair enough that a union mount doesn't play all the
> > traditional rules :-)  C'est la vie.
> > 
> > This mismatch of (dir.st_dev,d_ino) and st_ino strongly resembles a
> > file-bind-mount.  Like bind mounts, it's quite annoying for programs
> > that like to assume they've seen all of a file's links when they've
> > seen i_nlink of them.
> > 
> > Bind mounts can be detected by looking in /proc/mounts.  st_dev
> > changing doesn't work because it can be a binding of the same
> > filesystem.
> > 
> > How would I go about detecting when a union mount's directory entry
> > has similar behaviour, without calling stat() on each entry?  Is it
> > just a matter of recognising a particular filesystem name in
> > /proc/mounts, or something more?
> 
> Detecting mount points is best done by comparing st_dev for the parent
> directory with st_dev of the child.  This is much simpler than parsing
> /proc/mounts and will work for bind mounts as well as union mounts.

Sorry, no: That does not work for bind mounts.  Both layers can have
the same st_dev.  Nor does O_NOFOLLOW stop traversal in the middle of
a path, there is no handy O_NOCROSSMOUNTS, and no st_mode flag or
d_type to say it's a bind mount.  Bind mounts are really a big pain
for i_nlink+inotify name counting.

Besides, calling stat() on every entry in a large directory to check
st_ino can be orders of magnitude slower than readdir() on a large
directory - especially with a cold cache.  It is quicker, but much
more complicated, to parse /proc/mounts and apply arcane rules to find
the exceptions.

Can a union mount overlap two parts of the same filesystem?

> I think there's no question that union mounts might break apps (POSIX
> or not).  But I think there's hope that they are few and can easily be
> fixed.

I agree, and union moint is a very useful feature that's worth
breaking a few apps for :-)

I'm curious if there's a clear way to go about it in this case, or
if it'll involve a certain amount of pattern recognition in /proc/mounts.

Basically I'm wondering if it's been thought about already.

-- Jamie

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 13/35] fallthru: ext2 fallthru support
  2010-04-21  9:52                                                 ` Jamie Lokier
@ 2010-04-21 10:17                                                   ` Miklos Szeredi
  2010-04-21 17:36                                                     ` Jamie Lokier
  2010-04-21 21:34                                                   ` Valerie Aurora
  1 sibling, 1 reply; 22+ messages in thread
From: Miklos Szeredi @ 2010-04-21 10:17 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: miklos, jblunck, vaurora, dwmw2, viro, linux-kernel,
	linux-fsdevel, tytso, linux-ext4

On Wed, 21 Apr 2010, Jamie Lokier wrote:
> Sorry, no: That does not work for bind mounts.  Both layers can have
> the same st_dev.

Okay.

>  Nor does O_NOFOLLOW stop traversal in the middle of
> a path, there is no handy O_NOCROSSMOUNTS, and no st_mode flag or
> d_type to say it's a bind mount.  Bind mounts are really a big pain
> for i_nlink+inotify name counting.

I'm confused.  You are monitoring a specific file and would like to
know if something is happening to any of it's links, right?

Why do you need to know about bind mounts for that?

Count the number of times you encounter that d_ino and if that matches
i_nlink then every directory is monitored.  Simple as that, no?

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 13/35] fallthru: ext2 fallthru support
  2010-04-21 10:17                                                   ` Miklos Szeredi
@ 2010-04-21 17:36                                                     ` Jamie Lokier
  0 siblings, 0 replies; 22+ messages in thread
From: Jamie Lokier @ 2010-04-21 17:36 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: jblunck, vaurora, dwmw2, viro, linux-kernel, linux-fsdevel, tytso,
	linux-ext4

Miklos Szeredi wrote:
> On Wed, 21 Apr 2010, Jamie Lokier wrote:
> > Sorry, no: That does not work for bind mounts.  Both layers can have
> > the same st_dev.
> 
> Okay.
> 
> >  Nor does O_NOFOLLOW stop traversal in the middle of
> > a path, there is no handy O_NOCROSSMOUNTS, and no st_mode flag or
> > d_type to say it's a bind mount.  Bind mounts are really a big pain
> > for i_nlink+inotify name counting.
> 
> I'm confused.  You are monitoring a specific file and would like to
> know if something is happening to any of it's links, right?

Not quite. I'm monitoring a million files (say), so I must use
directory watches for most of them.  I need directory watches anyway,
when the semantic is "calling open on /path/to/file and reading would
return the same data", because renames and unlinks are also a way to
invalidate monitored file contents.

At a high level, what we're talking about is the ability to cache and
verify the the validity information derived from reading files in the
filesystem, in a manner which efficiently triggers invalidation only
on changes.  Being able to answer, as quickly as possible, "if I read
this, that and other, will I get the same results as the last time I
did those operations, without having to actually do them to check".
There are many applications, provided the method is reliable.

> Why do you need to know about bind mounts for that?
> 
> Count the number of times you encounter that d_ino and if that matches
> i_nlink then every directory is monitored.  Simple as that, no?

When I see a file has i_nlink > 1, I must watch the file directly
using a file-watch (with inotify; polling with stat() with dnotify),
_unless_ I have seen all the links to that file.

When I've seen all the links to a file, I know that my directory
watches on the directories containing those links are sufficient to
detect changes to the file contents.  That's because every
file change will get notified on at least one of those paths.

I learn that I've seen all the links by seeing d_ino during readdir as
you suggested, or by st_ino in the cases where I've not had reason to
readdir and I have needed to open the file or call stat.

Let's look at some bind mounts.  One where st_ino doesn't work:

    /dirA/file1  [hard link to inode 100, i_nlink = 2]
    /dirA/bound  [bind mount, has /dirA/file1 mounted on it]
    /dirB/file2  [hard link to inode 100, i_nlink = 2]

If the program is asked to open /dirA/file1 and /dirA/bound at various
times, and never asked to readdir /dirA, it will have used fstat not
readdir, seen the same (st_dev,st_ino,i_nlink = 2), and _wrongly_
concluded that it is monitoring all directories containing paths to
the file.

To avoid that problem, it parses /proc/mounts and detects that
/dirA/bound does not contributed to the link count.  This is faster
than calling readdir in all possible places that it can happen.

Another one, where readdir + d_ino doesn't work anyway:

    /dirA/file1  [hard link to inode 100, i_nlink = 2]
    /dirB/dirX   [bind mount, has /dirA mounted on it]
    /dirC/file2  [hard link to inode 100, i_nlink = 2]

This time the program is asked to open /dirA/file1 and
/dirB/dirX/file1 at various times.  Suppose it aggressively calls
readdir on all of the places it goes near, and uses d_ino comparisons.

Bear in mind it can't hunt for /dirC because there may be millions of
directories; this is just an example.

Then it will see the same d_ino for /dirA/file1 and /dirB/dirX/file1,
and wrongly conclude that it is monitoring all directories containing
paths to the file.

So again, it must parse /proc/mounts to detect that everything found
under /dirB/dirX mirrors /dirA.

This is a bit more complicated by the fact that inotify/dnotify send
events to the watching dentry parent of the link used to access a
file, not necessarily the parent in the mounted path space.

Although this doesn't make the bind mount problem go away, this is
where union mounts complicate the picture more:

Ideally, the program may assume that d_ino and st_ino match as long as
the file is open (on any filesystem), or that the filesystem type is
in a whitelist of ones with stable inode numbers (most local
filesystems), and it's not a mountpoint.  So when it's asked to open
at one path, and something else asks it to readdir at another path, it
could combine the information to learn when it's found all entries,
without having to use redundant readdirs and stats.

I'm thinking that I might have to detect union mounts specially in
/proc/mounts, now that they are a VFS feature, and disable a bunch of
assumptions about d_ino when seeing them.  Hopefully it is possible to
unambiguously check for union mount points in /proc/mounts?

d_ino == directory's st_ino sounds neat.  Maybe that will be enough,
as a special magical Linux rule.  When reading a directory, it's cheap
to get the directory's st_ino with fstat().  It's possible to bind
mount a directory on it's _own_ child, so that st_ino == directory's
st_ino, but d_ino isn't affected so maybe that's the trick to use.

-- Jamie

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 13/35] fallthru: ext2 fallthru support
  2010-04-21  9:52                                                 ` Jamie Lokier
  2010-04-21 10:17                                                   ` Miklos Szeredi
@ 2010-04-21 21:34                                                   ` Valerie Aurora
  1 sibling, 0 replies; 22+ messages in thread
From: Valerie Aurora @ 2010-04-21 21:34 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Miklos Szeredi, jblunck, dwmw2, viro, linux-kernel, linux-fsdevel,
	tytso, linux-ext4

On Wed, Apr 21, 2010 at 10:52:21AM +0100, Jamie Lokier wrote:
> Miklos Szeredi wrote:
> > Detecting mount points is best done by comparing st_dev for the parent
> > directory with st_dev of the child.  This is much simpler than parsing
> > /proc/mounts and will work for bind mounts as well as union mounts.
> 
> Sorry, no: That does not work for bind mounts.  Both layers can have
> the same st_dev.  Nor does O_NOFOLLOW stop traversal in the middle of
> a path, there is no handy O_NOCROSSMOUNTS, and no st_mode flag or
> d_type to say it's a bind mount.  Bind mounts are really a big pain
> for i_nlink+inotify name counting.
> 
> Besides, calling stat() on every entry in a large directory to check
> st_ino can be orders of magnitude slower than readdir() on a large
> directory - especially with a cold cache.  It is quicker, but much
> more complicated, to parse /proc/mounts and apply arcane rules to find
> the exceptions.
> 
> Can a union mount overlap two parts of the same filesystem?

No.  Each layer must be a separate file system, the bottom must be
read-only, the top must be writable, and they must be unioned at their
mount points.

> > I think there's no question that union mounts might break apps (POSIX
> > or not).  But I think there's hope that they are few and can easily be
> > fixed.
> 
> I agree, and union moint is a very useful feature that's worth
> breaking a few apps for :-)
> 
> I'm curious if there's a clear way to go about it in this case, or
> if it'll involve a certain amount of pattern recognition in /proc/mounts.

All it takes is looking for the "union" string in the mount options.

> Basically I'm wondering if it's been thought about already.

Not as much as it deserves. :) Do you have any thoughts about better
solutions?

Something to keep in mind is that most of the app issues are already
present with bind mounts.  In many cases, if an app doesn't work with
union mounts, it's also not going to work with bind mounts.  I think
you have a good point that we could use a more straightforward way to
say, "Hey, you can't use the normal st_dev/st_ino rules right now..."

-VAL

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 13/35] fallthru: ext2 fallthru support
  2010-04-21  9:34                                               ` Miklos Szeredi
  2010-04-21  9:52                                                 ` Jamie Lokier
@ 2010-04-21 21:38                                                 ` Valerie Aurora
  2010-04-21 22:10                                                   ` Jamie Lokier
  1 sibling, 1 reply; 22+ messages in thread
From: Valerie Aurora @ 2010-04-21 21:38 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Jamie Lokier, jblunck, dwmw2, viro, linux-kernel, linux-fsdevel,
	tytso, linux-ext4

On Wed, Apr 21, 2010 at 11:34:52AM +0200, Miklos Szeredi wrote:
> On Wed, 21 Apr 2010, Jamie Lokier wrote:
> > Hmm.  I smell potential confusion for some otherwise POSIX-friendly
> > userspaces.
> > 
> > When I open /path/to/foo, call fstat (st_dev=2, st_ino=5678), and then
> > keep the file open, then later do a readdir which includes foo
> > (dir.st_dev=1, d_ino=1234), I'm going to immediately assume a rename
> > or unlink happened, close the file, abort streaming from it, refresh
> > the GUI windows, refresh application caches for that name entry, etc.
> > 
> > Because in the POSIX world I think open files have stable inode
> > numbers (as long as they are open), and I don't think that an open
> > file can have it's name's d_ino not match the inode number unless it's
> > a mount point, which my program would know about.
> > 
> > This plays into inotify, where you have to know if you are monitoring
> > every directory that contains a link to a file, to know if you need to
> > monitor the file itself directly instead.
> > 
> > Now I think it's fair enough that a union mount doesn't play all the
> > traditional rules :-)  C'est la vie.
> > 
> > This mismatch of (dir.st_dev,d_ino) and st_ino strongly resembles a
> > file-bind-mount.  Like bind mounts, it's quite annoying for programs
> > that like to assume they've seen all of a file's links when they've
> > seen i_nlink of them.
> > 
> > Bind mounts can be detected by looking in /proc/mounts.  st_dev
> > changing doesn't work because it can be a binding of the same
> > filesystem.
> > 
> > How would I go about detecting when a union mount's directory entry
> > has similar behaviour, without calling stat() on each entry?  Is it
> > just a matter of recognising a particular filesystem name in
> > /proc/mounts, or something more?
> 
> Detecting mount points is best done by comparing st_dev for the parent
> directory with st_dev of the child.  This is much simpler than parsing
> /proc/mounts and will work for bind mounts as well as union mounts.
> 
> I think there's no question that union mounts might break apps (POSIX
> or not).  But I think there's hope that they are few and can easily be
> fixed.

I couldn't have put it better myself.

To expand slightly, if the broken apps are not few and easily fixed,
then we'll go back and make the kernel more complicated.  I'd like to
try the simplest version we think will work, first.

Thanks!

-VAL

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 13/35] fallthru: ext2 fallthru support
  2010-04-21 21:38                                                 ` Valerie Aurora
@ 2010-04-21 22:10                                                   ` Jamie Lokier
  0 siblings, 0 replies; 22+ messages in thread
From: Jamie Lokier @ 2010-04-21 22:10 UTC (permalink / raw)
  To: Valerie Aurora
  Cc: Miklos Szeredi, jblunck, dwmw2, viro, linux-kernel, linux-fsdevel,
	tytso, linux-ext4

Valerie Aurora wrote:
> > I think there's no question that union mounts might break apps (POSIX
> > or not).  But I think there's hope that they are few and can easily be
> > fixed.
> 
> I couldn't have put it better myself.
> 
> To expand slightly, if the broken apps are not few and easily fixed,
> then we'll go back and make the kernel more complicated.  I'd like to
> try the simplest version we think will work, first.

Don't worry, I'm not trying to deviate you from that good plan.

Just throwing questions out to find what's a good and simple answer to
these little open questions to minimise trouble.

-- Jamie

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 13/35] fallthru: ext2 fallthru support
  2010-04-21  9:22                                             ` Jamie Lokier
  2010-04-21  9:34                                               ` Miklos Szeredi
@ 2010-04-22 10:30                                               ` J. R. Okajima
  1 sibling, 0 replies; 22+ messages in thread
From: J. R. Okajima @ 2010-04-22 10:30 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Jan Blunck, Miklos Szeredi, Valerie Aurora, dwmw2, viro,
	linux-kernel, linux-fsdevel, tytso, linux-ext4


Jamie Lokier:
> Hmm.  I smell potential confusion for some otherwise POSIX-friendly
> userspaces.
	:::
> This plays into inotify, where you have to know if you are monitoring
> every directory that contains a link to a file, to know if you need to
> monitor the file itself directly instead.

Addition to the inode number of fallthru/readdir, hardlink in union
mount may be a problem. If you open a hardlinked file for writing or
try chmod it, the internal copyup will happen and the hardlink will be 
destroyed. For instance, when fileA and fileB are hardlinked on the
lower layer, and the contents of fileA is modifed (copyup happens). You
will not see the latest contents via fileB.
And the IN_CREATE event may be fired to the parent dir if you monitor
it, I am afraid.

(I have pointed out this issue before, but the posted document didn't
seem to contain about it)


J. R. Okajima

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2010-04-22 10:30 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <1271372682-21225-1-git-send-email-vaurora@redhat.com>
     [not found] ` <1271372682-21225-2-git-send-email-vaurora@redhat.com>
     [not found]   ` <1271372682-21225-3-git-send-email-vaurora@redhat.com>
     [not found]     ` <1271372682-21225-4-git-send-email-vaurora@redhat.com>
     [not found]       ` <1271372682-21225-5-git-send-email-vaurora@redhat.com>
     [not found]         ` <1271372682-21225-6-git-send-email-vaurora@redhat.com>
     [not found]           ` <1271372682-21225-7-git-send-email-vaurora@redhat.com>
     [not found]             ` <1271372682-21225-8-git-send-email-vaurora@redhat.com>
     [not found]               ` <1271372682-21225-9-git-send-email-vaurora@redhat.com>
2010-04-15 23:04                 ` [PATCH 09/35] whiteout: Split of ext2_append_link() from ext2_add_link() Valerie Aurora
2010-04-15 23:04                   ` [PATCH 10/35] whiteout: ext2 whiteout support Valerie Aurora
     [not found]                     ` <1271372682-21225-12-git-send-email-vaurora@redhat.com>
     [not found]                       ` <1271372682-21225-13-git-send-email-vaurora@redhat.com>
2010-04-15 23:04                         ` [PATCH 13/35] fallthru: ext2 fallthru support Valerie Aurora
2010-04-19 12:40                           ` Jan Blunck
2010-04-19 13:02                             ` David Woodhouse
2010-04-19 13:23                               ` Jan Blunck
2010-04-19 13:30                                 ` Jamie Lokier
2010-04-19 14:12                                   ` Jan Blunck
2010-04-19 14:23                                     ` Valerie Aurora
2010-04-19 14:53                                       ` Miklos Szeredi
2010-04-20 21:34                                         ` Jamie Lokier
2010-04-21  8:42                                           ` Jan Blunck
2010-04-21  9:22                                             ` Jamie Lokier
2010-04-21  9:34                                               ` Miklos Szeredi
2010-04-21  9:52                                                 ` Jamie Lokier
2010-04-21 10:17                                                   ` Miklos Szeredi
2010-04-21 17:36                                                     ` Jamie Lokier
2010-04-21 21:34                                                   ` Valerie Aurora
2010-04-21 21:38                                                 ` Valerie Aurora
2010-04-21 22:10                                                   ` Jamie Lokier
2010-04-22 10:30                                               ` J. R. Okajima
2010-04-20 21:40                                       ` Jamie Lokier

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).