cloning file data

linux-btrfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* cloning file data
@ 2008-04-24 22:47 Sage Weil
  2008-04-25 13:41 ` [Btrfs-devel] " Chris Mason
                   ` (2 more replies)
  0 siblings, 3 replies; 32+ messages in thread
From: Sage Weil @ 2008-04-24 22:47 UTC (permalink / raw)
  To: linux-btrfs, btrfs-devel

Hi-

I'm working on a clone ioctl that will quickly and efficiently duplicate 
the contents of a file, e.g.

int main(int argc, const char **argv)
{
  int in = open(argv[1], O_RDONLY);
  int out = open(argv[2], O_CREAT|O_TRUNC|O_WRONLY, 0644);
  ioctl(out, BTRFS_IOC_CLONE, in);
  close(in);
  close(out);
  return 0;
}

I've probably got the locking order a bit wrong, lots of error handling is 
missing, and I suspect there's a cleaner way to do the target inode size 
update, but it behaves well enough in my (limited :) testing.  

Oh, and I wasn't certain the 'offset' in file_extent_item could be safely 
ignored when duplicating the extent reference.  My assumption was that it 
is orthogonal to extent allocation and isn't related to the backref.  
However, btrfs_insert_file_extent() always set offset=0.  I'm guessing I 
need to add an argument there and fix up the other callers?

Anyway, any comments or suggestions (on the interface or implemantation) 
are welcome.. :)

sage


diff -r 1791a620d509 inode.c
--- a/inode.c	Thu Apr 24 13:43:27 2008 -0700
+++ b/inode.c	Thu Apr 24 15:10:17 2008 -0700
@@ -18,6 +18,7 @@
 
 #include <linux/bio.h>
 #include <linux/buffer_head.h>
+#include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
@@ -2726,6 +2727,158 @@ long btrfs_ioctl_trans_end(struct file *
 	return 0;
 }
 
+void dup_item_to_inode(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       struct btrfs_path *path,
+		       struct extent_buffer *leaf,
+		       int slot,
+		       struct btrfs_key *key,
+		       u64 destino)
+{
+	struct btrfs_path *cpath = btrfs_alloc_path();
+	int len = btrfs_item_size_nr(leaf, slot);
+	int dstoff;
+	struct btrfs_key ckey = *key;
+	int ret;
+
+	ckey.objectid = destino;
+	ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len);
+	dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]);
+	copy_extent_buffer(cpath->nodes[0], leaf, dstoff,
+			   btrfs_item_ptr_offset(leaf, slot),
+			   len);
+	btrfs_release_path(root, cpath);
+}
+
+long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
+{
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct file *src_file;
+	struct inode *src;
+	struct btrfs_trans_handle *trans;
+	int ret;
+	u64 pos;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	u32 nritems;
+	int nextret;
+	int slot;
+
+	src_file = fget(src_fd);
+	if (!src_file)
+		return -EBADF;
+	src = src_file->f_dentry->d_inode;
+
+	ret = -EXDEV;
+	if (src->i_sb != inode->i_sb)
+		goto out_fput;
+
+	if (inode < src) {
+		mutex_lock(&inode->i_mutex);
+		mutex_lock(&src->i_mutex);
+	} else {
+		mutex_lock(&src->i_mutex);
+		mutex_lock(&inode->i_mutex);	
+	}
+	
+	ret = -ENOTEMPTY;
+	if (inode->i_size)
+		goto out_unlock;
+
+	/* do any pending delalloc/csum calc on src, one way or another */
+	filemap_write_and_wait(src->i_mapping);
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 0);
+	path = btrfs_alloc_path();
+	pos = 0;
+	while (1) {
+		ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino,
+					       pos, 0);
+		if (ret < 0)
+			goto out;
+		if (ret > 0) {
+			if (path->slots[0] == 0) {
+				ret = 0;
+				goto out;
+			}
+			path->slots[0]--;
+		}
+	next_slot:
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+
+		printk("key(%llu %x %llu)\n",
+		       key.objectid, key.type, key.offset);
+		if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY ||
+		    key.objectid != src->i_ino)
+			goto out;
+		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+			struct btrfs_file_extent_item *extent;
+			int found_type;
+			u64 len;
+			pos = key.offset;
+			extent = btrfs_item_ptr(leaf, slot,
+						struct btrfs_file_extent_item);
+			found_type = btrfs_file_extent_type(leaf, extent);
+			len = btrfs_file_extent_num_bytes(leaf, extent);
+			if (found_type == BTRFS_FILE_EXTENT_REG) {
+				u64 ds = btrfs_file_extent_disk_bytenr(leaf, 
+								       extent);
+				u64 dl = btrfs_file_extent_disk_num_bytes(leaf, 
+								 extent);
+				printk(" %llu~%llu disk %llu~%llu off %llu\n",
+				       pos, len, ds, dl,
+				       btrfs_file_extent_offset(leaf, extent));
+				btrfs_insert_file_extent(trans, root, 
+							 inode->i_ino, pos,
+							 ds, dl, len);
+				btrfs_inc_extent_ref(trans, root, ds, dl,
+						     root->root_key.objectid,
+						     trans->transid,
+						     inode->i_ino, pos);
+			} else if (found_type == BTRFS_FILE_EXTENT_INLINE)
+				dup_item_to_inode(trans, root, path, leaf, slot,
+						  &key, inode->i_ino);
+			pos = key.offset + len;
+		} else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY)
+			dup_item_to_inode(trans, root, path, leaf, slot, &key, 
+					  inode->i_ino);
+		
+		nritems = btrfs_header_nritems(leaf);
+		if (slot >= nritems - 1) {
+			nextret = btrfs_next_leaf(root, path);
+			if (nextret)
+				goto out;
+		} else {
+			path->slots[0]++;
+		}
+		goto next_slot;
+	}
+
+out:
+	ret = 0;
+	mutex_unlock(&root->fs_info->fs_mutex);	
+
+	i_size_write(inode, src->i_size);
+	inode->i_blocks = src->i_blocks;
+	mark_inode_dirty(inode);
+
+	mutex_lock(&root->fs_info->fs_mutex);	
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);	
+
+out_unlock:
+	mutex_unlock(&src->i_mutex);
+	mutex_unlock(&inode->i_mutex);
+out_fput:
+	fput(src_file);
+	return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -2744,6 +2897,9 @@ long btrfs_ioctl(struct file *file, unsi
 		return btrfs_ioctl_trans_end(file);
 	case BTRFS_IOC_SYNC:
 		btrfs_sync_fs(file->f_dentry->d_sb, 1);
+		return 0;
+	case BTRFS_IOC_CLONE:
+		btrfs_ioctl_clone(file, arg);
 		return 0;
 	}
 
diff -r 1791a620d509 ioctl.h
--- a/ioctl.h	Thu Apr 24 13:43:27 2008 -0700
+++ b/ioctl.h	Thu Apr 24 15:10:17 2008 -0700
@@ -36,5 +36,6 @@ struct btrfs_ioctl_vol_args {
 #define BTRFS_IOC_TRANS_START  _IO(BTRFS_IOCTL_MAGIC, 6)
 #define BTRFS_IOC_TRANS_END    _IO(BTRFS_IOCTL_MAGIC, 7)
 #define BTRFS_IOC_SYNC         _IO(BTRFS_IOCTL_MAGIC, 8)
+#define BTRFS_IOC_CLONE        _IOW(BTRFS_IOCTL_MAGIC, 9, int)
 #endif
 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [Btrfs-devel] cloning file data
  2008-04-24 22:47 cloning file data Sage Weil
@ 2008-04-25 13:41 ` Chris Mason
  2008-04-25 16:50   ` Zach Brown
                     ` (3 more replies)
  2008-04-29 20:52 ` Chris Mason
  2008-05-02 20:50 ` Chris Mason
  2 siblings, 4 replies; 32+ messages in thread
From: Chris Mason @ 2008-04-25 13:41 UTC (permalink / raw)
  To: btrfs-devel; +Cc: Sage Weil, linux-btrfs

On Thursday 24 April 2008, Sage Weil wrote:
> Hi-
>
> I'm working on a clone ioctl that will quickly and efficiently duplicate
> the contents of a file, e.g.

Very cool.  I'd actually loved to see this wrapped into a program that will 
cow a directory tree.  Basically the same as cp -al, but with cow instead of 
linking.

>
> int main(int argc, const char **argv)
> {
>   int in = open(argv[1], O_RDONLY);
>   int out = open(argv[2], O_CREAT|O_TRUNC|O_WRONLY, 0644);
>   ioctl(out, BTRFS_IOC_CLONE, in);
>   close(in);
>   close(out);
>   return 0;
> }
>
> I've probably got the locking order a bit wrong, lots of error handling is
> missing, and I suspect there's a cleaner way to do the target inode size
> update, but it behaves well enough in my (limited :) testing.
>
> Oh, and I wasn't certain the 'offset' in file_extent_item could be safely
> ignored when duplicating the extent reference.  My assumption was that it
> is orthogonal to extent allocation and isn't related to the backref.
> However, btrfs_insert_file_extent() always set offset=0.  I'm guessing I
> need to add an argument there and fix up the other callers?
>
Yes, you need to preserve the offset.  There's only one place right now that 
sets a non-zero offset and it inserts the extent by hand for other reasons 
(if you're brave, file.c:btrfs_drop_extents)

The reason file extents have an offset field is to allow COW without 
read/modify/write.  Picture something like this:

# create a single 100MB extent in file foo
dd if=/dev/zero of=foo bs=1M count=100
sync

# write into the middle
dd if=/dev/zero of=foo bs=4k count=1 seek=100 conv=notrunc
sync

We've written into the middle of that 100MB extent, and we need to do COW.  
One option is to read the whole thing, change 4k and write it all back.  
Instead, btrfs does something like this (+/- off by need more coffee errors):

file pos = 0 -> [ old extent, offset = 0, num_bytes = 400k ]
file pos = 409600 -> [ new 4k extent, offset = 0, num_bytes = 4k ]
file pos = 413696 -> [ old extent, offset = 413696, num_bytes = 100MB - 404k]

An extra reference is taken on the old extent to reflect that we're pointing 
to it twice.

> Anyway, any comments or suggestions (on the interface or implemantation)
> are welcome.. :)
>
By taking the inode mutex, you protect against file_write and truncates 
changing the file.  But, we also need to prevent mmaps from changing the file 
pages as well.  What you want to do lock all the file bytes in the extent 
tree:

lock_extent(&BTRFS_I(src_inode)->io_tree, 0, (u64)-1, GFP_NOFS);

But unfortunately, the code to fill delayed allocation takes that same lock.  
So you need to loop a bit:

while(1) {
    filemap_write_and_wait(src_inode);
    lock_extent()
    if (BTRFS_I(src_inode)->delalloc_bytes == 0)
        break;
    unlock_extent()
}

That should keep you from racing with btrfs_page_mkwrite()

-chris

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [Btrfs-devel] cloning file data
  2008-04-25 13:41 ` [Btrfs-devel] " Chris Mason
@ 2008-04-25 16:50   ` Zach Brown
  2008-04-25 16:58     ` Chris Mason
  2008-04-25 16:50   ` Zach Brown
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 32+ messages in thread
From: Zach Brown @ 2008-04-25 16:50 UTC (permalink / raw)
  To: Sage Weil; +Cc: Chris Mason, btrfs-devel, linux-btrfs

> We've written into the middle of that 100MB extent, and we need to do COW.  
> One option is to read the whole thing, change 4k and write it all back.  
> Instead, btrfs does something like this (+/- off by need more coffee errors):
> 
> file pos = 0 -> [ old extent, offset = 0, num_bytes = 400k ]
> file pos = 409600 -> [ new 4k extent, offset = 0, num_bytes = 4k ]
> file pos = 413696 -> [ old extent, offset = 413696, num_bytes = 100MB - 404k]
> 
> An extra reference is taken on the old extent to reflect that we're pointing 
> to it twice.

If you learn how to parse the debug-tree output then this can be seen
pretty easily.  To do this we can watch the leaves of the fs tree for
the inode and extent items of the file we work with:

# dd if=/dev/zero bs=1M count=1k of=/tmp/image
# losetup /dev/loop0 /tmp/image
# ./mkfs.btrfs /dev/loop0
# mount -t btrfs /dev/loop0 /mnt/btrfs

# dd if=/dev/zero bs=64M count=1 of=/mnt/btrfs/test
# sync

# ./debug-tree /tmp/image

	item 5 key (256 11 258) itemoff 3779 itemsize 26
		dir index 258 type 1
		namelen 4 datalen 0 name: test
	[...]
	item 1 key (258 1 0) itemoff 2699 itemsize 108
		inode generation 0 size 67108864 [...]
	[...]
	item 3 key (258 12 0) itemoff 2652 itemsize 41
		extent data disk byte 190382080 nr 67108864
		extent data offset 0 nr 67108864

In the root directory we found a dirent for our test file which shows it
has objectid 258, then we found its inode with size=64m and the file
extent which references the 64m extent on disk which starts at byte
offset 190382080.

So now we over-write a 4k region in the file at offset 64k.

# dd if=/dev/zero bs=4k count=1 seek=16 of=/mnt/btrfs/test conv=notrunc
# sync

# ./debug-tree /tmp/image

	item 1 key (258 1 0) itemoff 2699 itemsize 108
		inode generation 0 size 67108864 [...]
	[...]
	item 3 key (258 12 0) itemoff 2652 itemsize 41
		extent data disk byte 190382080 nr 67108864
		extent data offset 0 nr 65536
	item 4 key (258 12 65536) itemoff 2611 itemsize 41
		extent data disk byte 257490944 nr 4096
		extent data offset 0 nr 4096
	item 5 key (258 12 69632) itemoff 2570 itemsize 41
		extent data disk byte 190382080 nr 67108864
		extent data offset 69632 nr 67039232

We still have the same inode, and it has the same size, but its extent
items look very different.  The extent for the first 64k looks much the
same.  It references the old 64m extent on disk.  But see the 'nr
65536', it only maps 64k of that 64m into the file.  Then we have the 4k
extent that we just wrote.  Then we have another reference to that 64m
extent but for the remaining data after the new 4k.

The extra credit assignment is to observe the effect of these extent
reference item changes on the reference count items which are stored
over in the leaves of the extent allocation tree.

debug-tree is fantastic, but it can be kind of intimidating if you don't
already know what all the numbers mean :).  Reducing the barrier to
understanding its output might be a great project for someone interested
in learning the disk format without having to learn how to work with the
kernel code.

- z

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [Btrfs-devel] cloning file data
  2008-04-25 13:41 ` [Btrfs-devel] " Chris Mason
  2008-04-25 16:50   ` Zach Brown
@ 2008-04-25 16:50   ` Zach Brown
  2008-04-25 18:32     ` Sage Weil
  2008-04-25 18:26   ` Sage Weil
  2008-04-25 20:28   ` [Btrfs-devel] cloning file data Sage Weil
  3 siblings, 1 reply; 32+ messages in thread
From: Zach Brown @ 2008-04-25 16:50 UTC (permalink / raw)
  To: Sage Weil; +Cc: Chris Mason, btrfs-devel, linux-btrfs

> We've written into the middle of that 100MB extent, and we need to do COW.  
> One option is to read the whole thing, change 4k and write it all back.  
> Instead, btrfs does something like this (+/- off by need more coffee errors):
> 
> file pos = 0 -> [ old extent, offset = 0, num_bytes = 400k ]
> file pos = 409600 -> [ new 4k extent, offset = 0, num_bytes = 4k ]
> file pos = 413696 -> [ old extent, offset = 413696, num_bytes = 100MB - 404k]
> 
> An extra reference is taken on the old extent to reflect that we're pointing 
> to it twice.

If you learn how to parse the debug-tree output then this can be seen
pretty easily.  To do this we can watch the leaves of the fs tree for
the inode and extent items of the file we work with:

# dd if=/dev/zero bs=1M count=1k of=/tmp/image
# losetup /dev/loop0 /tmp/image
# ./mkfs.btrfs /dev/loop0
# mount -t btrfs /dev/loop0 /mnt/btrfs

# dd if=/dev/zero bs=64M count=1 of=/mnt/btrfs/test
# sync

# ./debug-tree /tmp/image

	item 5 key (256 11 258) itemoff 3779 itemsize 26
		dir index 258 type 1
		namelen 4 datalen 0 name: test
	[...]
	item 1 key (258 1 0) itemoff 2699 itemsize 108
		inode generation 0 size 67108864 [...]
	[...]
	item 3 key (258 12 0) itemoff 2652 itemsize 41
		extent data disk byte 190382080 nr 67108864
		extent data offset 0 nr 67108864

In the root directory we found a dirent for our test file which shows it
has objectid 258, then we found its inode with size=64m and the file
extent which references the 64m extent on disk which starts at byte
offset 190382080.

So now we over-write a 4k region in the file at offset 64k.

# dd if=/dev/zero bs=4k count=1 seek=16 of=/mnt/btrfs/test conv=notrunc
# sync

# ./debug-tree /tmp/image

	item 1 key (258 1 0) itemoff 2699 itemsize 108
		inode generation 0 size 67108864 [...]
	[...]
	item 3 key (258 12 0) itemoff 2652 itemsize 41
		extent data disk byte 190382080 nr 67108864
		extent data offset 0 nr 65536
	item 4 key (258 12 65536) itemoff 2611 itemsize 41
		extent data disk byte 257490944 nr 4096
		extent data offset 0 nr 4096
	item 5 key (258 12 69632) itemoff 2570 itemsize 41
		extent data disk byte 190382080 nr 67108864
		extent data offset 69632 nr 67039232

We still have the same inode, and it has the same size, but its extent
items look very different.  The extent for the first 64k looks much the
same.  It references the old 64m extent on disk.  But see the 'nr
65536', it only maps 64k of that 64m into the file.  Then we have the 4k
extent that we just wrote.  Then we have another reference to that 64m
extent but for the remaining data after the new 4k.

The extra credit assignment is to observe the effect of these extent
reference item changes on the reference count items which are stored
over in the leaves of the extent allocation tree.

debug-tree is fantastic, but it can be kind of intimidating if you don't
already know what all the numbers mean :).  Reducing the barrier to
understanding its output might be a great project for someone interested
in learning the disk format without having to learn how to work with the
kernel code.

- z

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [Btrfs-devel] cloning file data
  2008-04-25 16:50   ` Zach Brown
@ 2008-04-25 16:58     ` Chris Mason
  2008-04-25 17:04       ` Zach Brown
  0 siblings, 1 reply; 32+ messages in thread
From: Chris Mason @ 2008-04-25 16:58 UTC (permalink / raw)
  To: Zach Brown; +Cc: Sage Weil, btrfs-devel, linux-btrfs

On Friday 25 April 2008, Zach Brown wrote:
> > We've written into the middle of that 100MB extent, and we need to do
> > COW. One option is to read the whole thing, change 4k and write it all
> > back. Instead, btrfs does something like this (+/- off by need more
> > coffee errors):
> >
> > file pos = 0 -> [ old extent, offset = 0, num_bytes = 400k ]
> > file pos = 409600 -> [ new 4k extent, offset = 0, num_bytes = 4k ]
> > file pos = 413696 -> [ old extent, offset = 413696, num_bytes = 100MB -
> > 404k]
> >
> > An extra reference is taken on the old extent to reflect that we're
> > pointing to it twice.
>
> If you learn how to parse the debug-tree output then this can be seen
> pretty easily.  To do this we can watch the leaves of the fs tree for
> the inode and extent items of the file we work with:

Thanks for this example Zach, it's great.  One small note:

>
> # dd if=/dev/zero bs=1M count=1k of=/tmp/image
> # losetup /dev/loop0 /tmp/image
> # ./mkfs.btrfs /dev/loop0
> # mount -t btrfs /dev/loop0 /mnt/btrfs
>
> # dd if=/dev/zero bs=64M count=1 of=/mnt/btrfs/test
> # sync
>
> # ./debug-tree /tmp/image

Running debug-tree on a live FS is a very good way to learn about trees that 
get left around while snapshot deletion is happening and cache aliasing 
caused by the way Btrfs puts metadata into its own address space.

But, if you're trying to learn the disk format, I'd stick an unmount between 
the dd and the debug-tree ;)

-chris

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [Btrfs-devel] cloning file data
  2008-04-25 16:58     ` Chris Mason
@ 2008-04-25 17:04       ` Zach Brown
  0 siblings, 0 replies; 32+ messages in thread
From: Zach Brown @ 2008-04-25 17:04 UTC (permalink / raw)
  To: Chris Mason; +Cc: Sage Weil, btrfs-devel, linux-btrfs


> Running debug-tree on a live FS is a very good way to learn about trees that 
> get left around while snapshot deletion is happening and cache aliasing 
> caused by the way Btrfs puts metadata into its own address space.
> 
> But, if you're trying to learn the disk format, I'd stick an unmount between 
> the dd and the debug-tree ;)

Haha, true, true.  If you don't know to watch for confusing racey
results, indeed, serialize around each dump :).

- z

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [Btrfs-devel] cloning file data
  2008-04-25 13:41 ` [Btrfs-devel] " Chris Mason
  2008-04-25 16:50   ` Zach Brown
  2008-04-25 16:50   ` Zach Brown
@ 2008-04-25 18:26   ` Sage Weil
  2008-04-26  4:38     ` Sage Weil
  2008-04-25 20:28   ` [Btrfs-devel] cloning file data Sage Weil
  3 siblings, 1 reply; 32+ messages in thread
From: Sage Weil @ 2008-04-25 18:26 UTC (permalink / raw)
  To: Chris Mason; +Cc: btrfs-devel, linux-btrfs

On Fri, 25 Apr 2008, Chris Mason wrote:
> Very cool.  I'd actually loved to see this wrapped into a program that will 
> cow a directory tree.  Basically the same as cp -al, but with cow instead of 
> linking.

Yeah definitely.  I added a -c/--cow flag to GNU cp, but am having trouble 
coercing autotools into even building on my box.  I'll fiddle with it a 
little later.  Basically, it just tries the ioctl, and goes into the 
regular copy read/write loop if that fails.

> > However, btrfs_insert_file_extent() always set offset=0.  I'm guessing I
> > need to add an argument there and fix up the other callers?
> >
> Yes, you need to preserve the offset.  There's only one place right now that 
> sets a non-zero offset and it inserts the extent by hand for other reasons 
> (if you're brave, file.c:btrfs_drop_extents)

I see.  In this case, since I'm duplicating the forward and backrefs, I 
just added the offset arg to btrfs_insert_file_extent().

> By taking the inode mutex, you protect against file_write and truncates 
> changing the file.  But, we also need to prevent mmaps from changing the file 
> pages as well.  What you want to do lock all the file bytes in the extent 
> tree:
> 
> lock_extent(&BTRFS_I(src_inode)->io_tree, 0, (u64)-1, GFP_NOFS);
> 
> But unfortunately, the code to fill delayed allocation takes that same lock.  
> So you need to loop a bit:
> 
> while(1) {
>     filemap_write_and_wait(src_inode);
>     lock_extent()
>     if (BTRFS_I(src_inode)->delalloc_bytes == 0)
>         break;
>     unlock_extent()
> }
> 
> That should keep you from racing with btrfs_page_mkwrite()

Ah, that's what I was looking for.  The adjusted patch is below!

Thanks-
sage



diff -r 1791a620d509 ctree.h
--- a/ctree.h	Thu Apr 24 13:43:27 2008 -0700
+++ b/ctree.h	Fri Apr 25 10:12:46 2008 -0700
@@ -1135,9 +1135,9 @@ int btrfs_lookup_inode(struct btrfs_tran
 /* file-item.c */
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
-			       u64 objectid, u64 pos, u64 offset,
+			       u64 objectid, u64 pos, u64 disk_offset,
 			       u64 disk_num_bytes,
-			       u64 num_bytes);
+			     u64 num_bytes, u64 offset);
 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
diff -r 1791a620d509 file-item.c
--- a/file-item.c	Thu Apr 24 13:43:27 2008 -0700
+++ b/file-item.c	Fri Apr 25 10:12:46 2008 -0700
@@ -28,10 +28,10 @@
 			       sizeof(struct btrfs_item) * 2) / \
 			       BTRFS_CRC32_SIZE) - 1))
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root,
-			       u64 objectid, u64 pos,
-			       u64 offset, u64 disk_num_bytes,
-			       u64 num_bytes)
+			     struct btrfs_root *root,
+			     u64 objectid, u64 pos,
+			     u64 disk_offset, u64 disk_num_bytes,
+			     u64 num_bytes, u64 offset)
 {
 	int ret = 0;
 	struct btrfs_file_extent_item *item;
@@ -53,9 +53,9 @@ int btrfs_insert_file_extent(struct btrf
 	leaf = path->nodes[0];
 	item = btrfs_item_ptr(leaf, path->slots[0],
 			      struct btrfs_file_extent_item);
-	btrfs_set_file_extent_disk_bytenr(leaf, item, offset);
+	btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
 	btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
-	btrfs_set_file_extent_offset(leaf, item, 0);
+	btrfs_set_file_extent_offset(leaf, item, offset);
 	btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
 	btrfs_set_file_extent_generation(leaf, item, trans->transid);
 	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
diff -r 1791a620d509 file.c
--- a/file.c	Thu Apr 24 13:43:27 2008 -0700
+++ b/file.c	Fri Apr 25 10:12:46 2008 -0700
@@ -285,7 +285,7 @@ static int noinline dirty_and_release_pa
 			err = btrfs_insert_file_extent(trans, root,
 						       inode->i_ino,
 						       last_pos_in_file,
-						       0, 0, hole_size);
+						       0, 0, hole_size, 0);
 			btrfs_drop_extent_cache(inode, last_pos_in_file,
 					last_pos_in_file + hole_size -1);
 			btrfs_check_file(root, inode);
diff -r 1791a620d509 inode.c
--- a/inode.c	Thu Apr 24 13:43:27 2008 -0700
+++ b/inode.c	Fri Apr 25 10:12:46 2008 -0700
@@ -18,6 +18,7 @@
 
 #include <linux/bio.h>
 #include <linux/buffer_head.h>
+#include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
@@ -134,7 +135,7 @@ static int cow_file_range(struct inode *
 		}
 		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
 					       start, ins.objectid, ins.offset,
-					       ins.offset);
+					       ins.offset, 0);
 		inode->i_blocks += ins.offset >> 9;
 		btrfs_check_file(root, inode);
 		num_bytes -= cur_alloc_size;
@@ -1046,7 +1047,7 @@ static int btrfs_setattr(struct dentry *
 			err = btrfs_insert_file_extent(trans, root,
 						       inode->i_ino,
 						       hole_start, 0, 0,
-						       hole_size);
+						       hole_size, 0);
 			btrfs_drop_extent_cache(inode, hole_start,
 						hole_size - 1);
 			btrfs_check_file(root, inode);
@@ -2726,6 +2727,168 @@ long btrfs_ioctl_trans_end(struct file *
 	return 0;
 }
 
+void dup_item_to_inode(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       struct btrfs_path *path,
+		       struct extent_buffer *leaf,
+		       int slot,
+		       struct btrfs_key *key,
+		       u64 destino)
+{
+	struct btrfs_path *cpath = btrfs_alloc_path();
+	int len = btrfs_item_size_nr(leaf, slot);
+	int dstoff;
+	struct btrfs_key ckey = *key;
+	int ret;
+
+	ckey.objectid = destino;
+	ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len);
+	dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]);
+	copy_extent_buffer(cpath->nodes[0], leaf, dstoff,
+			   btrfs_item_ptr_offset(leaf, slot),
+			   len);
+	btrfs_release_path(root, cpath);
+}
+
+long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
+{
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct file *src_file;
+	struct inode *src;
+	struct btrfs_trans_handle *trans;
+	int ret;
+	u64 pos;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	u32 nritems;
+	int nextret;
+	int slot;
+
+	src_file = fget(src_fd);
+	if (!src_file)
+		return -EBADF;
+	src = src_file->f_dentry->d_inode;
+
+	ret = -EXDEV;
+	if (src->i_sb != inode->i_sb)
+		goto out_fput;
+
+	if (inode < src) {
+		mutex_lock(&inode->i_mutex);
+		mutex_lock(&src->i_mutex);
+	} else {
+		mutex_lock(&src->i_mutex);
+		mutex_lock(&inode->i_mutex);	
+	}
+	
+	ret = -ENOTEMPTY;
+	if (inode->i_size)
+		goto out_unlock;
+
+	/* do any pending delalloc/csum calc on src, one way or
+	   another, and lock file content */
+	while (1) {
+		filemap_write_and_wait(src->i_mapping);
+		lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+		if (BTRFS_I(src)->delalloc_bytes == 0)
+			break;
+		unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+	}
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 0);
+	path = btrfs_alloc_path();
+	pos = 0;
+	while (1) {
+		ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino,
+					       pos, 0);
+		if (ret < 0)
+			goto out;
+		if (ret > 0) {
+			if (path->slots[0] == 0) {
+				ret = 0;
+				goto out;
+			}
+			path->slots[0]--;
+		}
+	next_slot:
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+
+		printk("key(%llu %x %llu)\n",
+		       key.objectid, key.type, key.offset);
+		if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY ||
+		    key.objectid != src->i_ino)
+			goto out;
+		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+			struct btrfs_file_extent_item *extent;
+			int found_type;
+			u64 len;
+			pos = key.offset;
+			extent = btrfs_item_ptr(leaf, slot,
+						struct btrfs_file_extent_item);
+			found_type = btrfs_file_extent_type(leaf, extent);
+			len = btrfs_file_extent_num_bytes(leaf, extent);
+			if (found_type == BTRFS_FILE_EXTENT_REG) {
+				u64 ds = btrfs_file_extent_disk_bytenr(leaf, 
+								       extent);
+				u64 dl = btrfs_file_extent_disk_num_bytes(leaf, 
+								 extent);
+				u64 off = btrfs_file_extent_offset(leaf, 
+								   extent);
+				printk(" %llu~%llu disk %llu~%llu off %llu\n",
+				       pos, len, ds, dl, off);
+				btrfs_insert_file_extent(trans, root, 
+							 inode->i_ino, pos,
+							 ds, dl, len, off);
+				btrfs_inc_extent_ref(trans, root, ds, dl,
+						     root->root_key.objectid,
+						     trans->transid,
+						     inode->i_ino, pos);
+			} else if (found_type == BTRFS_FILE_EXTENT_INLINE)
+				dup_item_to_inode(trans, root, path, leaf, slot,
+						  &key, inode->i_ino);
+			pos = key.offset + len;
+		} else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY)
+			dup_item_to_inode(trans, root, path, leaf, slot, &key, 
+					  inode->i_ino);
+		
+		nritems = btrfs_header_nritems(leaf);
+		if (slot >= nritems - 1) {
+			nextret = btrfs_next_leaf(root, path);
+			if (nextret)
+				goto out;
+		} else {
+			path->slots[0]++;
+		}
+		goto next_slot;
+	}
+
+out:
+	ret = 0;
+	mutex_unlock(&root->fs_info->fs_mutex);	
+
+	i_size_write(inode, src->i_size);
+	inode->i_blocks = src->i_blocks;
+	mark_inode_dirty(inode);
+
+	unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+
+	mutex_lock(&root->fs_info->fs_mutex);	
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);	
+
+out_unlock:
+	mutex_unlock(&src->i_mutex);
+	mutex_unlock(&inode->i_mutex);
+out_fput:
+	fput(src_file);
+	return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -2744,6 +2907,9 @@ long btrfs_ioctl(struct file *file, unsi
 		return btrfs_ioctl_trans_end(file);
 	case BTRFS_IOC_SYNC:
 		btrfs_sync_fs(file->f_dentry->d_sb, 1);
+		return 0;
+	case BTRFS_IOC_CLONE:
+		btrfs_ioctl_clone(file, arg);
 		return 0;
 	}
 
diff -r 1791a620d509 ioctl.h
--- a/ioctl.h	Thu Apr 24 13:43:27 2008 -0700
+++ b/ioctl.h	Fri Apr 25 10:12:46 2008 -0700
@@ -36,5 +36,6 @@ struct btrfs_ioctl_vol_args {
 #define BTRFS_IOC_TRANS_START  _IO(BTRFS_IOCTL_MAGIC, 6)
 #define BTRFS_IOC_TRANS_END    _IO(BTRFS_IOCTL_MAGIC, 7)
 #define BTRFS_IOC_SYNC         _IO(BTRFS_IOCTL_MAGIC, 8)
+#define BTRFS_IOC_CLONE        _IOW(BTRFS_IOCTL_MAGIC, 9, int)
 #endif
 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [Btrfs-devel] cloning file data
  2008-04-25 16:50   ` Zach Brown
@ 2008-04-25 18:32     ` Sage Weil
  0 siblings, 0 replies; 32+ messages in thread
From: Sage Weil @ 2008-04-25 18:32 UTC (permalink / raw)
  To: Zach Brown; +Cc: Chris Mason, btrfs-devel, linux-btrfs

On Fri, 25 Apr 2008, Zach Brown wrote:
> We still have the same inode, and it has the same size, but its extent
> items look very different.  The extent for the first 64k looks much the
> same.  It references the old 64m extent on disk.  But see the 'nr
> 65536', it only maps 64k of that 64m into the file.  Then we have the 4k
> extent that we just wrote.  Then we have another reference to that 64m
> extent but for the remaining data after the new 4k.

Is there anything in the defragger (or whatever) that looks for minimally 
referenced extents?  Once can imagine a situation where only a small piece 
of a large extent is remains referenced, but that information is buried in 
the forward reference(s).

> debug-tree is fantastic, but it can be kind of intimidating if you don't
> already know what all the numbers mean :).  Reducing the barrier to

Yep.  Although in my case, the biggest stumbling block was realizing that 
the key type

> 	item 3 key (258 12 0) itemoff 2652 itemsize 41
                        ^^
is in printed in hex for some reason.  Der.

sage

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [Btrfs-devel] cloning file data
  2008-04-25 13:41 ` [Btrfs-devel] " Chris Mason
                     ` (2 preceding siblings ...)
  2008-04-25 18:26   ` Sage Weil
@ 2008-04-25 20:28   ` Sage Weil
  3 siblings, 0 replies; 32+ messages in thread
From: Sage Weil @ 2008-04-25 20:28 UTC (permalink / raw)
  To: Chris Mason; +Cc: btrfs-devel, linux-btrfs

On Fri, 25 Apr 2008, Chris Mason wrote:
> Very cool.  I'd actually loved to see this wrapped into a program that will 
> cow a directory tree.  Basically the same as cp -al, but with cow instead of 
> linking.

Here's a pretty trivial patch against cp in coreutils-6.10.  It'll 
probably take you more time to build than to make the changes yourself, 
but this seems much handier and more robust than a separate tool.

$ cp -ac /mnt/btrfs/from /mnt/btrfs/to
...

sage


diff -ur coreutils-6.10/src/copy.c coreutils-6.10-btrfs/src/copy.c
--- coreutils-6.10/src/copy.c	2008-01-05 14:59:11.000000000 -0800
+++ coreutils-6.10-btrfs/src/copy.c	2008-04-25 13:12:20.000000000 -0700
@@ -537,6 +537,9 @@
     buf_alloc = xmalloc (buf_size + buf_alignment_slop);
     buf = ptr_align (buf_alloc, buf_alignment);
 
+#define BTRFS_IOC_CLONE 0x40049409
+    if (!x->cow ||
+	ioctl(dest_desc, BTRFS_IOC_CLONE, source_desc) != 0)
     for (;;)
       {
 	word *wp = NULL;
diff -ur coreutils-6.10/src/copy.h coreutils-6.10-btrfs/src/copy.h
--- coreutils-6.10/src/copy.h	2008-01-05 14:58:25.000000000 -0800
+++ coreutils-6.10-btrfs/src/copy.h	2008-04-25 12:46:30.000000000 -0700
@@ -135,6 +135,8 @@
      on different file systems from the one we started on.  */
   bool one_file_system;
 
+  bool cow;
+
   /* If true, attempt to give the copies the original files' permissions,
      owner, group, and timestamps. */
   bool preserve_ownership;
diff -ur coreutils-6.10/src/cp.c coreutils-6.10-btrfs/src/cp.c
--- coreutils-6.10/src/cp.c	2008-01-11 03:19:53.000000000 -0800
+++ coreutils-6.10-btrfs/src/cp.c	2008-04-25 13:23:12.000000000 -0700
@@ -125,6 +125,7 @@
   {"archive", no_argument, NULL, 'a'},
   {"backup", optional_argument, NULL, 'b'},
   {"copy-contents", no_argument, NULL, COPY_CONTENTS_OPTION},
+  {"cow", no_argument, NULL, 'c'},
   {"dereference", no_argument, NULL, 'L'},
   {"force", no_argument, NULL, 'f'},
   {"interactive", no_argument, NULL, 'i'},
@@ -178,6 +179,7 @@
       --backup[=CONTROL]       make a backup of each existing destination file\n\
   -b                           like --backup but does not accept an argument\n\
       --copy-contents          copy contents of special files when recursive\n\
+  -c, --cow                    attempt issuing copy-on-write ioctl to fs\n\
   -d                           same as --no-dereference --preserve=links\n\
 "), stdout);
       fputs (_("\
@@ -767,6 +769,7 @@
   x->interactive = I_UNSPECIFIED;
   x->move_mode = false;
   x->one_file_system = false;
+  x->cow = false;
 
   x->preserve_ownership = false;
   x->preserve_links = false;
@@ -909,7 +912,7 @@
      we'll actually use backup_suffix_string.  */
   backup_suffix_string = getenv ("SIMPLE_BACKUP_SUFFIX");
 
-  while ((c = getopt_long (argc, argv, "abdfHilLprst:uvxPRS:T",
+  while ((c = getopt_long (argc, argv, "abcdfHilLprst:uvxPRS:T",
 			   long_opts, NULL))
 	 != -1)
     {
@@ -940,6 +943,10 @@
 	  copy_contents = true;
 	  break;
 
+	case 'c':
+	  x.cow = true;
+	  break;
+
 	case 'd':
 	  x.preserve_links = true;
 	  x.dereference = DEREF_NEVER;

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [Btrfs-devel] cloning file data
  2008-04-25 18:26   ` Sage Weil
@ 2008-04-26  4:38     ` Sage Weil
  2008-05-03  4:44       ` Yan Zheng
  0 siblings, 1 reply; 32+ messages in thread
From: Sage Weil @ 2008-04-26  4:38 UTC (permalink / raw)
  To: Chris Mason; +Cc: btrfs-devel, linux-btrfs

Hi-

Couple small fixes.  Cloning a linux kernel tree (cp -ac) takes 5-6 
seconds (cp -al is 3-4 seconds, cp -a is 15-20 seconds).

sage



diff -r 1791a620d509 ctree.h
--- a/ctree.h	Thu Apr 24 13:43:27 2008 -0700
+++ b/ctree.h	Fri Apr 25 21:14:25 2008 -0700
@@ -1135,9 +1135,9 @@ int btrfs_lookup_inode(struct btrfs_tran
 /* file-item.c */
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
-			       u64 objectid, u64 pos, u64 offset,
+			       u64 objectid, u64 pos, u64 disk_offset,
 			       u64 disk_num_bytes,
-			       u64 num_bytes);
+			     u64 num_bytes, u64 offset);
 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
diff -r 1791a620d509 file-item.c
--- a/file-item.c	Thu Apr 24 13:43:27 2008 -0700
+++ b/file-item.c	Fri Apr 25 21:14:25 2008 -0700
@@ -28,10 +28,10 @@
 			       sizeof(struct btrfs_item) * 2) / \
 			       BTRFS_CRC32_SIZE) - 1))
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root,
-			       u64 objectid, u64 pos,
-			       u64 offset, u64 disk_num_bytes,
-			       u64 num_bytes)
+			     struct btrfs_root *root,
+			     u64 objectid, u64 pos,
+			     u64 disk_offset, u64 disk_num_bytes,
+			     u64 num_bytes, u64 offset)
 {
 	int ret = 0;
 	struct btrfs_file_extent_item *item;
@@ -53,9 +53,9 @@ int btrfs_insert_file_extent(struct btrf
 	leaf = path->nodes[0];
 	item = btrfs_item_ptr(leaf, path->slots[0],
 			      struct btrfs_file_extent_item);
-	btrfs_set_file_extent_disk_bytenr(leaf, item, offset);
+	btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
 	btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
-	btrfs_set_file_extent_offset(leaf, item, 0);
+	btrfs_set_file_extent_offset(leaf, item, offset);
 	btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
 	btrfs_set_file_extent_generation(leaf, item, trans->transid);
 	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
diff -r 1791a620d509 file.c
--- a/file.c	Thu Apr 24 13:43:27 2008 -0700
+++ b/file.c	Fri Apr 25 21:14:25 2008 -0700
@@ -285,7 +285,7 @@ static int noinline dirty_and_release_pa
 			err = btrfs_insert_file_extent(trans, root,
 						       inode->i_ino,
 						       last_pos_in_file,
-						       0, 0, hole_size);
+						       0, 0, hole_size, 0);
 			btrfs_drop_extent_cache(inode, last_pos_in_file,
 					last_pos_in_file + hole_size -1);
 			btrfs_check_file(root, inode);
diff -r 1791a620d509 inode.c
--- a/inode.c	Thu Apr 24 13:43:27 2008 -0700
+++ b/inode.c	Fri Apr 25 21:14:25 2008 -0700
@@ -18,6 +18,7 @@
 
 #include <linux/bio.h>
 #include <linux/buffer_head.h>
+#include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
@@ -134,7 +135,7 @@ static int cow_file_range(struct inode *
 		}
 		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
 					       start, ins.objectid, ins.offset,
-					       ins.offset);
+					       ins.offset, 0);
 		inode->i_blocks += ins.offset >> 9;
 		btrfs_check_file(root, inode);
 		num_bytes -= cur_alloc_size;
@@ -1046,7 +1047,7 @@ static int btrfs_setattr(struct dentry *
 			err = btrfs_insert_file_extent(trans, root,
 						       inode->i_ino,
 						       hole_start, 0, 0,
-						       hole_size);
+						       hole_size, 0);
 			btrfs_drop_extent_cache(inode, hole_start,
 						hole_size - 1);
 			btrfs_check_file(root, inode);
@@ -2726,6 +2727,168 @@ long btrfs_ioctl_trans_end(struct file *
 	return 0;
 }
 
+void dup_item_to_inode(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       struct btrfs_path *path,
+		       struct extent_buffer *leaf,
+		       int slot,
+		       struct btrfs_key *key,
+		       u64 destino)
+{
+	struct btrfs_path *cpath = btrfs_alloc_path();
+	int len = btrfs_item_size_nr(leaf, slot);
+	int dstoff;
+	struct btrfs_key ckey = *key;
+	int ret;
+
+	ckey.objectid = destino;
+	ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len);
+	dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]);
+	copy_extent_buffer(cpath->nodes[0], leaf, dstoff,
+			   btrfs_item_ptr_offset(leaf, slot),
+			   len);
+	btrfs_release_path(root, cpath);
+}
+
+long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
+{
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct file *src_file;
+	struct inode *src;
+	struct btrfs_trans_handle *trans;
+	int ret;
+	u64 pos;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	u32 nritems;
+	int nextret;
+	int slot;
+
+	src_file = fget(src_fd);
+	if (!src_file)
+		return -EBADF;
+	src = src_file->f_dentry->d_inode;
+
+	ret = -EXDEV;
+	if (src->i_sb != inode->i_sb)
+		goto out_fput;
+
+	if (inode < src) {
+		mutex_lock(&inode->i_mutex);
+		mutex_lock(&src->i_mutex);
+	} else {
+		mutex_lock(&src->i_mutex);
+		mutex_lock(&inode->i_mutex);	
+	}
+	
+	ret = -ENOTEMPTY;
+	if (inode->i_size)
+		goto out_unlock;
+
+	/* do any pending delalloc/csum calc on src, one way or
+	   another, and lock file content */
+	while (1) {
+		filemap_write_and_wait(src->i_mapping);
+		lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+		if (BTRFS_I(src)->delalloc_bytes == 0)
+			break;
+		unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+	}
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 0);
+	path = btrfs_alloc_path();
+	pos = 0;
+	while (1) {
+		ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino,
+					       pos, 0);
+		if (ret < 0)
+			goto out;
+		if (ret > 0) {
+			if (path->slots[0] == 0) {
+				ret = 0;
+				goto out;
+			}
+			path->slots[0]--;
+		}
+	next_slot:
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		nritems = btrfs_header_nritems(leaf);
+
+		if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY ||
+		    key.objectid != src->i_ino)
+			goto out;
+		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+			struct btrfs_file_extent_item *extent;
+			int found_type;
+			pos = key.offset;
+			extent = btrfs_item_ptr(leaf, slot,
+						struct btrfs_file_extent_item);
+			found_type = btrfs_file_extent_type(leaf, extent);
+			if (found_type == BTRFS_FILE_EXTENT_REG) {
+				u64 len = btrfs_file_extent_num_bytes(leaf, 
+								      extent);
+				u64 ds = btrfs_file_extent_disk_bytenr(leaf, 
+								       extent);
+				u64 dl = btrfs_file_extent_disk_num_bytes(leaf, 
+								 extent);
+				u64 off = btrfs_file_extent_offset(leaf, 
+								   extent);
+				btrfs_insert_file_extent(trans, root, 
+							 inode->i_ino, pos,
+							 ds, dl, len, off);
+				btrfs_inc_extent_ref(trans, root, ds, dl,
+						     root->root_key.objectid,
+						     trans->transid,
+						     inode->i_ino, pos);
+				pos = key.offset + len;
+			} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+				dup_item_to_inode(trans, root, path, leaf, slot,
+						  &key, inode->i_ino);
+				pos = key.offset + btrfs_item_size_nr(leaf, 
+								      slot);
+			}
+		} else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY)
+			dup_item_to_inode(trans, root, path, leaf, slot, &key, 
+					  inode->i_ino);
+		
+		if (slot >= nritems - 1) {
+			nextret = btrfs_next_leaf(root, path);
+			if (nextret)
+				goto out;
+		} else {
+			path->slots[0]++;
+		}
+		goto next_slot;
+	}
+
+out:
+	btrfs_free_path(path);
+	ret = 0;
+	mutex_unlock(&root->fs_info->fs_mutex);	
+
+	i_size_write(inode, src->i_size);
+	inode->i_blocks = src->i_blocks;
+	mark_inode_dirty(inode);
+
+	unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+
+	mutex_lock(&root->fs_info->fs_mutex);	
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);	
+
+out_unlock:
+	mutex_unlock(&src->i_mutex);
+	mutex_unlock(&inode->i_mutex);
+out_fput:
+	fput(src_file);
+	return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -2744,6 +2907,9 @@ long btrfs_ioctl(struct file *file, unsi
 		return btrfs_ioctl_trans_end(file);
 	case BTRFS_IOC_SYNC:
 		btrfs_sync_fs(file->f_dentry->d_sb, 1);
+		return 0;
+	case BTRFS_IOC_CLONE:
+		btrfs_ioctl_clone(file, arg);
 		return 0;
 	}
 
diff -r 1791a620d509 ioctl.h
--- a/ioctl.h	Thu Apr 24 13:43:27 2008 -0700
+++ b/ioctl.h	Fri Apr 25 21:14:25 2008 -0700
@@ -36,5 +36,6 @@ struct btrfs_ioctl_vol_args {
 #define BTRFS_IOC_TRANS_START  _IO(BTRFS_IOCTL_MAGIC, 6)
 #define BTRFS_IOC_TRANS_END    _IO(BTRFS_IOCTL_MAGIC, 7)
 #define BTRFS_IOC_SYNC         _IO(BTRFS_IOCTL_MAGIC, 8)
+#define BTRFS_IOC_CLONE        _IOW(BTRFS_IOCTL_MAGIC, 9, int)
 #endif
 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [Btrfs-devel] cloning file data
  2008-04-24 22:47 cloning file data Sage Weil
  2008-04-25 13:41 ` [Btrfs-devel] " Chris Mason
@ 2008-04-29 20:52 ` Chris Mason
  2008-05-02 20:50 ` Chris Mason
  2 siblings, 0 replies; 32+ messages in thread
From: Chris Mason @ 2008-04-29 20:52 UTC (permalink / raw)
  To: btrfs-devel; +Cc: Sage Weil, linux-btrfs

On Thursday 24 April 2008, Sage Weil wrote:
> Hi-
>
> I'm working on a clone ioctl that will quickly and efficiently duplicate
> the contents of a file, e.g.
>

Just FYI, I didn't sneak this into v0.14 because I didn't quite have the 
cycles to test it.  It'll go in this week.

-chris

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [Btrfs-devel] cloning file data
  2008-04-24 22:47 cloning file data Sage Weil
  2008-04-25 13:41 ` [Btrfs-devel] " Chris Mason
  2008-04-29 20:52 ` Chris Mason
@ 2008-05-02 20:50 ` Chris Mason
  2008-05-02 21:38   ` Sage Weil
  2 siblings, 1 reply; 32+ messages in thread
From: Chris Mason @ 2008-05-02 20:50 UTC (permalink / raw)
  To: Sage Weil; +Cc: linux-btrfs

On Thursday 24 April 2008, Sage Weil wrote:
> Hi-
>
> I'm working on a clone ioctl that will quickly and efficiently duplicate
> the contents of a file, e.g.

Sage's work has been pushed into the stable and unstable trees, along with a 
small command called bcp to trigger the clone ioctls.  bcp is used like this:

bcp src dst

If src is a directory, it is copied recursively.  If the clone ioctl fails, a 
fallback to buffer copies is done instead.

Sage, I had to make a few small changes to your ioctl code.  One was to skip 
reference count updates if the extent is a hole, and the other was to change 
around mark_inode_dirty a bit to avoid transaction deadlock.  We aren't 
actually making any pages dirty so it is safe to just update the inode.

-chris

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [Btrfs-devel] cloning file data
  2008-05-02 20:50 ` Chris Mason
@ 2008-05-02 21:38   ` Sage Weil
  0 siblings, 0 replies; 32+ messages in thread
From: Sage Weil @ 2008-05-02 21:38 UTC (permalink / raw)
  To: Chris Mason; +Cc: linux-btrfs

On Fri, 2 May 2008, Chris Mason wrote:
> Sage's work has been pushed into the stable and unstable trees, along with a 

Thanks!

For the transaction ioctls...  would it make more sense to specify a 
string of ops to apply atomically in a vector instead of exposing the raw 
transaction start/end to userspace?  Would that avoid the deadlock and 
memory issues?

sage

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [Btrfs-devel] cloning file data
  2008-04-26  4:38     ` Sage Weil
@ 2008-05-03  4:44       ` Yan Zheng
  2008-05-03  6:16         ` Sage Weil
  0 siblings, 1 reply; 32+ messages in thread
From: Yan Zheng @ 2008-05-03  4:44 UTC (permalink / raw)
  To: Sage Weil; +Cc: Chris Mason, btrfs-devel, linux-btrfs

Hello Sage,

I think the clone ioctl won't work in some corner case. The big loop
in btrfs_ioctl_clone uses path->slots[0]++ and btrfs_next_leaf to get
next item in the tree. However, this approach works only when the
layout of tree keeps unchangeed. In btrfs_ioctl_clone, both
btrfs_insert_file_extent and dup_item_to_inode may change the layout
of tree.

To be safe, I think the codes should:
 use btrfs_search_slot to find next item.
 use a intermediate buffer when coping item between two extent buffer.

Regards
YZ

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [Btrfs-devel] cloning file data
  2008-05-03  4:44       ` Yan Zheng
@ 2008-05-03  6:16         ` Sage Weil
  2008-05-03  6:48           ` Yan Zheng
  2008-05-03  7:25           ` Yan Zheng
  0 siblings, 2 replies; 32+ messages in thread
From: Sage Weil @ 2008-05-03  6:16 UTC (permalink / raw)
  To: Yan Zheng; +Cc: Chris Mason, btrfs-devel, linux-btrfs

Hi Yan-

On Sat, 3 May 2008, Yan Zheng wrote:
> I think the clone ioctl won't work in some corner case. The big loop
> in btrfs_ioctl_clone uses path->slots[0]++ and btrfs_next_leaf to get
> next item in the tree. However, this approach works only when the
> layout of tree keeps unchangeed. In btrfs_ioctl_clone, both
> btrfs_insert_file_extent and dup_item_to_inode may change the layout
> of tree.
> 
> To be safe, I think the codes should:
>  use btrfs_search_slot to find next item.
>  use a intermediate buffer when coping item between two extent buffer.

Oh, right.  I think for the item copy, though, we just need to re-search 
for the source key again after doing the insert_empty_item.  Then we can 
still use copy_extent_buffer, since at that point both paths will be 
valid?

Something like the below (untested) patch.  I suspect I didn't hit this 
because I was always cloning 'file' to something like 'file2' that always 
sorted after the src file, and didn't shift its position in the leaf. I'll 
try to test this in the next few days.

Thanks-
sage


diff -r f6ba18a50ad7 inode.c
--- a/inode.c   Fri May 02 16:13:49 2008 -0400
+++ b/inode.c   Fri May 02 23:11:49 2008 -0700
@@ -3103,25 +3103,27 @@ out:
 
 void dup_item_to_inode(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root,
-                      struct btrfs_path *path,
-                      struct extent_buffer *leaf,
-                      int slot,
-                      struct btrfs_key *key,
+                      struct btrfs_path *srcpath,
+                      struct btrfs_key *srckey,
                       u64 destino)
 {
-       struct btrfs_path *cpath = btrfs_alloc_path();
-       int len = btrfs_item_size_nr(leaf, slot);
-       int dstoff;
-       struct btrfs_key ckey = *key;
+       struct btrfs_key dstkey = *srckey;
+       struct btrfs_path *dstpath = btrfs_alloc_path();
+       int len = btrfs_item_size_nr(srcpath->nodes[0], 
+                                    srcpath->slots[0]);
        int ret;
 
-       ckey.objectid = destino;
-       ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len);
-       dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]);
-       copy_extent_buffer(cpath->nodes[0], leaf, dstoff,
-                          btrfs_item_ptr_offset(leaf, slot),
+       dstkey.objectid = destino;
+       ret = btrfs_insert_empty_item(trans, root, dstpath, &dstkey, len);
+       /* re-search for src key, in case we changed srcpath */
+       ret = btrfs_search_slot(trans, root, srckey, srcpath, 0, 0);
+       copy_extent_buffer(dstpath->nodes[0], srcpath->nodes[0], 
+                          btrfs_item_ptr_offset(dstpath->nodes[0], 
+                                                dstpath->slots[0]),
+                          btrfs_item_ptr_offset(srcpath->nodes[0],
+                                                srcpath->slots[0]),
                           len);
-       btrfs_release_path(root, cpath);
+       btrfs_release_path(root, dstpath);
 }
 
 long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
@@ -3137,7 +3139,6 @@ long btrfs_ioctl_clone(struct file *file
        struct btrfs_key key;
        struct extent_buffer *leaf;
        u32 nritems;
-       int nextret;
        int slot;
 
        src_file = fget(src_fd);
@@ -3178,6 +3179,8 @@ long btrfs_ioctl_clone(struct file *file
        while (1) {
                ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino,
                                               pos, 0);
+
+next_slot:
                if (ret < 0)
                        goto out;
                if (ret > 0) {
@@ -3187,7 +3190,7 @@ long btrfs_ioctl_clone(struct file *file
                        }
                        path->slots[0]--;
                }
-next_slot:
+
                leaf = path->nodes[0];
                slot = path->slots[0];
                btrfs_item_key_to_cpu(leaf, &key, slot);
@@ -3225,22 +3228,19 @@ next_slot:
                                }
                                pos = key.offset + len;
                        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-                               dup_item_to_inode(trans, root, path, leaf, slot,
-                                                 &key, inode->i_ino);
+                               dup_item_to_inode(trans, root, path, &key, 
+                                                 inode->i_ino);
                                pos = key.offset + btrfs_item_size_nr(leaf,
                                                                      slot);
                        }
                } else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY)
-                       dup_item_to_inode(trans, root, path, leaf, slot, &key,
+                       dup_item_to_inode(trans, root, path, &key,
                                          inode->i_ino);
 
-               if (slot >= nritems - 1) {
-                       nextret = btrfs_next_leaf(root, path);
-                       if (nextret)
-                               goto out;
-               } else {
-                       path->slots[0]++;
-               }
+               /* path may not still be valid, so explicitly search
+                * for the next key */
+               key.offset = pos;
+               ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
                goto next_slot;
        }
 


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [Btrfs-devel] cloning file data
  2008-05-03  6:16         ` Sage Weil
@ 2008-05-03  6:48           ` Yan Zheng
  2008-05-03  7:25           ` Yan Zheng
  1 sibling, 0 replies; 32+ messages in thread
From: Yan Zheng @ 2008-05-03  6:48 UTC (permalink / raw)
  To: Sage Weil; +Cc: Chris Mason, btrfs-devel, linux-btrfs

2008/5/3, Sage Weil <sage@newdream.net>:
> Hi Yan-
>
> On Sat, 3 May 2008, Yan Zheng wrote:
> > I think the clone ioctl won't work in some corner case. The big loop
> > in btrfs_ioctl_clone uses path->slots[0]++ and btrfs_next_leaf to get
> > next item in the tree. However, this approach works only when the
> > layout of tree keeps unchangeed. In btrfs_ioctl_clone, both
> > btrfs_insert_file_extent and dup_item_to_inode may change the layout
> > of tree.
> >
> > To be safe, I think the codes should:
> >  use btrfs_search_slot to find next item.
> >  use a intermediate buffer when coping item between two extent buffer.
>
> Oh, right.  I think for the item copy, though, we just need to re-search
> for the source key again after doing the insert_empty_item.  Then we can
> still use copy_extent_buffer, since at that point both paths will be
> valid?
>
> Something like the below (untested) patch.  I suspect I didn't hit this
> because I was always cloning 'file' to something like 'file2' that always
> sorted after the src file, and didn't shift its position in the leaf. I'll
> try to test this in the next few days.
>
> Thanks-
> sage
>
>
> diff -r f6ba18a50ad7 inode.c
> --- a/inode.c   Fri May 02 16:13:49 2008 -0400
> +++ b/inode.c   Fri May 02 23:11:49 2008 -0700
> @@ -3103,25 +3103,27 @@ out:
>
>  void dup_item_to_inode(struct btrfs_trans_handle *trans,
>                       struct btrfs_root *root,
> -                      struct btrfs_path *path,
> -                      struct extent_buffer *leaf,
> -                      int slot,
> -                      struct btrfs_key *key,
> +                      struct btrfs_path *srcpath,
> +                      struct btrfs_key *srckey,
>                       u64 destino)
>  {
> -       struct btrfs_path *cpath = btrfs_alloc_path();
> -       int len = btrfs_item_size_nr(leaf, slot);
> -       int dstoff;
> -       struct btrfs_key ckey = *key;
> +       struct btrfs_key dstkey = *srckey;
> +       struct btrfs_path *dstpath = btrfs_alloc_path();
> +       int len = btrfs_item_size_nr(srcpath->nodes[0],
> +                                    srcpath->slots[0]);
>        int ret;
>
> -       ckey.objectid = destino;
> -       ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len);
> -       dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]);
> -       copy_extent_buffer(cpath->nodes[0], leaf, dstoff,
> -                          btrfs_item_ptr_offset(leaf, slot),
> +       dstkey.objectid = destino;
> +       ret = btrfs_insert_empty_item(trans, root, dstpath, &dstkey, len);
> +       /* re-search for src key, in case we changed srcpath */
> +       ret = btrfs_search_slot(trans, root, srckey, srcpath, 0, 0);
> +       copy_extent_buffer(dstpath->nodes[0], srcpath->nodes[0],
> +                          btrfs_item_ptr_offset(dstpath->nodes[0],
> +                                                dstpath->slots[0]),
> +                          btrfs_item_ptr_offset(srcpath->nodes[0],
> +                                                srcpath->slots[0]),
>                           len);
> -       btrfs_release_path(root, cpath);
> +       btrfs_release_path(root, dstpath);
>  }
>
>  long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
> @@ -3137,7 +3139,6 @@ long btrfs_ioctl_clone(struct file *file
>        struct btrfs_key key;
>        struct extent_buffer *leaf;
>        u32 nritems;
> -       int nextret;
>        int slot;
>
>        src_file = fget(src_fd);
> @@ -3178,6 +3179,8 @@ long btrfs_ioctl_clone(struct file *file
>        while (1) {
>                ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino,
>                                               pos, 0);
> +
> +next_slot:
>                if (ret < 0)
>                        goto out;
>                if (ret > 0) {
> @@ -3187,7 +3190,7 @@ long btrfs_ioctl_clone(struct file *file
>                        }
>                        path->slots[0]--;
>                }
> -next_slot:
> +
>                leaf = path->nodes[0];
>                slot = path->slots[0];
>                btrfs_item_key_to_cpu(leaf, &key, slot);
> @@ -3225,22 +3228,19 @@ next_slot:
>                                }
>                                pos = key.offset + len;
>                        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
> -                               dup_item_to_inode(trans, root, path, leaf, slot,
> -                                                 &key, inode->i_ino);
> +                               dup_item_to_inode(trans, root, path, &key,
> +                                                 inode->i_ino);
>                                pos = key.offset + btrfs_item_size_nr(leaf,
>                                                                      slot);
>                        }
>                } else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY)
> -                       dup_item_to_inode(trans, root, path, leaf, slot, &key,
> +                       dup_item_to_inode(trans, root, path, &key,
>                                          inode->i_ino);
>
> -               if (slot >= nritems - 1) {
> -                       nextret = btrfs_next_leaf(root, path);
> -                       if (nextret)
> -                               goto out;
> -               } else {
> -                       path->slots[0]++;
> -               }
> +               /* path may not still be valid, so explicitly search
> +                * for the next key */
> +               key.offset = pos;
> +               ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
>                goto next_slot;
>        }

key.offset isn't updated properly. you should differentiate extent
item, inline extent item and csum item.

For extent item:
key.offset += btrfs_file_extent_num_bytes(...);

For inline extent item:
u32 len = btrfs_file_extent_inline_len(...);
key.offset += ALIGN(len, root->sectorsize);

For csum item
key.offset += btrfs_item_size_nr(...) / BTRFS_CRC32_SIZE * root->sectorsize;

Regards
YZ

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [Btrfs-devel] cloning file data
  2008-05-03  6:16         ` Sage Weil
  2008-05-03  6:48           ` Yan Zheng
@ 2008-05-03  7:25           ` Yan Zheng
  2008-05-05 10:27             ` Chris Mason
  1 sibling, 1 reply; 32+ messages in thread
From: Yan Zheng @ 2008-05-03  7:25 UTC (permalink / raw)
  To: Sage Weil; +Cc: Chris Mason, btrfs-devel, linux-btrfs

2008/5/3, Sage Weil <sage@newdream.net>:
> Hi Yan-
>
> On Sat, 3 May 2008, Yan Zheng wrote:
> > I think the clone ioctl won't work in some corner case. The big loop
> > in btrfs_ioctl_clone uses path->slots[0]++ and btrfs_next_leaf to get
> > next item in the tree. However, this approach works only when the
> > layout of tree keeps unchangeed. In btrfs_ioctl_clone, both
> > btrfs_insert_file_extent and dup_item_to_inode may change the layout
> > of tree.
> >
> > To be safe, I think the codes should:
> >  use btrfs_search_slot to find next item.
> >  use a intermediate buffer when coping item between two extent buffer.
>
> Oh, right.  I think for the item copy, though, we just need to re-search
> for the source key again after doing the insert_empty_item.  Then we can
> still use copy_extent_buffer, since at that point both paths will be
> valid?
>
> Something like the below (untested) patch.  I suspect I didn't hit this
> because I was always cloning 'file' to something like 'file2' that always
> sorted after the src file, and didn't shift its position in the leaf. I'll
> try to test this in the next few days.
>
> Thanks-
> sage
>
>
> diff -r f6ba18a50ad7 inode.c
> --- a/inode.c   Fri May 02 16:13:49 2008 -0400
> +++ b/inode.c   Fri May 02 23:11:49 2008 -0700
> @@ -3103,25 +3103,27 @@ out:
>
>  void dup_item_to_inode(struct btrfs_trans_handle *trans,
>                       struct btrfs_root *root,
> -                      struct btrfs_path *path,
> -                      struct extent_buffer *leaf,
> -                      int slot,
> -                      struct btrfs_key *key,
> +                      struct btrfs_path *srcpath,
> +                      struct btrfs_key *srckey,
>                       u64 destino)
>  {
> -       struct btrfs_path *cpath = btrfs_alloc_path();
> -       int len = btrfs_item_size_nr(leaf, slot);
> -       int dstoff;
> -       struct btrfs_key ckey = *key;
> +       struct btrfs_key dstkey = *srckey;
> +       struct btrfs_path *dstpath = btrfs_alloc_path();
> +       int len = btrfs_item_size_nr(srcpath->nodes[0],
> +                                    srcpath->slots[0]);
>        int ret;
>
> -       ckey.objectid = destino;
> -       ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len);
> -       dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]);
> -       copy_extent_buffer(cpath->nodes[0], leaf, dstoff,
> -                          btrfs_item_ptr_offset(leaf, slot),
> +       dstkey.objectid = destino;
> +       ret = btrfs_insert_empty_item(trans, root, dstpath, &dstkey, len);
> +       /* re-search for src key, in case we changed srcpath */
> +       ret = btrfs_search_slot(trans, root, srckey, srcpath, 0, 0);
> +       copy_extent_buffer(dstpath->nodes[0], srcpath->nodes[0],
> +                          btrfs_item_ptr_offset(dstpath->nodes[0],
> +                                                dstpath->slots[0]),
> +                          btrfs_item_ptr_offset(srcpath->nodes[0],
> +                                                srcpath->slots[0]),
>                           len);
> -       btrfs_release_path(root, cpath);
> +       btrfs_release_path(root, dstpath);
>  }
>
>  long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
> @@ -3137,7 +3139,6 @@ long btrfs_ioctl_clone(struct file *file
>        struct btrfs_key key;
>        struct extent_buffer *leaf;
>        u32 nritems;
> -       int nextret;
>        int slot;
>
>        src_file = fget(src_fd);
> @@ -3178,6 +3179,8 @@ long btrfs_ioctl_clone(struct file *file
>        while (1) {
>                ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino,
>                                               pos, 0);
> +
> +next_slot:
>                if (ret < 0)
>                        goto out;
>                if (ret > 0) {
> @@ -3187,7 +3190,7 @@ long btrfs_ioctl_clone(struct file *file
>                        }
>                        path->slots[0]--;
>                }
> -next_slot:
> +
>                leaf = path->nodes[0];
>                slot = path->slots[0];
>                btrfs_item_key_to_cpu(leaf, &key, slot);
> @@ -3225,22 +3228,19 @@ next_slot:
>                                }
>                                pos = key.offset + len;
>                        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
> -                               dup_item_to_inode(trans, root, path, leaf, slot,
> -                                                 &key, inode->i_ino);
> +                               dup_item_to_inode(trans, root, path, &key,
> +                                                 inode->i_ino);
>                                pos = key.offset + btrfs_item_size_nr(leaf,
>                                                                      slot);
>                        }
>                } else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY)
> -                       dup_item_to_inode(trans, root, path, leaf, slot, &key,
> +                       dup_item_to_inode(trans, root, path, &key,
>                                          inode->i_ino);
>
> -               if (slot >= nritems - 1) {
> -                       nextret = btrfs_next_leaf(root, path);
> -                       if (nextret)
> -                               goto out;
> -               } else {
> -                       path->slots[0]++;
> -               }
> +               /* path may not still be valid, so explicitly search
> +                * for the next key */
> +               key.offset = pos;
> +               ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
>                goto next_slot;
>        }
>
>

In my previous mail, I said items of different types should be
differentiated. Actually, there is no need to do that. Please consider
changing the big loop in btrfs_ioctl_clone to something like:
---
key.objectid = src->i_ino;
key.offset = 0;
key.type = BTRFS_EXTENT_DATA_KEY;
while (1) {
	ret = btrfs_search_slot(trans, root, &key, &path, 0, 0);
	if (ret < 0)
		goto fail;

	leaf = path.nodes[0];
	if (path.slots[0] >= btrfs_header_nritems(leaf)) {
		ret = btrfs_next_leaf(extent_root, &path);
		if (ret < 0)
			goto fail;
		if (ret > 0)
			break;
		leaf = path.nodes[0];
	}

	btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);

	...
	do some works
	...

	btrfs_release_path(root, path);
	key.offset++;
}

---
Regards
YZ

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [Btrfs-devel] cloning file data
  2008-05-03  7:25           ` Yan Zheng
@ 2008-05-05 10:27             ` Chris Mason
  2008-05-05 15:57               ` Sage Weil
  0 siblings, 1 reply; 32+ messages in thread
From: Chris Mason @ 2008-05-05 10:27 UTC (permalink / raw)
  To: Yan Zheng; +Cc: Sage Weil, btrfs-devel, linux-btrfs

On Saturday 03 May 2008, Yan Zheng wrote:
> 2008/5/3, Sage Weil <sage@newdream.net>:
> > Hi Yan-
> >
> > On Sat, 3 May 2008, Yan Zheng wrote:
> > > I think the clone ioctl won't work in some corner case. The big loop
> > > in btrfs_ioctl_clone uses path->slots[0]++ and btrfs_next_leaf to get
> > > next item in the tree. However, this approach works only when the
> > > layout of tree keeps unchangeed. In btrfs_ioctl_clone, both
> > > btrfs_insert_file_extent and dup_item_to_inode may change the layout
> > > of tree.
> > >
> > > To be safe, I think the codes should:
> > >  use btrfs_search_slot to find next item.
> > >  use a intermediate buffer when coping item between two extent buffer.

 [ ... ]

> In my previous mail, I said items of different types should be
> differentiated. Actually, there is no need to do that. Please consider
> changing the big loop in btrfs_ioctl_clone to something like:

Oh, nice catch Yan, thanks.  I've pushed out a new version to the unstable 
tree.  Sage, could you please give this a try too?

-chris

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [Btrfs-devel] cloning file data
  2008-05-05 10:27             ` Chris Mason
@ 2008-05-05 15:57               ` Sage Weil
  2008-05-21 17:19                 ` btrfs_put_inode Mingming
  0 siblings, 1 reply; 32+ messages in thread
From: Sage Weil @ 2008-05-05 15:57 UTC (permalink / raw)
  To: Chris Mason; +Cc: Yan Zheng, btrfs-devel, linux-btrfs

> > In my previous mail, I said items of different types should be
> > differentiated. Actually, there is no need to do that. Please consider
> > changing the big loop in btrfs_ioctl_clone to something like:
> 
> Oh, nice catch Yan, thanks.  I've pushed out a new version to the unstable 
> tree.  Sage, could you please give this a try too?

Looks good to me.  Here's a small cleanup to remove the now unnecessary 
pos.

sage


diff -r d94a17e354a8 inode.c
--- a/inode.c   Mon May 05 06:26:21 2008 -0400
+++ b/inode.c   Mon May 05 09:02:55 2008 -0700
@@ -3135,7 +3135,6 @@ long btrfs_ioctl_clone(struct file *file
        struct inode *src;
        struct btrfs_trans_handle *trans;
        int ret;
-       u64 pos;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct extent_buffer *leaf;
@@ -3183,7 +3182,6 @@ long btrfs_ioctl_clone(struct file *file
        key.offset = 0;
        key.type = BTRFS_EXTENT_DATA_KEY;
        key.objectid = src->i_ino;
-       pos = 0;
        path->reada = 2;
 
        while (1) {
@@ -3214,7 +3212,6 @@ long btrfs_ioctl_clone(struct file *file
                if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
                        struct btrfs_file_extent_item *extent;
                        int found_type;
-                       pos = key.offset;
                        extent = btrfs_item_ptr(leaf, slot,
                                                struct btrfs_file_extent_item);
                        found_type = btrfs_file_extent_type(leaf, extent);
@@ -3228,25 +3225,22 @@ long btrfs_ioctl_clone(struct file *file
                                u64 off = btrfs_file_extent_offset(leaf,
                                                                   extent);
                                btrfs_insert_file_extent(trans, root,
-                                                        inode->i_ino, pos,
+                                                        inode->i_ino, 
+                                                        key.offset,
                                                         ds, dl, len, off);
                                /* ds == 0 means there's a hole */
-                               if (ds != 0) {
+                               if (ds != 0)
                                        btrfs_inc_extent_ref(trans, root,
                                                     ds, dl,
                                                     root->root_key.objectid,
                                                     trans->transid,
-                                                    inode->i_ino, pos);
-                               }
-                               pos = key.offset + len;
+                                                    inode->i_ino, key.offset);
                        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
                                ret = dup_item_to_inode(trans, root, path,
                                                        leaf, slot, &key,
                                                        inode->i_ino);
                                if (ret)
                                        goto out;
-                               pos = key.offset + btrfs_item_size_nr(leaf,
-                                                                     slot);
                        }
                } else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
                        ret = dup_item_to_inode(trans, root, path, leaf,

^ permalink raw reply	[flat|nested] 32+ messages in thread

* btrfs_put_inode
  2008-05-05 15:57               ` Sage Weil
@ 2008-05-21 17:19                 ` Mingming
  2008-05-21 18:02                   ` btrfs_put_inode Chris Mason
  2008-05-21 18:23                   ` btrfs_put_inode Ryan Hope
  0 siblings, 2 replies; 32+ messages in thread
From: Mingming @ 2008-05-21 17:19 UTC (permalink / raw)
  To: Chris Mason; +Cc: linux-btrfs

Hi Chris,

btrfs failed to compile on 2.6.26-rc2 since the put_inode callback
function is removed from super_operations. 

I noticed btrfs_put_inode() seems being actively used, to drop the inode
from the tree for ordered mode transaction. Any idea where to relocate
this function? Do we need to call btrfs_del_ordered_inode() for every
iput()?

Thanks,
Mingming

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: btrfs_put_inode
  2008-05-21 17:19                 ` btrfs_put_inode Mingming
@ 2008-05-21 18:02                   ` Chris Mason
  2008-05-21 18:45                     ` btrfs_put_inode Mingming
  2008-05-21 18:23                   ` btrfs_put_inode Ryan Hope
  1 sibling, 1 reply; 32+ messages in thread
From: Chris Mason @ 2008-05-21 18:02 UTC (permalink / raw)
  To: Mingming; +Cc: linux-btrfs

On Wednesday 21 May 2008, Mingming wrote:
> Hi Chris,
>
> btrfs failed to compile on 2.6.26-rc2 since the put_inode callback
> function is removed from super_operations.
>
> I noticed btrfs_put_inode() seems being actively used, to drop the inode
> from the tree for ordered mode transaction. Any idea where to relocate
> this function? Do we need to call btrfs_del_ordered_inode() for every
> iput()?

The ordered inode list has incremented the inode->i_count, so we can't do the 
iput code inside clear_inode.  But, we should be able to safely move the 
checks to file_release.

This is basically an optimization so that we don't leave clean inodes hanging 
around on the data=ordered list.  It is safe to remove the put_inode call 
completely, it'll just be a little slower.

-chris

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: btrfs_put_inode
  2008-05-21 17:19                 ` btrfs_put_inode Mingming
  2008-05-21 18:02                   ` btrfs_put_inode Chris Mason
@ 2008-05-21 18:23                   ` Ryan Hope
  2008-05-21 18:32                     ` btrfs_put_inode Chris Mason
  1 sibling, 1 reply; 32+ messages in thread
From: Ryan Hope @ 2008-05-21 18:23 UTC (permalink / raw)
  To: Mingming; +Cc: Chris Mason, linux-btrfs

try this patch out,
http://zen-sources.org/files/btrfs-stable-for-2.6.26-rc3.patch

it should apply to 2.6.26-rc*... probably even 2.6.25

im btrfs with 2.6.26-rc3 with no issues

On Wed, May 21, 2008 at 1:19 PM, Mingming <cmm@us.ibm.com> wrote:
> Hi Chris,
>
> btrfs failed to compile on 2.6.26-rc2 since the put_inode callback
> function is removed from super_operations.
>
> I noticed btrfs_put_inode() seems being actively used, to drop the inode
> from the tree for ordered mode transaction. Any idea where to relocate
> this function? Do we need to call btrfs_del_ordered_inode() for every
> iput()?
>
> Thanks,
> Mingming
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: btrfs_put_inode
  2008-05-21 18:23                   ` btrfs_put_inode Ryan Hope
@ 2008-05-21 18:32                     ` Chris Mason
  2008-05-21 19:02                       ` btrfs_put_inode Mingming
  0 siblings, 1 reply; 32+ messages in thread
From: Chris Mason @ 2008-05-21 18:32 UTC (permalink / raw)
  To: Ryan Hope; +Cc: Mingming, linux-btrfs

On Wednesday 21 May 2008, Ryan Hope wrote:
> try this patch out,
> http://zen-sources.org/files/btrfs-stable-for-2.6.26-rc3.patch

It is a little hard to tell from the big long patch what the incremental 
difference is ;)

-chris

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: btrfs_put_inode
  2008-05-21 18:02                   ` btrfs_put_inode Chris Mason
@ 2008-05-21 18:45                     ` Mingming
  2008-05-21 18:52                       ` btrfs_put_inode Chris Mason
  0 siblings, 1 reply; 32+ messages in thread
From: Mingming @ 2008-05-21 18:45 UTC (permalink / raw)
  To: Chris Mason; +Cc: linux-btrfs


On Wed, 2008-05-21 at 14:02 -0400, Chris Mason wrote:
> On Wednesday 21 May 2008, Mingming wrote:
> > Hi Chris,
> >
> > btrfs failed to compile on 2.6.26-rc2 since the put_inode callback
> > function is removed from super_operations.
> >
> > I noticed btrfs_put_inode() seems being actively used, to drop the inode
> > from the tree for ordered mode transaction. Any idea where to relocate
> > this function? Do we need to call btrfs_del_ordered_inode() for every
> > iput()?
> 
> The ordered inode list has incremented the inode->i_count, so we can't do the 
> iput code inside clear_inode.  But, we should be able to safely move the 
> checks to file_release.
> 
Ah okay. file release is called on the last file_close() to this inode.
But I thought NFS could grab the inode without thr file_open/file_close,
so we might also need to delete the inode from the ordered tree at
delete_inode time, like ext3?

> This is basically an optimization so that we don't leave clean inodes hanging 
> around on the data=ordered list.  It is safe to remove the put_inode call 
> completely, it'll just be a little slower.
> 
> -chris


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: btrfs_put_inode
  2008-05-21 18:45                     ` btrfs_put_inode Mingming
@ 2008-05-21 18:52                       ` Chris Mason
  2008-05-21 22:29                         ` [RFC][PATCH]btrfs delete ordered inode handling fix Mingming
  0 siblings, 1 reply; 32+ messages in thread
From: Chris Mason @ 2008-05-21 18:52 UTC (permalink / raw)
  To: Mingming; +Cc: linux-btrfs

On Wednesday 21 May 2008, Mingming wrote:
> On Wed, 2008-05-21 at 14:02 -0400, Chris Mason wrote:
> > On Wednesday 21 May 2008, Mingming wrote:
> > > Hi Chris,
> > >
> > > btrfs failed to compile on 2.6.26-rc2 since the put_inode callback
> > > function is removed from super_operations.
> > >
> > > I noticed btrfs_put_inode() seems being actively used, to drop the
> > > inode from the tree for ordered mode transaction. Any idea where to
> > > relocate this function? Do we need to call btrfs_del_ordered_inode()
> > > for every iput()?
> >
> > The ordered inode list has incremented the inode->i_count, so we can't do
> > the iput code inside clear_inode.  But, we should be able to safely move
> > the checks to file_release.
>
> Ah okay. file release is called on the last file_close() to this inode.
> But I thought NFS could grab the inode without thr file_open/file_close,
> so we might also need to delete the inode from the ordered tree at
> delete_inode time, like ext3?

I do it right now in unlink, but delete_inode is fine too.

-chris

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: btrfs_put_inode
  2008-05-21 18:32                     ` btrfs_put_inode Chris Mason
@ 2008-05-21 19:02                       ` Mingming
  0 siblings, 0 replies; 32+ messages in thread
From: Mingming @ 2008-05-21 19:02 UTC (permalink / raw)
  To: Chris Mason; +Cc: Ryan Hope, linux-btrfs


On Wed, 2008-05-21 at 14:32 -0400, Chris Mason wrote:
> On Wednesday 21 May 2008, Ryan Hope wrote:
> > try this patch out,
> > http://zen-sources.org/files/btrfs-stable-for-2.6.26-rc3.patch
> 
> It is a little hard to tell from the big long patch what the incremental 
> difference is ;)
> 
Looks like just break the hook up in super_operations:-)
> -chris


^ permalink raw reply	[flat|nested] 32+ messages in thread

* [RFC][PATCH]btrfs delete ordered inode handling fix
  2008-05-21 18:52                       ` btrfs_put_inode Chris Mason
@ 2008-05-21 22:29                         ` Mingming
  2008-05-22 14:11                           ` Chris Mason
  0 siblings, 1 reply; 32+ messages in thread
From: Mingming @ 2008-05-21 22:29 UTC (permalink / raw)
  To: Chris Mason; +Cc: linux-btrfs

Hi Chris,  I thought I spotted a few bugs, While looking at how to
properly remove inode from ordered tree, let me know if I got it right.

* inode->i_count accounting issue
- remove extra i_count_decrease after calling
filemap_file_write_and_wait() in btrfs_write_ordered_inodes()

- Remove extra i_count decrease after each call
btrfs_del_ordered_inode(), it decrease the i_count again. I couldn't
find the where the i_count being increased. The tree_lock should
protecting it races with btrfs_write_ordered_inodes(), no? 

* There is possible race with inode delete and
btrfs_find_first_ordered_inode(). The inode could possibly in the
process of freeing while we are trying to get hold of it during commit
transaction. The fix is using igrab() instead, and search for next inode
in the tree if the found one is in the middle of being released.

* get rid of btrfs_put_inode(), and move the functionality under the
btrfs_del_ordered_inode() directly.

* Remove the inode from ordered tree at last iput(). Did not do it at
file release() time, as it may remove the inode from the ordered tree
before ensure the ordering of write to the same inode from other
process.

Perhaps calling btrfs_del_ordered_inode() under unlink() is enough, but
it would not be hurt to do it again at delete_inode() time.


Here is the patch trying to address above issues... compiled, but not
tested. Is ordered mode on by default?

Regards,
Mingming

diff -r c3290d51e5f9 inode.c
--- a/inode.c	Fri May 16 13:30:15 2008 -0400
+++ b/inode.c	Wed May 21 14:51:22 2008 -0700
@@ -857,15 +857,11 @@ static int btrfs_unlink(struct inode *di
 	nr = trans->blocks_used;
 
 	if (inode->i_nlink == 0) {
-		int found;
 		/* if the inode isn't linked anywhere,
 		 * we don't need to worry about
 		 * data=ordered
 		 */
-		found = btrfs_del_ordered_inode(inode);
-		if (found == 1) {
-			atomic_dec(&inode->i_count);
-		}
+		btrfs_del_ordered_inode(inode);
 	}
 
 	btrfs_end_transaction(trans, root);
@@ -1271,24 +1267,6 @@ fail:
 	return err;
 }
 
-void btrfs_put_inode(struct inode *inode)
-{
-	int ret;
-
-	if (!BTRFS_I(inode)->ordered_trans) {
-		return;
-	}
-
-	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) ||
-	    mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))
-		return;
-
-	ret = btrfs_del_ordered_inode(inode);
-	if (ret == 1) {
-		atomic_dec(&inode->i_count);
-	}
-}
-
 void btrfs_delete_inode(struct inode *inode)
 {
 	struct btrfs_trans_handle *trans;
@@ -1311,6 +1289,7 @@ void btrfs_delete_inode(struct inode *in
 		goto no_delete_lock;
 
 	nr = trans->blocks_used;
+	btrfs_del_ordered_inode(inode);
 	clear_inode(inode);
 
 	btrfs_end_transaction(trans, root);
diff -r c3290d51e5f9 ordered-data.c
--- a/ordered-data.c	Fri May 16 13:30:15 2008 -0400
+++ b/ordered-data.c	Wed May 21 14:55:25 2008 -0700
@@ -166,7 +166,7 @@ int btrfs_find_first_ordered_inode(struc
 				   struct inode **inode)
 {
 	struct tree_entry *entry;
-	struct rb_node *node;
+	struct rb_node *node, *next;
 
 	write_lock(&tree->lock);
 	node = tree_search(&tree->tree, *root_objectid, *objectid);
@@ -174,6 +174,7 @@ int btrfs_find_first_ordered_inode(struc
 		write_unlock(&tree->lock);
 		return 0;
 	}
+again:
 	entry = rb_entry(node, struct tree_entry, rb_node);
 
 	while(comp_entry(entry, *root_objectid, *objectid) >= 0) {
@@ -187,9 +188,20 @@ int btrfs_find_first_ordered_inode(struc
 		return 0;
 	}
 
+	*inode = igrab(entry->inode);
+	if (!*inode) {
+		next = rb_next(node);
+		rb_erase(node, &tree->tree);
+		kfree(entry);
+		if (!next) {
+			write_unlock(&tree->lock);
+			return 0;
+		}
+		node = next;
+		goto again;
+	}
+
 	*root_objectid = entry->root_objectid;
-	*inode = entry->inode;
-	atomic_inc(&entry->inode->i_count);
 	*objectid = entry->objectid;
 	write_unlock(&tree->lock);
 	return 1;
@@ -200,7 +212,7 @@ int btrfs_find_del_first_ordered_inode(s
 				       struct inode **inode)
 {
 	struct tree_entry *entry;
-	struct rb_node *node;
+	struct rb_node *node, *next;
 
 	write_lock(&tree->lock);
 	node = tree_search(&tree->tree, *root_objectid, *objectid);
@@ -208,7 +220,7 @@ int btrfs_find_del_first_ordered_inode(s
 		write_unlock(&tree->lock);
 		return 0;
 	}
-
+retry:
 	entry = rb_entry(node, struct tree_entry, rb_node);
 	while(comp_entry(entry, *root_objectid, *objectid) >= 0) {
 		node = rb_next(node);
@@ -220,11 +232,21 @@ int btrfs_find_del_first_ordered_inode(s
 		write_unlock(&tree->lock);
 		return 0;
 	}
+	*inode = igrab(entry->inode);
+	if (!*inode) {
+		next = rb_next(node);
+		rb_erase(node, &tree->tree);
+		kfree(entry);
+		if (!next) {
+			write_unlock(&tree->lock);
+			return 0;
+		}
+		node = next;
+		goto retry;
+	}
 
 	*root_objectid = entry->root_objectid;
 	*objectid = entry->objectid;
-	*inode = entry->inode;
-	atomic_inc(&entry->inode->i_count);
 	rb_erase(node, &tree->tree);
 	write_unlock(&tree->lock);
 	kfree(entry);
@@ -253,11 +275,19 @@ static int __btrfs_del_ordered_inode(str
 	return 1;
 }
 
-int btrfs_del_ordered_inode(struct inode *inode)
+void btrfs_del_ordered_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 root_objectid = root->root_key.objectid;
 	int ret = 0;
+
+	if (!BTRFS_I(inode)->ordered_trans) {
+		return;
+	}
+
+	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) ||
+		mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))
+		return;
 
 	spin_lock(&root->fs_info->new_trans_lock);
 	if (root->fs_info->running_transaction) {
@@ -267,7 +297,7 @@ int btrfs_del_ordered_inode(struct inode
 						inode->i_ino);
 	}
 	spin_unlock(&root->fs_info->new_trans_lock);
-	return ret;
+	return;
 }
 
 int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode)
diff -r c3290d51e5f9 ordered-data.h
--- a/ordered-data.h	Fri May 16 13:30:15 2008 -0400
+++ b/ordered-data.h	Wed May 21 14:56:49 2008 -0700
@@ -38,6 +38,6 @@ int btrfs_find_first_ordered_inode(struc
 int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 				       u64 *root_objectid, u64 *objectid,
 				       struct inode **inode);
-int btrfs_del_ordered_inode(struct inode *inode);
+void btrfs_del_ordered_inode(struct inode *inode);
 int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode);
 #endif
diff -r c3290d51e5f9 super.c
--- a/super.c	Fri May 16 13:30:15 2008 -0400
+++ b/super.c	Wed May 21 12:54:48 2008 -0700
@@ -487,7 +487,6 @@ static void btrfs_unlockfs(struct super_
 
 static struct super_operations btrfs_super_ops = {
 	.delete_inode	= btrfs_delete_inode,
-	.put_inode	= btrfs_put_inode,
 	.put_super	= btrfs_put_super,
 	.write_super	= btrfs_write_super,
 	.sync_fs	= btrfs_sync_fs,
diff -r c3290d51e5f9 transaction.c
--- a/transaction.c	Fri May 16 13:30:15 2008 -0400
+++ b/transaction.c	Wed May 21 14:18:34 2008 -0700
@@ -538,7 +538,6 @@ int btrfs_write_ordered_inodes(struct bt
 			filemap_write_and_wait(inode->i_mapping);
 			atomic_dec(&BTRFS_I(inode)->ordered_writeback);
 		}
-		atomic_dec(&inode->i_count);
 		iput(inode);
 
 		mutex_lock(&root->fs_info->fs_mutex);



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH]btrfs delete ordered inode handling fix
  2008-05-21 22:29                         ` [RFC][PATCH]btrfs delete ordered inode handling fix Mingming
@ 2008-05-22 14:11                           ` Chris Mason
  2008-05-22 17:43                             ` Mingming
  0 siblings, 1 reply; 32+ messages in thread
From: Chris Mason @ 2008-05-22 14:11 UTC (permalink / raw)
  To: Mingming; +Cc: linux-btrfs

On Wednesday 21 May 2008, Mingming wrote:
> Hi Chris,  I thought I spotted a few bugs, While looking at how to
> properly remove inode from ordered tree, let me know if I got it right.

Hi Mingming, thanks for going through this code.  The i_count rules in the 
current code should work like this:

* btrfs_add_ordered_inode calls igrab when the inode is inserted into the 
list.  The whole time the inode is on the list, there's an extra reference on 
i_count.  There will be no final iput while the inode is on the list.

* btrfs_find_first_ordered_inode and btrfs_find_del_ordered_inode both 
increment i_count before returning the inode.  This gives the caller an extra 
reference on i_count in addition to the one held by the list.

* ordered-data.c never drops i_count.  This allows (and forces) the caller to 
decide when it is safe to call iput because callers have different locks held 
and because the last iput might can in theory trigger delete_inode.

All of the current callers of btrfs_del_ordered_inode already have a i_count 
increased before they call it.  So, they can safely just do an atomic_dec on 
i_count and let their own iputs later do the delete_inode.

>
> * inode->i_count accounting issue
> - remove extra i_count_decrease after calling
> filemap_file_write_and_wait() in btrfs_write_ordered_inodes()

There's an atomic_dec of i_count to drop the reference from 
btrfs_find_del_first_ordered_inode, and an iput to drop the reference added 
by igrab when the inode was put on the list.

>
> - Remove extra i_count decrease after each call
> btrfs_del_ordered_inode(), it decrease the i_count again. I couldn't
> find the where the i_count being increased. The tree_lock should
> protecting it races with btrfs_write_ordered_inodes(), no?

The increase comes from igrab on list insertion

>
> * There is possible race with inode delete and
> btrfs_find_first_ordered_inode(). The inode could possibly in the
> process of freeing while we are trying to get hold of it during commit
> transaction. The fix is using igrab() instead, and search for next inode
> in the tree if the found one is in the middle of being released.

These kinds of races where the main reason why I had the list take a reference 
on the inode.  delete_inode won't be called while i_count is increased.

Over the long term I'd prefer to move the ordered-data list to a model where 
the list doesn't have a reference and it is magically removed after all the 
dirty pages are gone (by the end_io_hook handlers in inode.c).  The end_io 
hooks in inode.c may be sufficient for this.

>
> * get rid of btrfs_put_inode(), and move the functionality under the
> btrfs_del_ordered_inode() directly.

I like this change, thanks.

>
> * Remove the inode from ordered tree at last iput(). Did not do it at
> file release() time, as it may remove the inode from the ordered tree
> before ensure the ordering of write to the same inode from other
> process.
>
> Perhaps calling btrfs_del_ordered_inode() under unlink() is enough, but
> it would not be hurt to do it again at delete_inode() time.

I'm afraid we'll have to do it at file_release time, at least until the 
ordered list is changed not to keep a reference.

>
>
> Here is the patch trying to address above issues... compiled, but not
> tested. Is ordered mode on by default?
>

Yes, there's actually no way to turn it off ;)

-chris

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH]btrfs delete ordered inode handling fix
  2008-05-22 14:11                           ` Chris Mason
@ 2008-05-22 17:43                             ` Mingming
  2008-05-22 17:47                               ` Chris Mason
  0 siblings, 1 reply; 32+ messages in thread
From: Mingming @ 2008-05-22 17:43 UTC (permalink / raw)
  To: Chris Mason; +Cc: linux-btrfs

On Thu, 2008-05-22 at 10:11 -0400, Chris Mason wrote:
> On Wednesday 21 May 2008, Mingming wrote:
> > Hi Chris,  I thought I spotted a few bugs, While looking at how to
> > properly remove inode from ordered tree, let me know if I got it right.
> 
> Hi Mingming, thanks for going through this code.  The i_count rules in the 
> current code should work like this:
> 

Hi Chris, thanks for your detailed clarification.

> * btrfs_add_ordered_inode calls igrab when the inode is inserted into the 
> list.  The whole time the inode is on the list, there's an extra reference on 
> i_count.  There will be no final iput while the inode is on the list.
> 

Ah I missed that. That explains all my confusion of the i_count accounting.

> >
> > * There is possible race with inode delete and
> > btrfs_find_first_ordered_inode(). The inode could possibly in the
> > process of freeing while we are trying to get hold of it during commit
> > transaction. The fix is using igrab() instead, and search for next inode
> > in the tree if the found one is in the middle of being released.
> 
> These kinds of races where the main reason why I had the list take a reference 
> on the inode.  delete_inode won't be called while i_count is increased.
> 
> Over the long term I'd prefer to move the ordered-data list to a model where 
> the list doesn't have a reference and it is magically removed after all the 
> dirty pages are gone (by the end_io_hook handlers in inode.c).  The end_io 
> hooks in inode.c may be sufficient for this.
> 
Make sense.
> >
> > * get rid of btrfs_put_inode(), and move the functionality under the
> > btrfs_del_ordered_inode() directly.
> 
> I like this change, thanks.
> 
> >
> > * Remove the inode from ordered tree at last iput(). Did not do it at
> > file release() time, as it may remove the inode from the ordered tree
> > before ensure the ordering of write to the same inode from other
> > process.
> >
> > Perhaps calling btrfs_del_ordered_inode() under unlink() is enough, but
> > it would not be hurt to do it again at delete_inode() time.
> 
> I'm afraid we'll have to do it at file_release time, at least until the 
> ordered list is changed not to keep a reference.
> 
Yes with the i_count logic delete_inode() is not the right place to call
btrfs_del_ordered_inode.

But I am still not quite sure whether it is safe to remove the inode
from the ordered tree at the file_release() time. i.e. whether the dirty
data already being flushed to disk at last file_close()/file_release()
time and when two process open and write to the same inode ...

Regards,
Mingming


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH]btrfs delete ordered inode handling fix
  2008-05-22 17:43                             ` Mingming
@ 2008-05-22 17:47                               ` Chris Mason
  2008-05-22 20:39                                 ` Mingming
  0 siblings, 1 reply; 32+ messages in thread
From: Chris Mason @ 2008-05-22 17:47 UTC (permalink / raw)
  To: Mingming; +Cc: linux-btrfs

On Thursday 22 May 2008, Mingming wrote:
> On Thu, 2008-05-22 at 10:11 -0400, Chris Mason wrote:
> > On Wednesday 21 May 2008, Mingming wrote:
> > > Hi Chris,  I thought I spotted a few bugs, While looking at how to
> > > properly remove inode from ordered tree, let me know if I got it right.
> >
> > Hi Mingming, thanks for going through this code.  The i_count rules in
> > the current code should work like this:
>
> Hi Chris, thanks for your detailed clarification.
>
> > * btrfs_add_ordered_inode calls igrab when the inode is inserted into the
> > list.  The whole time the inode is on the list, there's an extra
> > reference on i_count.  There will be no final iput while the inode is on
> > the list.
>
> Ah I missed that. That explains all my confusion of the i_count accounting.
>
> > > * There is possible race with inode delete and
> > > btrfs_find_first_ordered_inode(). The inode could possibly in the
> > > process of freeing while we are trying to get hold of it during commit
> > > transaction. The fix is using igrab() instead, and search for next
> > > inode in the tree if the found one is in the middle of being released.
> >
> > These kinds of races where the main reason why I had the list take a
> > reference on the inode.  delete_inode won't be called while i_count is
> > increased.
> >
> > Over the long term I'd prefer to move the ordered-data list to a model
> > where the list doesn't have a reference and it is magically removed after
> > all the dirty pages are gone (by the end_io_hook handlers in inode.c). 
> > The end_io hooks in inode.c may be sufficient for this.
>
> Make sense.
>
> > > * get rid of btrfs_put_inode(), and move the functionality under the
> > > btrfs_del_ordered_inode() directly.
> >
> > I like this change, thanks.
> >
> > > * Remove the inode from ordered tree at last iput(). Did not do it at
> > > file release() time, as it may remove the inode from the ordered tree
> > > before ensure the ordering of write to the same inode from other
> > > process.
> > >
> > > Perhaps calling btrfs_del_ordered_inode() under unlink() is enough, but
> > > it would not be hurt to do it again at delete_inode() time.
> >
> > I'm afraid we'll have to do it at file_release time, at least until the
> > ordered list is changed not to keep a reference.
>
> Yes with the i_count logic delete_inode() is not the right place to call
> btrfs_del_ordered_inode.
>
> But I am still not quite sure whether it is safe to remove the inode
> from the ordered tree at the file_release() time. i.e. whether the dirty
> data already being flushed to disk at last file_close()/file_release()
> time and when two process open and write to the same inode ...

I get around this by testing for dirty/writeback pages before removing the 
inode from the ordered list.  If another writer allocates blocks to the file, 
it will be added back to the list.

-chris

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH]btrfs delete ordered inode handling fix
  2008-05-22 17:47                               ` Chris Mason
@ 2008-05-22 20:39                                 ` Mingming
  2008-05-22 22:23                                   ` Chris Mason
  0 siblings, 1 reply; 32+ messages in thread
From: Mingming @ 2008-05-22 20:39 UTC (permalink / raw)
  To: Chris Mason; +Cc: linux-btrfs


On Thu, 2008-05-22 at 13:47 -0400, Chris Mason wrote:
> On Thursday 22 May 2008, Mingming wrote:
> > On Thu, 2008-05-22 at 10:11 -0400, Chris Mason wrote:
> > > On Wednesday 21 May 2008, Mingming wrote:
> > > > Hi Chris,  I thought I spotted a few bugs, While looking at how to
> > > > properly remove inode from ordered tree, let me know if I got it right.
> > >
> > > Hi Mingming, thanks for going through this code.  The i_count rules in
> > > the current code should work like this:
> >
> > Hi Chris, thanks for your detailed clarification.
> >
> > > * btrfs_add_ordered_inode calls igrab when the inode is inserted into the
> > > list.  The whole time the inode is on the list, there's an extra
> > > reference on i_count.  There will be no final iput while the inode is on
> > > the list.
> >
> > Ah I missed that. That explains all my confusion of the i_count accounting.
> >
> > > > * There is possible race with inode delete and
> > > > btrfs_find_first_ordered_inode(). The inode could possibly in the
> > > > process of freeing while we are trying to get hold of it during commit
> > > > transaction. The fix is using igrab() instead, and search for next
> > > > inode in the tree if the found one is in the middle of being released.
> > >
> > > These kinds of races where the main reason why I had the list take a
> > > reference on the inode.  delete_inode won't be called while i_count is
> > > increased.
> > >
> > > Over the long term I'd prefer to move the ordered-data list to a model
> > > where the list doesn't have a reference and it is magically removed after
> > > all the dirty pages are gone (by the end_io_hook handlers in inode.c). 
> > > The end_io hooks in inode.c may be sufficient for this.
> >
> > Make sense.
> >
> > > > * get rid of btrfs_put_inode(), and move the functionality under the
> > > > btrfs_del_ordered_inode() directly.
> > >
> > > I like this change, thanks.
> > >
> > > > * Remove the inode from ordered tree at last iput(). Did not do it at
> > > > file release() time, as it may remove the inode from the ordered tree
> > > > before ensure the ordering of write to the same inode from other
> > > > process.
> > > >
> > > > Perhaps calling btrfs_del_ordered_inode() under unlink() is enough, but
> > > > it would not be hurt to do it again at delete_inode() time.
> > >
> > > I'm afraid we'll have to do it at file_release time, at least until the
> > > ordered list is changed not to keep a reference.
> >
> > Yes with the i_count logic delete_inode() is not the right place to call
> > btrfs_del_ordered_inode.
> >
> > But I am still not quite sure whether it is safe to remove the inode
> > from the ordered tree at the file_release() time. i.e. whether the dirty
> > data already being flushed to disk at last file_close()/file_release()
> > time and when two process open and write to the same inode ...
> 
> I get around this by testing for dirty/writeback pages before removing the 
> inode from the ordered list.  If another writer allocates blocks to the file, 
> it will be added back to the list.
> 

I see.:) How about patch below?

Mingming

diff -r c3290d51e5f9 file.c
--- a/file.c	Fri May 16 13:30:15 2008 -0400
+++ b/file.c	Thu May 22 13:29:42 2008 -0700
@@ -978,6 +978,12 @@ out_nolock:
 	return num_written ? num_written : err;
 }
 
+static int btrfs_release_file (struct inode * inode, struct file * filp)
+{
+	btrfs_del_ordered_inode(inode);
+	return 0;
+}
+
 static int btrfs_sync_file(struct file *file,
 			   struct dentry *dentry, int datasync)
 {
@@ -1044,6 +1050,7 @@ struct file_operations btrfs_file_operat
 	.write		= btrfs_file_write,
 	.mmap		= btrfs_file_mmap,
 	.open		= generic_file_open,
+	.release	= btrfs_release_file,
 	.fsync		= btrfs_sync_file,
 	.unlocked_ioctl	= btrfs_ioctl,
 #ifdef CONFIG_COMPAT
diff -r c3290d51e5f9 inode.c
--- a/inode.c	Fri May 16 13:30:15 2008 -0400
+++ b/inode.c	Thu May 22 13:31:14 2008 -0700
@@ -857,15 +857,11 @@ static int btrfs_unlink(struct inode *di
 	nr = trans->blocks_used;
 
 	if (inode->i_nlink == 0) {
-		int found;
 		/* if the inode isn't linked anywhere,
 		 * we don't need to worry about
 		 * data=ordered
 		 */
-		found = btrfs_del_ordered_inode(inode);
-		if (found == 1) {
-			atomic_dec(&inode->i_count);
-		}
+		btrfs_del_ordered_inode(inode);
 	}
 
 	btrfs_end_transaction(trans, root);
@@ -1271,24 +1267,6 @@ fail:
 	return err;
 }
 
-void btrfs_put_inode(struct inode *inode)
-{
-	int ret;
-
-	if (!BTRFS_I(inode)->ordered_trans) {
-		return;
-	}
-
-	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) ||
-	    mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))
-		return;
-
-	ret = btrfs_del_ordered_inode(inode);
-	if (ret == 1) {
-		atomic_dec(&inode->i_count);
-	}
-}
-
 void btrfs_delete_inode(struct inode *inode)
 {
 	struct btrfs_trans_handle *trans;
diff -r c3290d51e5f9 ordered-data.c
--- a/ordered-data.c	Fri May 16 13:30:15 2008 -0400
+++ b/ordered-data.c	Thu May 22 13:24:56 2008 -0700
@@ -231,7 +231,7 @@ int btrfs_find_del_first_ordered_inode(s
 	return 1;
 }
 
-static int __btrfs_del_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+static void __btrfs_del_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 				     struct inode *inode,
 				     u64 root_objectid, u64 objectid)
 {
@@ -243,31 +243,38 @@ static int __btrfs_del_ordered_inode(str
 	node = __tree_search(&tree->tree, root_objectid, objectid, &prev);
 	if (!node) {
 		write_unlock(&tree->lock);
-		return 0;
+		return;
 	}
 	rb_erase(node, &tree->tree);
 	BTRFS_I(inode)->ordered_trans = 0;
 	write_unlock(&tree->lock);
+	atomic_dec(&inode->i_count);
 	entry = rb_entry(node, struct tree_entry, rb_node);
 	kfree(entry);
-	return 1;
-}
-
-int btrfs_del_ordered_inode(struct inode *inode)
+	return;
+}
+
+void btrfs_del_ordered_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 root_objectid = root->root_key.objectid;
-	int ret = 0;
+
+	if (!BTRFS_I(inode)->ordered_trans) {
+		return;
+	}
+
+	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) ||
+	    mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))
+		return;
 
 	spin_lock(&root->fs_info->new_trans_lock);
 	if (root->fs_info->running_transaction) {
 		struct btrfs_ordered_inode_tree *tree;
 		tree = &root->fs_info->running_transaction->ordered_inode_tree;
-		ret = __btrfs_del_ordered_inode(tree, inode, root_objectid,
+		 __btrfs_del_ordered_inode(tree, inode, root_objectid,
 						inode->i_ino);
 	}
 	spin_unlock(&root->fs_info->new_trans_lock);
-	return ret;
 }
 
 int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode)
diff -r c3290d51e5f9 ordered-data.h
--- a/ordered-data.h	Fri May 16 13:30:15 2008 -0400
+++ b/ordered-data.h	Thu May 22 13:25:09 2008 -0700
@@ -38,6 +38,6 @@ int btrfs_find_first_ordered_inode(struc
 int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 				       u64 *root_objectid, u64 *objectid,
 				       struct inode **inode);
-int btrfs_del_ordered_inode(struct inode *inode);
+void btrfs_del_ordered_inode(struct inode *inode);
 int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode);
 #endif
diff -r c3290d51e5f9 super.c
--- a/super.c	Fri May 16 13:30:15 2008 -0400
+++ b/super.c	Thu May 22 13:30:44 2008 -0700
@@ -487,7 +487,6 @@ static void btrfs_unlockfs(struct super_
 
 static struct super_operations btrfs_super_ops = {
 	.delete_inode	= btrfs_delete_inode,
-	.put_inode	= btrfs_put_inode,
 	.put_super	= btrfs_put_super,
 	.write_super	= btrfs_write_super,
 	.sync_fs	= btrfs_sync_fs,



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH]btrfs delete ordered inode handling fix
  2008-05-22 20:39                                 ` Mingming
@ 2008-05-22 22:23                                   ` Chris Mason
  0 siblings, 0 replies; 32+ messages in thread
From: Chris Mason @ 2008-05-22 22:23 UTC (permalink / raw)
  To: Mingming; +Cc: linux-btrfs

On Thursday 22 May 2008, Mingming wrote:

[ ... ]

> > I get around this by testing for dirty/writeback pages before removing
> > the inode from the ordered list.  If another writer allocates blocks to
> > the file, it will be added back to the list.
>
> I see.:) How about patch below?

Looks good, I'll give it a shot here.

-chris

^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2008-05-22 22:23 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-04-24 22:47 cloning file data Sage Weil
2008-04-25 13:41 ` [Btrfs-devel] " Chris Mason
2008-04-25 16:50   ` Zach Brown
2008-04-25 16:58     ` Chris Mason
2008-04-25 17:04       ` Zach Brown
2008-04-25 16:50   ` Zach Brown
2008-04-25 18:32     ` Sage Weil
2008-04-25 18:26   ` Sage Weil
2008-04-26  4:38     ` Sage Weil
2008-05-03  4:44       ` Yan Zheng
2008-05-03  6:16         ` Sage Weil
2008-05-03  6:48           ` Yan Zheng
2008-05-03  7:25           ` Yan Zheng
2008-05-05 10:27             ` Chris Mason
2008-05-05 15:57               ` Sage Weil
2008-05-21 17:19                 ` btrfs_put_inode Mingming
2008-05-21 18:02                   ` btrfs_put_inode Chris Mason
2008-05-21 18:45                     ` btrfs_put_inode Mingming
2008-05-21 18:52                       ` btrfs_put_inode Chris Mason
2008-05-21 22:29                         ` [RFC][PATCH]btrfs delete ordered inode handling fix Mingming
2008-05-22 14:11                           ` Chris Mason
2008-05-22 17:43                             ` Mingming
2008-05-22 17:47                               ` Chris Mason
2008-05-22 20:39                                 ` Mingming
2008-05-22 22:23                                   ` Chris Mason
2008-05-21 18:23                   ` btrfs_put_inode Ryan Hope
2008-05-21 18:32                     ` btrfs_put_inode Chris Mason
2008-05-21 19:02                       ` btrfs_put_inode Mingming
2008-04-25 20:28   ` [Btrfs-devel] cloning file data Sage Weil
2008-04-29 20:52 ` Chris Mason
2008-05-02 20:50 ` Chris Mason
2008-05-02 21:38   ` Sage Weil

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).