* cloning file data
@ 2008-04-24 22:47 Sage Weil
2008-04-25 13:41 ` [Btrfs-devel] " Chris Mason
` (2 more replies)
0 siblings, 3 replies; 32+ messages in thread
From: Sage Weil @ 2008-04-24 22:47 UTC (permalink / raw)
To: linux-btrfs, btrfs-devel
Hi-
I'm working on a clone ioctl that will quickly and efficiently duplicate
the contents of a file, e.g.
int main(int argc, const char **argv)
{
int in = open(argv[1], O_RDONLY);
int out = open(argv[2], O_CREAT|O_TRUNC|O_WRONLY, 0644);
ioctl(out, BTRFS_IOC_CLONE, in);
close(in);
close(out);
return 0;
}
I've probably got the locking order a bit wrong, lots of error handling is
missing, and I suspect there's a cleaner way to do the target inode size
update, but it behaves well enough in my (limited :) testing.
Oh, and I wasn't certain the 'offset' in file_extent_item could be safely
ignored when duplicating the extent reference. My assumption was that it
is orthogonal to extent allocation and isn't related to the backref.
However, btrfs_insert_file_extent() always set offset=0. I'm guessing I
need to add an argument there and fix up the other callers?
Anyway, any comments or suggestions (on the interface or implemantation)
are welcome.. :)
sage
diff -r 1791a620d509 inode.c
--- a/inode.c Thu Apr 24 13:43:27 2008 -0700
+++ b/inode.c Thu Apr 24 15:10:17 2008 -0700
@@ -18,6 +18,7 @@
#include <linux/bio.h>
#include <linux/buffer_head.h>
+#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
@@ -2726,6 +2727,158 @@ long btrfs_ioctl_trans_end(struct file *
return 0;
}
+void dup_item_to_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *leaf,
+ int slot,
+ struct btrfs_key *key,
+ u64 destino)
+{
+ struct btrfs_path *cpath = btrfs_alloc_path();
+ int len = btrfs_item_size_nr(leaf, slot);
+ int dstoff;
+ struct btrfs_key ckey = *key;
+ int ret;
+
+ ckey.objectid = destino;
+ ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len);
+ dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]);
+ copy_extent_buffer(cpath->nodes[0], leaf, dstoff,
+ btrfs_item_ptr_offset(leaf, slot),
+ len);
+ btrfs_release_path(root, cpath);
+}
+
+long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
+{
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct file *src_file;
+ struct inode *src;
+ struct btrfs_trans_handle *trans;
+ int ret;
+ u64 pos;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ u32 nritems;
+ int nextret;
+ int slot;
+
+ src_file = fget(src_fd);
+ if (!src_file)
+ return -EBADF;
+ src = src_file->f_dentry->d_inode;
+
+ ret = -EXDEV;
+ if (src->i_sb != inode->i_sb)
+ goto out_fput;
+
+ if (inode < src) {
+ mutex_lock(&inode->i_mutex);
+ mutex_lock(&src->i_mutex);
+ } else {
+ mutex_lock(&src->i_mutex);
+ mutex_lock(&inode->i_mutex);
+ }
+
+ ret = -ENOTEMPTY;
+ if (inode->i_size)
+ goto out_unlock;
+
+ /* do any pending delalloc/csum calc on src, one way or another */
+ filemap_write_and_wait(src->i_mapping);
+
+ mutex_lock(&root->fs_info->fs_mutex);
+ trans = btrfs_start_transaction(root, 0);
+ path = btrfs_alloc_path();
+ pos = 0;
+ while (1) {
+ ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino,
+ pos, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ if (path->slots[0] == 0) {
+ ret = 0;
+ goto out;
+ }
+ path->slots[0]--;
+ }
+ next_slot:
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ printk("key(%llu %x %llu)\n",
+ key.objectid, key.type, key.offset);
+ if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY ||
+ key.objectid != src->i_ino)
+ goto out;
+ if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+ struct btrfs_file_extent_item *extent;
+ int found_type;
+ u64 len;
+ pos = key.offset;
+ extent = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(leaf, extent);
+ len = btrfs_file_extent_num_bytes(leaf, extent);
+ if (found_type == BTRFS_FILE_EXTENT_REG) {
+ u64 ds = btrfs_file_extent_disk_bytenr(leaf,
+ extent);
+ u64 dl = btrfs_file_extent_disk_num_bytes(leaf,
+ extent);
+ printk(" %llu~%llu disk %llu~%llu off %llu\n",
+ pos, len, ds, dl,
+ btrfs_file_extent_offset(leaf, extent));
+ btrfs_insert_file_extent(trans, root,
+ inode->i_ino, pos,
+ ds, dl, len);
+ btrfs_inc_extent_ref(trans, root, ds, dl,
+ root->root_key.objectid,
+ trans->transid,
+ inode->i_ino, pos);
+ } else if (found_type == BTRFS_FILE_EXTENT_INLINE)
+ dup_item_to_inode(trans, root, path, leaf, slot,
+ &key, inode->i_ino);
+ pos = key.offset + len;
+ } else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY)
+ dup_item_to_inode(trans, root, path, leaf, slot, &key,
+ inode->i_ino);
+
+ nritems = btrfs_header_nritems(leaf);
+ if (slot >= nritems - 1) {
+ nextret = btrfs_next_leaf(root, path);
+ if (nextret)
+ goto out;
+ } else {
+ path->slots[0]++;
+ }
+ goto next_slot;
+ }
+
+out:
+ ret = 0;
+ mutex_unlock(&root->fs_info->fs_mutex);
+
+ i_size_write(inode, src->i_size);
+ inode->i_blocks = src->i_blocks;
+ mark_inode_dirty(inode);
+
+ mutex_lock(&root->fs_info->fs_mutex);
+ btrfs_end_transaction(trans, root);
+ mutex_unlock(&root->fs_info->fs_mutex);
+
+out_unlock:
+ mutex_unlock(&src->i_mutex);
+ mutex_unlock(&inode->i_mutex);
+out_fput:
+ fput(src_file);
+ return ret;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -2744,6 +2897,9 @@ long btrfs_ioctl(struct file *file, unsi
return btrfs_ioctl_trans_end(file);
case BTRFS_IOC_SYNC:
btrfs_sync_fs(file->f_dentry->d_sb, 1);
+ return 0;
+ case BTRFS_IOC_CLONE:
+ btrfs_ioctl_clone(file, arg);
return 0;
}
diff -r 1791a620d509 ioctl.h
--- a/ioctl.h Thu Apr 24 13:43:27 2008 -0700
+++ b/ioctl.h Thu Apr 24 15:10:17 2008 -0700
@@ -36,5 +36,6 @@ struct btrfs_ioctl_vol_args {
#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
+#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
#endif
^ permalink raw reply [flat|nested] 32+ messages in thread* Re: [Btrfs-devel] cloning file data 2008-04-24 22:47 cloning file data Sage Weil @ 2008-04-25 13:41 ` Chris Mason 2008-04-25 16:50 ` Zach Brown ` (3 more replies) 2008-04-29 20:52 ` Chris Mason 2008-05-02 20:50 ` Chris Mason 2 siblings, 4 replies; 32+ messages in thread From: Chris Mason @ 2008-04-25 13:41 UTC (permalink / raw) To: btrfs-devel; +Cc: Sage Weil, linux-btrfs On Thursday 24 April 2008, Sage Weil wrote: > Hi- > > I'm working on a clone ioctl that will quickly and efficiently duplicate > the contents of a file, e.g. Very cool. I'd actually loved to see this wrapped into a program that will cow a directory tree. Basically the same as cp -al, but with cow instead of linking. > > int main(int argc, const char **argv) > { > int in = open(argv[1], O_RDONLY); > int out = open(argv[2], O_CREAT|O_TRUNC|O_WRONLY, 0644); > ioctl(out, BTRFS_IOC_CLONE, in); > close(in); > close(out); > return 0; > } > > I've probably got the locking order a bit wrong, lots of error handling is > missing, and I suspect there's a cleaner way to do the target inode size > update, but it behaves well enough in my (limited :) testing. > > Oh, and I wasn't certain the 'offset' in file_extent_item could be safely > ignored when duplicating the extent reference. My assumption was that it > is orthogonal to extent allocation and isn't related to the backref. > However, btrfs_insert_file_extent() always set offset=0. I'm guessing I > need to add an argument there and fix up the other callers? > Yes, you need to preserve the offset. There's only one place right now that sets a non-zero offset and it inserts the extent by hand for other reasons (if you're brave, file.c:btrfs_drop_extents) The reason file extents have an offset field is to allow COW without read/modify/write. Picture something like this: # create a single 100MB extent in file foo dd if=/dev/zero of=foo bs=1M count=100 sync # write into the middle dd if=/dev/zero of=foo bs=4k count=1 seek=100 conv=notrunc sync We've written into the middle of that 100MB extent, and we need to do COW. One option is to read the whole thing, change 4k and write it all back. Instead, btrfs does something like this (+/- off by need more coffee errors): file pos = 0 -> [ old extent, offset = 0, num_bytes = 400k ] file pos = 409600 -> [ new 4k extent, offset = 0, num_bytes = 4k ] file pos = 413696 -> [ old extent, offset = 413696, num_bytes = 100MB - 404k] An extra reference is taken on the old extent to reflect that we're pointing to it twice. > Anyway, any comments or suggestions (on the interface or implemantation) > are welcome.. :) > By taking the inode mutex, you protect against file_write and truncates changing the file. But, we also need to prevent mmaps from changing the file pages as well. What you want to do lock all the file bytes in the extent tree: lock_extent(&BTRFS_I(src_inode)->io_tree, 0, (u64)-1, GFP_NOFS); But unfortunately, the code to fill delayed allocation takes that same lock. So you need to loop a bit: while(1) { filemap_write_and_wait(src_inode); lock_extent() if (BTRFS_I(src_inode)->delalloc_bytes == 0) break; unlock_extent() } That should keep you from racing with btrfs_page_mkwrite() -chris ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [Btrfs-devel] cloning file data 2008-04-25 13:41 ` [Btrfs-devel] " Chris Mason @ 2008-04-25 16:50 ` Zach Brown 2008-04-25 16:58 ` Chris Mason 2008-04-25 16:50 ` Zach Brown ` (2 subsequent siblings) 3 siblings, 1 reply; 32+ messages in thread From: Zach Brown @ 2008-04-25 16:50 UTC (permalink / raw) To: Sage Weil; +Cc: Chris Mason, btrfs-devel, linux-btrfs > We've written into the middle of that 100MB extent, and we need to do COW. > One option is to read the whole thing, change 4k and write it all back. > Instead, btrfs does something like this (+/- off by need more coffee errors): > > file pos = 0 -> [ old extent, offset = 0, num_bytes = 400k ] > file pos = 409600 -> [ new 4k extent, offset = 0, num_bytes = 4k ] > file pos = 413696 -> [ old extent, offset = 413696, num_bytes = 100MB - 404k] > > An extra reference is taken on the old extent to reflect that we're pointing > to it twice. If you learn how to parse the debug-tree output then this can be seen pretty easily. To do this we can watch the leaves of the fs tree for the inode and extent items of the file we work with: # dd if=/dev/zero bs=1M count=1k of=/tmp/image # losetup /dev/loop0 /tmp/image # ./mkfs.btrfs /dev/loop0 # mount -t btrfs /dev/loop0 /mnt/btrfs # dd if=/dev/zero bs=64M count=1 of=/mnt/btrfs/test # sync # ./debug-tree /tmp/image item 5 key (256 11 258) itemoff 3779 itemsize 26 dir index 258 type 1 namelen 4 datalen 0 name: test [...] item 1 key (258 1 0) itemoff 2699 itemsize 108 inode generation 0 size 67108864 [...] [...] item 3 key (258 12 0) itemoff 2652 itemsize 41 extent data disk byte 190382080 nr 67108864 extent data offset 0 nr 67108864 In the root directory we found a dirent for our test file which shows it has objectid 258, then we found its inode with size=64m and the file extent which references the 64m extent on disk which starts at byte offset 190382080. So now we over-write a 4k region in the file at offset 64k. # dd if=/dev/zero bs=4k count=1 seek=16 of=/mnt/btrfs/test conv=notrunc # sync # ./debug-tree /tmp/image item 1 key (258 1 0) itemoff 2699 itemsize 108 inode generation 0 size 67108864 [...] [...] item 3 key (258 12 0) itemoff 2652 itemsize 41 extent data disk byte 190382080 nr 67108864 extent data offset 0 nr 65536 item 4 key (258 12 65536) itemoff 2611 itemsize 41 extent data disk byte 257490944 nr 4096 extent data offset 0 nr 4096 item 5 key (258 12 69632) itemoff 2570 itemsize 41 extent data disk byte 190382080 nr 67108864 extent data offset 69632 nr 67039232 We still have the same inode, and it has the same size, but its extent items look very different. The extent for the first 64k looks much the same. It references the old 64m extent on disk. But see the 'nr 65536', it only maps 64k of that 64m into the file. Then we have the 4k extent that we just wrote. Then we have another reference to that 64m extent but for the remaining data after the new 4k. The extra credit assignment is to observe the effect of these extent reference item changes on the reference count items which are stored over in the leaves of the extent allocation tree. debug-tree is fantastic, but it can be kind of intimidating if you don't already know what all the numbers mean :). Reducing the barrier to understanding its output might be a great project for someone interested in learning the disk format without having to learn how to work with the kernel code. - z ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [Btrfs-devel] cloning file data 2008-04-25 16:50 ` Zach Brown @ 2008-04-25 16:58 ` Chris Mason 2008-04-25 17:04 ` Zach Brown 0 siblings, 1 reply; 32+ messages in thread From: Chris Mason @ 2008-04-25 16:58 UTC (permalink / raw) To: Zach Brown; +Cc: Sage Weil, btrfs-devel, linux-btrfs On Friday 25 April 2008, Zach Brown wrote: > > We've written into the middle of that 100MB extent, and we need to do > > COW. One option is to read the whole thing, change 4k and write it all > > back. Instead, btrfs does something like this (+/- off by need more > > coffee errors): > > > > file pos = 0 -> [ old extent, offset = 0, num_bytes = 400k ] > > file pos = 409600 -> [ new 4k extent, offset = 0, num_bytes = 4k ] > > file pos = 413696 -> [ old extent, offset = 413696, num_bytes = 100MB - > > 404k] > > > > An extra reference is taken on the old extent to reflect that we're > > pointing to it twice. > > If you learn how to parse the debug-tree output then this can be seen > pretty easily. To do this we can watch the leaves of the fs tree for > the inode and extent items of the file we work with: Thanks for this example Zach, it's great. One small note: > > # dd if=/dev/zero bs=1M count=1k of=/tmp/image > # losetup /dev/loop0 /tmp/image > # ./mkfs.btrfs /dev/loop0 > # mount -t btrfs /dev/loop0 /mnt/btrfs > > # dd if=/dev/zero bs=64M count=1 of=/mnt/btrfs/test > # sync > > # ./debug-tree /tmp/image Running debug-tree on a live FS is a very good way to learn about trees that get left around while snapshot deletion is happening and cache aliasing caused by the way Btrfs puts metadata into its own address space. But, if you're trying to learn the disk format, I'd stick an unmount between the dd and the debug-tree ;) -chris ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [Btrfs-devel] cloning file data 2008-04-25 16:58 ` Chris Mason @ 2008-04-25 17:04 ` Zach Brown 0 siblings, 0 replies; 32+ messages in thread From: Zach Brown @ 2008-04-25 17:04 UTC (permalink / raw) To: Chris Mason; +Cc: Sage Weil, btrfs-devel, linux-btrfs > Running debug-tree on a live FS is a very good way to learn about trees that > get left around while snapshot deletion is happening and cache aliasing > caused by the way Btrfs puts metadata into its own address space. > > But, if you're trying to learn the disk format, I'd stick an unmount between > the dd and the debug-tree ;) Haha, true, true. If you don't know to watch for confusing racey results, indeed, serialize around each dump :). - z ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [Btrfs-devel] cloning file data 2008-04-25 13:41 ` [Btrfs-devel] " Chris Mason 2008-04-25 16:50 ` Zach Brown @ 2008-04-25 16:50 ` Zach Brown 2008-04-25 18:32 ` Sage Weil 2008-04-25 18:26 ` Sage Weil 2008-04-25 20:28 ` [Btrfs-devel] cloning file data Sage Weil 3 siblings, 1 reply; 32+ messages in thread From: Zach Brown @ 2008-04-25 16:50 UTC (permalink / raw) To: Sage Weil; +Cc: Chris Mason, btrfs-devel, linux-btrfs > We've written into the middle of that 100MB extent, and we need to do COW. > One option is to read the whole thing, change 4k and write it all back. > Instead, btrfs does something like this (+/- off by need more coffee errors): > > file pos = 0 -> [ old extent, offset = 0, num_bytes = 400k ] > file pos = 409600 -> [ new 4k extent, offset = 0, num_bytes = 4k ] > file pos = 413696 -> [ old extent, offset = 413696, num_bytes = 100MB - 404k] > > An extra reference is taken on the old extent to reflect that we're pointing > to it twice. If you learn how to parse the debug-tree output then this can be seen pretty easily. To do this we can watch the leaves of the fs tree for the inode and extent items of the file we work with: # dd if=/dev/zero bs=1M count=1k of=/tmp/image # losetup /dev/loop0 /tmp/image # ./mkfs.btrfs /dev/loop0 # mount -t btrfs /dev/loop0 /mnt/btrfs # dd if=/dev/zero bs=64M count=1 of=/mnt/btrfs/test # sync # ./debug-tree /tmp/image item 5 key (256 11 258) itemoff 3779 itemsize 26 dir index 258 type 1 namelen 4 datalen 0 name: test [...] item 1 key (258 1 0) itemoff 2699 itemsize 108 inode generation 0 size 67108864 [...] [...] item 3 key (258 12 0) itemoff 2652 itemsize 41 extent data disk byte 190382080 nr 67108864 extent data offset 0 nr 67108864 In the root directory we found a dirent for our test file which shows it has objectid 258, then we found its inode with size=64m and the file extent which references the 64m extent on disk which starts at byte offset 190382080. So now we over-write a 4k region in the file at offset 64k. # dd if=/dev/zero bs=4k count=1 seek=16 of=/mnt/btrfs/test conv=notrunc # sync # ./debug-tree /tmp/image item 1 key (258 1 0) itemoff 2699 itemsize 108 inode generation 0 size 67108864 [...] [...] item 3 key (258 12 0) itemoff 2652 itemsize 41 extent data disk byte 190382080 nr 67108864 extent data offset 0 nr 65536 item 4 key (258 12 65536) itemoff 2611 itemsize 41 extent data disk byte 257490944 nr 4096 extent data offset 0 nr 4096 item 5 key (258 12 69632) itemoff 2570 itemsize 41 extent data disk byte 190382080 nr 67108864 extent data offset 69632 nr 67039232 We still have the same inode, and it has the same size, but its extent items look very different. The extent for the first 64k looks much the same. It references the old 64m extent on disk. But see the 'nr 65536', it only maps 64k of that 64m into the file. Then we have the 4k extent that we just wrote. Then we have another reference to that 64m extent but for the remaining data after the new 4k. The extra credit assignment is to observe the effect of these extent reference item changes on the reference count items which are stored over in the leaves of the extent allocation tree. debug-tree is fantastic, but it can be kind of intimidating if you don't already know what all the numbers mean :). Reducing the barrier to understanding its output might be a great project for someone interested in learning the disk format without having to learn how to work with the kernel code. - z ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [Btrfs-devel] cloning file data 2008-04-25 16:50 ` Zach Brown @ 2008-04-25 18:32 ` Sage Weil 0 siblings, 0 replies; 32+ messages in thread From: Sage Weil @ 2008-04-25 18:32 UTC (permalink / raw) To: Zach Brown; +Cc: Chris Mason, btrfs-devel, linux-btrfs On Fri, 25 Apr 2008, Zach Brown wrote: > We still have the same inode, and it has the same size, but its extent > items look very different. The extent for the first 64k looks much the > same. It references the old 64m extent on disk. But see the 'nr > 65536', it only maps 64k of that 64m into the file. Then we have the 4k > extent that we just wrote. Then we have another reference to that 64m > extent but for the remaining data after the new 4k. Is there anything in the defragger (or whatever) that looks for minimally referenced extents? Once can imagine a situation where only a small piece of a large extent is remains referenced, but that information is buried in the forward reference(s). > debug-tree is fantastic, but it can be kind of intimidating if you don't > already know what all the numbers mean :). Reducing the barrier to Yep. Although in my case, the biggest stumbling block was realizing that the key type > item 3 key (258 12 0) itemoff 2652 itemsize 41 ^^ is in printed in hex for some reason. Der. sage ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [Btrfs-devel] cloning file data 2008-04-25 13:41 ` [Btrfs-devel] " Chris Mason 2008-04-25 16:50 ` Zach Brown 2008-04-25 16:50 ` Zach Brown @ 2008-04-25 18:26 ` Sage Weil 2008-04-26 4:38 ` Sage Weil 2008-04-25 20:28 ` [Btrfs-devel] cloning file data Sage Weil 3 siblings, 1 reply; 32+ messages in thread From: Sage Weil @ 2008-04-25 18:26 UTC (permalink / raw) To: Chris Mason; +Cc: btrfs-devel, linux-btrfs On Fri, 25 Apr 2008, Chris Mason wrote: > Very cool. I'd actually loved to see this wrapped into a program that will > cow a directory tree. Basically the same as cp -al, but with cow instead of > linking. Yeah definitely. I added a -c/--cow flag to GNU cp, but am having trouble coercing autotools into even building on my box. I'll fiddle with it a little later. Basically, it just tries the ioctl, and goes into the regular copy read/write loop if that fails. > > However, btrfs_insert_file_extent() always set offset=0. I'm guessing I > > need to add an argument there and fix up the other callers? > > > Yes, you need to preserve the offset. There's only one place right now that > sets a non-zero offset and it inserts the extent by hand for other reasons > (if you're brave, file.c:btrfs_drop_extents) I see. In this case, since I'm duplicating the forward and backrefs, I just added the offset arg to btrfs_insert_file_extent(). > By taking the inode mutex, you protect against file_write and truncates > changing the file. But, we also need to prevent mmaps from changing the file > pages as well. What you want to do lock all the file bytes in the extent > tree: > > lock_extent(&BTRFS_I(src_inode)->io_tree, 0, (u64)-1, GFP_NOFS); > > But unfortunately, the code to fill delayed allocation takes that same lock. > So you need to loop a bit: > > while(1) { > filemap_write_and_wait(src_inode); > lock_extent() > if (BTRFS_I(src_inode)->delalloc_bytes == 0) > break; > unlock_extent() > } > > That should keep you from racing with btrfs_page_mkwrite() Ah, that's what I was looking for. The adjusted patch is below! Thanks- sage diff -r 1791a620d509 ctree.h --- a/ctree.h Thu Apr 24 13:43:27 2008 -0700 +++ b/ctree.h Fri Apr 25 10:12:46 2008 -0700 @@ -1135,9 +1135,9 @@ int btrfs_lookup_inode(struct btrfs_tran /* file-item.c */ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 objectid, u64 pos, u64 offset, + u64 objectid, u64 pos, u64 disk_offset, u64 disk_num_bytes, - u64 num_bytes); + u64 num_bytes, u64 offset); int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, diff -r 1791a620d509 file-item.c --- a/file-item.c Thu Apr 24 13:43:27 2008 -0700 +++ b/file-item.c Fri Apr 25 10:12:46 2008 -0700 @@ -28,10 +28,10 @@ sizeof(struct btrfs_item) * 2) / \ BTRFS_CRC32_SIZE) - 1)) int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 pos, - u64 offset, u64 disk_num_bytes, - u64 num_bytes) + struct btrfs_root *root, + u64 objectid, u64 pos, + u64 disk_offset, u64 disk_num_bytes, + u64 num_bytes, u64 offset) { int ret = 0; struct btrfs_file_extent_item *item; @@ -53,9 +53,9 @@ int btrfs_insert_file_extent(struct btrf leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - btrfs_set_file_extent_disk_bytenr(leaf, item, offset); + btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset); btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes); - btrfs_set_file_extent_offset(leaf, item, 0); + btrfs_set_file_extent_offset(leaf, item, offset); btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); btrfs_set_file_extent_generation(leaf, item, trans->transid); btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); diff -r 1791a620d509 file.c --- a/file.c Thu Apr 24 13:43:27 2008 -0700 +++ b/file.c Fri Apr 25 10:12:46 2008 -0700 @@ -285,7 +285,7 @@ static int noinline dirty_and_release_pa err = btrfs_insert_file_extent(trans, root, inode->i_ino, last_pos_in_file, - 0, 0, hole_size); + 0, 0, hole_size, 0); btrfs_drop_extent_cache(inode, last_pos_in_file, last_pos_in_file + hole_size -1); btrfs_check_file(root, inode); diff -r 1791a620d509 inode.c --- a/inode.c Thu Apr 24 13:43:27 2008 -0700 +++ b/inode.c Fri Apr 25 10:12:46 2008 -0700 @@ -18,6 +18,7 @@ #include <linux/bio.h> #include <linux/buffer_head.h> +#include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/highmem.h> @@ -134,7 +135,7 @@ static int cow_file_range(struct inode * } ret = btrfs_insert_file_extent(trans, root, inode->i_ino, start, ins.objectid, ins.offset, - ins.offset); + ins.offset, 0); inode->i_blocks += ins.offset >> 9; btrfs_check_file(root, inode); num_bytes -= cur_alloc_size; @@ -1046,7 +1047,7 @@ static int btrfs_setattr(struct dentry * err = btrfs_insert_file_extent(trans, root, inode->i_ino, hole_start, 0, 0, - hole_size); + hole_size, 0); btrfs_drop_extent_cache(inode, hole_start, hole_size - 1); btrfs_check_file(root, inode); @@ -2726,6 +2727,168 @@ long btrfs_ioctl_trans_end(struct file * return 0; } +void dup_item_to_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf, + int slot, + struct btrfs_key *key, + u64 destino) +{ + struct btrfs_path *cpath = btrfs_alloc_path(); + int len = btrfs_item_size_nr(leaf, slot); + int dstoff; + struct btrfs_key ckey = *key; + int ret; + + ckey.objectid = destino; + ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len); + dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]); + copy_extent_buffer(cpath->nodes[0], leaf, dstoff, + btrfs_item_ptr_offset(leaf, slot), + len); + btrfs_release_path(root, cpath); +} + +long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct file *src_file; + struct inode *src; + struct btrfs_trans_handle *trans; + int ret; + u64 pos; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf; + u32 nritems; + int nextret; + int slot; + + src_file = fget(src_fd); + if (!src_file) + return -EBADF; + src = src_file->f_dentry->d_inode; + + ret = -EXDEV; + if (src->i_sb != inode->i_sb) + goto out_fput; + + if (inode < src) { + mutex_lock(&inode->i_mutex); + mutex_lock(&src->i_mutex); + } else { + mutex_lock(&src->i_mutex); + mutex_lock(&inode->i_mutex); + } + + ret = -ENOTEMPTY; + if (inode->i_size) + goto out_unlock; + + /* do any pending delalloc/csum calc on src, one way or + another, and lock file content */ + while (1) { + filemap_write_and_wait(src->i_mapping); + lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); + if (BTRFS_I(src)->delalloc_bytes == 0) + break; + unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); + } + + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 0); + path = btrfs_alloc_path(); + pos = 0; + while (1) { + ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino, + pos, 0); + if (ret < 0) + goto out; + if (ret > 0) { + if (path->slots[0] == 0) { + ret = 0; + goto out; + } + path->slots[0]--; + } + next_slot: + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + + printk("key(%llu %x %llu)\n", + key.objectid, key.type, key.offset); + if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY || + key.objectid != src->i_ino) + goto out; + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { + struct btrfs_file_extent_item *extent; + int found_type; + u64 len; + pos = key.offset; + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, extent); + len = btrfs_file_extent_num_bytes(leaf, extent); + if (found_type == BTRFS_FILE_EXTENT_REG) { + u64 ds = btrfs_file_extent_disk_bytenr(leaf, + extent); + u64 dl = btrfs_file_extent_disk_num_bytes(leaf, + extent); + u64 off = btrfs_file_extent_offset(leaf, + extent); + printk(" %llu~%llu disk %llu~%llu off %llu\n", + pos, len, ds, dl, off); + btrfs_insert_file_extent(trans, root, + inode->i_ino, pos, + ds, dl, len, off); + btrfs_inc_extent_ref(trans, root, ds, dl, + root->root_key.objectid, + trans->transid, + inode->i_ino, pos); + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) + dup_item_to_inode(trans, root, path, leaf, slot, + &key, inode->i_ino); + pos = key.offset + len; + } else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) + dup_item_to_inode(trans, root, path, leaf, slot, &key, + inode->i_ino); + + nritems = btrfs_header_nritems(leaf); + if (slot >= nritems - 1) { + nextret = btrfs_next_leaf(root, path); + if (nextret) + goto out; + } else { + path->slots[0]++; + } + goto next_slot; + } + +out: + ret = 0; + mutex_unlock(&root->fs_info->fs_mutex); + + i_size_write(inode, src->i_size); + inode->i_blocks = src->i_blocks; + mark_inode_dirty(inode); + + unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); + + mutex_lock(&root->fs_info->fs_mutex); + btrfs_end_transaction(trans, root); + mutex_unlock(&root->fs_info->fs_mutex); + +out_unlock: + mutex_unlock(&src->i_mutex); + mutex_unlock(&inode->i_mutex); +out_fput: + fput(src_file); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2744,6 +2907,9 @@ long btrfs_ioctl(struct file *file, unsi return btrfs_ioctl_trans_end(file); case BTRFS_IOC_SYNC: btrfs_sync_fs(file->f_dentry->d_sb, 1); + return 0; + case BTRFS_IOC_CLONE: + btrfs_ioctl_clone(file, arg); return 0; } diff -r 1791a620d509 ioctl.h --- a/ioctl.h Thu Apr 24 13:43:27 2008 -0700 +++ b/ioctl.h Fri Apr 25 10:12:46 2008 -0700 @@ -36,5 +36,6 @@ struct btrfs_ioctl_vol_args { #define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) #define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) #define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) +#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) #endif ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [Btrfs-devel] cloning file data 2008-04-25 18:26 ` Sage Weil @ 2008-04-26 4:38 ` Sage Weil 2008-05-03 4:44 ` Yan Zheng 0 siblings, 1 reply; 32+ messages in thread From: Sage Weil @ 2008-04-26 4:38 UTC (permalink / raw) To: Chris Mason; +Cc: btrfs-devel, linux-btrfs Hi- Couple small fixes. Cloning a linux kernel tree (cp -ac) takes 5-6 seconds (cp -al is 3-4 seconds, cp -a is 15-20 seconds). sage diff -r 1791a620d509 ctree.h --- a/ctree.h Thu Apr 24 13:43:27 2008 -0700 +++ b/ctree.h Fri Apr 25 21:14:25 2008 -0700 @@ -1135,9 +1135,9 @@ int btrfs_lookup_inode(struct btrfs_tran /* file-item.c */ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 objectid, u64 pos, u64 offset, + u64 objectid, u64 pos, u64 disk_offset, u64 disk_num_bytes, - u64 num_bytes); + u64 num_bytes, u64 offset); int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, diff -r 1791a620d509 file-item.c --- a/file-item.c Thu Apr 24 13:43:27 2008 -0700 +++ b/file-item.c Fri Apr 25 21:14:25 2008 -0700 @@ -28,10 +28,10 @@ sizeof(struct btrfs_item) * 2) / \ BTRFS_CRC32_SIZE) - 1)) int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 pos, - u64 offset, u64 disk_num_bytes, - u64 num_bytes) + struct btrfs_root *root, + u64 objectid, u64 pos, + u64 disk_offset, u64 disk_num_bytes, + u64 num_bytes, u64 offset) { int ret = 0; struct btrfs_file_extent_item *item; @@ -53,9 +53,9 @@ int btrfs_insert_file_extent(struct btrf leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - btrfs_set_file_extent_disk_bytenr(leaf, item, offset); + btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset); btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes); - btrfs_set_file_extent_offset(leaf, item, 0); + btrfs_set_file_extent_offset(leaf, item, offset); btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); btrfs_set_file_extent_generation(leaf, item, trans->transid); btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); diff -r 1791a620d509 file.c --- a/file.c Thu Apr 24 13:43:27 2008 -0700 +++ b/file.c Fri Apr 25 21:14:25 2008 -0700 @@ -285,7 +285,7 @@ static int noinline dirty_and_release_pa err = btrfs_insert_file_extent(trans, root, inode->i_ino, last_pos_in_file, - 0, 0, hole_size); + 0, 0, hole_size, 0); btrfs_drop_extent_cache(inode, last_pos_in_file, last_pos_in_file + hole_size -1); btrfs_check_file(root, inode); diff -r 1791a620d509 inode.c --- a/inode.c Thu Apr 24 13:43:27 2008 -0700 +++ b/inode.c Fri Apr 25 21:14:25 2008 -0700 @@ -18,6 +18,7 @@ #include <linux/bio.h> #include <linux/buffer_head.h> +#include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/highmem.h> @@ -134,7 +135,7 @@ static int cow_file_range(struct inode * } ret = btrfs_insert_file_extent(trans, root, inode->i_ino, start, ins.objectid, ins.offset, - ins.offset); + ins.offset, 0); inode->i_blocks += ins.offset >> 9; btrfs_check_file(root, inode); num_bytes -= cur_alloc_size; @@ -1046,7 +1047,7 @@ static int btrfs_setattr(struct dentry * err = btrfs_insert_file_extent(trans, root, inode->i_ino, hole_start, 0, 0, - hole_size); + hole_size, 0); btrfs_drop_extent_cache(inode, hole_start, hole_size - 1); btrfs_check_file(root, inode); @@ -2726,6 +2727,168 @@ long btrfs_ioctl_trans_end(struct file * return 0; } +void dup_item_to_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf, + int slot, + struct btrfs_key *key, + u64 destino) +{ + struct btrfs_path *cpath = btrfs_alloc_path(); + int len = btrfs_item_size_nr(leaf, slot); + int dstoff; + struct btrfs_key ckey = *key; + int ret; + + ckey.objectid = destino; + ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len); + dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]); + copy_extent_buffer(cpath->nodes[0], leaf, dstoff, + btrfs_item_ptr_offset(leaf, slot), + len); + btrfs_release_path(root, cpath); +} + +long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct file *src_file; + struct inode *src; + struct btrfs_trans_handle *trans; + int ret; + u64 pos; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf; + u32 nritems; + int nextret; + int slot; + + src_file = fget(src_fd); + if (!src_file) + return -EBADF; + src = src_file->f_dentry->d_inode; + + ret = -EXDEV; + if (src->i_sb != inode->i_sb) + goto out_fput; + + if (inode < src) { + mutex_lock(&inode->i_mutex); + mutex_lock(&src->i_mutex); + } else { + mutex_lock(&src->i_mutex); + mutex_lock(&inode->i_mutex); + } + + ret = -ENOTEMPTY; + if (inode->i_size) + goto out_unlock; + + /* do any pending delalloc/csum calc on src, one way or + another, and lock file content */ + while (1) { + filemap_write_and_wait(src->i_mapping); + lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); + if (BTRFS_I(src)->delalloc_bytes == 0) + break; + unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); + } + + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 0); + path = btrfs_alloc_path(); + pos = 0; + while (1) { + ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino, + pos, 0); + if (ret < 0) + goto out; + if (ret > 0) { + if (path->slots[0] == 0) { + ret = 0; + goto out; + } + path->slots[0]--; + } + next_slot: + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + nritems = btrfs_header_nritems(leaf); + + if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY || + key.objectid != src->i_ino) + goto out; + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { + struct btrfs_file_extent_item *extent; + int found_type; + pos = key.offset; + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, extent); + if (found_type == BTRFS_FILE_EXTENT_REG) { + u64 len = btrfs_file_extent_num_bytes(leaf, + extent); + u64 ds = btrfs_file_extent_disk_bytenr(leaf, + extent); + u64 dl = btrfs_file_extent_disk_num_bytes(leaf, + extent); + u64 off = btrfs_file_extent_offset(leaf, + extent); + btrfs_insert_file_extent(trans, root, + inode->i_ino, pos, + ds, dl, len, off); + btrfs_inc_extent_ref(trans, root, ds, dl, + root->root_key.objectid, + trans->transid, + inode->i_ino, pos); + pos = key.offset + len; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + dup_item_to_inode(trans, root, path, leaf, slot, + &key, inode->i_ino); + pos = key.offset + btrfs_item_size_nr(leaf, + slot); + } + } else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) + dup_item_to_inode(trans, root, path, leaf, slot, &key, + inode->i_ino); + + if (slot >= nritems - 1) { + nextret = btrfs_next_leaf(root, path); + if (nextret) + goto out; + } else { + path->slots[0]++; + } + goto next_slot; + } + +out: + btrfs_free_path(path); + ret = 0; + mutex_unlock(&root->fs_info->fs_mutex); + + i_size_write(inode, src->i_size); + inode->i_blocks = src->i_blocks; + mark_inode_dirty(inode); + + unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); + + mutex_lock(&root->fs_info->fs_mutex); + btrfs_end_transaction(trans, root); + mutex_unlock(&root->fs_info->fs_mutex); + +out_unlock: + mutex_unlock(&src->i_mutex); + mutex_unlock(&inode->i_mutex); +out_fput: + fput(src_file); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2744,6 +2907,9 @@ long btrfs_ioctl(struct file *file, unsi return btrfs_ioctl_trans_end(file); case BTRFS_IOC_SYNC: btrfs_sync_fs(file->f_dentry->d_sb, 1); + return 0; + case BTRFS_IOC_CLONE: + btrfs_ioctl_clone(file, arg); return 0; } diff -r 1791a620d509 ioctl.h --- a/ioctl.h Thu Apr 24 13:43:27 2008 -0700 +++ b/ioctl.h Fri Apr 25 21:14:25 2008 -0700 @@ -36,5 +36,6 @@ struct btrfs_ioctl_vol_args { #define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) #define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) #define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) +#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) #endif ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [Btrfs-devel] cloning file data 2008-04-26 4:38 ` Sage Weil @ 2008-05-03 4:44 ` Yan Zheng 2008-05-03 6:16 ` Sage Weil 0 siblings, 1 reply; 32+ messages in thread From: Yan Zheng @ 2008-05-03 4:44 UTC (permalink / raw) To: Sage Weil; +Cc: Chris Mason, btrfs-devel, linux-btrfs Hello Sage, I think the clone ioctl won't work in some corner case. The big loop in btrfs_ioctl_clone uses path->slots[0]++ and btrfs_next_leaf to get next item in the tree. However, this approach works only when the layout of tree keeps unchangeed. In btrfs_ioctl_clone, both btrfs_insert_file_extent and dup_item_to_inode may change the layout of tree. To be safe, I think the codes should: use btrfs_search_slot to find next item. use a intermediate buffer when coping item between two extent buffer. Regards YZ ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [Btrfs-devel] cloning file data 2008-05-03 4:44 ` Yan Zheng @ 2008-05-03 6:16 ` Sage Weil 2008-05-03 6:48 ` Yan Zheng 2008-05-03 7:25 ` Yan Zheng 0 siblings, 2 replies; 32+ messages in thread From: Sage Weil @ 2008-05-03 6:16 UTC (permalink / raw) To: Yan Zheng; +Cc: Chris Mason, btrfs-devel, linux-btrfs Hi Yan- On Sat, 3 May 2008, Yan Zheng wrote: > I think the clone ioctl won't work in some corner case. The big loop > in btrfs_ioctl_clone uses path->slots[0]++ and btrfs_next_leaf to get > next item in the tree. However, this approach works only when the > layout of tree keeps unchangeed. In btrfs_ioctl_clone, both > btrfs_insert_file_extent and dup_item_to_inode may change the layout > of tree. > > To be safe, I think the codes should: > use btrfs_search_slot to find next item. > use a intermediate buffer when coping item between two extent buffer. Oh, right. I think for the item copy, though, we just need to re-search for the source key again after doing the insert_empty_item. Then we can still use copy_extent_buffer, since at that point both paths will be valid? Something like the below (untested) patch. I suspect I didn't hit this because I was always cloning 'file' to something like 'file2' that always sorted after the src file, and didn't shift its position in the leaf. I'll try to test this in the next few days. Thanks- sage diff -r f6ba18a50ad7 inode.c --- a/inode.c Fri May 02 16:13:49 2008 -0400 +++ b/inode.c Fri May 02 23:11:49 2008 -0700 @@ -3103,25 +3103,27 @@ out: void dup_item_to_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *leaf, - int slot, - struct btrfs_key *key, + struct btrfs_path *srcpath, + struct btrfs_key *srckey, u64 destino) { - struct btrfs_path *cpath = btrfs_alloc_path(); - int len = btrfs_item_size_nr(leaf, slot); - int dstoff; - struct btrfs_key ckey = *key; + struct btrfs_key dstkey = *srckey; + struct btrfs_path *dstpath = btrfs_alloc_path(); + int len = btrfs_item_size_nr(srcpath->nodes[0], + srcpath->slots[0]); int ret; - ckey.objectid = destino; - ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len); - dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]); - copy_extent_buffer(cpath->nodes[0], leaf, dstoff, - btrfs_item_ptr_offset(leaf, slot), + dstkey.objectid = destino; + ret = btrfs_insert_empty_item(trans, root, dstpath, &dstkey, len); + /* re-search for src key, in case we changed srcpath */ + ret = btrfs_search_slot(trans, root, srckey, srcpath, 0, 0); + copy_extent_buffer(dstpath->nodes[0], srcpath->nodes[0], + btrfs_item_ptr_offset(dstpath->nodes[0], + dstpath->slots[0]), + btrfs_item_ptr_offset(srcpath->nodes[0], + srcpath->slots[0]), len); - btrfs_release_path(root, cpath); + btrfs_release_path(root, dstpath); } long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) @@ -3137,7 +3139,6 @@ long btrfs_ioctl_clone(struct file *file struct btrfs_key key; struct extent_buffer *leaf; u32 nritems; - int nextret; int slot; src_file = fget(src_fd); @@ -3178,6 +3179,8 @@ long btrfs_ioctl_clone(struct file *file while (1) { ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino, pos, 0); + +next_slot: if (ret < 0) goto out; if (ret > 0) { @@ -3187,7 +3190,7 @@ long btrfs_ioctl_clone(struct file *file } path->slots[0]--; } -next_slot: + leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); @@ -3225,22 +3228,19 @@ next_slot: } pos = key.offset + len; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - dup_item_to_inode(trans, root, path, leaf, slot, - &key, inode->i_ino); + dup_item_to_inode(trans, root, path, &key, + inode->i_ino); pos = key.offset + btrfs_item_size_nr(leaf, slot); } } else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) - dup_item_to_inode(trans, root, path, leaf, slot, &key, + dup_item_to_inode(trans, root, path, &key, inode->i_ino); - if (slot >= nritems - 1) { - nextret = btrfs_next_leaf(root, path); - if (nextret) - goto out; - } else { - path->slots[0]++; - } + /* path may not still be valid, so explicitly search + * for the next key */ + key.offset = pos; + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); goto next_slot; } ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [Btrfs-devel] cloning file data 2008-05-03 6:16 ` Sage Weil @ 2008-05-03 6:48 ` Yan Zheng 2008-05-03 7:25 ` Yan Zheng 1 sibling, 0 replies; 32+ messages in thread From: Yan Zheng @ 2008-05-03 6:48 UTC (permalink / raw) To: Sage Weil; +Cc: Chris Mason, btrfs-devel, linux-btrfs 2008/5/3, Sage Weil <sage@newdream.net>: > Hi Yan- > > On Sat, 3 May 2008, Yan Zheng wrote: > > I think the clone ioctl won't work in some corner case. The big loop > > in btrfs_ioctl_clone uses path->slots[0]++ and btrfs_next_leaf to get > > next item in the tree. However, this approach works only when the > > layout of tree keeps unchangeed. In btrfs_ioctl_clone, both > > btrfs_insert_file_extent and dup_item_to_inode may change the layout > > of tree. > > > > To be safe, I think the codes should: > > use btrfs_search_slot to find next item. > > use a intermediate buffer when coping item between two extent buffer. > > Oh, right. I think for the item copy, though, we just need to re-search > for the source key again after doing the insert_empty_item. Then we can > still use copy_extent_buffer, since at that point both paths will be > valid? > > Something like the below (untested) patch. I suspect I didn't hit this > because I was always cloning 'file' to something like 'file2' that always > sorted after the src file, and didn't shift its position in the leaf. I'll > try to test this in the next few days. > > Thanks- > sage > > > diff -r f6ba18a50ad7 inode.c > --- a/inode.c Fri May 02 16:13:49 2008 -0400 > +++ b/inode.c Fri May 02 23:11:49 2008 -0700 > @@ -3103,25 +3103,27 @@ out: > > void dup_item_to_inode(struct btrfs_trans_handle *trans, > struct btrfs_root *root, > - struct btrfs_path *path, > - struct extent_buffer *leaf, > - int slot, > - struct btrfs_key *key, > + struct btrfs_path *srcpath, > + struct btrfs_key *srckey, > u64 destino) > { > - struct btrfs_path *cpath = btrfs_alloc_path(); > - int len = btrfs_item_size_nr(leaf, slot); > - int dstoff; > - struct btrfs_key ckey = *key; > + struct btrfs_key dstkey = *srckey; > + struct btrfs_path *dstpath = btrfs_alloc_path(); > + int len = btrfs_item_size_nr(srcpath->nodes[0], > + srcpath->slots[0]); > int ret; > > - ckey.objectid = destino; > - ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len); > - dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]); > - copy_extent_buffer(cpath->nodes[0], leaf, dstoff, > - btrfs_item_ptr_offset(leaf, slot), > + dstkey.objectid = destino; > + ret = btrfs_insert_empty_item(trans, root, dstpath, &dstkey, len); > + /* re-search for src key, in case we changed srcpath */ > + ret = btrfs_search_slot(trans, root, srckey, srcpath, 0, 0); > + copy_extent_buffer(dstpath->nodes[0], srcpath->nodes[0], > + btrfs_item_ptr_offset(dstpath->nodes[0], > + dstpath->slots[0]), > + btrfs_item_ptr_offset(srcpath->nodes[0], > + srcpath->slots[0]), > len); > - btrfs_release_path(root, cpath); > + btrfs_release_path(root, dstpath); > } > > long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) > @@ -3137,7 +3139,6 @@ long btrfs_ioctl_clone(struct file *file > struct btrfs_key key; > struct extent_buffer *leaf; > u32 nritems; > - int nextret; > int slot; > > src_file = fget(src_fd); > @@ -3178,6 +3179,8 @@ long btrfs_ioctl_clone(struct file *file > while (1) { > ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino, > pos, 0); > + > +next_slot: > if (ret < 0) > goto out; > if (ret > 0) { > @@ -3187,7 +3190,7 @@ long btrfs_ioctl_clone(struct file *file > } > path->slots[0]--; > } > -next_slot: > + > leaf = path->nodes[0]; > slot = path->slots[0]; > btrfs_item_key_to_cpu(leaf, &key, slot); > @@ -3225,22 +3228,19 @@ next_slot: > } > pos = key.offset + len; > } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { > - dup_item_to_inode(trans, root, path, leaf, slot, > - &key, inode->i_ino); > + dup_item_to_inode(trans, root, path, &key, > + inode->i_ino); > pos = key.offset + btrfs_item_size_nr(leaf, > slot); > } > } else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) > - dup_item_to_inode(trans, root, path, leaf, slot, &key, > + dup_item_to_inode(trans, root, path, &key, > inode->i_ino); > > - if (slot >= nritems - 1) { > - nextret = btrfs_next_leaf(root, path); > - if (nextret) > - goto out; > - } else { > - path->slots[0]++; > - } > + /* path may not still be valid, so explicitly search > + * for the next key */ > + key.offset = pos; > + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); > goto next_slot; > } key.offset isn't updated properly. you should differentiate extent item, inline extent item and csum item. For extent item: key.offset += btrfs_file_extent_num_bytes(...); For inline extent item: u32 len = btrfs_file_extent_inline_len(...); key.offset += ALIGN(len, root->sectorsize); For csum item key.offset += btrfs_item_size_nr(...) / BTRFS_CRC32_SIZE * root->sectorsize; Regards YZ ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [Btrfs-devel] cloning file data 2008-05-03 6:16 ` Sage Weil 2008-05-03 6:48 ` Yan Zheng @ 2008-05-03 7:25 ` Yan Zheng 2008-05-05 10:27 ` Chris Mason 1 sibling, 1 reply; 32+ messages in thread From: Yan Zheng @ 2008-05-03 7:25 UTC (permalink / raw) To: Sage Weil; +Cc: Chris Mason, btrfs-devel, linux-btrfs 2008/5/3, Sage Weil <sage@newdream.net>: > Hi Yan- > > On Sat, 3 May 2008, Yan Zheng wrote: > > I think the clone ioctl won't work in some corner case. The big loop > > in btrfs_ioctl_clone uses path->slots[0]++ and btrfs_next_leaf to get > > next item in the tree. However, this approach works only when the > > layout of tree keeps unchangeed. In btrfs_ioctl_clone, both > > btrfs_insert_file_extent and dup_item_to_inode may change the layout > > of tree. > > > > To be safe, I think the codes should: > > use btrfs_search_slot to find next item. > > use a intermediate buffer when coping item between two extent buffer. > > Oh, right. I think for the item copy, though, we just need to re-search > for the source key again after doing the insert_empty_item. Then we can > still use copy_extent_buffer, since at that point both paths will be > valid? > > Something like the below (untested) patch. I suspect I didn't hit this > because I was always cloning 'file' to something like 'file2' that always > sorted after the src file, and didn't shift its position in the leaf. I'll > try to test this in the next few days. > > Thanks- > sage > > > diff -r f6ba18a50ad7 inode.c > --- a/inode.c Fri May 02 16:13:49 2008 -0400 > +++ b/inode.c Fri May 02 23:11:49 2008 -0700 > @@ -3103,25 +3103,27 @@ out: > > void dup_item_to_inode(struct btrfs_trans_handle *trans, > struct btrfs_root *root, > - struct btrfs_path *path, > - struct extent_buffer *leaf, > - int slot, > - struct btrfs_key *key, > + struct btrfs_path *srcpath, > + struct btrfs_key *srckey, > u64 destino) > { > - struct btrfs_path *cpath = btrfs_alloc_path(); > - int len = btrfs_item_size_nr(leaf, slot); > - int dstoff; > - struct btrfs_key ckey = *key; > + struct btrfs_key dstkey = *srckey; > + struct btrfs_path *dstpath = btrfs_alloc_path(); > + int len = btrfs_item_size_nr(srcpath->nodes[0], > + srcpath->slots[0]); > int ret; > > - ckey.objectid = destino; > - ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len); > - dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]); > - copy_extent_buffer(cpath->nodes[0], leaf, dstoff, > - btrfs_item_ptr_offset(leaf, slot), > + dstkey.objectid = destino; > + ret = btrfs_insert_empty_item(trans, root, dstpath, &dstkey, len); > + /* re-search for src key, in case we changed srcpath */ > + ret = btrfs_search_slot(trans, root, srckey, srcpath, 0, 0); > + copy_extent_buffer(dstpath->nodes[0], srcpath->nodes[0], > + btrfs_item_ptr_offset(dstpath->nodes[0], > + dstpath->slots[0]), > + btrfs_item_ptr_offset(srcpath->nodes[0], > + srcpath->slots[0]), > len); > - btrfs_release_path(root, cpath); > + btrfs_release_path(root, dstpath); > } > > long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) > @@ -3137,7 +3139,6 @@ long btrfs_ioctl_clone(struct file *file > struct btrfs_key key; > struct extent_buffer *leaf; > u32 nritems; > - int nextret; > int slot; > > src_file = fget(src_fd); > @@ -3178,6 +3179,8 @@ long btrfs_ioctl_clone(struct file *file > while (1) { > ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino, > pos, 0); > + > +next_slot: > if (ret < 0) > goto out; > if (ret > 0) { > @@ -3187,7 +3190,7 @@ long btrfs_ioctl_clone(struct file *file > } > path->slots[0]--; > } > -next_slot: > + > leaf = path->nodes[0]; > slot = path->slots[0]; > btrfs_item_key_to_cpu(leaf, &key, slot); > @@ -3225,22 +3228,19 @@ next_slot: > } > pos = key.offset + len; > } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { > - dup_item_to_inode(trans, root, path, leaf, slot, > - &key, inode->i_ino); > + dup_item_to_inode(trans, root, path, &key, > + inode->i_ino); > pos = key.offset + btrfs_item_size_nr(leaf, > slot); > } > } else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) > - dup_item_to_inode(trans, root, path, leaf, slot, &key, > + dup_item_to_inode(trans, root, path, &key, > inode->i_ino); > > - if (slot >= nritems - 1) { > - nextret = btrfs_next_leaf(root, path); > - if (nextret) > - goto out; > - } else { > - path->slots[0]++; > - } > + /* path may not still be valid, so explicitly search > + * for the next key */ > + key.offset = pos; > + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); > goto next_slot; > } > > In my previous mail, I said items of different types should be differentiated. Actually, there is no need to do that. Please consider changing the big loop in btrfs_ioctl_clone to something like: --- key.objectid = src->i_ino; key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; while (1) { ret = btrfs_search_slot(trans, root, &key, &path, 0, 0); if (ret < 0) goto fail; leaf = path.nodes[0]; if (path.slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(extent_root, &path); if (ret < 0) goto fail; if (ret > 0) break; leaf = path.nodes[0]; } btrfs_item_key_to_cpu(leaf, &key, path.slots[0]); ... do some works ... btrfs_release_path(root, path); key.offset++; } --- Regards YZ ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [Btrfs-devel] cloning file data 2008-05-03 7:25 ` Yan Zheng @ 2008-05-05 10:27 ` Chris Mason 2008-05-05 15:57 ` Sage Weil 0 siblings, 1 reply; 32+ messages in thread From: Chris Mason @ 2008-05-05 10:27 UTC (permalink / raw) To: Yan Zheng; +Cc: Sage Weil, btrfs-devel, linux-btrfs On Saturday 03 May 2008, Yan Zheng wrote: > 2008/5/3, Sage Weil <sage@newdream.net>: > > Hi Yan- > > > > On Sat, 3 May 2008, Yan Zheng wrote: > > > I think the clone ioctl won't work in some corner case. The big loop > > > in btrfs_ioctl_clone uses path->slots[0]++ and btrfs_next_leaf to get > > > next item in the tree. However, this approach works only when the > > > layout of tree keeps unchangeed. In btrfs_ioctl_clone, both > > > btrfs_insert_file_extent and dup_item_to_inode may change the layout > > > of tree. > > > > > > To be safe, I think the codes should: > > > use btrfs_search_slot to find next item. > > > use a intermediate buffer when coping item between two extent buffer. [ ... ] > In my previous mail, I said items of different types should be > differentiated. Actually, there is no need to do that. Please consider > changing the big loop in btrfs_ioctl_clone to something like: Oh, nice catch Yan, thanks. I've pushed out a new version to the unstable tree. Sage, could you please give this a try too? -chris ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [Btrfs-devel] cloning file data 2008-05-05 10:27 ` Chris Mason @ 2008-05-05 15:57 ` Sage Weil 2008-05-21 17:19 ` btrfs_put_inode Mingming 0 siblings, 1 reply; 32+ messages in thread From: Sage Weil @ 2008-05-05 15:57 UTC (permalink / raw) To: Chris Mason; +Cc: Yan Zheng, btrfs-devel, linux-btrfs > > In my previous mail, I said items of different types should be > > differentiated. Actually, there is no need to do that. Please consider > > changing the big loop in btrfs_ioctl_clone to something like: > > Oh, nice catch Yan, thanks. I've pushed out a new version to the unstable > tree. Sage, could you please give this a try too? Looks good to me. Here's a small cleanup to remove the now unnecessary pos. sage diff -r d94a17e354a8 inode.c --- a/inode.c Mon May 05 06:26:21 2008 -0400 +++ b/inode.c Mon May 05 09:02:55 2008 -0700 @@ -3135,7 +3135,6 @@ long btrfs_ioctl_clone(struct file *file struct inode *src; struct btrfs_trans_handle *trans; int ret; - u64 pos; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *leaf; @@ -3183,7 +3182,6 @@ long btrfs_ioctl_clone(struct file *file key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; key.objectid = src->i_ino; - pos = 0; path->reada = 2; while (1) { @@ -3214,7 +3212,6 @@ long btrfs_ioctl_clone(struct file *file if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { struct btrfs_file_extent_item *extent; int found_type; - pos = key.offset; extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); found_type = btrfs_file_extent_type(leaf, extent); @@ -3228,25 +3225,22 @@ long btrfs_ioctl_clone(struct file *file u64 off = btrfs_file_extent_offset(leaf, extent); btrfs_insert_file_extent(trans, root, - inode->i_ino, pos, + inode->i_ino, + key.offset, ds, dl, len, off); /* ds == 0 means there's a hole */ - if (ds != 0) { + if (ds != 0) btrfs_inc_extent_ref(trans, root, ds, dl, root->root_key.objectid, trans->transid, - inode->i_ino, pos); - } - pos = key.offset + len; + inode->i_ino, key.offset); } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { ret = dup_item_to_inode(trans, root, path, leaf, slot, &key, inode->i_ino); if (ret) goto out; - pos = key.offset + btrfs_item_size_nr(leaf, - slot); } } else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) { ret = dup_item_to_inode(trans, root, path, leaf, ^ permalink raw reply [flat|nested] 32+ messages in thread
* btrfs_put_inode 2008-05-05 15:57 ` Sage Weil @ 2008-05-21 17:19 ` Mingming 2008-05-21 18:02 ` btrfs_put_inode Chris Mason 2008-05-21 18:23 ` btrfs_put_inode Ryan Hope 0 siblings, 2 replies; 32+ messages in thread From: Mingming @ 2008-05-21 17:19 UTC (permalink / raw) To: Chris Mason; +Cc: linux-btrfs Hi Chris, btrfs failed to compile on 2.6.26-rc2 since the put_inode callback function is removed from super_operations. I noticed btrfs_put_inode() seems being actively used, to drop the inode from the tree for ordered mode transaction. Any idea where to relocate this function? Do we need to call btrfs_del_ordered_inode() for every iput()? Thanks, Mingming ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: btrfs_put_inode 2008-05-21 17:19 ` btrfs_put_inode Mingming @ 2008-05-21 18:02 ` Chris Mason 2008-05-21 18:45 ` btrfs_put_inode Mingming 2008-05-21 18:23 ` btrfs_put_inode Ryan Hope 1 sibling, 1 reply; 32+ messages in thread From: Chris Mason @ 2008-05-21 18:02 UTC (permalink / raw) To: Mingming; +Cc: linux-btrfs On Wednesday 21 May 2008, Mingming wrote: > Hi Chris, > > btrfs failed to compile on 2.6.26-rc2 since the put_inode callback > function is removed from super_operations. > > I noticed btrfs_put_inode() seems being actively used, to drop the inode > from the tree for ordered mode transaction. Any idea where to relocate > this function? Do we need to call btrfs_del_ordered_inode() for every > iput()? The ordered inode list has incremented the inode->i_count, so we can't do the iput code inside clear_inode. But, we should be able to safely move the checks to file_release. This is basically an optimization so that we don't leave clean inodes hanging around on the data=ordered list. It is safe to remove the put_inode call completely, it'll just be a little slower. -chris ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: btrfs_put_inode 2008-05-21 18:02 ` btrfs_put_inode Chris Mason @ 2008-05-21 18:45 ` Mingming 2008-05-21 18:52 ` btrfs_put_inode Chris Mason 0 siblings, 1 reply; 32+ messages in thread From: Mingming @ 2008-05-21 18:45 UTC (permalink / raw) To: Chris Mason; +Cc: linux-btrfs On Wed, 2008-05-21 at 14:02 -0400, Chris Mason wrote: > On Wednesday 21 May 2008, Mingming wrote: > > Hi Chris, > > > > btrfs failed to compile on 2.6.26-rc2 since the put_inode callback > > function is removed from super_operations. > > > > I noticed btrfs_put_inode() seems being actively used, to drop the inode > > from the tree for ordered mode transaction. Any idea where to relocate > > this function? Do we need to call btrfs_del_ordered_inode() for every > > iput()? > > The ordered inode list has incremented the inode->i_count, so we can't do the > iput code inside clear_inode. But, we should be able to safely move the > checks to file_release. > Ah okay. file release is called on the last file_close() to this inode. But I thought NFS could grab the inode without thr file_open/file_close, so we might also need to delete the inode from the ordered tree at delete_inode time, like ext3? > This is basically an optimization so that we don't leave clean inodes hanging > around on the data=ordered list. It is safe to remove the put_inode call > completely, it'll just be a little slower. > > -chris ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: btrfs_put_inode 2008-05-21 18:45 ` btrfs_put_inode Mingming @ 2008-05-21 18:52 ` Chris Mason 2008-05-21 22:29 ` [RFC][PATCH]btrfs delete ordered inode handling fix Mingming 0 siblings, 1 reply; 32+ messages in thread From: Chris Mason @ 2008-05-21 18:52 UTC (permalink / raw) To: Mingming; +Cc: linux-btrfs On Wednesday 21 May 2008, Mingming wrote: > On Wed, 2008-05-21 at 14:02 -0400, Chris Mason wrote: > > On Wednesday 21 May 2008, Mingming wrote: > > > Hi Chris, > > > > > > btrfs failed to compile on 2.6.26-rc2 since the put_inode callback > > > function is removed from super_operations. > > > > > > I noticed btrfs_put_inode() seems being actively used, to drop the > > > inode from the tree for ordered mode transaction. Any idea where to > > > relocate this function? Do we need to call btrfs_del_ordered_inode() > > > for every iput()? > > > > The ordered inode list has incremented the inode->i_count, so we can't do > > the iput code inside clear_inode. But, we should be able to safely move > > the checks to file_release. > > Ah okay. file release is called on the last file_close() to this inode. > But I thought NFS could grab the inode without thr file_open/file_close, > so we might also need to delete the inode from the ordered tree at > delete_inode time, like ext3? I do it right now in unlink, but delete_inode is fine too. -chris ^ permalink raw reply [flat|nested] 32+ messages in thread
* [RFC][PATCH]btrfs delete ordered inode handling fix 2008-05-21 18:52 ` btrfs_put_inode Chris Mason @ 2008-05-21 22:29 ` Mingming 2008-05-22 14:11 ` Chris Mason 0 siblings, 1 reply; 32+ messages in thread From: Mingming @ 2008-05-21 22:29 UTC (permalink / raw) To: Chris Mason; +Cc: linux-btrfs Hi Chris, I thought I spotted a few bugs, While looking at how to properly remove inode from ordered tree, let me know if I got it right. * inode->i_count accounting issue - remove extra i_count_decrease after calling filemap_file_write_and_wait() in btrfs_write_ordered_inodes() - Remove extra i_count decrease after each call btrfs_del_ordered_inode(), it decrease the i_count again. I couldn't find the where the i_count being increased. The tree_lock should protecting it races with btrfs_write_ordered_inodes(), no? * There is possible race with inode delete and btrfs_find_first_ordered_inode(). The inode could possibly in the process of freeing while we are trying to get hold of it during commit transaction. The fix is using igrab() instead, and search for next inode in the tree if the found one is in the middle of being released. * get rid of btrfs_put_inode(), and move the functionality under the btrfs_del_ordered_inode() directly. * Remove the inode from ordered tree at last iput(). Did not do it at file release() time, as it may remove the inode from the ordered tree before ensure the ordering of write to the same inode from other process. Perhaps calling btrfs_del_ordered_inode() under unlink() is enough, but it would not be hurt to do it again at delete_inode() time. Here is the patch trying to address above issues... compiled, but not tested. Is ordered mode on by default? Regards, Mingming diff -r c3290d51e5f9 inode.c --- a/inode.c Fri May 16 13:30:15 2008 -0400 +++ b/inode.c Wed May 21 14:51:22 2008 -0700 @@ -857,15 +857,11 @@ static int btrfs_unlink(struct inode *di nr = trans->blocks_used; if (inode->i_nlink == 0) { - int found; /* if the inode isn't linked anywhere, * we don't need to worry about * data=ordered */ - found = btrfs_del_ordered_inode(inode); - if (found == 1) { - atomic_dec(&inode->i_count); - } + btrfs_del_ordered_inode(inode); } btrfs_end_transaction(trans, root); @@ -1271,24 +1267,6 @@ fail: return err; } -void btrfs_put_inode(struct inode *inode) -{ - int ret; - - if (!BTRFS_I(inode)->ordered_trans) { - return; - } - - if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) || - mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)) - return; - - ret = btrfs_del_ordered_inode(inode); - if (ret == 1) { - atomic_dec(&inode->i_count); - } -} - void btrfs_delete_inode(struct inode *inode) { struct btrfs_trans_handle *trans; @@ -1311,6 +1289,7 @@ void btrfs_delete_inode(struct inode *in goto no_delete_lock; nr = trans->blocks_used; + btrfs_del_ordered_inode(inode); clear_inode(inode); btrfs_end_transaction(trans, root); diff -r c3290d51e5f9 ordered-data.c --- a/ordered-data.c Fri May 16 13:30:15 2008 -0400 +++ b/ordered-data.c Wed May 21 14:55:25 2008 -0700 @@ -166,7 +166,7 @@ int btrfs_find_first_ordered_inode(struc struct inode **inode) { struct tree_entry *entry; - struct rb_node *node; + struct rb_node *node, *next; write_lock(&tree->lock); node = tree_search(&tree->tree, *root_objectid, *objectid); @@ -174,6 +174,7 @@ int btrfs_find_first_ordered_inode(struc write_unlock(&tree->lock); return 0; } +again: entry = rb_entry(node, struct tree_entry, rb_node); while(comp_entry(entry, *root_objectid, *objectid) >= 0) { @@ -187,9 +188,20 @@ int btrfs_find_first_ordered_inode(struc return 0; } + *inode = igrab(entry->inode); + if (!*inode) { + next = rb_next(node); + rb_erase(node, &tree->tree); + kfree(entry); + if (!next) { + write_unlock(&tree->lock); + return 0; + } + node = next; + goto again; + } + *root_objectid = entry->root_objectid; - *inode = entry->inode; - atomic_inc(&entry->inode->i_count); *objectid = entry->objectid; write_unlock(&tree->lock); return 1; @@ -200,7 +212,7 @@ int btrfs_find_del_first_ordered_inode(s struct inode **inode) { struct tree_entry *entry; - struct rb_node *node; + struct rb_node *node, *next; write_lock(&tree->lock); node = tree_search(&tree->tree, *root_objectid, *objectid); @@ -208,7 +220,7 @@ int btrfs_find_del_first_ordered_inode(s write_unlock(&tree->lock); return 0; } - +retry: entry = rb_entry(node, struct tree_entry, rb_node); while(comp_entry(entry, *root_objectid, *objectid) >= 0) { node = rb_next(node); @@ -220,11 +232,21 @@ int btrfs_find_del_first_ordered_inode(s write_unlock(&tree->lock); return 0; } + *inode = igrab(entry->inode); + if (!*inode) { + next = rb_next(node); + rb_erase(node, &tree->tree); + kfree(entry); + if (!next) { + write_unlock(&tree->lock); + return 0; + } + node = next; + goto retry; + } *root_objectid = entry->root_objectid; *objectid = entry->objectid; - *inode = entry->inode; - atomic_inc(&entry->inode->i_count); rb_erase(node, &tree->tree); write_unlock(&tree->lock); kfree(entry); @@ -253,11 +275,19 @@ static int __btrfs_del_ordered_inode(str return 1; } -int btrfs_del_ordered_inode(struct inode *inode) +void btrfs_del_ordered_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 root_objectid = root->root_key.objectid; int ret = 0; + + if (!BTRFS_I(inode)->ordered_trans) { + return; + } + + if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) || + mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)) + return; spin_lock(&root->fs_info->new_trans_lock); if (root->fs_info->running_transaction) { @@ -267,7 +297,7 @@ int btrfs_del_ordered_inode(struct inode inode->i_ino); } spin_unlock(&root->fs_info->new_trans_lock); - return ret; + return; } int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode) diff -r c3290d51e5f9 ordered-data.h --- a/ordered-data.h Fri May 16 13:30:15 2008 -0400 +++ b/ordered-data.h Wed May 21 14:56:49 2008 -0700 @@ -38,6 +38,6 @@ int btrfs_find_first_ordered_inode(struc int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree, u64 *root_objectid, u64 *objectid, struct inode **inode); -int btrfs_del_ordered_inode(struct inode *inode); +void btrfs_del_ordered_inode(struct inode *inode); int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode); #endif diff -r c3290d51e5f9 super.c --- a/super.c Fri May 16 13:30:15 2008 -0400 +++ b/super.c Wed May 21 12:54:48 2008 -0700 @@ -487,7 +487,6 @@ static void btrfs_unlockfs(struct super_ static struct super_operations btrfs_super_ops = { .delete_inode = btrfs_delete_inode, - .put_inode = btrfs_put_inode, .put_super = btrfs_put_super, .write_super = btrfs_write_super, .sync_fs = btrfs_sync_fs, diff -r c3290d51e5f9 transaction.c --- a/transaction.c Fri May 16 13:30:15 2008 -0400 +++ b/transaction.c Wed May 21 14:18:34 2008 -0700 @@ -538,7 +538,6 @@ int btrfs_write_ordered_inodes(struct bt filemap_write_and_wait(inode->i_mapping); atomic_dec(&BTRFS_I(inode)->ordered_writeback); } - atomic_dec(&inode->i_count); iput(inode); mutex_lock(&root->fs_info->fs_mutex); ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [RFC][PATCH]btrfs delete ordered inode handling fix 2008-05-21 22:29 ` [RFC][PATCH]btrfs delete ordered inode handling fix Mingming @ 2008-05-22 14:11 ` Chris Mason 2008-05-22 17:43 ` Mingming 0 siblings, 1 reply; 32+ messages in thread From: Chris Mason @ 2008-05-22 14:11 UTC (permalink / raw) To: Mingming; +Cc: linux-btrfs On Wednesday 21 May 2008, Mingming wrote: > Hi Chris, I thought I spotted a few bugs, While looking at how to > properly remove inode from ordered tree, let me know if I got it right. Hi Mingming, thanks for going through this code. The i_count rules in the current code should work like this: * btrfs_add_ordered_inode calls igrab when the inode is inserted into the list. The whole time the inode is on the list, there's an extra reference on i_count. There will be no final iput while the inode is on the list. * btrfs_find_first_ordered_inode and btrfs_find_del_ordered_inode both increment i_count before returning the inode. This gives the caller an extra reference on i_count in addition to the one held by the list. * ordered-data.c never drops i_count. This allows (and forces) the caller to decide when it is safe to call iput because callers have different locks held and because the last iput might can in theory trigger delete_inode. All of the current callers of btrfs_del_ordered_inode already have a i_count increased before they call it. So, they can safely just do an atomic_dec on i_count and let their own iputs later do the delete_inode. > > * inode->i_count accounting issue > - remove extra i_count_decrease after calling > filemap_file_write_and_wait() in btrfs_write_ordered_inodes() There's an atomic_dec of i_count to drop the reference from btrfs_find_del_first_ordered_inode, and an iput to drop the reference added by igrab when the inode was put on the list. > > - Remove extra i_count decrease after each call > btrfs_del_ordered_inode(), it decrease the i_count again. I couldn't > find the where the i_count being increased. The tree_lock should > protecting it races with btrfs_write_ordered_inodes(), no? The increase comes from igrab on list insertion > > * There is possible race with inode delete and > btrfs_find_first_ordered_inode(). The inode could possibly in the > process of freeing while we are trying to get hold of it during commit > transaction. The fix is using igrab() instead, and search for next inode > in the tree if the found one is in the middle of being released. These kinds of races where the main reason why I had the list take a reference on the inode. delete_inode won't be called while i_count is increased. Over the long term I'd prefer to move the ordered-data list to a model where the list doesn't have a reference and it is magically removed after all the dirty pages are gone (by the end_io_hook handlers in inode.c). The end_io hooks in inode.c may be sufficient for this. > > * get rid of btrfs_put_inode(), and move the functionality under the > btrfs_del_ordered_inode() directly. I like this change, thanks. > > * Remove the inode from ordered tree at last iput(). Did not do it at > file release() time, as it may remove the inode from the ordered tree > before ensure the ordering of write to the same inode from other > process. > > Perhaps calling btrfs_del_ordered_inode() under unlink() is enough, but > it would not be hurt to do it again at delete_inode() time. I'm afraid we'll have to do it at file_release time, at least until the ordered list is changed not to keep a reference. > > > Here is the patch trying to address above issues... compiled, but not > tested. Is ordered mode on by default? > Yes, there's actually no way to turn it off ;) -chris ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [RFC][PATCH]btrfs delete ordered inode handling fix 2008-05-22 14:11 ` Chris Mason @ 2008-05-22 17:43 ` Mingming 2008-05-22 17:47 ` Chris Mason 0 siblings, 1 reply; 32+ messages in thread From: Mingming @ 2008-05-22 17:43 UTC (permalink / raw) To: Chris Mason; +Cc: linux-btrfs On Thu, 2008-05-22 at 10:11 -0400, Chris Mason wrote: > On Wednesday 21 May 2008, Mingming wrote: > > Hi Chris, I thought I spotted a few bugs, While looking at how to > > properly remove inode from ordered tree, let me know if I got it right. > > Hi Mingming, thanks for going through this code. The i_count rules in the > current code should work like this: > Hi Chris, thanks for your detailed clarification. > * btrfs_add_ordered_inode calls igrab when the inode is inserted into the > list. The whole time the inode is on the list, there's an extra reference on > i_count. There will be no final iput while the inode is on the list. > Ah I missed that. That explains all my confusion of the i_count accounting. > > > > * There is possible race with inode delete and > > btrfs_find_first_ordered_inode(). The inode could possibly in the > > process of freeing while we are trying to get hold of it during commit > > transaction. The fix is using igrab() instead, and search for next inode > > in the tree if the found one is in the middle of being released. > > These kinds of races where the main reason why I had the list take a reference > on the inode. delete_inode won't be called while i_count is increased. > > Over the long term I'd prefer to move the ordered-data list to a model where > the list doesn't have a reference and it is magically removed after all the > dirty pages are gone (by the end_io_hook handlers in inode.c). The end_io > hooks in inode.c may be sufficient for this. > Make sense. > > > > * get rid of btrfs_put_inode(), and move the functionality under the > > btrfs_del_ordered_inode() directly. > > I like this change, thanks. > > > > > * Remove the inode from ordered tree at last iput(). Did not do it at > > file release() time, as it may remove the inode from the ordered tree > > before ensure the ordering of write to the same inode from other > > process. > > > > Perhaps calling btrfs_del_ordered_inode() under unlink() is enough, but > > it would not be hurt to do it again at delete_inode() time. > > I'm afraid we'll have to do it at file_release time, at least until the > ordered list is changed not to keep a reference. > Yes with the i_count logic delete_inode() is not the right place to call btrfs_del_ordered_inode. But I am still not quite sure whether it is safe to remove the inode from the ordered tree at the file_release() time. i.e. whether the dirty data already being flushed to disk at last file_close()/file_release() time and when two process open and write to the same inode ... Regards, Mingming ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [RFC][PATCH]btrfs delete ordered inode handling fix 2008-05-22 17:43 ` Mingming @ 2008-05-22 17:47 ` Chris Mason 2008-05-22 20:39 ` Mingming 0 siblings, 1 reply; 32+ messages in thread From: Chris Mason @ 2008-05-22 17:47 UTC (permalink / raw) To: Mingming; +Cc: linux-btrfs On Thursday 22 May 2008, Mingming wrote: > On Thu, 2008-05-22 at 10:11 -0400, Chris Mason wrote: > > On Wednesday 21 May 2008, Mingming wrote: > > > Hi Chris, I thought I spotted a few bugs, While looking at how to > > > properly remove inode from ordered tree, let me know if I got it right. > > > > Hi Mingming, thanks for going through this code. The i_count rules in > > the current code should work like this: > > Hi Chris, thanks for your detailed clarification. > > > * btrfs_add_ordered_inode calls igrab when the inode is inserted into the > > list. The whole time the inode is on the list, there's an extra > > reference on i_count. There will be no final iput while the inode is on > > the list. > > Ah I missed that. That explains all my confusion of the i_count accounting. > > > > * There is possible race with inode delete and > > > btrfs_find_first_ordered_inode(). The inode could possibly in the > > > process of freeing while we are trying to get hold of it during commit > > > transaction. The fix is using igrab() instead, and search for next > > > inode in the tree if the found one is in the middle of being released. > > > > These kinds of races where the main reason why I had the list take a > > reference on the inode. delete_inode won't be called while i_count is > > increased. > > > > Over the long term I'd prefer to move the ordered-data list to a model > > where the list doesn't have a reference and it is magically removed after > > all the dirty pages are gone (by the end_io_hook handlers in inode.c). > > The end_io hooks in inode.c may be sufficient for this. > > Make sense. > > > > * get rid of btrfs_put_inode(), and move the functionality under the > > > btrfs_del_ordered_inode() directly. > > > > I like this change, thanks. > > > > > * Remove the inode from ordered tree at last iput(). Did not do it at > > > file release() time, as it may remove the inode from the ordered tree > > > before ensure the ordering of write to the same inode from other > > > process. > > > > > > Perhaps calling btrfs_del_ordered_inode() under unlink() is enough, but > > > it would not be hurt to do it again at delete_inode() time. > > > > I'm afraid we'll have to do it at file_release time, at least until the > > ordered list is changed not to keep a reference. > > Yes with the i_count logic delete_inode() is not the right place to call > btrfs_del_ordered_inode. > > But I am still not quite sure whether it is safe to remove the inode > from the ordered tree at the file_release() time. i.e. whether the dirty > data already being flushed to disk at last file_close()/file_release() > time and when two process open and write to the same inode ... I get around this by testing for dirty/writeback pages before removing the inode from the ordered list. If another writer allocates blocks to the file, it will be added back to the list. -chris ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [RFC][PATCH]btrfs delete ordered inode handling fix 2008-05-22 17:47 ` Chris Mason @ 2008-05-22 20:39 ` Mingming 2008-05-22 22:23 ` Chris Mason 0 siblings, 1 reply; 32+ messages in thread From: Mingming @ 2008-05-22 20:39 UTC (permalink / raw) To: Chris Mason; +Cc: linux-btrfs On Thu, 2008-05-22 at 13:47 -0400, Chris Mason wrote: > On Thursday 22 May 2008, Mingming wrote: > > On Thu, 2008-05-22 at 10:11 -0400, Chris Mason wrote: > > > On Wednesday 21 May 2008, Mingming wrote: > > > > Hi Chris, I thought I spotted a few bugs, While looking at how to > > > > properly remove inode from ordered tree, let me know if I got it right. > > > > > > Hi Mingming, thanks for going through this code. The i_count rules in > > > the current code should work like this: > > > > Hi Chris, thanks for your detailed clarification. > > > > > * btrfs_add_ordered_inode calls igrab when the inode is inserted into the > > > list. The whole time the inode is on the list, there's an extra > > > reference on i_count. There will be no final iput while the inode is on > > > the list. > > > > Ah I missed that. That explains all my confusion of the i_count accounting. > > > > > > * There is possible race with inode delete and > > > > btrfs_find_first_ordered_inode(). The inode could possibly in the > > > > process of freeing while we are trying to get hold of it during commit > > > > transaction. The fix is using igrab() instead, and search for next > > > > inode in the tree if the found one is in the middle of being released. > > > > > > These kinds of races where the main reason why I had the list take a > > > reference on the inode. delete_inode won't be called while i_count is > > > increased. > > > > > > Over the long term I'd prefer to move the ordered-data list to a model > > > where the list doesn't have a reference and it is magically removed after > > > all the dirty pages are gone (by the end_io_hook handlers in inode.c). > > > The end_io hooks in inode.c may be sufficient for this. > > > > Make sense. > > > > > > * get rid of btrfs_put_inode(), and move the functionality under the > > > > btrfs_del_ordered_inode() directly. > > > > > > I like this change, thanks. > > > > > > > * Remove the inode from ordered tree at last iput(). Did not do it at > > > > file release() time, as it may remove the inode from the ordered tree > > > > before ensure the ordering of write to the same inode from other > > > > process. > > > > > > > > Perhaps calling btrfs_del_ordered_inode() under unlink() is enough, but > > > > it would not be hurt to do it again at delete_inode() time. > > > > > > I'm afraid we'll have to do it at file_release time, at least until the > > > ordered list is changed not to keep a reference. > > > > Yes with the i_count logic delete_inode() is not the right place to call > > btrfs_del_ordered_inode. > > > > But I am still not quite sure whether it is safe to remove the inode > > from the ordered tree at the file_release() time. i.e. whether the dirty > > data already being flushed to disk at last file_close()/file_release() > > time and when two process open and write to the same inode ... > > I get around this by testing for dirty/writeback pages before removing the > inode from the ordered list. If another writer allocates blocks to the file, > it will be added back to the list. > I see.:) How about patch below? Mingming diff -r c3290d51e5f9 file.c --- a/file.c Fri May 16 13:30:15 2008 -0400 +++ b/file.c Thu May 22 13:29:42 2008 -0700 @@ -978,6 +978,12 @@ out_nolock: return num_written ? num_written : err; } +static int btrfs_release_file (struct inode * inode, struct file * filp) +{ + btrfs_del_ordered_inode(inode); + return 0; +} + static int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) { @@ -1044,6 +1050,7 @@ struct file_operations btrfs_file_operat .write = btrfs_file_write, .mmap = btrfs_file_mmap, .open = generic_file_open, + .release = btrfs_release_file, .fsync = btrfs_sync_file, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT diff -r c3290d51e5f9 inode.c --- a/inode.c Fri May 16 13:30:15 2008 -0400 +++ b/inode.c Thu May 22 13:31:14 2008 -0700 @@ -857,15 +857,11 @@ static int btrfs_unlink(struct inode *di nr = trans->blocks_used; if (inode->i_nlink == 0) { - int found; /* if the inode isn't linked anywhere, * we don't need to worry about * data=ordered */ - found = btrfs_del_ordered_inode(inode); - if (found == 1) { - atomic_dec(&inode->i_count); - } + btrfs_del_ordered_inode(inode); } btrfs_end_transaction(trans, root); @@ -1271,24 +1267,6 @@ fail: return err; } -void btrfs_put_inode(struct inode *inode) -{ - int ret; - - if (!BTRFS_I(inode)->ordered_trans) { - return; - } - - if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) || - mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)) - return; - - ret = btrfs_del_ordered_inode(inode); - if (ret == 1) { - atomic_dec(&inode->i_count); - } -} - void btrfs_delete_inode(struct inode *inode) { struct btrfs_trans_handle *trans; diff -r c3290d51e5f9 ordered-data.c --- a/ordered-data.c Fri May 16 13:30:15 2008 -0400 +++ b/ordered-data.c Thu May 22 13:24:56 2008 -0700 @@ -231,7 +231,7 @@ int btrfs_find_del_first_ordered_inode(s return 1; } -static int __btrfs_del_ordered_inode(struct btrfs_ordered_inode_tree *tree, +static void __btrfs_del_ordered_inode(struct btrfs_ordered_inode_tree *tree, struct inode *inode, u64 root_objectid, u64 objectid) { @@ -243,31 +243,38 @@ static int __btrfs_del_ordered_inode(str node = __tree_search(&tree->tree, root_objectid, objectid, &prev); if (!node) { write_unlock(&tree->lock); - return 0; + return; } rb_erase(node, &tree->tree); BTRFS_I(inode)->ordered_trans = 0; write_unlock(&tree->lock); + atomic_dec(&inode->i_count); entry = rb_entry(node, struct tree_entry, rb_node); kfree(entry); - return 1; -} - -int btrfs_del_ordered_inode(struct inode *inode) + return; +} + +void btrfs_del_ordered_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 root_objectid = root->root_key.objectid; - int ret = 0; + + if (!BTRFS_I(inode)->ordered_trans) { + return; + } + + if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) || + mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)) + return; spin_lock(&root->fs_info->new_trans_lock); if (root->fs_info->running_transaction) { struct btrfs_ordered_inode_tree *tree; tree = &root->fs_info->running_transaction->ordered_inode_tree; - ret = __btrfs_del_ordered_inode(tree, inode, root_objectid, + __btrfs_del_ordered_inode(tree, inode, root_objectid, inode->i_ino); } spin_unlock(&root->fs_info->new_trans_lock); - return ret; } int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode) diff -r c3290d51e5f9 ordered-data.h --- a/ordered-data.h Fri May 16 13:30:15 2008 -0400 +++ b/ordered-data.h Thu May 22 13:25:09 2008 -0700 @@ -38,6 +38,6 @@ int btrfs_find_first_ordered_inode(struc int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree, u64 *root_objectid, u64 *objectid, struct inode **inode); -int btrfs_del_ordered_inode(struct inode *inode); +void btrfs_del_ordered_inode(struct inode *inode); int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode); #endif diff -r c3290d51e5f9 super.c --- a/super.c Fri May 16 13:30:15 2008 -0400 +++ b/super.c Thu May 22 13:30:44 2008 -0700 @@ -487,7 +487,6 @@ static void btrfs_unlockfs(struct super_ static struct super_operations btrfs_super_ops = { .delete_inode = btrfs_delete_inode, - .put_inode = btrfs_put_inode, .put_super = btrfs_put_super, .write_super = btrfs_write_super, .sync_fs = btrfs_sync_fs, ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [RFC][PATCH]btrfs delete ordered inode handling fix 2008-05-22 20:39 ` Mingming @ 2008-05-22 22:23 ` Chris Mason 0 siblings, 0 replies; 32+ messages in thread From: Chris Mason @ 2008-05-22 22:23 UTC (permalink / raw) To: Mingming; +Cc: linux-btrfs On Thursday 22 May 2008, Mingming wrote: [ ... ] > > I get around this by testing for dirty/writeback pages before removing > > the inode from the ordered list. If another writer allocates blocks to > > the file, it will be added back to the list. > > I see.:) How about patch below? Looks good, I'll give it a shot here. -chris ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: btrfs_put_inode 2008-05-21 17:19 ` btrfs_put_inode Mingming 2008-05-21 18:02 ` btrfs_put_inode Chris Mason @ 2008-05-21 18:23 ` Ryan Hope 2008-05-21 18:32 ` btrfs_put_inode Chris Mason 1 sibling, 1 reply; 32+ messages in thread From: Ryan Hope @ 2008-05-21 18:23 UTC (permalink / raw) To: Mingming; +Cc: Chris Mason, linux-btrfs try this patch out, http://zen-sources.org/files/btrfs-stable-for-2.6.26-rc3.patch it should apply to 2.6.26-rc*... probably even 2.6.25 im btrfs with 2.6.26-rc3 with no issues On Wed, May 21, 2008 at 1:19 PM, Mingming <cmm@us.ibm.com> wrote: > Hi Chris, > > btrfs failed to compile on 2.6.26-rc2 since the put_inode callback > function is removed from super_operations. > > I noticed btrfs_put_inode() seems being actively used, to drop the inode > from the tree for ordered mode transaction. Any idea where to relocate > this function? Do we need to call btrfs_del_ordered_inode() for every > iput()? > > Thanks, > Mingming > > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: btrfs_put_inode 2008-05-21 18:23 ` btrfs_put_inode Ryan Hope @ 2008-05-21 18:32 ` Chris Mason 2008-05-21 19:02 ` btrfs_put_inode Mingming 0 siblings, 1 reply; 32+ messages in thread From: Chris Mason @ 2008-05-21 18:32 UTC (permalink / raw) To: Ryan Hope; +Cc: Mingming, linux-btrfs On Wednesday 21 May 2008, Ryan Hope wrote: > try this patch out, > http://zen-sources.org/files/btrfs-stable-for-2.6.26-rc3.patch It is a little hard to tell from the big long patch what the incremental difference is ;) -chris ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: btrfs_put_inode 2008-05-21 18:32 ` btrfs_put_inode Chris Mason @ 2008-05-21 19:02 ` Mingming 0 siblings, 0 replies; 32+ messages in thread From: Mingming @ 2008-05-21 19:02 UTC (permalink / raw) To: Chris Mason; +Cc: Ryan Hope, linux-btrfs On Wed, 2008-05-21 at 14:32 -0400, Chris Mason wrote: > On Wednesday 21 May 2008, Ryan Hope wrote: > > try this patch out, > > http://zen-sources.org/files/btrfs-stable-for-2.6.26-rc3.patch > > It is a little hard to tell from the big long patch what the incremental > difference is ;) > Looks like just break the hook up in super_operations:-) > -chris ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [Btrfs-devel] cloning file data 2008-04-25 13:41 ` [Btrfs-devel] " Chris Mason ` (2 preceding siblings ...) 2008-04-25 18:26 ` Sage Weil @ 2008-04-25 20:28 ` Sage Weil 3 siblings, 0 replies; 32+ messages in thread From: Sage Weil @ 2008-04-25 20:28 UTC (permalink / raw) To: Chris Mason; +Cc: btrfs-devel, linux-btrfs On Fri, 25 Apr 2008, Chris Mason wrote: > Very cool. I'd actually loved to see this wrapped into a program that will > cow a directory tree. Basically the same as cp -al, but with cow instead of > linking. Here's a pretty trivial patch against cp in coreutils-6.10. It'll probably take you more time to build than to make the changes yourself, but this seems much handier and more robust than a separate tool. $ cp -ac /mnt/btrfs/from /mnt/btrfs/to ... sage diff -ur coreutils-6.10/src/copy.c coreutils-6.10-btrfs/src/copy.c --- coreutils-6.10/src/copy.c 2008-01-05 14:59:11.000000000 -0800 +++ coreutils-6.10-btrfs/src/copy.c 2008-04-25 13:12:20.000000000 -0700 @@ -537,6 +537,9 @@ buf_alloc = xmalloc (buf_size + buf_alignment_slop); buf = ptr_align (buf_alloc, buf_alignment); +#define BTRFS_IOC_CLONE 0x40049409 + if (!x->cow || + ioctl(dest_desc, BTRFS_IOC_CLONE, source_desc) != 0) for (;;) { word *wp = NULL; diff -ur coreutils-6.10/src/copy.h coreutils-6.10-btrfs/src/copy.h --- coreutils-6.10/src/copy.h 2008-01-05 14:58:25.000000000 -0800 +++ coreutils-6.10-btrfs/src/copy.h 2008-04-25 12:46:30.000000000 -0700 @@ -135,6 +135,8 @@ on different file systems from the one we started on. */ bool one_file_system; + bool cow; + /* If true, attempt to give the copies the original files' permissions, owner, group, and timestamps. */ bool preserve_ownership; diff -ur coreutils-6.10/src/cp.c coreutils-6.10-btrfs/src/cp.c --- coreutils-6.10/src/cp.c 2008-01-11 03:19:53.000000000 -0800 +++ coreutils-6.10-btrfs/src/cp.c 2008-04-25 13:23:12.000000000 -0700 @@ -125,6 +125,7 @@ {"archive", no_argument, NULL, 'a'}, {"backup", optional_argument, NULL, 'b'}, {"copy-contents", no_argument, NULL, COPY_CONTENTS_OPTION}, + {"cow", no_argument, NULL, 'c'}, {"dereference", no_argument, NULL, 'L'}, {"force", no_argument, NULL, 'f'}, {"interactive", no_argument, NULL, 'i'}, @@ -178,6 +179,7 @@ --backup[=CONTROL] make a backup of each existing destination file\n\ -b like --backup but does not accept an argument\n\ --copy-contents copy contents of special files when recursive\n\ + -c, --cow attempt issuing copy-on-write ioctl to fs\n\ -d same as --no-dereference --preserve=links\n\ "), stdout); fputs (_("\ @@ -767,6 +769,7 @@ x->interactive = I_UNSPECIFIED; x->move_mode = false; x->one_file_system = false; + x->cow = false; x->preserve_ownership = false; x->preserve_links = false; @@ -909,7 +912,7 @@ we'll actually use backup_suffix_string. */ backup_suffix_string = getenv ("SIMPLE_BACKUP_SUFFIX"); - while ((c = getopt_long (argc, argv, "abdfHilLprst:uvxPRS:T", + while ((c = getopt_long (argc, argv, "abcdfHilLprst:uvxPRS:T", long_opts, NULL)) != -1) { @@ -940,6 +943,10 @@ copy_contents = true; break; + case 'c': + x.cow = true; + break; + case 'd': x.preserve_links = true; x.dereference = DEREF_NEVER; ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [Btrfs-devel] cloning file data 2008-04-24 22:47 cloning file data Sage Weil 2008-04-25 13:41 ` [Btrfs-devel] " Chris Mason @ 2008-04-29 20:52 ` Chris Mason 2008-05-02 20:50 ` Chris Mason 2 siblings, 0 replies; 32+ messages in thread From: Chris Mason @ 2008-04-29 20:52 UTC (permalink / raw) To: btrfs-devel; +Cc: Sage Weil, linux-btrfs On Thursday 24 April 2008, Sage Weil wrote: > Hi- > > I'm working on a clone ioctl that will quickly and efficiently duplicate > the contents of a file, e.g. > Just FYI, I didn't sneak this into v0.14 because I didn't quite have the cycles to test it. It'll go in this week. -chris ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [Btrfs-devel] cloning file data 2008-04-24 22:47 cloning file data Sage Weil 2008-04-25 13:41 ` [Btrfs-devel] " Chris Mason 2008-04-29 20:52 ` Chris Mason @ 2008-05-02 20:50 ` Chris Mason 2008-05-02 21:38 ` Sage Weil 2 siblings, 1 reply; 32+ messages in thread From: Chris Mason @ 2008-05-02 20:50 UTC (permalink / raw) To: Sage Weil; +Cc: linux-btrfs On Thursday 24 April 2008, Sage Weil wrote: > Hi- > > I'm working on a clone ioctl that will quickly and efficiently duplicate > the contents of a file, e.g. Sage's work has been pushed into the stable and unstable trees, along with a small command called bcp to trigger the clone ioctls. bcp is used like this: bcp src dst If src is a directory, it is copied recursively. If the clone ioctl fails, a fallback to buffer copies is done instead. Sage, I had to make a few small changes to your ioctl code. One was to skip reference count updates if the extent is a hole, and the other was to change around mark_inode_dirty a bit to avoid transaction deadlock. We aren't actually making any pages dirty so it is safe to just update the inode. -chris ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [Btrfs-devel] cloning file data 2008-05-02 20:50 ` Chris Mason @ 2008-05-02 21:38 ` Sage Weil 0 siblings, 0 replies; 32+ messages in thread From: Sage Weil @ 2008-05-02 21:38 UTC (permalink / raw) To: Chris Mason; +Cc: linux-btrfs On Fri, 2 May 2008, Chris Mason wrote: > Sage's work has been pushed into the stable and unstable trees, along with a Thanks! For the transaction ioctls... would it make more sense to specify a string of ops to apply atomically in a vector instead of exposing the raw transaction start/end to userspace? Would that avoid the deadlock and memory issues? sage ^ permalink raw reply [flat|nested] 32+ messages in thread
end of thread, other threads:[~2008-05-22 22:23 UTC | newest] Thread overview: 32+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2008-04-24 22:47 cloning file data Sage Weil 2008-04-25 13:41 ` [Btrfs-devel] " Chris Mason 2008-04-25 16:50 ` Zach Brown 2008-04-25 16:58 ` Chris Mason 2008-04-25 17:04 ` Zach Brown 2008-04-25 16:50 ` Zach Brown 2008-04-25 18:32 ` Sage Weil 2008-04-25 18:26 ` Sage Weil 2008-04-26 4:38 ` Sage Weil 2008-05-03 4:44 ` Yan Zheng 2008-05-03 6:16 ` Sage Weil 2008-05-03 6:48 ` Yan Zheng 2008-05-03 7:25 ` Yan Zheng 2008-05-05 10:27 ` Chris Mason 2008-05-05 15:57 ` Sage Weil 2008-05-21 17:19 ` btrfs_put_inode Mingming 2008-05-21 18:02 ` btrfs_put_inode Chris Mason 2008-05-21 18:45 ` btrfs_put_inode Mingming 2008-05-21 18:52 ` btrfs_put_inode Chris Mason 2008-05-21 22:29 ` [RFC][PATCH]btrfs delete ordered inode handling fix Mingming 2008-05-22 14:11 ` Chris Mason 2008-05-22 17:43 ` Mingming 2008-05-22 17:47 ` Chris Mason 2008-05-22 20:39 ` Mingming 2008-05-22 22:23 ` Chris Mason 2008-05-21 18:23 ` btrfs_put_inode Ryan Hope 2008-05-21 18:32 ` btrfs_put_inode Chris Mason 2008-05-21 19:02 ` btrfs_put_inode Mingming 2008-04-25 20:28 ` [Btrfs-devel] cloning file data Sage Weil 2008-04-29 20:52 ` Chris Mason 2008-05-02 20:50 ` Chris Mason 2008-05-02 21:38 ` Sage Weil
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).