diff for duplicates of <20140925204749.GP27730@localhost.localdomain> diff --git a/a/2.txt b/N1/2.txt index 4e79915..8b13789 100644 --- a/a/2.txt +++ b/N1/2.txt @@ -1,49 +1 @@ ->From e41949f26f9cc492aab17a1b94d030a11c020893 Mon Sep 17 00:00:00 2001 -From: Matthew Wilcox <willy@linux.intel.com> -Date: Wed, 10 Sep 2014 13:19:22 -0400 -Subject: [PATCH 1/7] dax: A couple of fixes from Dave Chinner -If dax_clear_blocks() returns an error, segfault. - -Don't bother calling get_block() again if the BH is unwritten; the block -is already allocated, and this won't help matters. - -Call b_end_io() if it's set, after zeroing the block, enabling the fs -to convert the block from unwritten to written. - -Signed-off-by: Matthew Wilcox <willy@linux.intel.com> ---- - fs/dax.c | 11 ++++++++--- - 1 file changed, 8 insertions(+), 3 deletions(-) - -diff --git a/fs/dax.c b/fs/dax.c -index bdf6622..90418ca 100644 ---- a/fs/dax.c -+++ b/fs/dax.c -@@ -327,7 +327,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - if (error) - goto unlock_page; - -- if (!buffer_written(&bh) && !vmf->cow_page) { -+ if (!buffer_mapped(&bh) && !vmf->cow_page) { - if (vmf->flags & FAULT_FLAG_WRITE) { - error = get_block(inode, block, &bh, 1); - count_vm_event(PGMAJFAULT); -@@ -364,8 +364,13 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - return VM_FAULT_LOCKED; - } - -- if (buffer_unwritten(&bh) || buffer_new(&bh)) -- dax_clear_blocks(inode, bh.b_blocknr, bh.b_size); -+ if (buffer_unwritten(&bh) || buffer_new(&bh)) { -+ error = dax_clear_blocks(inode, bh.b_blocknr, bh.b_size); -+ if (error) -+ goto out; -+ if (bh.b_end_io) -+ bh.b_end_io(&bh, 1); -+ } - - /* Check we didn't race with a read fault installing a new page */ - if (!page && major) --- -2.1.0 diff --git a/a/3.hdr b/a/3.hdr deleted file mode 100644 index 931795a..0000000 --- a/a/3.hdr +++ /dev/null @@ -1,2 +0,0 @@ -Content-Type: text/x-diff; charset=us-ascii -Content-Disposition: attachment; filename="0002-dax-Missing-unlock-in-error-path.patch" diff --git a/a/3.txt b/a/3.txt deleted file mode 100644 index ca9062a..0000000 --- a/a/3.txt +++ /dev/null @@ -1,27 +0,0 @@ ->From 9c9739b5942dca1e9238631c1bed48f1b21d8b63 Mon Sep 17 00:00:00 2001 -From: Matthew Wilcox <willy@linux.intel.com> -Date: Thu, 11 Sep 2014 12:42:47 -0400 -Subject: [PATCH 2/7] dax: Missing unlock in error path - -If the file was truncated, we have to drop the i_mmap_mutex before -returning an error. - -Signed-off-by: Matthew Wilcox <willy@linux.intel.com> ---- - fs/dax.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/fs/dax.c b/fs/dax.c -index 90418ca..fabe9da 100644 ---- a/fs/dax.c -+++ b/fs/dax.c -@@ -357,6 +357,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - size = (i_size_read(inode) + PAGE_SIZE - 1) >> - PAGE_SHIFT; - if (vmf->pgoff >= size) { -+ mutex_unlock(&mapping->i_mmap_mutex); - error = -EIO; - goto out; - } --- -2.1.0 diff --git a/a/4.hdr b/a/4.hdr deleted file mode 100644 index f21586a..0000000 --- a/a/4.hdr +++ /dev/null @@ -1,2 +0,0 @@ -Content-Type: text/x-diff; charset=us-ascii -Content-Disposition: attachment; filename="0003-dax-Must-hold-mutex-while-clearing-blocks.patch" diff --git a/a/4.txt b/a/4.txt deleted file mode 100644 index 014c059..0000000 --- a/a/4.txt +++ /dev/null @@ -1,152 +0,0 @@ ->From ea8e4473e479bbf66a1caa956214b101b6845855 Mon Sep 17 00:00:00 2001 -From: Matthew Wilcox <willy@linux.intel.com> -Date: Thu, 11 Sep 2014 12:44:20 -0400 -Subject: [PATCH 3/7] dax: Must hold mutex while clearing blocks - -The i_mmap_mutex was not being held across the call to dax_clear_blocks(). -That made it possible for a truncate racing with the page fault -to have removed the blocks from the file before the call to -dax_clear_blocks(). If the blocks had been reassigned to some other -purpose, dax_clear_blocks() could end up clearing blocks that had somebody -else's data in them. - -dax_do_fault() is getting a little long, so bundle up all this code -into a new dax_insert_mapping() function. Call clear_page() instead -of dax_clear_blocks(), since we know we're only clearing a single page. -And use bdev_direct_access() instead of dax_get_pfn() since we actually -want both the pfn (for inserting the map) and the address (for clearing -the memory). - -Signed-off-by: Matthew Wilcox <willy@linux.intel.com> ---- - fs/dax.c | 87 ++++++++++++++++++++++++++++++++++++---------------------------- - 1 file changed, 49 insertions(+), 38 deletions(-) - -diff --git a/fs/dax.c b/fs/dax.c -index fabe9da..b130b47 100644 ---- a/fs/dax.c -+++ b/fs/dax.c -@@ -68,14 +68,6 @@ static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits) - return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); - } - --static long dax_get_pfn(struct buffer_head *bh, unsigned long *pfn, -- unsigned blkbits) --{ -- void *addr; -- sector_t sector = bh->b_blocknr << (blkbits - 9); -- return bdev_direct_access(bh->b_bdev, sector, &addr, pfn, bh->b_size); --} -- - static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos, - loff_t end) - { -@@ -283,6 +275,54 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh, - return 0; - } - -+static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, -+ struct vm_area_struct *vma, struct vm_fault *vmf) -+{ -+ struct address_space *mapping = inode->i_mapping; -+ sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); -+ unsigned long vaddr = (unsigned long)vmf->virtual_address; -+ void *addr; -+ unsigned long pfn; -+ pgoff_t size; -+ int error; -+ -+ mutex_lock(&mapping->i_mmap_mutex); -+ -+ /* -+ * Check truncate didn't happen while we were allocating a block. -+ * If it did, this block may or may not be still allocated to the -+ * file. We can't tell the filesystem to free it because we can't -+ * take i_mutex here. In the worst case, the file still has blocks -+ * allocated past the end of the file. -+ */ -+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; -+ if (unlikely(vmf->pgoff >= size)) { -+ error = -EIO; -+ goto out; -+ } -+ -+ error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size); -+ if (error < 0) -+ goto out; -+ if (error < PAGE_SIZE) { -+ error = -EIO; -+ goto out; -+ } -+ -+ if (buffer_unwritten(bh) || buffer_new(bh)) { -+ clear_page(addr); -+ if (bh->b_end_io) -+ bh->b_end_io(bh, 1); -+ } -+ -+ error = vm_insert_mixed(vma, vaddr, pfn); -+ -+ out: -+ mutex_unlock(&mapping->i_mmap_mutex); -+ -+ return error; -+} -+ - static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) - { -@@ -295,7 +335,6 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - unsigned blkbits = inode->i_blkbits; - sector_t block; - pgoff_t size; -- unsigned long pfn; - int error; - int major = 0; - -@@ -365,14 +404,6 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - return VM_FAULT_LOCKED; - } - -- if (buffer_unwritten(&bh) || buffer_new(&bh)) { -- error = dax_clear_blocks(inode, bh.b_blocknr, bh.b_size); -- if (error) -- goto out; -- if (bh.b_end_io) -- bh.b_end_io(&bh, 1); -- } -- - /* Check we didn't race with a read fault installing a new page */ - if (!page && major) - page = find_lock_page(mapping, vmf->pgoff); -@@ -385,27 +416,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - page_cache_release(page); - } - -- mutex_lock(&mapping->i_mmap_mutex); -- -- /* -- * Check truncate didn't happen while we were allocating a block. -- * If it did, this block may or may not be still allocated to the -- * file. We can't tell the filesystem to free it because we can't -- * take i_mutex here. In the worst case, the file still has blocks -- * allocated past the end of the file. -- */ -- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; -- if (unlikely(vmf->pgoff >= size)) { -- mutex_unlock(&mapping->i_mmap_mutex); -- error = -EIO; -- goto out; -- } -- -- error = dax_get_pfn(&bh, &pfn, blkbits); -- if (error > 0) -- error = vm_insert_mixed(vma, vaddr, pfn); -- -- mutex_unlock(&mapping->i_mmap_mutex); -+ error = dax_insert_mapping(inode, &bh, vma, vmf); - - out: - if (error == -ENOMEM) --- -2.1.0 diff --git a/a/5.hdr b/a/5.hdr deleted file mode 100644 index 8823165..0000000 --- a/a/5.hdr +++ /dev/null @@ -1,2 +0,0 @@ -Content-Type: text/x-diff; charset=us-ascii -Content-Disposition: attachment; filename="0004-dax-Unwritten-extents-don-t-set-the-mapped-flag.patch" diff --git a/a/5.txt b/a/5.txt deleted file mode 100644 index 3180cf8..0000000 --- a/a/5.txt +++ /dev/null @@ -1,27 +0,0 @@ ->From 9daf54382b53f3cffc3f050d75edf43e3c51efb4 Mon Sep 17 00:00:00 2001 -From: Matthew Wilcox <willy@linux.intel.com> -Date: Wed, 24 Sep 2014 13:53:24 -0400 -Subject: [PATCH 4/7] dax: Unwritten extents don't set the mapped flag - -Despite an unwritten extent having a defined mapping, buffer_mapped() -returns false. We don't need to call get_block() again here, since -we know wat the disk block is that corresponds to this file offset. ---- - fs/dax.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/fs/dax.c b/fs/dax.c -index b130b47..59be664 100644 ---- a/fs/dax.c -+++ b/fs/dax.c -@@ -366,7 +366,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - if (error) - goto unlock_page; - -- if (!buffer_mapped(&bh) && !vmf->cow_page) { -+ if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { - if (vmf->flags & FAULT_FLAG_WRITE) { - error = get_block(inode, block, &bh, 1); - count_vm_event(PGMAJFAULT); --- -2.1.0 diff --git a/a/6.hdr b/a/6.hdr deleted file mode 100644 index eea44e9..0000000 --- a/a/6.hdr +++ /dev/null @@ -1,2 +0,0 @@ -Content-Type: text/x-diff; charset=us-ascii -Content-Disposition: attachment; filename="0005-ext4-Add-a-callback-to-convert-unwritten-extents.patch" diff --git a/a/6.txt b/a/6.txt deleted file mode 100644 index 0d12c03..0000000 --- a/a/6.txt +++ /dev/null @@ -1,54 +0,0 @@ ->From 96f051597cfd91fe51a30fc3dbdeed290b98d7fe Mon Sep 17 00:00:00 2001 -From: Matthew Wilcox <willy@linux.intel.com> -Date: Wed, 24 Sep 2014 14:02:38 -0400 -Subject: [PATCH 5/7] ext4: Add a callback to convert unwritten extents - -A different bug was masking the problem that unwritten extents need to -be converted to written extents once we've faulted them into existence. -Following the XFS example, add a b_end_io callback. We "borrow" a few -additional fields in the buffer_head, but there aren't any big enough -for a sector_t. Fortunately, we only use this callback for DAX, and -ext4 already requires a 4k block size for using DAX, which puts the -limit at 16TB. The page cache already limits file sizes to 16TB on -32-bit systems, so we don't need to grow any fields. ---- - fs/ext4/inode.c | 17 +++++++++++++++++ - 1 file changed, 17 insertions(+) - -diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c -index 5edd903..eaa293a 100644 ---- a/fs/ext4/inode.c -+++ b/fs/ext4/inode.c -@@ -676,6 +676,18 @@ has_zeroout: - return retval; - } - -+static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) -+{ -+ struct inode *inode = bh->b_assoc_map->host; -+ /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ -+ loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; -+ int err; -+ if (!uptodate) -+ return; -+ WARN_ON(!buffer_unwritten(bh)); -+ err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); -+} -+ - /* Maximum number of blocks we map for direct IO at once. */ - #define DIO_MAX_BLOCKS 4096 - -@@ -713,6 +725,11 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, - - map_bh(bh, inode->i_sb, map.m_pblk); - bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; -+ if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) { -+ bh->b_assoc_map = inode->i_mapping; -+ bh->b_private = (void *)(unsigned long)iblock; -+ bh->b_end_io = ext4_end_io_unwritten; -+ } - if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) - set_buffer_defer_completion(bh); - bh->b_size = inode->i_sb->s_blocksize * map.m_len; --- -2.1.0 diff --git a/a/7.hdr b/a/7.hdr deleted file mode 100644 index c673ffb..0000000 --- a/a/7.hdr +++ /dev/null @@ -1,2 +0,0 @@ -Content-Type: text/x-diff; charset=us-ascii -Content-Disposition: attachment; filename="0006-vfs-Prevent-DAX-I-Os-from-falling-back-to-buffered-I.patch" diff --git a/a/7.txt b/a/7.txt deleted file mode 100644 index f42673f..0000000 --- a/a/7.txt +++ /dev/null @@ -1,55 +0,0 @@ ->From b73ccea0e0bb4f09fa4ad0a4fa20f6d346bedf50 Mon Sep 17 00:00:00 2001 -From: Matthew Wilcox <willy@linux.intel.com> -Date: Wed, 24 Sep 2014 14:08:40 -0400 -Subject: [PATCH 6/7] vfs: Prevent DAX I/Os from falling back to buffered I/O - -Unlike regular direct I/O, DAX will handle file holes, and there is -no desire to fall back to buffered I/O. Buffered I/O ought to fail if -DAX I/O fails, unless we're doing a random-failure test. So skip the -buffered I/O attempts for DAX files. ---- - mm/filemap.c | 19 ++++++++++++------- - 1 file changed, 12 insertions(+), 7 deletions(-) - -diff --git a/mm/filemap.c b/mm/filemap.c -index 19bdb68..e69b586 100644 ---- a/mm/filemap.c -+++ b/mm/filemap.c -@@ -1717,9 +1717,11 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) - * we've already read everything we wanted to, or if - * there was a short read because we hit EOF, go ahead - * and return. Otherwise fallthrough to buffered io for -- * the rest of the read. -+ * the rest of the read. Buffered reads will not work for -+ * DAX files, so don't bother trying. - */ -- if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) { -+ if (retval < 0 || !iov_iter_count(iter) || *ppos >= size || -+ IS_DAX(inode)) { - file_accessed(file); - goto out; - } -@@ -2582,13 +2584,16 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) - loff_t endbyte; - - written = generic_file_direct_write(iocb, from, pos); -- if (written < 0 || written == count) -- goto out; -- - /* -- * direct-io write to a hole: fall through to buffered I/O -- * for completing the rest of the request. -+ * If the write stopped short of completing, fall back to -+ * buffered writes. Some filesystems do this for writes to -+ * holes, for example. For DAX files, a buffered write will -+ * not succeed (even if it did, DAX does not handle dirty -+ * page-cache pages correctly). - */ -+ if (written < 0 || written == count || IS_DAX(inode)) -+ goto out; -+ - pos += written; - count -= written; - --- -2.1.0 diff --git a/a/8.hdr b/a/8.hdr deleted file mode 100644 index 04d9326..0000000 --- a/a/8.hdr +++ /dev/null @@ -1,2 +0,0 @@ -Content-Type: text/x-diff; charset=us-ascii -Content-Disposition: attachment; filename="0007-dax-Call-b_end_io-outside-the-i_mmap_mutex.patch" diff --git a/a/8.txt b/a/8.txt deleted file mode 100644 index 4efa412..0000000 --- a/a/8.txt +++ /dev/null @@ -1,42 +0,0 @@ ->From 953153c75704a3954052b68c63b4cc3ad15b7a06 Mon Sep 17 00:00:00 2001 -From: Matthew Wilcox <willy@linux.intel.com> -Date: Wed, 24 Sep 2014 14:27:57 -0400 -Subject: [PATCH 7/7] dax: Call b_end_io outside the i_mmap_mutex - -Lockdep helpfully points out that ext4_convert_unwritten_extents will -start a jbd transaction, and i_mmap_mutex is already nested inside the -jbd2_handle lock. So move the call to b_end_io to after we drop the -i_mmap_mutex; this only extends the window in which we can crash and -the allocation will be lost; it does not introduce any new races. ---- - fs/dax.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/fs/dax.c b/fs/dax.c -index 59be664..91b7561 100644 ---- a/fs/dax.c -+++ b/fs/dax.c -@@ -309,17 +309,17 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, - goto out; - } - -- if (buffer_unwritten(bh) || buffer_new(bh)) { -+ if (buffer_unwritten(bh) || buffer_new(bh)) - clear_page(addr); -- if (bh->b_end_io) -- bh->b_end_io(bh, 1); -- } - - error = vm_insert_mixed(vma, vaddr, pfn); - - out: - mutex_unlock(&mapping->i_mmap_mutex); - -+ if (bh->b_end_io) -+ bh->b_end_io(bh, 1); -+ - return error; - } - --- -2.1.0 diff --git a/a/content_digest b/N1/content_digest index 490e224..5fb0665 100644 --- a/a/content_digest +++ b/N1/content_digest @@ -17,429 +17,5 @@ "\01:2\0" "fn\00001-dax-A-couple-of-fixes-from-Dave-Chinner.patch\0" "b\0" - ">From e41949f26f9cc492aab17a1b94d030a11c020893 Mon Sep 17 00:00:00 2001\n" - "From: Matthew Wilcox <willy@linux.intel.com>\n" - "Date: Wed, 10 Sep 2014 13:19:22 -0400\n" - "Subject: [PATCH 1/7] dax: A couple of fixes from Dave Chinner\n" - "\n" - "If dax_clear_blocks() returns an error, segfault.\n" - "\n" - "Don't bother calling get_block() again if the BH is unwritten; the block\n" - "is already allocated, and this won't help matters.\n" - "\n" - "Call b_end_io() if it's set, after zeroing the block, enabling the fs\n" - "to convert the block from unwritten to written.\n" - "\n" - "Signed-off-by: Matthew Wilcox <willy@linux.intel.com>\n" - "---\n" - " fs/dax.c | 11 ++++++++---\n" - " 1 file changed, 8 insertions(+), 3 deletions(-)\n" - "\n" - "diff --git a/fs/dax.c b/fs/dax.c\n" - "index bdf6622..90418ca 100644\n" - "--- a/fs/dax.c\n" - "+++ b/fs/dax.c\n" - "@@ -327,7 +327,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,\n" - " \tif (error)\n" - " \t\tgoto unlock_page;\n" - " \n" - "-\tif (!buffer_written(&bh) && !vmf->cow_page) {\n" - "+\tif (!buffer_mapped(&bh) && !vmf->cow_page) {\n" - " \t\tif (vmf->flags & FAULT_FLAG_WRITE) {\n" - " \t\t\terror = get_block(inode, block, &bh, 1);\n" - " \t\t\tcount_vm_event(PGMAJFAULT);\n" - "@@ -364,8 +364,13 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,\n" - " \t\treturn VM_FAULT_LOCKED;\n" - " \t}\n" - " \n" - "-\tif (buffer_unwritten(&bh) || buffer_new(&bh))\n" - "-\t\tdax_clear_blocks(inode, bh.b_blocknr, bh.b_size);\n" - "+\tif (buffer_unwritten(&bh) || buffer_new(&bh)) {\n" - "+\t\terror = dax_clear_blocks(inode, bh.b_blocknr, bh.b_size);\n" - "+\t\tif (error)\n" - "+\t\t\tgoto out;\n" - "+\t\tif (bh.b_end_io)\n" - "+\t\t\tbh.b_end_io(&bh, 1);\n" - "+\t}\n" - " \n" - " \t/* Check we didn't race with a read fault installing a new page */\n" - " \tif (!page && major)\n" - "-- \n" - 2.1.0 - "\01:3\0" - "fn\00002-dax-Missing-unlock-in-error-path.patch\0" - "b\0" - ">From 9c9739b5942dca1e9238631c1bed48f1b21d8b63 Mon Sep 17 00:00:00 2001\n" - "From: Matthew Wilcox <willy@linux.intel.com>\n" - "Date: Thu, 11 Sep 2014 12:42:47 -0400\n" - "Subject: [PATCH 2/7] dax: Missing unlock in error path\n" - "\n" - "If the file was truncated, we have to drop the i_mmap_mutex before\n" - "returning an error.\n" - "\n" - "Signed-off-by: Matthew Wilcox <willy@linux.intel.com>\n" - "---\n" - " fs/dax.c | 1 +\n" - " 1 file changed, 1 insertion(+)\n" - "\n" - "diff --git a/fs/dax.c b/fs/dax.c\n" - "index 90418ca..fabe9da 100644\n" - "--- a/fs/dax.c\n" - "+++ b/fs/dax.c\n" - "@@ -357,6 +357,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,\n" - " \t\t\tsize = (i_size_read(inode) + PAGE_SIZE - 1) >>\n" - " \t\t\t\t\t\t\t\tPAGE_SHIFT;\n" - " \t\t\tif (vmf->pgoff >= size) {\n" - "+\t\t\t\tmutex_unlock(&mapping->i_mmap_mutex);\n" - " \t\t\t\terror = -EIO;\n" - " \t\t\t\tgoto out;\n" - " \t\t\t}\n" - "-- \n" - 2.1.0 - "\01:4\0" - "fn\00003-dax-Must-hold-mutex-while-clearing-blocks.patch\0" - "b\0" - ">From ea8e4473e479bbf66a1caa956214b101b6845855 Mon Sep 17 00:00:00 2001\n" - "From: Matthew Wilcox <willy@linux.intel.com>\n" - "Date: Thu, 11 Sep 2014 12:44:20 -0400\n" - "Subject: [PATCH 3/7] dax: Must hold mutex while clearing blocks\n" - "\n" - "The i_mmap_mutex was not being held across the call to dax_clear_blocks().\n" - "That made it possible for a truncate racing with the page fault\n" - "to have removed the blocks from the file before the call to\n" - "dax_clear_blocks(). If the blocks had been reassigned to some other\n" - "purpose, dax_clear_blocks() could end up clearing blocks that had somebody\n" - "else's data in them.\n" - "\n" - "dax_do_fault() is getting a little long, so bundle up all this code\n" - "into a new dax_insert_mapping() function. Call clear_page() instead\n" - "of dax_clear_blocks(), since we know we're only clearing a single page.\n" - "And use bdev_direct_access() instead of dax_get_pfn() since we actually\n" - "want both the pfn (for inserting the map) and the address (for clearing\n" - "the memory).\n" - "\n" - "Signed-off-by: Matthew Wilcox <willy@linux.intel.com>\n" - "---\n" - " fs/dax.c | 87 ++++++++++++++++++++++++++++++++++++----------------------------\n" - " 1 file changed, 49 insertions(+), 38 deletions(-)\n" - "\n" - "diff --git a/fs/dax.c b/fs/dax.c\n" - "index fabe9da..b130b47 100644\n" - "--- a/fs/dax.c\n" - "+++ b/fs/dax.c\n" - "@@ -68,14 +68,6 @@ static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)\n" - " \treturn bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);\n" - " }\n" - " \n" - "-static long dax_get_pfn(struct buffer_head *bh, unsigned long *pfn,\n" - "-\t\t\t\t\t\t\tunsigned blkbits)\n" - "-{\n" - "-\tvoid *addr;\n" - "-\tsector_t sector = bh->b_blocknr << (blkbits - 9);\n" - "-\treturn bdev_direct_access(bh->b_bdev, sector, &addr, pfn, bh->b_size);\n" - "-}\n" - "-\n" - " static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,\n" - " \t\t\tloff_t end)\n" - " {\n" - "@@ -283,6 +275,54 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh,\n" - " \treturn 0;\n" - " }\n" - " \n" - "+static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,\n" - "+\t\t\tstruct vm_area_struct *vma, struct vm_fault *vmf)\n" - "+{\n" - "+\tstruct address_space *mapping = inode->i_mapping;\n" - "+\tsector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);\n" - "+\tunsigned long vaddr = (unsigned long)vmf->virtual_address;\n" - "+\tvoid *addr;\n" - "+\tunsigned long pfn;\n" - "+\tpgoff_t size;\n" - "+\tint error;\n" - "+\n" - "+\tmutex_lock(&mapping->i_mmap_mutex);\n" - "+\n" - "+\t/*\n" - "+\t * Check truncate didn't happen while we were allocating a block.\n" - "+\t * If it did, this block may or may not be still allocated to the\n" - "+\t * file. We can't tell the filesystem to free it because we can't\n" - "+\t * take i_mutex here. In the worst case, the file still has blocks\n" - "+\t * allocated past the end of the file.\n" - "+\t */\n" - "+\tsize = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;\n" - "+\tif (unlikely(vmf->pgoff >= size)) {\n" - "+\t\terror = -EIO;\n" - "+\t\tgoto out;\n" - "+\t}\n" - "+\n" - "+\terror = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size);\n" - "+\tif (error < 0)\n" - "+\t\tgoto out;\n" - "+\tif (error < PAGE_SIZE) {\n" - "+\t\terror = -EIO;\n" - "+\t\tgoto out;\n" - "+\t}\n" - "+\n" - "+\tif (buffer_unwritten(bh) || buffer_new(bh)) {\n" - "+\t\tclear_page(addr);\n" - "+\t\tif (bh->b_end_io)\n" - "+\t\t\tbh->b_end_io(bh, 1);\n" - "+\t}\n" - "+\n" - "+\terror = vm_insert_mixed(vma, vaddr, pfn);\n" - "+\n" - "+ out:\n" - "+\tmutex_unlock(&mapping->i_mmap_mutex);\n" - "+\n" - "+\treturn error;\n" - "+}\n" - "+\n" - " static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,\n" - " \t\t\tget_block_t get_block)\n" - " {\n" - "@@ -295,7 +335,6 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,\n" - " \tunsigned blkbits = inode->i_blkbits;\n" - " \tsector_t block;\n" - " \tpgoff_t size;\n" - "-\tunsigned long pfn;\n" - " \tint error;\n" - " \tint major = 0;\n" - " \n" - "@@ -365,14 +404,6 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,\n" - " \t\treturn VM_FAULT_LOCKED;\n" - " \t}\n" - " \n" - "-\tif (buffer_unwritten(&bh) || buffer_new(&bh)) {\n" - "-\t\terror = dax_clear_blocks(inode, bh.b_blocknr, bh.b_size);\n" - "-\t\tif (error)\n" - "-\t\t\tgoto out;\n" - "-\t\tif (bh.b_end_io)\n" - "-\t\t\tbh.b_end_io(&bh, 1);\n" - "-\t}\n" - "-\n" - " \t/* Check we didn't race with a read fault installing a new page */\n" - " \tif (!page && major)\n" - " \t\tpage = find_lock_page(mapping, vmf->pgoff);\n" - "@@ -385,27 +416,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,\n" - " \t\tpage_cache_release(page);\n" - " \t}\n" - " \n" - "-\tmutex_lock(&mapping->i_mmap_mutex);\n" - "-\n" - "-\t/*\n" - "-\t * Check truncate didn't happen while we were allocating a block.\n" - "-\t * If it did, this block may or may not be still allocated to the\n" - "-\t * file. We can't tell the filesystem to free it because we can't\n" - "-\t * take i_mutex here. In the worst case, the file still has blocks\n" - "-\t * allocated past the end of the file.\n" - "-\t */\n" - "-\tsize = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;\n" - "-\tif (unlikely(vmf->pgoff >= size)) {\n" - "-\t\tmutex_unlock(&mapping->i_mmap_mutex);\n" - "-\t\terror = -EIO;\n" - "-\t\tgoto out;\n" - "-\t}\n" - "-\n" - "-\terror = dax_get_pfn(&bh, &pfn, blkbits);\n" - "-\tif (error > 0)\n" - "-\t\terror = vm_insert_mixed(vma, vaddr, pfn);\n" - "-\n" - "-\tmutex_unlock(&mapping->i_mmap_mutex);\n" - "+\terror = dax_insert_mapping(inode, &bh, vma, vmf);\n" - " \n" - " out:\n" - " \tif (error == -ENOMEM)\n" - "-- \n" - 2.1.0 - "\01:5\0" - "fn\00004-dax-Unwritten-extents-don-t-set-the-mapped-flag.patch\0" - "b\0" - ">From 9daf54382b53f3cffc3f050d75edf43e3c51efb4 Mon Sep 17 00:00:00 2001\n" - "From: Matthew Wilcox <willy@linux.intel.com>\n" - "Date: Wed, 24 Sep 2014 13:53:24 -0400\n" - "Subject: [PATCH 4/7] dax: Unwritten extents don't set the mapped flag\n" - "\n" - "Despite an unwritten extent having a defined mapping, buffer_mapped()\n" - "returns false. We don't need to call get_block() again here, since\n" - "we know wat the disk block is that corresponds to this file offset.\n" - "---\n" - " fs/dax.c | 2 +-\n" - " 1 file changed, 1 insertion(+), 1 deletion(-)\n" - "\n" - "diff --git a/fs/dax.c b/fs/dax.c\n" - "index b130b47..59be664 100644\n" - "--- a/fs/dax.c\n" - "+++ b/fs/dax.c\n" - "@@ -366,7 +366,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,\n" - " \tif (error)\n" - " \t\tgoto unlock_page;\n" - " \n" - "-\tif (!buffer_mapped(&bh) && !vmf->cow_page) {\n" - "+\tif (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {\n" - " \t\tif (vmf->flags & FAULT_FLAG_WRITE) {\n" - " \t\t\terror = get_block(inode, block, &bh, 1);\n" - " \t\t\tcount_vm_event(PGMAJFAULT);\n" - "-- \n" - 2.1.0 - "\01:6\0" - "fn\00005-ext4-Add-a-callback-to-convert-unwritten-extents.patch\0" - "b\0" - ">From 96f051597cfd91fe51a30fc3dbdeed290b98d7fe Mon Sep 17 00:00:00 2001\n" - "From: Matthew Wilcox <willy@linux.intel.com>\n" - "Date: Wed, 24 Sep 2014 14:02:38 -0400\n" - "Subject: [PATCH 5/7] ext4: Add a callback to convert unwritten extents\n" - "\n" - "A different bug was masking the problem that unwritten extents need to\n" - "be converted to written extents once we've faulted them into existence.\n" - "Following the XFS example, add a b_end_io callback. We \"borrow\" a few\n" - "additional fields in the buffer_head, but there aren't any big enough\n" - "for a sector_t. Fortunately, we only use this callback for DAX, and\n" - "ext4 already requires a 4k block size for using DAX, which puts the\n" - "limit at 16TB. The page cache already limits file sizes to 16TB on\n" - "32-bit systems, so we don't need to grow any fields.\n" - "---\n" - " fs/ext4/inode.c | 17 +++++++++++++++++\n" - " 1 file changed, 17 insertions(+)\n" - "\n" - "diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c\n" - "index 5edd903..eaa293a 100644\n" - "--- a/fs/ext4/inode.c\n" - "+++ b/fs/ext4/inode.c\n" - "@@ -676,6 +676,18 @@ has_zeroout:\n" - " \treturn retval;\n" - " }\n" - " \n" - "+static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)\n" - "+{\n" - "+\tstruct inode *inode = bh->b_assoc_map->host;\n" - "+\t/* XXX: breaks on 32-bit > 16GB. Is that even supported? */\n" - "+\tloff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;\n" - "+\tint err;\n" - "+\tif (!uptodate)\n" - "+\t\treturn;\n" - "+\tWARN_ON(!buffer_unwritten(bh));\n" - "+\terr = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);\n" - "+}\n" - "+\n" - " /* Maximum number of blocks we map for direct IO at once. */\n" - " #define DIO_MAX_BLOCKS 4096\n" - " \n" - "@@ -713,6 +725,11 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,\n" - " \n" - " \t\tmap_bh(bh, inode->i_sb, map.m_pblk);\n" - " \t\tbh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;\n" - "+\t\tif (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {\n" - "+\t\t\tbh->b_assoc_map = inode->i_mapping;\n" - "+\t\t\tbh->b_private = (void *)(unsigned long)iblock;\n" - "+\t\t\tbh->b_end_io = ext4_end_io_unwritten;\n" - "+\t\t}\n" - " \t\tif (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)\n" - " \t\t\tset_buffer_defer_completion(bh);\n" - " \t\tbh->b_size = inode->i_sb->s_blocksize * map.m_len;\n" - "-- \n" - 2.1.0 - "\01:7\0" - "fn\00006-vfs-Prevent-DAX-I-Os-from-falling-back-to-buffered-I.patch\0" - "b\0" - ">From b73ccea0e0bb4f09fa4ad0a4fa20f6d346bedf50 Mon Sep 17 00:00:00 2001\n" - "From: Matthew Wilcox <willy@linux.intel.com>\n" - "Date: Wed, 24 Sep 2014 14:08:40 -0400\n" - "Subject: [PATCH 6/7] vfs: Prevent DAX I/Os from falling back to buffered I/O\n" - "\n" - "Unlike regular direct I/O, DAX will handle file holes, and there is\n" - "no desire to fall back to buffered I/O. Buffered I/O ought to fail if\n" - "DAX I/O fails, unless we're doing a random-failure test. So skip the\n" - "buffered I/O attempts for DAX files.\n" - "---\n" - " mm/filemap.c | 19 ++++++++++++-------\n" - " 1 file changed, 12 insertions(+), 7 deletions(-)\n" - "\n" - "diff --git a/mm/filemap.c b/mm/filemap.c\n" - "index 19bdb68..e69b586 100644\n" - "--- a/mm/filemap.c\n" - "+++ b/mm/filemap.c\n" - "@@ -1717,9 +1717,11 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)\n" - " \t\t * we've already read everything we wanted to, or if\n" - " \t\t * there was a short read because we hit EOF, go ahead\n" - " \t\t * and return. Otherwise fallthrough to buffered io for\n" - "-\t\t * the rest of the read.\n" - "+\t\t * the rest of the read. Buffered reads will not work for\n" - "+\t\t * DAX files, so don't bother trying.\n" - " \t\t */\n" - "-\t\tif (retval < 0 || !iov_iter_count(iter) || *ppos >= size) {\n" - "+\t\tif (retval < 0 || !iov_iter_count(iter) || *ppos >= size ||\n" - "+\t\t IS_DAX(inode)) {\n" - " \t\t\tfile_accessed(file);\n" - " \t\t\tgoto out;\n" - " \t\t}\n" - "@@ -2582,13 +2584,16 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)\n" - " \t\tloff_t endbyte;\n" - " \n" - " \t\twritten = generic_file_direct_write(iocb, from, pos);\n" - "-\t\tif (written < 0 || written == count)\n" - "-\t\t\tgoto out;\n" - "-\n" - " \t\t/*\n" - "-\t\t * direct-io write to a hole: fall through to buffered I/O\n" - "-\t\t * for completing the rest of the request.\n" - "+\t\t * If the write stopped short of completing, fall back to\n" - "+\t\t * buffered writes. Some filesystems do this for writes to\n" - "+\t\t * holes, for example. For DAX files, a buffered write will\n" - "+\t\t * not succeed (even if it did, DAX does not handle dirty\n" - "+\t\t * page-cache pages correctly).\n" - " \t\t */\n" - "+\t\tif (written < 0 || written == count || IS_DAX(inode))\n" - "+\t\t\tgoto out;\n" - "+\n" - " \t\tpos += written;\n" - " \t\tcount -= written;\n" - " \n" - "-- \n" - 2.1.0 - "\01:8\0" - "fn\00007-dax-Call-b_end_io-outside-the-i_mmap_mutex.patch\0" - "b\0" - ">From 953153c75704a3954052b68c63b4cc3ad15b7a06 Mon Sep 17 00:00:00 2001\n" - "From: Matthew Wilcox <willy@linux.intel.com>\n" - "Date: Wed, 24 Sep 2014 14:27:57 -0400\n" - "Subject: [PATCH 7/7] dax: Call b_end_io outside the i_mmap_mutex\n" - "\n" - "Lockdep helpfully points out that ext4_convert_unwritten_extents will\n" - "start a jbd transaction, and i_mmap_mutex is already nested inside the\n" - "jbd2_handle lock. So move the call to b_end_io to after we drop the\n" - "i_mmap_mutex; this only extends the window in which we can crash and\n" - "the allocation will be lost; it does not introduce any new races.\n" - "---\n" - " fs/dax.c | 8 ++++----\n" - " 1 file changed, 4 insertions(+), 4 deletions(-)\n" - "\n" - "diff --git a/fs/dax.c b/fs/dax.c\n" - "index 59be664..91b7561 100644\n" - "--- a/fs/dax.c\n" - "+++ b/fs/dax.c\n" - "@@ -309,17 +309,17 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,\n" - " \t\tgoto out;\n" - " \t}\n" - " \n" - "-\tif (buffer_unwritten(bh) || buffer_new(bh)) {\n" - "+\tif (buffer_unwritten(bh) || buffer_new(bh))\n" - " \t\tclear_page(addr);\n" - "-\t\tif (bh->b_end_io)\n" - "-\t\t\tbh->b_end_io(bh, 1);\n" - "-\t}\n" - " \n" - " \terror = vm_insert_mixed(vma, vaddr, pfn);\n" - " \n" - " out:\n" - " \tmutex_unlock(&mapping->i_mmap_mutex);\n" - " \n" - "+\tif (bh->b_end_io)\n" - "+\t\tbh->b_end_io(bh, 1);\n" - "+\n" - " \treturn error;\n" - " }\n" - " \n" - "-- \n" - 2.1.0 -f266fa0861019dd9436c855ad7335e23e91496bd157f5f26810b3015277d4cf9 +98d28e2e89f798079193d92f9e246d770e40dd269b294f98b81e535d172e7356
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.