* [patch 1/3] mm: pagecache write deadlocks zerolength fix
@ 2006-11-30 7:20 Nick Piggin
2006-11-30 7:22 ` [patch 2/3] mm: pagecache write deadlocks stale holes fix Nick Piggin
` (2 more replies)
0 siblings, 3 replies; 24+ messages in thread
From: Nick Piggin @ 2006-11-30 7:20 UTC (permalink / raw)
To: Linux Kernel Mailing List, Andrew Morton, linux-fsdevel
writev with a zero-length segment is a noop, and we shouldn't return EFAULT.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h
+++ linux-2.6/include/linux/pagemap.h
@@ -198,6 +198,9 @@ static inline int fault_in_pages_writeab
{
int ret;
+ if (unlikely(size == 0))
+ return 0;
+
/*
* Writing zeroes into userspace here is OK, because we know that if
* the zero gets there, we'll be overwriting it.
@@ -222,6 +225,9 @@ static inline int fault_in_pages_readabl
volatile char c;
int ret;
+ if (unlikely(size == 0))
+ return 0;
+
ret = __get_user(c, uaddr);
if (ret == 0) {
const char __user *end = uaddr + size - 1;
^ permalink raw reply [flat|nested] 24+ messages in thread* [patch 2/3] mm: pagecache write deadlocks stale holes fix 2006-11-30 7:20 [patch 1/3] mm: pagecache write deadlocks zerolength fix Nick Piggin @ 2006-11-30 7:22 ` Nick Piggin 2006-11-30 7:22 ` [patch 3/3] fs: fix cont vs deadlock patches Nick Piggin 2006-11-30 7:26 ` [patch 0/3] more buffered write fixes Nick Piggin 2006-11-30 10:15 ` [patch 1/3] mm: pagecache write deadlocks zerolength fix Andreas Schwab 2 siblings, 1 reply; 24+ messages in thread From: Nick Piggin @ 2006-11-30 7:22 UTC (permalink / raw) To: Linux Kernel Mailing List, Andrew Morton, linux-fsdevel If the data copy within a prepare_write can potentially allocate blocks to fill holes, so if the page copy fails then new blocks must be zeroed so uninitialised data cannot be exposed with a subsequent read. Signed-off-by: Nick Piggin <npiggin@suse.de> Index: linux-2.6/mm/filemap.c =================================================================== --- linux-2.6.orig/mm/filemap.c +++ linux-2.6/mm/filemap.c @@ -1951,7 +1951,14 @@ retry_noprogress: bytes); dec_preempt_count(); - if (!PageUptodate(page)) { + if (unlikely(copied != bytes)) { + /* + * Must zero out new buffers here so that we do end + * up properly filling holes rather than leaving stale + * data in them that might be read in future. + */ + page_zero_new_buffers(page); + /* * If the page is not uptodate, we cannot allow a * partial commit_write because when we unlock the @@ -1965,10 +1972,10 @@ retry_noprogress: * Abort the operation entirely with a zero length * commit_write. Retry. We will enter the * single-segment path below, which should get the - * filesystem to bring the page uputodate for us next + * filesystem to bring the page uptodate for us next * time. */ - if (unlikely(copied != bytes)) + if (!PageUptodate(page)) copied = 0; } Index: linux-2.6/fs/buffer.c =================================================================== --- linux-2.6.orig/fs/buffer.c +++ linux-2.6/fs/buffer.c @@ -1491,6 +1491,39 @@ out: } EXPORT_SYMBOL(block_invalidatepage); +void page_zero_new_buffers(struct page *page) +{ + unsigned int block_start, block_end; + struct buffer_head *head, *bh; + + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + return; + + bh = head = page_buffers(page); + block_start = 0; + do { + block_end = block_start + bh->b_size; + + if (buffer_new(bh)) { + void *kaddr; + + if (!PageUptodate(page)) { + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr+block_start, 0, bh->b_size); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + } + clear_buffer_new(bh); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + } + + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); +} + /* * We attach and possibly dirty the buffers atomically wrt * __set_page_dirty_buffers() via private_lock. try_to_free_buffers @@ -1784,36 +1817,33 @@ static int __block_prepare_write(struct } continue; } - if (buffer_new(bh)) - clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, block, bh, 1); if (err) break; - if (buffer_new(bh)) { - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); - if (PageUptodate(page)) { - set_buffer_uptodate(bh); - continue; - } - if (block_end > to || block_start < from) { - void *kaddr; - - kaddr = kmap_atomic(page, KM_USER0); - if (block_end > to) - memset(kaddr+to, 0, - block_end-to); - if (block_start < from) - memset(kaddr+block_start, - 0, from-block_start); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - } + } + if (buffer_new(bh)) { + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + if (PageUptodate(page)) { + set_buffer_uptodate(bh); continue; } + if (block_end > to || block_start < from) { + void *kaddr; + + kaddr = kmap_atomic(page, KM_USER0); + if (block_end > to) + memset(kaddr+to, 0, block_end-to); + if (block_start < from) + memset(kaddr+block_start, + 0, from-block_start); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + } + continue; } + if (PageUptodate(page)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); @@ -1833,43 +1863,10 @@ static int __block_prepare_write(struct if (!buffer_uptodate(*wait_bh)) err = -EIO; } - if (!err) { - bh = head; - do { - if (buffer_new(bh)) - clear_buffer_new(bh); - } while ((bh = bh->b_this_page) != head); - return 0; - } - /* Error case: */ - /* - * Zero out any newly allocated blocks to avoid exposing stale - * data. If BH_New is set, we know that the block was newly - * allocated in the above loop. - */ - bh = head; - block_start = 0; - do { - block_end = block_start+blocksize; - if (block_end <= from) - goto next_bh; - if (block_start >= to) - break; - if (buffer_new(bh)) { - void *kaddr; - clear_buffer_new(bh); - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr+block_start, 0, bh->b_size); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - } -next_bh: - block_start = block_end; - bh = bh->b_this_page; - } while (bh != head); + if (err) + page_zero_new_buffers(page); + return err; } @@ -2246,7 +2243,6 @@ int nobh_prepare_write(struct page *page int i; int ret = 0; int is_mapped_to_disk = 1; - int dirtied_it = 0; if (PageMappedToDisk(page)) return 0; @@ -2282,15 +2278,16 @@ int nobh_prepare_write(struct page *page if (PageUptodate(page)) continue; if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) { + /* + * XXX: stale data can be exposed as we are not zeroing + * out newly allocated blocks. If a subsequent operation + * fails, we'll never know about this :( + */ kaddr = kmap_atomic(page, KM_USER0); - if (block_start < from) { - memset(kaddr+block_start, 0, from-block_start); - dirtied_it = 1; - } - if (block_end > to) { + if (block_start < from) + memset(kaddr+block_start, 0, block_end-block_start); + if (block_end > to) memset(kaddr + to, 0, block_end - to); - dirtied_it = 1; - } flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); continue; Index: linux-2.6/include/linux/buffer_head.h =================================================================== --- linux-2.6.orig/include/linux/buffer_head.h +++ linux-2.6/include/linux/buffer_head.h @@ -151,6 +151,7 @@ struct buffer_head *alloc_page_buffers(s int retry); void create_empty_buffers(struct page *, unsigned long, unsigned long b_state); +void page_zero_new_buffers(struct page *page); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); void end_buffer_write_sync(struct buffer_head *bh, int uptodate); Index: linux-2.6/include/linux/pagemap.h =================================================================== --- linux-2.6.orig/include/linux/pagemap.h +++ linux-2.6/include/linux/pagemap.h @@ -206,7 +206,7 @@ static inline int fault_in_pages_writeab * the zero gets there, we'll be overwriting it. */ ret = __put_user(0, uaddr); - if (ret == 0) { + if (likely(ret == 0)) { char __user *end = uaddr + size - 1; /* @@ -229,7 +229,7 @@ static inline int fault_in_pages_readabl return 0; ret = __get_user(c, uaddr); - if (ret == 0) { + if (likely(ret == 0)) { const char __user *end = uaddr + size - 1; if (((unsigned long)uaddr & PAGE_MASK) != ^ permalink raw reply [flat|nested] 24+ messages in thread
* [patch 3/3] fs: fix cont vs deadlock patches 2006-11-30 7:22 ` [patch 2/3] mm: pagecache write deadlocks stale holes fix Nick Piggin @ 2006-11-30 7:22 ` Nick Piggin 2006-11-30 11:32 ` Nick Piggin 0 siblings, 1 reply; 24+ messages in thread From: Nick Piggin @ 2006-11-30 7:22 UTC (permalink / raw) To: Linux Kernel Mailing List, Andrew Morton, linux-fsdevel Rework the cont filesystem helpers so that generic_cont_expand does the actual work of expanding the file. cont_prepare_write then calls this routine if expanding is needed, and retries. Also solves the problem where cont_prepare_write would previously hold the target page locked while doing not-very-nice things like locking other pages. Means that zero-length prepare/commit_write pairs are no longer needed as an overloaded directive to extend the file, thus cont should operate better within the new deadlock-free buffered write code. Converts fat over to the new cont scheme. Signed-off-by: Nick Piggin <npiggin@suse.de> Index: linux-2.6/fs/buffer.c =================================================================== --- linux-2.6.orig/fs/buffer.c +++ linux-2.6/fs/buffer.c @@ -2004,19 +2004,20 @@ int block_read_full_page(struct page *pa return 0; } -/* utility function for filesystems that need to do work on expanding - * truncates. Uses prepare/commit_write to allow the filesystem to - * deal with the hole. +/* + * Utility function for filesystems that need to do work on expanding + * truncates. For moronic filesystems that do not allow holes in file. */ -static int __generic_cont_expand(struct inode *inode, loff_t size, - pgoff_t index, unsigned int offset) +int generic_cont_expand(struct inode *inode, loff_t size, loff_t *bytes, + get_block_t *get_block) { struct address_space *mapping = inode->i_mapping; + unsigned long blocksize = 1 << inode->i_blkbits; struct page *page; unsigned long limit; - int err; + int status; - err = -EFBIG; + status = -EFBIG; limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; if (limit != RLIM_INFINITY && size > (loff_t)limit) { send_sig(SIGXFSZ, current, 0); @@ -2025,146 +2026,83 @@ static int __generic_cont_expand(struct if (size > inode->i_sb->s_maxbytes) goto out; - err = -ENOMEM; - page = grab_cache_page(mapping, index); - if (!page) - goto out; - err = mapping->a_ops->prepare_write(NULL, page, offset, offset); - if (err) { - /* - * ->prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. - */ - unlock_page(page); - page_cache_release(page); - vmtruncate(inode, inode->i_size); - goto out; - } + status = 0; - err = mapping->a_ops->commit_write(NULL, page, offset, offset); + while (*bytes < size) { + unsigned int zerofrom; + unsigned int zeroto; + void *kaddr; + pgoff_t pgpos; + + pgpos = *bytes >> PAGE_CACHE_SHIFT; + page = grab_cache_page(mapping, pgpos); + if (!page) { + status = -ENOMEM; + break; + } + /* we might sleep */ + if (*bytes >> PAGE_CACHE_SHIFT != pgpos) + goto unlock; - unlock_page(page); - page_cache_release(page); - if (err > 0) - err = 0; -out: - return err; -} + zerofrom = *bytes & ~PAGE_CACHE_MASK; + if (zerofrom & (blocksize-1)) + *bytes = (*bytes + blocksize-1) & (blocksize-1); -int generic_cont_expand(struct inode *inode, loff_t size) -{ - pgoff_t index; - unsigned int offset; + zeroto = PAGE_CACHE_SIZE; - offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */ + status = __block_prepare_write(inode, page, zerofrom, + zeroto, get_block); + if (status) + goto unlock; + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + status = __block_commit_write(inode, page, zerofrom, zeroto); - /* ugh. in prepare/commit_write, if from==to==start of block, we - ** skip the prepare. make sure we never send an offset for the start - ** of a block - */ - if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { - /* caller must handle this extra byte. */ - offset++; +unlock: + unlock_page(page); + page_cache_release(page); + if (status) { + BUG_ON(status == AOP_TRUNCATED_PAGE); + break; + } } - index = size >> PAGE_CACHE_SHIFT; - return __generic_cont_expand(inode, size, index, offset); -} - -int generic_cont_expand_simple(struct inode *inode, loff_t size) -{ - loff_t pos = size - 1; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1; + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. + */ + if (size > inode->i_size) { + i_size_write(inode, size); + mark_inode_dirty(inode); + } - /* prepare/commit_write can handle even if from==to==start of block. */ - return __generic_cont_expand(inode, size, index, offset); +out: + return status; } -/* - * For moronic filesystems that do not allow holes in file. - * We may have to extend the file. - */ - int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, loff_t *bytes) { - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - struct page *new_page; - pgoff_t pgpos; - long status; - unsigned zerofrom; - unsigned blocksize = 1 << inode->i_blkbits; - void *kaddr; - - while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) { - status = -ENOMEM; - new_page = grab_cache_page(mapping, pgpos); - if (!new_page) - goto out; - /* we might sleep */ - if (*bytes>>PAGE_CACHE_SHIFT != pgpos) { - unlock_page(new_page); - page_cache_release(new_page); - continue; - } - zerofrom = *bytes & ~PAGE_CACHE_MASK; - if (zerofrom & (blocksize-1)) { - *bytes |= (blocksize-1); - (*bytes)++; - } - status = __block_prepare_write(inode, new_page, zerofrom, - PAGE_CACHE_SIZE, get_block); - if (status) - goto out_unmap; - kaddr = kmap_atomic(new_page, KM_USER0); - memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); - flush_dcache_page(new_page); - kunmap_atomic(kaddr, KM_USER0); - generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE); - unlock_page(new_page); - page_cache_release(new_page); - } - - if (page->index < pgpos) { - /* completely inside the area */ - zerofrom = offset; - } else { - /* page covers the boundary, find the boundary offset */ - zerofrom = *bytes & ~PAGE_CACHE_MASK; - - /* if we will expand the thing last block will be filled */ - if (to > zerofrom && (zerofrom & (blocksize-1))) { - *bytes |= (blocksize-1); - (*bytes)++; - } + loff_t size = (page->index << PAGE_CACHE_SHIFT) + offset; + struct inode *inode = page->mapping->host; + int err; - /* starting below the boundary? Nothing to zero out */ - if (offset <= zerofrom) - zerofrom = offset; - } - status = __block_prepare_write(inode, page, zerofrom, to, get_block); - if (status) - goto out1; - if (zerofrom < offset) { - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr+zerofrom, 0, offset-zerofrom); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - __block_commit_write(inode, page, zerofrom, offset); + if (*bytes < size) { + unlock_page(page); + err = generic_cont_expand(inode, size, bytes, get_block); + if (!err) + err = AOP_TRUNCATED_PAGE; + else + lock_page(page); /* caller expects this */ + return err; } - return 0; -out1: - ClearPageUptodate(page); - return status; -out_unmap: - ClearPageUptodate(new_page); - unlock_page(new_page); - page_cache_release(new_page); -out: - return status; + err = __block_prepare_write(inode, page, offset, to, get_block); + if (err) + ClearPageUptodate(page); + return err; } int block_prepare_write(struct page *page, unsigned from, unsigned to, @@ -3015,7 +2953,6 @@ EXPORT_SYMBOL(fsync_bdev); EXPORT_SYMBOL(generic_block_bmap); EXPORT_SYMBOL(generic_commit_write); EXPORT_SYMBOL(generic_cont_expand); -EXPORT_SYMBOL(generic_cont_expand_simple); EXPORT_SYMBOL(init_buffer); EXPORT_SYMBOL(invalidate_bdev); EXPORT_SYMBOL(ll_rw_block); Index: linux-2.6/include/linux/buffer_head.h =================================================================== --- linux-2.6.orig/include/linux/buffer_head.h +++ linux-2.6/include/linux/buffer_head.h @@ -202,8 +202,7 @@ int block_read_full_page(struct page*, g int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*, loff_t *); -int generic_cont_expand(struct inode *inode, loff_t size); -int generic_cont_expand_simple(struct inode *inode, loff_t size); +int generic_cont_expand(struct inode *inode, loff_t size, loff_t *bytes, get_block_t *); int block_commit_write(struct page *page, unsigned from, unsigned to); void block_sync_page(struct page *); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); Index: linux-2.6/fs/fat/inode.c =================================================================== --- linux-2.6.orig/fs/fat/inode.c +++ linux-2.6/fs/fat/inode.c @@ -151,11 +151,12 @@ static int fat_commit_write(struct file { struct inode *inode = page->mapping->host; int err; - if (to - from > 0) - return 0; err = generic_commit_write(file, page, from, to); - if (!err && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { + if (err) + return err; + + if (!(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; MSDOS_I(inode)->i_attrs |= ATTR_ARCH; mark_inode_dirty(inode); ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 3/3] fs: fix cont vs deadlock patches 2006-11-30 7:22 ` [patch 3/3] fs: fix cont vs deadlock patches Nick Piggin @ 2006-11-30 11:32 ` Nick Piggin 2006-11-30 22:14 ` OGAWA Hirofumi 0 siblings, 1 reply; 24+ messages in thread From: Nick Piggin @ 2006-11-30 11:32 UTC (permalink / raw) To: Linux Kernel Mailing List, Andrew Morton, linux-fsdevel Sorry, missed a hunk which prevented fat from linking :( --- Rework the cont filesystem helpers so that generic_cont_expand does the actual work of expanding the file. cont_prepare_write then calls this routine if expanding is needed, and retries. Also solves the problem where cont_prepare_write would previously hold the target page locked while doing not-very-nice things like locking other pages. Means that zero-length prepare/commit_write pairs are no longer needed as an overloaded directive to extend the file, thus cont should operate better within the new deadlock-free buffered write code. Converts fat over to the new cont scheme. Signed-off-by: Nick Piggin <npiggin@suse.de> Index: linux-2.6/fs/buffer.c =================================================================== --- linux-2.6.orig/fs/buffer.c +++ linux-2.6/fs/buffer.c @@ -2004,19 +2004,20 @@ int block_read_full_page(struct page *pa return 0; } -/* utility function for filesystems that need to do work on expanding - * truncates. Uses prepare/commit_write to allow the filesystem to - * deal with the hole. +/* + * Utility function for filesystems that need to do work on expanding + * truncates. For moronic filesystems that do not allow holes in file. */ -static int __generic_cont_expand(struct inode *inode, loff_t size, - pgoff_t index, unsigned int offset) +int generic_cont_expand(struct inode *inode, loff_t size, loff_t *bytes, + get_block_t *get_block) { struct address_space *mapping = inode->i_mapping; + unsigned long blocksize = 1 << inode->i_blkbits; struct page *page; unsigned long limit; - int err; + int status; - err = -EFBIG; + status = -EFBIG; limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; if (limit != RLIM_INFINITY && size > (loff_t)limit) { send_sig(SIGXFSZ, current, 0); @@ -2025,146 +2026,83 @@ static int __generic_cont_expand(struct if (size > inode->i_sb->s_maxbytes) goto out; - err = -ENOMEM; - page = grab_cache_page(mapping, index); - if (!page) - goto out; - err = mapping->a_ops->prepare_write(NULL, page, offset, offset); - if (err) { - /* - * ->prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. - */ - unlock_page(page); - page_cache_release(page); - vmtruncate(inode, inode->i_size); - goto out; - } + status = 0; - err = mapping->a_ops->commit_write(NULL, page, offset, offset); + while (*bytes < size) { + unsigned int zerofrom; + unsigned int zeroto; + void *kaddr; + pgoff_t pgpos; + + pgpos = *bytes >> PAGE_CACHE_SHIFT; + page = grab_cache_page(mapping, pgpos); + if (!page) { + status = -ENOMEM; + break; + } + /* we might sleep */ + if (*bytes >> PAGE_CACHE_SHIFT != pgpos) + goto unlock; - unlock_page(page); - page_cache_release(page); - if (err > 0) - err = 0; -out: - return err; -} + zerofrom = *bytes & ~PAGE_CACHE_MASK; + if (zerofrom & (blocksize-1)) + *bytes = (*bytes + blocksize-1) & (blocksize-1); -int generic_cont_expand(struct inode *inode, loff_t size) -{ - pgoff_t index; - unsigned int offset; + zeroto = PAGE_CACHE_SIZE; - offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */ + status = __block_prepare_write(inode, page, zerofrom, + zeroto, get_block); + if (status) + goto unlock; + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + status = __block_commit_write(inode, page, zerofrom, zeroto); - /* ugh. in prepare/commit_write, if from==to==start of block, we - ** skip the prepare. make sure we never send an offset for the start - ** of a block - */ - if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { - /* caller must handle this extra byte. */ - offset++; +unlock: + unlock_page(page); + page_cache_release(page); + if (status) { + BUG_ON(status == AOP_TRUNCATED_PAGE); + break; + } } - index = size >> PAGE_CACHE_SHIFT; - return __generic_cont_expand(inode, size, index, offset); -} - -int generic_cont_expand_simple(struct inode *inode, loff_t size) -{ - loff_t pos = size - 1; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1; + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. + */ + if (size > inode->i_size) { + i_size_write(inode, size); + mark_inode_dirty(inode); + } - /* prepare/commit_write can handle even if from==to==start of block. */ - return __generic_cont_expand(inode, size, index, offset); +out: + return status; } -/* - * For moronic filesystems that do not allow holes in file. - * We may have to extend the file. - */ - int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, loff_t *bytes) { - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - struct page *new_page; - pgoff_t pgpos; - long status; - unsigned zerofrom; - unsigned blocksize = 1 << inode->i_blkbits; - void *kaddr; - - while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) { - status = -ENOMEM; - new_page = grab_cache_page(mapping, pgpos); - if (!new_page) - goto out; - /* we might sleep */ - if (*bytes>>PAGE_CACHE_SHIFT != pgpos) { - unlock_page(new_page); - page_cache_release(new_page); - continue; - } - zerofrom = *bytes & ~PAGE_CACHE_MASK; - if (zerofrom & (blocksize-1)) { - *bytes |= (blocksize-1); - (*bytes)++; - } - status = __block_prepare_write(inode, new_page, zerofrom, - PAGE_CACHE_SIZE, get_block); - if (status) - goto out_unmap; - kaddr = kmap_atomic(new_page, KM_USER0); - memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); - flush_dcache_page(new_page); - kunmap_atomic(kaddr, KM_USER0); - generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE); - unlock_page(new_page); - page_cache_release(new_page); - } - - if (page->index < pgpos) { - /* completely inside the area */ - zerofrom = offset; - } else { - /* page covers the boundary, find the boundary offset */ - zerofrom = *bytes & ~PAGE_CACHE_MASK; - - /* if we will expand the thing last block will be filled */ - if (to > zerofrom && (zerofrom & (blocksize-1))) { - *bytes |= (blocksize-1); - (*bytes)++; - } + loff_t size = (page->index << PAGE_CACHE_SHIFT) + offset; + struct inode *inode = page->mapping->host; + int err; - /* starting below the boundary? Nothing to zero out */ - if (offset <= zerofrom) - zerofrom = offset; - } - status = __block_prepare_write(inode, page, zerofrom, to, get_block); - if (status) - goto out1; - if (zerofrom < offset) { - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr+zerofrom, 0, offset-zerofrom); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - __block_commit_write(inode, page, zerofrom, offset); + if (*bytes < size) { + unlock_page(page); + err = generic_cont_expand(inode, size, bytes, get_block); + if (!err) + err = AOP_TRUNCATED_PAGE; + else + lock_page(page); /* caller expects this */ + return err; } - return 0; -out1: - ClearPageUptodate(page); - return status; -out_unmap: - ClearPageUptodate(new_page); - unlock_page(new_page); - page_cache_release(new_page); -out: - return status; + err = __block_prepare_write(inode, page, offset, to, get_block); + if (err) + ClearPageUptodate(page); + return err; } int block_prepare_write(struct page *page, unsigned from, unsigned to, @@ -3015,7 +2953,6 @@ EXPORT_SYMBOL(fsync_bdev); EXPORT_SYMBOL(generic_block_bmap); EXPORT_SYMBOL(generic_commit_write); EXPORT_SYMBOL(generic_cont_expand); -EXPORT_SYMBOL(generic_cont_expand_simple); EXPORT_SYMBOL(init_buffer); EXPORT_SYMBOL(invalidate_bdev); EXPORT_SYMBOL(ll_rw_block); Index: linux-2.6/include/linux/buffer_head.h =================================================================== --- linux-2.6.orig/include/linux/buffer_head.h +++ linux-2.6/include/linux/buffer_head.h @@ -202,8 +202,7 @@ int block_read_full_page(struct page*, g int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*, loff_t *); -int generic_cont_expand(struct inode *inode, loff_t size); -int generic_cont_expand_simple(struct inode *inode, loff_t size); +int generic_cont_expand(struct inode *inode, loff_t size, loff_t *bytes, get_block_t *); int block_commit_write(struct page *page, unsigned from, unsigned to); void block_sync_page(struct page *); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); Index: linux-2.6/fs/fat/inode.c =================================================================== --- linux-2.6.orig/fs/fat/inode.c +++ linux-2.6/fs/fat/inode.c @@ -117,6 +117,25 @@ static int fat_get_block(struct inode *i return 0; } +int fat_cont_expand(struct inode *inode, loff_t size) +{ + struct address_space *mapping = inode->i_mapping; + loff_t start = inode->i_size, count = size - inode->i_size; + int err; + + err = generic_cont_expand(inode, size, + &MSDOS_I(inode)->mmu_private, fat_get_block); + if (err) + goto out; + + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + if (IS_SYNC(inode)) + err = sync_page_range_nolock(inode, mapping, start, count); +out: + return err; +} + static int fat_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, fat_get_block, wbc); @@ -151,11 +170,12 @@ static int fat_commit_write(struct file { struct inode *inode = page->mapping->host; int err; - if (to - from > 0) - return 0; err = generic_commit_write(file, page, from, to); - if (!err && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { + if (err) + return err; + + if (!(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; MSDOS_I(inode)->i_attrs |= ATTR_ARCH; mark_inode_dirty(inode); Index: linux-2.6/fs/fat/file.c =================================================================== --- linux-2.6.orig/fs/fat/file.c +++ linux-2.6/fs/fat/file.c @@ -137,24 +137,6 @@ const struct file_operations fat_file_op .sendfile = generic_file_sendfile, }; -static int fat_cont_expand(struct inode *inode, loff_t size) -{ - struct address_space *mapping = inode->i_mapping; - loff_t start = inode->i_size, count = size - inode->i_size; - int err; - - err = generic_cont_expand_simple(inode, size); - if (err) - goto out; - - inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - if (IS_SYNC(inode)) - err = sync_page_range_nolock(inode, mapping, start, count); -out: - return err; -} - int fat_notify_change(struct dentry *dentry, struct iattr *attr) { struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); Index: linux-2.6/include/linux/msdos_fs.h =================================================================== --- linux-2.6.orig/include/linux/msdos_fs.h +++ linux-2.6/include/linux/msdos_fs.h @@ -406,6 +406,7 @@ extern int fat_getattr(struct vfsmount * struct kstat *stat); /* fat/inode.c */ +extern int fat_cont_expand(struct inode *inode, loff_t size); extern void fat_attach(struct inode *inode, loff_t i_pos); extern void fat_detach(struct inode *inode); extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos); ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 3/3] fs: fix cont vs deadlock patches 2006-11-30 11:32 ` Nick Piggin @ 2006-11-30 22:14 ` OGAWA Hirofumi 2006-12-01 0:27 ` Nick Piggin 2006-12-01 2:09 ` Nick Piggin 0 siblings, 2 replies; 24+ messages in thread From: OGAWA Hirofumi @ 2006-11-30 22:14 UTC (permalink / raw) To: Nick Piggin; +Cc: Linux Kernel Mailing List, Andrew Morton, linux-fsdevel Nick Piggin <npiggin@suse.de> writes: > Rework the cont filesystem helpers so that generic_cont_expand does the > actual work of expanding the file. cont_prepare_write then calls this > routine if expanding is needed, and retries. Also solves the problem > where cont_prepare_write would previously hold the target page locked > while doing not-very-nice things like locking other pages. > > Means that zero-length prepare/commit_write pairs are no longer needed > as an overloaded directive to extend the file, thus cont should operate > better within the new deadlock-free buffered write code. > > Converts fat over to the new cont scheme. > > Signed-off-by: Nick Piggin <npiggin@suse.de> > > > Index: linux-2.6/fs/buffer.c > =================================================================== > --- linux-2.6.orig/fs/buffer.c > +++ linux-2.6/fs/buffer.c > @@ -2004,19 +2004,20 @@ int block_read_full_page(struct page *pa > return 0; > } > > -/* utility function for filesystems that need to do work on expanding > - * truncates. Uses prepare/commit_write to allow the filesystem to > - * deal with the hole. > +/* > + * Utility function for filesystems that need to do work on expanding > + * truncates. For moronic filesystems that do not allow holes in file. > */ > -static int __generic_cont_expand(struct inode *inode, loff_t size, > - pgoff_t index, unsigned int offset) > +int generic_cont_expand(struct inode *inode, loff_t size, loff_t *bytes, > + get_block_t *get_block) > { > struct address_space *mapping = inode->i_mapping; > + unsigned long blocksize = 1 << inode->i_blkbits; > struct page *page; > unsigned long limit; > - int err; > + int status; > > - err = -EFBIG; > + status = -EFBIG; > limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; > if (limit != RLIM_INFINITY && size > (loff_t)limit) { > send_sig(SIGXFSZ, current, 0); > @@ -2025,146 +2026,83 @@ static int __generic_cont_expand(struct > if (size > inode->i_sb->s_maxbytes) > goto out; > > - err = -ENOMEM; > - page = grab_cache_page(mapping, index); > - if (!page) > - goto out; > - err = mapping->a_ops->prepare_write(NULL, page, offset, offset); > - if (err) { > - /* > - * ->prepare_write() may have instantiated a few blocks > - * outside i_size. Trim these off again. > - */ > - unlock_page(page); > - page_cache_release(page); > - vmtruncate(inode, inode->i_size); > - goto out; > - } > + status = 0; > > - err = mapping->a_ops->commit_write(NULL, page, offset, offset); > + while (*bytes < size) { > + unsigned int zerofrom; > + unsigned int zeroto; > + void *kaddr; > + pgoff_t pgpos; > + > + pgpos = *bytes >> PAGE_CACHE_SHIFT; > + page = grab_cache_page(mapping, pgpos); > + if (!page) { > + status = -ENOMEM; > + break; > + } > + /* we might sleep */ > + if (*bytes >> PAGE_CACHE_SHIFT != pgpos) > + goto unlock; > > - unlock_page(page); > - page_cache_release(page); > - if (err > 0) > - err = 0; > -out: > - return err; > -} > + zerofrom = *bytes & ~PAGE_CACHE_MASK; > + if (zerofrom & (blocksize-1)) > + *bytes = (*bytes + blocksize-1) & (blocksize-1); > > -int generic_cont_expand(struct inode *inode, loff_t size) > -{ > - pgoff_t index; > - unsigned int offset; > + zeroto = PAGE_CACHE_SIZE; > > - offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */ > + status = __block_prepare_write(inode, page, zerofrom, > + zeroto, get_block); > + if (status) > + goto unlock; > + kaddr = kmap_atomic(page, KM_USER0); > + memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); > + flush_dcache_page(page); > + kunmap_atomic(kaddr, KM_USER0); > + status = __block_commit_write(inode, page, zerofrom, zeroto); > > - /* ugh. in prepare/commit_write, if from==to==start of block, we > - ** skip the prepare. make sure we never send an offset for the start > - ** of a block > - */ > - if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { > - /* caller must handle this extra byte. */ > - offset++; > +unlock: > + unlock_page(page); > + page_cache_release(page); > + if (status) { > + BUG_ON(status == AOP_TRUNCATED_PAGE); > + break; > + } > } > - index = size >> PAGE_CACHE_SHIFT; > > - return __generic_cont_expand(inode, size, index, offset); > -} > - > -int generic_cont_expand_simple(struct inode *inode, loff_t size) > -{ > - loff_t pos = size - 1; > - pgoff_t index = pos >> PAGE_CACHE_SHIFT; > - unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1; > + /* > + * No need to use i_size_read() here, the i_size > + * cannot change under us because we hold i_mutex. > + */ > + if (size > inode->i_size) { > + i_size_write(inode, size); > + mark_inode_dirty(inode); > + } > > - /* prepare/commit_write can handle even if from==to==start of block. */ > - return __generic_cont_expand(inode, size, index, offset); > +out: > + return status; > } quick look. Doesn't this break reiserfs? IIRC, the reiserfs is using it for another reason. I was also working for this, but I lost the thread of this, sorry. I found some another users (affs, hfs, hfsplus). Those seem have same problem, but probably those also can use this... What do you think? -- OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> --- fs/buffer.c | 59 +++++++++++++++++--------------------------- fs/fat/file.c | 2 - include/linux/buffer_head.h | 1 3 files changed, 24 insertions(+), 38 deletions(-) diff -puN fs/buffer.c~generic_cont_expand-avoid-zero-size fs/buffer.c --- linux-2.6/fs/buffer.c~generic_cont_expand-avoid-zero-size 2006-11-13 01:42:01.000000000 +0900 +++ linux-2.6-hirofumi/fs/buffer.c 2006-11-13 02:16:20.000000000 +0900 @@ -2004,18 +2004,24 @@ int block_read_full_page(struct page *pa return 0; } -/* utility function for filesystems that need to do work on expanding +/* + * utility function for filesystems that need to do work on expanding * truncates. Uses prepare/commit_write to allow the filesystem to * deal with the hole. */ -static int __generic_cont_expand(struct inode *inode, loff_t size, - pgoff_t index, unsigned int offset) +int generic_cont_expand(struct inode *inode, loff_t size) { struct address_space *mapping = inode->i_mapping; + loff_t pos = inode->i_size; struct page *page; unsigned long limit; + pgoff_t index; + unsigned int from, to; + void *kaddr; int err; + WARN_ON(pos >= size); + err = -EFBIG; limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; if (limit != RLIM_INFINITY && size > (loff_t)limit) { @@ -2025,11 +2031,18 @@ static int __generic_cont_expand(struct if (size > inode->i_sb->s_maxbytes) goto out; + index = (size - 1) >> PAGE_CACHE_SHIFT; + to = size - ((loff_t)index << PAGE_CACHE_SHIFT); + if (index != (pos >> PAGE_CACHE_SHIFT)) + from = 0; + else + from = pos & (PAGE_CACHE_SIZE - 1); + err = -ENOMEM; page = grab_cache_page(mapping, index); if (!page) goto out; - err = mapping->a_ops->prepare_write(NULL, page, offset, offset); + err = mapping->a_ops->prepare_write(NULL, page, from, to); if (err) { /* * ->prepare_write() may have instantiated a few blocks @@ -2041,7 +2054,12 @@ static int __generic_cont_expand(struct goto out; } - err = mapping->a_ops->commit_write(NULL, page, offset, offset); + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + from, 0, to - from); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + + err = mapping->a_ops->commit_write(NULL, page, from, to); unlock_page(page); page_cache_release(page); @@ -2051,36 +2069,6 @@ out: return err; } -int generic_cont_expand(struct inode *inode, loff_t size) -{ - pgoff_t index; - unsigned int offset; - - offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */ - - /* ugh. in prepare/commit_write, if from==to==start of block, we - ** skip the prepare. make sure we never send an offset for the start - ** of a block - */ - if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { - /* caller must handle this extra byte. */ - offset++; - } - index = size >> PAGE_CACHE_SHIFT; - - return __generic_cont_expand(inode, size, index, offset); -} - -int generic_cont_expand_simple(struct inode *inode, loff_t size) -{ - loff_t pos = size - 1; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1; - - /* prepare/commit_write can handle even if from==to==start of block. */ - return __generic_cont_expand(inode, size, index, offset); -} - /* * For moronic filesystems that do not allow holes in file. * We may have to extend the file. @@ -3032,7 +3020,6 @@ EXPORT_SYMBOL(fsync_bdev); EXPORT_SYMBOL(generic_block_bmap); EXPORT_SYMBOL(generic_commit_write); EXPORT_SYMBOL(generic_cont_expand); -EXPORT_SYMBOL(generic_cont_expand_simple); EXPORT_SYMBOL(init_buffer); EXPORT_SYMBOL(invalidate_bdev); EXPORT_SYMBOL(ll_rw_block); diff -puN include/linux/buffer_head.h~generic_cont_expand-avoid-zero-size include/linux/buffer_head.h --- linux-2.6/include/linux/buffer_head.h~generic_cont_expand-avoid-zero-size 2006-11-13 01:42:01.000000000 +0900 +++ linux-2.6-hirofumi/include/linux/buffer_head.h 2006-11-13 01:42:01.000000000 +0900 @@ -202,7 +202,6 @@ int block_prepare_write(struct page*, un int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*, loff_t *); int generic_cont_expand(struct inode *inode, loff_t size); -int generic_cont_expand_simple(struct inode *inode, loff_t size); int block_commit_write(struct page *page, unsigned from, unsigned to); void block_sync_page(struct page *); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); diff -puN fs/fat/file.c~generic_cont_expand-avoid-zero-size fs/fat/file.c --- linux-2.6/fs/fat/file.c~generic_cont_expand-avoid-zero-size 2006-11-13 01:42:01.000000000 +0900 +++ linux-2.6-hirofumi/fs/fat/file.c 2006-11-13 01:42:01.000000000 +0900 @@ -143,7 +143,7 @@ static int fat_cont_expand(struct inode loff_t start = inode->i_size, count = size - inode->i_size; int err; - err = generic_cont_expand_simple(inode, size); + err = generic_cont_expand(inode, size); if (err) goto out; _ ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 3/3] fs: fix cont vs deadlock patches 2006-11-30 22:14 ` OGAWA Hirofumi @ 2006-12-01 0:27 ` Nick Piggin 2006-12-01 1:11 ` OGAWA Hirofumi 2006-12-01 2:09 ` Nick Piggin 1 sibling, 1 reply; 24+ messages in thread From: Nick Piggin @ 2006-12-01 0:27 UTC (permalink / raw) To: OGAWA Hirofumi; +Cc: Linux Kernel Mailing List, Andrew Morton, linux-fsdevel On Fri, Dec 01, 2006 at 07:14:28AM +0900, OGAWA Hirofumi wrote: > > quick look. Doesn't this break reiserfs? IIRC, the reiserfs is using > it for another reason. I was also working for this, but I lost the > thread of this, sorry. Yes I think it will break reiserfs, so I just have to have a look at converting it. Shouldn't take too long. > > I found some another users (affs, hfs, hfsplus). Those seem have same > problem, but probably those also can use this... > > What do you think? Well I guess this is your code, so it is up to you ;) I would be happy if you come up with a quick fix, I'm just trying to stamp out a few big bugs in mm. However I did prefer my way of moving all the exapand code into generic_cont_expand, out of prepare_write, and avoiding holding the target page locked while we're doing all the expand work (strictly, you might be able to get away with this, but it is fragile and ugly). AFAIKS, the only reason to use prepare_write is to avoid passing the get_block into generic_cont_expand? Thanks, Nick ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 3/3] fs: fix cont vs deadlock patches 2006-12-01 0:27 ` Nick Piggin @ 2006-12-01 1:11 ` OGAWA Hirofumi 2006-12-01 2:03 ` Nick Piggin 0 siblings, 1 reply; 24+ messages in thread From: OGAWA Hirofumi @ 2006-12-01 1:11 UTC (permalink / raw) To: Nick Piggin; +Cc: Linux Kernel Mailing List, Andrew Morton, linux-fsdevel Nick Piggin <npiggin@suse.de> writes: > I would be happy if you come up with a quick fix, I'm just trying to > stamp out a few big bugs in mm. However I did prefer my way of moving > all the exapand code into generic_cont_expand, out of prepare_write, and > avoiding holding the target page locked while we're doing all the expand > work (strictly, you might be able to get away with this, but it is > fragile and ugly). > > AFAIKS, the only reason to use prepare_write is to avoid passing the > get_block into generic_cont_expand? IIRC, because generic_cont_expand is designed as really generic. It can also use for non moronic filesystem. In the case of reiserfs, it ->prepare_write might be necessary. -- OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 3/3] fs: fix cont vs deadlock patches 2006-12-01 1:11 ` OGAWA Hirofumi @ 2006-12-01 2:03 ` Nick Piggin 0 siblings, 0 replies; 24+ messages in thread From: Nick Piggin @ 2006-12-01 2:03 UTC (permalink / raw) To: OGAWA Hirofumi; +Cc: Linux Kernel Mailing List, Andrew Morton, linux-fsdevel On Fri, Dec 01, 2006 at 10:11:14AM +0900, OGAWA Hirofumi wrote: > Nick Piggin <npiggin@suse.de> writes: > > > I would be happy if you come up with a quick fix, I'm just trying to > > stamp out a few big bugs in mm. However I did prefer my way of moving > > all the exapand code into generic_cont_expand, out of prepare_write, and > > avoiding holding the target page locked while we're doing all the expand > > work (strictly, you might be able to get away with this, but it is > > fragile and ugly). > > > > AFAIKS, the only reason to use prepare_write is to avoid passing the > > get_block into generic_cont_expand? > > IIRC, because generic_cont_expand is designed as really generic. It > can also use for non moronic filesystem. > > In the case of reiserfs, it ->prepare_write might be necessary. Yes I see :( Well, maybe we should use your alternate patch, then. I have a few questions on it. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 3/3] fs: fix cont vs deadlock patches 2006-11-30 22:14 ` OGAWA Hirofumi 2006-12-01 0:27 ` Nick Piggin @ 2006-12-01 2:09 ` Nick Piggin 2006-12-01 3:41 ` OGAWA Hirofumi 1 sibling, 1 reply; 24+ messages in thread From: Nick Piggin @ 2006-12-01 2:09 UTC (permalink / raw) To: OGAWA Hirofumi; +Cc: Linux Kernel Mailing List, Andrew Morton, linux-fsdevel On Fri, Dec 01, 2006 at 07:14:28AM +0900, OGAWA Hirofumi wrote: > > quick look. Doesn't this break reiserfs? IIRC, the reiserfs is using > it for another reason. I was also working for this, but I lost the > thread of this, sorry. > > I found some another users (affs, hfs, hfsplus). Those seem have same > problem, but probably those also can use this... > > What do you think? > -- > OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> > > > > Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> > --- > > fs/buffer.c | 59 +++++++++++++++++--------------------------- > fs/fat/file.c | 2 - > include/linux/buffer_head.h | 1 > 3 files changed, 24 insertions(+), 38 deletions(-) > > diff -puN fs/buffer.c~generic_cont_expand-avoid-zero-size fs/buffer.c > --- linux-2.6/fs/buffer.c~generic_cont_expand-avoid-zero-size 2006-11-13 01:42:01.000000000 +0900 > +++ linux-2.6-hirofumi/fs/buffer.c 2006-11-13 02:16:20.000000000 +0900 > @@ -2004,18 +2004,24 @@ int block_read_full_page(struct page *pa > return 0; > } > > -/* utility function for filesystems that need to do work on expanding > +/* > + * utility function for filesystems that need to do work on expanding > * truncates. Uses prepare/commit_write to allow the filesystem to > * deal with the hole. > */ > -static int __generic_cont_expand(struct inode *inode, loff_t size, > - pgoff_t index, unsigned int offset) > +int generic_cont_expand(struct inode *inode, loff_t size) > { > struct address_space *mapping = inode->i_mapping; > + loff_t pos = inode->i_size; > struct page *page; > unsigned long limit; > + pgoff_t index; > + unsigned int from, to; > + void *kaddr; > int err; > > + WARN_ON(pos >= size); > + > err = -EFBIG; > limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; > if (limit != RLIM_INFINITY && size > (loff_t)limit) { > @@ -2025,11 +2031,18 @@ static int __generic_cont_expand(struct > if (size > inode->i_sb->s_maxbytes) > goto out; > > + index = (size - 1) >> PAGE_CACHE_SHIFT; > + to = size - ((loff_t)index << PAGE_CACHE_SHIFT); > + if (index != (pos >> PAGE_CACHE_SHIFT)) > + from = 0; > + else > + from = pos & (PAGE_CACHE_SIZE - 1); > + > err = -ENOMEM; > page = grab_cache_page(mapping, index); > if (!page) > goto out; > - err = mapping->a_ops->prepare_write(NULL, page, offset, offset); > + err = mapping->a_ops->prepare_write(NULL, page, from, to); > if (err) { > /* > * ->prepare_write() may have instantiated a few blocks > @@ -2041,7 +2054,12 @@ static int __generic_cont_expand(struct > goto out; > } > > - err = mapping->a_ops->commit_write(NULL, page, offset, offset); > + kaddr = kmap_atomic(page, KM_USER0); > + memset(kaddr + from, 0, to - from); > + flush_dcache_page(page); > + kunmap_atomic(kaddr, KM_USER0); > + > + err = mapping->a_ops->commit_write(NULL, page, from, to); So basically this is changing from having prepare_write do all the zeroing, to zeroing the last page in generic_cont_expand, so that we don't have to pass a zero-length to prepare_write? ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 3/3] fs: fix cont vs deadlock patches 2006-12-01 2:09 ` Nick Piggin @ 2006-12-01 3:41 ` OGAWA Hirofumi 2006-12-01 3:47 ` Nick Piggin 2006-12-01 5:08 ` Nick Piggin 0 siblings, 2 replies; 24+ messages in thread From: OGAWA Hirofumi @ 2006-12-01 3:41 UTC (permalink / raw) To: Nick Piggin; +Cc: Linux Kernel Mailing List, Andrew Morton, linux-fsdevel Nick Piggin <npiggin@suse.de> writes: > On Fri, Dec 01, 2006 at 07:14:28AM +0900, OGAWA Hirofumi wrote: >> >> quick look. Doesn't this break reiserfs? IIRC, the reiserfs is using >> it for another reason. I was also working for this, but I lost the >> thread of this, sorry. >> >> I found some another users (affs, hfs, hfsplus). Those seem have same >> problem, but probably those also can use this... >> >> What do you think? >> -- >> OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> >> >> >> >> Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> >> --- >> >> fs/buffer.c | 59 +++++++++++++++++--------------------------- >> fs/fat/file.c | 2 - >> include/linux/buffer_head.h | 1 >> 3 files changed, 24 insertions(+), 38 deletions(-) >> >> diff -puN fs/buffer.c~generic_cont_expand-avoid-zero-size fs/buffer.c >> --- linux-2.6/fs/buffer.c~generic_cont_expand-avoid-zero-size 2006-11-13 01:42:01.000000000 +0900 >> +++ linux-2.6-hirofumi/fs/buffer.c 2006-11-13 02:16:20.000000000 +0900 >> @@ -2004,18 +2004,24 @@ int block_read_full_page(struct page *pa >> return 0; >> } >> >> -/* utility function for filesystems that need to do work on expanding >> +/* >> + * utility function for filesystems that need to do work on expanding >> * truncates. Uses prepare/commit_write to allow the filesystem to >> * deal with the hole. >> */ >> -static int __generic_cont_expand(struct inode *inode, loff_t size, >> - pgoff_t index, unsigned int offset) >> +int generic_cont_expand(struct inode *inode, loff_t size) >> { >> struct address_space *mapping = inode->i_mapping; >> + loff_t pos = inode->i_size; >> struct page *page; >> unsigned long limit; >> + pgoff_t index; >> + unsigned int from, to; >> + void *kaddr; >> int err; >> >> + WARN_ON(pos >= size); >> + >> err = -EFBIG; >> limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; >> if (limit != RLIM_INFINITY && size > (loff_t)limit) { >> @@ -2025,11 +2031,18 @@ static int __generic_cont_expand(struct >> if (size > inode->i_sb->s_maxbytes) >> goto out; >> >> + index = (size - 1) >> PAGE_CACHE_SHIFT; >> + to = size - ((loff_t)index << PAGE_CACHE_SHIFT); >> + if (index != (pos >> PAGE_CACHE_SHIFT)) >> + from = 0; >> + else >> + from = pos & (PAGE_CACHE_SIZE - 1); >> + >> err = -ENOMEM; >> page = grab_cache_page(mapping, index); >> if (!page) >> goto out; >> - err = mapping->a_ops->prepare_write(NULL, page, offset, offset); >> + err = mapping->a_ops->prepare_write(NULL, page, from, to); >> if (err) { >> /* >> * ->prepare_write() may have instantiated a few blocks >> @@ -2041,7 +2054,12 @@ static int __generic_cont_expand(struct >> goto out; >> } >> >> - err = mapping->a_ops->commit_write(NULL, page, offset, offset); >> + kaddr = kmap_atomic(page, KM_USER0); >> + memset(kaddr + from, 0, to - from); >> + flush_dcache_page(page); >> + kunmap_atomic(kaddr, KM_USER0); >> + >> + err = mapping->a_ops->commit_write(NULL, page, from, to); > > So basically this is changing from having prepare_write do all the > zeroing, to zeroing the last page in generic_cont_expand, so that > we don't have to pass a zero-length to prepare_write? Yes, this patch doesn't pass zero-length to prepare_write. However, I'm not checking this patch is ok for reiserfs... -- OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 3/3] fs: fix cont vs deadlock patches 2006-12-01 3:41 ` OGAWA Hirofumi @ 2006-12-01 3:47 ` Nick Piggin 2006-12-01 5:08 ` Nick Piggin 1 sibling, 0 replies; 24+ messages in thread From: Nick Piggin @ 2006-12-01 3:47 UTC (permalink / raw) To: OGAWA Hirofumi; +Cc: Linux Kernel Mailing List, Andrew Morton, linux-fsdevel On Fri, Dec 01, 2006 at 12:41:25PM +0900, OGAWA Hirofumi wrote: > Nick Piggin <npiggin@suse.de> writes: > > > > So basically this is changing from having prepare_write do all the > > zeroing, to zeroing the last page in generic_cont_expand, so that > > we don't have to pass a zero-length to prepare_write? > > Yes, this patch doesn't pass zero-length to prepare_write. However, > I'm not checking this patch is ok for reiserfs... OK, I'm doing some testing with it now... ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 3/3] fs: fix cont vs deadlock patches 2006-12-01 3:41 ` OGAWA Hirofumi 2006-12-01 3:47 ` Nick Piggin @ 2006-12-01 5:08 ` Nick Piggin 2006-12-01 7:21 ` Andrew Morton 1 sibling, 1 reply; 24+ messages in thread From: Nick Piggin @ 2006-12-01 5:08 UTC (permalink / raw) To: OGAWA Hirofumi; +Cc: Linux Kernel Mailing List, Andrew Morton, linux-fsdevel On Fri, Dec 01, 2006 at 12:41:25PM +0900, OGAWA Hirofumi wrote: > > Yes, this patch doesn't pass zero-length to prepare_write. However, > I'm not checking this patch is ok for reiserfs... OK, vfat wasn't working correctly for me -- I needed the following patch: Index: linux-2.6/fs/buffer.c =================================================================== --- linux-2.6.orig/fs/buffer.c 2006-12-01 15:31:22.000000000 +1100 +++ linux-2.6/fs/buffer.c 2006-12-01 16:02:23.000000000 +1100 @@ -2102,6 +2102,7 @@ *bytes |= (blocksize-1); (*bytes)++; } + status = __block_prepare_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE, get_block); if (status) @@ -2110,7 +2111,7 @@ memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); flush_dcache_page(new_page); kunmap_atomic(kaddr, KM_USER0); - generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE); + __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE); unlock_page(new_page); page_cache_release(new_page); } @@ -2142,6 +2143,7 @@ kunmap_atomic(kaddr, KM_USER0); __block_commit_write(inode, page, zerofrom, offset); } + return 0; out1: ClearPageUptodate(page); Index: linux-2.6/fs/fat/inode.c =================================================================== --- linux-2.6.orig/fs/fat/inode.c 2006-12-01 15:31:22.000000000 +1100 +++ linux-2.6/fs/fat/inode.c 2006-12-01 16:02:08.000000000 +1100 @@ -151,7 +151,8 @@ { struct inode *inode = page->mapping->host; int err; - if (to - from > 0) + + if (to - from == 0) return 0; err = generic_commit_write(file, page, from, to); And reiserfs has some creative uses of generic_cont_expand, which I think I've fixed in the following way. I'll send out the full patch if you (and others) agree with the changes. Index: linux-2.6/fs/reiserfs/file.c =================================================================== --- linux-2.6.orig/fs/reiserfs/file.c 2006-12-01 16:01:17.000000000 +1100 +++ linux-2.6/fs/reiserfs/file.c 2006-12-01 16:03:34.000000000 +1100 @@ -892,6 +892,34 @@ return retval; } +static int reiserfs_convert_tail(struct inode *inode, loff_t offset) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t index; + struct page *page; + int err = -ENOMEM; + + index = offset >> PAGE_CACHE_SHIFT; + page = grab_cache_page(mapping, index); + if (!page) + goto out; + + offset = offset & ~PAGE_CACHE_MASK; + WARN_ON(!offset); /* shouldn't pass zero length to prepare/commit */ + + err = mapping->a_ops->prepare_write(NULL, page, 0, offset); + if (err) + goto out_unlock; + + err = mapping->a_ops->commit_write(NULL, page, 0, offset); + +out_unlock: + unlock_page(page); + page_cache_release(page); +out: + return err; +} + /* Look if passed writing region is going to touch file's tail (if it is present). And if it is, convert the tail to unformatted node */ static int reiserfs_check_for_tail_and_convert(struct inode *inode, /* inode to deal with */ @@ -937,7 +965,8 @@ le_key_k_offset(get_inode_item_key_version(inode), &(ih->ih_key)); pathrelse(&path); - res = generic_cont_expand(inode, cont_expand_offset); + + res = reiserfs_convert_tail(inode, cont_expand_offset); } else pathrelse(&path); ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 3/3] fs: fix cont vs deadlock patches 2006-12-01 5:08 ` Nick Piggin @ 2006-12-01 7:21 ` Andrew Morton 2006-12-01 7:53 ` Nick Piggin 0 siblings, 1 reply; 24+ messages in thread From: Andrew Morton @ 2006-12-01 7:21 UTC (permalink / raw) To: Nick Piggin; +Cc: OGAWA Hirofumi, Linux Kernel Mailing List, linux-fsdevel On Fri, 1 Dec 2006 06:08:52 +0100 Nick Piggin <npiggin@suse.de> wrote: > On Fri, Dec 01, 2006 at 12:41:25PM +0900, OGAWA Hirofumi wrote: > > > > Yes, this patch doesn't pass zero-length to prepare_write. However, > > I'm not checking this patch is ok for reiserfs... > > OK, vfat wasn't working correctly for me -- I needed the following patch: Now I'm confused. What relationship does this patch have to the below? revert-generic_file_buffered_write-handle-zero-length-iovec-segments.patch revert-generic_file_buffered_write-deadlock-on-vectored-write.patch generic_file_buffered_write-cleanup.patch mm-only-mm-debug-write-deadlocks.patch mm-fix-pagecache-write-deadlocks.patch mm-fix-pagecache-write-deadlocks-comment.patch mm-fix-pagecache-write-deadlocks-xip.patch mm-fix-pagecache-write-deadlocks-mm-pagecache-write-deadlocks-efault-fix.patch mm-fix-pagecache-write-deadlocks-zerolength-fix.patch mm-fix-pagecache-write-deadlocks-stale-holes-fix.patch fs-prepare_write-fixes.patch fs-prepare_write-fixes-fuse-fix.patch fs-prepare_write-fixes-jffs-fix.patch fs-prepare_write-fixes-fat-fix.patch fs-fix-cont-vs-deadlock-patches.patch > Index: linux-2.6/fs/buffer.c > =================================================================== > --- linux-2.6.orig/fs/buffer.c 2006-12-01 15:31:22.000000000 +1100 > +++ linux-2.6/fs/buffer.c 2006-12-01 16:02:23.000000000 +1100 > @@ -2102,6 +2102,7 @@ Please always use `diff -p' > *bytes |= (blocksize-1); > (*bytes)++; > } > + > status = __block_prepare_write(inode, new_page, zerofrom, > PAGE_CACHE_SIZE, get_block); > if (status) > @@ -2110,7 +2111,7 @@ > memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); > flush_dcache_page(new_page); > kunmap_atomic(kaddr, KM_USER0); > - generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE); > + __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE); Whatever function this is doesn't need to update i_size? ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 3/3] fs: fix cont vs deadlock patches 2006-12-01 7:21 ` Andrew Morton @ 2006-12-01 7:53 ` Nick Piggin 2006-12-01 14:50 ` OGAWA Hirofumi 0 siblings, 1 reply; 24+ messages in thread From: Nick Piggin @ 2006-12-01 7:53 UTC (permalink / raw) To: Andrew Morton; +Cc: OGAWA Hirofumi, Linux Kernel Mailing List, linux-fsdevel On Thu, Nov 30, 2006 at 11:21:02PM -0800, Andrew Morton wrote: > On Fri, 1 Dec 2006 06:08:52 +0100 > Nick Piggin <npiggin@suse.de> wrote: > > > On Fri, Dec 01, 2006 at 12:41:25PM +0900, OGAWA Hirofumi wrote: > > > > > > Yes, this patch doesn't pass zero-length to prepare_write. However, > > > I'm not checking this patch is ok for reiserfs... > > > > OK, vfat wasn't working correctly for me -- I needed the following patch: > > Now I'm confused. What relationship does this patch have to the below? It is on top of OGAWA Hirofumi's patch, which itself is a replacement for fs-fix-cont-vs-deadlock-patches.patch. I'll ensure you get the right one. > revert-generic_file_buffered_write-handle-zero-length-iovec-segments.patch > revert-generic_file_buffered_write-deadlock-on-vectored-write.patch > generic_file_buffered_write-cleanup.patch > mm-only-mm-debug-write-deadlocks.patch > mm-fix-pagecache-write-deadlocks.patch > mm-fix-pagecache-write-deadlocks-comment.patch > mm-fix-pagecache-write-deadlocks-xip.patch > mm-fix-pagecache-write-deadlocks-mm-pagecache-write-deadlocks-efault-fix.patch > mm-fix-pagecache-write-deadlocks-zerolength-fix.patch > mm-fix-pagecache-write-deadlocks-stale-holes-fix.patch > fs-prepare_write-fixes.patch > fs-prepare_write-fixes-fuse-fix.patch > fs-prepare_write-fixes-jffs-fix.patch > fs-prepare_write-fixes-fat-fix.patch > fs-fix-cont-vs-deadlock-patches.patch > > > > Index: linux-2.6/fs/buffer.c > > =================================================================== > > --- linux-2.6.orig/fs/buffer.c 2006-12-01 15:31:22.000000000 +1100 > > +++ linux-2.6/fs/buffer.c 2006-12-01 16:02:23.000000000 +1100 > > @@ -2102,6 +2102,7 @@ > > Please always use `diff -p' Dang, yes its much better. > > *bytes |= (blocksize-1); > > (*bytes)++; > > } > > + > > status = __block_prepare_write(inode, new_page, zerofrom, > > PAGE_CACHE_SIZE, get_block); > > if (status) > > @@ -2110,7 +2111,7 @@ > > memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); > > flush_dcache_page(new_page); > > kunmap_atomic(kaddr, KM_USER0); > > - generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE); > > + __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE); > > Whatever function this is doesn't need to update i_size? Yes, it is the code in cont_prepare_write that is expanding a hole at the end of file. We can do this now because fat_commit_write is now changed to call generic_commit_write in the case of a non-zero length. I think it is an improvement because now the file will not get arbitrarily extended in the case of a write failure somewhere down the track. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 3/3] fs: fix cont vs deadlock patches 2006-12-01 7:53 ` Nick Piggin @ 2006-12-01 14:50 ` OGAWA Hirofumi 2006-12-01 15:47 ` OGAWA Hirofumi 2006-12-02 0:36 ` Nick Piggin 0 siblings, 2 replies; 24+ messages in thread From: OGAWA Hirofumi @ 2006-12-01 14:50 UTC (permalink / raw) To: Nick Piggin; +Cc: Andrew Morton, Linux Kernel Mailing List, linux-fsdevel Nick Piggin <npiggin@suse.de> writes: >> > status = __block_prepare_write(inode, new_page, zerofrom, >> > PAGE_CACHE_SIZE, get_block); >> > if (status) >> > @@ -2110,7 +2111,7 @@ >> > memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); >> > flush_dcache_page(new_page); >> > kunmap_atomic(kaddr, KM_USER0); >> > - generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE); >> > + __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE); >> >> Whatever function this is doesn't need to update i_size? > > Yes, it is the code in cont_prepare_write that is expanding a hole > at the end of file. > > We can do this now because fat_commit_write is now changed to call > generic_commit_write in the case of a non-zero length. > > I think it is an improvement because now the file will not get > arbitrarily extended in the case of a write failure somewhere down > the track. Ah, unfortunately we can't this. If we don't update ->i_size before page_cache_release, pdflush will think these pages is outside ->i_size and just clean the page without writing it. -- OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 3/3] fs: fix cont vs deadlock patches 2006-12-01 14:50 ` OGAWA Hirofumi @ 2006-12-01 15:47 ` OGAWA Hirofumi 2006-12-02 0:36 ` Nick Piggin 1 sibling, 0 replies; 24+ messages in thread From: OGAWA Hirofumi @ 2006-12-01 15:47 UTC (permalink / raw) To: Nick Piggin; +Cc: Andrew Morton, Linux Kernel Mailing List, linux-fsdevel OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> writes: > Ah, unfortunately we can't this. If we don't update ->i_size before > page_cache_release, pdflush will think these pages is outside ->i_size > and just clean the page without writing it. Ugh, of course, s/page_cache_release/unlock_page/ -- OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 3/3] fs: fix cont vs deadlock patches 2006-12-01 14:50 ` OGAWA Hirofumi 2006-12-01 15:47 ` OGAWA Hirofumi @ 2006-12-02 0:36 ` Nick Piggin 2006-12-02 7:28 ` [new patch " Nick Piggin 1 sibling, 1 reply; 24+ messages in thread From: Nick Piggin @ 2006-12-02 0:36 UTC (permalink / raw) To: OGAWA Hirofumi Cc: Nick Piggin, Andrew Morton, Linux Kernel Mailing List, linux-fsdevel OGAWA Hirofumi wrote: > Nick Piggin <npiggin@suse.de> writes: > > >>>> status = __block_prepare_write(inode, new_page, zerofrom, >>>> PAGE_CACHE_SIZE, get_block); >>>> if (status) >>>>@@ -2110,7 +2111,7 @@ >>>> memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); >>>> flush_dcache_page(new_page); >>>> kunmap_atomic(kaddr, KM_USER0); >>>>- generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE); >>>>+ __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE); >>> >>>Whatever function this is doesn't need to update i_size? >> >>Yes, it is the code in cont_prepare_write that is expanding a hole >>at the end of file. >> >>We can do this now because fat_commit_write is now changed to call >>generic_commit_write in the case of a non-zero length. >> >>I think it is an improvement because now the file will not get >>arbitrarily extended in the case of a write failure somewhere down >>the track. > > > Ah, unfortunately we can't this. If we don't update ->i_size before > page_cache_release, pdflush will think these pages is outside ->i_size > and just clean the page without writing it. I see. I guess you need to synchronise your writepage versus this extention in order to handle it properly then. I won't bother with that though: it won't be worse than it was before. Thanks for review, do you agree with the other hunks? -- SUSE Labs, Novell Inc. Send instant messages to your online friends http://au.messenger.yahoo.com ^ permalink raw reply [flat|nested] 24+ messages in thread
* [new patch 3/3] fs: fix cont vs deadlock patches 2006-12-02 0:36 ` Nick Piggin @ 2006-12-02 7:28 ` Nick Piggin 2006-12-02 9:43 ` OGAWA Hirofumi 0 siblings, 1 reply; 24+ messages in thread From: Nick Piggin @ 2006-12-02 7:28 UTC (permalink / raw) To: Nick Piggin Cc: OGAWA Hirofumi, Andrew Morton, Linux Kernel Mailing List, linux-fsdevel > I see. I guess you need to synchronise your writepage versus this > extention in order to handle it properly then. I won't bother with > that though: it won't be worse than it was before. > > Thanks for review, do you agree with the other hunks? Well, Andrew's got the rest of the patches in his tree, so I'll send what we've got for now. Has had some testing on both reiserfs and fat. Doesn't look like the other filesystems using cont_prepare_write will have any problems... Andrew, please apply this patch as a replacement for the fat-fix patch in your rollup (this patch includes the same fix, and is a more logical change unit I think). -- Stop overloading zero length prepare/commit_write to request a file extend operation by the "cont" buffer code. Instead, have generic_cont_expand perform a zeroing operation on the last page, and cont_prepare_write naturally zeroes any previous pages required. Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> Reiserfs was trying to "extend" a file to something smaller than it already is with generic_cont_expand. This broke the above fix. Open code the prepare/ commit pair instead... maybe the code would be cleaner if reiserfs just did the operation explicitly (call get_block or whatever helper is used to unpack the tail)? Signed-off-by: Nick Piggin <npiggin@suse.de> Index: linux-2.6/fs/buffer.c =================================================================== --- linux-2.6.orig/fs/buffer.c +++ linux-2.6/fs/buffer.c @@ -2004,18 +2004,24 @@ int block_read_full_page(struct page *pa return 0; } -/* utility function for filesystems that need to do work on expanding +/* + * utility function for filesystems that need to do work on expanding * truncates. Uses prepare/commit_write to allow the filesystem to * deal with the hole. */ -static int __generic_cont_expand(struct inode *inode, loff_t size, - pgoff_t index, unsigned int offset) +int generic_cont_expand(struct inode *inode, loff_t size) { struct address_space *mapping = inode->i_mapping; + loff_t pos = inode->i_size; struct page *page; unsigned long limit; + pgoff_t index; + unsigned int from, to; + void *kaddr; int err; + WARN_ON(pos >= size); + err = -EFBIG; limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; if (limit != RLIM_INFINITY && size > (loff_t)limit) { @@ -2025,11 +2031,18 @@ static int __generic_cont_expand(struct if (size > inode->i_sb->s_maxbytes) goto out; + index = (size - 1) >> PAGE_CACHE_SHIFT; + to = size - ((loff_t)index << PAGE_CACHE_SHIFT); + if (index != (pos >> PAGE_CACHE_SHIFT)) + from = 0; + else + from = pos & (PAGE_CACHE_SIZE - 1); + err = -ENOMEM; page = grab_cache_page(mapping, index); if (!page) goto out; - err = mapping->a_ops->prepare_write(NULL, page, offset, offset); + err = mapping->a_ops->prepare_write(NULL, page, from, to); if (err) { /* * ->prepare_write() may have instantiated a few blocks @@ -2041,7 +2054,12 @@ static int __generic_cont_expand(struct goto out; } - err = mapping->a_ops->commit_write(NULL, page, offset, offset); + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + from, 0, to - from); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + + err = mapping->a_ops->commit_write(NULL, page, from, to); unlock_page(page); page_cache_release(page); @@ -2051,36 +2069,6 @@ out: return err; } -int generic_cont_expand(struct inode *inode, loff_t size) -{ - pgoff_t index; - unsigned int offset; - - offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */ - - /* ugh. in prepare/commit_write, if from==to==start of block, we - ** skip the prepare. make sure we never send an offset for the start - ** of a block - */ - if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { - /* caller must handle this extra byte. */ - offset++; - } - index = size >> PAGE_CACHE_SHIFT; - - return __generic_cont_expand(inode, size, index, offset); -} - -int generic_cont_expand_simple(struct inode *inode, loff_t size) -{ - loff_t pos = size - 1; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1; - - /* prepare/commit_write can handle even if from==to==start of block. */ - return __generic_cont_expand(inode, size, index, offset); -} - /* * For moronic filesystems that do not allow holes in file. * We may have to extend the file. @@ -3015,7 +3003,6 @@ EXPORT_SYMBOL(fsync_bdev); EXPORT_SYMBOL(generic_block_bmap); EXPORT_SYMBOL(generic_commit_write); EXPORT_SYMBOL(generic_cont_expand); -EXPORT_SYMBOL(generic_cont_expand_simple); EXPORT_SYMBOL(init_buffer); EXPORT_SYMBOL(invalidate_bdev); EXPORT_SYMBOL(ll_rw_block); Index: linux-2.6/include/linux/buffer_head.h =================================================================== --- linux-2.6.orig/include/linux/buffer_head.h +++ linux-2.6/include/linux/buffer_head.h @@ -203,7 +203,6 @@ int block_prepare_write(struct page*, un int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*, loff_t *); int generic_cont_expand(struct inode *inode, loff_t size); -int generic_cont_expand_simple(struct inode *inode, loff_t size); int block_commit_write(struct page *page, unsigned from, unsigned to); void block_sync_page(struct page *); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); Index: linux-2.6/fs/fat/file.c =================================================================== --- linux-2.6.orig/fs/fat/file.c +++ linux-2.6/fs/fat/file.c @@ -143,7 +143,7 @@ static int fat_cont_expand(struct inode loff_t start = inode->i_size, count = size - inode->i_size; int err; - err = generic_cont_expand_simple(inode, size); + err = generic_cont_expand(inode, size); if (err) goto out; Index: linux-2.6/fs/fat/inode.c =================================================================== --- linux-2.6.orig/fs/fat/inode.c +++ linux-2.6/fs/fat/inode.c @@ -151,7 +151,8 @@ static int fat_commit_write(struct file { struct inode *inode = page->mapping->host; int err; - if (to - from > 0) + + if (to - from == 0) return 0; err = generic_commit_write(file, page, from, to); Index: linux-2.6/fs/reiserfs/file.c =================================================================== --- linux-2.6.orig/fs/reiserfs/file.c +++ linux-2.6/fs/reiserfs/file.c @@ -892,6 +892,37 @@ static int reiserfs_submit_file_region_f return retval; } +/* To not overcomplicate matters, we just call prepare/commit_wrie + which will in turn call other stuff and finally will boil down to + reiserfs_get_block() that would do necessary conversion. */ +static int reiserfs_convert_tail(struct inode *inode, loff_t offset) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t index; + struct page *page; + int err = -ENOMEM; + + index = offset >> PAGE_CACHE_SHIFT; + page = grab_cache_page(mapping, index); + if (!page) + goto out; + + offset = offset & ~PAGE_CACHE_MASK; + WARN_ON(!offset); /* shouldn't pass zero length to prepare/commit */ + + err = mapping->a_ops->prepare_write(NULL, page, 0, offset); + if (err) + goto out_unlock; + + err = mapping->a_ops->commit_write(NULL, page, 0, offset); + +out_unlock: + unlock_page(page); + page_cache_release(page); +out: + return err; +} + /* Look if passed writing region is going to touch file's tail (if it is present). And if it is, convert the tail to unformatted node */ static int reiserfs_check_for_tail_and_convert(struct inode *inode, /* inode to deal with */ @@ -930,14 +961,12 @@ static int reiserfs_check_for_tail_and_c if (is_direct_le_ih(ih)) { /* Ok, closest item is file tail (tails are stored in "direct" * items), so we need to unpack it. */ - /* To not overcomplicate matters, we just call generic_cont_expand - which will in turn call other stuff and finally will boil down to - reiserfs_get_block() that would do necessary conversion. */ cont_expand_offset = le_key_k_offset(get_inode_item_key_version(inode), &(ih->ih_key)); pathrelse(&path); - res = generic_cont_expand(inode, cont_expand_offset); + + res = reiserfs_convert_tail(inode, cont_expand_offset); } else pathrelse(&path); ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [new patch 3/3] fs: fix cont vs deadlock patches 2006-12-02 7:28 ` [new patch " Nick Piggin @ 2006-12-02 9:43 ` OGAWA Hirofumi 0 siblings, 0 replies; 24+ messages in thread From: OGAWA Hirofumi @ 2006-12-02 9:43 UTC (permalink / raw) To: Nick Piggin Cc: Nick Piggin, Andrew Morton, Linux Kernel Mailing List, linux-fsdevel Nick Piggin <npiggin@suse.de> writes: >> I see. I guess you need to synchronise your writepage versus this >> extention in order to handle it properly then. I won't bother with >> that though: it won't be worse than it was before. >> >> Thanks for review, do you agree with the other hunks? > > Well, Andrew's got the rest of the patches in his tree, so I'll send > what we've got for now. Has had some testing on both reiserfs and > fat. Doesn't look like the other filesystems using cont_prepare_write > will have any problems... > > Andrew, please apply this patch as a replacement for the fat-fix > patch in your rollup (this patch includes the same fix, and is a > more logical change unit I think). I'm confused. I couldn't track what is final patchset. Anyway, I'll see and test your final patchset in -mm. Thanks. -- OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> ^ permalink raw reply [flat|nested] 24+ messages in thread
* [patch 0/3] more buffered write fixes 2006-11-30 7:20 [patch 1/3] mm: pagecache write deadlocks zerolength fix Nick Piggin 2006-11-30 7:22 ` [patch 2/3] mm: pagecache write deadlocks stale holes fix Nick Piggin @ 2006-11-30 7:26 ` Nick Piggin 2006-11-30 10:15 ` [patch 1/3] mm: pagecache write deadlocks zerolength fix Andreas Schwab 2 siblings, 0 replies; 24+ messages in thread From: Nick Piggin @ 2006-11-30 7:26 UTC (permalink / raw) To: Linux Kernel Mailing List, Andrew Morton, linux-fsdevel Sorry, I should give some background. The following patches attempt to fix the problems people have identified with buffered write deadlock patches. Against 2.6.19 + the previous patchset dropped from -mm. Comments? ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 1/3] mm: pagecache write deadlocks zerolength fix 2006-11-30 7:20 [patch 1/3] mm: pagecache write deadlocks zerolength fix Nick Piggin 2006-11-30 7:22 ` [patch 2/3] mm: pagecache write deadlocks stale holes fix Nick Piggin 2006-11-30 7:26 ` [patch 0/3] more buffered write fixes Nick Piggin @ 2006-11-30 10:15 ` Andreas Schwab 2006-11-30 10:19 ` Nick Piggin 2 siblings, 1 reply; 24+ messages in thread From: Andreas Schwab @ 2006-11-30 10:15 UTC (permalink / raw) To: Nick Piggin; +Cc: Linux Kernel Mailing List, Andrew Morton, linux-fsdevel Nick Piggin <npiggin@suse.de> writes: > writev with a zero-length segment is a noop, and we shouldn't return EFAULT. AFAICS the callers of these functions never pass a zero length. Andreas. -- Andreas Schwab, SuSE Labs, schwab@suse.de SuSE Linux Products GmbH, Maxfeldstraße 5, 90409 Nürnberg, Germany PGP key fingerprint = 58CA 54C7 6D53 942B 1756 01D3 44D5 214B 8276 4ED5 "And now for something completely different." - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 1/3] mm: pagecache write deadlocks zerolength fix 2006-11-30 10:15 ` [patch 1/3] mm: pagecache write deadlocks zerolength fix Andreas Schwab @ 2006-11-30 10:19 ` Nick Piggin 2006-11-30 10:30 ` Andreas Schwab 0 siblings, 1 reply; 24+ messages in thread From: Nick Piggin @ 2006-11-30 10:19 UTC (permalink / raw) To: Andreas Schwab; +Cc: Linux Kernel Mailing List, Andrew Morton, linux-fsdevel On Thu, Nov 30, 2006 at 11:15:39AM +0100, Andreas Schwab wrote: > Nick Piggin <npiggin@suse.de> writes: > > > writev with a zero-length segment is a noop, and we shouldn't return EFAULT. > > AFAICS the callers of these functions never pass a zero length. They can in the case of a zero length write. I had considered also doing this check in the caller, but I don't think it is too harmful to make the API a little more robust? But if you have another preference? Thanks, Nick ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 1/3] mm: pagecache write deadlocks zerolength fix 2006-11-30 10:19 ` Nick Piggin @ 2006-11-30 10:30 ` Andreas Schwab 2006-11-30 11:30 ` Nick Piggin 0 siblings, 1 reply; 24+ messages in thread From: Andreas Schwab @ 2006-11-30 10:30 UTC (permalink / raw) To: Nick Piggin; +Cc: Linux Kernel Mailing List, Andrew Morton, linux-fsdevel Nick Piggin <npiggin@suse.de> writes: > On Thu, Nov 30, 2006 at 11:15:39AM +0100, Andreas Schwab wrote: >> Nick Piggin <npiggin@suse.de> writes: >> >> > writev with a zero-length segment is a noop, and we shouldn't return EFAULT. >> >> AFAICS the callers of these functions never pass a zero length. > > They can in the case of a zero length write. How? All (indirect) callers I could find explicitly handle the zero-length case. Andreas. -- Andreas Schwab, SuSE Labs, schwab@suse.de SuSE Linux Products GmbH, Maxfeldstraße 5, 90409 Nürnberg, Germany PGP key fingerprint = 58CA 54C7 6D53 942B 1756 01D3 44D5 214B 8276 4ED5 "And now for something completely different." - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 1/3] mm: pagecache write deadlocks zerolength fix 2006-11-30 10:30 ` Andreas Schwab @ 2006-11-30 11:30 ` Nick Piggin 0 siblings, 0 replies; 24+ messages in thread From: Nick Piggin @ 2006-11-30 11:30 UTC (permalink / raw) To: Andreas Schwab; +Cc: Linux Kernel Mailing List, Andrew Morton, linux-fsdevel On Thu, Nov 30, 2006 at 11:30:33AM +0100, Andreas Schwab wrote: > Nick Piggin <npiggin@suse.de> writes: > > > On Thu, Nov 30, 2006 at 11:15:39AM +0100, Andreas Schwab wrote: > >> Nick Piggin <npiggin@suse.de> writes: > >> > >> > writev with a zero-length segment is a noop, and we shouldn't return EFAULT. > >> > >> AFAICS the callers of these functions never pass a zero length. > > > > They can in the case of a zero length write. > > How? All (indirect) callers I could find explicitly handle the > zero-length case. Sorry, zero length iov to writev (just had to double-check there ;). ^ permalink raw reply [flat|nested] 24+ messages in thread
end of thread, other threads:[~2006-12-02 9:43 UTC | newest] Thread overview: 24+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2006-11-30 7:20 [patch 1/3] mm: pagecache write deadlocks zerolength fix Nick Piggin 2006-11-30 7:22 ` [patch 2/3] mm: pagecache write deadlocks stale holes fix Nick Piggin 2006-11-30 7:22 ` [patch 3/3] fs: fix cont vs deadlock patches Nick Piggin 2006-11-30 11:32 ` Nick Piggin 2006-11-30 22:14 ` OGAWA Hirofumi 2006-12-01 0:27 ` Nick Piggin 2006-12-01 1:11 ` OGAWA Hirofumi 2006-12-01 2:03 ` Nick Piggin 2006-12-01 2:09 ` Nick Piggin 2006-12-01 3:41 ` OGAWA Hirofumi 2006-12-01 3:47 ` Nick Piggin 2006-12-01 5:08 ` Nick Piggin 2006-12-01 7:21 ` Andrew Morton 2006-12-01 7:53 ` Nick Piggin 2006-12-01 14:50 ` OGAWA Hirofumi 2006-12-01 15:47 ` OGAWA Hirofumi 2006-12-02 0:36 ` Nick Piggin 2006-12-02 7:28 ` [new patch " Nick Piggin 2006-12-02 9:43 ` OGAWA Hirofumi 2006-11-30 7:26 ` [patch 0/3] more buffered write fixes Nick Piggin 2006-11-30 10:15 ` [patch 1/3] mm: pagecache write deadlocks zerolength fix Andreas Schwab 2006-11-30 10:19 ` Nick Piggin 2006-11-30 10:30 ` Andreas Schwab 2006-11-30 11:30 ` Nick Piggin
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).