Christoph, We've fixed the source of the assertion (that was the bugs in xfs_buf_associate_memory()) so I'm pushing your buffer lock removal patch back in again. While looking through it I found a couple of issues: - It called unlock_page() before calls to PagePrivate() and PageUptodate(). I think the page needs to be locked during these calls so I moved the unlock_page() further down. - Unlocking the pages as we go can cause a double unlock in the error handling for a NULL page in the XBF_READ_AHEAD case so I removed the unlocking code for that case. Would you mind checking these changes? Lachlan Lachlan McIlroy wrote: > Christoph, > > We've had to reverse this change because it's caused a regression. > We haven't been able to identify why we see the following assertion > trigger with these changes but the assertion goes away without the > changes. Until we figure out why we'll have to leave the buffer > locking in. > > <5>XFS mounting filesystem hdb2 > <5>Starting XFS recovery on filesystem: hdb2 (logdev: internal) > <4>XFS: xlog_recover_process_data: bad clientid > <4>Assertion failed: 0, file: fs/xfs/xfs_log_recover.c, line: 2912 > <0>------------[ cut here ]------------ > <2>kernel BUG at fs/xfs/support/debug.c:81! > <0>invalid opcode: 0000 [#1] > <0>SMP > <4>Modules linked in: > <0>CPU: 2 > <0>EIP: 0060:[] Not tainted VLI > <0>EFLAGS: 00010286 (2.6.23-kali-26_xfs-debug #1) > <0>EIP is at assfail+0x1e/0x22 > <0>eax: 00000043 ebx: f3002a50 ecx: 00000001 edx: 00000086 > <0>esi: f56e2300 edi: f8fa5c28 ebp: efa67ae4 esp: efa67ad4 > <0>ds: 007b es: 007b fs: 00d8 gs: 0033 ss: 0068 > <0>Process mount (pid: 15191, ti=efa66000 task=f7b43570 task.ti=efa66000) > <0>Stack: c05c8bda c05c6762 c05c4750 00000b60 efa67b1c c0269a35 00000004 > c05c5903 > <0> f3a14000 efa67ba8 f7e458c0 f8fa5c34 efa67bb8 f8fa6a38 0000000d > 00001e00 > <0> f8fa4000 00000000 efa67bf4 c026a566 f8fa4000 00000001 00000651 > 00000000 > <0>Call Trace: > <0> [] show_trace_log_lvl+0x1a/0x2f > <0> [] show_stack_log_lvl+0x9b/0xa3 > <0> [] show_registers+0x1b9/0x28b > <0> [] die+0x119/0x27b > <0> [] do_trap+0x8a/0xa3 > <0> [] do_invalid_op+0x88/0x92 > <0> [] error_code+0x72/0x78 > <0> [] xlog_recover_process_data+0x6a/0x1ff > <0> [] xlog_do_recovery_pass+0x810/0x9f3 > <0> [] xlog_do_log_recovery+0x62/0xe2 > <0> [] xlog_do_recover+0x1d/0x187 > <0> [] xlog_recover+0x88/0x95 > <0> [] xfs_log_mount+0x100/0x144 > <0> [] xfs_mountfs+0x278/0x639 > <0> [] xfs_mount+0x25c/0x2f7 > <0> [] xfs_fs_fill_super+0xab/0x1fd > <0> [] get_sb_bdev+0xd6/0x114 > <0> [] xfs_fs_get_sb+0x21/0x27 > <0> [] vfs_kern_mount+0x41/0x7a > <0> [] do_kern_mount+0x37/0xbd > <0> [] do_mount+0x566/0x5c0 > <0> [] sys_mount+0x6f/0xa9 > <0> [] sysenter_past_esp+0x5f/0x85 > <0> ======================= > <0>Code: 04 24 10 00 00 00 e8 2a e7 03 00 c9 c3 55 89 e5 83 ec 10 89 4c > 24 0c 89 54 24 08 89 44 24 04 c7 04 24 da 8b 5c c0 e8 07 bf e9 ff <0f> > 0b eb fe 55 83 e0 07 89 e5 57 bf 07 00 00 00 56 89 d6 53 89 > <0>EIP: [] assfail+0x1e/0x22 SS:ESP 0068:efa67ad4 > > Lachlan > > Christoph Hellwig wrote: >> There is no need to lock any page in xfs_buf.c because we operate >> on our own address_space and all locking is covered by the buffer >> semaphore. If we ever switch back to main blockdeive address_space >> as suggested e.g. for fsblock with a similar scheme the locking will >> have to be totally revised anyway because the current scheme is >> neither correct nor coherent with itself. >> >> >> Signed-off-by: Christoph Hellwig >> >> Index: linux-2.6-xfs/fs/xfs/linux-2.6/xfs_buf.c >> =================================================================== >> --- linux-2.6-xfs.orig/fs/xfs/linux-2.6/xfs_buf.c 2007-09-23 >> 13:28:00.000000000 +0200 >> +++ linux-2.6-xfs/fs/xfs/linux-2.6/xfs_buf.c 2007-09-23 >> 14:13:43.000000000 +0200 >> @@ -396,6 +396,7 @@ _xfs_buf_lookup_pages( >> congestion_wait(WRITE, HZ/50); >> goto retry; >> } >> + unlock_page(page); >> >> XFS_STATS_INC(xb_page_found); >> >> @@ -405,10 +406,7 @@ _xfs_buf_lookup_pages( >> ASSERT(!PagePrivate(page)); >> if (!PageUptodate(page)) { >> page_count--; >> - if (blocksize >= PAGE_CACHE_SIZE) { >> - if (flags & XBF_READ) >> - bp->b_locked = 1; >> - } else if (!PagePrivate(page)) { >> + if (blocksize < PAGE_CACHE_SIZE && !PagePrivate(page)) { >> if (test_page_region(page, offset, nbytes)) >> page_count++; >> } >> @@ -418,11 +416,6 @@ _xfs_buf_lookup_pages( >> offset = 0; >> } >> >> - if (!bp->b_locked) { >> - for (i = 0; i < bp->b_page_count; i++) >> - unlock_page(bp->b_pages[i]); >> - } >> - >> if (page_count == bp->b_page_count) >> bp->b_flags |= XBF_DONE; >> >> @@ -747,7 +740,6 @@ xfs_buf_associate_memory( >> bp->b_page_count = ++i; >> ptr += PAGE_CACHE_SIZE; >> } >> - bp->b_locked = 0; >> >> bp->b_count_desired = bp->b_buffer_length = len; >> bp->b_flags |= XBF_MAPPED; >> @@ -1093,25 +1085,13 @@ xfs_buf_iostart( >> return status; >> } >> >> -STATIC_INLINE int >> -_xfs_buf_iolocked( >> - xfs_buf_t *bp) >> -{ >> - ASSERT(bp->b_flags & (XBF_READ | XBF_WRITE)); >> - if (bp->b_flags & XBF_READ) >> - return bp->b_locked; >> - return 0; >> -} >> - >> STATIC_INLINE void >> _xfs_buf_ioend( >> xfs_buf_t *bp, >> int schedule) >> { >> - if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { >> - bp->b_locked = 0; >> + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) >> xfs_buf_ioend(bp, schedule); >> - } >> } >> >> STATIC int >> @@ -1146,10 +1126,6 @@ xfs_buf_bio_end_io( >> >> if (--bvec >= bio->bi_io_vec) >> prefetchw(&bvec->bv_page->flags); >> - >> - if (_xfs_buf_iolocked(bp)) { >> - unlock_page(page); >> - } >> } while (bvec >= bio->bi_io_vec); >> >> _xfs_buf_ioend(bp, 1); >> @@ -1161,13 +1137,12 @@ STATIC void >> _xfs_buf_ioapply( >> xfs_buf_t *bp) >> { >> - int i, rw, map_i, total_nr_pages, nr_pages; >> + int rw, map_i, total_nr_pages, nr_pages; >> struct bio *bio; >> int offset = bp->b_offset; >> int size = bp->b_count_desired; >> sector_t sector = bp->b_bn; >> unsigned int blocksize = bp->b_target->bt_bsize; >> - int locking = _xfs_buf_iolocked(bp); >> >> total_nr_pages = bp->b_page_count; >> map_i = 0; >> @@ -1190,7 +1165,7 @@ _xfs_buf_ioapply( >> * filesystem block size is not smaller than the page size. >> */ >> if ((bp->b_buffer_length < PAGE_CACHE_SIZE) && >> - (bp->b_flags & XBF_READ) && locking && >> + (bp->b_flags & XBF_READ) && >> (blocksize >= PAGE_CACHE_SIZE)) { >> bio = bio_alloc(GFP_NOIO, 1); >> >> @@ -1207,24 +1182,6 @@ _xfs_buf_ioapply( >> goto submit_io; >> } >> >> - /* Lock down the pages which we need to for the request */ >> - if (locking && (bp->b_flags & XBF_WRITE) && (bp->b_locked == 0)) { >> - for (i = 0; size; i++) { >> - int nbytes = PAGE_CACHE_SIZE - offset; >> - struct page *page = bp->b_pages[i]; >> - >> - if (nbytes > size) >> - nbytes = size; >> - >> - lock_page(page); >> - >> - size -= nbytes; >> - offset = 0; >> - } >> - offset = bp->b_offset; >> - size = bp->b_count_desired; >> - } >> - >> next_chunk: >> atomic_inc(&bp->b_io_remaining); >> nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); >> Index: linux-2.6-xfs/fs/xfs/linux-2.6/xfs_buf.h >> =================================================================== >> --- linux-2.6-xfs.orig/fs/xfs/linux-2.6/xfs_buf.h 2007-09-05 >> 11:17:42.000000000 +0200 >> +++ linux-2.6-xfs/fs/xfs/linux-2.6/xfs_buf.h 2007-09-23 >> 14:04:36.000000000 +0200 >> @@ -143,7 +143,6 @@ typedef struct xfs_buf { >> void *b_fspriv2; >> void *b_fspriv3; >> unsigned short b_error; /* error code on I/O */ >> - unsigned short b_locked; /* page array is locked */ >> unsigned int b_page_count; /* size of page array */ >> unsigned int b_offset; /* page offset in first page */ >> struct page **b_pages; /* array of page pointers */ >> Index: linux-2.6-xfs/fs/xfs/xfsidbg.c >> =================================================================== >> --- linux-2.6-xfs.orig/fs/xfs/xfsidbg.c 2007-09-23 >> 13:33:07.000000000 +0200 >> +++ linux-2.6-xfs/fs/xfs/xfsidbg.c 2007-09-23 14:04:36.000000000 +0200 >> @@ -2110,9 +2110,9 @@ print_xfs_buf( >> (unsigned long long) bp->b_file_offset, >> (unsigned long long) bp->b_buffer_length, >> bp->b_addr); >> - kdb_printf(" b_bn 0x%llx b_count_desired 0x%lx b_locked %d\n", >> + kdb_printf(" b_bn 0x%llx b_count_desired 0x%lxn", >> (unsigned long long)bp->b_bn, >> - (unsigned long) bp->b_count_desired, (int)bp->b_locked); >> + (unsigned long) bp->b_count_desired); >> kdb_printf(" b_queuetime %ld (now=%ld/age=%ld) b_io_remaining >> %d\n", >> bp->b_queuetime, jiffies, bp->b_queuetime + age, >> bp->b_io_remaining.counter); >> >> >> > > >