linux-fsdevel.vger.kernel.org archive mirror
* endless sync on bdi_sched_wait()? 2.6.33.1
@ 2010-03-31 16:07 Denys Fedorysychenko
  2010-03-31 22:12 ` Dave Chinner
  2010-04-08  9:28 ` Jan Kara
  0 siblings, 2 replies; 16+ messages in thread
From: Denys Fedorysychenko @ 2010-03-31 16:07 UTC (permalink / raw)
  To: Alexander Viro, linux-fsdevel, linux-kernel

I have a proxy server with a heavily "loaded" squid. At some point I ran sync,
expecting it to finish in a reasonable time. I waited more than 30 minutes and
"sync" was still running. This can be reproduced easily.

Here is some stats and info:

Linux SUPERPROXY 2.6.33.1-build-0051 #16 SMP Wed Mar 31 17:23:28 EEST 2010 
i686 GNU/Linux

SUPERPROXY ~ # iostat -k -x -d 30
Linux 2.6.33.1-build-0051 (SUPERPROXY)  03/31/10        _i686_  (4 CPU)

Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s avgrq-sz 
avgqu-sz   await  svctm  %util
sda               0.16     0.01    0.08    0.03     3.62     1.33    88.94     
0.15 1389.89  59.15   0.66
sdb               4.14    61.25    6.22   25.55    44.52   347.21    24.66     
2.24   70.60   2.36   7.49
sdc               4.37   421.28    9.95   98.31   318.27  2081.95    44.34    
20.93  193.21   2.31  24.96
sdd               2.34   339.90    3.97  117.47    95.48  1829.52    31.70     
1.73   14.23   8.09  98.20
sde               2.29    71.40    2.34   27.97    22.56   397.81    27.74     
2.34   77.34   1.66   5.04
dm-0              0.00     0.00    0.19    0.02     3.48     0.02    32.96     
0.05  252.11  28.05   0.60

Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s avgrq-sz 
avgqu-sz   await  svctm  %util
sda               0.00     0.00    0.00    0.00     0.00     0.00     0.00     
0.00    0.00   0.00   0.00
sdb               0.00    54.67    2.93   26.87    12.27   326.13    22.71     
2.19   73.49   1.91   5.68
sdc               0.00   420.50    3.43  110.53   126.40  2127.73    39.56    
23.82  209.00   2.06  23.44
sdd               0.00   319.63    2.30  122.03   121.87  1765.87    30.37     
1.72   13.83   7.99  99.37
sde               0.00    71.67    0.83   30.63     6.93   409.33    26.46     
2.66   84.68   1.51   4.76
dm-0              0.00     0.00    0.00    0.00     0.00     0.00     0.00     
0.00    0.00   0.00   0.00


CPU:  8.4% usr  7.7% sys  0.0% nic 50.7% idle 27.7% io  0.6% irq  4.7% sirq
Load average: 5.57 4.82 4.46 2/243 2032
  PID  PPID USER     STAT   VSZ %MEM CPU %CPU COMMAND
 1769  1552 squid    R     668m  8.3   3 11.7 /usr/sbin/squid -N
 1546  1545 root     R    10800  0.1   2  6.0 /config/globax 
/config/globax.conf
 1549  1548 root     S    43264  0.5   2  1.5 /config/globax /config/globax-
dld.conf
 1531     2 root     DW       0  0.0   0  0.3 [jbd2/sdd1-8]
 1418     1 root     S     2500  0.0   3  0.0 /sbin/syslogd -R 80.83.17.2
 1524     2 root     SW       0  0.0   0  0.0 [flush-8:32]
 1525     2 root     SW       0  0.0   1  0.0 [jbd2/sdc1-8]
 1604     2 root     DW       0  0.0   0  0.0 [flush-8:48]
 1537     2 root     SW       0  0.0   1  0.0 [jbd2/sde1-8]
   18     2 root     SW       0  0.0   3  0.0 [events/3]
 1545     1 root     S     3576  0.0   1  0.0 /config/globax 
/config/globax.conf
 1548     1 root     S     3576  0.0   0  0.0 /config/globax /config/globax-
dld.conf
 1918     1 ntp      S     3316  0.0   3  0.0 /usr/sbin/ntpd -s
 1919     1 root     S     3268  0.0   3  0.0 /usr/sbin/ntpd -s
    1     0 root     S     2504  0.0   0  0.0 /bin/sh /init trynew trynew 
trynew trynew
 1923  1257 root     S     2504  0.0   1  0.0 /sbin/getty 38400 tty1
 1924  1257 root     S     2504  0.0   0  0.0 /sbin/getty 38400 tty2
 1927  1257 root     S     2504  0.0   0  0.0 /sbin/getty 38400 tty3
 2015  2014 root     S     2504  0.0   1  0.0 -ash
 2032  2015 root     R     2504  0.0   3  0.0 top
 1584     1 root     S     2500  0.0   1  0.0 /usr/bin/ifplugd -i eth0 -a -r 
/etc/startup/rc.ifup -t 1 -u 1 -d 1
 1592     1 root     S     2500  0.0   1  0.0 /usr/bin/ifplugd -i eth2 -a -r 
/etc/startup/rc.ifup -t 1 -u 1 -d 1
 1587     1 root     S     2500  0.0   1  0.0 /usr/bin/ifplugd -i eth1 -a -r 
/etc/startup/rc.ifup -t 1 -u 1 -d 1
 1595     1 root     S     2500  0.0   1  0.0 /usr/bin/ifplugd -i eth3 -a -r 
/etc/startup/rc.ifup -t 1 -u 1 -d 1
 1257     1 root     S     2500  0.0   0  0.0 init
 1420     1 root     S     2500  0.0   1  0.0 /sbin/klogd
 1432     1 root     S     2500  0.0   3  0.0 /usr/sbin/telnetd -f 
/etc/issue.telnet
 1552     1 root     S     2500  0.0   1  0.0 /bin/sh /bin/squidloop
 1743  1742 root     S     2500  0.0   3  0.0 ash -c gs newkernel
 1744  1743 root     S     2500  0.0   0  0.0 /bin/sh /bin/gs newkernel
 1753  1744 root     D     2368  0.0   0  0.0 sync


SUPERPROXY ~ # cat /proc/1753/stack
[<c019a93c>] bdi_sched_wait+0x8/0xc
[<c019a807>] wait_on_bit+0x20/0x2c
[<c019a9af>] sync_inodes_sb+0x6f/0x10a
[<c019dd53>] __sync_filesystem+0x28/0x49
[<c019ddf3>] sync_filesystems+0x7f/0xc0
[<c019de7a>] sys_sync+0x1b/0x2d
[<c02f7a25>] syscall_call+0x7/0xb
[<ffffffff>] 0xffffffff
SUPERPROXY ~ # cat /proc/1753/status
Name:   sync
State:  D (disk sleep)
Tgid:   1753
Pid:    1753
PPid:   1744
TracerPid:      0
Uid:    0       0       0       0
Gid:    0       0       0       0
FDSize: 32
Groups: 0
VmPeak:     2368 kB
VmSize:     2368 kB
VmLck:         0 kB
VmHWM:       408 kB
VmRSS:       408 kB
VmData:      192 kB
VmStk:        84 kB
VmExe:       488 kB
VmLib:      1540 kB
VmPTE:        12 kB
Threads:        1
SigQ:   2/127824
SigPnd: 0000000000000000
ShdPnd: 0000000000000000
SigBlk: 0000000000000000
SigIgn: 0000000000000000
SigCgt: 0000000000000000
CapInh: 0000000000000000
CapPrm: ffffffffffffffff
CapEff: ffffffffffffffff
CapBnd: ffffffffffffffff
Cpus_allowed:   f
Cpus_allowed_list:      0-3
Mems_allowed:   1
Mems_allowed_list:      0
voluntary_ctxt_switches:        124
nonvoluntary_ctxt_switches:     2

SUPERPROXY ~ # cat /proc/meminfo
MemTotal:        8186788 kB
MemFree:         3758908 kB
Buffers:          373656 kB
Cached:          3122824 kB
SwapCached:            0 kB
Active:          1851000 kB
Inactive:        2152916 kB
Active(anon):     446516 kB
Inactive(anon):    63140 kB
Active(file):    1404484 kB
Inactive(file):  2089776 kB
Unevictable:        2152 kB
Mlocked:               0 kB
HighTotal:       7346120 kB
HighFree:        3720868 kB
LowTotal:         840668 kB
LowFree:           38040 kB
SwapTotal:             0 kB
SwapFree:              0 kB
Dirty:            109540 kB
Writeback:          1036 kB
AnonPages:        509648 kB
Mapped:             2320 kB
Shmem:                 8 kB
Slab:             263512 kB
SReclaimable:     129176 kB
SUnreclaim:       134336 kB
KernelStack:         980 kB
PageTables:         1460 kB
NFS_Unstable:          0 kB
Bounce:                0 kB
WritebackTmp:          0 kB
CommitLimit:     4093392 kB
Committed_AS:     746808 kB
VmallocTotal:     122880 kB
VmallocUsed:        2860 kB
VmallocChunk:     113308 kB
DirectMap4k:        4088 kB
DirectMap2M:      907264 kB

"Tuning" (that probably is wrong?)
SCHED="cfq"
echo ${SCHED} >/sys/block/sdb/queue/scheduler
echo ${SCHED} >/sys/block/sdc/queue/scheduler
echo ${SCHED} >/sys/block/sdd/queue/scheduler
echo ${SCHED} >/sys/block/sde/queue/scheduler
echo ${SCHED} >/sys/block/sdf/queue/scheduler

echo 0 >/sys/block/sdb/queue/iosched/low_latency
echo 0 >/sys/block/sdc/queue/iosched/low_latency
echo 0 >/sys/block/sdd/queue/iosched/low_latency
echo 0 >/sys/block/sde/queue/iosched/low_latency
echo 0 >/sys/block/sdf/queue/iosched/low_latency


echo 128 >/sys/block/sdb/queue/iosched/quantum
echo 128 >/sys/block/sdc/queue/iosched/quantum
echo 128 >/sys/block/sdd/queue/iosched/quantum
echo 128 >/sys/block/sde/queue/iosched/quantum
echo 128 >/sys/block/sdf/queue/iosched/quantum

sysctl -w vm.dirty_background_bytes=600000000
sysctl -w vm.dirty_bytes=800000000

rootfs on / type rootfs (rw)
proc on /proc type proc (rw,relatime)
sysfs on /sys type sysfs (rw,relatime)
none on /dev/pts type devpts (rw,relatime,mode=600)
none on /proc/bus/usb type usbfs (rw,relatime)
/dev/mapper/root on /mnt/flash type ext3 
(rw,noatime,errors=continue,data=writeback)
/dev/loop0 on /lib/modules/2.6.33.1-build-0051 type squashfs (ro,relatime)
none on /sys/kernel/debug type debugfs (rw,relatime)
/dev/sdb2 on /cache1 type ext4 
(rw,noatime,nodiratime,barrier=0,journal_async_commit,nobh,data=writeback)
/dev/sdc1 on /cache2 type ext4 
(rw,noatime,nodiratime,barrier=0,journal_async_commit,nobh,data=writeback)
/dev/sdd1 on /cache3 type ext4 
(rw,noatime,nodiratime,barrier=0,journal_async_commit,nobh,data=writeback)
/dev/sde1 on /cache4 type ext4 
(rw,noatime,nodiratime,barrier=0,journal_async_commit,nobh,data=writeback)
/dev/sda1 on /mnt/boot type ext2 (rw,relatime,errors=continue)


* Re: endless sync on bdi_sched_wait()? 2.6.33.1
  2010-03-31 16:07 endless sync on bdi_sched_wait()? 2.6.33.1 Denys Fedorysychenko
@ 2010-03-31 22:12 ` Dave Chinner
  2010-04-01 10:42   ` Denys Fedorysychenko
  2010-04-08  9:28 ` Jan Kara
  1 sibling, 1 reply; 16+ messages in thread
From: Dave Chinner @ 2010-03-31 22:12 UTC (permalink / raw)
  To: Denys Fedorysychenko; +Cc: Alexander Viro, linux-fsdevel, linux-kernel

On Wed, Mar 31, 2010 at 07:07:31PM +0300, Denys Fedorysychenko wrote:
> I have a proxy server with a heavily "loaded" squid. At some point I ran sync,
> expecting it to finish in a reasonable time. I waited more than 30 minutes and
> "sync" was still running. This can be reproduced easily.
> 
> Here is some stats and info:
> 
> Linux SUPERPROXY 2.6.33.1-build-0051 #16 SMP Wed Mar 31 17:23:28 EEST 2010 
> i686 GNU/Linux
> 
> SUPERPROXY ~ # iostat -k -x -d 30
> Linux 2.6.33.1-build-0051 (SUPERPROXY)  03/31/10        _i686_  (4 CPU)
> 
> Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s avgrq-sz 
> avgqu-sz   await  svctm  %util
> sda               0.16     0.01    0.08    0.03     3.62     1.33    88.94     
> 0.15 1389.89  59.15   0.66
> sdb               4.14    61.25    6.22   25.55    44.52   347.21    24.66     
> 2.24   70.60   2.36   7.49
> sdc               4.37   421.28    9.95   98.31   318.27  2081.95    44.34    
> 20.93  193.21   2.31  24.96
> sdd               2.34   339.90    3.97  117.47    95.48  1829.52    31.70     
> 1.73   14.23   8.09  98.20
                 ^^^^  ^^^^^

/dev/sdd is IO bound doing small random writeback IO. A service time
of 8ms implies that it is doing lots of large seeks. If you've got
GBs of data to sync and that's the writeback pattern, then sync will
most definitely take a long, long time.

It may be that ext4 is allocating blocks far apart rather than close
together (as appears to be the case for /dev/sdc), so maybe this is
related to how the filesystems are aging or how full they are...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: endless sync on bdi_sched_wait()? 2.6.33.1
  2010-03-31 22:12 ` Dave Chinner
@ 2010-04-01 10:42   ` Denys Fedorysychenko
  2010-04-01 11:13     ` Dave Chinner
  0 siblings, 1 reply; 16+ messages in thread
From: Denys Fedorysychenko @ 2010-04-01 10:42 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Alexander Viro, linux-fsdevel, linux-kernel

On Thursday 01 April 2010 01:12:54 Dave Chinner wrote:
> On Wed, Mar 31, 2010 at 07:07:31PM +0300, Denys Fedorysychenko wrote:
> > I have a proxy server with a heavily "loaded" squid. At some point I ran
> > sync, expecting it to finish in a reasonable time. I waited more than 30
> > minutes and "sync" was still running. This can be reproduced easily.
> >
> > Here is some stats and info:
> >
> > Linux SUPERPROXY 2.6.33.1-build-0051 #16 SMP Wed Mar 31 17:23:28 EEST
> > 2010 i686 GNU/Linux
> >
> > SUPERPROXY ~ # iostat -k -x -d 30
> > Linux 2.6.33.1-build-0051 (SUPERPROXY)  03/31/10        _i686_  (4 CPU)
> >
> > Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s
> > avgrq-sz avgqu-sz   await  svctm  %util
> > sda               0.16     0.01    0.08    0.03     3.62     1.33   
> > 88.94 0.15 1389.89  59.15   0.66
> > sdb               4.14    61.25    6.22   25.55    44.52   347.21   
> > 24.66 2.24   70.60   2.36   7.49
> > sdc               4.37   421.28    9.95   98.31   318.27  2081.95   
> > 44.34 20.93  193.21   2.31  24.96
> > sdd               2.34   339.90    3.97  117.47    95.48  1829.52   
> > 31.70 1.73   14.23   8.09  98.20
> 
>                  ^^^^  ^^^^^
> 
> /dev/sdd is IO bound doing small random writeback IO. A service time
> of 8ms implies that it is doing lots of large seeks. If you've got
> GBs of data to sync and that's the writeback pattern, then sync will
> most definitely take a long, long time.
> 
> It may be that ext4 is allocating blocks far apart rather than close
> together (as appears to be the case for /dev/sdc), so maybe this is
> related to how the filesystems are aging or how full they are...
That's correct, it is quite a busy cache server.

Well, if I stop squid (the cache), sync finishes quickly enough.
If I don't, it takes more than an hour. Actually, I left that PC after 1 hour
and it still hadn't finished. I don't think that is normal.
Probably sync keeps picking up newly dirtied data and trying to flush it too,
and before it finishes, more data comes in.
Actually, all I need is to sync the config directory. I cannot use fsync,
because the files were opened earlier by other processes, and sync does this
trick for me. I ended up with a stuck process, and the only fast way to recover
the system was to kill the cache process so the I/O pumping stops for a while
and sync() gets a chance to finish.
Sure, there is a way to just remount the config partition read-only, but I would
expect plain sync to flush only the currently dirty buffer cache pages.

I will do more tests now and will give exact numbers: how much time it needs
with squid running, and how much if I kill it shortly after starting sync.
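
For reference, the per-file fsync route I am ruling out above would be a walk
roughly like this (sketch only, with a hypothetical /config path):

#define _XOPEN_SOURCE 500
#include <fcntl.h>
#include <ftw.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

/*
 * Sketch only, hypothetical /config path: fsync every regular file under
 * the directory instead of calling sync(2). Shown just to illustrate the
 * per-file route mentioned above.
 */
static int sync_one(const char *path, const struct stat *sb,
                    int type, struct FTW *ftwbuf)
{
        int fd;

        (void)sb;
        (void)ftwbuf;
        if (type != FTW_F)              /* regular files only */
                return 0;
        fd = open(path, O_RDONLY);
        if (fd < 0)
                return 0;               /* skip what we cannot open */
        if (fsync(fd) != 0)
                perror(path);
        close(fd);
        return 0;
}

int main(void)
{
        return nftw("/config", sync_one, 16, FTW_PHYS);
}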
> 
> Cheers,
> 
> Dave.
> 


* Re: endless sync on bdi_sched_wait()? 2.6.33.1
  2010-04-01 10:42   ` Denys Fedorysychenko
@ 2010-04-01 11:13     ` Dave Chinner
  2010-04-01 20:14       ` Jeff Moyer
  0 siblings, 1 reply; 16+ messages in thread
From: Dave Chinner @ 2010-04-01 11:13 UTC (permalink / raw)
  To: Denys Fedorysychenko; +Cc: Alexander Viro, linux-fsdevel, linux-kernel

On Thu, Apr 01, 2010 at 01:42:42PM +0300, Denys Fedorysychenko wrote:
> That's correct, it is quite a busy cache server.
> 
> Well, if I stop squid (the cache), sync finishes quickly enough.
> If I don't, it takes more than an hour. Actually, I left that PC after 1 hour
> and it still hadn't finished. I don't think that is normal.
> Probably sync keeps picking up newly dirtied data and trying to flush it too,
> and before it finishes, more data comes in.
> Actually, all I need is to sync the config directory. I cannot use fsync,
> because the files were opened earlier by other processes, and sync does this
> trick for me. I ended up with a stuck process, and the only fast way to recover
> the system was to kill the cache process so the I/O pumping stops for a while
> and sync() gets a chance to finish.
> Sure, there is a way to just remount the config partition read-only, but I would
> expect plain sync to flush only the currently dirty buffer cache pages.
> 
> I will do more tests now and will give exact numbers: how much time it needs
> with squid running, and how much if I kill it shortly after starting sync.

Ok. What would be interesting is regular output from /proc/meminfo
to see how the dirty memory is changing over the time the sync is
running....
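
Something as simple as the sketch below, run next to the sync, would be
enough (the 5 second interval and the two fields are just an arbitrary
choice):

#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

/*
 * Sketch only: print the Dirty: and Writeback: lines from /proc/meminfo
 * every few seconds while sync runs.
 */
int main(void)
{
        char line[128];

        for (;;) {
                FILE *f = fopen("/proc/meminfo", "r");

                if (!f)
                        return 1;
                printf("%ld:", (long)time(NULL));
                while (fgets(line, sizeof(line), f)) {
                        line[strcspn(line, "\n")] = '\0';
                        if (!strncmp(line, "Dirty:", 6) ||
                            !strncmp(line, "Writeback:", 10))
                                printf("  %s", line);
                }
                printf("\n");
                fclose(f);
                sleep(5);
        }
}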

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: endless sync on bdi_sched_wait()? 2.6.33.1
  2010-04-01 11:13     ` Dave Chinner
@ 2010-04-01 20:14       ` Jeff Moyer
  0 siblings, 0 replies; 16+ messages in thread
From: Jeff Moyer @ 2010-04-01 20:14 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Denys Fedorysychenko, Alexander Viro, linux-fsdevel, linux-kernel

Dave Chinner <david@fromorbit.com> writes:

> On Thu, Apr 01, 2010 at 01:42:42PM +0300, Denys Fedorysychenko wrote:
>> That's correct, it is quite a busy cache server.
>> 
>> Well, if I stop squid (the cache), sync finishes quickly enough.
>> If I don't, it takes more than an hour. Actually, I left that PC after 1 hour
>> and it still hadn't finished. I don't think that is normal.
>> Probably sync keeps picking up newly dirtied data and trying to flush it too,
>> and before it finishes, more data comes in.
>> Actually, all I need is to sync the config directory. I cannot use fsync,
>> because the files were opened earlier by other processes, and sync does this
>> trick for me. I ended up with a stuck process, and the only fast way to recover
>> the system was to kill the cache process so the I/O pumping stops for a while
>> and sync() gets a chance to finish.
>> Sure, there is a way to just remount the config partition read-only, but I would
>> expect plain sync to flush only the currently dirty buffer cache pages.
>> 
>> I will do more tests now and will give exact numbers: how much time it needs
>> with squid running, and how much if I kill it shortly after starting sync.
>
> Ok. What would be interesting is regular output from /proc/meminfo
> to see how the dirty memory is changing over the time the sync is
> running....

This sounds familiar:

http://lkml.org/lkml/2010/2/12/41

Cheers,
Jeff


* Re: endless sync on bdi_sched_wait()? 2.6.33.1
  2010-03-31 16:07 endless sync on bdi_sched_wait()? 2.6.33.1 Denys Fedorysychenko
  2010-03-31 22:12 ` Dave Chinner
@ 2010-04-08  9:28 ` Jan Kara
  2010-04-08 10:12   ` Denys Fedorysychenko
                     ` (2 more replies)
  1 sibling, 3 replies; 16+ messages in thread
From: Jan Kara @ 2010-04-08  9:28 UTC (permalink / raw)
  To: Denys Fedorysychenko; +Cc: Alexander Viro, linux-fsdevel, linux-kernel

  Hi,

On Wed 31-03-10 19:07:31, Denys Fedorysychenko wrote:
> I have a proxy server with a heavily "loaded" squid. At some point I ran sync,
> expecting it to finish in a reasonable time. I waited more than 30 minutes and
> "sync" was still running. This can be reproduced easily.
> 
> Here is some stats and info:
> 
> Linux SUPERPROXY 2.6.33.1-build-0051 #16 SMP Wed Mar 31 17:23:28 EEST 2010 
> i686 GNU/Linux
> 
> SUPERPROXY ~ # iostat -k -x -d 30
> Linux 2.6.33.1-build-0051 (SUPERPROXY)  03/31/10        _i686_  (4 CPU)
> 
> Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s avgrq-sz 
> avgqu-sz   await  svctm  %util
> sda               0.16     0.01    0.08    0.03     3.62     1.33    88.94     
> 0.15 1389.89  59.15   0.66
> sdb               4.14    61.25    6.22   25.55    44.52   347.21    24.66     
> 2.24   70.60   2.36   7.49
> sdc               4.37   421.28    9.95   98.31   318.27  2081.95    44.34    
> 20.93  193.21   2.31  24.96
> sdd               2.34   339.90    3.97  117.47    95.48  1829.52    31.70     
> 1.73   14.23   8.09  98.20
> sde               2.29    71.40    2.34   27.97    22.56   397.81    27.74     
> 2.34   77.34   1.66   5.04
> dm-0              0.00     0.00    0.19    0.02     3.48     0.02    32.96     
> 0.05  252.11  28.05   0.60
> 
> Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s avgrq-sz 
> avgqu-sz   await  svctm  %util
> sda               0.00     0.00    0.00    0.00     0.00     0.00     0.00     
> 0.00    0.00   0.00   0.00
> sdb               0.00    54.67    2.93   26.87    12.27   326.13    22.71     
> 2.19   73.49   1.91   5.68
> sdc               0.00   420.50    3.43  110.53   126.40  2127.73    39.56    
> 23.82  209.00   2.06  23.44
> sdd               0.00   319.63    2.30  122.03   121.87  1765.87    30.37     
> 1.72   13.83   7.99  99.37
> sde               0.00    71.67    0.83   30.63     6.93   409.33    26.46     
> 2.66   84.68   1.51   4.76
> dm-0              0.00     0.00    0.00    0.00     0.00     0.00     0.00     
> 0.00    0.00   0.00   0.00
> 
> 
> CPU:  8.4% usr  7.7% sys  0.0% nic 50.7% idle 27.7% io  0.6% irq  4.7% sirq
> Load average: 5.57 4.82 4.46 2/243 2032
>   PID  PPID USER     STAT   VSZ %MEM CPU %CPU COMMAND
>  1769  1552 squid    R     668m  8.3   3 11.7 /usr/sbin/squid -N
>  1546  1545 root     R    10800  0.1   2  6.0 /config/globax 
> /config/globax.conf
>  1549  1548 root     S    43264  0.5   2  1.5 /config/globax /config/globax-
> dld.conf
>  1531     2 root     DW       0  0.0   0  0.3 [jbd2/sdd1-8]
>  1418     1 root     S     2500  0.0   3  0.0 /sbin/syslogd -R 80.83.17.2
>  1524     2 root     SW       0  0.0   0  0.0 [flush-8:32]
>  1525     2 root     SW       0  0.0   1  0.0 [jbd2/sdc1-8]
>  1604     2 root     DW       0  0.0   0  0.0 [flush-8:48]
>  1537     2 root     SW       0  0.0   1  0.0 [jbd2/sde1-8]
>    18     2 root     SW       0  0.0   3  0.0 [events/3]
>  1545     1 root     S     3576  0.0   1  0.0 /config/globax 
> /config/globax.conf
>  1548     1 root     S     3576  0.0   0  0.0 /config/globax /config/globax-
> dld.conf
>  1918     1 ntp      S     3316  0.0   3  0.0 /usr/sbin/ntpd -s
>  1919     1 root     S     3268  0.0   3  0.0 /usr/sbin/ntpd -s
>     1     0 root     S     2504  0.0   0  0.0 /bin/sh /init trynew trynew 
> trynew trynew
>  1923  1257 root     S     2504  0.0   1  0.0 /sbin/getty 38400 tty1
>  1924  1257 root     S     2504  0.0   0  0.0 /sbin/getty 38400 tty2
>  1927  1257 root     S     2504  0.0   0  0.0 /sbin/getty 38400 tty3
>  2015  2014 root     S     2504  0.0   1  0.0 -ash
>  2032  2015 root     R     2504  0.0   3  0.0 top
>  1584     1 root     S     2500  0.0   1  0.0 /usr/bin/ifplugd -i eth0 -a -r 
> /etc/startup/rc.ifup -t 1 -u 1 -d 1
>  1592     1 root     S     2500  0.0   1  0.0 /usr/bin/ifplugd -i eth2 -a -r 
> /etc/startup/rc.ifup -t 1 -u 1 -d 1
>  1587     1 root     S     2500  0.0   1  0.0 /usr/bin/ifplugd -i eth1 -a -r 
> /etc/startup/rc.ifup -t 1 -u 1 -d 1
>  1595     1 root     S     2500  0.0   1  0.0 /usr/bin/ifplugd -i eth3 -a -r 
> /etc/startup/rc.ifup -t 1 -u 1 -d 1
>  1257     1 root     S     2500  0.0   0  0.0 init
>  1420     1 root     S     2500  0.0   1  0.0 /sbin/klogd
>  1432     1 root     S     2500  0.0   3  0.0 /usr/sbin/telnetd -f 
> /etc/issue.telnet
>  1552     1 root     S     2500  0.0   1  0.0 /bin/sh /bin/squidloop
>  1743  1742 root     S     2500  0.0   3  0.0 ash -c gs newkernel
>  1744  1743 root     S     2500  0.0   0  0.0 /bin/sh /bin/gs newkernel
>  1753  1744 root     D     2368  0.0   0  0.0 sync
> 
> 
> SUPERPROXY ~ # cat /proc/1753/stack
> [<c019a93c>] bdi_sched_wait+0x8/0xc
> [<c019a807>] wait_on_bit+0x20/0x2c
> [<c019a9af>] sync_inodes_sb+0x6f/0x10a
> [<c019dd53>] __sync_filesystem+0x28/0x49
> [<c019ddf3>] sync_filesystems+0x7f/0xc0
> [<c019de7a>] sys_sync+0x1b/0x2d
> [<c02f7a25>] syscall_call+0x7/0xb
> [<ffffffff>] 0xffffffff
  Hmm, I guess you are observing the problem reported in
https://bugzilla.kernel.org/show_bug.cgi?id=14830
  There seem to be several issues in the per-bdi writeback code that
cause sync on a busy filesystem to last almost forever. To that bug are
attached two patches that fix two issues but apparently it's not all.
I'm still looking into it...

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR


* Re: endless sync on bdi_sched_wait()? 2.6.33.1
  2010-04-08  9:28 ` Jan Kara
@ 2010-04-08 10:12   ` Denys Fedorysychenko
  2010-04-12  0:47   ` Dave Chinner
  2010-04-19  1:37   ` Dave Chinner
  2 siblings, 0 replies; 16+ messages in thread
From: Denys Fedorysychenko @ 2010-04-08 10:12 UTC (permalink / raw)
  To: Jan Kara; +Cc: Alexander Viro, linux-fsdevel, linux-kernel

> > SUPERPROXY ~ # cat /proc/1753/stack
> > [<c019a93c>] bdi_sched_wait+0x8/0xc
> > [<c019a807>] wait_on_bit+0x20/0x2c
> > [<c019a9af>] sync_inodes_sb+0x6f/0x10a
> > [<c019dd53>] __sync_filesystem+0x28/0x49
> > [<c019ddf3>] sync_filesystems+0x7f/0xc0
> > [<c019de7a>] sys_sync+0x1b/0x2d
> > [<c02f7a25>] syscall_call+0x7/0xb
> > [<ffffffff>] 0xffffffff
> 
>   Hmm, I guess you are observing the problem reported in
> https://bugzilla.kernel.org/show_bug.cgi?id=14830
>   There seem to be several issues in the per-bdi writeback code that
> cause sync on a busy filesystem to last almost forever. To that bug are
> attached two patches that fix two issues but apparently it's not all.
> I'm still looking into it...
> 
> 								Honza
> 
Hi

Thanks for the info. I will try to test them as soon as I finish with my current 
issues and the kernel reaches at least rc5, because the servers where I test are 
loaded production machines.


* Re: endless sync on bdi_sched_wait()? 2.6.33.1
  2010-04-08  9:28 ` Jan Kara
  2010-04-08 10:12   ` Denys Fedorysychenko
@ 2010-04-12  0:47   ` Dave Chinner
  2010-04-19  1:37   ` Dave Chinner
  2 siblings, 0 replies; 16+ messages in thread
From: Dave Chinner @ 2010-04-12  0:47 UTC (permalink / raw)
  To: Jan Kara
  Cc: Denys Fedorysychenko, Alexander Viro, linux-fsdevel, linux-kernel

On Thu, Apr 08, 2010 at 11:28:50AM +0200, Jan Kara wrote:
> On Wed 31-03-10 19:07:31, Denys Fedorysychenko wrote:
> > I have a proxy server with a heavily "loaded" squid. At some point I ran
> > sync, expecting it to finish in a reasonable time. I waited more than 30
> > minutes and "sync" was still running. This can be reproduced easily.
....
> > 
> > SUPERPROXY ~ # cat /proc/1753/stack
> > [<c019a93c>] bdi_sched_wait+0x8/0xc
> > [<c019a807>] wait_on_bit+0x20/0x2c
> > [<c019a9af>] sync_inodes_sb+0x6f/0x10a
> > [<c019dd53>] __sync_filesystem+0x28/0x49
> > [<c019ddf3>] sync_filesystems+0x7f/0xc0
> > [<c019de7a>] sys_sync+0x1b/0x2d
> > [<c02f7a25>] syscall_call+0x7/0xb
> > [<ffffffff>] 0xffffffff
>   Hmm, I guess you are observing the problem reported in
> https://bugzilla.kernel.org/show_bug.cgi?id=14830
>   There seem to be several issues in the per-bdi writeback code that
> cause sync on a busy filesystem to last almost forever. To that bug are
> attached two patches that fix two issues but apparently it's not all.
> I'm still looking into it...

Jan, just another data point that i haven't had a chance to look
into yet - I noticed that 2.6.34-rc1 writeback patterns have changed
on XFS from looking at blocktrace.

The bdi-flush background write thread almost never completes - it
blocks in get_request() and it is doing 1-2 page IOs. If I do a
large dd write, the writeback thread starts with 512k IOs for a
short while, then suddenly degrades to 1-2 page IOs that get merged
in the elevator to 512k IOs.

My theory is that the inode is getting dirtied by the concurrent
write() and the inode is never moving back to the dirty list and
having its dirtied_when time reset - it's being moved to the
b_more_io list in writeback_single_inode(), wbc->more_io is being
set, and then we re-enter writeback_inodes_wb() which splices the
b_more_io list back onto the b_io list and we try to write it out
again.

Because I have so many dirty pages in memory, nr_pages is quite high
and this pattern continues for some time until it is exhausted, at
which time throttling triggers background sync to run again and the
1-2 page IO pattern continues.

And for sync(), nr_pages is set to LONG_MAX, so regardless of how
many pages were dirty, if we keep dirtying pages it will stay in
this loop until LONG_MAX pages are written....

Anyway, that's my theory - if we had trace points in the writeback
code, I could confirm/deny this straight away. First thing I need to
do, though, is to forward port the original writeback tracing code
Jens posted a while back....
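
Once it is ported, collecting the data is just ftrace plumbing; roughly like
the sketch below, assuming the events end up in a "writeback" event group
under debugfs:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define TRACING "/sys/kernel/debug/tracing"

/*
 * Sketch only: assumes the forward-ported tracepoints show up as an
 * ftrace event group called "writeback". Enables the group and streams
 * the trace buffer to stdout.
 */
static void write_str(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);

        if (fd >= 0) {
                write(fd, val, strlen(val));
                close(fd);
        }
}

int main(void)
{
        char buf[4096];
        ssize_t n;
        int fd;

        write_str(TRACING "/events/writeback/enable", "1");
        write_str(TRACING "/tracing_on", "1");

        fd = open(TRACING "/trace_pipe", O_RDONLY);
        if (fd < 0) {
                perror("trace_pipe");
                return 1;
        }
        while ((n = read(fd, buf, sizeof(buf))) > 0)
                write(STDOUT_FILENO, buf, n);
        return 0;
}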

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: endless sync on bdi_sched_wait()? 2.6.33.1
  2010-04-08  9:28 ` Jan Kara
  2010-04-08 10:12   ` Denys Fedorysychenko
  2010-04-12  0:47   ` Dave Chinner
@ 2010-04-19  1:37   ` Dave Chinner
  2010-04-19  7:04     ` Dave Chinner
  2 siblings, 1 reply; 16+ messages in thread
From: Dave Chinner @ 2010-04-19  1:37 UTC (permalink / raw)
  To: Jan Kara
  Cc: Denys Fedorysychenko, Alexander Viro, linux-fsdevel, linux-kernel

On Thu, Apr 08, 2010 at 11:28:50AM +0200, Jan Kara wrote:
> > SUPERPROXY ~ # cat /proc/1753/stack
> > [<c019a93c>] bdi_sched_wait+0x8/0xc
> > [<c019a807>] wait_on_bit+0x20/0x2c
> > [<c019a9af>] sync_inodes_sb+0x6f/0x10a
> > [<c019dd53>] __sync_filesystem+0x28/0x49
> > [<c019ddf3>] sync_filesystems+0x7f/0xc0
> > [<c019de7a>] sys_sync+0x1b/0x2d
> > [<c02f7a25>] syscall_call+0x7/0xb
> > [<ffffffff>] 0xffffffff
>   Hmm, I guess you are observing the problem reported in
> https://bugzilla.kernel.org/show_bug.cgi?id=14830
>   There seem to be several issues in the per-bdi writeback code that
> cause sync on a busy filesystem to last almost forever. To that bug are
> attached two patches that fix two issues but apparently it's not all.
> I'm still looking into it...

So Jens' writeback tracing shows this for a normal cycle during a
large dd:

           <...>-6030  [005] 604446.696454: writeback_sched: work=38c0, task=task
    flush-253:16-6029  [002] 604446.696492: writeback_exec: work=38c0 pages=9223372036854775807, sb=0, kupdate=0, range_cyclic=-1 for_background=-1
    flush-253:16-6029  [002] 604446.696493: writeback_clear: work=ffff88011f1a38c0, refs=1
    flush-253:16-6029  [003] 604446.784240: writeback_pages_written: 1024

There were 100 of these traces (102400 pages (400MB) which is exactly 10% of
RAM) before this:

           <...>-6030  [000] 604462.346329: writeback_sched: work=6c0, task=task
    flush-253:16-6029  [002] 604462.267098: writeback_exec: work=6c0 pages=9223372036854775807, sb=0, kupdate=0, range_cyclic=-1 for_background=-1
    flush-253:16-6029  [002] 604462.267101: writeback_clear: work=ffff88011e8006c0, refs=1
    flush-253:16-6029  [001] 604465.406331: writeback_pages_written: 160771

which shows 160771 pages written in a single iteration (650MB). I suspect some
interaction between foreground and background writeout is occurring here.

The flusher thread appears to be getting stuck on congestion - the wchan
it is sitting on indicates it is blocking in get_request().  I'm going to
extend this tracing further down into the writeback code so that what is
happening is clearer...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: endless sync on bdi_sched_wait()? 2.6.33.1
  2010-04-19  1:37   ` Dave Chinner
@ 2010-04-19  7:04     ` Dave Chinner
  2010-04-19  7:23       ` Dave Chinner
  2010-04-21  0:33       ` Jan Kara
  0 siblings, 2 replies; 16+ messages in thread
From: Dave Chinner @ 2010-04-19  7:04 UTC (permalink / raw)
  To: Jan Kara
  Cc: Denys Fedorysychenko, Alexander Viro, linux-fsdevel, linux-kernel

On Mon, Apr 19, 2010 at 11:37:02AM +1000, Dave Chinner wrote:
> On Thu, Apr 08, 2010 at 11:28:50AM +0200, Jan Kara wrote:
> > > SUPERPROXY ~ # cat /proc/1753/stack
> > > [<c019a93c>] bdi_sched_wait+0x8/0xc
> > > [<c019a807>] wait_on_bit+0x20/0x2c
> > > [<c019a9af>] sync_inodes_sb+0x6f/0x10a
> > > [<c019dd53>] __sync_filesystem+0x28/0x49
> > > [<c019ddf3>] sync_filesystems+0x7f/0xc0
> > > [<c019de7a>] sys_sync+0x1b/0x2d
> > > [<c02f7a25>] syscall_call+0x7/0xb
> > > [<ffffffff>] 0xffffffff
> >   Hmm, I guess you are observing the problem reported in
> > https://bugzilla.kernel.org/show_bug.cgi?id=14830
> >   There seem to be several issues in the per-bdi writeback code that
> > cause sync on a busy filesystem to last almost forever. To that bug are
> > attached two patches that fix two issues but apparently it's not all.
> > I'm still looking into it...
> 
> So Jens' writeback tracing shows this for a normal cycle during a
> large dd:
> 
>            <...>-6030  [005] 604446.696454: writeback_sched: work=38c0, task=task
>     flush-253:16-6029  [002] 604446.696492: writeback_exec: work=38c0 pages=9223372036854775807, sb=0, kupdate=0, range_cyclic=-1 for_background=-1
>     flush-253:16-6029  [002] 604446.696493: writeback_clear: work=ffff88011f1a38c0, refs=1
>     flush-253:16-6029  [003] 604446.784240: writeback_pages_written: 1024
> 
> There were 100 of these traces (102400 pages (400MB) which is exactly 10% of
> RAM) before this:
> 
>            <...>-6030  [000] 604462.346329: writeback_sched: work=6c0, task=task
>     flush-253:16-6029  [002] 604462.267098: writeback_exec: work=6c0 pages=9223372036854775807, sb=0, kupdate=0, range_cyclic=-1 for_background=-1
>     flush-253:16-6029  [002] 604462.267101: writeback_clear: work=ffff88011e8006c0, refs=1
>     flush-253:16-6029  [001] 604465.406331: writeback_pages_written: 160771
> 
> which shows 160771 pages written in a single iteration (650MB). I suspect some
> interaction between foreground and background writeout is occurring here.
> 
> The flusher thread appears to be getting stuck on congestion - the wchan
> it is sitting on indicates it is blocking in get_request().  I'm going to
> extend this tracing further down into the writeback code so that what is
> happening is clearer...

Ok, I moved to a VM with less memory (1GB vs 4GB) and slower block
devices (100MB/s vs 500MB/s) and now it's pretty clear what is
happening. I'll go through the traces.

To start with, sync() queues up several tasks to the bdi flush daemon:

SYSCALL_DEFINE0(sync)
{
        wakeup_flusher_threads(0);
        sync_filesystems(0);
        sync_filesystems(1);
        if (unlikely(laptop_mode))
                laptop_sync_completion();
        return 0;
}


First is via wakeup_flusher_threads() - an async flush:

            sync-2499  [000] 616072.710212: writeback_queue: 253:16: pages=13818, sb=0, kupdate=0, range_cyclic=0 for_background=0
            sync-2499  [000] 616072.710217: writeback_sched: work=13c0, task=task
                                                                  ^^^^
second is a per-sb async flush via sync_filesystems(0):

            sync-2499  [000] 616072.717181: writeback_queue: 253:16: pages=15761, sb=1, kupdate=0, range_cyclic=-1 for_background=0
            sync-2499  [000] 616072.717191: writeback_sched: work=3840, task=task
                                                                  ^^^^
And finally there is the sync flush via sync_filesystems(1):

            sync-2499  [000] 616072.737836: writeback_queue: 253:16: pages=9223372036854775807, sb=1, kupdate=0, range_cyclic=0 for_background=0
            sync-2499  [000] 616072.737840: writeback_sched: work=3e58, task=task
                                                                  ^^^^

The first async flush does:
                                                                 vvvv
    flush-253:16-2497  [000] 616072.897747: writeback_exec: work=13c0 pages=13818, sb=0, kupdate=0, range_cyclic=0 for_background=0
    flush-253:16-2497  [000] 616072.897748: writeback_clear: work=ffff88003d8813c0, refs=1
    flush-253:16-2497  [000] 616072.897753: wbc_writeback_start: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=0 more=0 older=0x0 start=0x0 end=0x7fffffffffffffff
    flush-253:16-2497  [000] 616072.897768: wbc_writeback_written: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=0 more=0 older=0x0 start=0x0 end=0x7fffffffffffffff

Nothing - it does not write any pages; towrt (nr_to_write) is
unchanged by the attempted flush.

The second async flush:
                                                                 vvvv
    flush-253:16-2497  [000] 616072.897769: writeback_exec: work=3840 pages=15761, sb=1, kupdate=0, range_cyclic=-1 for_background=0
    flush-253:16-2497  [000] 616072.897770: writeback_clear: work=ffff88003fb53840, refs=1
    flush-253:16-2497  [000] 616072.897771: wbc_writeback_start: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=1 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
    flush-253:16-2497  [000] 616072.897783: wbc_writeback_written: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=1 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0

Differs in setup only by range_cyclic=1 instead of zero, and it also
fails to write anything.

The third flush - the sync one - does:
                                                                 vvvv
    flush-253:16-2497  [000] 616072.897784: writeback_exec: work=3e58 pages=9223372036854775807, sb=1, kupdate=0, range_cyclic=0 for_background=0
    flush-253:16-2497  [000] 616072.897785: wbc_writeback_start: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=1 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 more=0 older=0x0 start=0x0 end=0x7fffffffffffffff

some 75 seconds later having written only 1024 pages. In the mean
time, the traces show dd blocked in balance_dirty_pages():

              dd-2498  [000] 616072.908675: wbc_balance_dirty_start: dev 253:16 wbc=fb68 towrt=1536 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
              dd-2498  [000] 616072.908679: wbc_balance_dirty_wait: dev 253:16 wbc=fb68 towrt=1536 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
              dd-2498  [000] 616073.238785: wbc_balance_dirty_start: dev 253:16 wbc=fb68 towrt=1536 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
              dd-2498  [000] 616073.238788: wbc_balance_dirty_wait: dev 253:16 wbc=fb68 towrt=1536 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0

And it appears to stay blocked there without doing any writeback at
all - there are no wbc_balance_dirty_pages_written traces at all.
That is, it is blocking until the number of dirty pages is dropping
below the dirty threshold, then continuing to write and dirty more
pages.

This continues for another 75 seconds, until the dd completes and
then the sync flush thread completes:

    flush-253:16-2497  [000] 616145.763145: wbc_writeback_written: dev 253:16 wbc=9d20 towrt=0 skip=0 sb=1 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 more=1 older=0x0 start=0x0 end=0x7fffffffffffffff
    flush-253:16-2497  [000] 616145.763148: wbc_writeback_start: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=1 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 more=0 older=0x0 start=0x0 end=0x7fffffffffffffff
    flush-253:16-2497  [000] 616145.763160: wbc_writeback_written: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=1 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 more=0 older=0x0 start=0x0 end=0x7fffffffffffffff
    flush-253:16-2497  [000] 616145.763161: writeback_clear: work=ffff88002e783e58, refs=1
                                                                             ^^^^^

The flush thread does not appear to be cycling through 1024 pages at
a time as the wbc structure says it should - it appears to be doing
all the writeback.  Indeed, it is almost always blocked here:

  task                        PC stack   pid father
flush-253:16  D 00000000ffffffff     0  2511      2 0x00000000
 ffff880038409690 0000000000000046 ffff880038409610 00000000001d42c0
 ffff880038409fd8 00000000001d42c0 ffff880038409fd8 ffff88003840c340
 00000000001d42c0 00000000001d42c0 ffff880038409fd8 00000000001d42c0
Call Trace:
 [<ffffffff81790197>] io_schedule+0x47/0x70
 [<ffffffff8141b637>] get_request_wait+0xc7/0x190
 [<ffffffff8109d880>] ? autoremove_wake_function+0x0/0x40
 [<ffffffff81414817>] ? elv_merge+0x47/0x220
 [<ffffffff8141bce3>] __make_request+0x93/0x480
 [<ffffffff8141a359>] generic_make_request+0x1f9/0x510
 [<ffffffff810b41bd>] ? trace_hardirqs_on+0xd/0x10
 [<ffffffff8117e462>] ? bvec_alloc_bs+0x62/0x110
 [<ffffffff8141a6ca>] submit_bio+0x5a/0xd0
 [<ffffffff8134f874>] xfs_submit_ioend_bio+0x74/0xa0
 [<ffffffff8134fbb1>] xfs_submit_ioend+0xb1/0x110
 [<ffffffff81350e34>] xfs_page_state_convert+0x3a4/0x730
 [<ffffffff810b416d>] ? trace_hardirqs_on_caller+0x14d/0x190
 [<ffffffff8135137c>] xfs_vm_writepage+0x8c/0x160
 [<ffffffff81112cfa>] __writepage+0x1a/0x50
 [<ffffffff81113b17>] write_cache_pages+0x1f7/0x400
 [<ffffffff81112ce0>] ? __writepage+0x0/0x50
 [<ffffffff81113d47>] generic_writepages+0x27/0x30
 [<ffffffff8134f28d>] xfs_vm_writepages+0x5d/0x80
 [<ffffffff81113d74>] do_writepages+0x24/0x40
 [<ffffffff811722f7>] writeback_single_inode+0xe7/0x3b0
 [<ffffffff81172d65>] writeback_inodes_wb+0x2e5/0x550
 [<ffffffff811247fb>] ? ftrace_raw_event_id_wbc_class+0x16b/0x190
 [<ffffffff811730c2>] wb_writeback+0xf2/0x2d0
 [<ffffffff811243aa>] ? ftrace_raw_event_writeback_exec+0xea/0xf0
 [<ffffffff811734c8>] wb_do_writeback+0x108/0x240
 [<ffffffff811733f0>] ? wb_do_writeback+0x30/0x240
 [<ffffffff8117365b>] bdi_writeback_task+0x5b/0x180
 [<ffffffff81125ac0>] ? bdi_start_fn+0x0/0x100
 [<ffffffff81125b46>] bdi_start_fn+0x86/0x100
 [<ffffffff81125ac0>] ? bdi_start_fn+0x0/0x100
 [<ffffffff8109d396>] kthread+0x96/0xa0
 [<ffffffff81036e24>] kernel_thread_helper+0x4/0x10
 [<ffffffff817934d0>] ? restore_args+0x0/0x30
 [<ffffffff8109d300>] ? kthread+0x0/0xa0
 [<ffffffff81036e20>] ? kernel_thread_helper+0x0/0x10

Waiting on block device congestion.

Because I have this in wb_writeback():

 756                 trace_wbc_writeback_start(&wbc);
 757                 writeback_inodes_wb(wb, &wbc);
 758                 trace_wbc_writeback_written(&wbc);

I know that we are stuck in a single iteration of
writeback_inodes_wb(). This also implies that we are stuck in a
single do_writepages() call.

Indeed, looking at write_cache_pages():

 838         long nr_to_write = wbc->nr_to_write;
...
 920                         ret = (*writepage)(page, wbc, data);
...
 940                         if (nr_to_write > 0) {
 941                                 nr_to_write--;
 942                                 if (nr_to_write == 0 &&
 943                                     wbc->sync_mode == WB_SYNC_NONE) {
 944                                         /*
 945                                          * We stop writing back only if we are
 946                                          * not doing integrity sync. In case of
 947                                          * integrity sync we have to keep going
 948                                          * because someone may be concurrently
 949                                          * dirtying pages, and we might have
 950                                          * synced a lot of newly appeared dirty
 951                                          * pages, but have not synced all of the
 952                                          * old dirty pages.
 953                                          */
 954                                         done = 1;
 955                                         break;
 956                                 }
 957                         }
...
 973         if (!wbc->no_nrwrite_index_update) {
 974                 if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
 975                         mapping->writeback_index = done_index;
 976                 wbc->nr_to_write = nr_to_write;
 977         }

It even hides this fact from the higher layers by rewriting
wbc->nr_to_write with what it thinks it did, not what really
happened. So, where did this come from?

<git blame>

commit 89e12190 - fix bug in nr_to_write introduced by dcf6a79d
commit dcf6a79d - fix bug in nr_to_write introduced by 05fe478d
commit 05fe478d - data integrity write fix: ignore nr_to_write for
			WB_SYNC_ALL writes.
		"This change does indeed make the possibility of
		long stalls la[r]ger, and that's not a good thing,
		but lying about data integrity is even worse."

IOWs, the observed sync behaviour is as intended - if you keep
dirtying the file, sync will keep cleaning it because it defaults to
being safe. I'd say "not a bug" then. I agree it's not ideal, but
until Jan's inode sync sweep code is accepted I don't think there's
much that can be done about it.

However, what this is doing to XFS writeback is really, really nasty
- it's effectively causing single page allocation and IO submission
instead of doing it in much, much larger chunks.

Adding a wbc trace to xfs_vm_writepage(), I see:

    flush-253:16-2586  [000] 620402.417931: wbc_writepage: dev 253:16 wbc=3d20 towrt=1024 skip=0 sb=0 mode=0 kupd=0 bgrd=1 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
    flush-253:16-2586  [000] 620402.442765: wbc_writepage: dev 253:16 wbc=3d20 towrt=0 skip=0 sb=0 mode=0 kupd=0 bgrd=1 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
    flush-253:16-2586  [000] 620402.442899: wbc_writepage: dev 253:16 wbc=3d20 towrt=-1 skip=0 sb=0 mode=0 kupd=0 bgrd=1 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
    flush-253:16-2586  [000] 620402.442910: wbc_writepage: dev 253:16 wbc=3d20 towrt=-5 skip=0 sb=0 mode=0 kupd=0 bgrd=1 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
    flush-253:16-2586  [000] 620402.442918: wbc_writepage: dev 253:16 wbc=3d20 towrt=-21 skip=0 sb=0 mode=0 kupd=0 bgrd=1 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
    flush-253:16-2586  [000] 620402.442927: wbc_writepage: dev 253:16 wbc=3d20 towrt=-85 skip=0 sb=0 mode=0 kupd=0 bgrd=1 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0

Which shows why XFS is having problems. Basically, if a filesystem
writes more than one page in ->writepage and updates
wbc->nr_to_write to indicate this, write_cache_pages completely
ignores it. IOWs, write_cache_pages() wants to call ->writepage()
nr_to_write times, not write nr_to_write pages.  And by sending a
negative number down to ->writepage, XFS is writing a single page
and then returning, completely defeating all the page clustering
optimisations XFS has in ->writepage....

I'll post some patches for the tracing and the XFS fixes soon, but I
don't have anything for sync except understanding...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: endless sync on bdi_sched_wait()? 2.6.33.1
  2010-04-19  7:04     ` Dave Chinner
@ 2010-04-19  7:23       ` Dave Chinner
  2010-04-21  0:33       ` Jan Kara
  1 sibling, 0 replies; 16+ messages in thread
From: Dave Chinner @ 2010-04-19  7:23 UTC (permalink / raw)
  To: Jan Kara
  Cc: Denys Fedorysychenko, Alexander Viro, linux-fsdevel, linux-kernel

This is food for thought. On XFS, the only difference between sync
and freeze is that freeze stops incoming writers:

$ dd if=/dev/zero of=/mnt/scratch/test bs=1024k count=8000 &
[1] 2565
$ sleep 5; time (sudo xfs_freeze -f /mnt/scratch ; sudo xfs_freeze -u /mnt/scratch)

real    0m2.536s
user    0m0.000s
sys     0m0.020s
$ sleep 5; time (sudo xfs_freeze -f /mnt/scratch ; sudo xfs_freeze -u /mnt/scratch)

real    0m2.251s
user    0m0.004s
sys     0m0.012s
$ sleep 5; time (sudo xfs_freeze -f /mnt/scratch ; sudo xfs_freeze -u /mnt/scratch)

real    0m1.985s
user    0m0.004s
sys     0m0.024s
$ sleep 5; time sync
8000+0 records in
8000+0 records out
8388608000 bytes (8.4 GB) copied, 80.822 s, 104 MB/s
[1]+  Done                    dd if=/dev/zero of=/mnt/scratch/test
bs=1024k count=8000

real    0m47.237s
user    0m0.036s
sys     0m18.769s
$

Freezing the filesystem and immediately unfreezing is much, much
faster than running sync, yet it gives us exactly the same data
integrity guarantees without the endless blocking problems. Is it
time to make sure every filesystem implements freeze and thaw, and
start using them for sync instead of the current code?
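
For reference, the same freeze/thaw cycle can be driven from C on any
filesystem that supports it via the generic FIFREEZE/FITHAW ioctls; a minimal
sketch (needs root, takes the mount point as its argument):

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

/*
 * Sketch only: freeze then immediately thaw a filesystem through the
 * generic FIFREEZE/FITHAW ioctls (merged around 2.6.29), which is
 * effectively what the xfs_freeze -f/-u pair above does.
 */
int main(int argc, char **argv)
{
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
                return 1;
        }
        fd = open(argv[1], O_RDONLY);   /* directory fd on the mount point */
        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (ioctl(fd, FIFREEZE, 0) < 0) /* blocks new writers, syncs the fs */
                perror("FIFREEZE");
        if (ioctl(fd, FITHAW, 0) < 0)   /* let writers continue */
                perror("FITHAW");
        close(fd);
        return 0;
}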

Cheers,

Dave.



On Mon, Apr 19, 2010 at 05:04:58PM +1000, Dave Chinner wrote:
> On Mon, Apr 19, 2010 at 11:37:02AM +1000, Dave Chinner wrote:
> > On Thu, Apr 08, 2010 at 11:28:50AM +0200, Jan Kara wrote:
> > > > SUPERPROXY ~ # cat /proc/1753/stack
> > > > [<c019a93c>] bdi_sched_wait+0x8/0xc
> > > > [<c019a807>] wait_on_bit+0x20/0x2c
> > > > [<c019a9af>] sync_inodes_sb+0x6f/0x10a
> > > > [<c019dd53>] __sync_filesystem+0x28/0x49
> > > > [<c019ddf3>] sync_filesystems+0x7f/0xc0
> > > > [<c019de7a>] sys_sync+0x1b/0x2d
> > > > [<c02f7a25>] syscall_call+0x7/0xb
> > > > [<ffffffff>] 0xffffffff
> > >   Hmm, I guess you are observing the problem reported in
> > > https://bugzilla.kernel.org/show_bug.cgi?id=14830
> > >   There seem to be several issues in the per-bdi writeback code that
> > > cause sync on a busy filesystem to last almost forever. To that bug are
> > > attached two patches that fix two issues but apparently it's not all.
> > > I'm still looking into it...
> > 
> > So Jens' writeback tracing shows this for a normal cycle during a
> > large dd:
> > 
> >            <...>-6030  [005] 604446.696454: writeback_sched: work=38c0, task=task
> >     flush-253:16-6029  [002] 604446.696492: writeback_exec: work=38c0 pages=9223372036854775807, sb=0, kupdate=0, range_cyclic=-1 for_background=-1
> >     flush-253:16-6029  [002] 604446.696493: writeback_clear: work=ffff88011f1a38c0, refs=1
> >     flush-253:16-6029  [003] 604446.784240: writeback_pages_written: 1024
> > 
> > There were 100 of these traces (102400 pages (400MB) which is exactly 10% of
> > RAM) before this:
> > 
> >            <...>-6030  [000] 604462.346329: writeback_sched: work=6c0, task=task
> >     flush-253:16-6029  [002] 604462.267098: writeback_exec: work=6c0 pages=9223372036854775807, sb=0, kupdate=0, range_cyclic=-1 for_background=-1
> >     flush-253:16-6029  [002] 604462.267101: writeback_clear: work=ffff88011e8006c0, refs=1
> >     flush-253:16-6029  [001] 604465.406331: writeback_pages_written: 160771
> > 
> > which shows 160771 pages written in a single iteration (650MB). I suspect some
> > interaction between foreground and background writeout is occurring here.
> > 
> > The flusher thread appears to be getting stuck on congestion - the wchan
> > it is sitting on indicates it is blocking in get_request().  I'm going to
> > extend this tracing further down into the writeback code so that what is
> > happening is clearer...
> 
> Ok, I moved to a VM with less memory (1GB vs 4GB) and slower block
> devices (100MB/s vs 500MB/s) and now it's pretty clear what is
> happening. I'll go through the traces.
> 
> To start with, sync() queues up several tasks to the bdi flush daemon:
> 
> SYSCALL_DEFINE0(sync)
> {
>         wakeup_flusher_threads(0);
>         sync_filesystems(0);
>         sync_filesystems(1);
>         if (unlikely(laptop_mode))
>                 laptop_sync_completion();
>         return 0;
> }
> 
> 
> First is via wakeup_flusher_threads() - an async flush:
> 
>             sync-2499  [000] 616072.710212: writeback_queue: 253:16: pages=13818, sb=0, kupdate=0, range_cyclic=0 for_background=0
>             sync-2499  [000] 616072.710217: writeback_sched: work=13c0, task=task
>                                                                   ^^^^
> second is a per-sb async flush via sync_filesystems(0):
> 
>             sync-2499  [000] 616072.717181: writeback_queue: 253:16: pages=15761, sb=1, kupdate=0, range_cyclic=-1 for_background=0
>             sync-2499  [000] 616072.717191: writeback_sched: work=3840, task=task
>                                                                   ^^^^
> And finally there is the sync flush via sync_filesystems(1):
> 
>             sync-2499  [000] 616072.737836: writeback_queue: 253:16: pages=9223372036854775807, sb=1, kupdate=0, range_cyclic=0 for_background=0
>             sync-2499  [000] 616072.737840: writeback_sched: work=3e58, task=task
>                                                                   ^^^^
> 
> The first async flush does:
>                                                                  vvvv
>     flush-253:16-2497  [000] 616072.897747: writeback_exec: work=13c0 pages=13818, sb=0, kupdate=0, range_cyclic=0 for_background=0
>     flush-253:16-2497  [000] 616072.897748: writeback_clear: work=ffff88003d8813c0, refs=1
>     flush-253:16-2497  [000] 616072.897753: wbc_writeback_start: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=0 more=0 older=0x0 start=0x0 end=0x7fffffffffffffff
>     flush-253:16-2497  [000] 616072.897768: wbc_writeback_written: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=0 more=0 older=0x0 start=0x0 end=0x7fffffffffffffff
> 
> Nothing - it does not write any pages; towrt (nr_to_write) is
> unchanged by the attempted flush.
> 
> The second async flush:
>                                                                  vvvv
>     flush-253:16-2497  [000] 616072.897769: writeback_exec: work=3840 pages=15761, sb=1, kupdate=0, range_cyclic=-1 for_background=0
>     flush-253:16-2497  [000] 616072.897770: writeback_clear: work=ffff88003fb53840, refs=1
>     flush-253:16-2497  [000] 616072.897771: wbc_writeback_start: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=1 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
>     flush-253:16-2497  [000] 616072.897783: wbc_writeback_written: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=1 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
> 
> Differs in setup only by range_cyclic=1 instead of zero, and it also
> fails to write anything.
> 
> The third flush - the sync one - does:
>                                                                  vvvv
>     flush-253:16-2497  [000] 616072.897784: writeback_exec: work=3e58 pages=9223372036854775807, sb=1, kupdate=0, range_cyclic=0 for_background=0
>     flush-253:16-2497  [000] 616072.897785: wbc_writeback_start: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=1 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 more=0 older=0x0 start=0x0 end=0x7fffffffffffffff
> 
> some 75 seconds later having written only 1024 pages. In the mean
> time, the traces show dd blocked in balance_dirty_pages():
> 
>               dd-2498  [000] 616072.908675: wbc_balance_dirty_start: dev 253:16 wbc=fb68 towrt=1536 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
>               dd-2498  [000] 616072.908679: wbc_balance_dirty_wait: dev 253:16 wbc=fb68 towrt=1536 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
>               dd-2498  [000] 616073.238785: wbc_balance_dirty_start: dev 253:16 wbc=fb68 towrt=1536 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
>               dd-2498  [000] 616073.238788: wbc_balance_dirty_wait: dev 253:16 wbc=fb68 towrt=1536 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
> 
> And it appears to stay blocked there without doing any writeback at
> all - there are no wbc_balance_dirty_pages_written traces at all.
> That is, it is blocking until the number of dirty pages is dropping
> below the dirty threshold, then continuing to write and dirty more
> pages.
> 
> This continues for another 75 seconds, until the dd completes and
> then the sync flush thread completes:
> 
>     flush-253:16-2497  [000] 616145.763145: wbc_writeback_written: dev 253:16 wbc=9d20 towrt=0 skip=0 sb=1 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 more=1 older=0x0 start=0x0 end=0x7fffffffffffffff
>     flush-253:16-2497  [000] 616145.763148: wbc_writeback_start: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=1 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 more=0 older=0x0 start=0x0 end=0x7fffffffffffffff
>     flush-253:16-2497  [000] 616145.763160: wbc_writeback_written: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=1 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 more=0 older=0x0 start=0x0 end=0x7fffffffffffffff
>     flush-253:16-2497  [000] 616145.763161: writeback_clear: work=ffff88002e783e58, refs=1
>                                                                              ^^^^^
> 
> The flush thread does not appear to be cycling through 1024 pages at
> a time as the wbc structure says it should - it appears to be doing
> all the writeback.  Indeed, it is almost always blocked here:
> 
>   task                        PC stack   pid father
> flush-253:16  D 00000000ffffffff     0  2511      2 0x00000000
>  ffff880038409690 0000000000000046 ffff880038409610 00000000001d42c0
>  ffff880038409fd8 00000000001d42c0 ffff880038409fd8 ffff88003840c340
>  00000000001d42c0 00000000001d42c0 ffff880038409fd8 00000000001d42c0
> Call Trace:
>  [<ffffffff81790197>] io_schedule+0x47/0x70
>  [<ffffffff8141b637>] get_request_wait+0xc7/0x190
>  [<ffffffff8109d880>] ? autoremove_wake_function+0x0/0x40
>  [<ffffffff81414817>] ? elv_merge+0x47/0x220
>  [<ffffffff8141bce3>] __make_request+0x93/0x480
>  [<ffffffff8141a359>] generic_make_request+0x1f9/0x510
>  [<ffffffff810b41bd>] ? trace_hardirqs_on+0xd/0x10
>  [<ffffffff8117e462>] ? bvec_alloc_bs+0x62/0x110
>  [<ffffffff8141a6ca>] submit_bio+0x5a/0xd0
>  [<ffffffff8134f874>] xfs_submit_ioend_bio+0x74/0xa0
>  [<ffffffff8134fbb1>] xfs_submit_ioend+0xb1/0x110
>  [<ffffffff81350e34>] xfs_page_state_convert+0x3a4/0x730
>  [<ffffffff810b416d>] ? trace_hardirqs_on_caller+0x14d/0x190
>  [<ffffffff8135137c>] xfs_vm_writepage+0x8c/0x160
>  [<ffffffff81112cfa>] __writepage+0x1a/0x50
>  [<ffffffff81113b17>] write_cache_pages+0x1f7/0x400
>  [<ffffffff81112ce0>] ? __writepage+0x0/0x50
>  [<ffffffff81113d47>] generic_writepages+0x27/0x30
>  [<ffffffff8134f28d>] xfs_vm_writepages+0x5d/0x80
>  [<ffffffff81113d74>] do_writepages+0x24/0x40
>  [<ffffffff811722f7>] writeback_single_inode+0xe7/0x3b0
>  [<ffffffff81172d65>] writeback_inodes_wb+0x2e5/0x550
>  [<ffffffff811247fb>] ? ftrace_raw_event_id_wbc_class+0x16b/0x190
>  [<ffffffff811730c2>] wb_writeback+0xf2/0x2d0
>  [<ffffffff811243aa>] ? ftrace_raw_event_writeback_exec+0xea/0xf0
>  [<ffffffff811734c8>] wb_do_writeback+0x108/0x240
>  [<ffffffff811733f0>] ? wb_do_writeback+0x30/0x240
>  [<ffffffff8117365b>] bdi_writeback_task+0x5b/0x180
>  [<ffffffff81125ac0>] ? bdi_start_fn+0x0/0x100
>  [<ffffffff81125b46>] bdi_start_fn+0x86/0x100
>  [<ffffffff81125ac0>] ? bdi_start_fn+0x0/0x100
>  [<ffffffff8109d396>] kthread+0x96/0xa0
>  [<ffffffff81036e24>] kernel_thread_helper+0x4/0x10
>  [<ffffffff817934d0>] ? restore_args+0x0/0x30
>  [<ffffffff8109d300>] ? kthread+0x0/0xa0
>  [<ffffffff81036e20>] ? kernel_thread_helper+0x0/0x10
> 
> Waiting on block device congestion.
> 
> Because I have this in wb_writeback():
> 
>  756                 trace_wbc_writeback_start(&wbc);
>  757                 writeback_inodes_wb(wb, &wbc);
>  758                 trace_wbc_writeback_written(&wbc);
> 
> I know that we are stuck in a single iteration of
> writeback_inodes_wb(). This also implies that we are stuck in a
> single do_writepages() call.
> 
> Indeed, looking at write_cache_pages():
> 
>  838         long nr_to_write = wbc->nr_to_write;
> ...
>  920                         ret = (*writepage)(page, wbc, data);
> ...
>  940                         if (nr_to_write > 0) {
>  941                                 nr_to_write--;
>  942                                 if (nr_to_write == 0 &&
>  943                                     wbc->sync_mode == WB_SYNC_NONE) {
>  944                                         /*
>  945                                          * We stop writing back only if we are
>  946                                          * not doing integrity sync. In case of
>  947                                          * integrity sync we have to keep going
>  948                                          * because someone may be concurrently
>  949                                          * dirtying pages, and we might have
>  950                                          * synced a lot of newly appeared dirty
>  951                                          * pages, but have not synced all of the
>  952                                          * old dirty pages.
>  953                                          */
>  954                                         done = 1;
>  955                                         break;
>  956                                 }
>  957                         }
> ...
>  973         if (!wbc->no_nrwrite_index_update) {
>  974                 if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
>  975                         mapping->writeback_index = done_index;
>  976                 wbc->nr_to_write = nr_to_write;
>  977         }
> 
> It even hides this fact from the higher layers by rewriting
> wbc->nr_to_write with what it thinks it did, not what really
> happened. So, where did this come from?
> 
> <git blame>
> 
> commit 89e12190 - fix bug in nr_to_write introduced by dcf6a79d
> commit dcf6a79d - fix bug in nr_to_write introduced by 05fe478d
> commit 05fe478d - data integrity write fix: ignore nr_to_write for
> 			WB_SYNC_ALL writes.
> 		"This change does indeed make the possibility of
> 		long stalls la[r]ger, and that's not a good thing,
> 		but lying about data integrity is even worse."
> 
> IOWs, the observed sync behaviour is as intended - if you keep
> dirtying the file, sync will keep cleaning it because it defaults to
> being safe. I'd say "not a bug" then. I agree it's not ideal, but
> until Jan's inode sync sweep code is accepted I don't think there's
> much that can be done about it.
> 
> However, what this is doing to XFS writeback is really, really nasty
> - it's effectively causing single page allocation and IO submission
> instead of doing it in much, much larger chunks.
> 
> Adding a wbc trace to xfs_vm_writepage(), I see:
> 
>     flush-253:16-2586  [000] 620402.417931: wbc_writepage: dev 253:16 wbc=3d20 towrt=1024 skip=0 sb=0 mode=0 kupd=0 bgrd=1 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
>     flush-253:16-2586  [000] 620402.442765: wbc_writepage: dev 253:16 wbc=3d20 towrt=0 skip=0 sb=0 mode=0 kupd=0 bgrd=1 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
>     flush-253:16-2586  [000] 620402.442899: wbc_writepage: dev 253:16 wbc=3d20 towrt=-1 skip=0 sb=0 mode=0 kupd=0 bgrd=1 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
>     flush-253:16-2586  [000] 620402.442910: wbc_writepage: dev 253:16 wbc=3d20 towrt=-5 skip=0 sb=0 mode=0 kupd=0 bgrd=1 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
>     flush-253:16-2586  [000] 620402.442918: wbc_writepage: dev 253:16 wbc=3d20 towrt=-21 skip=0 sb=0 mode=0 kupd=0 bgrd=1 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
>     flush-253:16-2586  [000] 620402.442927: wbc_writepage: dev 253:16 wbc=3d20 towrt=-85 skip=0 sb=0 mode=0 kupd=0 bgrd=1 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
> 
> Which shows why XFS is having problems. Basically, if a filesystem
> writes more than one page in ->writepage and updates
> wbc->nr_to_write to indicate this, write_cache_pages completely
> ignores it. IOWs, write_cache_pages() wants to call ->writepage()
> nr_to_write times, not write nr_to_write pages.  And by sending a
> negative number down to ->writepage, XFS is writing a single page
> and then returning, completely defeating all the page clustering
> optimisations XFS has in ->writepage....
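
The mismatch, condensed (a sketch of the accounting only, not the actual
source - next_dirty_page() is just a stand-in for the real tagged pagevec
walk):

	/* write_cache_pages() */
	long nr_to_write = wbc->nr_to_write;		/* private copy */

	while ((page = next_dirty_page(mapping))) {
		/*
		 * ->writepage() may cluster and write N extra pages,
		 * subtracting them from wbc->nr_to_write ...
		 */
		ret = (*writepage)(page, wbc, data);

		/* ... but only the private copy is checked, once per call */
		if (nr_to_write > 0) {
			nr_to_write--;
			if (nr_to_write == 0 &&
			    wbc->sync_mode == WB_SYNC_NONE)
				break;
		}
	}
	wbc->nr_to_write = nr_to_write;	/* overwrites the fs's updates */

So wbc->nr_to_write goes negative well before the loop terminates, and
that negative value is what ->writepage sees when deciding whether to
cluster.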
> 
> I'll post some patches for the tracing and the XFS fixes soon, but I
> don't have anything for sync beyond a better understanding, though...
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: endless sync on bdi_sched_wait()? 2.6.33.1
  2010-04-19  7:04     ` Dave Chinner
  2010-04-19  7:23       ` Dave Chinner
@ 2010-04-21  0:33       ` Jan Kara
  2010-04-21  1:54         ` Dave Chinner
  1 sibling, 1 reply; 16+ messages in thread
From: Jan Kara @ 2010-04-21  0:33 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Jan Kara, Denys Fedorysychenko, Alexander Viro, linux-fsdevel,
	linux-kernel

On Mon 19-04-10 17:04:58, Dave Chinner wrote:
> On Mon, Apr 19, 2010 at 11:37:02AM +1000, Dave Chinner wrote:
> > On Thu, Apr 08, 2010 at 11:28:50AM +0200, Jan Kara wrote:
> > > > SUPERPROXY ~ # cat /proc/1753/stack
> > > > [<c019a93c>] bdi_sched_wait+0x8/0xc
> > > > [<c019a807>] wait_on_bit+0x20/0x2c
> > > > [<c019a9af>] sync_inodes_sb+0x6f/0x10a
> > > > [<c019dd53>] __sync_filesystem+0x28/0x49
> > > > [<c019ddf3>] sync_filesystems+0x7f/0xc0
> > > > [<c019de7a>] sys_sync+0x1b/0x2d
> > > > [<c02f7a25>] syscall_call+0x7/0xb
> > > > [<ffffffff>] 0xffffffff
> > >   Hmm, I guess you are observing the problem reported in
> > > https://bugzilla.kernel.org/show_bug.cgi?id=14830
> > >   There seem to be several issues in the per-bdi writeback code that
> > > cause sync on a busy filesystem to last almost forever. To that bug are
> > > attached two patches that fix two issues but apparently it's not all.
> > > I'm still looking into it...
> > 
> > So Jens' writeback tracing shows this for a normal cycle during a
> > large dd:
> > 
> >            <...>-6030  [005] 604446.696454: writeback_sched: work=38c0, task=task
> >     flush-253:16-6029  [002] 604446.696492: writeback_exec: work=38c0 pages=9223372036854775807, sb=0, kupdate=0, range_cyclic=-1 for_background=-1
> >     flush-253:16-6029  [002] 604446.696493: writeback_clear: work=ffff88011f1a38c0, refs=1
> >     flush-253:16-6029  [003] 604446.784240: writeback_pages_written: 1024
> > 
> > There were 100 of these traces (102400 pages (400MB) which is exactly 10% of
> > RAM) before this:
> > 
> >            <...>-6030  [000] 604462.346329: writeback_sched: work=6c0, task=task
> >     flush-253:16-6029  [002] 604462.267098: writeback_exec: work=6c0 pages=9223372036854775807, sb=0, kupdate=0, range_cyclic=-1 for_background=-1
> >     flush-253:16-6029  [002] 604462.267101: writeback_clear: work=ffff88011e8006c0, refs=1
> >     flush-253:16-6029  [001] 604465.406331: writeback_pages_written: 160771
> > 
> > which shows 160771 pages written in a single iteration (650MB). I suspect some
> > interaction between foreground and background writeout is occurring here.
> > 
> > The flusher thread appears to be getting stuck on congestion - the wchan
> > it is sitting on indicates it is blocking in get_request().  I'm going to
> > extend this tracing further down into the writeback code so that what is
> > happening is clearer...
> 
> OK, I moved to a VM with less memory (1GB vs 4GB) and slower block
> devices (100MB/s vs 500MB/s), and now it's pretty clear what is
> happening. I'll go through the traces.
> 
> To start with, sync() queues up several tasks to the bdi flush daemon:
> 
> SYSCALL_DEFINE0(sync)
> {
>         wakeup_flusher_threads(0);
>         sync_filesystems(0);
>         sync_filesystems(1);
>         if (unlikely(laptop_mode))
>                 laptop_sync_completion();
>         return 0;
> }
> 
> 
> First is via wakeup_flusher_threads() - an async flush:
> 
>             sync-2499  [000] 616072.710212: writeback_queue: 253:16: pages=13818, sb=0, kupdate=0, range_cyclic=0 for_background=0
>             sync-2499  [000] 616072.710217: writeback_sched: work=13c0, task=task
>                                                                   ^^^^
> second is a per-sb async flush via sync_filesystems(0):
> 
>             sync-2499  [000] 616072.717181: writeback_queue: 253:16: pages=15761, sb=1, kupdate=0, range_cyclic=-1 for_background=0
>             sync-2499  [000] 616072.717191: writeback_sched: work=3840, task=task
>                                                                   ^^^^
> And finally there is the sync flush via sync_filesystems(1):
> 
>             sync-2499  [000] 616072.737836: writeback_queue: 253:16: pages=9223372036854775807, sb=1, kupdate=0, range_cyclic=0 for_background=0
>             sync-2499  [000] 616072.737840: writeback_sched: work=3e58, task=task
>                                                                   ^^^^
> 
> The first async flush does:
>                                                                  vvvv
>     flush-253:16-2497  [000] 616072.897747: writeback_exec: work=13c0 pages=13818, sb=0, kupdate=0, range_cyclic=0 for_background=0
>     flush-253:16-2497  [000] 616072.897748: writeback_clear: work=ffff88003d8813c0, refs=1
>     flush-253:16-2497  [000] 616072.897753: wbc_writeback_start: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=0 more=0 older=0x0 start=0x0 end=0x7fffffffffffffff
>     flush-253:16-2497  [000] 616072.897768: wbc_writeback_written: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=0 more=0 older=0x0 start=0x0 end=0x7fffffffffffffff
> 
> Nothing - it does not write any pages; towrt (nr_to_write) is
> unchanged by the attempted flush.
  This looks a bit strange. Surely there are plenty of dirty pages. I guess
we never get to ->writepages for XFS.  But then I wonder how it happens
that we return without more_io set. Strange.
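
  For reference, the loop around those tracepoints in wb_writeback() looks
roughly like this (a from-memory sketch of the 2.6.33-era code, not a
verbatim excerpt):

	for (;;) {
		...
		wbc.more_io = 0;
		wbc.nr_to_write = MAX_WRITEBACK_PAGES;	/* 1024 */
		trace_wbc_writeback_start(&wbc);
		writeback_inodes_wb(wb, &wbc);
		trace_wbc_writeback_written(&wbc);

		if (wbc.nr_to_write <= 0)
			continue;	/* wrote a full chunk, try for more */
		if (!wbc.more_io)
			break;		/* wrote nothing, nothing requeued */
		...
	}

So a pass that writes nothing and leaves more_io clear just ends the work
item - the question is why more_io stayed clear here.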

> The second async flush:
>                                                                  vvvv
>     flush-253:16-2497  [000] 616072.897769: writeback_exec: work=3840 pages=15761, sb=1, kupdate=0, range_cyclic=-1 for_background=0
>     flush-253:16-2497  [000] 616072.897770: writeback_clear: work=ffff88003fb53840, refs=1
>     flush-253:16-2497  [000] 616072.897771: wbc_writeback_start: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=1 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
>     flush-253:16-2497  [000] 616072.897783: wbc_writeback_written: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=1 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
> 
> Differs in setup only by range_cyclic=1 instead of zero, and it also
> fails to write anything.
> 
> The third flush - the sync one - does:
>                                                                  vvvv
>     flush-253:16-2497  [000] 616072.897784: writeback_exec: work=3e58 pages=9223372036854775807, sb=1, kupdate=0, range_cyclic=0 for_background=0
>     flush-253:16-2497  [000] 616072.897785: wbc_writeback_start: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=1 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 more=0 older=0x0 start=0x0 end=0x7fffffffffffffff
> 
> some 75 seconds later having written only 1024 pages. In the mean
> time, the traces show dd blocked in balance_dirty_pages():
> 
>               dd-2498  [000] 616072.908675: wbc_balance_dirty_start: dev 253:16 wbc=fb68 towrt=1536 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
>               dd-2498  [000] 616072.908679: wbc_balance_dirty_wait: dev 253:16 wbc=fb68 towrt=1536 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
>               dd-2498  [000] 616073.238785: wbc_balance_dirty_start: dev 253:16 wbc=fb68 towrt=1536 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
>               dd-2498  [000] 616073.238788: wbc_balance_dirty_wait: dev 253:16 wbc=fb68 towrt=1536 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
> And it appears to stay blocked there without doing any writeback at
> all - there are no wbc_balance_dirty_pages_written traces at all.
> That is, it is blocking until the number of dirty pages is dropping
> below the dirty threshold, then continuing to write and dirty more
> pages.
  I think this happens because sync writeback is running so I_SYNC is set
and thus we cannot do any writeout for the inode from balance_dirty_pages.
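
  I.e. the skip at the top of writeback_single_inode(), roughly (sketched
from memory, not the verbatim source):

	if (inode->i_state & I_SYNC) {
		/*
		 * Writeback of this inode is already in flight (here the
		 * sync flusher would hold I_SYNC), so a WB_SYNC_NONE caller
		 * like balance_dirty_pages() just defers the inode and
		 * writes nothing itself.
		 */
		if (wbc->sync_mode != WB_SYNC_ALL) {
			requeue_io(inode);
			return 0;
		}
		/* WB_SYNC_ALL callers instead wait for it to finish */
		...
	}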

> This continues for another 75 seconds, until the dd completes and
> then the sync flush thread completes:
> 
>     flush-253:16-2497  [000] 616145.763145: wbc_writeback_written: dev 253:16 wbc=9d20 towrt=0 skip=0 sb=1 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 more=1 older=0x0 start=0x0 end=0x7fffffffffffffff
>     flush-253:16-2497  [000] 616145.763148: wbc_writeback_start: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=1 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 more=0 older=0x0 start=0x0 end=0x7fffffffffffffff
>     flush-253:16-2497  [000] 616145.763160: wbc_writeback_written: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=1 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 more=0 older=0x0 start=0x0 end=0x7fffffffffffffff
>     flush-253:16-2497  [000] 616145.763161: writeback_clear: work=ffff88002e783e58, refs=1
>                                                                              ^^^^^
...
> <git blame>
> 
> commit 89e12190 - fix bug in nr_to_write introduced by dcf6a79d
> commit dcf6a79d - fix bug in nr_to_write introduced by 05fe478d
> commit 05fe478d - data integrity write fix: ignore nr_to_write for
> 			WB_SYNC_ALL writes.
> 		"This change does indeed make the possibility of
> 		long stalls la[r]ger, and that's not a good thing,
> 		but lying about data integrity is even worse."
> 
> IOWs, the observed sync behaviour is as intended - if you keep
> dirtying the file, sync will keep cleaning it because it defaults to
> being safe. I'd say "not a bug" then. I agree it's not ideal, but
> until Jan's inode sync sweep code is accepted I don't think there's
> much that can be done about it.
  Yes, my writeback sweeping patch was aimed exactly to reliably address
this issue. Anyway, if we could get the async stuff working properly then I
think livelocks should happen much less often... Need to really find some
time for this.

									Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: endless sync on bdi_sched_wait()? 2.6.33.1
  2010-04-21  0:33       ` Jan Kara
@ 2010-04-21  1:54         ` Dave Chinner
  2010-04-21 13:27           ` Jan Kara
  0 siblings, 1 reply; 16+ messages in thread
From: Dave Chinner @ 2010-04-21  1:54 UTC (permalink / raw)
  To: Jan Kara
  Cc: Denys Fedorysychenko, Alexander Viro, linux-fsdevel, linux-kernel

On Wed, Apr 21, 2010 at 02:33:09AM +0200, Jan Kara wrote:
> On Mon 19-04-10 17:04:58, Dave Chinner wrote:
> > The first async flush does:
> >                                                                  vvvv
> >     flush-253:16-2497  [000] 616072.897747: writeback_exec: work=13c0 pages=13818, sb=0, kupdate=0, range_cyclic=0 for_background=0
> >     flush-253:16-2497  [000] 616072.897748: writeback_clear: work=ffff88003d8813c0, refs=1
> >     flush-253:16-2497  [000] 616072.897753: wbc_writeback_start: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=0 more=0 older=0x0 start=0x0 end=0x7fffffffffffffff
> >     flush-253:16-2497  [000] 616072.897768: wbc_writeback_written: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=0 more=0 older=0x0 start=0x0 end=0x7fffffffffffffff
> > 
> > Nothing - it does not write any pages; towrt (nr_to_write) is
> > unchanged by the attempted flush.
>   This looks a bit strange. Surely there are plenty of dirty pages. I guess
> we never get to ->writepages for XFS.  But then I wonder how it happens
> that we return without more_io set. Strange.

more_io not being set implies that we aren't calling requeue_io().
So that means it's not caused by I_SYNC being set. If we get down to
write_cache_pages, it implies that there are no dirty pages
remaining that we can write back.
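
That is, more_io only gets set when there is known work left over -
roughly (a sketch, not the verbatim 2.6.33 code):

	if (wbc->nr_to_write <= 0) {
		/* ran out of budget with pages still dirty */
		wbc->more_io = 1;
		break;
	}
	if (!list_empty(&wb->b_more_io))
		wbc->more_io = 1;	/* something was requeue_io()'d */

With more_io clear, neither condition fired, so there was nothing left
for the flush to push.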

Given that a background flush had just completed before this queued async
flush was executed (it didn't seem relevant, so I didn't include
it), it is entirely possible that there were no dirty pages to
write in the follow-up async flushes.

> > The third flush - the sync one - does:
> >                                                                  vvvv
> >     flush-253:16-2497  [000] 616072.897784: writeback_exec: work=3e58 pages=9223372036854775807, sb=1, kupdate=0, range_cyclic=0 for_background=0
> >     flush-253:16-2497  [000] 616072.897785: wbc_writeback_start: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=1 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 more=0 older=0x0 start=0x0 end=0x7fffffffffffffff
> > 
> > some 75 seconds later having written only 1024 pages. In the mean
> > time, the traces show dd blocked in balance_dirty_pages():
> > 
> >               dd-2498  [000] 616072.908675: wbc_balance_dirty_start: dev 253:16 wbc=fb68 towrt=1536 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
> >               dd-2498  [000] 616072.908679: wbc_balance_dirty_wait: dev 253:16 wbc=fb68 towrt=1536 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
> >               dd-2498  [000] 616073.238785: wbc_balance_dirty_start: dev 253:16 wbc=fb68 towrt=1536 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
> >               dd-2498  [000] 616073.238788: wbc_balance_dirty_wait: dev 253:16 wbc=fb68 towrt=1536 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
> > And it appears to stay blocked there without doing any writeback at
> > all - there are no wbc_balance_dirty_pages_written traces at all.
> > That is, it is blocking until the number of dirty pages is dropping
> > below the dirty threshold, then continuing to write and dirty more
> > pages.
>   I think this happens because sync writeback is running so I_SYNC is set
> and thus we cannot do any writeout for the inode from balance_dirty_pages.

It's not even calling into writeback so the I_SYNC flag is way out of
scope ;)

> > <git blame>
> > 
> > commit 89e12190 - fix bug in nr_to_write introduced by dcf6a79d
> > commit dcf6a79d - fix bug in nr_to_write introduced by 05fe478d
> > commit 05fe478d - data integrity write fix: ignore nr_to_write for
> > 			WB_SYNC_ALL writes.
> > 		"This change does indeed make the possibility of
> > 		long stalls la[r]ger, and that's not a good thing,
> > 		but lying about data integrity is even worse."
> > 
> > IOWs, the observed sync behaviour is as intended - if you keep
> > dirtying the file, sync will keep cleaning it because it defaults to
> > being safe. I'd say "not a bug" then. I agree it's not ideal, but
> > until Jan's inode sync sweep code is accepted I don't think there's
> > much that can be done about it.
>   Yes, my writeback sweeping patch was aimed exactly to reliably address
> this issue. Anyway, if we could get the async stuff working properly then I
> think livelocks should happen much less often... Need to really find some
> time for this.

I think the async writeback is working correctly. It's just that if
we queue async writeback, and it runs directly after a previous
async writeback command was executed, it's possible it has nothing
to do. The problem is that the sync writeback will wait on pages
under writeback as it finds them, so it's likely to be running while
more pages get dirtied, and that's when the tail-chase in
write_cache_pages() starts.
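
In write_cache_pages() terms the combination is (condensed from the code
quoted earlier plus the integrity-sync wait, not a verbatim excerpt):

	if (PageWriteback(page)) {
		if (wbc->sync_mode != WB_SYNC_NONE)
			wait_on_page_writeback(page);	/* sync waits here */
		else
			continue;			/* async skips it */
	}
	ret = (*writepage)(page, wbc, data);
	if (nr_to_write > 0) {
		nr_to_write--;
		if (nr_to_write == 0 && wbc->sync_mode == WB_SYNC_NONE)
			break;	/* only async flushes stop on the budget */
	}

So the sync flush keeps walking the file, waiting on and then rewriting
pages that dd dirties behind it.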

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: endless sync on bdi_sched_wait()? 2.6.33.1
  2010-04-21  1:54         ` Dave Chinner
@ 2010-04-21 13:27           ` Jan Kara
  2010-04-22  0:06             ` Dave Chinner
  0 siblings, 1 reply; 16+ messages in thread
From: Jan Kara @ 2010-04-21 13:27 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Jan Kara, Denys Fedorysychenko, Alexander Viro, linux-fsdevel,
	linux-kernel

On Wed 21-04-10 11:54:28, Dave Chinner wrote:
> On Wed, Apr 21, 2010 at 02:33:09AM +0200, Jan Kara wrote:
> > On Mon 19-04-10 17:04:58, Dave Chinner wrote:
> > > The first async flush does:
> > >                                                                  vvvv
> > >     flush-253:16-2497  [000] 616072.897747: writeback_exec: work=13c0 pages=13818, sb=0, kupdate=0, range_cyclic=0 for_background=0
> > >     flush-253:16-2497  [000] 616072.897748: writeback_clear: work=ffff88003d8813c0, refs=1
> > >     flush-253:16-2497  [000] 616072.897753: wbc_writeback_start: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=0 more=0 older=0x0 start=0x0 end=0x7fffffffffffffff
> > >     flush-253:16-2497  [000] 616072.897768: wbc_writeback_written: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=0 more=0 older=0x0 start=0x0 end=0x7fffffffffffffff
> > > 
> > > Nothing - it does not write any pages; towrt (nr_to_write) is
> > > unchanged by the attempted flush.
> >   This looks a bit strange. Surely there are plenty of dirty pages. I guess
> > we never get to ->writepages for XFS.  But then I wonder how it happens
> > that we return without more_io set. Strange.
> 
> more_io not being set implies that we aren't calling requeue_io().
> So that means it's not caused by I_SYNC being set. If we get down to
> write_cache_pages, it implies that there are no dirty pages
> remaining that we can write back.
> 
> Given that a background flush had just completed before this queued async
> flush was executed (it didn't seem relevant, so I didn't include
> it), it is entirely possible that there were no dirty pages to
> write in the follow-up async flushes.
  Ah, OK.

> > > The third flush - the sync one - does:
> > >                                                                  vvvv
> > >     flush-253:16-2497  [000] 616072.897784: writeback_exec: work=3e58 pages=9223372036854775807, sb=1, kupdate=0, range_cyclic=0 for_background=0
> > >     flush-253:16-2497  [000] 616072.897785: wbc_writeback_start: dev 253:16 wbc=9d20 towrt=1024 skip=0 sb=1 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 more=0 older=0x0 start=0x0 end=0x7fffffffffffffff
> > > 
> > > some 75 seconds later having written only 1024 pages. In the mean
> > > time, the traces show dd blocked in balance_dirty_pages():
> > > 
> > >               dd-2498  [000] 616072.908675: wbc_balance_dirty_start: dev 253:16 wbc=fb68 towrt=1536 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
> > >               dd-2498  [000] 616072.908679: wbc_balance_dirty_wait: dev 253:16 wbc=fb68 towrt=1536 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
> > >               dd-2498  [000] 616073.238785: wbc_balance_dirty_start: dev 253:16 wbc=fb68 towrt=1536 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
> > >               dd-2498  [000] 616073.238788: wbc_balance_dirty_wait: dev 253:16 wbc=fb68 towrt=1536 skip=0 sb=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0
> > > And it appears to stay blocked there without doing any writeback at
> > > all - there are no wbc_balance_dirty_pages_written traces at all.
> > > That is, it is blocking until the number of dirty pages is dropping
> > > below the dirty threshold, then continuing to write and dirty more
> > > pages.
> >   I think this happens because sync writeback is running so I_SYNC is set
> > and thus we cannot do any writeout for the inode from balance_dirty_pages.
> 
> It's not even calling into writeback so the I_SYNC flag is way out of
> scope ;)
  Are you sure? The tracepoints are in wb_writeback() but
writeback_inodes_wbc() calls directly into writeback_inodes_wb(), so you
won't see any of those tracepoints trigger. So how do you know we didn't
get to writeback_single_inode?

> > > <git blame>
> > > 
> > > commit 89e12190 - fix bug in nr_to_write introduced by dcf6a79d
> > > commit dcf6a79d - fix bug in nr_to_write introduced by 05fe478d
> > > commit 05fe478d - data integrity write fix: ignore nr_to_write for
> > > 			WB_SYNC_ALL writes.
> > > 		"This change does indeed make the possibility of
> > > 		long stalls la[r]ger, and that's not a good thing,
> > > 		but lying about data integrity is even worse."
> > > 
> > > IOWs, the observed sync behaviour is as intended - if you keep
> > > dirtying the file, sync will keep cleaning it because it defaults to
> > > being safe. I'd say "not a bug" then. I agree it's not ideal, but
> > > until Jan's inode sync sweep code is accepted I don't think there's
> > > much that can be done about it.
> >   Yes, my writeback sweeping patch was aimed exactly to reliably address
> > this issue. Anyway, if we could get the async stuff working properly then I
> > think livelocks should happen much less often... Need to really find some
> > time for this.
> 
> I think the async writeback is working correctly. It's just that if
> we queue async writeback, and it runs directly after a previous
> async writeback command was executed, it's possible it has nothing
> to do. The problem is that the sync writeback will wait on pages
> under writeback as it finds them, so it's likely to be running while
> more pages get dirtied, and that's when the tail-chase in
> write_cache_pages() starts.
  OK, probably you are right...
								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: endless sync on bdi_sched_wait()? 2.6.33.1
  2010-04-21 13:27           ` Jan Kara
@ 2010-04-22  0:06             ` Dave Chinner
  2010-04-22 12:48               ` Jan Kara
  0 siblings, 1 reply; 16+ messages in thread
From: Dave Chinner @ 2010-04-22  0:06 UTC (permalink / raw)
  To: Jan Kara
  Cc: Denys Fedorysychenko, Alexander Viro, linux-fsdevel, linux-kernel

On Wed, Apr 21, 2010 at 03:27:18PM +0200, Jan Kara wrote:
> On Wed 21-04-10 11:54:28, Dave Chinner wrote:
> > On Wed, Apr 21, 2010 at 02:33:09AM +0200, Jan Kara wrote:
> > > On Mon 19-04-10 17:04:58, Dave Chinner wrote:
> > > > The third flush - the sync one - does:
.....
> > > > some 75 seconds later having written only 1024 pages. In the mean
> > > > time, the traces show dd blocked in balance_dirty_pages():
.....
> > > > And it appears to stay blocked there without doing any writeback at
> > > > all - there are no wbc_balance_dirty_pages_written traces at all.
> > > > That is, it is blocking until the number of dirty pages is dropping
> > > > below the dirty threshold, then continuing to write and dirty more
> > > > pages.
> > >   I think this happens because sync writeback is running so I_SYNC is set
> > > and thus we cannot do any writeout for the inode from balance_dirty_pages.
> > 
> > It's not even calling into writeback so the I_SYNC flag is way out of
> > scope ;)
>   Are you sure? The tracepoints are in wb_writeback() but
> writeback_inodes_wbc() calls directly into writeback_inodes_wb(), so you
> won't see any of those tracepoints trigger. So how do you know we didn't
> get to writeback_single_inode?

The balance_dirty_pages() tracing code added this hunk:

@@ -536,11 +537,13 @@ static void balance_dirty_pages(struct address_space *mapping,
                 * threshold otherwise wait until the disk writes catch
                 * up.
                 */
+               trace_wbc_balance_dirty_start(&wbc);
                if (bdi_nr_reclaimable > bdi_thresh) {
                        writeback_inodes_wbc(&wbc);
                        pages_written += write_chunk - wbc.nr_to_write;
                        get_dirty_limits(&background_thresh, &dirty_thresh,
                                       &bdi_thresh, bdi);
+                       trace_wbc_balance_dirty_written(&wbc);
                }

                /*

So if we had tried to do writeback from here, the
wbc_balance_dirty_written trace would have been emitted, and it is
not showing up very often in any of the traces, e.g.:

$ grep balance t.t |grep start |wc -l
4356
$ grep balance t.t |grep wait |wc -l
2171
$ grep balance t.t |grep written |wc -l
7

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: endless sync on bdi_sched_wait()? 2.6.33.1
  2010-04-22  0:06             ` Dave Chinner
@ 2010-04-22 12:48               ` Jan Kara
  0 siblings, 0 replies; 16+ messages in thread
From: Jan Kara @ 2010-04-22 12:48 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Jan Kara, Denys Fedorysychenko, Alexander Viro, linux-fsdevel,
	linux-kernel

On Thu 22-04-10 10:06:52, Dave Chinner wrote:
> On Wed, Apr 21, 2010 at 03:27:18PM +0200, Jan Kara wrote:
> > On Wed 21-04-10 11:54:28, Dave Chinner wrote:
> > > On Wed, Apr 21, 2010 at 02:33:09AM +0200, Jan Kara wrote:
> > > > On Mon 19-04-10 17:04:58, Dave Chinner wrote:
> > > > > The third flush - the sync one - does:
> .....
> > > > > some 75 seconds later having written only 1024 pages. In the mean
> > > > > time, the traces show dd blocked in balance_dirty_pages():
> .....
> > > > > And it appears to stay blocked there without doing any writeback at
> > > > > all - there are no wbc_balance_dirty_pages_written traces at all.
> > > > > That is, it is blocking until the number of dirty pages is dropping
> > > > > below the dirty threshold, then continuing to write and dirty more
> > > > > pages.
> > > >   I think this happens because sync writeback is running so I_SYNC is set
> > > > and thus we cannot do any writeout for the inode from balance_dirty_pages.
> > > 
> > > It's not even calling into writeback so the I_SYNC flag is way out of
> > > scope ;)
> >   Are you sure? The tracepoints are in wb_writeback() but
> > writeback_inodes_wbc() calls directly into writeback_inodes_wb(), so you
> > won't see any of those tracepoints trigger. So how do you know we didn't
> > get to writeback_single_inode?
> 
> The balance_dirty_pages() tracing code added this hunk:
> 
> @@ -536,11 +537,13 @@ static void balance_dirty_pages(struct address_space *mapping,
>                  * threshold otherwise wait until the disk writes catch
>                  * up.
>                  */
> +               trace_wbc_balance_dirty_start(&wbc);
>                 if (bdi_nr_reclaimable > bdi_thresh) {
>                         writeback_inodes_wbc(&wbc);
>                         pages_written += write_chunk - wbc.nr_to_write;
>                         get_dirty_limits(&background_thresh, &dirty_thresh,
>                                        &bdi_thresh, bdi);
> +                       trace_wbc_balance_dirty_written(&wbc);
>                 }
> 
>                 /*
> 
> So if we tried to do writeback from here, the
> wbc_balance_dirty_written trace would have been emitted, and that is
> not showing up very often in any of the traces. e.g:
> 
> $ grep balance t.t |grep start |wc -l
> 4356
> $ grep balance t.t |grep wait |wc -l
> 2171
> $ grep balance t.t |grep written |wc -l
> 7
  Ah, OK. I missed the 'written' trace. Thanks for the explanation. So it
means that enough pages are under writeback and we just wait in
balance_dirty_pages for the writes to finish. That works as expected. Fine.
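
  In balance_dirty_pages() terms the behaviour reduces to roughly this
(a sketch built around the hunk above; the actual sleep primitive is
elided):

	for (;;) {
		get_dirty_limits(&background_thresh, &dirty_thresh,
				 &bdi_thresh, bdi);
		...
		if (bdi_nr_reclaimable > bdi_thresh) {
			/* dirty pages not yet queued for IO: write some */
			writeback_inodes_wbc(&wbc);
			pages_written += write_chunk - wbc.nr_to_write;
		}
		if (pages_written >= write_chunk)
			break;		/* done our share */
		/*
		 * Otherwise everything is already under writeback, so dd
		 * just sleeps briefly and re-samples the limits until the
		 * IO completes and the bdi drops back under its threshold.
		 */
	}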

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2010-04-22 12:48 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-03-31 16:07 endless sync on bdi_sched_wait()? 2.6.33.1 Denys Fedorysychenko
2010-03-31 22:12 ` Dave Chinner
2010-04-01 10:42   ` Denys Fedorysychenko
2010-04-01 11:13     ` Dave Chinner
2010-04-01 20:14       ` Jeff Moyer
2010-04-08  9:28 ` Jan Kara
2010-04-08 10:12   ` Denys Fedorysychenko
2010-04-12  0:47   ` Dave Chinner
2010-04-19  1:37   ` Dave Chinner
2010-04-19  7:04     ` Dave Chinner
2010-04-19  7:23       ` Dave Chinner
2010-04-21  0:33       ` Jan Kara
2010-04-21  1:54         ` Dave Chinner
2010-04-21 13:27           ` Jan Kara
2010-04-22  0:06             ` Dave Chinner
2010-04-22 12:48               ` Jan Kara
