* ppoll() stuck on POLLIN while TCP peer is sending
@ 2012-12-28  1:45 Eric Wong
  2012-12-28  7:06 ` Eric Wong
                  ` (2 more replies)
  0 siblings, 3 replies; 53+ messages in thread

From: Eric Wong @ 2012-12-28 1:45 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: Andreas Voellmy, viro, linux-fsdevel, Junchang(Jason) Wang

I'm finding ppoll() unexpectedly stuck when waiting for POLLIN on a
local TCP socket.  The isolated code below can reproduce the issue
after many minutes (<1 hour).  It might be easier to reproduce on a
busy system while disk I/O is happening.

This may also be related to an epoll-related issue reported by
Andreas Voellmy:
http://thread.gmane.org/gmane.linux.kernel/1408782/

My example involves a 3-thread data flow between two pairs of (4)
sockets:

	send_loop -> recv_loop(recv_send) -> recv_loop(recv_only)
	pair_a[1] -> (pair_a[0] -> pair_b[1]) -> pair_b[0]

At least 3.7 and 3.7.1 are affected.

I have tcp_low_latency=1 set; I will try 0 later.

The last progress message I got was after receiving 2942052597760
bytes on fd=7 (out of 64-bit ULONG_MAX / 2).

strace:

3644  sendto(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384, 0, NULL, 0 <unfinished ...>
3643  sendto(6, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384, 0, NULL, 0 <unfinished ...>
3642  ppoll([{fd=7, events=POLLIN}], 1, NULL, NULL, 8 <unfinished ...>
3641  futex(0x7f23ed8129d0, FUTEX_WAIT, 3644, NULL <unfinished ...>

The first and last lines of the strace are expected:

+ 3644 sendto(4) is blocked because 3643 is blocked on sendto(fd=6)
  and not able to call recv().
+ 3641 is the main thread calling pthread_join.

What is unexpected is the tid=3643 and tid=3642 interaction.  As
confirmed by lsof below, fd=6 is sending to wake up fd=7, but
ppoll(fd=7) seems to not be waking up.

lsof:
toosleepy 3641 ew 4u IPv4 12405 0t0 TCP localhost:55904->localhost:33249 (ESTABLISHED)
toosleepy 3641 ew 5u IPv4 12406 0t0 TCP localhost:33249->localhost:55904 (ESTABLISHED)
toosleepy 3641 ew 6u IPv4 12408 0t0 TCP localhost:48777->localhost:33348 (ESTABLISHED)
toosleepy 3641 ew 7u IPv4 12409 0t0 TCP localhost:33348->localhost:48777 (ESTABLISHED)

System info: Linux 3.7.1 x86_64 SMP PREEMPT
AMD Phenom(tm) II X4 945 Processor (4 cores)
Nothing interesting in dmesg; iptables rules are empty.

I have not yet been able to reproduce the issue using UNIX sockets,
only TCP, but you can run:

	./toosleepy unix

...to test with UNIX sockets instead of TCP.
The following code is also available via git://bogomips.org/toosleepy
gcc -o toosleepy -O2 -Wall -lpthread toosleepy.c
-------------------------------- 8< ------------------------------------
#define _GNU_SOURCE
#include <poll.h>
#include <sys/ioctl.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/tcp.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <assert.h>
#include <limits.h>

struct receiver {
	int rfd;
	int sfd;
};

/* blocking sender */
static void * send_loop(void *fdp)
{
	int fd = *(int *)fdp;
	char buf[16384];
	ssize_t s;
	size_t sent = 0;
	size_t max = (size_t)ULONG_MAX / 2;

	while (sent < max) {
		s = send(fd, buf, sizeof(buf), 0);
		if (s > 0)
			sent += s;
		if (s == -1)
			assert(errno == EINTR);
	}
	dprintf(2, "%d done sending: %zu\n", fd, sent);
	close(fd);
	return NULL;
}

/* non-blocking receiver, using ppoll */
static void * recv_loop(void *p)
{
	const struct receiver *rcvr = p;
	char buf[16384];
	nfds_t nfds = 1;
	struct pollfd fds;
	int rc;
	ssize_t r, s;
	size_t received = 0;
	size_t sent = 0;

	for (;;) {
		r = recv(rcvr->rfd, buf, sizeof(buf), 0);
		if (r == 0) {
			break;
		} else if (r == -1) {
			assert(errno == EAGAIN);

			fds.fd = rcvr->rfd;
			fds.events = POLLIN;
			errno = 0;
			rc = ppoll(&fds, nfds, NULL, NULL);
			assert(rc == 1);
		} else {
			assert(r > 0);
			received += r;
			if (rcvr->sfd >= 0) {
				s = send(rcvr->sfd, buf, sizeof(buf), 0);
				if (s > 0)
					sent += s;
				if (s == -1)
					assert(errno == EINTR);
			} else {
				/* just burn some cycles */
				write(-1, buf, sizeof(buf));
			}
		}
		if ((received % (sizeof(buf) * sizeof(buf) * 16) == 0))
			dprintf(2, " %d progress: %zu\n",
				rcvr->rfd, received);
	}
	dprintf(2, "%d got: %zu\n", rcvr->rfd, received);
	if (rcvr->sfd >= 0) {
		dprintf(2, "%d sent: %zu\n", rcvr->sfd, sent);
		close(rcvr->sfd);
	}

	return NULL;
}

static void tcp_socketpair(int sv[2], int accept_flags)
{
	struct sockaddr_in addr;
	socklen_t addrlen = sizeof(addr);
	int l = socket(PF_INET, SOCK_STREAM, 0);
	int c = socket(PF_INET, SOCK_STREAM, 0);
	int a;

	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = INADDR_ANY;
	addr.sin_port = 0;
	assert(0 == bind(l, (struct sockaddr*)&addr, addrlen));
	assert(0 == listen(l, 1024));
	assert(0 == getsockname(l, (struct sockaddr *)&addr, &addrlen));
	assert(0 == connect(c, (struct sockaddr *)&addr, addrlen));
	a = accept4(l, NULL, NULL, accept_flags);
	assert(a >= 0);
	close(l);
	sv[0] = a;
	sv[1] = c;
}

int main(int argc, char *argv[])
{
	int pair_a[2];
	int pair_b[2];
	pthread_t s, rs, r;
	struct receiver recv_only;
	struct receiver recv_send;

	if (argc == 2 && strcmp(argv[1], "unix") == 0) {
		int val;
		assert(0 == socketpair(AF_UNIX, SOCK_STREAM, 0, pair_a));
		assert(0 == socketpair(AF_UNIX, SOCK_STREAM, 0, pair_b));
		/* only make the receiver non-blocking */
		val = 1;
		assert(0 == ioctl(pair_a[0], FIONBIO, &val));
		val = 1;
		assert(0 == ioctl(pair_b[0], FIONBIO, &val));
	} else {
		tcp_socketpair(pair_a, SOCK_NONBLOCK);
		tcp_socketpair(pair_b, SOCK_NONBLOCK);
	}

	recv_send.rfd = pair_a[0];
	recv_send.sfd = pair_b[1];
	recv_only.rfd = pair_b[0];
	recv_only.sfd = -1;

	/*
	 * data flow:
	 * send_loop -> recv_loop(recv_send) -> recv_loop(recv_only)
	 * pair_a[1] -> (pair_a[0] -> pair_b[1]) -> pair_b[0]
	 */
	assert(0 == pthread_create(&r, NULL, recv_loop, &recv_only));
	assert(0 == pthread_create(&rs, NULL, recv_loop, &recv_send));
	assert(0 == pthread_create(&s, NULL, send_loop, &pair_a[1]));
	assert(0 == pthread_join(s, NULL));
	assert(0 == pthread_join(rs, NULL));
	assert(0 == pthread_join(r, NULL));

	return 0;
}
-------------------------------- 8< ------------------------------------
Any help/suggestions/test patches would be greatly appreciated.
Thanks for reading!

-- 
Eric Wong

^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2012-12-28  1:45 ppoll() stuck on POLLIN while TCP peer is sending Eric Wong
@ 2012-12-28  7:06 ` Eric Wong
  2012-12-29 11:34   ` Eric Wong
  2012-12-31 13:21 ` [PATCH] poll: prevent missed events if _qproc is NULL Eric Wong
  2013-01-02 20:08 ` ppoll() stuck on POLLIN while TCP peer is sending Eric Wong
  2 siblings, 1 reply; 53+ messages in thread

From: Eric Wong @ 2012-12-28 7:06 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: Andreas Voellmy, viro, linux-fsdevel, Junchang(Jason) Wang

Eric Wong <normalperson@yhbt.net> wrote:
> I'm finding ppoll() unexpectedly stuck when waiting for POLLIN on a
> local TCP socket.  The isolated code below can reproduce the issue
> after many minutes (<1 hour).  It might be easier to reproduce on
> a busy system while disk I/O is happening.

Ugh, I can't seem to reproduce this anymore...  Will try something
else tomorrow.

^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2012-12-28  7:06 ` Eric Wong
@ 2012-12-29 11:34 ` Eric Wong
  0 siblings, 0 replies; 53+ messages in thread

From: Eric Wong @ 2012-12-29 11:34 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: Andreas Voellmy, viro, linux-fsdevel, Junchang(Jason) Wang

Eric Wong <normalperson@yhbt.net> wrote:
> Ugh, I can't seem to reproduce this anymore...  Will try something
> else tomorrow.

The good news is I'm not imagining this...  The bad news is that the
issue is real and took a long time to reproduce again.

This issue happens even without preempt, and without tcp_low_latency,
on 3.7.1.

While running `toosleepy', I also needed to run heavy (not loopback)
network and disk activity (several USB, SATA, and eSATA drives
simultaneously) for many hours before hitting this.

Hopefully this report is helpful in solving the issue.  Looking at the
various pieces in the net and select/poll paths, there are several
references to race conditions in the comments, so this is hopefully
familiar territory to someone here...

^ permalink raw reply	[flat|nested] 53+ messages in thread
* [PATCH] poll: prevent missed events if _qproc is NULL
  2012-12-28  1:45 ppoll() stuck on POLLIN while TCP peer is sending Eric Wong
  2012-12-28  7:06 ` Eric Wong
@ 2012-12-31 13:21 ` Eric Wong
  2012-12-31 23:24   ` Eric Wong
  2013-01-01 18:42   ` Eric Dumazet
  2013-01-02 20:08 ` ppoll() stuck on POLLIN while TCP peer is sending Eric Wong
  2 siblings, 2 replies; 53+ messages in thread

From: Eric Wong @ 2012-12-31 13:21 UTC (permalink / raw)
  To: linux-kernel
  Cc: Eric Wong, Hans Verkuil, Jiri Olsa, Jonathan Corbet, Al Viro,
	Davide Libenzi, Hans de Goede, Mauro Carvalho Chehab,
	David Miller, Eric Dumazet, Andrew Morton, Linus Torvalds,
	Andreas Voellmy, Junchang(Jason) Wang, netdev, linux-fsdevel

This patch seems to fix my issue with ppoll() being stuck on my
SMP machine: http://article.gmane.org/gmane.linux.file-systems/70414

The change to sock_poll_wait() in
commit 626cf236608505d376e4799adb4f7eb00a8594af
(poll: add poll_requested_events() and poll_does_not_wait() functions)
seems to have allowed additional cases where the SMP memory barrier
is not issued before checking for readiness.

In my case, this affects the select()-family of functions which
register descriptors once and set _qproc to NULL before checking
events again (after poll_schedule_timeout() returns).  The set_mb()
barrier in poll_schedule_timeout() appears to be insufficient on my
SMP x86-64 machine (as it's only an xchg()).

This may also be related to the epoll issue described by
Andreas Voellmy in http://thread.gmane.org/gmane.linux.kernel/1408782/

Signed-off-by: Eric Wong <normalperson@yhbt.net>
Cc: Hans Verkuil <hans.verkuil@cisco.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Mauro Carvalho Chehab <mchehab@infradead.org>
Cc: David Miller <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andreas Voellmy <andreas.voellmy@yale.edu>
Cc: "Junchang(Jason) Wang" <junchang.wang@yale.edu>
Cc: netdev@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
---
If this patch is correct, I think we can just drop the
poll_does_not_wait() function entirely since poll_wait() does the
same check anyways...

 include/net/sock.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index c945fba..1923e48 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1925,8 +1925,9 @@ static inline bool wq_has_sleeper(struct socket_wq *wq)
 static inline void sock_poll_wait(struct file *filp,
 		wait_queue_head_t *wait_address, poll_table *p)
 {
-	if (!poll_does_not_wait(p) && wait_address) {
-		poll_wait(filp, wait_address, p);
+	if (wait_address) {
+		if (!poll_does_not_wait(p))
+			poll_wait(filp, wait_address, p);
 		/* We need to be sure we are in sync with the
 		 * socket flags modification.
 		 *
-- 
Eric Wong

^ permalink raw reply related	[flat|nested] 53+ messages in thread
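The two-sided wq_has_sleeper() discipline this patch is restoring can
be sketched in userspace.  Below is a minimal analogue using C11
fences and the Linux futex syscall; the names and the futex() wrapper
are invented for this sketch, and it is an illustration rather than
kernel code.  The sleeper publishes itself as a waiter, fences, then
re-checks the condition; the waker publishes the condition, fences,
then checks for a waiter.  If either fence is skipped -- which is the
shape of the bug when sock_poll_wait() skips the barrier for a NULL
_qproc -- both sides can read stale values and the wakeup is lost.

/* barrier_pairing.c - userspace analogue of the wq_has_sleeper() pairing
 * build: gcc -pthread -o barrier_pairing barrier_pairing.c
 */
#include <linux/futex.h>
#include <sys/syscall.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int cond_ready;	/* the condition: "data available" */
static atomic_int have_sleeper;	/* waitqueue_active() stand-in */
static atomic_int futex_word;	/* the sleeper blocks on this word */

static long futex(atomic_int *uaddr, int op, int val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void *sleeper(void *unused)
{
	/* 1) publish ourselves as a waiter (poll_wait() analogue) */
	atomic_store_explicit(&have_sleeper, 1, memory_order_relaxed);

	/* 2) full fence; pairs with the fence in waker() */
	atomic_thread_fence(memory_order_seq_cst);

	/* 3) re-check the condition _after_ the fence, then sleep */
	while (!atomic_load_explicit(&cond_ready, memory_order_relaxed))
		futex(&futex_word, FUTEX_WAIT, 0);

	printf("sleeper: woke up\n");
	return NULL;
}

static void *waker(void *unused)
{
	/* 1) publish the condition (skb queued, socket flags set...) */
	atomic_store_explicit(&cond_ready, 1, memory_order_relaxed);

	/* 2) full fence; the smp_mb() sock_poll_wait() must not skip,
	 *    pairing with the fence in sleeper() */
	atomic_thread_fence(memory_order_seq_cst);

	/* 3) only now check for a waiter (wq_has_sleeper() analogue) */
	if (atomic_load_explicit(&have_sleeper, memory_order_relaxed)) {
		atomic_store_explicit(&futex_word, 1, memory_order_relaxed);
		futex(&futex_word, FUTEX_WAKE, 1);
	}
	return NULL;
}

int main(void)
{
	pthread_t s, w;

	pthread_create(&s, NULL, sleeper, NULL);
	pthread_create(&w, NULL, waker, NULL);
	pthread_join(s, NULL);
	pthread_join(w, NULL);
	return 0;
}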
* Re: [PATCH] poll: prevent missed events if _qproc is NULL
  2012-12-31 13:21 ` [PATCH] poll: prevent missed events if _qproc is NULL Eric Wong
@ 2012-12-31 23:24 ` Eric Wong
  2013-01-01 16:58   ` Junchang(Jason) Wang
  2013-01-01 18:42 ` Eric Dumazet
  1 sibling, 1 reply; 53+ messages in thread

From: Eric Wong @ 2012-12-31 23:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Hans Verkuil, Jiri Olsa, Jonathan Corbet, Al Viro, Davide Libenzi,
	Hans de Goede, Mauro Carvalho Chehab, David Miller, Eric Dumazet,
	Andrew Morton, Linus Torvalds, Andreas Voellmy,
	Junchang(Jason) Wang, netdev, linux-fsdevel

Eric Wong <normalperson@yhbt.net> wrote:
> This patch seems to fix my issue with ppoll() being stuck on my
> SMP machine: http://article.gmane.org/gmane.linux.file-systems/70414

OK, it doesn't fix my issue, but it seems to make it harder to hit...

> The change to sock_poll_wait() in
> commit 626cf236608505d376e4799adb4f7eb00a8594af
> (poll: add poll_requested_events() and poll_does_not_wait() functions)
> seems to have allowed additional cases where the SMP memory barrier
> is not issued before checking for readiness.
[...]
> This may also be related to the epoll issue described by
> Andreas Voellmy in http://thread.gmane.org/gmane.linux.kernel/1408782/

However, I believe my patch will still fix Andreas' issue with epoll
due to how ep_modify() uses a NULL qproc when calling ->poll().

(I've never been able to reproduce Andreas' issue on my 4-core system,
but he's been hitting it since 3.4 (at least).)

^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: [PATCH] poll: prevent missed events if _qproc is NULL
  2012-12-31 23:24 ` Eric Wong
@ 2013-01-01 16:58 ` Junchang(Jason) Wang
  0 siblings, 0 replies; 53+ messages in thread

From: Junchang(Jason) Wang @ 2013-01-01 16:58 UTC (permalink / raw)
  To: Eric Wong
  Cc: linux-kernel, Hans Verkuil, Jiri Olsa, Jonathan Corbet, Al Viro,
	Davide Libenzi, Hans de Goede, Mauro Carvalho Chehab,
	David Miller, Eric Dumazet, Andrew Morton, Linus Torvalds,
	Andreas Voellmy, netdev, linux-fsdevel

Hi Eric and list,

Thanks a lot.  The patch solves our (Andreas' and my) issue in using
epoll.  Here's our test program:
https://github.com/AndreasVoellmy/epollbug/blob/master/epollbug.c
We are using Linux 3.7.1 and a server with 80 cores.

Cheers!
--Jason

On Mon, Dec 31, 2012 at 6:24 PM, Eric Wong <normalperson@yhbt.net> wrote:
> Eric Wong <normalperson@yhbt.net> wrote:
> > This patch seems to fix my issue with ppoll() being stuck on my
> > SMP machine: http://article.gmane.org/gmane.linux.file-systems/70414
>
> OK, it doesn't fix my issue, but it seems to make it harder to hit...
[...]
> However, I believe my patch will still fix Andreas' issue with epoll
> due to how ep_modify() uses a NULL qproc when calling ->poll().
>
> (I've never been able to reproduce Andreas' issue on my 4-core system,
> but he's been hitting it since 3.4 (at least).)

^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: [PATCH] poll: prevent missed events if _qproc is NULL
  2012-12-31 13:21 ` [PATCH] poll: prevent missed events if _qproc is NULL Eric Wong
  2012-12-31 23:24   ` Eric Wong
@ 2013-01-01 18:42 ` Eric Dumazet
  2013-01-01 21:00   ` Eric Wong
  1 sibling, 1 reply; 53+ messages in thread

From: Eric Dumazet @ 2013-01-01 18:42 UTC (permalink / raw)
  To: Eric Wong
  Cc: linux-kernel, Hans Verkuil, Jiri Olsa, Jonathan Corbet, Al Viro,
	Davide Libenzi, Hans de Goede, Mauro Carvalho Chehab,
	David Miller, Andrew Morton, Linus Torvalds, Andreas Voellmy,
	Junchang(Jason) Wang, netdev, linux-fsdevel

On Mon, 2012-12-31 at 13:21 +0000, Eric Wong wrote:
> This patch seems to fix my issue with ppoll() being stuck on my
> SMP machine: http://article.gmane.org/gmane.linux.file-systems/70414
[...]
> This may also be related to the epoll issue described by
> Andreas Voellmy in http://thread.gmane.org/gmane.linux.kernel/1408782/

Hmm, the change seems not very logical to me.  If it helps, I would
like to understand the real issue.

commit 626cf236608505d376e4799adb4f7eb00a8594af should not have this
side effect, at least for the poll()/select() functions.  I am not yet
very confident about the epoll() changes.

I suspect a race already existed before this commit; it would be nice
to track it properly.

^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: [PATCH] poll: prevent missed events if _qproc is NULL
  2013-01-01 18:42 ` Eric Dumazet
@ 2013-01-01 21:00 ` Eric Wong
  2013-01-01 21:17   ` Eric Wong
  0 siblings, 1 reply; 53+ messages in thread

From: Eric Wong @ 2013-01-01 21:00 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: linux-kernel, Hans Verkuil, Jiri Olsa, Jonathan Corbet, Al Viro,
	Davide Libenzi, Hans de Goede, Mauro Carvalho Chehab,
	David Miller, Andrew Morton, Linus Torvalds, Andreas Voellmy,
	Junchang(Jason) Wang, netdev, linux-fsdevel

Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Mon, 2012-12-31 at 13:21 +0000, Eric Wong wrote:
> > This patch seems to fix my issue with ppoll() being stuck on my
> > SMP machine: http://article.gmane.org/gmane.linux.file-systems/70414
>
> Hmm, the change seems not very logical to me.

My original description was not complete and I'm still bisecting my
problem (ppoll + send stuck).  However, my patch does solve the issue
Andreas encountered, and I now understand why.

> If it helps, I would like to understand the real issue.
>
> commit 626cf236608505d376e4799adb4f7eb00a8594af should not have this
> side effect, at least for the poll()/select() functions.  I am not
> yet very confident about the epoll() changes.

I have a better explanation of the epoll problem below.

An alternate version (limited to epoll) would be:

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cd96649..ca5f3d0 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1299,6 +1299,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	 * Get current event bits. We can safely use the file* here because
 	 * its usage count has been increased by the caller of this function.
 	 */
+	smp_mb();
 	revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);
 
 	/*

> I suspect a race already existed before this commit; it would be nice
> to track it properly.

I don't believe this race existed before that change.
Updated commit message below:

From 87bca82bc39a941d9b8d5b8bc08b39a071a9884f Mon Sep 17 00:00:00 2001
From: Eric Wong <normalperson@yhbt.net>
Date: Mon, 31 Dec 2012 13:20:23 +0000
Subject: [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD

ep_modify() works on files that are already registered with a wait
queue (and thus should not re-register).  For sockets, this means
sk_sleep() will return a non-NULL wait address.

ep_modify() must check for events that were received and ignored
_before_ ep_modify() was called.  So it must call f_op->poll() to fish
for events _after_ changing epi->event.events.

When f_op->poll() calls tcp_poll() (and thus sock_poll_wait()),
wait_address is non-NULL because the socket was already registered by
epoll.  Thus, ep_modify() passes a NULL pt to prevent re-registration.
When ep_modify() is called, sock_poll_wait() will see a wait_address
but a NULL pt, and this caused the memory barrier to be skipped and
events to be missed (this memory barrier is described in the
documentation for wq_has_sleeper).

This regression appeared with the change to sock_poll_wait() in
commit 626cf236608505d376e4799adb4f7eb00a8594af
(poll: add poll_requested_events() and poll_does_not_wait() functions)

This issue was encountered by Andreas Voellmy and Junchang(Jason) Wang:
http://thread.gmane.org/gmane.linux.kernel/1408782/

Signed-off-by: Eric Wong <normalperson@yhbt.net>
Cc: Hans Verkuil <hans.verkuil@cisco.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Mauro Carvalho Chehab <mchehab@infradead.org>
Cc: David Miller <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Tested-by: Andreas Voellmy <andreas.voellmy@yale.edu>
Tested-by: "Junchang(Jason) Wang" <junchang.wang@yale.edu>
Cc: netdev@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
---
 include/net/sock.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index c945fba..1923e48 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1925,8 +1925,9 @@ static inline bool wq_has_sleeper(struct socket_wq *wq)
 static inline void sock_poll_wait(struct file *filp,
 		wait_queue_head_t *wait_address, poll_table *p)
 {
-	if (!poll_does_not_wait(p) && wait_address) {
-		poll_wait(filp, wait_address, p);
+	if (wait_address) {
+		if (!poll_does_not_wait(p))
+			poll_wait(filp, wait_address, p);
 		/* We need to be sure we are in sync with the
 		 * socket flags modification.
 		 *
-- 
Eric Wong

^ permalink raw reply related	[flat|nested] 53+ messages in thread
* Re: [PATCH] poll: prevent missed events if _qproc is NULL
  2013-01-01 21:00 ` Eric Wong
@ 2013-01-01 21:17 ` Eric Wong
  2013-01-01 22:53   ` Linus Torvalds
  0 siblings, 1 reply; 53+ messages in thread

From: Eric Wong @ 2013-01-01 21:17 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: linux-kernel, Hans Verkuil, Jiri Olsa, Jonathan Corbet, Al Viro,
	Davide Libenzi, Hans de Goede, Mauro Carvalho Chehab,
	David Miller, Andrew Morton, Linus Torvalds, Andreas Voellmy,
	Junchang(Jason) Wang, netdev, linux-fsdevel

Eric Wong <normalperson@yhbt.net> wrote:
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > commit 626cf236608505d376e4799adb4f7eb00a8594af should not have this
> > side effect, at least for the poll()/select() functions.  I am not
> > yet very confident about the epoll() changes.
>
> I have a better explanation of the epoll problem below.
>
> An alternate version (limited to epoll) would be:
>
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index cd96649..ca5f3d0 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -1299,6 +1299,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
>  	 * Get current event bits. We can safely use the file* here because
>  	 * its usage count has been increased by the caller of this function.
>  	 */
> +	smp_mb();
>  	revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);
>
>  	/*
>
> > I suspect a race already existed before this commit; it would be
> > nice to track it properly.
>
> I don't believe this race existed before that change.

I was wrong; rereading 626cf236608505d376e4799adb4f7eb00a8594af,
I think this race existed before.

Perhaps my alternate patch above is a better fix.

^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: [PATCH] poll: prevent missed events if _qproc is NULL
  2013-01-01 21:17 ` Eric Wong
@ 2013-01-01 22:53 ` Linus Torvalds
  2013-01-01 23:21   ` Junchang(Jason) Wang
  2013-01-01 23:56   ` [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD Eric Wong
  0 siblings, 2 replies; 53+ messages in thread

From: Linus Torvalds @ 2013-01-01 22:53 UTC (permalink / raw)
  To: Eric Wong
  Cc: Eric Dumazet, Linux Kernel Mailing List, Hans Verkuil, Jiri Olsa,
	Jonathan Corbet, Al Viro, Davide Libenzi, Hans de Goede,
	Mauro Carvalho Chehab, David Miller, Andrew Morton,
	Andreas Voellmy, Junchang(Jason) Wang, Network Development,
	linux-fsdevel

On Tue, Jan 1, 2013 at 1:17 PM, Eric Wong <normalperson@yhbt.net> wrote:
>>
>> An alternate version (limited to epoll) would be:
>>
>> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
>> index cd96649..ca5f3d0 100644
>> --- a/fs/eventpoll.c
>> +++ b/fs/eventpoll.c
>> @@ -1299,6 +1299,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
>>  	 * Get current event bits. We can safely use the file* here because
>>  	 * its usage count has been increased by the caller of this function.
>>  	 */
>> +	smp_mb();
>>  	revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);
>>
>>  	/*
>
> I was wrong; rereading 626cf236608505d376e4799adb4f7eb00a8594af,
> I think this race existed before.
>
> Perhaps my alternate patch above is a better fix.

Please document the barrier that this mb() pairs with, and then give
an explanation for the fix in the commit message, and I'll happily
take it.  Even if it's just duplicating the comments above the
wq_has_sleeper() function, except modified for the ep_modify() case.

Of course, it would be good to get verification from Jason and Andreas
that the alternate patch also works for them.

              Linus

^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: [PATCH] poll: prevent missed events if _qproc is NULL
  2013-01-01 22:53 ` Linus Torvalds
@ 2013-01-01 23:21 ` Junchang(Jason) Wang
  2013-01-01 23:56 ` [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD Eric Wong
  1 sibling, 0 replies; 53+ messages in thread

From: Junchang(Jason) Wang @ 2013-01-01 23:21 UTC (permalink / raw)
  To: Linus Torvalds, Eric Wong
  Cc: Eric Dumazet, Linux Kernel Mailing List, Hans Verkuil, Jiri Olsa,
	Jonathan Corbet, Al Viro, Davide Libenzi, Hans de Goede,
	Mauro Carvalho Chehab, David Miller, Andrew Morton,
	Andreas Voellmy, Network Development, linux-fsdevel

Hi all,

The alternate patch from Eric works well too.  Even though I didn't
see a performance boost compared with the old version, this one is
clearer to me.  Thanks, guys.

Cheers!
--Jason

On Tue, Jan 1, 2013 at 5:53 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Tue, Jan 1, 2013 at 1:17 PM, Eric Wong <normalperson@yhbt.net> wrote:
[...]
> Please document the barrier that this mb() pairs with, and then give
> an explanation for the fix in the commit message, and I'll happily
> take it.  Even if it's just duplicating the comments above the
> wq_has_sleeper() function, except modified for the ep_modify() case.
>
> Of course, it would be good to get verification from Jason and Andreas
> that the alternate patch also works for them.
>
>               Linus

^ permalink raw reply	[flat|nested] 53+ messages in thread
* [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD
  2013-01-01 22:53 ` Linus Torvalds
  2013-01-01 23:21   ` Junchang(Jason) Wang
@ 2013-01-01 23:56 ` Eric Wong
  2013-01-02 17:45   ` Eric Dumazet
  2013-01-02 21:16   ` Eric Wong
  1 sibling, 2 replies; 53+ messages in thread

From: Eric Wong @ 2013-01-01 23:56 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eric Dumazet, Linux Kernel Mailing List, Hans Verkuil, Jiri Olsa,
	Jonathan Corbet, Al Viro, Davide Libenzi, Hans de Goede,
	Mauro Carvalho Chehab, David Miller, Andrew Morton,
	Andreas Voellmy, Junchang(Jason) Wang, Network Development,
	linux-fsdevel

Linus Torvalds <torvalds@linux-foundation.org> wrote:
> Please document the barrier that this mb() pairs with, and then give
> an explanation for the fix in the commit message, and I'll happily
> take it.  Even if it's just duplicating the comments above the
> wq_has_sleeper() function, except modified for the ep_modify() case.

Hopefully my explanation is correct and makes sense below;
I think both effects of the barrier are needed.

> Of course, it would be good to get verification from Jason and Andreas
> that the alternate patch also works for them.

Jason just confirmed it.

------------------------------- 8< ----------------------------
From 02f43757d04bb6f2786e79eecf1cfa82e6574379 Mon Sep 17 00:00:00 2001
From: Eric Wong <normalperson@yhbt.net>
Date: Tue, 1 Jan 2013 21:20:27 +0000
Subject: [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD

EPOLL_CTL_MOD sets the interest mask before calling f_op->poll() to
ensure events are not missed.  Since the modifications to the interest
mask are not protected by the same lock as ep_poll_callback, we need to
ensure the change is visible to other CPUs calling ep_poll_callback.

We also need to ensure f_op->poll() has an up-to-date view of past
events which occurred before we modified the interest mask.  So this
barrier also pairs with the barrier in wq_has_sleeper().

This should guarantee either ep_poll_callback or f_op->poll() (or both)
will notice the readiness of a recently-ready/modified item.

This issue was encountered by Andreas Voellmy and Junchang(Jason) Wang in:
http://thread.gmane.org/gmane.linux.kernel/1408782/

Signed-off-by: Eric Wong <normalperson@yhbt.net>
Cc: Hans Verkuil <hans.verkuil@cisco.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Mauro Carvalho Chehab <mchehab@infradead.org>
Cc: David Miller <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andreas Voellmy <andreas.voellmy@yale.edu>
Tested-by: "Junchang(Jason) Wang" <junchang.wang@yale.edu>
Cc: netdev@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
---
 fs/eventpoll.c | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cd96649..39573ee 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1285,7 +1285,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	 * otherwise we might miss an event that happens between the
 	 * f_op->poll() call and the new event set registering.
 	 */
-	epi->event.events = event->events;
+	epi->event.events = event->events; /* need barrier below */
 	pt._key = event->events;
 	epi->event.data = event->data; /* protected by mtx */
 	if (epi->event.events & EPOLLWAKEUP) {
@@ -1296,6 +1296,26 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	}
 
 	/*
+	 * The following barrier has two effects:
+	 *
+	 * 1) Flush epi changes above to other CPUs.  This ensures
+	 *    we do not miss events from ep_poll_callback if an
+	 *    event occurs immediately after we call f_op->poll().
+	 *    We need this because we did not take ep->lock while
+	 *    changing epi above (but ep_poll_callback does take
+	 *    ep->lock).
+	 *
+	 * 2) We also need to ensure we do not miss _past_ events
+	 *    when calling f_op->poll().  This barrier also
+	 *    pairs with the barrier in wq_has_sleeper (see
+	 *    comments for wq_has_sleeper).
+	 *
+	 * This barrier will now guarantee ep_poll_callback or f_op->poll
+	 * (or both) will notice the readiness of an item.
+	 */
+	smp_mb();
+
+	/*
 	 * Get current event bits. We can safely use the file* here because
 	 * its usage count has been increased by the caller of this function.
 	 */
-- 
Eric Wong

^ permalink raw reply related	[flat|nested] 53+ messages in thread
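For context, the ep_modify() path this patch touches is the one a
userspace oneshot re-arm loop exercises.  Below is a minimal sketch of
that pattern, my own illustration on a nonblocking pipe rather than
Andreas' actual epollbug.c; the helper name is invented.  Readiness
that arrives between the consumer's final read() and the EPOLL_CTL_MOD
is exactly the "past event" the barrier above ensures f_op->poll()
sees.

/* rearm.c - the EPOLLONESHOT + EPOLL_CTL_MOD pattern ep_modify() serves
 * build: gcc -o rearm rearm.c
 */
#define _GNU_SOURCE		/* for pipe2() */
#include <sys/epoll.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

/* drain fd until EAGAIN, then re-arm the disarmed oneshot entry */
static void handle_and_rearm(int epfd, int fd)
{
	char buf[4096];
	ssize_t r;

	do {
		r = read(fd, buf, sizeof(buf));
	} while (r > 0);
	assert(r == -1 && (errno == EAGAIN || errno == EWOULDBLOCK));

	/* data arriving right here -- after the final read() but
	 * before the EPOLL_CTL_MOD below -- must not be missed when
	 * ep_modify() re-checks readiness via f_op->poll() */
	struct epoll_event ev = { .events = EPOLLIN | EPOLLONESHOT };
	ev.data.fd = fd;
	assert(epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev) == 0);
}

int main(void)
{
	int p[2];
	struct epoll_event ev = { .events = EPOLLIN | EPOLLONESHOT };

	assert(pipe2(p, O_NONBLOCK) == 0);
	ev.data.fd = p[0];

	int epfd = epoll_create1(0);
	assert(epfd >= 0);
	assert(epoll_ctl(epfd, EPOLL_CTL_ADD, p[0], &ev) == 0);

	assert(write(p[1], "x", 1) == 1);
	assert(epoll_wait(epfd, &ev, 1, -1) == 1);
	handle_and_rearm(epfd, ev.data.fd);

	/* an event after the re-arm must still be reported */
	assert(write(p[1], "x", 1) == 1);
	assert(epoll_wait(epfd, &ev, 1, -1) == 1);
	return 0;
}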
* Re: [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD
  2013-01-01 23:56 ` [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD Eric Wong
@ 2013-01-02 17:45 ` Eric Dumazet
  2013-01-02 18:40   ` Eric Wong
  1 sibling, 1 reply; 53+ messages in thread

From: Eric Dumazet @ 2013-01-02 17:45 UTC (permalink / raw)
  To: Eric Wong
  Cc: Linus Torvalds, Linux Kernel Mailing List, Hans Verkuil,
	Jiri Olsa, Jonathan Corbet, Al Viro, Davide Libenzi,
	Hans de Goede, Mauro Carvalho Chehab, David Miller,
	Andrew Morton, Andreas Voellmy, Junchang(Jason) Wang,
	Network Development, linux-fsdevel

On Tue, 2013-01-01 at 23:56 +0000, Eric Wong wrote:
> EPOLL_CTL_MOD sets the interest mask before calling f_op->poll() to
> ensure events are not missed.  Since the modifications to the interest
> mask are not protected by the same lock as ep_poll_callback, we need to
> ensure the change is visible to other CPUs calling ep_poll_callback.
>
> We also need to ensure f_op->poll() has an up-to-date view of past
> events which occurred before we modified the interest mask.  So this
> barrier also pairs with the barrier in wq_has_sleeper().
[... rest of patch quoted above ...]

First, thanks for working on this issue.

It seems the real problem is the "epi->event.events = event->events;"
which is done without taking ep->lock.

While an smp_mb() could reduce the race window, I believe there is
still a race, and the following patch would close it:

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index be56b21..25e5c53 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1313,7 +1313,10 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	 * otherwise we might miss an event that happens between the
 	 * f_op->poll() call and the new event set registering.
 	 */
+	spin_lock_irq(&ep->lock);
 	epi->event.events = event->events;
+	spin_unlock_irq(&ep->lock);
+
 	pt._key = event->events;
 	epi->event.data = event->data; /* protected by mtx */
 	if (epi->event.events & EPOLLWAKEUP) {

^ permalink raw reply related	[flat|nested] 53+ messages in thread
* Re: [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD
  2013-01-02 17:45 ` Eric Dumazet
@ 2013-01-02 18:40 ` Eric Wong
  2013-01-02 19:03   ` Eric Dumazet
  0 siblings, 1 reply; 53+ messages in thread

From: Eric Wong @ 2013-01-02 18:40 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Linus Torvalds, Linux Kernel Mailing List, Hans Verkuil,
	Jiri Olsa, Jonathan Corbet, Al Viro, Davide Libenzi,
	Hans de Goede, Mauro Carvalho Chehab, David Miller,
	Andrew Morton, Andreas Voellmy, Junchang(Jason) Wang,
	Network Development, linux-fsdevel

Eric Dumazet <eric.dumazet@gmail.com> wrote:
> First, thanks for working on this issue.

No problem!

> It seems the real problem is the "epi->event.events = event->events;"
> which is done without taking ep->lock.

Yes.  I am hoping it is possible to do it without a lock there,
but your change is more obviously correct.

> While an smp_mb() could reduce the race window, I believe there is
> still a race, and the following patch would close it:

I'm not an experienced kernel hacker; can you describe where the race
would be?

> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index be56b21..25e5c53 100644
[... patch quoted above ...]

^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD
  2013-01-02 18:40 ` Eric Wong
@ 2013-01-02 19:03 ` Eric Dumazet
  2013-01-02 19:32   ` Eric Wong
  0 siblings, 1 reply; 53+ messages in thread

From: Eric Dumazet @ 2013-01-02 19:03 UTC (permalink / raw)
  To: Eric Wong
  Cc: Linus Torvalds, Linux Kernel Mailing List, Hans Verkuil,
	Jiri Olsa, Jonathan Corbet, Al Viro, Davide Libenzi,
	Hans de Goede, Mauro Carvalho Chehab, David Miller,
	Andrew Morton, Andreas Voellmy, Junchang(Jason) Wang,
	Network Development, linux-fsdevel

On Wed, 2013-01-02 at 18:40 +0000, Eric Wong wrote:
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > While an smp_mb() could reduce the race window, I believe there is
> > still a race, and the following patch would close it:
>
> I'm not an experienced kernel hacker; can you describe where the race
> would be?

It would be, for example, in ep_send_events_proc() doing:

	if (epi->event.events & EPOLLONESHOT)
		epi->event.events &= EP_PRIVATE_BITS;

And this could happen at the same time.

^ permalink raw reply	[flat|nested] 53+ messages in thread
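The shape of the race described here is a plain store colliding with a
non-atomic read-modify-write of the same word.  A userspace sketch of
just that shape follows; the bit values and the PRIVATE_BITS stand-in
mask are made up for illustration, and this is an analogy rather than
the eventpoll code itself.

/* rmw_race.c - plain store (ep_modify() analogue) racing with a
 * non-atomic RMW (the EPOLLONESHOT clearing in ep_send_events_proc()
 * analogue).  A single run may or may not interleave badly; the point
 * is the possible interleaving.
 * build: gcc -pthread -o rmw_race rmw_race.c
 */
#include <pthread.h>
#include <stdio.h>

#define IN_BIT		0x001		/* hypothetical EPOLLIN stand-in */
#define ONESHOT_BIT	0x40000000	/* hypothetical EPOLLONESHOT stand-in */
#define PRIVATE_BITS	0x70000000	/* hypothetical EP_PRIVATE_BITS stand-in */

static volatile unsigned events = ONESHOT_BIT;	/* epi->event.events stand-in */

static void *modifier(void *unused)	/* ep_modify() side */
{
	events = IN_BIT | ONESHOT_BIT;	/* plain store of the new mask */
	return NULL;
}

static void *sender(void *unused)	/* ep_send_events_proc() side */
{
	unsigned tmp = events;			/* read ... */
	if (tmp & ONESHOT_BIT)
		events = tmp & PRIVATE_BITS;	/* ... modify ... write */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, modifier, NULL);
	pthread_create(&b, NULL, sender, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* if sender read the old mask and wrote back after modifier's
	 * store, the freshly stored IN_BIT is silently overwritten */
	printf("events = %#x%s\n", events,
	       (events & IN_BIT) ? "" : " (IN_BIT lost)");
	return 0;
}

If the RMW reads before the plain store lands and writes back after
it, the new mask is lost; serializing both writers on ep->lock, as in
the patch above, rules that interleaving out.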
* Re: [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD
  2013-01-02 19:03 ` Eric Dumazet
@ 2013-01-02 19:32 ` Eric Wong
  2013-01-02 22:08   ` Eric Dumazet
  0 siblings, 1 reply; 53+ messages in thread

From: Eric Wong @ 2013-01-02 19:32 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Linus Torvalds, Linux Kernel Mailing List, Hans Verkuil,
	Jiri Olsa, Jonathan Corbet, Al Viro, Davide Libenzi,
	Hans de Goede, Mauro Carvalho Chehab, David Miller,
	Andrew Morton, Andreas Voellmy, Junchang(Jason) Wang,
	Network Development, linux-fsdevel

Eric Dumazet <eric.dumazet@gmail.com> wrote:
> It would be, for example, in ep_send_events_proc() doing:
>
> 	if (epi->event.events & EPOLLONESHOT)
> 		epi->event.events &= EP_PRIVATE_BITS;
>
> And this could happen at the same time.

That modification in ep_send_events_proc() is protected by ep->mtx
(as is ep_modify()), though.  Maybe there are other places, but I
don't see it.

^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD
  2013-01-02 19:32 ` Eric Wong
@ 2013-01-02 22:08 ` Eric Dumazet
  0 siblings, 0 replies; 53+ messages in thread

From: Eric Dumazet @ 2013-01-02 22:08 UTC (permalink / raw)
  To: Eric Wong
  Cc: Linus Torvalds, Linux Kernel Mailing List, Hans Verkuil,
	Jiri Olsa, Jonathan Corbet, Al Viro, Davide Libenzi,
	Hans de Goede, Mauro Carvalho Chehab, David Miller,
	Andrew Morton, Andreas Voellmy, Junchang(Jason) Wang,
	Network Development, linux-fsdevel

On Wed, 2013-01-02 at 19:32 +0000, Eric Wong wrote:
> That modification in ep_send_events_proc() is protected by ep->mtx
> (as is ep_modify()), though.  Maybe there are other places, but I
> don't see it.

Yes, and using a mutex to protect this field while it's read from
interrupt context (so without the mutex's synchronization to help) is
why there were races.

Some users rely on the barriers included in spin_lock/spin_unlock,
others on explicit barriers, or, before your patch, on pure luck.

^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD
  2013-01-01 23:56 ` [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD Eric Wong
  2013-01-02 17:45   ` Eric Dumazet
@ 2013-01-02 21:16 ` Eric Wong
  1 sibling, 0 replies; 53+ messages in thread

From: Eric Wong @ 2013-01-02 21:16 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eric Dumazet, Linux Kernel Mailing List, Hans Verkuil, Jiri Olsa,
	Jonathan Corbet, Al Viro, Davide Libenzi, Hans de Goede,
	Mauro Carvalho Chehab, David Miller, Andrew Morton,
	Andreas Voellmy, Junchang(Jason) Wang, Network Development,
	linux-fsdevel

Eric Wong <normalperson@yhbt.net> wrote:
> Linus Torvalds <torvalds@linux-foundation.org> wrote:
> > Please document the barrier that this mb() pairs with, and then give
> > an explanation for the fix in the commit message, and I'll happily
> > take it.
>
> Hopefully my explanation is correct and makes sense below;
> I think both effects of the barrier are needed.

I noticed Linus accepted this already.  This should probably go to
stable, right?

From ancient git history[1], it seems this bug exists in all 2.6
kernels:

	commit 424980a87e226d63af46579b2af16ec1b8d17e52
	Author: Davide Libenzi <davidel@xmailserver.org>
	Date:   Thu Nov 14 16:17:23 2002 -0800

	    [PATCH] epoll bits 0.46
	    ...

[1] - git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git

^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2012-12-28  1:45 ppoll() stuck on POLLIN while TCP peer is sending Eric Wong
  2012-12-28  7:06 ` Eric Wong
  2012-12-31 13:21 ` [PATCH] poll: prevent missed events if _qproc is NULL Eric Wong
@ 2013-01-02 20:08 ` Eric Wong
  2013-01-02 20:47   ` Eric Wong
  2013-01-04 16:01   ` Mel Gorman
  2 siblings, 2 replies; 53+ messages in thread

From: Eric Wong @ 2013-01-02 20:08 UTC (permalink / raw)
  To: Mel Gorman, linux-mm
  Cc: netdev, linux-kernel, Rik van Riel, Minchan Kim, Andrew Morton,
	Linus Torvalds

(changing Cc:)

Eric Wong <normalperson@yhbt.net> wrote:
> I'm finding ppoll() unexpectedly stuck when waiting for POLLIN on a
> local TCP socket.  The isolated code below can reproduce the issue
> after many minutes (<1 hour).  It might be easier to reproduce on
> a busy system while disk I/O is happening.

s/might be/is/

Strangely, I've bisected this seemingly networking-related issue down
to the following commit:

  commit 1fb3f8ca0e9222535a39b884cb67a34628411b9f
  Author: Mel Gorman <mgorman@suse.de>
  Date:   Mon Oct 8 16:29:12 2012 -0700

      mm: compaction: capture a suitable high-order page immediately
      when it is made available

That commit doesn't revert cleanly on v3.7.1, and I don't feel
comfortable touching that code myself.  Instead, I disabled
THP+compaction under v3.7.1, and I've been unable to reproduce the
issue without THP+compaction.

As I mention in
http://mid.gmane.org/20121229113434.GA13336@dcvr.yhbt.net
I run my test below (`toosleepy') with heavy network and disk activity
for a long time before hitting this.

My disk activity involves copying large files around to different
local drives over loopback[1], so perhaps the duplicate pages get
compacted away?  toosleepy also reuses the same 16K of junk data all
around.

[1] my full setup is very strange.

Other than the FUSE component I forgot to mention, little depends on
the kernel.  With all this, the standalone toosleepy can get stuck.
I'll try to reproduce it with less...

(possibly relevant info; I don't expect you to duplicate my setup as
it requires many, many patched userspace components :x):

  fusedav (with many bugfixes[2]) -> (FUSE device)
  zbatery (Ruby 1.9.3-p362) -> omgdav (in zbatery process) ->
  (TCP) MogileFS (patched[3]) -> (TCP) cmogstored

The (zbatery -> omgdav -> MogileFS -> cmogstored) path is all
userspace.  cmogstored uses sendfile and may talk to itself via
MogileFS replication:

  MogileFS(replicate) -> HTTP GET from cmogstored
                      -> HTTP PUT to cmogstored

(MFS was designed for clusters, but I only have one machine right
now.)  MogileFS replicate does not use splice between sockets, just
read/write; cmogstored does not use splice (yet) either.

The stuck ppoll() I noticed was from Ruby (zbatery/omgdav) while the
send() was from fusedav (using neon).

[2] my patches on http://bugs.debian.org/fusedav and
    git clone git://bogomips.org/fusedav.git home

[3] git clone git://bogomips.org/MogileFS-Server.git testing

> This may also be related to an epoll-related issue reported
> by Andreas Voellmy:
> http://thread.gmane.org/gmane.linux.kernel/1408782/

(That epoll issue was unrelated and fixed while I was hunting this
bug.)

> My example involves a 3-thread data flow between two pairs
> of (4) sockets:
>
> 	send_loop -> recv_loop(recv_send) -> recv_loop(recv_only)
> 	pair_a[1] -> (pair_a[0] -> pair_b[1]) -> pair_b[0]
>
> At least 3.7 and 3.7.1 are affected.
[... full original report and toosleepy.c quoted above ...]

^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-02 20:08 ` ppoll() stuck on POLLIN while TCP peer is sending Eric Wong
@ 2013-01-02 20:47 ` Eric Wong
  2013-01-03 13:41   ` Eric Dumazet
  0 siblings, 1 reply; 53+ messages in thread

From: Eric Wong @ 2013-01-02 20:47 UTC (permalink / raw)
  To: Mel Gorman, linux-mm
  Cc: netdev, linux-kernel, Rik van Riel, Minchan Kim, Andrew Morton,
	Linus Torvalds

Eric Wong <normalperson@yhbt.net> wrote:
> [1] my full setup is very strange.
>
> Other than the FUSE component I forgot to mention, little depends on
> the kernel.  With all this, the standalone toosleepy can get stuck.
> I'll try to reproduce it with less...

I just confirmed my toosleepy processes will get stuck while just
doing "rsync -a" between local disks.  So this does not depend on
sendfile or FUSE to reproduce.

^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-02 20:47 ` Eric Wong
@ 2013-01-03 13:41 ` Eric Dumazet
  2013-01-03 18:32   ` Eric Wong
  0 siblings, 1 reply; 53+ messages in thread

From: Eric Dumazet @ 2013-01-03 13:41 UTC (permalink / raw)
  To: Eric Wong
  Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel,
	Minchan Kim, Andrew Morton, Linus Torvalds

On Wed, 2013-01-02 at 20:47 +0000, Eric Wong wrote:
> Eric Wong <normalperson@yhbt.net> wrote:
> > [1] my full setup is very strange.
> >
> > Other than the FUSE component I forgot to mention, little depends on
> > the kernel.  With all this, the standalone toosleepy can get stuck.
> > I'll try to reproduce it with less...
>
> I just confirmed my toosleepy processes will get stuck while just
> doing "rsync -a" between local disks.  So this does not depend on
> sendfile or FUSE to reproduce.

How do you tell your 'toosleepy' is stuck?

If you are reading its output, you should change its logic; there is
no guarantee that recv() will deliver exactly 16384 bytes each round.

With the following patch, I can't reproduce the 'apparently stuck'
state:

diff --git a/toosleepy.c b/toosleepy.c
index e64b7cd..df3610f 100644
--- a/toosleepy.c
+++ b/toosleepy.c
@@ -15,6 +15,7 @@
 #include <fcntl.h>
 #include <assert.h>
 #include <limits.h>
+#include <time.h>
 
 struct receiver {
 	int rfd;
@@ -53,6 +54,7 @@ static void * recv_loop(void *p)
 	ssize_t r, s;
 	size_t received = 0;
 	size_t sent = 0;
+	time_t t0 = time(NULL), t1;
 
 	for (;;) {
 		r = recv(rcvr->rfd, buf, sizeof(buf), 0);
@@ -80,9 +82,12 @@
 				write(-1, buf, sizeof(buf));
 			}
 		}
-		if ((received % (sizeof(buf) * sizeof(buf) * 16) == 0))
+		t1 = time(NULL);
+		if (t1 != t0) {
 			dprintf(2, " %d progress: %zu\n",
 				rcvr->rfd, received);
+			t0 = t1;
+		}
 	}
 	dprintf(2, "%d got: %zu\n", rcvr->rfd, received);
 	if (rcvr->sfd >= 0) {

^ permalink raw reply related	[flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-03 13:41     ` Eric Dumazet
@ 2013-01-03 18:32       ` Eric Wong
  2013-01-03 23:45         ` Eric Wong
  0 siblings, 1 reply; 53+ messages in thread
From: Eric Wong @ 2013-01-03 18:32 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel,
	Minchan Kim, Andrew Morton, Linus Torvalds

Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2013-01-02 at 20:47 +0000, Eric Wong wrote:
> > Eric Wong <normalperson@yhbt.net> wrote:
> > > [1] my full setup is very strange.
> > >
> > > Other than the FUSE component I forgot to mention, little depends on
> > > the kernel.  With all this, the standalone toosleepy can get stuck.
> > > I'll try to reproduce it with less...
> >
> > I just confirmed my toosleepy processes will get stuck while just
> > doing "rsync -a" between local disks.  So this does not depend on
> > sendfile or FUSE to reproduce.
> > --
>
> How do you tell your 'toosleepy' is stuck?

My original post showed it stuck with strace (in ppoll + send).
I only strace after seeing it's not using any CPU in top.

http://mid.gmane.org/20121228014503.GA5017@dcvr.yhbt.net

(lsof also confirmed the ppoll/send sockets were peers)

> If you are reading its output, you should change its logic; there is no
> guarantee the recv() will deliver exactly 16384 bytes each round.
>
> With the following patch, I can't reproduce the 'apparent stuck' state:

Right, the output is just an approximation and the logic there
was bogus.

Thanks for looking at this.

^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-03 18:32       ` Eric Wong
@ 2013-01-03 23:45         ` Eric Wong
  2013-01-04  0:26           ` Eric Wong
  0 siblings, 1 reply; 53+ messages in thread
From: Eric Wong @ 2013-01-03 23:45 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel,
	Minchan Kim, Andrew Morton, Linus Torvalds

Eric Wong <normalperson@yhbt.net> wrote:
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > With the following patch, I can't reproduce the 'apparent stuck' state:
>
> Right, the output is just an approximation and the logic there
> was bogus.
>
> Thanks for looking at this.

I'm still able to reproduce the issue under v3.8-rc2 with your patch
for toosleepy.

As expected when blocked, TCP send() will eventually return ETIMEDOUT
if I forget to check for it (and toosleepy will abort when it does).

I think reproducing this requires frequent dirtying/cycling of pages
(from copying large files around) to interact with compaction.
I'll see if I can reproduce the issue with read-only FS activity.

With 3.7.1 and compaction/THP disabled, I was able to run ~21 hours
and copy a few TB around without anything getting stuck.

^ permalink raw reply	[flat|nested] 53+ messages in thread
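For reference, the kind of check Eric mentions forgetting would look
something like this on the sending side.  This is a compile-only
sketch assuming a blocking TCP socket, not the actual toosleepy code
(which asserts errno == EINTR and aborts on anything else):

#include <errno.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>

/* returns bytes sent, or -1 once a blocked send() fails hard */
static ssize_t send_all(int fd, const char *buf, size_t len)
{
	size_t off = 0;

	while (off < len) {
		ssize_t s = send(fd, buf + off, len - off, 0);

		if (s >= 0) {
			off += s;
			continue;
		}
		if (errno == EINTR)
			continue;	/* interrupted, just retry */
		if (errno == ETIMEDOUT)
			/* peer stopped reading for long enough that
			 * TCP gave up retransmitting */
			fprintf(stderr, "send(%d) timed out at %zu bytes\n",
				fd, off);
		return -1;
	}
	return (ssize_t)off;
}

With something like this in place, a wedged receiver shows up as a
clean diagnostic on the sender instead of an eventual assert failure.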
* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-03 23:45         ` Eric Wong
@ 2013-01-04  0:26           ` Eric Wong
  2013-01-04  3:52             ` Eric Wong
  0 siblings, 1 reply; 53+ messages in thread
From: Eric Wong @ 2013-01-04 0:26 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel,
	Minchan Kim, Andrew Morton, Linus Torvalds

Eric Wong <normalperson@yhbt.net> wrote:
> I think reproducing this requires frequent dirtying/cycling of pages
> (from copying large files around) to interact with compaction.
> I'll see if I can reproduce the issue with read-only FS activity.

Still successfully running the read-only test on my main machine, will
provide another update in a few hours or so if it's still successful
(it usually takes <1 hour to hit).

I also fired up a VM on my laptop (still running v3.7) and was able to
get toosleepy stuck with only 2 cores and 512M on the VM (x86_64).

On the small VM with little disk space, it doesn't need much dirty data
to trigger.  I just did this:

  find $45G_NFS_MOUNT -type f -print0 | \
    xargs -0 -n1 -P4 sh -c 'cat "$1" >> tmp; > tmp' --

...while running two instances of toosleepy (one got stuck and aborted).

^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending 2013-01-04 0:26 ` Eric Wong @ 2013-01-04 3:52 ` Eric Wong 0 siblings, 0 replies; 53+ messages in thread From: Eric Wong @ 2013-01-04 3:52 UTC (permalink / raw) To: Eric Dumazet Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim, Andrew Morton, Linus Torvalds Eric Wong <normalperson@yhbt.net> wrote: > Eric Wong <normalperson@yhbt.net> wrote: > > I think this requires frequent dirtying/cycling of pages to reproduce. > > (from copying large files around) to interact with compaction. > > I'll see if I can reproduce the issue with read-only FS activity. > > Still successfully running the read-only test on my main machine, will > provide another update in a few hours or so if it's still successful > (it usually takes <1 hour to hit). The read-only test is still going on my main machine. I think writes/dirty data is required to reproduce the issue... ^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-02 20:08 ` ppoll() stuck on POLLIN while TCP peer is sending Eric Wong
  2013-01-02 20:47   ` Eric Wong
@ 2013-01-04 16:01   ` Mel Gorman
  2013-01-04 17:15     ` Eric Dumazet
                       ` (3 more replies)
  1 sibling, 4 replies; 53+ messages in thread
From: Mel Gorman @ 2013-01-04 16:01 UTC (permalink / raw)
  To: Eric Wong
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
	Andrew Morton, Linus Torvalds

On Wed, Jan 02, 2013 at 08:08:48PM +0000, Eric Wong wrote:
> (changing Cc:)
>
> Eric Wong <normalperson@yhbt.net> wrote:
> > I'm finding ppoll() unexpectedly stuck when waiting for POLLIN on a
> > local TCP socket.  The isolated code below can reproduces the issue
> > after many minutes (<1 hour).  It might be easier to reproduce on
> > a busy system while disk I/O is happening.
>
> s/might be/is/
>
> Strangely, I've bisected this seemingly networking-related issue down to
> the following commit:
>
>   commit 1fb3f8ca0e9222535a39b884cb67a34628411b9f
>   Author: Mel Gorman <mgorman@suse.de>
>   Date:   Mon Oct 8 16:29:12 2012 -0700
>
>       mm: compaction: capture a suitable high-order page immediately when it is made available
>
> That commit doesn't revert cleanly on v3.7.1, and I don't feel
> comfortable touching that code myself.
>

That patch introduced an accounting bug that was corrected by ef6c5be6
(fix incorrect NR_FREE_PAGES accounting (appears like memory leak)).
In some cases that could look like a hang and potentially confuse a
bisection.  That said, I see that you report that 3.7.1 and 3.8-rc2 are
affected; that includes the fix, and the finger is pointed at
compaction, so something is wrong.

> Instead, I disabled THP+compaction under v3.7.1 and I've been unable to
> reproduce the issue without THP+compaction.
>

Implying that it's stuck in compaction somewhere.  It could be the case
that compaction alters timing enough to trigger another bug.  You say it
tests differently depending on whether TCP or unix sockets are used,
which might indicate multiple problems.  However, let's try and see if
compaction is the primary problem or not.

> As I mention in http://mid.gmane.org/20121229113434.GA13336@dcvr.yhbt.net
> I run my below test (`toosleepy') with heavy network and disk activity
> for a long time before hitting this.
>

Using a 3.7.1 or 3.8-rc2 kernel, can you reproduce the problem and then
answer the following questions please?

1. What are the contents of /proc/vmstat at the time it is stuck?
2. What are the contents of /proc/PID/stack for every toosleepy
   process when they are stuck?
3. Can you do a sysrq+m and post the resulting dmesg?

What I'm looking for is a throttling bug (if pgscan_direct_throttle is
elevated), an isolated page accounting bug (nr_isolated_* is elevated
and a process is stuck in congestion_wait in a too_many_isolated()
loop) or a free page accounting bug (big difference between
nr_free_pages and buddy list figures).

I'll try reproducing this early next week if none of that shows an
obvious candidate.

Thanks.

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 53+ messages in thread
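Collecting all three of those by hand is awkward once a process wedges,
so it helps to automate the reporting from inside the test itself,
which is what Eric ends up doing in a later message
(git://bogomips.org/toosleepy.git).  A hypothetical sketch of such a
helper, not the actual toosleepy reporter; note that reading
/proc/PID/task/TID/stack needs CONFIG_STACKTRACE and sufficient
privileges:

#define _GNU_SOURCE
#include <dirent.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static void dump_file(const char *path)
{
	char buf[4096];
	ssize_t r;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return;
	dprintf(2, "===> %s <===\n", path);
	while ((r = read(fd, buf, sizeof(buf))) > 0)
		if (write(2, buf, (size_t)r) < 0)
			break;
	close(fd);
}

/* call this when a watchdog decides the test is stuck */
static void dump_stuck_state(void)
{
	DIR *d;
	struct dirent *de;
	char path[64];

	dump_file("/proc/vmstat");
	d = opendir("/proc/self/task");
	if (!d)
		return;
	while ((de = readdir(d))) {
		if (de->d_name[0] == '.')
			continue;
		snprintf(path, sizeof(path),
			 "/proc/self/task/%s/stack", de->d_name);
		dump_file(path);
	}
	closedir(d);
}

The "===> path <===" markers match the format of the reports quoted
later in this thread; sysrq+m still has to be triggered separately.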
* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-04 16:01   ` Mel Gorman
@ 2013-01-04 17:15     ` Eric Dumazet
  0 siblings, 0 replies; 53+ messages in thread
From: Eric Dumazet @ 2013-01-04 17:15 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Eric Wong, linux-mm, netdev, linux-kernel, Rik van Riel,
	Minchan Kim, Andrew Morton, Linus Torvalds

On Fri, 2013-01-04 at 16:01 +0000, Mel Gorman wrote:

> Implying that it's stuck in compaction somewhere. It could be the case
> that compaction alters timing enough to trigger another bug. You say it
> tests differently depending on whether TCP or unix sockets are used,
> which might indicate multiple problems. However, let's try and see if
> compaction is the primary problem or not.

One difference between TCP and unix sockets is that unix sockets try
hard to limit the order of allocations.

For a 16KB (+ skb overhead) send(), we will probably use one order-2
page and one order-0 page as a frag (data_len being not 0):

vi +1484 net/unix/af_unix.c

	if (len > SKB_MAX_ALLOC)
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err);

TCP, on the other hand, can use order-3 pages if available.

Eric, you could try to change SKB_FRAG_PAGE_ORDER in net/core/sock.c
to lower values (16384, 8192, 4096) and check whether the hang
disappears or not.

Alternatively (no kernel patching needed), you could try to hang
AF_UNIX using buffers of 90KB, to force order-3 allocations as well
(one 32KB allocation plus 16 * 4KB frags).

Thanks

^ permalink raw reply	[flat|nested] 53+ messages in thread
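A minimal sketch of that last suggestion, assuming the only point of
interest is the 90KB write size (a real test would keep toosleepy's
three-thread pipeline and run alongside the page-dirtying workload):

#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <sys/socket.h>
#include <unistd.h>

/* per the analysis above: > SKB_MAX_ALLOC, so each send needs one
 * large (order-3) head allocation plus 4KB frags */
#define BIG (90 * 1024)

static void *drain(void *p)
{
	static char buf[BIG];
	int fd = *(int *)p;

	while (read(fd, buf, sizeof(buf)) > 0)
		;
	return NULL;
}

int main(void)
{
	static char buf[BIG];
	int sv[2];
	pthread_t t;

	assert(0 == socketpair(AF_UNIX, SOCK_STREAM, 0, sv));
	assert(0 == pthread_create(&t, NULL, drain, &sv[0]));

	for (;;) {	/* stress loop; interrupt or kill to stop */
		ssize_t s = send(sv[1], buf, sizeof(buf), 0);

		if (s == -1 && errno != EINTR)
			break;
	}
	close(sv[1]);
	pthread_join(t, NULL);
	return 0;
}

If the AF_UNIX variant then hangs the same way, the order-3 allocation
path is implicated rather than anything TCP-specific.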
* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-04 16:01   ` Mel Gorman
  2013-01-04 17:15     ` Eric Dumazet
@ 2013-01-04 17:59     ` Eric Wong
  2013-01-05  1:07     ` Eric Wong
  2013-01-06 12:07     ` Eric Wong
  3 siblings, 0 replies; 53+ messages in thread
From: Eric Wong @ 2013-01-04 17:59 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
	Andrew Morton, Linus Torvalds

Mel Gorman <mgorman@suse.de> wrote:
> On Wed, Jan 02, 2013 at 08:08:48PM +0000, Eric Wong wrote:
> > Instead, I disabled THP+compaction under v3.7.1 and I've been unable to
> > reproduce the issue without THP+compaction.
> >
> Implying that it's stuck in compaction somewhere. It could be the case
> that compaction alters timing enough to trigger another bug. You say it
> tests differently depending on whether TCP or unix sockets are used,
> which might indicate multiple problems. However, let's try and see if
> compaction is the primary problem or not.

I haven't managed to reproduce the issue on Unix sockets yet, just TCP.
Trying Unix sockets with 90KB buffers as Eric Dumazet suggested.

I'll get the info you need from /proc soon.

Thank you for looking at this!

^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending 2013-01-04 16:01 ` Mel Gorman 2013-01-04 17:15 ` Eric Dumazet 2013-01-04 17:59 ` Eric Wong @ 2013-01-05 1:07 ` Eric Wong 2013-01-06 12:07 ` Eric Wong 3 siblings, 0 replies; 53+ messages in thread From: Eric Wong @ 2013-01-05 1:07 UTC (permalink / raw) To: Mel Gorman Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim, Andrew Morton, Linus Torvalds Mel Gorman <mgorman@suse.de> wrote: > On Wed, Jan 02, 2013 at 08:08:48PM +0000, Eric Wong wrote: > > Instead, I disabled THP+compaction under v3.7.1 and I've been unable to > > reproduce the issue without THP+compaction. > > > > Implying that it's stuck in compaction somewhere. It could be the case > that compaction alters timing enough to trigger another bug. You say it > tests differently depending on whether TCP or unix sockets are used > which might indicate multiple problems. However, lets try and see if > compaction is the primary problem or not. I've only managed to encounter this issue with TCP sockets. No luck reproducing the issue with Unix sockets, not even with 90K buffers as suggested by Eric Dumazet. This seems unique to TCP. Fwiw, I also tried going back to a 16K MTU on loopback a few days ago, but was still able to reproduce the issue, so commit 0cf833aefaa85bbfce3ff70485e5534e09254773 doesn't seem to be a culprit, either. > > As I mention in http://mid.gmane.org/20121229113434.GA13336@dcvr.yhbt.net > > I run my below test (`toosleepy') with heavy network and disk activity > > for a long time before hitting this. > > > > Using a 3.7.1 or 3.8-rc2 kernel, can you reproduce the problem and then > answer the following questions please? OK, I'm on 3.8-rc2. > 1. What are the contents of /proc/vmstat at the time it is stuck? nr_free_pages 1998 nr_inactive_anon 3401 nr_active_anon 3349 nr_inactive_file 94361 nr_active_file 10929 nr_unevictable 0 nr_mlock 0 nr_anon_pages 6643 nr_mapped 2255 nr_file_pages 105400 nr_dirty 44 nr_writeback 0 nr_slab_reclaimable 0 nr_slab_unreclaimable 0 nr_page_table_pages 697 nr_kernel_stack 161 nr_unstable 0 nr_bounce 0 nr_vmscan_write 0 nr_vmscan_immediate_reclaim 0 nr_writeback_temp 0 nr_isolated_anon 0 nr_isolated_file 0 nr_shmem 114 nr_dirtied 1076168 nr_written 46330 nr_anon_transparent_hugepages 0 nr_free_cma 0 nr_dirty_threshold 22495 nr_dirty_background_threshold 11247 pgpgin 4398164 pgpgout 188556 pswpin 0 pswpout 0 pgalloc_dma 369887 pgalloc_dma32 28406230 pgalloc_normal 0 pgalloc_movable 0 pgfree 28779104 pgactivate 18160 pgdeactivate 17404 pgfault 34862559 pgmajfault 358 pgrefill_dma 14076 pgrefill_dma32 3328 pgrefill_normal 0 pgrefill_movable 0 pgsteal_kswapd_dma 12708 pgsteal_kswapd_dma32 917837 pgsteal_kswapd_normal 0 pgsteal_kswapd_movable 0 pgsteal_direct_dma 73 pgsteal_direct_dma32 4085 pgsteal_direct_normal 0 pgsteal_direct_movable 0 pgscan_kswapd_dma 12708 pgscan_kswapd_dma32 918789 pgscan_kswapd_normal 0 pgscan_kswapd_movable 0 pgscan_direct_dma 73 pgscan_direct_dma32 4115 pgscan_direct_normal 0 pgscan_direct_movable 0 pgscan_direct_throttle 0 pginodesteal 0 slabs_scanned 257024 kswapd_inodesteal 69910 kswapd_low_wmark_hit_quickly 2165 kswapd_high_wmark_hit_quickly 275 kswapd_skip_congestion_wait 0 pageoutrun 13412 allocstall 73 pgrotated 3 pgmigrate_success 448 pgmigrate_fail 0 compact_migrate_scanned 14860 compact_free_scanned 219867 compact_isolated 1652 compact_stall 33 compact_fail 10 compact_success 23 unevictable_pgs_culled 1058 unevictable_pgs_scanned 0 unevictable_pgs_rescued 1671 unevictable_pgs_mlocked 1671 
unevictable_pgs_munlocked 1671 unevictable_pgs_cleared 0 unevictable_pgs_stranded 0 thp_fault_alloc 0 thp_fault_fallback 0 thp_collapse_alloc 0 thp_collapse_alloc_failed 0 thp_split 0 thp_zero_page_alloc 0 thp_zero_page_alloc_failed 0 > 2. What are the contents of /proc/PID/stack for every toosleepy > process when they are stuck? Oops, I needed a rebuild with CONFIG_STACKTRACE=y (it took some effort to get the right combination of options). I probably enabled a few more debugging options than I needed and it seems to have taken longer to reproduce the issue. Unfortunately I was distracted when toosleepy got stuck and missed the change to inspect before hitting ETIMEDOUT :x Attempting to reproduce the issue while I'm looking. > 3. Can you do a sysrq+m and post the resulting dmesg? SysRq : Show Memory Mem-Info: DMA per-cpu: CPU 0: hi: 0, btch: 1 usd: 0 CPU 1: hi: 0, btch: 1 usd: 0 DMA32 per-cpu: CPU 0: hi: 186, btch: 31 usd: 144 CPU 1: hi: 186, btch: 31 usd: 160 active_anon:3358 inactive_anon:3379 isolated_anon:0 active_file:10615 inactive_file:92319 isolated_file:0 unevictable:0 dirty:3 writeback:0 unstable:0 free:2240 slab_reclaimable:0 slab_unreclaimable:0 mapped:2333 shmem:114 pagetables:697 bounce:0 free_cma:0 DMA free:2408kB min:84kB low:104kB high:124kB active_anon:8kB inactive_anon:44kB active_file:824kB inactive_file:11512kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15676kB managed:15900kB mlocked:0kB dirty:0kB writeback:0kB mapped:16kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:112kB pagetables:20kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 489 489 489 DMA32 free:6552kB min:2784kB low:3480kB high:4176kB active_anon:13424kB inactive_anon:13472kB active_file:41636kB inactive_file:357764kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:500952kB managed:491396kB mlocked:0kB dirty:12kB writeback:0kB mapped:9316kB shmem:456kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:1160kB pagetables:2768kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 DMA: 52*4kB (UMR) 13*8kB (UMR) 4*16kB (R) 2*32kB (R) 1*64kB (R) 1*128kB (R) 1*256kB (R) 1*512kB (R) 1*1024kB (R) 0*2048kB 0*4096kB = 2424kB DMA32: 1608*4kB (UM) 15*8kB (M) 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 6552kB 103053 total pagecache pages 0 pages in swap cache Swap cache stats: add 0, delete 0, find 0/0 Free swap = 392188kB Total swap = 392188kB 131054 pages RAM 3477 pages reserved 283467 pages shared 111732 pages non-shared > What I'm looking for is a throttling bug (if pgscan_direct_throttle is > elevated), an isolated page accounting bug (nr_isolated_* is elevated > and process is stuck in congestion_wait in a too_many_isolated() loop) > or a free page accounting bug (big difference between nr_free_pages and > buddy list figures). > > I'll try reproducing this early next week if none of that shows an > obvious candidate. Thanks! I'll try to get you more information as soon as possible. ^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending 2013-01-04 16:01 ` Mel Gorman ` (2 preceding siblings ...) 2013-01-05 1:07 ` Eric Wong @ 2013-01-06 12:07 ` Eric Wong 2013-01-07 12:25 ` Mel Gorman 3 siblings, 1 reply; 53+ messages in thread From: Eric Wong @ 2013-01-06 12:07 UTC (permalink / raw) To: Mel Gorman Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim, Andrew Morton, Linus Torvalds Mel Gorman <mgorman@suse.de> wrote: > Using a 3.7.1 or 3.8-rc2 kernel, can you reproduce the problem and then > answer the following questions please? This is on my main machine running 3.8-rc2 > 1. What are the contents of /proc/vmstat at the time it is stuck? ===> /proc/vmstat <=== nr_free_pages 40305 nr_inactive_anon 25023 nr_active_anon 85684 nr_inactive_file 2614786 nr_active_file 209440 nr_unevictable 0 nr_mlock 0 nr_anon_pages 73510 nr_mapped 6017 nr_file_pages 2843997 nr_dirty 695934 nr_writeback 629239 nr_slab_reclaimable 68414 nr_slab_unreclaimable 14178 nr_page_table_pages 3136 nr_kernel_stack 314 nr_unstable 0 nr_bounce 0 nr_vmscan_write 12220042 nr_vmscan_immediate_reclaim 31213310 nr_writeback_temp 0 nr_isolated_anon 0 nr_isolated_file 0 nr_shmem 24101 nr_dirtied 534655274 nr_written 281872191 nr_anon_transparent_hugepages 24 nr_free_cma 0 nr_dirty_threshold 2790220 nr_dirty_background_threshold 29370 pgpgin 6961109514 pgpgout 1124854772 pswpin 3940 pswpout 127109 pgalloc_dma 6 pgalloc_dma32 7750674038 pgalloc_normal 78295989795 pgalloc_movable 0 pgfree 86049272519 pgactivate 21397174 pgdeactivate 423853 pgfault 473074235 pgmajfault 20093 pgrefill_dma 0 pgrefill_dma32 158720 pgrefill_normal 233024 pgrefill_movable 0 pgsteal_kswapd_dma 0 pgsteal_kswapd_dma32 450844931 pgsteal_kswapd_normal 1288388818 pgsteal_kswapd_movable 0 pgsteal_direct_dma 0 pgsteal_direct_dma32 71774371 pgsteal_direct_normal 197326432 pgsteal_direct_movable 0 pgscan_kswapd_dma 0 pgscan_kswapd_dma32 459780161 pgscan_kswapd_normal 1334016908 pgscan_kswapd_movable 0 pgscan_direct_dma 0 pgscan_direct_dma32 75632525 pgscan_direct_normal 222990090 pgscan_direct_movable 0 pgscan_direct_throttle 0 pginodesteal 228906 slabs_scanned 4077568 kswapd_inodesteal 2591027 kswapd_low_wmark_hit_quickly 674289 kswapd_high_wmark_hit_quickly 39642 kswapd_skip_congestion_wait 506 pageoutrun 2908071 allocstall 431220 pgrotated 15736438 pgmigrate_success 865182 pgmigrate_fail 78157 compact_migrate_scanned 17276417 compact_free_scanned 204979571 compact_isolated 3463801 compact_stall 349792 compact_fail 160801 compact_success 188991 htlb_buddy_alloc_success 0 htlb_buddy_alloc_fail 0 unevictable_pgs_culled 0 unevictable_pgs_scanned 0 unevictable_pgs_rescued 0 unevictable_pgs_mlocked 0 unevictable_pgs_munlocked 0 unevictable_pgs_cleared 0 unevictable_pgs_stranded 0 thp_fault_alloc 720 thp_fault_fallback 1719 thp_collapse_alloc 8631 thp_collapse_alloc_failed 4110 thp_split 700 thp_zero_page_alloc 0 thp_zero_page_alloc_failed 0 > 2. What are the contents of /proc/PID/stack for every toosleepy > process when they are stuck? 
pid and tid stack info, 28018 is the thread I used to automate reporting (pushed to git://bogomips.org/toosleepy.git) ===> 28014[28014]/stack <=== [<ffffffff8105a97b>] futex_wait_queue_me+0xb7/0xd2 [<ffffffff8105b7fc>] futex_wait+0xf6/0x1f6 [<ffffffff811bb3af>] cpumask_next_and+0x2b/0x37 [<ffffffff8104ebfa>] select_task_rq_fair+0x518/0x59a [<ffffffff8105c8f1>] do_futex+0xa9/0x88f [<ffffffff810509a4>] check_preempt_wakeup+0x10d/0x1a7 [<ffffffff8104757d>] check_preempt_curr+0x25/0x62 [<ffffffff8104d4cc>] wake_up_new_task+0x96/0xc2 [<ffffffff8105d1e9>] sys_futex+0x112/0x14d [<ffffffff81322a49>] stub_clone+0x69/0x90 [<ffffffff81322769>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 28014[28015]/stack <=== [<ffffffff812ae316>] dev_hard_start_xmit+0x281/0x3f1 [<ffffffff81041010>] add_wait_queue+0x14/0x40 [<ffffffff810de0bc>] poll_schedule_timeout+0x43/0x5d [<ffffffff810deb46>] do_sys_poll+0x314/0x39b [<ffffffff810de220>] pollwake+0x0/0x4e [<ffffffff8129fc1d>] release_sock+0xe5/0x11b [<ffffffff812d7f61>] tcp_recvmsg+0x713/0x846 [<ffffffff812f432c>] inet_recvmsg+0x64/0x75 [<ffffffff8129a26b>] sock_recvmsg+0x86/0x9e [<ffffffff8100541c>] emulate_vsyscall+0x1e6/0x28e [<ffffffff8129a3bc>] sockfd_lookup_light+0x1a/0x50 [<ffffffff8129c18b>] sys_recvfrom+0x110/0x128 [<ffffffff81000e34>] __switch_to+0x235/0x3c5 [<ffffffff810ca402>] kmem_cache_free+0x32/0xb9 [<ffffffff810b809d>] remove_vma+0x44/0x4c [<ffffffff810df0a5>] sys_ppoll+0xaf/0x123 [<ffffffff81322769>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 28014[28016]/stack <=== [<ffffffff812ae7ad>] dev_queue_xmit+0x327/0x336 [<ffffffff8102cb9f>] _local_bh_enable_ip+0x7a/0x8b [<ffffffff81041010>] add_wait_queue+0x14/0x40 [<ffffffff810de0bc>] poll_schedule_timeout+0x43/0x5d [<ffffffff810deb46>] do_sys_poll+0x314/0x39b [<ffffffff810de220>] pollwake+0x0/0x4e [<ffffffff8129fc1d>] release_sock+0xe5/0x11b [<ffffffff812d7f61>] tcp_recvmsg+0x713/0x846 [<ffffffff812f432c>] inet_recvmsg+0x64/0x75 [<ffffffff8129a26b>] sock_recvmsg+0x86/0x9e [<ffffffff8100541c>] emulate_vsyscall+0x1e6/0x28e [<ffffffff8129a3bc>] sockfd_lookup_light+0x1a/0x50 [<ffffffff8129c18b>] sys_recvfrom+0x110/0x128 [<ffffffff81000e34>] __switch_to+0x235/0x3c5 [<ffffffff810df0a5>] sys_ppoll+0xaf/0x123 [<ffffffff81322769>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 28014[28017]/stack <=== [<ffffffff8129fc1d>] release_sock+0xe5/0x11b [<ffffffff812a642c>] sk_stream_wait_memory+0x1f7/0x1fc [<ffffffff81040d5e>] autoremove_wake_function+0x0/0x2a [<ffffffff812d8fc3>] tcp_sendmsg+0x710/0x86d [<ffffffff8129a33e>] sock_sendmsg+0x7b/0x93 [<ffffffff8129a642>] sys_sendto+0xee/0x145 [<ffffffff8129a3bc>] sockfd_lookup_light+0x1a/0x50 [<ffffffff8129a668>] sys_sendto+0x114/0x145 [<ffffffff81000e34>] __switch_to+0x235/0x3c5 [<ffffffff81322769>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 28014[28018]/stack <=== [<ffffffff8102b23e>] do_wait+0x1a6/0x21a [<ffffffff8104757d>] check_preempt_curr+0x25/0x62 [<ffffffff8102b34a>] sys_wait4+0x98/0xb5 [<ffffffff81026321>] do_fork+0x12c/0x1a7 [<ffffffff810297b0>] child_wait_callback+0x0/0x48 [<ffffffff8131c688>] page_fault+0x28/0x30 [<ffffffff81322769>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff > 3. Can you do a sysrq+m and post the resulting dmesg? 
SysRq : Show Memory Mem-Info: DMA per-cpu: CPU 0: hi: 0, btch: 1 usd: 0 CPU 1: hi: 0, btch: 1 usd: 0 CPU 2: hi: 0, btch: 1 usd: 0 CPU 3: hi: 0, btch: 1 usd: 0 DMA32 per-cpu: CPU 0: hi: 186, btch: 31 usd: 4 CPU 1: hi: 186, btch: 31 usd: 181 CPU 2: hi: 186, btch: 31 usd: 46 CPU 3: hi: 186, btch: 31 usd: 13 Normal per-cpu: CPU 0: hi: 186, btch: 31 usd: 106 CPU 1: hi: 186, btch: 31 usd: 183 CPU 2: hi: 186, btch: 31 usd: 20 CPU 3: hi: 186, btch: 31 usd: 76 active_anon:85782 inactive_anon:25023 isolated_anon:0 active_file:209440 inactive_file:2610279 isolated_file:0 unevictable:0 dirty:696664 writeback:629020 unstable:0 free:44152 slab_reclaimable:68414 slab_unreclaimable:14178 mapped:6017 shmem:24101 pagetables:3136 bounce:0 free_cma:0 DMA free:15872kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15640kB managed:15896kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:24kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes lowmem_reserve[]: 0 3132 12078 12078 DMA32 free:85264kB min:17504kB low:21880kB high:26256kB active_anon:46808kB inactive_anon:21212kB active_file:122040kB inactive_file:2833064kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:3208020kB managed:3185856kB mlocked:0kB dirty:92120kB writeback:225356kB mapped:356kB shmem:6776kB slab_reclaimable:67156kB slab_unreclaimable:7412kB kernel_stack:80kB pagetables:816kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 8946 8946 Normal free:75472kB min:49988kB low:62484kB high:74980kB active_anon:296320kB inactive_anon:78880kB active_file:715720kB inactive_file:7608052kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:9160704kB managed:9084264kB mlocked:0kB dirty:2694536kB writeback:2290724kB mapped:23712kB shmem:89628kB slab_reclaimable:206500kB slab_unreclaimable:49276kB kernel_stack:2432kB pagetables:11728kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 DMA: 0*4kB 0*8kB 0*16kB 0*32kB 2*64kB (U) 1*128kB (U) 1*256kB (U) 0*512kB 1*1024kB (U) 1*2048kB (R) 3*4096kB (M) = 15872kB DMA32: 1681*4kB (UEM) 3196*8kB (UEM) 3063*16kB (UEM) 63*32kB (UEM) 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 1*2048kB (R) 0*4096kB = 85364kB Normal: 8874*4kB (UEM) 1885*8kB (UEM) 581*16kB (UEM) 412*32kB (UM) 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 1*2048kB (R) 0*4096kB = 75104kB 2839464 total pagecache pages 891 pages in swap cache Swap cache stats: add 131049, delete 130158, find 1103447/1103954 Free swap = 4152384kB Total swap = 4194300kB 3145712 pages RAM 73642 pages reserved 3313060 pages shared 1432170 pages non-shared -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending 2013-01-06 12:07 ` Eric Wong @ 2013-01-07 12:25 ` Mel Gorman 2013-01-07 22:38 ` Eric Dumazet 2013-01-07 22:38 ` Eric Wong 0 siblings, 2 replies; 53+ messages in thread From: Mel Gorman @ 2013-01-07 12:25 UTC (permalink / raw) To: Eric Wong Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim, Eric Dumazet, Andrew Morton, Linus Torvalds On Sun, Jan 06, 2013 at 12:07:00PM +0000, Eric Wong wrote: > Mel Gorman <mgorman@suse.de> wrote: > > Using a 3.7.1 or 3.8-rc2 kernel, can you reproduce the problem and then > > answer the following questions please? > > This is on my main machine running 3.8-rc2 > > > 1. What are the contents of /proc/vmstat at the time it is stuck? > > ===> /proc/vmstat <=== According to this, THP is barely being used -- only 24 THP pages at the time and the LRU lists are dominated by file pages. The isolated and throttled counters look fine. There is a lot of memory currently under writeback and a large number of dirty pages are reaching the end of the LRU list which is inefficient but does not account for the reported bug. > > 2. What are the contents of /proc/PID/stack for every toosleepy > > process when they are stuck? > > pid and tid stack info, 28018 is the thread I used to automate > reporting (pushed to git://bogomips.org/toosleepy.git) > > ===> 28014[28014]/stack <=== > [<ffffffff8105a97b>] futex_wait_queue_me+0xb7/0xd2 > [<ffffffff8105b7fc>] futex_wait+0xf6/0x1f6 > [<ffffffff811bb3af>] cpumask_next_and+0x2b/0x37 > [<ffffffff8104ebfa>] select_task_rq_fair+0x518/0x59a > [<ffffffff8105c8f1>] do_futex+0xa9/0x88f > [<ffffffff810509a4>] check_preempt_wakeup+0x10d/0x1a7 > [<ffffffff8104757d>] check_preempt_curr+0x25/0x62 > [<ffffffff8104d4cc>] wake_up_new_task+0x96/0xc2 > [<ffffffff8105d1e9>] sys_futex+0x112/0x14d > [<ffffffff81322a49>] stub_clone+0x69/0x90 > [<ffffffff81322769>] system_call_fastpath+0x16/0x1b > [<ffffffffffffffff>] 0xffffffffffffffff Looks ok. > ===> 28014[28015]/stack <=== > [<ffffffff812ae316>] dev_hard_start_xmit+0x281/0x3f1 > [<ffffffff81041010>] add_wait_queue+0x14/0x40 > [<ffffffff810de0bc>] poll_schedule_timeout+0x43/0x5d > [<ffffffff810deb46>] do_sys_poll+0x314/0x39b > [<ffffffff810de220>] pollwake+0x0/0x4e > [<ffffffff8129fc1d>] release_sock+0xe5/0x11b > [<ffffffff812d7f61>] tcp_recvmsg+0x713/0x846 > [<ffffffff812f432c>] inet_recvmsg+0x64/0x75 > [<ffffffff8129a26b>] sock_recvmsg+0x86/0x9e > [<ffffffff8100541c>] emulate_vsyscall+0x1e6/0x28e > [<ffffffff8129a3bc>] sockfd_lookup_light+0x1a/0x50 > [<ffffffff8129c18b>] sys_recvfrom+0x110/0x128 > [<ffffffff81000e34>] __switch_to+0x235/0x3c5 > [<ffffffff810ca402>] kmem_cache_free+0x32/0xb9 > [<ffffffff810b809d>] remove_vma+0x44/0x4c > [<ffffffff810df0a5>] sys_ppoll+0xaf/0x123 > [<ffffffff81322769>] system_call_fastpath+0x16/0x1b > [<ffffffffffffffff>] 0xffffffffffffffff Polling waiting for data, looks ok to me. 
> ===> 28014[28016]/stack <=== > [<ffffffff812ae7ad>] dev_queue_xmit+0x327/0x336 > [<ffffffff8102cb9f>] _local_bh_enable_ip+0x7a/0x8b > [<ffffffff81041010>] add_wait_queue+0x14/0x40 > [<ffffffff810de0bc>] poll_schedule_timeout+0x43/0x5d > [<ffffffff810deb46>] do_sys_poll+0x314/0x39b > [<ffffffff810de220>] pollwake+0x0/0x4e > [<ffffffff8129fc1d>] release_sock+0xe5/0x11b > [<ffffffff812d7f61>] tcp_recvmsg+0x713/0x846 > [<ffffffff812f432c>] inet_recvmsg+0x64/0x75 > [<ffffffff8129a26b>] sock_recvmsg+0x86/0x9e > [<ffffffff8100541c>] emulate_vsyscall+0x1e6/0x28e > [<ffffffff8129a3bc>] sockfd_lookup_light+0x1a/0x50 > [<ffffffff8129c18b>] sys_recvfrom+0x110/0x128 > [<ffffffff81000e34>] __switch_to+0x235/0x3c5 > [<ffffffff810df0a5>] sys_ppoll+0xaf/0x123 > [<ffffffff81322769>] system_call_fastpath+0x16/0x1b > [<ffffffffffffffff>] 0xffffffffffffffff Waiting on receive again. > ===> 28014[28017]/stack <=== > [<ffffffff8129fc1d>] release_sock+0xe5/0x11b > [<ffffffff812a642c>] sk_stream_wait_memory+0x1f7/0x1fc > [<ffffffff81040d5e>] autoremove_wake_function+0x0/0x2a > [<ffffffff812d8fc3>] tcp_sendmsg+0x710/0x86d > [<ffffffff8129a33e>] sock_sendmsg+0x7b/0x93 > [<ffffffff8129a642>] sys_sendto+0xee/0x145 > [<ffffffff8129a3bc>] sockfd_lookup_light+0x1a/0x50 > [<ffffffff8129a668>] sys_sendto+0x114/0x145 > [<ffffffff81000e34>] __switch_to+0x235/0x3c5 > [<ffffffff81322769>] system_call_fastpath+0x16/0x1b > [<ffffffffffffffff>] 0xffffffffffffffff This seems to be the guy that's stuck. It's waiting for more memory for the socket but who or what is allocating that memory? There are a few other bugs from over the weekend that I want to take a look at so I did not dig further or try to reproduce this bug yet. I'm adding Eric Dumazet back to the cc in case he has the quick answer. > ===> 28014[28018]/stack <=== > [<ffffffff8102b23e>] do_wait+0x1a6/0x21a > [<ffffffff8104757d>] check_preempt_curr+0x25/0x62 > [<ffffffff8102b34a>] sys_wait4+0x98/0xb5 > [<ffffffff81026321>] do_fork+0x12c/0x1a7 > [<ffffffff810297b0>] child_wait_callback+0x0/0x48 > [<ffffffff8131c688>] page_fault+0x28/0x30 > [<ffffffff81322769>] system_call_fastpath+0x16/0x1b > [<ffffffffffffffff>] 0xffffffffffffffff > > > 3. Can you do a sysrq+m and post the resulting dmesg? > > SysRq : Show Memory > Mem-Info: > DMA per-cpu: > CPU 0: hi: 0, btch: 1 usd: 0 > CPU 1: hi: 0, btch: 1 usd: 0 > CPU 2: hi: 0, btch: 1 usd: 0 > CPU 3: hi: 0, btch: 1 usd: 0 > DMA32 per-cpu: > CPU 0: hi: 186, btch: 31 usd: 4 > CPU 1: hi: 186, btch: 31 usd: 181 > CPU 2: hi: 186, btch: 31 usd: 46 > CPU 3: hi: 186, btch: 31 usd: 13 > Normal per-cpu: > CPU 0: hi: 186, btch: 31 usd: 106 > CPU 1: hi: 186, btch: 31 usd: 183 > CPU 2: hi: 186, btch: 31 usd: 20 > CPU 3: hi: 186, btch: 31 usd: 76 > active_anon:85782 inactive_anon:25023 isolated_anon:0 > active_file:209440 inactive_file:2610279 isolated_file:0 > unevictable:0 dirty:696664 writeback:629020 unstable:0 > free:44152 slab_reclaimable:68414 slab_unreclaimable:14178 > mapped:6017 shmem:24101 pagetables:3136 bounce:0 > free_cma:0 > DMA free:15872kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15640kB managed:15896kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:24kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? 
yes > lowmem_reserve[]: 0 3132 12078 12078 > DMA32 free:85264kB min:17504kB low:21880kB high:26256kB active_anon:46808kB inactive_anon:21212kB active_file:122040kB inactive_file:2833064kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:3208020kB managed:3185856kB mlocked:0kB dirty:92120kB writeback:225356kB mapped:356kB shmem:6776kB slab_reclaimable:67156kB slab_unreclaimable:7412kB kernel_stack:80kB pagetables:816kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no > lowmem_reserve[]: 0 0 8946 8946 > Normal free:75472kB min:49988kB low:62484kB high:74980kB active_anon:296320kB inactive_anon:78880kB active_file:715720kB inactive_file:7608052kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:9160704kB managed:9084264kB mlocked:0kB dirty:2694536kB writeback:2290724kB mapped:23712kB shmem:89628kB slab_reclaimable:206500kB slab_unreclaimable:49276kB kernel_stack:2432kB pagetables:11728kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no > lowmem_reserve[]: 0 0 0 0 > DMA: 0*4kB 0*8kB 0*16kB 0*32kB 2*64kB (U) 1*128kB (U) 1*256kB (U) 0*512kB 1*1024kB (U) 1*2048kB (R) 3*4096kB (M) = 15872kB > DMA32: 1681*4kB (UEM) 3196*8kB (UEM) 3063*16kB (UEM) 63*32kB (UEM) 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 1*2048kB (R) 0*4096kB = 85364kB > Normal: 8874*4kB (UEM) 1885*8kB (UEM) 581*16kB (UEM) 412*32kB (UM) 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 1*2048kB (R) 0*4096kB = 75104kB Nothing wrong there that I can see. The free list contents roughly match up with the NR_FREE_PAGES counter so it doesn't look like an accounting bug. However, an accounting bug could have broken the bisection and found a different bug. When taking pages straight off the buddy list like this patch does, there is a danger that the watermarks will be broken resulting in a livelock but the watermarks are checked properly and the free pages are over the min watermark above. There is this patch https://lkml.org/lkml/2013/1/6/219 but it is unlikely that it has anything to do with your workload as it does not use splice(). Right now it's difficult to see how the capture could be the source of this bug but I'm not ruling it out either so try the following (untested but should be ok) patch. It's not a proper revert, it just disables the capture page logic to see if it's at fault. 
diff --git a/mm/compaction.c b/mm/compaction.c
index 6b807e4..81a637d 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1054,9 +1054,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 				goto out;
 			}
 		}
-
-		/* Capture a page now if it is a suitable size */
-		compact_capture_page(cc);
 	}
 
 out:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4ba5e37..85d3f9d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2179,11 +2179,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 					contended_compaction, &page);
 	current->flags &= ~PF_MEMALLOC;
 
-	/* If compaction captured a page, prep and use it */
-	if (page) {
-		prep_new_page(page, order, gfp_mask);
-		goto got_page;
-	}
+	/* capture page is disabled, this should be impossible */
+	BUG_ON(page);
 
 	if (*did_some_progress != COMPACT_SKIPPED) {
 		/* Page migration frees to the PCP lists but we want merging */
@@ -2195,7 +2192,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 				alloc_flags & ~ALLOC_NO_WATERMARKS,
 				preferred_zone, migratetype);
 		if (page) {
-got_page:
 			preferred_zone->compact_blockskip_flush = false;
 			preferred_zone->compact_considered = 0;
 			preferred_zone->compact_defer_shift = 0;

^ permalink raw reply related	[flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-07 12:25       ` Mel Gorman
@ 2013-01-07 22:38         ` Eric Dumazet
  2013-01-08  0:21           ` Eric Wong
  0 siblings, 1 reply; 53+ messages in thread
From: Eric Dumazet @ 2013-01-07 22:38 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Eric Wong, linux-mm, netdev, linux-kernel, Rik van Riel,
	Minchan Kim, Andrew Morton, Linus Torvalds

On Mon, 2013-01-07 at 12:25 +0000, Mel Gorman wrote:
>
> > ===> 28014[28017]/stack <===
> > [<ffffffff8129fc1d>] release_sock+0xe5/0x11b
> > [<ffffffff812a642c>] sk_stream_wait_memory+0x1f7/0x1fc
> > [<ffffffff81040d5e>] autoremove_wake_function+0x0/0x2a
> > [<ffffffff812d8fc3>] tcp_sendmsg+0x710/0x86d
> > [<ffffffff8129a33e>] sock_sendmsg+0x7b/0x93
> > [<ffffffff8129a642>] sys_sendto+0xee/0x145
> > [<ffffffff8129a3bc>] sockfd_lookup_light+0x1a/0x50
> > [<ffffffff8129a668>] sys_sendto+0x114/0x145
> > [<ffffffff81000e34>] __switch_to+0x235/0x3c5
> > [<ffffffff81322769>] system_call_fastpath+0x16/0x1b
> > [<ffffffffffffffff>] 0xffffffffffffffff
>
> This seems to be the guy that's stuck. It's waiting for more memory for
> the socket but who or what is allocating that memory? There are a few other
> bugs from over the weekend that I want to take a look at so I did not dig
> further or try to reproduce this bug yet. I'm adding Eric Dumazet back to
> the cc in case he has the quick answer.

Thanks Mel

It would not surprise me if sk_stream_wait_memory() has plain bug(s) or
race(s).

In 2010, in commit 482964e56e132 Nagendra Tomar fixed a pretty severe
long-standing bug.

This path is not taken very often on most machines.

I would try the following patch:

diff --git a/net/core/stream.c b/net/core/stream.c
index f5df85d..6f09979 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -126,6 +126,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
 
 	while (1) {
 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 
 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 
@@ -139,7 +140,6 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
 		if (sk_stream_memory_free(sk) && !vm_wait)
 			break;
 
-		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 		sk->sk_write_pending++;
 		sk_wait_event(sk, &current_timeo, sk->sk_err ||
 					  (sk->sk_shutdown & SEND_SHUTDOWN) ||

^ permalink raw reply related	[flat|nested] 53+ messages in thread
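The reordering in the patch follows the classic lost-wakeup rule:
publish the fact that you are about to sleep before the final check of
the condition, because this waker only bothers waking sleepers when the
flag is set.  A compile-only userspace analogue of the window being
closed (illustrative names and C11 atomics, not kernel API; the real
interaction goes through SOCK_NOSPACE and the write-space callbacks):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool nospace;	/* analogue of SOCK_NOSPACE */
static atomic_bool have_space;	/* analogue of sk_stream_memory_free() */

/* runs when memory is freed; only wakes sleepers who announced themselves */
static void waker(void)
{
	atomic_store(&have_space, true);
	if (atomic_load(&nospace)) {
		/* wake_up(...) would go here */
	}
}

static void sleeper(void)
{
	/*
	 * Buggy order (pre-patch): check have_space first, set nospace
	 * second.  A waker running between the two sees nospace == false,
	 * skips the wakeup, and the sleeper then sleeps forever.
	 *
	 * Fixed order, as in the patch: announce first, then check.
	 */
	atomic_store(&nospace, true);
	if (!atomic_load(&have_space)) {
		/* sleep; any waker from now on sees nospace == true */
	}
}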
* Re: ppoll() stuck on POLLIN while TCP peer is sending 2013-01-07 22:38 ` Eric Dumazet @ 2013-01-08 0:21 ` Eric Wong 0 siblings, 0 replies; 53+ messages in thread From: Eric Wong @ 2013-01-08 0:21 UTC (permalink / raw) To: Eric Dumazet Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim, Andrew Morton, Linus Torvalds Eric Dumazet <eric.dumazet@gmail.com> wrote: > It would not surprise me if sk_stream_wait_memory() have plain bug(s) or > race(s). > > In 2010, in commit 482964e56e132 Nagendra Tomar fixed a pretty severe > long standing bug. > > This path is not taken very often on most machines. > > I would try the following patch : With your patch alone (on top of 3.8-rc2) running on my VM, I was able to get toosleepy stuck within a few minutes. ===> /proc/vmstat <=== nr_free_pages 3251 nr_inactive_anon 3974 nr_active_anon 3638 nr_inactive_file 100973 nr_active_file 4669 nr_unevictable 0 nr_mlock 0 nr_anon_pages 7515 nr_mapped 2328 nr_file_pages 105739 nr_dirty 6 nr_writeback 0 nr_slab_reclaimable 1703 nr_slab_unreclaimable 5465 nr_page_table_pages 735 nr_kernel_stack 161 nr_unstable 0 nr_bounce 0 nr_vmscan_write 0 nr_vmscan_immediate_reclaim 17 nr_writeback_temp 0 nr_isolated_anon 0 nr_isolated_file 0 nr_shmem 115 nr_dirtied 1575304 nr_written 95797 nr_anon_transparent_hugepages 0 nr_free_cma 0 nr_dirty_threshold 22988 nr_dirty_background_threshold 11494 pgpgin 61164 pgpgout 385372 pswpin 0 pswpout 0 pgalloc_dma 123943 pgalloc_dma32 15694247 pgalloc_normal 0 pgalloc_movable 0 pgfree 15823064 pgactivate 6119 pgdeactivate 5134 pgfault 1439865 pgmajfault 495 pgrefill_dma 1230 pgrefill_dma32 3904 pgrefill_normal 0 pgrefill_movable 0 pgsteal_kswapd_dma 22875 pgsteal_kswapd_dma32 1272136 pgsteal_kswapd_normal 0 pgsteal_kswapd_movable 0 pgsteal_direct_dma 3351 pgsteal_direct_dma32 187467 pgsteal_direct_normal 0 pgsteal_direct_movable 0 pgscan_kswapd_dma 22879 pgscan_kswapd_dma32 1273059 pgscan_kswapd_normal 0 pgscan_kswapd_movable 0 pgscan_direct_dma 3351 pgscan_direct_dma32 187566 pgscan_direct_normal 0 pgscan_direct_movable 0 pgscan_direct_throttle 0 pginodesteal 8 slabs_scanned 14336 kswapd_inodesteal 91 kswapd_low_wmark_hit_quickly 2797 kswapd_high_wmark_hit_quickly 65 kswapd_skip_congestion_wait 3 pageoutrun 18900 allocstall 3350 pgrotated 10 pgmigrate_success 278 pgmigrate_fail 0 compact_migrate_scanned 68864 compact_free_scanned 118486 compact_isolated 6958 compact_stall 306 compact_fail 128 compact_success 178 unevictable_pgs_culled 1063 unevictable_pgs_scanned 0 unevictable_pgs_rescued 1669 unevictable_pgs_mlocked 1669 unevictable_pgs_munlocked 1666 unevictable_pgs_cleared 3 unevictable_pgs_stranded 0 thp_fault_alloc 0 thp_fault_fallback 0 thp_collapse_alloc 0 thp_collapse_alloc_failed 0 thp_split 0 thp_zero_page_alloc 0 thp_zero_page_alloc_failed 0 ===> 6018[6018]/stack <=== [<ffffffff81077300>] futex_wait_queue_me+0xc0/0xf0 [<ffffffff81077a9d>] futex_wait+0x17d/0x280 [<ffffffff8107988c>] do_futex+0x11c/0xae0 [<ffffffff8107a2d8>] sys_futex+0x88/0x180 [<ffffffff813b06a9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 6018[6019]/stack <=== [<ffffffff810f5904>] poll_schedule_timeout+0x44/0x60 [<ffffffff810f6d54>] do_sys_poll+0x374/0x4b0 [<ffffffff810f718e>] sys_ppoll+0x19e/0x1b0 [<ffffffff813b06a9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 6018[6020]/stack <=== [<ffffffff810f5904>] poll_schedule_timeout+0x44/0x60 [<ffffffff810f6d54>] do_sys_poll+0x374/0x4b0 [<ffffffff810f718e>] sys_ppoll+0x19e/0x1b0 
[<ffffffff813b06a9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 6018[6021]/stack <=== [<ffffffff81310058>] sk_stream_wait_memory+0x1b8/0x250 [<ffffffff8134be67>] tcp_sendmsg+0x697/0xd80 [<ffffffff81370cce>] inet_sendmsg+0x5e/0xa0 [<ffffffff81300a77>] sock_sendmsg+0x87/0xa0 [<ffffffff81303a59>] sys_sendto+0x119/0x160 [<ffffffff813b06a9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 6018[6022]/stack <=== [<ffffffff810400b8>] do_wait+0x1f8/0x220 [<ffffffff81040ea0>] sys_wait4+0x70/0xf0 [<ffffffff813b06a9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff SysRq : Show Memory Mem-Info: DMA per-cpu: CPU 0: hi: 0, btch: 1 usd: 0 CPU 1: hi: 0, btch: 1 usd: 0 DMA32 per-cpu: CPU 0: hi: 186, btch: 31 usd: 156 CPU 1: hi: 186, btch: 31 usd: 158 active_anon:3546 inactive_anon:3645 isolated_anon:0 active_file:4327 inactive_file:101560 isolated_file:0 unevictable:0 dirty:1 writeback:0 unstable:0 free:3057 slab_reclaimable:1435 slab_unreclaimable:5441 mapped:2308 shmem:115 pagetables:798 bounce:0 free_cma:0 DMA free:2080kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:13428kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15644kB managed:15900kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:32kB slab_unreclaimable:84kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 488 488 488 DMA32 free:10148kB min:2784kB low:3480kB high:4176kB active_anon:14184kB inactive_anon:14580kB active_file:17308kB inactive_file:392812kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:499960kB managed:491256kB mlocked:0kB dirty:4kB writeback:0kB mapped:9232kB shmem:460kB slab_reclaimable:5708kB slab_unreclaimable:21680kB kernel_stack:1352kB pagetables:3192kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 DMA: 10*4kB (UR) 0*8kB 2*16kB (U) 3*32kB (R) 2*64kB (R) 0*128kB 1*256kB (R) 1*512kB (R) 1*1024kB (R) 0*2048kB 0*4096kB = 2088kB DMA32: 370*4kB (UEM) 194*8kB (UM) 72*16kB (UM) 33*32kB (UM) 25*64kB (UEM) 18*128kB (EM) 4*256kB (M) 0*512kB 0*1024kB 0*2048kB 0*4096kB = 10168kB 105998 total pagecache pages 0 pages in swap cache Swap cache stats: add 0, delete 0, find 0/0 Free swap = 392188kB Total swap = 392188kB 131054 pages RAM 3820 pages reserved 276280 pages shared 118656 pages non-shared SysRq : Show Memory Mem-Info: DMA per-cpu: CPU 0: hi: 0, btch: 1 usd: 0 CPU 1: hi: 0, btch: 1 usd: 0 DMA32 per-cpu: CPU 0: hi: 186, btch: 31 usd: 158 CPU 1: hi: 186, btch: 31 usd: 176 active_anon:3376 inactive_anon:3666 isolated_anon:0 active_file:4331 inactive_file:101207 isolated_file:0 unevictable:0 dirty:0 writeback:38 unstable:0 free:3683 slab_reclaimable:1460 slab_unreclaimable:5398 mapped:2306 shmem:115 pagetables:762 bounce:0 free_cma:0 DMA free:2168kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:16kB active_file:0kB inactive_file:13348kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15644kB managed:15900kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:40kB slab_unreclaimable:76kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? 
no lowmem_reserve[]: 0 488 488 488 DMA32 free:12564kB min:2784kB low:3480kB high:4176kB active_anon:13504kB inactive_anon:14648kB active_file:17324kB inactive_file:391480kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:499960kB managed:491256kB mlocked:0kB dirty:0kB writeback:152kB mapped:9224kB shmem:460kB slab_reclaimable:5800kB slab_unreclaimable:21516kB kernel_stack:1368kB pagetables:3048kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 DMA: 25*4kB (MR) 1*8kB (R) 3*16kB (R) 3*32kB (R) 2*64kB (R) 0*128kB 1*256kB (R) 1*512kB (R) 1*1024kB (R) 0*2048kB 0*4096kB = 2172kB DMA32: 427*4kB (UM) 336*8kB (UEM) 126*16kB (UEM) 49*32kB (UEM) 40*64kB (UEM) 10*128kB (UM) 3*256kB (M) 0*512kB 0*1024kB 0*2048kB 0*4096kB = 12588kB 105658 total pagecache pages 0 pages in swap cache Swap cache stats: add 0, delete 0, find 0/0 Free swap = 392188kB Total swap = 392188kB 131054 pages RAM 3820 pages reserved 275229 pages shared 118788 pages non-shared SysRq : Show Memory Mem-Info: DMA per-cpu: CPU 0: hi: 0, btch: 1 usd: 0 CPU 1: hi: 0, btch: 1 usd: 0 DMA32 per-cpu: CPU 0: hi: 186, btch: 31 usd: 108 CPU 1: hi: 186, btch: 31 usd: 166 active_anon:3022 inactive_anon:3664 isolated_anon:0 active_file:4405 inactive_file:69838 isolated_file:0 unevictable:0 dirty:5 writeback:4813 unstable:0 free:34429 slab_reclaimable:1723 slab_unreclaimable:5522 mapped:2322 shmem:115 pagetables:748 bounce:0 free_cma:0 DMA free:3616kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:20kB active_file:0kB inactive_file:10480kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15644kB managed:15900kB mlocked:0kB dirty:0kB writeback:560kB mapped:0kB shmem:0kB slab_reclaimable:108kB slab_unreclaimable:328kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 488 488 488 DMA32 free:134100kB min:2784kB low:3480kB high:4176kB active_anon:12088kB inactive_anon:14636kB active_file:17620kB inactive_file:268872kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:499960kB managed:491256kB mlocked:0kB dirty:20kB writeback:18692kB mapped:9288kB shmem:460kB slab_reclaimable:6784kB slab_unreclaimable:21760kB kernel_stack:1328kB pagetables:2992kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? 
no lowmem_reserve[]: 0 0 0 0 DMA: 136*4kB (UEMR) 89*8kB (UMR) 42*16kB (UMR) 7*32kB (UMR) 3*64kB (R) 0*128kB 1*256kB (R) 0*512kB 1*1024kB (R) 0*2048kB 0*4096kB = 3624kB DMA32: 4839*4kB (UEM) 4648*8kB (UEM) 2853*16kB (UEM) 868*32kB (UEM) 65*64kB (UEM) 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 134124kB 74344 total pagecache pages 0 pages in swap cache Swap cache stats: add 0, delete 0, find 0/0 Free swap = 392188kB Total swap = 392188kB 131054 pages RAM 3820 pages reserved 285651 pages shared 82395 pages non-shared SysRq : Show Memory Mem-Info: DMA per-cpu: CPU 0: hi: 0, btch: 1 usd: 0 CPU 1: hi: 0, btch: 1 usd: 0 DMA32 per-cpu: CPU 0: hi: 186, btch: 31 usd: 166 CPU 1: hi: 186, btch: 31 usd: 28 active_anon:3729 inactive_anon:3974 isolated_anon:0 active_file:4669 inactive_file:101127 isolated_file:0 unevictable:0 dirty:6 writeback:0 unstable:0 free:3244 slab_reclaimable:1703 slab_unreclaimable:5465 mapped:2328 shmem:115 pagetables:754 bounce:0 free_cma:0 DMA free:2360kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:4kB active_file:20kB inactive_file:9756kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15644kB managed:15900kB mlocked:0kB dirty:12kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:368kB slab_unreclaimable:324kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 488 488 488 DMA32 free:10616kB min:2784kB low:3480kB high:4176kB active_anon:14916kB inactive_anon:15892kB active_file:18656kB inactive_file:394752kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:499960kB managed:491256kB mlocked:0kB dirty:12kB writeback:0kB mapped:9312kB shmem:460kB slab_reclaimable:6444kB slab_unreclaimable:21536kB kernel_stack:1288kB pagetables:3016kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 DMA: 43*4kB (UMR) 16*8kB (UMR) 19*16kB (MR) 23*32kB (UR) 8*64kB (R) 2*128kB (R) 1*256kB (R) 0*512kB 0*1024kB 0*2048kB 0*4096kB = 2364kB DMA32: 634*4kB (UEM) 615*8kB (UEM) 199*16kB (UEM) 1*32kB (M) 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 10672kB 105892 total pagecache pages 0 pages in swap cache Swap cache stats: add 0, delete 0, find 0/0 Free swap = 392188kB Total swap = 392188kB 131054 pages RAM 3820 pages reserved 274934 pages shared 119600 pages non-shared -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-07 12:25       ` Mel Gorman
  2013-01-07 22:38         ` Eric Dumazet
@ 2013-01-07 22:38         ` Eric Wong
  2013-01-08 20:14           ` Eric Wong
  2013-01-08 22:43           ` Mel Gorman
  1 sibling, 2 replies; 53+ messages in thread
From: Eric Wong @ 2013-01-07 22:38 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
	Eric Dumazet, Andrew Morton, Linus Torvalds

Mel Gorman <mgorman@suse.de> wrote:
> Right now it's difficult to see how the capture could be the source of
> this bug but I'm not ruling it out either so try the following (untested
> but should be ok) patch. It's not a proper revert, it just disables the
> capture page logic to see if it's at fault.

Things look good so far with your change.
It's been running 2 hours on a VM and 1 hour on my regular machine.
Will update again in a few hours (or sooner if it's stuck again).

^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending 2013-01-07 22:38 ` Eric Wong @ 2013-01-08 20:14 ` Eric Wong 2013-01-08 22:43 ` Mel Gorman 1 sibling, 0 replies; 53+ messages in thread From: Eric Wong @ 2013-01-08 20:14 UTC (permalink / raw) To: Mel Gorman Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim, Eric Dumazet, Andrew Morton, Linus Torvalds Eric Wong <normalperson@yhbt.net> wrote: > Mel Gorman <mgorman@suse.de> wrote: > > Right now it's difficult to see how the capture could be the source of > > this bug but I'm not ruling it out either so try the following (untested > > but should be ok) patch. It's not a proper revert, it just disables the > > capture page logic to see if it's at fault. > > Things look good so far with your change. > It's been running 2 hours on a VM and 1 hour on my regular machine. > Will update again in a few hours (or sooner if it's stuck again). Things still seem good on my regular machine. ^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending 2013-01-07 22:38 ` Eric Wong 2013-01-08 20:14 ` Eric Wong @ 2013-01-08 22:43 ` Mel Gorman 2013-01-08 23:23 ` Eric Wong 2013-01-09 21:29 ` Eric Wong 1 sibling, 2 replies; 53+ messages in thread From: Mel Gorman @ 2013-01-08 22:43 UTC (permalink / raw) To: Eric Wong Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim, Eric Dumazet, Andrew Morton, Linus Torvalds On Mon, Jan 07, 2013 at 10:38:50PM +0000, Eric Wong wrote: > Mel Gorman <mgorman@suse.de> wrote: > > Right now it's difficult to see how the capture could be the source of > > this bug but I'm not ruling it out either so try the following (untested > > but should be ok) patch. It's not a proper revert, it just disables the > > capture page logic to see if it's at fault. > > Things look good so far with your change. Ok, so minimally reverting is an option once 2e30abd1 is preserved. The original motivation for the patch was to improve allocation success rates under load but due to a bug in the patch the likely source of the improvement was due to compacting more for THP allocations. > It's been running 2 hours on a VM and 1 hour on my regular machine. > Will update again in a few hours (or sooner if it's stuck again). When I looked at it for long enough I found a number of problems. Most affect timing but two serious issues are in there. One affects how long kswapd spends compacting versus reclaiming and the other increases lock contention meaning that async compaction can abort early. Both are serious and could explain why a driver would fail high-order allocations. Please try the following patch. However, even if it works the benefit of capture may be so marginal that partially reverting it and simplifying compaction.c is the better decision. diff --git a/mm/compaction.c b/mm/compaction.c index 6b807e4..03c82c0 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -857,7 +857,8 @@ static int compact_finished(struct zone *zone, } else { unsigned int order; for (order = cc->order; order < MAX_ORDER; order++) { - struct free_area *area = &zone->free_area[cc->order]; + struct free_area *area = &zone->free_area[order]; + /* Job done if page is free of the right migratetype */ if (!list_empty(&area->free_list[cc->migratetype])) return COMPACT_PARTIAL; @@ -929,6 +930,11 @@ static void compact_capture_page(struct compact_control *cc) if (!cc->page || *cc->page) return; + /* Check that watermarks are satisifed before acquiring locks */ + if (!zone_watermark_ok(cc->zone, cc->order, low_wmark_pages(cc->zone), + 0, 0)) + return; + /* * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP * regardless of the migratetype of the freelist is is captured from. 
@@ -941,7 +947,7 @@ static void compact_capture_page(struct compact_control *cc) */ if (cc->migratetype == MIGRATE_MOVABLE) { mtype_low = 0; - mtype_high = MIGRATE_PCPTYPES; + mtype_high = MIGRATE_PCPTYPES + 1; } else { mtype_low = cc->migratetype; mtype_high = cc->migratetype + 1; @@ -1118,7 +1124,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; int rc = COMPACT_SKIPPED; - int alloc_flags = 0; /* Check if the GFP flags allow compaction */ if (!order || !may_enter_fs || !may_perform_io) @@ -1126,10 +1131,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, count_compact_event(COMPACTSTALL); -#ifdef CONFIG_CMA - if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; -#endif /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { @@ -1139,9 +1140,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, contended, page); rc = max(status, rc); - /* If a normal allocation would succeed, stop compacting */ - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, - alloc_flags)) + /* If a page was captured, stop compacting */ + if (*page) break; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4ba5e37..9d20c13 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2180,10 +2180,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags &= ~PF_MEMALLOC; /* If compaction captured a page, prep and use it */ - if (page) { - prep_new_page(page, order, gfp_mask); + if (page && !prep_new_page(page, order, gfp_mask)) goto got_page; - } if (*did_some_progress != COMPACT_SKIPPED) { /* Page migration frees to the PCP lists but we want merging */ -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 53+ messages in thread
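The first hunk of Mel's patch above is worth dwelling on: the loop walks order upward from cc->order, but the line being fixed indexed zone->free_area with cc->order on every iteration, so only one bucket was ever examined no matter how far the loop ran. A minimal standalone sketch of the difference (the structures and helper below are invented for illustration; this is not kernel code):

#include <stdio.h>

/* Simplified model of compact_finished()'s freelist scan.  The buggy
 * version indexed the array with the starting order on every pass,
 * so higher-order buckets were never inspected. */
#define MAX_ORDER 11

struct free_area_model { int nr_free; };

static int first_populated_order(const struct free_area_model *free_area,
                                 int start_order, int buggy)
{
	int order;

	for (order = start_order; order < MAX_ORDER; order++) {
		int idx = buggy ? start_order : order;	/* the one-word bug */
		if (free_area[idx].nr_free)
			return order;
	}
	return -1;
}

int main(void)
{
	struct free_area_model fa[MAX_ORDER] = { { 0 } };

	fa[5].nr_free = 1;		/* only order-5 pages are free */
	printf("buggy scan from order 2: %d\n",
	       first_populated_order(fa, 2, 1));	/* -1: never sees order 5 */
	printf("fixed scan from order 2: %d\n",
	       first_populated_order(fa, 2, 0));	/* finds order 5 */
	return 0;
}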
* Re: ppoll() stuck on POLLIN while TCP peer is sending 2013-01-08 22:43 ` Mel Gorman @ 2013-01-08 23:23 ` Eric Wong 2013-01-09 2:14 ` Eric Dumazet 2013-01-09 13:37 ` Mel Gorman 2013-01-09 21:29 ` Eric Wong 1 sibling, 2 replies; 53+ messages in thread From: Eric Wong @ 2013-01-08 23:23 UTC (permalink / raw) To: Mel Gorman Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim, Eric Dumazet, Andrew Morton, Linus Torvalds Mel Gorman <mgorman@suse.de> wrote: > Please try the following patch. However, even if it works the benefit of > capture may be so marginal that partially reverting it and simplifying > compaction.c is the better decision. I already got my VM stuck on this one. I had two twosleepy instances, 2774 was the one that got stuck (also confirmed by watching top). Btw, have you been able to reproduce this on your end? I think the easiest reproduction on my 2-core VM is by running 2 twosleepy processes and doing the following to dirty a lot of pages: while time find $LARGISH_NFS_MOUNT -type f -print0 | \ xargs -0 -n1 -P4 sh -c 'cat "$1" >> /tmp/z; > /tmp/z' --; do date; done I've updated git://bogomips.org/toosleepy.git to automate the reporting for me. ===> /proc/vmstat <=== nr_free_pages 2035 nr_inactive_anon 4044 nr_active_anon 3913 nr_inactive_file 98877 nr_active_file 4373 nr_unevictable 0 nr_mlock 0 nr_anon_pages 7839 nr_mapped 2350 nr_file_pages 103382 nr_dirty 512 nr_writeback 0 nr_slab_reclaimable 1578 nr_slab_unreclaimable 5642 nr_page_table_pages 800 nr_kernel_stack 170 nr_unstable 0 nr_bounce 0 nr_vmscan_write 0 nr_vmscan_immediate_reclaim 0 nr_writeback_temp 0 nr_isolated_anon 0 nr_isolated_file 0 nr_shmem 115 nr_dirtied 889731 nr_written 25225 nr_anon_transparent_hugepages 0 nr_free_cma 0 nr_dirty_threshold 22336 nr_dirty_background_threshold 11168 pgpgin 45284 pgpgout 101948 pswpin 0 pswpout 0 pgalloc_dma 299007 pgalloc_dma32 24235925 pgalloc_normal 0 pgalloc_movable 0 pgfree 24539843 pgactivate 5440 pgdeactivate 4476 pgfault 1072378 pgmajfault 338 pgrefill_dma 508 pgrefill_dma32 3968 pgrefill_normal 0 pgrefill_movable 0 pgsteal_kswapd_dma 22463 pgsteal_kswapd_dma32 553340 pgsteal_kswapd_normal 0 pgsteal_kswapd_movable 0 pgsteal_direct_dma 3956 pgsteal_direct_dma32 220354 pgsteal_direct_normal 0 pgsteal_direct_movable 0 pgscan_kswapd_dma 22463 pgscan_kswapd_dma32 554313 pgscan_kswapd_normal 0 pgscan_kswapd_movable 0 pgscan_direct_dma 3956 pgscan_direct_dma32 220397 pgscan_direct_normal 0 pgscan_direct_movable 0 pgscan_direct_throttle 0 pginodesteal 0 slabs_scanned 4096 kswapd_inodesteal 0 kswapd_low_wmark_hit_quickly 1726 kswapd_high_wmark_hit_quickly 21 kswapd_skip_congestion_wait 0 pageoutrun 9065 allocstall 4004 pgrotated 0 pgmigrate_success 1242 pgmigrate_fail 0 compact_migrate_scanned 141232 compact_free_scanned 181666 compact_isolated 52638 compact_stall 2024 compact_fail 1450 compact_success 574 unevictable_pgs_culled 1063 unevictable_pgs_scanned 0 unevictable_pgs_rescued 1653 unevictable_pgs_mlocked 1653 unevictable_pgs_munlocked 1652 unevictable_pgs_cleared 1 unevictable_pgs_stranded 0 thp_fault_alloc 0 thp_fault_fallback 0 thp_collapse_alloc 0 thp_collapse_alloc_failed 0 thp_split 0 thp_zero_page_alloc 0 thp_zero_page_alloc_failed 0 ===> 2724[2724]/stack <=== [<ffffffff81077300>] futex_wait_queue_me+0xc0/0xf0 [<ffffffff81077a9d>] futex_wait+0x17d/0x280 [<ffffffff8107988c>] do_futex+0x11c/0xae0 [<ffffffff8107a2d8>] sys_futex+0x88/0x180 [<ffffffff813b0729>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2724[2725]/stack <=== 
[<ffffffff810f5944>] poll_schedule_timeout+0x44/0x60 [<ffffffff810f6d94>] do_sys_poll+0x374/0x4b0 [<ffffffff810f71ce>] sys_ppoll+0x19e/0x1b0 [<ffffffff813b0729>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2724[2726]/stack <=== [<ffffffffffffffff>] 0xffffffffffffffff ===> 2724[2727]/stack <=== [<ffffffff81310098>] sk_stream_wait_memory+0x1b8/0x250 [<ffffffff8134bea7>] tcp_sendmsg+0x697/0xd80 [<ffffffff81370d0e>] inet_sendmsg+0x5e/0xa0 [<ffffffff81300ab7>] sock_sendmsg+0x87/0xa0 [<ffffffff81303a99>] sys_sendto+0x119/0x160 [<ffffffff813b0729>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2724[2728]/stack <=== [<ffffffff8105d02c>] hrtimer_nanosleep+0x9c/0x150 [<ffffffff8105d13e>] sys_nanosleep+0x5e/0x80 [<ffffffff813b0729>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2774[2774]/stack <=== [<ffffffff81077300>] futex_wait_queue_me+0xc0/0xf0 [<ffffffff81077a9d>] futex_wait+0x17d/0x280 [<ffffffff8107988c>] do_futex+0x11c/0xae0 [<ffffffff8107a2d8>] sys_futex+0x88/0x180 [<ffffffff813b0729>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2774[2775]/stack <=== [<ffffffff810f5944>] poll_schedule_timeout+0x44/0x60 [<ffffffff810f6d94>] do_sys_poll+0x374/0x4b0 [<ffffffff810f71ce>] sys_ppoll+0x19e/0x1b0 [<ffffffff813b0729>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2774[2776]/stack <=== [<ffffffff81310098>] sk_stream_wait_memory+0x1b8/0x250 [<ffffffff8134bea7>] tcp_sendmsg+0x697/0xd80 [<ffffffff81370d0e>] inet_sendmsg+0x5e/0xa0 [<ffffffff81300ab7>] sock_sendmsg+0x87/0xa0 [<ffffffff81303a99>] sys_sendto+0x119/0x160 [<ffffffff813b0729>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2774[2777]/stack <=== [<ffffffff81310098>] sk_stream_wait_memory+0x1b8/0x250 [<ffffffff8134bea7>] tcp_sendmsg+0x697/0xd80 [<ffffffff81370d0e>] inet_sendmsg+0x5e/0xa0 [<ffffffff81300ab7>] sock_sendmsg+0x87/0xa0 [<ffffffff81303a99>] sys_sendto+0x119/0x160 [<ffffffff813b0729>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2774[2778]/stack <=== [<ffffffff810400b8>] do_wait+0x1f8/0x220 [<ffffffff81040ea0>] sys_wait4+0x70/0xf0 [<ffffffff813b0729>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff SysRq : Show Memory Mem-Info: DMA per-cpu: CPU 0: hi: 0, btch: 1 usd: 0 CPU 1: hi: 0, btch: 1 usd: 0 DMA32 per-cpu: CPU 0: hi: 186, btch: 31 usd: 108 CPU 1: hi: 186, btch: 31 usd: 162 active_anon:3990 inactive_anon:4042 isolated_anon:0 active_file:4362 inactive_file:98536 isolated_file:0 unevictable:0 dirty:513 writeback:0 unstable:0 free:1896 slab_reclaimable:1530 slab_unreclaimable:5661 mapped:2342 shmem:115 pagetables:784 bounce:0 free_cma:0 DMA free:2080kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:12168kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15644kB managed:15900kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:16kB slab_unreclaimable:192kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? 
no lowmem_reserve[]: 0 488 488 488 DMA32 free:5568kB min:2784kB low:3480kB high:4176kB active_anon:15960kB inactive_anon:16168kB active_file:17448kB inactive_file:381976kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:499960kB managed:491256kB mlocked:0kB dirty:2052kB writeback:0kB mapped:9368kB shmem:460kB slab_reclaimable:6104kB slab_unreclaimable:22452kB kernel_stack:1416kB pagetables:3136kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 DMA: 6*4kB (UR) 5*8kB (UR) 2*16kB (U) 0*32kB 11*64kB (R) 2*128kB (R) 0*256kB 0*512kB 1*1024kB (R) 0*2048kB 0*4096kB = 2080kB DMA32: 280*4kB (UEM) 66*8kB (UEM) 99*16kB (U) 40*32kB (UM) 16*64kB (M) 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 5536kB 103002 total pagecache pages 0 pages in swap cache Swap cache stats: add 0, delete 0, find 0/0 Free swap = 392188kB Total swap = 392188kB 131054 pages RAM 3820 pages reserved 411919 pages shared 116133 pages non-shared -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 53+ messages in thread
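For what it's worth, the compaction counters in the vmstat dump above tell a consistent story: compact_fail (1450) plus compact_success (574) accounts for every compact_stall (2024), and roughly seven out of ten direct-compaction attempts are failing, so the machine is living in exactly the conditions where the capture logic runs. A trivial check of that arithmetic (values copied from the dump; "stall" is read here in the usual sense of one direct-compaction attempt):

#include <stdio.h>

/* Sanity-check the compaction counters quoted above. */
int main(void)
{
	unsigned long stall = 2024, fail = 1450, success = 574;

	printf("fail + success = %lu (matches compact_stall)\n",
	       fail + success);
	printf("direct compaction failure rate: %.1f%%\n",
	       100.0 * fail / stall);
	return 0;
}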
* Re: ppoll() stuck on POLLIN while TCP peer is sending 2013-01-08 23:23 ` Eric Wong @ 2013-01-09 2:14 ` Eric Dumazet 2013-01-09 2:32 ` Eric Dumazet 2013-01-09 13:37 ` Mel Gorman 1 sibling, 1 reply; 53+ messages in thread From: Eric Dumazet @ 2013-01-09 2:14 UTC (permalink / raw) To: Eric Wong Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim, Andrew Morton, Linus Torvalds On Tue, 2013-01-08 at 23:23 +0000, Eric Wong wrote: > Mel Gorman <mgorman@suse.de> wrote: > > Please try the following patch. However, even if it works the benefit of > > capture may be so marginal that partially reverting it and simplifying > > compaction.c is the better decision. > > I already got my VM stuck on this one. I had two twosleepy instances, > 2774 was the one that got stuck (also confirmed by watching top). > > Btw, have you been able to reproduce this on your end? > > I think the easiest reproduction on my 2-core VM is by running 2 > twosleepy processes and doing the following to dirty a lot of pages: Given the persistent sk_stream_wait_memory() traces I suspect a plain TCP bug, triggered by some extra wait somewhere. Please mm guys don't spend too much time right now, I'll try to reproduce the problem. Don't be confused by sk_stream_wait_memory() name. A thread is stuck here because TCP stack is failing to wake it.
* Re: ppoll() stuck on POLLIN while TCP peer is sending 2013-01-09 2:14 ` Eric Dumazet @ 2013-01-09 2:32 ` Eric Dumazet 2013-01-09 2:54 ` Eric Dumazet 2013-01-09 13:42 ` Mel Gorman 0 siblings, 2 replies; 53+ messages in thread From: Eric Dumazet @ 2013-01-09 2:32 UTC (permalink / raw) To: Eric Wong Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim, Andrew Morton, Linus Torvalds On Tue, 2013-01-08 at 18:14 -0800, Eric Dumazet wrote: > On Tue, 2013-01-08 at 23:23 +0000, Eric Wong wrote: > > Mel Gorman <mgorman@suse.de> wrote: > > > Please try the following patch. However, even if it works the benefit of > > > capture may be so marginal that partially reverting it and simplifying > > > compaction.c is the better decision. > > > > I already got my VM stuck on this one. I had two twosleepy instances, > > 2774 was the one that got stuck (also confirmed by watching top). > > > > Btw, have you been able to reproduce this on your end? > > > > I think the easiest reproduction on my 2-core VM is by running 2 > > twosleepy processes and doing the following to dirty a lot of pages: > > Given the persistent sk_stream_wait_memory() traces I suspect a plain > TCP bug, triggered by some extra wait somewhere. > > Please mm guys don't spend too much time right now, I'll try to > reproduce the problem. > > Don't be confused by sk_stream_wait_memory() name. > A thread is stuck here because TCP stack is failing to wake it. > Hmm, it seems sk_filter() can return -ENOMEM because skb has the pfmemalloc() set. It seems nobody really tested this stuff under memory stress. Mel, it looks like you are the guy who could fix this, after all ;) One TCP socket keeps retransmitting an SKB via loopback, and TCP stack drops the packet again and again. commit c93bdd0e03e848555d144eb44a1f275b871a8dd5 Author: Mel Gorman <mgorman@suse.de> Date: Tue Jul 31 16:44:19 2012 -0700 netvm: allow skb allocation to use PFMEMALLOC reserves Change the skb allocation API to indicate RX usage and use this to fall back to the PFMEMALLOC reserve when needed. SKBs allocated from the reserve are tagged in skb->pfmemalloc. If an SKB is allocated from the reserve and the socket is later found to be unrelated to page reclaim, the packet is dropped so that the memory remains available for page reclaim. Network protocols are expected to recover from this packet loss. [a.p.zijlstra@chello.nl: Ideas taken from various patches] [davem@davemloft.net: Use static branches, coding style corrections] [sebastian@breakpoint.cc: Avoid unnecessary cast, fix !CONFIG_NET build] Signed-off-by: Mel Gorman <mgorman@suse.de> Acked-by: David S. Miller <davem@davemloft.net> Cc: Neil Brown <neilb@suse.de> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Mike Christie <michaelc@cs.wisc.edu> Cc: Eric B Munson <emunson@mgebm.net> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> Cc: Mel Gorman <mgorman@suse.de> Cc: Christoph Lameter <cl@linux.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 53+ messages in thread
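For readers following along, the drop rule Eric is describing can be modelled in a few lines. This is a standalone paraphrase of the 3.7-era sk_filter()/skb_pfmemalloc() behaviour as described in the commit log above, not the kernel code itself: an skb carved out of the pfmemalloc emergency reserves is only delivered to a SOCK_MEMALLOC socket (one that is itself helping reclaim, e.g. swap-over-NFS); an ordinary TCP socket gets -ENOMEM, and if the flag is set in error the same retransmitted segment is dropped every time:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct skb_model  { bool pfmemalloc; };
struct sock_model { bool sock_memalloc; };

/* An skb built from the emergency reserves is only delivered to a
 * socket that is itself helping reclaim (SOCK_MEMALLOC); everyone
 * else sees -ENOMEM and the packet is dropped, on the assumption
 * that the protocol will recover by retransmitting. */
static int sk_filter_model(const struct sock_model *sk,
                           const struct skb_model *skb)
{
	if (skb->pfmemalloc && !sk->sock_memalloc)
		return -ENOMEM;		/* dropped; TCP retransmits */
	return 0;			/* delivered */
}

int main(void)
{
	struct sock_model plain_tcp = { false };
	struct skb_model from_reserves = { true };

	/* if pfmemalloc is set in error, every retransmit hits this too */
	printf("verdict: %d\n", sk_filter_model(&plain_tcp, &from_reserves));
	return 0;
}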
* Re: ppoll() stuck on POLLIN while TCP peer is sending 2013-01-09 2:32 ` Eric Dumazet @ 2013-01-09 2:54 ` Eric Dumazet 2013-01-09 3:55 ` Eric Wong 2013-01-09 13:42 ` Mel Gorman 1 sibling, 1 reply; 53+ messages in thread From: Eric Dumazet @ 2013-01-09 2:54 UTC (permalink / raw) To: Eric Wong, Mel Gorman Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim, Andrew Morton, Linus Torvalds On Tue, 2013-01-08 at 18:32 -0800, Eric Dumazet wrote: > > Hmm, it seems sk_filter() can return -ENOMEM because skb has the > pfmemalloc() set. > > One TCP socket keeps retransmitting an SKB via loopback, and TCP stack > drops the packet again and again. sock_init_data() sets sk->sk_allocation to GFP_KERNEL Shouldnt it use (GFP_KERNEL | __GFP_NOMEMALLOC) instead ? diff --git a/net/core/sock.c b/net/core/sock.c index bc131d4..76c4b39 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -286,6 +286,7 @@ void sk_set_memalloc(struct sock *sk) { sock_set_flag(sk, SOCK_MEMALLOC); sk->sk_allocation |= __GFP_MEMALLOC; + sk->sk_allocation &= ~__GFP_NOMEMALLOC; static_key_slow_inc(&memalloc_socks); } EXPORT_SYMBOL_GPL(sk_set_memalloc); @@ -294,6 +295,7 @@ void sk_clear_memalloc(struct sock *sk) { sock_reset_flag(sk, SOCK_MEMALLOC); sk->sk_allocation &= ~__GFP_MEMALLOC; + sk->sk_allocation |= __GFP_NOMEMALLOC; static_key_slow_dec(&memalloc_socks); /* @@ -2230,7 +2232,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) init_timer(&sk->sk_timer); - sk->sk_allocation = GFP_KERNEL; + sk->sk_allocation = GFP_KERNEL | __GFP_NOMEMALLOC; sk->sk_rcvbuf = sysctl_rmem_default; sk->sk_sndbuf = sysctl_wmem_default; sk->sk_state = TCP_CLOSE; -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 53+ messages in thread
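A toy model of the flag discipline the patch above proposes (the bit values below are invented for the demo; in the kernel these are gfp_t bits, and __GFP_NOMEMALLOC forbids what __GFP_MEMALLOC allows, namely dipping into the emergency reserves): ordinary sockets never touch the reserves, and only sk_set_memalloc() flips a socket into the reserve-using mode:

#include <stdio.h>

#define MODEL_GFP_MEMALLOC   0x1u
#define MODEL_GFP_NOMEMALLOC 0x2u

int main(void)
{
	/* sock_init_data() under the proposal: reserves off by default */
	unsigned int sk_allocation = MODEL_GFP_NOMEMALLOC;

	/* sk_set_memalloc(): a reclaim-helping socket may use reserves */
	sk_allocation |= MODEL_GFP_MEMALLOC;
	sk_allocation &= ~MODEL_GFP_NOMEMALLOC;
	printf("memalloc socket: %#x\n", sk_allocation);

	/* sk_clear_memalloc(): back to the safe default */
	sk_allocation &= ~MODEL_GFP_MEMALLOC;
	sk_allocation |= MODEL_GFP_NOMEMALLOC;
	printf("ordinary socket: %#x\n", sk_allocation);
	return 0;
}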
* Re: ppoll() stuck on POLLIN while TCP peer is sending 2013-01-09 2:54 ` Eric Dumazet @ 2013-01-09 3:55 ` Eric Wong 2013-01-09 8:42 ` Eric Wong 0 siblings, 1 reply; 53+ messages in thread From: Eric Wong @ 2013-01-09 3:55 UTC (permalink / raw) To: Eric Dumazet Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim, Andrew Morton, Linus Torvalds Eric Dumazet <erdnetdev@gmail.com> wrote: > On Tue, 2013-01-08 at 18:32 -0800, Eric Dumazet wrote: > > Hmm, it seems sk_filter() can return -ENOMEM because skb has the > > pfmemalloc() set. > > > > > One TCP socket keeps retransmitting an SKB via loopback, and TCP stack > > drops the packet again and again. > > sock_init_data() sets sk->sk_allocation to GFP_KERNEL > > Shouldnt it use (GFP_KERNEL | __GFP_NOMEMALLOC) instead ? Thanks, things are running good after ~35 minutes so far. Will report back if things break (hopefully I don't run out of laptop battery power :x). I'm now getting allocation failure warnings (which I don't believe happened before, and should be expected, I think...) kworker/1:1: page allocation failure: order:0, mode:0x20 Pid: 236, comm: kworker/1:1 Not tainted 3.8.0-rc2w5+ #76 Call Trace: <IRQ> [<ffffffff810a2411>] warn_alloc_failed+0xe1/0x130 [<ffffffff810a5779>] __alloc_pages_nodemask+0x5e9/0x840 [<ffffffff8133df8d>] ? ip_rcv+0x24d/0x340 [<ffffffff811f35b3>] ? sg_init_table+0x23/0x50 [<ffffffffa002162a>] get_a_page.isra.25+0x3a/0x40 [virtio_net] [<ffffffffa0022258>] try_fill_recv+0x318/0x4a0 [virtio_net] [<ffffffffa00227bd>] virtnet_poll+0x3dd/0x610 [virtio_net] [<ffffffff8131767d>] net_rx_action+0x9d/0x1a0 [<ffffffff8104284a>] __do_softirq+0xba/0x170 [<ffffffff813b199c>] call_softirq+0x1c/0x30 <EOI> [<ffffffff8100c61d>] do_softirq+0x6d/0xa0 [<ffffffff81042424>] local_bh_enable+0x94/0xa0 [<ffffffff813aed45>] __cond_resched_softirq+0x35/0x50 [<ffffffff81305e7c>] release_sock+0x9c/0x150 [<ffffffff8134b90e>] tcp_sendmsg+0x11e/0xd80 [<ffffffff81370cee>] inet_sendmsg+0x5e/0xa0 [<ffffffff81300a77>] sock_sendmsg+0x87/0xa0 [<ffffffff810a48c9>] ? __free_memcg_kmem_pages+0x9/0x10 [<ffffffff81067819>] ? select_task_rq_fair+0x699/0x6b0 [<ffffffff81300acb>] kernel_sendmsg+0x3b/0x50 [<ffffffffa0052dc9>] xs_send_kvec+0x89/0xa0 [sunrpc] [<ffffffffa00534bf>] xs_sendpages+0x5f/0x1e0 [sunrpc] [<ffffffff81047d63>] ? lock_timer_base.isra.32+0x33/0x60 [<ffffffffa00548e7>] xs_tcp_send_request+0x57/0x110 [sunrpc] [<ffffffffa0051c0d>] xprt_transmit+0x6d/0x260 [sunrpc] [<ffffffffa004f108>] call_transmit+0x1a8/0x240 [sunrpc] [<ffffffffa0056316>] __rpc_execute+0x56/0x250 [sunrpc] [<ffffffffa0056535>] rpc_async_schedule+0x25/0x40 [sunrpc] [<ffffffff810515cc>] process_one_work+0x12c/0x480 [<ffffffffa0056510>] ? __rpc_execute+0x250/0x250 [sunrpc] [<ffffffff810538ad>] worker_thread+0x15d/0x460 [<ffffffff81053750>] ? flush_delayed_work+0x60/0x60 [<ffffffff8105865b>] kthread+0xbb/0xc0 [<ffffffff810585a0>] ? kthread_create_on_node+0x120/0x120 [<ffffffff813b063c>] ret_from_fork+0x7c/0xb0 [<ffffffff810585a0>] ? 
kthread_create_on_node+0x120/0x120 Mem-Info: DMA per-cpu: CPU 0: hi: 0, btch: 1 usd: 0 CPU 1: hi: 0, btch: 1 usd: 0 DMA32 per-cpu: CPU 0: hi: 186, btch: 31 usd: 0 CPU 1: hi: 186, btch: 31 usd: 0 active_anon:3620 inactive_anon:3624 isolated_anon:0 active_file:4290 inactive_file:101218 isolated_file:0 unevictable:0 dirty:2306 writeback:0 unstable:0 free:1711 slab_reclaimable:1529 slab_unreclaimable:5796 mapped:2325 shmem:66 pagetables:759 bounce:0 free_cma:0 DMA free:2012kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:0kB active_file:4kB inactive_file:13624kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15644kB managed:15900kB mlocked:0kB dirty:244kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:16kB slab_unreclaimable:80kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 488 488 488 DMA32 free:4832kB min:2784kB low:3480kB high:4176kB active_anon:14480kB inactive_anon:14496kB active_file:17156kB inactive_file:391248kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:499960kB managed:491256kB mlocked:0kB dirty:8980kB writeback:0kB mapped:9300kB shmem:264kB slab_reclaimable:6100kB slab_unreclaimable:23104kB kernel_stack:1336kB pagetables:3036kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 DMA: 2*4kB (U) 1*8kB (R) 1*16kB (U) 0*32kB 1*64kB (R) 1*128kB (R) 1*256kB (R) 1*512kB (R) 1*1024kB (R) 0*2048kB 0*4096kB = 2016kB DMA32: 207*4kB (UEM) 116*8kB (UEM) 32*16kB (UM) 58*32kB (UM) 13*64kB (UM) 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 4956kB 108890 total pagecache pages 3302 pages in swap cache Swap cache stats: add 4086, delete 784, find 494/535 Free swap = 378980kB Total swap = 392188kB 131054 pages RAM 3820 pages reserved 541743 pages shared 117221 pages non-shared cat: page allocation failure: order:0, mode:0x20 Pid: 23684, comm: cat Not tainted 3.8.0-rc2w5+ #76 Call Trace: <IRQ> [<ffffffff810a2411>] warn_alloc_failed+0xe1/0x130 [<ffffffff810a5779>] __alloc_pages_nodemask+0x5e9/0x840 [<ffffffff8133df8d>] ? ip_rcv+0x24d/0x340 [<ffffffff811f35b3>] ? sg_init_table+0x23/0x50 [<ffffffffa002162a>] get_a_page.isra.25+0x3a/0x40 [virtio_net] [<ffffffffa0022258>] try_fill_recv+0x318/0x4a0 [virtio_net] [<ffffffffa00227bd>] virtnet_poll+0x3dd/0x610 [virtio_net] [<ffffffff8131767d>] net_rx_action+0x9d/0x1a0 [<ffffffff8104284a>] __do_softirq+0xba/0x170 [<ffffffff813b199c>] call_softirq+0x1c/0x30 [<ffffffff8100c61d>] do_softirq+0x6d/0xa0 [<ffffffff81042a75>] irq_exit+0xa5/0xb0 [<ffffffff8100c25e>] do_IRQ+0x5e/0xd0 [<ffffffff813afe2d>] common_interrupt+0x6d/0x6d <EOI> [<ffffffff813af82c>] ? _raw_spin_unlock_irqrestore+0xc/0x20 [<ffffffff810a91b6>] pagevec_lru_move_fn+0xb6/0xe0 [<ffffffff810a8780>] ? compound_unlock_irqrestore+0x20/0x20 [<ffffffffa00acd30>] ? nfs_read_completion+0x190/0x190 [nfs] [<ffffffff810a91f7>] __pagevec_lru_add+0x17/0x20 [<ffffffff810a95c8>] __lru_cache_add+0x68/0x90 [<ffffffff8109e869>] add_to_page_cache_lru+0x29/0x40 [<ffffffff810a80cc>] read_cache_pages+0x6c/0x100 [<ffffffffa00ad4dc>] nfs_readpages+0xcc/0x160 [nfs] [<ffffffff810a7f57>] __do_page_cache_readahead+0x1c7/0x280 [<ffffffff810a827c>] ra_submit+0x1c/0x20 [<ffffffff810a83ad>] ondemand_readahead+0x12d/0x250 [<ffffffff8109f37d>] ? __generic_file_aio_write+0x1bd/0x3c0 [<ffffffff810a8550>] page_cache_async_readahead+0x80/0xa0 [<ffffffff8109e0b8>] ? 
find_get_page+0x28/0xd0 [<ffffffff8109fb73>] generic_file_aio_read+0x503/0x6c0 [<ffffffffa00a4231>] nfs_file_read+0x91/0xb0 [nfs] [<ffffffff810e4477>] do_sync_read+0xa7/0xe0 [<ffffffff810e4b50>] vfs_read+0xa0/0x160 [<ffffffff810e4c5d>] sys_read+0x4d/0x90 [<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b Mem-Info: DMA per-cpu: CPU 0: hi: 0, btch: 1 usd: 0 CPU 1: hi: 0, btch: 1 usd: 0 DMA32 per-cpu: CPU 0: hi: 186, btch: 31 usd: 86 CPU 1: hi: 186, btch: 31 usd: 167 active_anon:2376 inactive_anon:2431 isolated_anon:0 active_file:3712 inactive_file:103686 isolated_file:17 unevictable:0 dirty:646 writeback:0 unstable:0 free:807 slab_reclaimable:1485 slab_unreclaimable:5873 mapped:2343 shmem:66 pagetables:791 bounce:0 free_cma:0 DMA free:2032kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:0kB active_file:24kB inactive_file:12916kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15644kB managed:15900kB mlocked:0kB dirty:0kB writeback:0kB mapped:12kB shmem:0kB slab_reclaimable:16kB slab_unreclaimable:124kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 488 488 488 DMA32 free:1196kB min:2784kB low:3480kB high:4176kB active_anon:9504kB inactive_anon:9724kB active_file:14824kB inactive_file:401828kB unevictable:0kB isolated(anon):0kB isolated(file):68kB present:499960kB managed:491256kB mlocked:0kB dirty:2588kB writeback:0kB mapped:9360kB shmem:264kB slab_reclaimable:5924kB slab_unreclaimable:23368kB kernel_stack:1336kB pagetables:3164kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 DMA: 11*4kB (MR) 1*8kB (R) 0*16kB 2*32kB (R) 4*64kB (R) 3*128kB (R) 1*256kB (R) 0*512kB 1*1024kB (R) 0*2048kB 0*4096kB = 2036kB DMA32: 40*4kB (UEM) 28*8kB (UM) 22*16kB (UEM) 5*32kB (UM) 5*64kB (UM) 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 1216kB 108199 total pagecache pages 714 pages in swap cache Swap cache stats: add 4829, delete 4115, find 583/626 Free swap = 376280kB Total swap = 392188kB 131054 pages RAM 3820 pages reserved 542491 pages shared 117305 pages non-shared cat: page allocation failure: order:0, mode:0x20 Pid: 24171, comm: cat Not tainted 3.8.0-rc2w5+ #76 Call Trace: <IRQ> [<ffffffff810a2411>] warn_alloc_failed+0xe1/0x130 [<ffffffff8137871e>] ? fib_table_lookup+0x26e/0x2d0 [<ffffffff810a5779>] __alloc_pages_nodemask+0x5e9/0x840 [<ffffffff8130a6c0>] __netdev_alloc_frag+0xa0/0x150 [<ffffffff8130d9d2>] __netdev_alloc_skb+0x82/0xe0 [<ffffffffa00225b7>] virtnet_poll+0x1d7/0x610 [virtio_net] [<ffffffff8131767d>] net_rx_action+0x9d/0x1a0 [<ffffffff8104284a>] __do_softirq+0xba/0x170 [<ffffffff813b199c>] call_softirq+0x1c/0x30 [<ffffffff8100c61d>] do_softirq+0x6d/0xa0 [<ffffffff81042a75>] irq_exit+0xa5/0xb0 [<ffffffff8100c25e>] do_IRQ+0x5e/0xd0 [<ffffffff813afe2d>] common_interrupt+0x6d/0x6d <EOI> [<ffffffff813aeeb5>] ? io_schedule+0xa5/0xd0 [<ffffffff811ef000>] ? copy_user_generic_string+0x30/0x40 [<ffffffff8109db32>] ? __lock_page_killable+0x62/0x70 [<ffffffff8109da85>] ? 
file_read_actor+0x135/0x180 [<ffffffff8109f950>] generic_file_aio_read+0x2e0/0x6c0 [<ffffffffa00a4231>] nfs_file_read+0x91/0xb0 [nfs] [<ffffffff810e4477>] do_sync_read+0xa7/0xe0 [<ffffffff810e4b50>] vfs_read+0xa0/0x160 [<ffffffff810e4c5d>] sys_read+0x4d/0x90 [<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b Mem-Info: DMA per-cpu: CPU 0: hi: 0, btch: 1 usd: 0 CPU 1: hi: 0, btch: 1 usd: 0 DMA32 per-cpu: CPU 0: hi: 186, btch: 31 usd: 50 CPU 1: hi: 186, btch: 31 usd: 30 active_anon:2367 inactive_anon:2431 isolated_anon:0 active_file:3719 inactive_file:103732 isolated_file:0 unevictable:0 dirty:1302 writeback:0 unstable:0 free:754 slab_reclaimable:1589 slab_unreclaimable:5896 mapped:2343 shmem:66 pagetables:781 bounce:0 free_cma:0 DMA free:1980kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:0kB active_file:24kB inactive_file:9704kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15644kB managed:15900kB mlocked:0kB dirty:84kB writeback:0kB mapped:12kB shmem:0kB slab_reclaimable:16kB slab_unreclaimable:180kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 488 488 488 DMA32 free:1036kB min:2784kB low:3480kB high:4176kB active_anon:9468kB inactive_anon:9724kB active_file:14852kB inactive_file:405224kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:499960kB managed:491256kB mlocked:0kB dirty:5124kB writeback:0kB mapped:9360kB shmem:264kB slab_reclaimable:6340kB slab_unreclaimable:23404kB kernel_stack:1368kB pagetables:3124kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 DMA: 34*4kB (UR) 23*8kB (UMR) 40*16kB (UMR) 2*32kB (UM) 1*64kB (R) 1*128kB (R) 1*256kB (R) 1*512kB (R) 0*1024kB 0*2048kB 0*4096kB = 1984kB DMA32: 1*4kB (U) 33*8kB (UEM) 8*16kB (UM) 4*32kB (UM) 8*64kB (UM) 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 1036kB 108257 total pagecache pages 712 pages in swap cache Swap cache stats: add 4829, delete 4117, find 585/628 Free swap = 376288kB Total swap = 392188kB 131054 pages RAM 3820 pages reserved 280574 pages shared 116800 pages non-shared -- Eric Wong -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending 2013-01-09 3:55 ` Eric Wong @ 2013-01-09 8:42 ` Eric Wong 2013-01-09 8:51 ` Eric Wong 0 siblings, 1 reply; 53+ messages in thread From: Eric Wong @ 2013-01-09 8:42 UTC (permalink / raw) To: Eric Dumazet Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim, Andrew Morton, Linus Torvalds Eric Wong <normalperson@yhbt.net> wrote: > Eric Dumazet <erdnetdev@gmail.com> wrote: > > On Tue, 2013-01-08 at 18:32 -0800, Eric Dumazet wrote: > > > Hmm, it seems sk_filter() can return -ENOMEM because skb has the > > > pfmemalloc() set. > > > > > > > > One TCP socket keeps retransmitting an SKB via loopback, and TCP stack > > > drops the packet again and again. > > > > sock_init_data() sets sk->sk_allocation to GFP_KERNEL > > > > Shouldnt it use (GFP_KERNEL | __GFP_NOMEMALLOC) instead ? > > Thanks, things are running good after ~35 minutes so far. > Will report back if things break (hopefully I don't run out > of laptop battery power :x). Oops, I had to restart my test :x. However, I was able to reproduce the issue very quickly again with your patch. I've double-checked I'm booting into the correct kernel, but I do have more load on this laptop host now, so maybe that made it happen more quickly...
* Re: ppoll() stuck on POLLIN while TCP peer is sending 2013-01-09 8:42 ` Eric Wong @ 2013-01-09 8:51 ` Eric Wong 0 siblings, 0 replies; 53+ messages in thread From: Eric Wong @ 2013-01-09 8:51 UTC (permalink / raw) To: Eric Dumazet Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim, Andrew Morton, Linus Torvalds Eric Wong <normalperson@yhbt.net> wrote: > Oops, I had to restart my test :x. However, I was able to reproduce the > issue very quickly again with your patch. I've double-checked I'm > booting into the correct kernel, but I do have more load on this > laptop host now, so maybe that made it happen more quickly... Oops, I forgot to include the debugging output. (Is this information still useful to you guys?) 2724 process stuck! ===> /proc/vmstat <=== nr_free_pages 2401 nr_inactive_anon 3242 nr_active_anon 3044 nr_inactive_file 103091 nr_active_file 4305 nr_unevictable 0 nr_mlock 0 nr_anon_pages 6204 nr_mapped 2332 nr_file_pages 107533 nr_dirty 144 nr_writeback 0 nr_slab_reclaimable 1440 nr_slab_unreclaimable 5202 nr_page_table_pages 773 nr_kernel_stack 167 nr_unstable 0 nr_bounce 0 nr_vmscan_write 0 nr_vmscan_immediate_reclaim 0 nr_writeback_temp 0 nr_isolated_anon 0 nr_isolated_file 0 nr_shmem 115 nr_dirtied 340718 nr_written 4979 nr_anon_transparent_hugepages 0 nr_free_cma 0 nr_dirty_threshold 22904 nr_dirty_background_threshold 11452 pgpgin 43068 pgpgout 20484 pswpin 0 pswpout 0 pgalloc_dma 57018 pgalloc_dma32 9428296 pgalloc_normal 0 pgalloc_movable 0 pgfree 9488417 pgactivate 5151 pgdeactivate 3251 pgfault 751069 pgmajfault 272 pgrefill_dma 115 pgrefill_dma32 3136 pgrefill_normal 0 pgrefill_movable 0 pgsteal_kswapd_dma 2865 pgsteal_kswapd_dma32 209744 pgsteal_kswapd_normal 0 pgsteal_kswapd_movable 0 pgsteal_direct_dma 568 pgsteal_direct_dma32 31692 pgsteal_direct_normal 0 pgsteal_direct_movable 0 pgscan_kswapd_dma 2865 pgscan_kswapd_dma32 210678 pgscan_kswapd_normal 0 pgscan_kswapd_movable 0 pgscan_direct_dma 568 pgscan_direct_dma32 31760 pgscan_direct_normal 0 pgscan_direct_movable 0 pgscan_direct_throttle 0 pginodesteal 0 slabs_scanned 0 kswapd_inodesteal 0 kswapd_low_wmark_hit_quickly 666 kswapd_high_wmark_hit_quickly 2 kswapd_skip_congestion_wait 0 pageoutrun 3135 allocstall 566 pgrotated 2 pgmigrate_success 348 pgmigrate_fail 0 compact_migrate_scanned 335538 compact_free_scanned 144705 compact_isolated 11328 compact_stall 451 compact_fail 279 compact_success 172 unevictable_pgs_culled 1064 unevictable_pgs_scanned 0 unevictable_pgs_rescued 1632 unevictable_pgs_mlocked 1632 unevictable_pgs_munlocked 1632 unevictable_pgs_cleared 0 unevictable_pgs_stranded 0 thp_fault_alloc 0 thp_fault_fallback 0 thp_collapse_alloc 0 thp_collapse_alloc_failed 0 thp_split 0 thp_zero_page_alloc 0 thp_zero_page_alloc_failed 0 ===> 2724[2724]/stack <=== [<ffffffff81077300>] futex_wait_queue_me+0xc0/0xf0 [<ffffffff81077a9d>] futex_wait+0x17d/0x280 [<ffffffff8107988c>] do_futex+0x11c/0xae0 [<ffffffff8107a2d8>] sys_futex+0x88/0x180 [<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2724[2725]/stack <=== [<ffffffff810f5904>] poll_schedule_timeout+0x44/0x60 [<ffffffff810f6d54>] do_sys_poll+0x374/0x4b0 [<ffffffff810f718e>] sys_ppoll+0x19e/0x1b0 [<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2724[2726]/stack <=== [<ffffffff810f5904>] poll_schedule_timeout+0x44/0x60 [<ffffffff810f6d54>] do_sys_poll+0x374/0x4b0 [<ffffffff810f718e>] sys_ppoll+0x19e/0x1b0 [<ffffffff813b06e9>] 
system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2724[2727]/stack <=== [<ffffffff81310078>] sk_stream_wait_memory+0x1b8/0x250 [<ffffffff8134be87>] tcp_sendmsg+0x697/0xd80 [<ffffffff81370cee>] inet_sendmsg+0x5e/0xa0 [<ffffffff81300a77>] sock_sendmsg+0x87/0xa0 [<ffffffff81303a59>] sys_sendto+0x119/0x160 [<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2724[2728]/stack <=== [<ffffffff810400b8>] do_wait+0x1f8/0x220 [<ffffffff81040ea0>] sys_wait4+0x70/0xf0 [<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2773[2773]/stack <=== [<ffffffff81077300>] futex_wait_queue_me+0xc0/0xf0 [<ffffffff81077a9d>] futex_wait+0x17d/0x280 [<ffffffff8107988c>] do_futex+0x11c/0xae0 [<ffffffff8107a2d8>] sys_futex+0x88/0x180 [<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2773[2774]/stack <=== [<ffffffff810f5904>] poll_schedule_timeout+0x44/0x60 [<ffffffff810f6d54>] do_sys_poll+0x374/0x4b0 [<ffffffff810f718e>] sys_ppoll+0x19e/0x1b0 [<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2773[2775]/stack <=== [<ffffffffffffffff>] 0xffffffffffffffff ===> 2773[2776]/stack <=== [<ffffffffffffffff>] 0xffffffffffffffff ===> 2773[2777]/stack <=== [<ffffffff8105d02c>] hrtimer_nanosleep+0x9c/0x150 [<ffffffff8105d13e>] sys_nanosleep+0x5e/0x80 [<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff SysRq : Show Memory Mem-Info: DMA per-cpu: CPU 0: hi: 0, btch: 1 usd: 0 CPU 1: hi: 0, btch: 1 usd: 0 DMA32 per-cpu: CPU 0: hi: 186, btch: 31 usd: 72 CPU 1: hi: 186, btch: 31 usd: 100 active_anon:3130 inactive_anon:3283 isolated_anon:0 active_file:4305 inactive_file:101390 isolated_file:0 unevictable:0 dirty:103 writeback:0 unstable:0 free:3675 slab_reclaimable:1453 slab_unreclaimable:5186 mapped:2332 shmem:115 pagetables:754 bounce:0 free_cma:0 DMA free:2116kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:16kB active_file:0kB inactive_file:13692kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15644kB managed:15900kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:4kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 488 488 488 DMA32 free:12584kB min:2784kB low:3480kB high:4176kB active_anon:12520kB inactive_anon:13116kB active_file:17220kB inactive_file:391868kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:499960kB managed:491256kB mlocked:0kB dirty:420kB writeback:0kB mapped:9328kB shmem:460kB slab_reclaimable:5812kB slab_unreclaimable:20740kB kernel_stack:1336kB pagetables:3016kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 DMA: 4*4kB (UMR) 1*8kB (M) 1*16kB (M) 3*32kB (MR) 1*64kB (R) 1*128kB (R) 1*256kB (R) 1*512kB (R) 1*1024kB (R) 0*2048kB 0*4096kB = 2120kB DMA32: 411*4kB (UEM) 359*8kB (UEM) 207*16kB (UM) 84*32kB (UM) 25*64kB (UM) 4*128kB (M) 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 12628kB 105835 total pagecache pages 0 pages in swap cache Swap cache stats: add 0, delete 0, find 0/0 Free swap = 392188kB Total swap = 392188kB 131054 pages RAM 3820 pages reserved 276721 pages shared 117464 pages non-shared 2773 process stuck! 
===> /proc/vmstat <=== nr_free_pages 1579 nr_inactive_anon 3302 nr_active_anon 3078 nr_inactive_file 103991 nr_active_file 4357 nr_unevictable 0 nr_mlock 0 nr_anon_pages 6260 nr_mapped 2319 nr_file_pages 108478 nr_dirty 648 nr_writeback 0 nr_slab_reclaimable 1603 nr_slab_unreclaimable 5380 nr_page_table_pages 748 nr_kernel_stack 171 nr_unstable 0 nr_bounce 0 nr_vmscan_write 0 nr_vmscan_immediate_reclaim 0 nr_writeback_temp 0 nr_isolated_anon 0 nr_isolated_file 0 nr_shmem 115 nr_dirtied 841467 nr_written 15931 nr_anon_transparent_hugepages 0 nr_free_cma 0 nr_dirty_threshold 22949 nr_dirty_background_threshold 11474 pgpgin 43832 pgpgout 64464 pswpin 0 pswpout 0 pgalloc_dma 105241 pgalloc_dma32 12655633 pgalloc_normal 0 pgalloc_movable 0 pgfree 12763390 pgactivate 5358 pgdeactivate 3607 pgfault 1011343 pgmajfault 302 pgrefill_dma 407 pgrefill_dma32 3200 pgrefill_normal 0 pgrefill_movable 0 pgsteal_kswapd_dma 10785 pgsteal_kswapd_dma32 612431 pgsteal_kswapd_normal 0 pgsteal_kswapd_movable 0 pgsteal_direct_dma 2159 pgsteal_direct_dma32 120797 pgsteal_direct_normal 0 pgsteal_direct_movable 0 pgscan_kswapd_dma 10785 pgscan_kswapd_dma32 613376 pgscan_kswapd_normal 0 pgscan_kswapd_movable 0 pgscan_direct_dma 2159 pgscan_direct_dma32 120866 pgscan_direct_normal 0 pgscan_direct_movable 0 pgscan_direct_throttle 0 pginodesteal 0 slabs_scanned 3072 kswapd_inodesteal 0 kswapd_low_wmark_hit_quickly 1810 kswapd_high_wmark_hit_quickly 13 kswapd_skip_congestion_wait 0 pageoutrun 9178 allocstall 2157 pgrotated 2 pgmigrate_success 509 pgmigrate_fail 0 compact_migrate_scanned 818935 compact_free_scanned 214217 compact_isolated 27006 compact_stall 1014 compact_fail 674 compact_success 340 unevictable_pgs_culled 1064 unevictable_pgs_scanned 0 unevictable_pgs_rescued 1632 unevictable_pgs_mlocked 1632 unevictable_pgs_munlocked 1632 unevictable_pgs_cleared 0 unevictable_pgs_stranded 0 thp_fault_alloc 0 thp_fault_fallback 0 thp_collapse_alloc 0 thp_collapse_alloc_failed 0 thp_split 0 thp_zero_page_alloc 0 thp_zero_page_alloc_failed 0 ===> 2724[2724]/stack <=== [<ffffffff81077300>] futex_wait_queue_me+0xc0/0xf0 [<ffffffff81077a9d>] futex_wait+0x17d/0x280 [<ffffffff8107988c>] do_futex+0x11c/0xae0 [<ffffffff8107a2d8>] sys_futex+0x88/0x180 [<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2724[2725]/stack <=== [<ffffffff810f5904>] poll_schedule_timeout+0x44/0x60 [<ffffffff810f6d54>] do_sys_poll+0x374/0x4b0 [<ffffffff810f718e>] sys_ppoll+0x19e/0x1b0 [<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2724[2726]/stack <=== [<ffffffff810f5904>] poll_schedule_timeout+0x44/0x60 [<ffffffff810f6d54>] do_sys_poll+0x374/0x4b0 [<ffffffff810f718e>] sys_ppoll+0x19e/0x1b0 [<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2724[2727]/stack <=== [<ffffffff81310078>] sk_stream_wait_memory+0x1b8/0x250 [<ffffffff8134be87>] tcp_sendmsg+0x697/0xd80 [<ffffffff81370cee>] inet_sendmsg+0x5e/0xa0 [<ffffffff81300a77>] sock_sendmsg+0x87/0xa0 [<ffffffff81303a59>] sys_sendto+0x119/0x160 [<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2724[2728]/stack <=== [<ffffffff8105d02c>] hrtimer_nanosleep+0x9c/0x150 [<ffffffff8105d13e>] sys_nanosleep+0x5e/0x80 [<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2773[2773]/stack <=== [<ffffffff81077300>] futex_wait_queue_me+0xc0/0xf0 [<ffffffff81077a9d>] futex_wait+0x17d/0x280 
[<ffffffff8107988c>] do_futex+0x11c/0xae0 [<ffffffff8107a2d8>] sys_futex+0x88/0x180 [<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2773[2774]/stack <=== [<ffffffff810f5904>] poll_schedule_timeout+0x44/0x60 [<ffffffff810f6d54>] do_sys_poll+0x374/0x4b0 [<ffffffff810f718e>] sys_ppoll+0x19e/0x1b0 [<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2773[2775]/stack <=== [<ffffffff810f5904>] poll_schedule_timeout+0x44/0x60 [<ffffffff810f6d54>] do_sys_poll+0x374/0x4b0 [<ffffffff810f718e>] sys_ppoll+0x19e/0x1b0 [<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2773[2776]/stack <=== [<ffffffff81310078>] sk_stream_wait_memory+0x1b8/0x250 [<ffffffff8134be87>] tcp_sendmsg+0x697/0xd80 [<ffffffff81370cee>] inet_sendmsg+0x5e/0xa0 [<ffffffff81300a77>] sock_sendmsg+0x87/0xa0 [<ffffffff81303a59>] sys_sendto+0x119/0x160 [<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2773[2777]/stack <=== [<ffffffff810400b8>] do_wait+0x1f8/0x220 [<ffffffff81040ea0>] sys_wait4+0x70/0xf0 [<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff <redundant "SysRq : Show Memory" from previous process omitted> SysRq : Show Memory Mem-Info: DMA per-cpu: CPU 0: hi: 0, btch: 1 usd: 0 CPU 1: hi: 0, btch: 1 usd: 0 DMA32 per-cpu: CPU 0: hi: 186, btch: 31 usd: 164 CPU 1: hi: 186, btch: 31 usd: 117 active_anon:3016 inactive_anon:3281 isolated_anon:0 active_file:4357 inactive_file:104163 isolated_file:0 unevictable:0 dirty:142 writeback:0 unstable:0 free:1582 slab_reclaimable:1598 slab_unreclaimable:5380 mapped:2316 shmem:115 pagetables:773 bounce:0 free_cma:0 DMA free:2332kB min:84kB low:104kB high:124kB active_anon:8kB inactive_anon:8kB active_file:0kB inactive_file:13476kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15644kB managed:15900kB mlocked:0kB dirty:12kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:12kB kernel_stack:0kB pagetables:8kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 488 488 488 DMA32 free:3996kB min:2784kB low:3480kB high:4176kB active_anon:12056kB inactive_anon:13116kB active_file:17428kB inactive_file:403176kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:499960kB managed:491256kB mlocked:0kB dirty:556kB writeback:0kB mapped:9264kB shmem:460kB slab_reclaimable:6392kB slab_unreclaimable:21508kB kernel_stack:1360kB pagetables:3084kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 DMA: 19*4kB (UER) 20*8kB (U) 7*16kB (U) 2*32kB (R) 0*64kB 3*128kB (R) 0*256kB 1*512kB (R) 1*1024kB (R) 0*2048kB 0*4096kB = 2332kB DMA32: 151*4kB (UEM) 210*8kB (UE) 108*16kB (U) 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 4012kB 108629 total pagecache pages 0 pages in swap cache Swap cache stats: add 0, delete 0, find 0/0 Free swap = 392188kB Total swap = 392188kB 131054 pages RAM 3820 pages reserved 275952 pages shared 119896 pages non-shared -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending 2013-01-09 2:32 ` Eric Dumazet 2013-01-09 2:54 ` Eric Dumazet @ 2013-01-09 13:42 ` Mel Gorman 1 sibling, 0 replies; 53+ messages in thread From: Mel Gorman @ 2013-01-09 13:42 UTC (permalink / raw) To: Eric Dumazet Cc: Eric Wong, linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim, Andrew Morton, Linus Torvalds On Tue, Jan 08, 2013 at 06:32:29PM -0800, Eric Dumazet wrote: > On Tue, 2013-01-08 at 18:14 -0800, Eric Dumazet wrote: > > On Tue, 2013-01-08 at 23:23 +0000, Eric Wong wrote: > > > Mel Gorman <mgorman@suse.de> wrote: > > > > Please try the following patch. However, even if it works the benefit of > > > > capture may be so marginal that partially reverting it and simplifying > > > > compaction.c is the better decision. > > > > > > I already got my VM stuck on this one. I had two twosleepy instances, > > > 2774 was the one that got stuck (also confirmed by watching top). > > > > > > Btw, have you been able to reproduce this on your end? > > > > > > I think the easiest reproduction on my 2-core VM is by running 2 > > > twosleepy processes and doing the following to dirty a lot of pages: > > > > Given the persistent sk_stream_wait_memory() traces I suspect a plain > > TCP bug, triggered by some extra wait somewhere. > > > > Please mm guys don't spend too much time right now, I'll try to > > reproduce the problem. > > > > Don't be confused by sk_stream_wait_memory() name. > > A thread is stuck here because TCP stack is failing to wake it. > > > > Hmm, it seems sk_filter() can return -ENOMEM because skb has the > pfmemalloc() set. > The skb should not have pfmemalloc set in most cases, particularly after cfd19c5a (mm: only set page->pfmemalloc when ALLOC_NO_WATERMARKS was used) but the capture patch also failed to clear pfmemalloc properly so it could be set in error. -- Mel Gorman SUSE Labs
* Re: ppoll() stuck on POLLIN while TCP peer is sending 2013-01-08 23:23 ` Eric Wong 2013-01-09 2:14 ` Eric Dumazet @ 2013-01-09 13:37 ` Mel Gorman 2013-01-09 13:50 ` Mel Gorman 2013-01-10 9:25 ` Eric Wong 1 sibling, 2 replies; 53+ messages in thread From: Mel Gorman @ 2013-01-09 13:37 UTC (permalink / raw) To: Eric Wong Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim, Eric Dumazet, Andrew Morton, Linus Torvalds On Tue, Jan 08, 2013 at 11:23:25PM +0000, Eric Wong wrote: > Mel Gorman <mgorman@suse.de> wrote: > > Please try the following patch. However, even if it works the benefit of > > capture may be so marginal that partially reverting it and simplifying > > compaction.c is the better decision. > > I already got my VM stuck on this one. I had two twosleepy instances, > 2774 was the one that got stuck (also confirmed by watching top). > page->pfmemalloc can be left set for captured pages so try this but as capture is rarely used I'm strongly favouring a partial revert even if this works for you. I haven't reproduced this using your workload yet but I have found that high-order allocation stress tests for 3.8-rc2 are completely screwed. 71% success rates at rest in 3.7 and 6% in 3.8-rc2 so I have to chase that down too. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9d20c13..c242d21 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2180,8 +2180,10 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags &= ~PF_MEMALLOC; /* If compaction captured a page, prep and use it */ - if (page && !prep_new_page(page, order, gfp_mask)) + if (page && !prep_new_page(page, order, gfp_mask)) { + page->pfmemalloc = false; goto got_page; + } if (*did_some_progress != COMPACT_SKIPPED) { /* Page migration frees to the PCP lists but we want merging */ -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 53+ messages in thread
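To spell out what the one-liner above changes (paraphrasing Mel's and Eric Dumazet's diagnosis; the model below is illustrative, not kernel code): after commit cfd19c5a the ordinary allocation path re-initialises page->pfmemalloc for every page it hands out, but the capture path returned a page without passing through that step, so a stale value from the page's previous life could survive and later taint any skb built from it:

#include <stdbool.h>
#include <stdio.h>

struct page_model { bool pfmemalloc; };

/* normal path: the flag always reflects *this* allocation */
static void freelist_alloc_model(struct page_model *pg, bool no_watermarks)
{
	pg->pfmemalloc = no_watermarks;
}

/* capture path before the fix: the flag is never touched */
static void capture_alloc_model(struct page_model *pg)
{
	(void)pg;	/* pg->pfmemalloc keeps whatever it had */
}

int main(void)
{
	struct page_model pg = { .pfmemalloc = true };	/* stale from before */

	capture_alloc_model(&pg);
	printf("captured page pfmemalloc: %d (stale!)\n", pg.pfmemalloc);

	freelist_alloc_model(&pg, false);
	printf("freelist page pfmemalloc: %d\n", pg.pfmemalloc);
	return 0;
}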
* Re: ppoll() stuck on POLLIN while TCP peer is sending 2013-01-09 13:37 ` Mel Gorman @ 2013-01-09 13:50 ` Mel Gorman 2013-01-10 9:25 ` Eric Wong 1 sibling, 0 replies; 53+ messages in thread From: Mel Gorman @ 2013-01-09 13:50 UTC (permalink / raw) To: Eric Wong Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim, Eric Dumazet, Andrew Morton, Linus Torvalds On Wed, Jan 09, 2013 at 01:37:46PM +0000, Mel Gorman wrote: > On Tue, Jan 08, 2013 at 11:23:25PM +0000, Eric Wong wrote: > > Mel Gorman <mgorman@suse.de> wrote: > > > Please try the following patch. However, even if it works the benefit of > > > capture may be so marginal that partially reverting it and simplifying > > > compaction.c is the better decision. > > > > I already got my VM stuck on this one. I had two twosleepy instances, > > 2774 was the one that got stuck (also confirmed by watching top). > > > > page->pfmemalloc can be left set for captured pages so try this but as > capture is rarely used I'm strongly favouring a partial revert even if > this works for you. Partial revert looks like this ---8<--- --- include/linux/compaction.h | 4 +- include/linux/mm.h | 1 - mm/compaction.c | 91 ++++++-------------------------------------- mm/internal.h | 1 - mm/page_alloc.c | 38 +++++------------- 5 files changed, 23 insertions(+), 112 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 6ecb6dc..cc7bdde 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask, - bool sync, bool *contended, struct page **page); + bool sync, bool *contended); extern int compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); extern unsigned long compaction_suitable(struct zone *zone, int order); @@ -75,7 +75,7 @@ static inline bool compaction_restarting(struct zone *zone, int order) #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended, struct page **page) + bool sync, bool *contended) { return COMPACT_CONTINUE; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 6320407..66e2f7c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -455,7 +455,6 @@ void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); int split_free_page(struct page *page); -int capture_free_page(struct page *page, int alloc_order, int migratetype); /* * Compound pages have a destructor function. Provide a diff --git a/mm/compaction.c b/mm/compaction.c index 6b807e4..8bc3066 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -816,6 +816,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, static int compact_finished(struct zone *zone, struct compact_control *cc) { + unsigned int order; unsigned long watermark; if (fatal_signal_pending(current)) @@ -850,22 +851,15 @@ static int compact_finished(struct zone *zone, return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ - if (cc->page) { - /* Was a suitable page captured? 
*/ - if (*cc->page) + for (order = cc->order; order < MAX_ORDER; order++) { + struct free_area *area = &zone->free_area[cc->order]; + /* Job done if page is free of the right migratetype */ + if (!list_empty(&area->free_list[cc->migratetype])) + return COMPACT_PARTIAL; + + /* Job done if allocation would set block type */ + if (cc->order >= pageblock_order && area->nr_free) return COMPACT_PARTIAL; - } else { - unsigned int order; - for (order = cc->order; order < MAX_ORDER; order++) { - struct free_area *area = &zone->free_area[cc->order]; - /* Job done if page is free of the right migratetype */ - if (!list_empty(&area->free_list[cc->migratetype])) - return COMPACT_PARTIAL; - - /* Job done if allocation would set block type */ - if (cc->order >= pageblock_order && area->nr_free) - return COMPACT_PARTIAL; - } } return COMPACT_CONTINUE; @@ -921,60 +915,6 @@ unsigned long compaction_suitable(struct zone *zone, int order) return COMPACT_CONTINUE; } -static void compact_capture_page(struct compact_control *cc) -{ - unsigned long flags; - int mtype, mtype_low, mtype_high; - - if (!cc->page || *cc->page) - return; - - /* - * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP - * regardless of the migratetype of the freelist is is captured from. - * This is fine because the order for a high-order MIGRATE_MOVABLE - * allocation is typically at least a pageblock size and overall - * fragmentation is not impaired. Other allocation types must - * capture pages from their own migratelist because otherwise they - * could pollute other pageblocks like MIGRATE_MOVABLE with - * difficult to move pages and making fragmentation worse overall. - */ - if (cc->migratetype == MIGRATE_MOVABLE) { - mtype_low = 0; - mtype_high = MIGRATE_PCPTYPES; - } else { - mtype_low = cc->migratetype; - mtype_high = cc->migratetype + 1; - } - - /* Speculatively examine the free lists without zone lock */ - for (mtype = mtype_low; mtype < mtype_high; mtype++) { - int order; - for (order = cc->order; order < MAX_ORDER; order++) { - struct page *page; - struct free_area *area; - area = &(cc->zone->free_area[order]); - if (list_empty(&area->free_list[mtype])) - continue; - - /* Take the lock and attempt capture of the page */ - if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) - return; - if (!list_empty(&area->free_list[mtype])) { - page = list_entry(area->free_list[mtype].next, - struct page, lru); - if (capture_free_page(page, cc->order, mtype)) { - spin_unlock_irqrestore(&cc->zone->lock, - flags); - *cc->page = page; - return; - } - } - spin_unlock_irqrestore(&cc->zone->lock, flags); - } - } -} - static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; @@ -1054,9 +994,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) goto out; } } - - /* Capture a page now if it is a suitable size */ - compact_capture_page(cc); } out: @@ -1069,8 +1006,7 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - bool sync, bool *contended, - struct page **page) + bool sync, bool *contended) { unsigned long ret; struct compact_control cc = { @@ -1080,7 +1016,6 @@ static unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, - .page = page, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1110,7 +1045,7 @@ int sysctl_extfrag_threshold = 500; */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, 
 			nodemask_t *nodemask,
-			bool sync, bool *contended, struct page **page)
+			bool sync, bool *contended)
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1136,7 +1071,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 		int status;
 
 		status = compact_zone_order(zone, order, gfp_mask, sync,
-						contended, page);
+						contended);
 		rc = max(status, rc);
 
 		/* If a normal allocation would succeed, stop compacting */
@@ -1192,7 +1127,6 @@ int compact_pgdat(pg_data_t *pgdat, int order)
 	struct compact_control cc = {
 		.order = order,
 		.sync = false,
-		.page = NULL,
 	};
 
 	return __compact_pgdat(pgdat, &cc);
@@ -1203,7 +1137,6 @@ static int compact_node(int nid)
 	struct compact_control cc = {
 		.order = -1,
 		.sync = true,
-		.page = NULL,
 	};
 
 	return __compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/internal.h b/mm/internal.h
index d597f94..9ba2110 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -135,7 +135,6 @@ struct compact_control {
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
 	struct zone *zone;
 	bool contended;			/* True if a lock was contended */
-	struct page **page;		/* Page captured of requested size */
 };
 
 unsigned long
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4ba5e37..ebf7fd8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1389,22 +1389,14 @@ void split_page(struct page *page, unsigned int order)
 		set_page_refcounted(page + i);
 }
 
-/*
- * Similar to the split_page family of functions except that the page
- * required at the given order and being isolated now to prevent races
- * with parallel allocators
- */
-int capture_free_page(struct page *page, int alloc_order, int migratetype)
+static int __isolate_free_page(struct page *page, unsigned int order)
 {
-	unsigned int order;
 	unsigned long watermark;
 	struct zone *zone;
 	int mt;
 
 	BUG_ON(!PageBuddy(page));
-
 	zone = page_zone(page);
-	order = page_order(page);
 	mt = get_pageblock_migratetype(page);
 
 	if (mt != MIGRATE_ISOLATE) {
@@ -1413,7 +1405,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 		if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
 			return 0;
 
-		__mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
+		__mod_zone_freepage_state(zone, -(1UL << order), mt);
 	}
 
 	/* Remove page from free list */
@@ -1421,11 +1413,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 	zone->free_area[order].nr_free--;
 	rmv_page_order(page);
 
-	if (alloc_order != order)
-		expand(zone, page, alloc_order, order,
-			&zone->free_area[order], migratetype);
-
-	/* Set the pageblock if the captured page is at least a pageblock */
+	/* Set the pageblock if the isolated page is at least a pageblock */
 	if (order >= pageblock_order - 1) {
 		struct page *endpage = page + (1 << order) - 1;
 		for (; page < endpage; page += pageblock_nr_pages) {
@@ -1436,7 +1424,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 		}
 	}
 
-	return 1UL << alloc_order;
+	return 1UL << order;
 }
 
 /*
@@ -1451,13 +1439,12 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
  */
 int split_free_page(struct page *page)
 {
-	unsigned int order;
+	unsigned int order = page_order(page);
 	int nr_pages;
 
-	BUG_ON(!PageBuddy(page));
 	order = page_order(page);
-	nr_pages = capture_free_page(page, order, 0);
+	nr_pages = __isolate_free_page(page, order);
 	if (!nr_pages)
 		return 0;
@@ -2163,8 +2150,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	bool *contended_compaction, bool *deferred_compaction,
 	unsigned long *did_some_progress)
 {
-	struct page *page = NULL;
-
 	if (!order)
 		return NULL;
@@ -2176,16 +2161,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	current->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
 						nodemask, sync_migration,
-						contended_compaction, &page);
+						contended_compaction);
 	current->flags &= ~PF_MEMALLOC;
 
-	/* If compaction captured a page, prep and use it */
-	if (page) {
-		prep_new_page(page, order, gfp_mask);
-		goto got_page;
-	}
-
 	if (*did_some_progress != COMPACT_SKIPPED) {
+		struct page *page;
+
 		/* Page migration frees to the PCP lists but we want merging */
 		drain_pages(get_cpu());
 		put_cpu();
@@ -2195,7 +2176,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 				alloc_flags & ~ALLOC_NO_WATERMARKS,
 				preferred_zone, migratetype);
 		if (page) {
-got_page:
 			preferred_zone->compact_blockskip_flush = false;
 			preferred_zone->compact_considered = 0;
 			preferred_zone->compact_defer_shift = 0;

^ permalink raw reply related	[flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-09 13:37           ` Mel Gorman
  2013-01-09 13:50             ` Mel Gorman
@ 2013-01-10 9:25              ` Eric Wong
  2013-01-10 19:42                ` Mel Gorman
  1 sibling, 1 reply; 53+ messages in thread
From: Eric Wong @ 2013-01-10 9:25 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
      Eric Dumazet, Andrew Morton, Linus Torvalds

Mel Gorman <mgorman@suse.de> wrote:
> page->pfmemalloc can be left set for captured pages so try this but as
> capture is rarely used I'm strongly favouring a partial revert even if
> this works for you. I haven't reproduced this using your workload yet
> but I have found that high-order allocation stress tests for 3.8-rc2 are
> completely screwed. 71% success rates at rest in 3.7 and 6% in 3.8-rc2 so
> I have to chase that down too.
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 9d20c13..c242d21 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2180,8 +2180,10 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>  	current->flags &= ~PF_MEMALLOC;
>
>  	/* If compaction captured a page, prep and use it */
> -	if (page && !prep_new_page(page, order, gfp_mask))
> +	if (page && !prep_new_page(page, order, gfp_mask)) {
> +		page->pfmemalloc = false;
>  		goto got_page;
> +	}
>
>  	if (*did_some_progress != COMPACT_SKIPPED) {
>  		/* Page migration frees to the PCP lists but we want merging */

This (on top of your previous patch) seems to work great after several
hours of testing on both my VM and real machine.  I haven't tried your
partial revert, yet.  Will try that in a bit on the VM.

^ permalink raw reply	[flat|nested] 53+ messages in thread
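A note on the mechanism behind this one-line fix: page->pfmemalloc
propagates into skb->pfmemalloc when the network stack builds packet
data from a page, and the receive path refuses reserve-backed skbs on
sockets that are not SOCK_MEMALLOC (the swap-over-NFS case). Capture
bypassed get_page_from_freelist(), where the flag is normally
reinitialised, so a captured page could carry a stale flag. The toy
program below sketches that drop logic as I understand the 3.7-era
sk_filter() check; the *_model types are stand-ins, not the kernel's
structures.

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for struct sk_buff and struct sock. */
struct skb_model  { bool pfmemalloc; }; /* inherited from the backing page */
struct sock_model { bool memalloc; };   /* SOCK_MEMALLOC, e.g. swap-over-NFS */

/* Simplified shape of the receive-side pfmemalloc check. */
static int sk_filter_model(const struct sock_model *sk,
                           const struct skb_model *skb)
{
	/* skbs built from emergency reserves are only for sockets that
	 * help free memory; for everyone else the packet is discarded. */
	if (skb->pfmemalloc && !sk->memalloc)
		return -1;
	return 0;
}

int main(void)
{
	struct sock_model plain_tcp = { .memalloc = false };
	struct skb_model from_captured_page = { .pfmemalloc = true };

	if (sk_filter_model(&plain_tcp, &from_captured_page))
		puts("dropped: stale pfmemalloc, receiver is never woken");
	return 0;
}

Once a page re-enters circulation with the flag still set it keeps
poisoning packets long after any memory pressure has passed, which fits
a hang that only shows up after hours of traffic.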
* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-10 9:25              ` Eric Wong
@ 2013-01-10 19:42              ` Mel Gorman
  2013-01-10 20:03                ` Eric Wong
  ` (2 more replies)
  0 siblings, 3 replies; 53+ messages in thread
From: Mel Gorman @ 2013-01-10 19:42 UTC (permalink / raw)
  To: Eric Wong
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
      Eric Dumazet, Andrew Morton, Linus Torvalds

On Thu, Jan 10, 2013 at 09:25:11AM +0000, Eric Wong wrote:
> Mel Gorman <mgorman@suse.de> wrote:
> > page->pfmemalloc can be left set for captured pages so try this but as
> > capture is rarely used I'm strongly favouring a partial revert even if
> > this works for you. I haven't reproduced this using your workload yet
> > but I have found that high-order allocation stress tests for 3.8-rc2 are
> > completely screwed. 71% success rates at rest in 3.7 and 6% in 3.8-rc2 so
> > I have to chase that down too.
> >
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 9d20c13..c242d21 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -2180,8 +2180,10 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
> >  	current->flags &= ~PF_MEMALLOC;
> >
> >  	/* If compaction captured a page, prep and use it */
> > -	if (page && !prep_new_page(page, order, gfp_mask))
> > +	if (page && !prep_new_page(page, order, gfp_mask)) {
> > +		page->pfmemalloc = false;
> >  		goto got_page;
> > +	}
> >
> >  	if (*did_some_progress != COMPACT_SKIPPED) {
> >  		/* Page migration frees to the PCP lists but we want merging */
>
> This (on top of your previous patch) seems to work great after several
> hours of testing on both my VM and real machine.  I haven't tried your
> partial revert, yet.  Will try that in a bit on the VM.

Thanks Eric, it's much appreciated. However, I'm still very much in
favour of a partial revert as in retrospect the implementation of
capture took the wrong approach. Could you confirm the following patch
works for you? It should functionally have the same effect as the first
revert and there are only minor changes from the last revert prototype
I sent you but there is no harm in being sure.

---8<---
mm: compaction: Partially revert capture of suitable high-order page

Eric Wong reported on 3.7 and 3.8-rc2 that ppoll() got stuck when waiting
for POLLIN on a local TCP socket. It was easier to trigger if there was
disk IO and dirty pages at the same time and he bisected it to commit
1fb3f8ca "mm: compaction: capture a suitable high-order page immediately
when it is made available".

The intention of that patch was to improve high-order allocations under
memory pressure after changes made to reclaim in 3.6 drastically hurt
THP allocations but the approach was flawed. For Eric, the problem was
that page->pfmemalloc was not being cleared for captured pages leading to
a poor interaction with swap-over-NFS support causing the packets to be
dropped. However, I identified a few more problems with the patch
including the fact that it can increase contention on zone->lock in some
cases which could result in async direct compaction being aborted early.

In retrospect the capture patch took the wrong approach. What it should
have done is mark the pageblock being migrated as MIGRATE_ISOLATE if it
was allocating for THP and avoided races that way.
While the patch was shown to improve allocation success rates at the
time, the benefit is marginal given the relative complexity and it
should be revisited from scratch in the context of the other
reclaim-related changes that have taken place since the patch was first
written and tested. This patch partially reverts commit 1fb3f8ca "mm:
compaction: capture a suitable high-order page immediately when it is
made available".

Reported-by: Eric Wong <normalperson@yhbt.net>
Cc: stable@vger.kernel.org
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/compaction.h |  4 +-
 include/linux/mm.h         |  1 -
 mm/compaction.c            | 92 +++++++-------------------------------
 mm/internal.h              |  1 -
 mm/page_alloc.c            | 35 ++++-------------
 5 files changed, 23 insertions(+), 110 deletions(-)

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 6ecb6dc..cc7bdde 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *mask,
-			bool sync, bool *contended, struct page **page);
+			bool sync, bool *contended);
 extern int compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
@@ -75,7 +75,7 @@ static inline bool compaction_restarting(struct zone *zone, int order)
 #else
 static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended, struct page **page)
+			bool sync, bool *contended)
 {
 	return COMPACT_CONTINUE;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6320407..66e2f7c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -455,7 +455,6 @@ void put_pages_list(struct list_head *pages);
 
 void split_page(struct page *page, unsigned int order);
 int split_free_page(struct page *page);
-int capture_free_page(struct page *page, int alloc_order, int migratetype);
 
 /*
  * Compound pages have a destructor function.  Provide a
diff --git a/mm/compaction.c b/mm/compaction.c
index 6b807e4..2c57043 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -816,6 +816,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 static int compact_finished(struct zone *zone,
 			    struct compact_control *cc)
 {
+	unsigned int order;
 	unsigned long watermark;
 
 	if (fatal_signal_pending(current))
@@ -850,22 +851,16 @@ static int compact_finished(struct zone *zone,
 		return COMPACT_CONTINUE;
 
 	/* Direct compactor: Is a suitable page free? */
-	if (cc->page) {
-		/* Was a suitable page captured? */
-		if (*cc->page)
+	for (order = cc->order; order < MAX_ORDER; order++) {
+		struct free_area *area = &zone->free_area[order];
+
+		/* Job done if page is free of the right migratetype */
+		if (!list_empty(&area->free_list[cc->migratetype]))
+			return COMPACT_PARTIAL;
+
+		/* Job done if allocation would set block type */
+		if (cc->order >= pageblock_order && area->nr_free)
 			return COMPACT_PARTIAL;
-	} else {
-		unsigned int order;
-		for (order = cc->order; order < MAX_ORDER; order++) {
-			struct free_area *area = &zone->free_area[cc->order];
-			/* Job done if page is free of the right migratetype */
-			if (!list_empty(&area->free_list[cc->migratetype]))
-				return COMPACT_PARTIAL;
-
-			/* Job done if allocation would set block type */
-			if (cc->order >= pageblock_order && area->nr_free)
-				return COMPACT_PARTIAL;
-		}
 	}
 
 	return COMPACT_CONTINUE;
@@ -921,60 +916,6 @@ unsigned long compaction_suitable(struct zone *zone, int order)
 	return COMPACT_CONTINUE;
 }
 
-static void compact_capture_page(struct compact_control *cc)
-{
-	unsigned long flags;
-	int mtype, mtype_low, mtype_high;
-
-	if (!cc->page || *cc->page)
-		return;
-
-	/*
-	 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
-	 * regardless of the migratetype of the freelist is is captured from.
-	 * This is fine because the order for a high-order MIGRATE_MOVABLE
-	 * allocation is typically at least a pageblock size and overall
-	 * fragmentation is not impaired. Other allocation types must
-	 * capture pages from their own migratelist because otherwise they
-	 * could pollute other pageblocks like MIGRATE_MOVABLE with
-	 * difficult to move pages and making fragmentation worse overall.
-	 */
-	if (cc->migratetype == MIGRATE_MOVABLE) {
-		mtype_low = 0;
-		mtype_high = MIGRATE_PCPTYPES;
-	} else {
-		mtype_low = cc->migratetype;
-		mtype_high = cc->migratetype + 1;
-	}
-
-	/* Speculatively examine the free lists without zone lock */
-	for (mtype = mtype_low; mtype < mtype_high; mtype++) {
-		int order;
-		for (order = cc->order; order < MAX_ORDER; order++) {
-			struct page *page;
-			struct free_area *area;
-			area = &(cc->zone->free_area[order]);
-			if (list_empty(&area->free_list[mtype]))
-				continue;
-
-			/* Take the lock and attempt capture of the page */
-			if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
-				return;
-			if (!list_empty(&area->free_list[mtype])) {
-				page = list_entry(area->free_list[mtype].next,
-							struct page, lru);
-				if (capture_free_page(page, cc->order, mtype)) {
-					spin_unlock_irqrestore(&cc->zone->lock,
-									flags);
-					*cc->page = page;
-					return;
-				}
-			}
-			spin_unlock_irqrestore(&cc->zone->lock, flags);
-		}
-	}
-}
-
 static int compact_zone(struct zone *zone, struct compact_control *cc)
 {
 	int ret;
@@ -1054,9 +995,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 				goto out;
 			}
 		}
-
-		/* Capture a page now if it is a suitable size */
-		compact_capture_page(cc);
 	}
 
 out:
@@ -1069,8 +1007,7 @@ out:
 
 static unsigned long compact_zone_order(struct zone *zone,
 				 int order, gfp_t gfp_mask,
-				 bool sync, bool *contended,
-				 struct page **page)
+				 bool sync, bool *contended)
 {
 	unsigned long ret;
 	struct compact_control cc = {
@@ -1080,7 +1017,6 @@ static unsigned long compact_zone_order(struct zone *zone,
 		.migratetype = allocflags_to_migratetype(gfp_mask),
 		.zone = zone,
 		.sync = sync,
-		.page = page,
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
@@ -1110,7 +1046,7 @@ int sysctl_extfrag_threshold = 500;
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended, struct page **page)
+			bool sync, bool *contended)
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1136,7 +1072,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 		int status;
 
 		status = compact_zone_order(zone, order, gfp_mask, sync,
-						contended, page);
+						contended);
 		rc = max(status, rc);
 
 		/* If a normal allocation would succeed, stop compacting */
@@ -1192,7 +1128,6 @@ int compact_pgdat(pg_data_t *pgdat, int order)
 	struct compact_control cc = {
 		.order = order,
 		.sync = false,
-		.page = NULL,
 	};
 
 	return __compact_pgdat(pgdat, &cc);
@@ -1203,7 +1138,6 @@ static int compact_node(int nid)
 	struct compact_control cc = {
 		.order = -1,
 		.sync = true,
-		.page = NULL,
 	};
 
 	return __compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/internal.h b/mm/internal.h
index d597f94..9ba2110 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -135,7 +135,6 @@ struct compact_control {
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
 	struct zone *zone;
 	bool contended;			/* True if a lock was contended */
-	struct page **page;		/* Page captured of requested size */
 };
 
 unsigned long
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4ba5e37..7e4ae85 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1389,14 +1389,8 @@ void split_page(struct page *page, unsigned int order)
 		set_page_refcounted(page + i);
 }
 
-/*
- * Similar to the split_page family of functions except that the page
- * required at the given order and being isolated now to prevent races
- * with parallel allocators
- */
-int capture_free_page(struct page *page, int alloc_order, int migratetype)
+static int __isolate_free_page(struct page *page, unsigned int order)
 {
-	unsigned int order;
 	unsigned long watermark;
 	struct zone *zone;
 	int mt;
@@ -1404,7 +1398,6 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 	BUG_ON(!PageBuddy(page));
 
 	zone = page_zone(page);
-	order = page_order(page);
 	mt = get_pageblock_migratetype(page);
 
 	if (mt != MIGRATE_ISOLATE) {
@@ -1413,7 +1406,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 		if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
 			return 0;
 
-		__mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
+		__mod_zone_freepage_state(zone, -(1UL << order), mt);
 	}
 
 	/* Remove page from free list */
@@ -1421,11 +1414,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 	zone->free_area[order].nr_free--;
 	rmv_page_order(page);
 
-	if (alloc_order != order)
-		expand(zone, page, alloc_order, order,
-			&zone->free_area[order], migratetype);
-
-	/* Set the pageblock if the captured page is at least a pageblock */
+	/* Set the pageblock if the isolated page is at least a pageblock */
 	if (order >= pageblock_order - 1) {
 		struct page *endpage = page + (1 << order) - 1;
 		for (; page < endpage; page += pageblock_nr_pages) {
@@ -1436,7 +1425,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 		}
 	}
 
-	return 1UL << alloc_order;
+	return 1UL << order;
 }
 
 /*
@@ -1454,10 +1443,9 @@ int split_free_page(struct page *page)
 	unsigned int order;
 	int nr_pages;
 
-	BUG_ON(!PageBuddy(page));
 	order = page_order(page);
-	nr_pages = capture_free_page(page, order, 0);
+	nr_pages = __isolate_free_page(page, order);
 	if (!nr_pages)
 		return 0;
@@ -2163,8 +2151,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	bool *contended_compaction, bool *deferred_compaction,
 	unsigned long *did_some_progress)
 {
-	struct page *page = NULL;
-
 	if (!order)
 		return NULL;
@@ -2176,16 +2162,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	current->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
 						nodemask, sync_migration,
-						contended_compaction, &page);
+						contended_compaction);
 	current->flags &= ~PF_MEMALLOC;
 
-	/* If compaction captured a page, prep and use it */
-	if (page) {
-		prep_new_page(page, order, gfp_mask);
-		goto got_page;
-	}
-
 	if (*did_some_progress != COMPACT_SKIPPED) {
+		struct page *page;
+
 		/* Page migration frees to the PCP lists but we want merging */
 		drain_pages(get_cpu());
 		put_cpu();
@@ -2195,7 +2177,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 				alloc_flags & ~ALLOC_NO_WATERMARKS,
 				preferred_zone, migratetype);
 		if (page) {
-got_page:
 			preferred_zone->compact_blockskip_flush = false;
 			preferred_zone->compact_considered = 0;
 			preferred_zone->compact_defer_shift = 0;

^ permalink raw reply related	[flat|nested] 53+ messages in thread
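The design choice behind the revert is easiest to see in the
compact_finished() hunk: instead of grabbing a page under zone->lock
while compacting, compaction now only answers whether an allocation at
cc->order could succeed, and the caller takes the page through the
normal get_page_from_freelist() path where watermark checks and page
preparation always run. The self-contained model below mirrors that
free-list check; it is simplified in that the real loop also filters
by migratetype and considers pageblock promotion.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define MAX_ORDER 11

/* Toy version of the check compact_finished() now performs: compaction
 * is "done" once any free list at or above the requested order is
 * non-empty; actually taking the page is left to the allocator. */
static bool suitable_page_free(const size_t nr_free[MAX_ORDER],
                               unsigned int requested_order)
{
	unsigned int order;

	for (order = requested_order; order < MAX_ORDER; order++)
		if (nr_free[order] > 0)
			return true;
	return false;
}

int main(void)
{
	/* many order-0 pages free, plus one THP-sized order-9 page */
	size_t nr_free[MAX_ORDER] = { [0] = 100, [9] = 1 };

	printf("order-2 request satisfiable: %d\n",
	       suitable_page_free(nr_free, 2));
	printf("order-10 request satisfiable: %d\n",
	       suitable_page_free(nr_free, 10));
	return 0;
}

Keeping the allocation itself on the single common path also removes
the class of bug Eric hit: there is no second code path that must
remember to reinitialise page state such as page->pfmemalloc.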
* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-10 19:42                ` Mel Gorman
@ 2013-01-10 20:03                  ` Eric Wong
  2013-01-10 20:58                  ` Eric Dumazet
  2013-01-11 0:51                   ` Eric Wong
  2 siblings, 0 replies; 53+ messages in thread
From: Eric Wong @ 2013-01-10 20:03 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
      Eric Dumazet, Andrew Morton, Linus Torvalds

Mel Gorman <mgorman@suse.de> wrote:
> Thanks Eric, it's much appreciated. However, I'm still very much in
> favour of a partial revert as in retrospect the implementation of
> capture took the wrong approach. Could you confirm the following patch
> works for you? It should functionally have the same effect as the first
> revert and there are only minor changes from the last revert prototype
> I sent you but there is no harm in being sure.

Thanks, I was just about to report back on the last partial revert
being successful :)  Will start testing this one, now.

^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-10 19:42                ` Mel Gorman
  2013-01-10 20:03                  ` Eric Wong
@ 2013-01-10 20:58                  ` Eric Dumazet
  2013-01-11 0:51                   ` Eric Wong
  2 siblings, 0 replies; 53+ messages in thread
From: Eric Dumazet @ 2013-01-10 20:58 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Eric Wong, linux-mm, netdev, linux-kernel, Rik van Riel,
      Minchan Kim, Andrew Morton, Linus Torvalds

On Thu, 2013-01-10 at 19:42 +0000, Mel Gorman wrote:
> Thanks Eric, it's much appreciated. However, I'm still very much in
> favour of a partial revert as in retrospect the implementation of
> capture took the wrong approach. Could you confirm the following patch
> works for you? It should functionally have the same effect as the first
> revert and there are only minor changes from the last revert prototype
> I sent you but there is no harm in being sure.
>
> ---8<---
> mm: compaction: Partially revert capture of suitable high-order page
>
> Eric Wong reported on 3.7 and 3.8-rc2 that ppoll() got stuck when waiting
> for POLLIN on a local TCP socket. It was easier to trigger if there was
> disk IO and dirty pages at the same time and he bisected it to commit
> 1fb3f8ca "mm: compaction: capture a suitable high-order page immediately
> when it is made available".
>
> The intention of that patch was to improve high-order allocations under
> memory pressure after changes made to reclaim in 3.6 drastically hurt
> THP allocations but the approach was flawed. For Eric, the problem was
> that page->pfmemalloc was not being cleared for captured pages leading to
> a poor interaction with swap-over-NFS support causing the packets to be
> dropped. However, I identified a few more problems with the patch
> including the fact that it can increase contention on zone->lock in some
> cases which could result in async direct compaction being aborted early.
>
> In retrospect the capture patch took the wrong approach. What it should
> have done is mark the pageblock being migrated as MIGRATE_ISOLATE if it
> was allocating for THP and avoided races that way. While the patch was
> shown to improve allocation success rates at the time, the benefit is
> marginal given the relative complexity and it should be revisited from
> scratch in the context of the other reclaim-related changes that have
> taken place since the patch was first written and tested. This patch
> partially reverts commit 1fb3f8ca "mm: compaction: capture a suitable
> high-order page immediately when it is made available".
>
> Reported-by: Eric Wong <normalperson@yhbt.net>
> Cc: stable@vger.kernel.org
> Signed-off-by: Mel Gorman <mgorman@suse.de>
> ---

It seems to solve the problem on my kvm testbed (512 MB of ram, 2 vcpus)

Tested-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-10 19:42                ` Mel Gorman
  2013-01-10 20:03                  ` Eric Wong
  2013-01-10 20:58                  ` Eric Dumazet
@ 2013-01-11 0:51                   ` Eric Wong
  2013-01-11 9:30                     ` Mel Gorman
  2 siblings, 1 reply; 53+ messages in thread
From: Eric Wong @ 2013-01-11 0:51 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
      Eric Dumazet, Andrew Morton, Linus Torvalds

Mel Gorman <mgorman@suse.de> wrote:
> mm: compaction: Partially revert capture of suitable high-order page

<snip>

> Reported-by: Eric Wong <normalperson@yhbt.net>
> Cc: stable@vger.kernel.org
> Signed-off-by: Mel Gorman <mgorman@suse.de>

Thanks, my original use case and test work great after several hours!

Tested-by: Eric Wong <normalperson@yhbt.net>

Unfortunately, I also hit a new bug in 3.8 (not in 3.7.x). Based on Eric
Dumazet's observations, sk_stream_wait_memory may be to blame.
Fortunately this is easier to reproduce (I've cc-ed participants
on this thread already): <20130111004915.GA15415@dcvr.yhbt.net>

^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-11 0:51                   ` Eric Wong
@ 2013-01-11 9:30                     ` Mel Gorman
  0 siblings, 0 replies; 53+ messages in thread
From: Mel Gorman @ 2013-01-11 9:30 UTC (permalink / raw)
  To: Eric Wong
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
      Eric Dumazet, Andrew Morton, Linus Torvalds

On Fri, Jan 11, 2013 at 12:51:05AM +0000, Eric Wong wrote:
> Mel Gorman <mgorman@suse.de> wrote:
> > mm: compaction: Partially revert capture of suitable high-order page
>
> <snip>
>
> > Reported-by: Eric Wong <normalperson@yhbt.net>
> > Cc: stable@vger.kernel.org
> > Signed-off-by: Mel Gorman <mgorman@suse.de>
>
> Thanks, my original use case and test work great after several hours!
>
> Tested-by: Eric Wong <normalperson@yhbt.net>

Thanks very much Eric. I've resent the patch to Andrew so it should make
its way to mainline. It'll fail to apply to 3.7-stable but I should get
a notification from Greg when that happens and fix it up.

> Unfortunately, I also hit a new bug in 3.8 (not in 3.7.x). Based on Eric
> Dumazet's observations, sk_stream_wait_memory may be to blame.
> Fortunately this is easier to reproduce (I've cc-ed participants
> on this thread already): <20130111004915.GA15415@dcvr.yhbt.net>

It looks like the relevant fix for this has already been written by
Eric Dumazet and picked up by David Miller.

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-08 22:43            ` Mel Gorman
  2013-01-08 23:23              ` Eric Wong
@ 2013-01-09 21:29              ` Eric Wong
  1 sibling, 0 replies; 53+ messages in thread
From: Eric Wong @ 2013-01-09 21:29 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
      Eric Dumazet, Andrew Morton, Linus Torvalds

Mel Gorman <mgorman@suse.de> wrote:
> When I looked at it for long enough I found a number of problems. Most
> affect timing but two serious issues are in there. One affects how long
> kswapd spends compacting versus reclaiming and the other increases lock
> contention meaning that async compaction can abort early. Both are serious
> and could explain why a driver would fail high-order allocations.
>
> Please try the following patch. However, even if it works the benefit of
> capture may be so marginal that partially reverting it and simplifying
> compaction.c is the better decision.

Btw, I'm still testing this patch with the "page->pfmemalloc = false"
change on top of it.

> diff --git a/mm/compaction.c b/mm/compaction.c
> index 6b807e4..03c82c0 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -857,7 +857,8 @@ static int compact_finished(struct zone *zone,
>  	} else {
>  		unsigned int order;
>  		for (order = cc->order; order < MAX_ORDER; order++) {
> -			struct free_area *area = &zone->free_area[cc->order];
> +			struct free_area *area = &zone->free_area[order];

I noticed something like this hunk wasn't in your latest partial
revert (<20130109135010.GB13475@suse.de>)

I admit I don't understand this code, but this jumped out at me.

^ permalink raw reply	[flat|nested] 53+ messages in thread
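The hunk Eric flags above is a classic loop-index slip: the loop
advances order, but the body indexes the array with the loop-invariant
cc->order, so every iteration re-examines the same free list and the
larger orders are never actually checked. A minimal standalone
illustration of the buggy and corrected shapes (model types, not the
kernel's structures):

#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER 11

struct free_area_model { int nr_free; };

/* Buggy shape from the capture patch: the loop variable advances but
 * the index stays at the requested order. */
static bool scan_buggy(const struct free_area_model area[MAX_ORDER],
                       unsigned int requested)
{
	unsigned int order;

	for (order = requested; order < MAX_ORDER; order++)
		if (area[requested].nr_free) /* BUG: should be area[order] */
			return true;
	return false;
}

/* Corrected shape, matching the hunk quoted above. */
static bool scan_fixed(const struct free_area_model area[MAX_ORDER],
                       unsigned int requested)
{
	unsigned int order;

	for (order = requested; order < MAX_ORDER; order++)
		if (area[order].nr_free)
			return true;
	return false;
}

int main(void)
{
	struct free_area_model area[MAX_ORDER] = {{ 0 }};

	area[5].nr_free = 1; /* only an order-5 page is free */
	printf("buggy scan finds it for an order-2 request: %d\n",
	       scan_buggy(area, 2));
	printf("fixed scan finds it for an order-2 request: %d\n",
	       scan_fixed(area, 2));
	return 0;
}

Mel's final revert sidesteps the question entirely: the rewritten
compact_finished() loop indexes free_area[order] directly, as in the
patch earlier in the thread.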
end of thread, other threads:[~2013-01-11 9:30 UTC | newest]

Thread overview: 53+ messages
2012-12-28 1:45 ppoll() stuck on POLLIN while TCP peer is sending Eric Wong
2012-12-28 7:06 ` Eric Wong
2012-12-29 11:34 ` Eric Wong
2012-12-31 13:21 ` [PATCH] poll: prevent missed events if _qproc is NULL Eric Wong
2012-12-31 23:24 ` Eric Wong
2013-01-01 16:58 ` Junchang(Jason) Wang
2013-01-01 18:42 ` Eric Dumazet
2013-01-01 21:00 ` Eric Wong
2013-01-01 21:17 ` Eric Wong
2013-01-01 22:53 ` Linus Torvalds
2013-01-01 23:21 ` Junchang(Jason) Wang
2013-01-01 23:56 ` [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD Eric Wong
2013-01-02 17:45 ` Eric Dumazet
2013-01-02 18:40 ` Eric Wong
2013-01-02 19:03 ` Eric Dumazet
2013-01-02 19:32 ` Eric Wong
2013-01-02 22:08 ` Eric Dumazet
2013-01-02 21:16 ` Eric Wong
2013-01-02 20:08 ` ppoll() stuck on POLLIN while TCP peer is sending Eric Wong
2013-01-02 20:47 ` Eric Wong
2013-01-03 13:41 ` Eric Dumazet
2013-01-03 18:32 ` Eric Wong
2013-01-03 23:45 ` Eric Wong
2013-01-04 0:26 ` Eric Wong
2013-01-04 3:52 ` Eric Wong
2013-01-04 16:01 ` Mel Gorman
2013-01-04 17:15 ` Eric Dumazet
2013-01-04 17:59 ` Eric Wong
2013-01-05 1:07 ` Eric Wong
2013-01-06 12:07 ` Eric Wong
2013-01-07 12:25 ` Mel Gorman
2013-01-07 22:38 ` Eric Dumazet
2013-01-08 0:21 ` Eric Wong
2013-01-07 22:38 ` Eric Wong
2013-01-08 20:14 ` Eric Wong
2013-01-08 22:43 ` Mel Gorman
2013-01-08 23:23 ` Eric Wong
2013-01-09 2:14 ` Eric Dumazet
2013-01-09 2:32 ` Eric Dumazet
2013-01-09 2:54 ` Eric Dumazet
2013-01-09 3:55 ` Eric Wong
2013-01-09 8:42 ` Eric Wong
2013-01-09 8:51 ` Eric Wong
2013-01-09 13:42 ` Mel Gorman
2013-01-09 13:37 ` Mel Gorman
2013-01-09 13:50 ` Mel Gorman
2013-01-10 9:25 ` Eric Wong
2013-01-10 19:42 ` Mel Gorman
2013-01-10 20:03 ` Eric Wong
2013-01-10 20:58 ` Eric Dumazet
2013-01-11 0:51 ` Eric Wong
2013-01-11 9:30 ` Mel Gorman
2013-01-09 21:29 ` Eric Wong