Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH net-next-2.6] net: sk_sleep() helper
From: Eric Dumazet @ 2010-04-26  8:20 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20100420.163919.82125824.davem@davemloft.net>

Le mardi 20 avril 2010 à 16:39 -0700, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Wed, 21 Apr 2010 01:03:51 +0200
> 
> > [PATCH net-next-2.6] net: sk_sleep() helper
> 
> Looks good, applied, thanks Eric.

Here is a followup to this patch, I missed three files in this
conversion.

Thanks !

[PATCH net-next-2.6] net: use sk_sleep()

Commit aa395145 (net: sk_sleep() helper) missed three files in the
conversion.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 drivers/net/macvtap.c  |    6 +++---
 net/caif/caif_socket.c |   30 +++++++++++++++---------------
 net/rxrpc/ar-recvmsg.c |    6 +++---
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 85d6420..d97e1fd 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -181,7 +181,7 @@ static int macvtap_forward(struct net_device *dev, struct sk_buff *skb)
 		return -ENOLINK;
 
 	skb_queue_tail(&q->sk.sk_receive_queue, skb);
-	wake_up_interruptible_poll(q->sk.sk_sleep, POLLIN | POLLRDNORM | POLLRDBAND);
+	wake_up_interruptible_poll(sk_sleep(&q->sk), POLLIN | POLLRDNORM | POLLRDBAND);
 	return 0;
 }
 
@@ -562,7 +562,7 @@ static ssize_t macvtap_do_read(struct macvtap_queue *q, struct kiocb *iocb,
 	struct sk_buff *skb;
 	ssize_t ret = 0;
 
-	add_wait_queue(q->sk.sk_sleep, &wait);
+	add_wait_queue(sk_sleep(&q->sk), &wait);
 	while (len) {
 		current->state = TASK_INTERRUPTIBLE;
 
@@ -587,7 +587,7 @@ static ssize_t macvtap_do_read(struct macvtap_queue *q, struct kiocb *iocb,
 	}
 
 	current->state = TASK_RUNNING;
-	remove_wait_queue(q->sk.sk_sleep, &wait);
+	remove_wait_queue(sk_sleep(&q->sk), &wait);
 	return ret;
 }
 
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 90317e7..d455375 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -169,7 +169,7 @@ static int caif_sktrecv_cb(struct cflayer *layr, struct cfpkt *pkt)
 
 	/* Signal reader that data is available. */
 
-	wake_up_interruptible(cf_sk->sk.sk_sleep);
+	wake_up_interruptible(sk_sleep(&cf_sk->sk));
 
 	return 0;
 }
@@ -203,7 +203,7 @@ static void caif_sktflowctrl_cb(struct cflayer *layr,
 		dbfs_atomic_inc(&cnt.num_tx_flow_on_ind);
 		/* Signal reader that data is available. */
 		SET_TX_FLOW_ON(cf_sk);
-		wake_up_interruptible(cf_sk->sk.sk_sleep);
+		wake_up_interruptible(sk_sleep(&cf_sk->sk));
 		break;
 
 	case CAIF_CTRLCMD_FLOW_OFF_IND:
@@ -217,7 +217,7 @@ static void caif_sktflowctrl_cb(struct cflayer *layr,
 		caif_assert(STATE_IS_OPEN(cf_sk));
 		SET_PENDING_OFF(cf_sk);
 		SET_TX_FLOW_ON(cf_sk);
-		wake_up_interruptible(cf_sk->sk.sk_sleep);
+		wake_up_interruptible(sk_sleep(&cf_sk->sk));
 		break;
 
 	case CAIF_CTRLCMD_DEINIT_RSP:
@@ -225,8 +225,8 @@ static void caif_sktflowctrl_cb(struct cflayer *layr,
 		caif_assert(!STATE_IS_OPEN(cf_sk));
 		SET_PENDING_OFF(cf_sk);
 		if (!STATE_IS_PENDING_DESTROY(cf_sk)) {
-			if (cf_sk->sk.sk_sleep != NULL)
-				wake_up_interruptible(cf_sk->sk.sk_sleep);
+			if (sk_sleep(&cf_sk->sk) != NULL)
+				wake_up_interruptible(sk_sleep(&cf_sk->sk));
 		}
 		dbfs_atomic_inc(&cnt.num_deinit);
 		sock_put(&cf_sk->sk);
@@ -238,7 +238,7 @@ static void caif_sktflowctrl_cb(struct cflayer *layr,
 		SET_STATE_CLOSED(cf_sk);
 		SET_PENDING_OFF(cf_sk);
 		SET_TX_FLOW_OFF(cf_sk);
-		wake_up_interruptible(cf_sk->sk.sk_sleep);
+		wake_up_interruptible(sk_sleep(&cf_sk->sk));
 		break;
 
 	case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND:
@@ -247,7 +247,7 @@ static void caif_sktflowctrl_cb(struct cflayer *layr,
 		/* Use sk_shutdown to indicate remote shutdown indication */
 		cf_sk->sk.sk_shutdown |= RCV_SHUTDOWN;
 		cf_sk->file_mode = 0;
-		wake_up_interruptible(cf_sk->sk.sk_sleep);
+		wake_up_interruptible(sk_sleep(&cf_sk->sk));
 		break;
 
 	default:
@@ -325,7 +325,7 @@ static int caif_recvmsg(struct kiocb *iocb, struct socket *sock,
 		release_sock(&cf_sk->sk);
 
 		result =
-		    wait_event_interruptible(*cf_sk->sk.sk_sleep,
+		    wait_event_interruptible(*sk_sleep(&cf_sk->sk),
 					     !STATE_IS_PENDING(cf_sk));
 
 		lock_sock(&(cf_sk->sk));
@@ -365,7 +365,7 @@ static int caif_recvmsg(struct kiocb *iocb, struct socket *sock,
 		release_sock(&cf_sk->sk);
 
 		/* Block reader until data arrives or socket is closed. */
-		if (wait_event_interruptible(*cf_sk->sk.sk_sleep,
+		if (wait_event_interruptible(*sk_sleep(&cf_sk->sk),
 					cfpkt_qpeek(cf_sk->pktq)
 					|| STATE_IS_REMOTE_SHUTDOWN(cf_sk)
 					|| !STATE_IS_OPEN(cf_sk)) ==
@@ -537,7 +537,7 @@ static int caif_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		 * for its conclusion.
 		 */
 		result =
-		    wait_event_interruptible(*cf_sk->sk.sk_sleep,
+		    wait_event_interruptible(*sk_sleep(&cf_sk->sk),
 					     !STATE_IS_PENDING(cf_sk));
 		/* I want to be alone on cf_sk (except status and queue) */
 		lock_sock(&(cf_sk->sk));
@@ -573,7 +573,7 @@ static int caif_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		release_sock(&cf_sk->sk);
 
 		/* Wait until flow is on or socket is closed */
-		if (wait_event_interruptible(*cf_sk->sk.sk_sleep,
+		if (wait_event_interruptible(*sk_sleep(&cf_sk->sk),
 					TX_FLOW_IS_ON(cf_sk)
 					|| !STATE_IS_OPEN(cf_sk)
 					|| STATE_IS_REMOTE_SHUTDOWN(cf_sk)
@@ -650,7 +650,7 @@ static int caif_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		release_sock(&cf_sk->sk);
 
 		/* Wait until flow is on or socket is closed */
-		if (wait_event_interruptible(*cf_sk->sk.sk_sleep,
+		if (wait_event_interruptible(*sk_sleep(&cf_sk->sk),
 					TX_FLOW_IS_ON(cf_sk)
 					|| !STATE_IS_OPEN(cf_sk)
 					|| STATE_IS_REMOTE_SHUTDOWN(cf_sk)
@@ -898,7 +898,7 @@ static int caif_connect(struct socket *sock, struct sockaddr *uservaddr,
 			 * for its conclusion.
 			 */
 			result =
-			    wait_event_interruptible(*cf_sk->sk.sk_sleep,
+			    wait_event_interruptible(*sk_sleep(&cf_sk->sk),
 						     !STATE_IS_PENDING(cf_sk));
 
 			lock_sock(&(cf_sk->sk));
@@ -965,7 +965,7 @@ static int caif_connect(struct socket *sock, struct sockaddr *uservaddr,
 		release_sock(&cf_sk->sk);
 
 		result =
-		    wait_event_interruptible(*cf_sk->sk.sk_sleep,
+		    wait_event_interruptible(*sk_sleep(&cf_sk->sk),
 					     !STATE_IS_PENDING(cf_sk));
 
 		lock_sock(&(cf_sk->sk));
@@ -1107,7 +1107,7 @@ static int caif_release(struct socket *sock)
 	 * CAIF stack.
 	 */
 	if (!(sock->file->f_flags & O_NONBLOCK)) {
-		res = wait_event_interruptible(*cf_sk->sk.sk_sleep,
+		res = wait_event_interruptible(*sk_sleep(&cf_sk->sk),
 						!STATE_IS_PENDING(cf_sk));
 
 		if (res == -ERESTARTSYS) {
diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c
index 60c2b94..0c65013 100644
--- a/net/rxrpc/ar-recvmsg.c
+++ b/net/rxrpc/ar-recvmsg.c
@@ -91,7 +91,7 @@ int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock,
 
 			/* wait for a message to turn up */
 			release_sock(&rx->sk);
-			prepare_to_wait_exclusive(rx->sk.sk_sleep, &wait,
+			prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait,
 						  TASK_INTERRUPTIBLE);
 			ret = sock_error(&rx->sk);
 			if (ret)
@@ -102,7 +102,7 @@ int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock,
 					goto wait_interrupted;
 				timeo = schedule_timeout(timeo);
 			}
-			finish_wait(rx->sk.sk_sleep, &wait);
+			finish_wait(sk_sleep(&rx->sk), &wait);
 			lock_sock(&rx->sk);
 			continue;
 		}
@@ -356,7 +356,7 @@ csum_copy_error:
 wait_interrupted:
 	ret = sock_intr_errno(timeo);
 wait_error:
-	finish_wait(rx->sk.sk_sleep, &wait);
+	finish_wait(sk_sleep(&rx->sk), &wait);
 	if (continue_call)
 		rxrpc_put_call(continue_call);
 	if (copied)



^ permalink raw reply related

* Re: [PATCH 1/7] Topcliff GbE: Add The Main code [1/3]
From: Masayuki Ohtake @ 2010-04-26  8:00 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: NETDEV, Wang, Yong Y, Wang, Qi, Intel OTC, Andrew
In-Reply-To: <20100423082624.147de667@nehalam>

On: Fri, 23 Apr 2010 08:26:24 -0700, Stephen Hemminger
<shemminger@vyatta.com> wrote.

Hi Stephen

Thank you for your review.
We will confirm them and update the new patch.
Our company have a vacation.
Therefore we will update patch after May 6.

Best regards,
Masayuki Ohtake <masa-korg@dsn.okisemi.com>


^ permalink raw reply

* Re: [PATCH] Topcliff PHUB: Add The Packet Hub driver [1/2]
From: Masayuki Ohtake @ 2010-04-26  7:53 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: NETDEV, Wang, Yong Y, Wang, Qi, andrew.chih.howe.khor
In-Reply-To: <201004231657.57466.arnd@arndb.de>

On: Fri, 23 Apr 2010 16:57:57 +0200, Arnd Bergmann <arnd@arndb.de>  wrote:

Hi Arnd

Thank you for your review.
We will confirm them and update the new patch.
Our company have a vacation.
Therefore we will update patch after May 6.

Best regards,
Masayuki Ohtake <masa-korg@dsn.okisemi.com>

^ permalink raw reply

* Re: [PATCH 1/7] Topcliff GbE: Add The Main code
From: Masayuki Ohtake @ 2010-04-26  7:53 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: NETDEV, Wang, Yong Y, Wang, Qi, Intel OTC, Andrew
In-Reply-To: <201004231727.23246.arnd@arndb.de>

On: Fri, 23 Apr 2010 17:27:23 +0200, Arnd Bergmann <arnd@arndb.de> wrote.

Hi Arnd

Thank you for your review.
We will confirm them and update the new patch.
Our company have a vacation.
Therefore we will update patch after May 6.

Best regards,
Masayuki Ohtake <masa-korg@dsn.okisemi.com>

^ permalink raw reply

* Re: RPS and forwarding
From: Herbert Xu @ 2010-04-26  7:41 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, therbert, netdev
In-Reply-To: <1272267534.2346.7.camel@edumazet-laptop>

On Mon, Apr 26, 2010 at 09:38:54AM +0200, Eric Dumazet wrote:
>
> Ah OK, with same rxhash I guess ?

It's worse than that I think, because the actual table contains
less than 4G entries.  So either the same rxhash or the same index
in the table.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: RPS and forwarding
From: Eric Dumazet @ 2010-04-26  7:38 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David Miller, therbert, netdev
In-Reply-To: <20100426073017.GA22803@gondor.apana.org.au>

Le lundi 26 avril 2010 à 15:30 +0800, Herbert Xu a écrit :
> On Mon, Apr 26, 2010 at 09:25:16AM +0200, Eric Dumazet wrote:
> >
> > RFS is already working like that, if RPS is not used in conjonction.
> > 
> > Forwarded traffic will see a non tagged rfs flow entry, so this skb will
> > be processed by this cpu, not a remote one.
> > 
> > Currently, only an application can set a RFS flow entry.
> 
> I was referring to the case when a forwarded flow gets hashed to
> the same index as a local flow.

Ah OK, with same rxhash I guess ?

(we could store rxhash next to cpu in struct rps_dev_flow, or 16bit part
of it in the existing padding)




^ permalink raw reply

* Re: RPS and forwarding
From: Herbert Xu @ 2010-04-26  7:30 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, therbert, netdev
In-Reply-To: <1272266716.2346.5.camel@edumazet-laptop>

On Mon, Apr 26, 2010 at 09:25:16AM +0200, Eric Dumazet wrote:
>
> RFS is already working like that, if RPS is not used in conjonction.
> 
> Forwarded traffic will see a non tagged rfs flow entry, so this skb will
> be processed by this cpu, not a remote one.
> 
> Currently, only an application can set a RFS flow entry.

I was referring to the case when a forwarded flow gets hashed to
the same index as a local flow.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: RPS and forwarding
From: Eric Dumazet @ 2010-04-26  7:25 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David Miller, therbert, netdev
In-Reply-To: <20100426063136.GA22401@gondor.apana.org.au>

Le lundi 26 avril 2010 à 14:31 +0800, Herbert Xu a écrit :
> On Sun, Apr 25, 2010 at 11:28:57PM -0700, David Miller wrote:

> 
> > It might be possible to somehow make the sock table get bypassed for
> > forwarded traffic, but I can't think of a cheap way to do that at
> > the moment.
> 
> Yeah I thought about that too but as we need to go through the
> routing table before we know whether something is forwarded or
> not, it certainly seems non-trivial.
> 
> Hmm, maybe if we used a routing cache keyed by rxhash we could
> make an appromixation? After all, we only need to make sure that
> no forwarded traffic is redirected, and not the reverse.  IOW,
> it's OK if we incorrectly classify some local traffic as forwarded.
> 

RFS is already working like that, if RPS is not used in conjonction.

Forwarded traffic will see a non tagged rfs flow entry, so this skb will
be processed by this cpu, not a remote one.

Currently, only an application can set a RFS flow entry.




^ permalink raw reply

* Re: [PATCH 1/2] net: reimplement completion_queue as a FIFO queue
From: David Miller @ 2010-04-26  7:22 UTC (permalink / raw)
  To: eric.dumazet; +Cc: xiaosuo, netdev
In-Reply-To: <1272266509.2346.2.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 26 Apr 2010 09:21:49 +0200

> Le dimanche 25 avril 2010 à 23:27 -0700, David Miller a écrit :
> 
>> No matter what is faster, we have to process packets in
>> FIFO order, otherwise we get reordering within a flow
>> which is to be absolutely avoided.
> 
> Hmm, completion queue is about freeing skb, and its name is misleading.
> 
> This queue is feed by dev_kfree_skb_irq() only

Ok I see, thanks for explaining.

^ permalink raw reply

* Re: [PATCH 1/2] net: reimplement completion_queue as a FIFO queue
From: Eric Dumazet @ 2010-04-26  7:21 UTC (permalink / raw)
  To: David Miller; +Cc: xiaosuo, netdev
In-Reply-To: <20100425.232758.101478192.davem@davemloft.net>

Le dimanche 25 avril 2010 à 23:27 -0700, David Miller a écrit :

> No matter what is faster, we have to process packets in
> FIFO order, otherwise we get reordering within a flow
> which is to be absolutely avoided.

Hmm, completion queue is about freeing skb, and its name is misleading.

This queue is feed by dev_kfree_skb_irq() only




^ permalink raw reply

* Re: RPS and forwarding
From: Herbert Xu @ 2010-04-26  6:31 UTC (permalink / raw)
  To: David Miller; +Cc: therbert, netdev
In-Reply-To: <20100425.232857.145290769.davem@davemloft.net>

On Sun, Apr 25, 2010 at 11:28:57PM -0700, David Miller wrote:
>
> Right, none of this stuff is on by default.

Great, that puts my mind at ease :)

> It might be possible to somehow make the sock table get bypassed for
> forwarded traffic, but I can't think of a cheap way to do that at
> the moment.

Yeah I thought about that too but as we need to go through the
routing table before we know whether something is forwarded or
not, it certainly seems non-trivial.

Hmm, maybe if we used a routing cache keyed by rxhash we could
make an appromixation? After all, we only need to make sure that
no forwarded traffic is redirected, and not the reverse.  IOW,
it's OK if we incorrectly classify some local traffic as forwarded.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: RPS and forwarding
From: David Miller @ 2010-04-26  6:28 UTC (permalink / raw)
  To: therbert; +Cc: herbert, netdev
In-Reply-To: <r2n65634d661004252200x4e7a49f1h181cdd69b3e4ebc0@mail.gmail.com>

From: Tom Herbert <therbert@google.com>
Date: Sun, 25 Apr 2010 22:00:09 -0700

>> Right.  The problem is that if you run a distribution kernel on
>> a router with CONFIG_RPS (and hence RFS) enabled, you may suddenly
>> start seeing packet reordering on forwarded traffic due to the
>> presence of local traffic.
>>
>> Can we perhaps add a run-time toggle to disable RFS?
>>
> RFS is not on at run-time by default.  The number of entries in the
> global_rps_sock table needs to be set in sysctl, and the number of
> entries in rps_dev_flow_table needs to be set per RX queue in sysfs.
> You can turn it on for some devices, but not others if that helps.

Right, none of this stuff is on by default.

It might be possible to somehow make the sock table get bypassed for
forwarded traffic, but I can't think of a cheap way to do that at
the moment.

^ permalink raw reply

* Re: [PATCH 1/2] net: reimplement completion_queue as a FIFO queue
From: David Miller @ 2010-04-26  6:27 UTC (permalink / raw)
  To: eric.dumazet; +Cc: xiaosuo, netdev
In-Reply-To: <1272262154.2069.1032.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 26 Apr 2010 08:09:14 +0200

> LIFO is the slub behavior, for the exact reason its better for cache
> reuse. Same for completion queue.
> 
> I repeat : 
> - slub dont touch objects in normal situations.
> - LIFO is better for caches.
> - LIFO is faster (one pointer for the queue head)

No matter what is faster, we have to process packets in
FIFO order, otherwise we get reordering within a flow
which is to be absolutely avoided.

^ permalink raw reply

* Re: [PATCH 1/2] net: reimplement completion_queue as a FIFO queue
From: Eric Dumazet @ 2010-04-26  6:09 UTC (permalink / raw)
  To: Changli Gao; +Cc: David S. Miller, netdev
In-Reply-To: <q2w412e6f7f1004252244s3a5000eft850c2a5dc0fd72d1@mail.gmail.com>

Le lundi 26 avril 2010 à 13:44 +0800, Changli Gao a écrit :
> On Mon, Apr 26, 2010 at 1:14 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > Le lundi 26 avril 2010 à 11:20 +0800, Changli Gao a écrit :
> >> reimplement completion_queue as a FIFO queue.
> >>
> >> As slab allocator always does its best to return the latest unused objects, we'd
> >> better release skb in order, this patch reimplement completion_queue as a FIFO
> >> queue instead of the old LIFO queue.
> >
> >
> > 1) New devices dont use completion queue.
> 
> It is good enough reason for rejection.
> 

I said, this part of the kernel is not used today.

> >
> > 2) Hot objects are the last enqueued.
> >   If many objects were queued, the old one are not hot anymore.
> >
> >   Using FIFO will give more cache misses, when walking the chain
> > (skb->next)
> >
> 
> I meaned that slab allocator maintains objects in the LIFO manner, and
> if we call kmem_cache_alloc(cache) after a kmem_cache_free(cache)
> call, the last object freed by kmem_cache_free(cache) will be
> returned, as this object are more likely in cache and hot. If we don't
> realse skbs in FIFO manner, the last and hot one will not been
> returned at first.
> 

But your patch doesnt change this behavior. This is irrelevant.

LIFO is the slub behavior, for the exact reason its better for cache
reuse. Same for completion queue.

I repeat : 
- slub dont touch objects in normal situations.
- LIFO is better for caches.
- LIFO is faster (one pointer for the queue head)



^ permalink raw reply

* Re: [PATCH 1/2] net: reimplement completion_queue as a FIFO queue
From: Changli Gao @ 2010-04-26  5:44 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, netdev
In-Reply-To: <1272258841.2069.968.camel@edumazet-laptop>

On Mon, Apr 26, 2010 at 1:14 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le lundi 26 avril 2010 à 11:20 +0800, Changli Gao a écrit :
>> reimplement completion_queue as a FIFO queue.
>>
>> As slab allocator always does its best to return the latest unused objects, we'd
>> better release skb in order, this patch reimplement completion_queue as a FIFO
>> queue instead of the old LIFO queue.
>>
>
> 1) New devices dont use completion queue.

It is good enough reason for rejection.

>
> 2) Hot objects are the last enqueued.
>   If many objects were queued, the old one are not hot anymore.
>
>   Using FIFO will give more cache misses, when walking the chain
> (skb->next)
>

I meaned that slab allocator maintains objects in the LIFO manner, and
if we call kmem_cache_alloc(cache) after a kmem_cache_free(cache)
call, the last object freed by kmem_cache_free(cache) will be
returned, as this object are more likely in cache and hot. If we don't
realse skbs in FIFO manner, the last and hot one will not been
returned at first.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: RPS and forwarding
From: Eric Dumazet @ 2010-04-26  5:15 UTC (permalink / raw)
  To: Tom Herbert; +Cc: Herbert Xu, David S. Miller, netdev
In-Reply-To: <z2w65634d661004252144z1bf72880wdf7a0143b8490ddf@mail.gmail.com>

Le dimanche 25 avril 2010 à 21:44 -0700, Tom Herbert a écrit :
> > Apart from not using RPS on routers, I suppose people doing
> > forwarding will simply have to maintain a constant RPS table,
> > and forgo its local redirection capabilities.
> >
> 
> I think you'd want RPS on router (steering packets using hash and CPU
> mask).  Most of your concerns are about RFS (steering by flows), which
> is probably more appropriate for a server.

We could probably rename RFS parts to RFS to avoid confusion ?




^ permalink raw reply

* Re: [PATCH 1/2] net: reimplement completion_queue as a FIFO queue
From: Eric Dumazet @ 2010-04-26  5:14 UTC (permalink / raw)
  To: Changli Gao; +Cc: David S. Miller, netdev
In-Reply-To: <1272252016-11755-1-git-send-email-xiaosuo@gmail.com>

Le lundi 26 avril 2010 à 11:20 +0800, Changli Gao a écrit :
> reimplement completion_queue as a FIFO queue.
> 
> As slab allocator always does its best to return the latest unused objects, we'd
> better release skb in order, this patch reimplement completion_queue as a FIFO
> queue instead of the old LIFO queue.
> 

1) New devices dont use completion queue.

2) Hot objects are the last enqueued.
   If many objects were queued, the old one are not hot anymore.

   Using FIFO will give more cache misses, when walking the chain
(skb->next)

3) Claim about slab allocators properties are irrelevant. SLUB doesnt
touch objects, unless some debugging is on.

> At the same time, a opencoding skb queue is replaced with a sk_buff_head queue.
> 

Yes, this is a pro, yet it cost a bit more, because of extra things
(qlen management, and two pointers per object)

For example, compare text size before and after the patch.

If you feel open coding a LIFO is error prone, you could add generic
primitives to kernel ?

> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ----
>  include/linux/netdevice.h |    2 +-
>  net/core/dev.c            |   28 ++++++++++------------------
>  net/core/netpoll.c        |   12 +++++-------
>  3 files changed, 16 insertions(+), 26 deletions(-)
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 3c5ed5f..785e514 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1386,7 +1386,7 @@ static inline int unregister_gifconf(unsigned int family)
>  struct softnet_data {
>  	struct Qdisc		*output_queue;
>  	struct list_head	poll_list;
> -	struct sk_buff		*completion_queue;
> +	struct sk_buff_head	completion_queue;
>  
>  #ifdef CONFIG_RPS
>  	struct softnet_data	*rps_ipi_list;
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 4d43f1a..5bbfa0f 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -1578,8 +1578,7 @@ void dev_kfree_skb_irq(struct sk_buff *skb)
>  
>  		local_irq_save(flags);
>  		sd = &__get_cpu_var(softnet_data);
> -		skb->next = sd->completion_queue;
> -		sd->completion_queue = skb;
> +		__skb_queue_tail(&sd->completion_queue, skb);
>  		raise_softirq_irqoff(NET_TX_SOFTIRQ);
>  		local_irq_restore(flags);
>  	}
> @@ -2506,18 +2505,16 @@ static void net_tx_action(struct softirq_action *h)
>  {
>  	struct softnet_data *sd = &__get_cpu_var(softnet_data);
>  
> -	if (sd->completion_queue) {
> -		struct sk_buff *clist;
> +	if (!skb_queue_empty(&sd->completion_queue)) {
> +		struct sk_buff_head queue;
> +		struct sk_buff *skb;
>  
> +		__skb_queue_head_init(&queue);
>  		local_irq_disable();
> -		clist = sd->completion_queue;
> -		sd->completion_queue = NULL;
> +		skb_queue_splice_tail_init(&sd->completion_queue, &queue);
>  		local_irq_enable();
>  
> -		while (clist) {
> -			struct sk_buff *skb = clist;
> -			clist = clist->next;
> -
> +		while ((skb = __skb_dequeue(&queue))) {
>  			WARN_ON(atomic_read(&skb->users));
>  			__kfree_skb(skb);
>  		}
> @@ -5593,7 +5590,6 @@ static int dev_cpu_callback(struct notifier_block *nfb,
>  			    unsigned long action,
>  			    void *ocpu)
>  {
> -	struct sk_buff **list_skb;
>  	struct Qdisc **list_net;
>  	struct sk_buff *skb;
>  	unsigned int cpu, oldcpu = (unsigned long)ocpu;
> @@ -5607,13 +5603,9 @@ static int dev_cpu_callback(struct notifier_block *nfb,
>  	sd = &per_cpu(softnet_data, cpu);
>  	oldsd = &per_cpu(softnet_data, oldcpu);
>  
> -	/* Find end of our completion_queue. */
> -	list_skb = &sd->completion_queue;
> -	while (*list_skb)
> -		list_skb = &(*list_skb)->next;
>  	/* Append completion queue from offline CPU. */
> -	*list_skb = oldsd->completion_queue;
> -	oldsd->completion_queue = NULL;
> +	skb_queue_splice_tail_init(&oldsd->completion_queue,
> +				   &sd->completion_queue);
>  
>  	/* Find end of our output_queue. */
>  	list_net = &sd->output_queue;
> @@ -5849,7 +5841,7 @@ static int __init net_dev_init(void)
>  		struct softnet_data *sd = &per_cpu(softnet_data, i);
>  
>  		skb_queue_head_init(&sd->input_pkt_queue);
> -		sd->completion_queue = NULL;
> +		__skb_queue_head_init(&sd->completion_queue);
>  		INIT_LIST_HEAD(&sd->poll_list);
>  
>  #ifdef CONFIG_RPS
> diff --git a/net/core/netpoll.c b/net/core/netpoll.c
> index a58f59b..3905a14 100644
> --- a/net/core/netpoll.c
> +++ b/net/core/netpoll.c
> @@ -222,17 +222,15 @@ static void zap_completion_queue(void)
>  	unsigned long flags;
>  	struct softnet_data *sd = &get_cpu_var(softnet_data);
>  
> -	if (sd->completion_queue) {
> -		struct sk_buff *clist;
> +	if (!skb_queue_empty(&sd->completion_queue)) {
> +		struct sk_buff_head queue;
> +		struct sk_buff *skb;
>  
>  		local_irq_save(flags);
> -		clist = sd->completion_queue;
> -		sd->completion_queue = NULL;
> +		skb_queue_splice_tail_init(&sd->completion_queue, &queue);
>  		local_irq_restore(flags);
>  
> -		while (clist != NULL) {
> -			struct sk_buff *skb = clist;
> -			clist = clist->next;
> +		while ((skb = __skb_dequeue(&queue))) {
>  			if (skb->destructor) {
>  				atomic_inc(&skb->users);
>  				dev_kfree_skb_any(skb); /* put this one back */
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 



^ permalink raw reply

* Re: RPS and forwarding
From: Tom Herbert @ 2010-04-26  5:00 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David S. Miller, netdev
In-Reply-To: <20100426045316.GA21810@gondor.apana.org.au>

> Right.  The problem is that if you run a distribution kernel on
> a router with CONFIG_RPS (and hence RFS) enabled, you may suddenly
> start seeing packet reordering on forwarded traffic due to the
> presence of local traffic.
>
> Can we perhaps add a run-time toggle to disable RFS?
>
RFS is not on at run-time by default.  The number of entries in the
global_rps_sock table needs to be set in sysctl, and the number of
entries in rps_dev_flow_table needs to be set per RX queue in sysfs.
You can turn it on for some devices, but not others if that helps.

> Cheers,
> --
> Visit Openswan at http://www.openswan.org/
> Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
>

^ permalink raw reply

* Re: RPS and forwarding
From: Herbert Xu @ 2010-04-26  4:53 UTC (permalink / raw)
  To: Tom Herbert; +Cc: David S. Miller, netdev
In-Reply-To: <z2w65634d661004252144z1bf72880wdf7a0143b8490ddf@mail.gmail.com>

On Sun, Apr 25, 2010 at 09:44:05PM -0700, Tom Herbert wrote:
> 
> I think you'd want RPS on router (steering packets using hash and CPU
> mask).  Most of your concerns are about RFS (steering by flows), which
> is probably more appropriate for a server.

Right.  The problem is that if you run a distribution kernel on
a router with CONFIG_RPS (and hence RFS) enabled, you may suddenly
start seeing packet reordering on forwarded traffic due to the
presence of local traffic.

Can we perhaps add a run-time toggle to disable RFS?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: RPS and forwarding
From: Tom Herbert @ 2010-04-26  4:44 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David S. Miller, netdev
In-Reply-To: <20100426022415.GA20323@gondor.apana.org.au>

> Apart from not using RPS on routers, I suppose people doing
> forwarding will simply have to maintain a constant RPS table,
> and forgo its local redirection capabilities.
>

I think you'd want RPS on router (steering packets using hash and CPU
mask).  Most of your concerns are about RFS (steering by flows), which
is probably more appropriate for a server.

Tom

> Cheers,
> --
> Visit Openswan at http://www.openswan.org/
> Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply

* [PATCH 2/2] net: reimplement softnet_data.output_queue as a FIFO queue
From: Changli Gao @ 2010-04-26  3:26 UTC (permalink / raw)
  To: David S. Miller; +Cc: Eric Dumazet, netdev, Changli Gao

reimplement softnet_data.output_queue as a FIFO queue.

reimplement softnet_data.output_queue as a FIFO queue to keep the fairness among
the qdiscs rescheduled.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
 include/linux/netdevice.h |    1 +
 net/core/dev.c            |   22 ++++++++++++----------
 2 files changed, 13 insertions(+), 10 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 785e514..7984c15 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1385,6 +1385,7 @@ static inline int unregister_gifconf(unsigned int family)
  */
 struct softnet_data {
 	struct Qdisc		*output_queue;
+	struct Qdisc		**output_queue_tailp;
 	struct list_head	poll_list;
 	struct sk_buff_head	completion_queue;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 5bbfa0f..b2b7869 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1557,8 +1557,9 @@ static inline void __netif_reschedule(struct Qdisc *q)
 
 	local_irq_save(flags);
 	sd = &__get_cpu_var(softnet_data);
-	q->next_sched = sd->output_queue;
-	sd->output_queue = q;
+	q->next_sched = NULL;
+	*sd->output_queue_tailp = q;
+	sd->output_queue_tailp = &q->next_sched;
 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 	local_irq_restore(flags);
 }
@@ -2526,6 +2527,7 @@ static void net_tx_action(struct softirq_action *h)
 		local_irq_disable();
 		head = sd->output_queue;
 		sd->output_queue = NULL;
+		sd->output_queue_tailp = &sd->output_queue;
 		local_irq_enable();
 
 		while (head) {
@@ -5590,7 +5592,6 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 			    unsigned long action,
 			    void *ocpu)
 {
-	struct Qdisc **list_net;
 	struct sk_buff *skb;
 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
 	struct softnet_data *sd, *oldsd;
@@ -5607,13 +5608,12 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 	skb_queue_splice_tail_init(&oldsd->completion_queue,
 				   &sd->completion_queue);
 
-	/* Find end of our output_queue. */
-	list_net = &sd->output_queue;
-	while (*list_net)
-		list_net = &(*list_net)->next_sched;
-	/* Append output queue from offline CPU. */
-	*list_net = oldsd->output_queue;
-	oldsd->output_queue = NULL;
+	if (oldsd->output_queue) {
+		*sd->output_queue_tailp = oldsd->output_queue;
+		sd->output_queue_tailp = oldsd->output_queue_tailp;
+		oldsd->output_queue = NULL;
+		oldsd->output_queue_tailp = &oldsd->output_queue;
+	}
 
 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 	local_irq_enable();
@@ -5843,6 +5843,8 @@ static int __init net_dev_init(void)
 		skb_queue_head_init(&sd->input_pkt_queue);
 		__skb_queue_head_init(&sd->completion_queue);
 		INIT_LIST_HEAD(&sd->poll_list);
+		sd->output_queue = NULL;
+		sd->output_queue_tailp = &sd->output_queue;
 
 #ifdef CONFIG_RPS
 		sd->csd.func = rps_trigger_softirq;

^ permalink raw reply related

* [PATCH 1/2] net: reimplement completion_queue as a FIFO queue
From: Changli Gao @ 2010-04-26  3:20 UTC (permalink / raw)
  To: David S. Miller; +Cc: Eric Dumazet, netdev, Changli Gao

reimplement completion_queue as a FIFO queue.

As slab allocator always does its best to return the latest unused objects, we'd
better release skb in order, this patch reimplement completion_queue as a FIFO
queue instead of the old LIFO queue.

At the same time, a opencoding skb queue is replaced with a sk_buff_head queue.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
 include/linux/netdevice.h |    2 +-
 net/core/dev.c            |   28 ++++++++++------------------
 net/core/netpoll.c        |   12 +++++-------
 3 files changed, 16 insertions(+), 26 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3c5ed5f..785e514 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1386,7 +1386,7 @@ static inline int unregister_gifconf(unsigned int family)
 struct softnet_data {
 	struct Qdisc		*output_queue;
 	struct list_head	poll_list;
-	struct sk_buff		*completion_queue;
+	struct sk_buff_head	completion_queue;
 
 #ifdef CONFIG_RPS
 	struct softnet_data	*rps_ipi_list;
diff --git a/net/core/dev.c b/net/core/dev.c
index 4d43f1a..5bbfa0f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1578,8 +1578,7 @@ void dev_kfree_skb_irq(struct sk_buff *skb)
 
 		local_irq_save(flags);
 		sd = &__get_cpu_var(softnet_data);
-		skb->next = sd->completion_queue;
-		sd->completion_queue = skb;
+		__skb_queue_tail(&sd->completion_queue, skb);
 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
 		local_irq_restore(flags);
 	}
@@ -2506,18 +2505,16 @@ static void net_tx_action(struct softirq_action *h)
 {
 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
 
-	if (sd->completion_queue) {
-		struct sk_buff *clist;
+	if (!skb_queue_empty(&sd->completion_queue)) {
+		struct sk_buff_head queue;
+		struct sk_buff *skb;
 
+		__skb_queue_head_init(&queue);
 		local_irq_disable();
-		clist = sd->completion_queue;
-		sd->completion_queue = NULL;
+		skb_queue_splice_tail_init(&sd->completion_queue, &queue);
 		local_irq_enable();
 
-		while (clist) {
-			struct sk_buff *skb = clist;
-			clist = clist->next;
-
+		while ((skb = __skb_dequeue(&queue))) {
 			WARN_ON(atomic_read(&skb->users));
 			__kfree_skb(skb);
 		}
@@ -5593,7 +5590,6 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 			    unsigned long action,
 			    void *ocpu)
 {
-	struct sk_buff **list_skb;
 	struct Qdisc **list_net;
 	struct sk_buff *skb;
 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
@@ -5607,13 +5603,9 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 	sd = &per_cpu(softnet_data, cpu);
 	oldsd = &per_cpu(softnet_data, oldcpu);
 
-	/* Find end of our completion_queue. */
-	list_skb = &sd->completion_queue;
-	while (*list_skb)
-		list_skb = &(*list_skb)->next;
 	/* Append completion queue from offline CPU. */
-	*list_skb = oldsd->completion_queue;
-	oldsd->completion_queue = NULL;
+	skb_queue_splice_tail_init(&oldsd->completion_queue,
+				   &sd->completion_queue);
 
 	/* Find end of our output_queue. */
 	list_net = &sd->output_queue;
@@ -5849,7 +5841,7 @@ static int __init net_dev_init(void)
 		struct softnet_data *sd = &per_cpu(softnet_data, i);
 
 		skb_queue_head_init(&sd->input_pkt_queue);
-		sd->completion_queue = NULL;
+		__skb_queue_head_init(&sd->completion_queue);
 		INIT_LIST_HEAD(&sd->poll_list);
 
 #ifdef CONFIG_RPS
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index a58f59b..3905a14 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -222,17 +222,15 @@ static void zap_completion_queue(void)
 	unsigned long flags;
 	struct softnet_data *sd = &get_cpu_var(softnet_data);
 
-	if (sd->completion_queue) {
-		struct sk_buff *clist;
+	if (!skb_queue_empty(&sd->completion_queue)) {
+		struct sk_buff_head queue;
+		struct sk_buff *skb;
 
 		local_irq_save(flags);
-		clist = sd->completion_queue;
-		sd->completion_queue = NULL;
+		skb_queue_splice_tail_init(&sd->completion_queue, &queue);
 		local_irq_restore(flags);
 
-		while (clist != NULL) {
-			struct sk_buff *skb = clist;
-			clist = clist->next;
+		while ((skb = __skb_dequeue(&queue))) {
 			if (skb->destructor) {
 				atomic_inc(&skb->users);
 				dev_kfree_skb_any(skb); /* put this one back */

^ permalink raw reply related

* Re: [PATCH net-next-2.6] netns: call ops_free right after ops_exit
From: Eric W. Biederman @ 2010-04-26  3:06 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: David Miller, netdev
In-Reply-To: <20100425184430.GA2891@psychotron.redhat.com>

Jiri Pirko <jpirko@redhat.com> writes:

> Sun, Apr 25, 2010 at 04:50:34PM CEST, ebiederm@xmission.com wrote:
>>David Miller <davem@davemloft.net> writes:
>>
>>> From: Jiri Pirko <jpirko@redhat.com>
>>> Date: Sun, 25 Apr 2010 11:26:01 +0200
>>>
>>>> There's no need to iterate this twice. We can free net generic
>>>> variables right after exit is called.
>>>>
>>>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>>>
>>> Are you sure there are no problems with doing this?
>>>
>>> What if there are inter-net variable reference dependencies
>>> or something like that?
>>>
>>> I really suspect it is being done this way on purpose, but
>>> in the end I defer to experts like Eric B. :-)
>>
>>I am pretty certain there is a problem.  My memory is fuzzy this
>>morning but I believe we can have rcu references between various
>>pieces of the networking stack for a single network namespace.  So we
>>need to cause all of the network namespace to exit before it is safe
>>to free those pieces.
>
> Hmm, that doesn't make much sense to me. Since the allocated memory in question
> is used locally, after exit() is called, the memory chunk should not be used by
> anyone and if it is, I think it's a bug.
>
> Earlier, when the memory wasn't allocated automatically (by filling .size)
> memory was individually freed in exit(). From what I understood from your reply,
> you are telling this was buggy?

Now I remember clearly.  The use case for not freeing memory
immediately is the delayed freeing of network devices.  Ideally we
delay unregistering all of the network devices into
default_device_exit_batch, when we terminate one or namespaces.

Since network devices are freed outside of their exit routines we need
to keep their per net memory around until they are freed.

Ultimately all of this is much easier to think about if these chunks of
memory can be logically thought of as living on struct net and have the
same lifetime rules.

Eric

^ permalink raw reply

* [PATCH v3] TCP: avoid to send keepalive probes if receiving data
From: Flavio Leitner @ 2010-04-26  2:40 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Ilpo, Eric Dumazet, Flavio Leitner
In-Reply-To: <20100425235549.GB2550@sysclose.org>

RFC 1122 says the following:
...
  Keep-alive packets MUST only be sent when no data or
  acknowledgement packets have been received for the
  connection within an interval.
...

The acknowledgement packet is reseting the keepalive
timer but the data packet isn't. This patch fixes it by
checking the timestamp of the last received data packet
too when the keepalive timer expires.

Signed-off-by: Flavio Leitner <fleitner@redhat.com>
---
 net/ipv4/tcp.c       |    5 ++++-
 net/ipv4/tcp_timer.c |    8 ++++++++
 2 files changed, 12 insertions(+), 1 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0f8caf6..94f605c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2298,7 +2298,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 			if (sock_flag(sk, SOCK_KEEPOPEN) &&
 			    !((1 << sk->sk_state) &
 			      (TCPF_CLOSE | TCPF_LISTEN))) {
-				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
+				u32 elapsed = min_t(u32,
+				      tcp_time_stamp - icsk->icsk_ack.lrcvtime,
+				      tcp_time_stamp - tp->rcv_tstamp);
+
 				if (tp->keepalive_time > elapsed)
 					elapsed = tp->keepalive_time - elapsed;
 				else
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 8a0ab29..9d1bb6f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -554,6 +554,14 @@ static void tcp_keepalive_timer (unsigned long data)
 	if (tp->packets_out || tcp_send_head(sk))
 		goto resched;
 
+	elapsed = tcp_time_stamp - icsk->icsk_ack.lrcvtime;
+
+	/* receiving data means alive */
+	if (elapsed < keepalive_time_when(tp)) {
+		elapsed = keepalive_time_when(tp) - elapsed;
+		goto resched;
+	}
+
 	elapsed = tcp_time_stamp - tp->rcv_tstamp;
 
 	if (elapsed >= keepalive_time_when(tp)) {
-- 
1.5.5.6


^ permalink raw reply related

* RPS and forwarding
From: Herbert Xu @ 2010-04-26  2:24 UTC (permalink / raw)
  To: David S. Miller, netdev

Hi:

I'm sorry I didn't have time to jump into the RPS discussions
earlier, so in a way I'm just getting what I deserved :)

Anyway, I am specifically concerned about the possibility of
reordering of forwarded traffic.

As RPS is doing fuzzy matching, it is possible (and quite likely
if rps_sock_flow_table is small) for a forwarded flow to be hashed
to the same index as a local flow.

In that case we may end up redirecting a forwarded flow.  That
in itself is undesirable because for forwarded flows the best
solution is to stay on the ingress CPU.

What's worse is that if the local flow bounces around different
CPUs, the forwarded flow will follow it.

For a local flow RPS can guarantee original ordering (assuming
we're not doing anything weird like netfilter queueing), but
this doesn't work for forwarded flows.

Even if netif_receive_skb has completed, the forwarded packet
may still be sitting in a hardware TX queue, selected based
on the processing CPU.  If you then bounce the forwarded flow
then packets may be placed in a different hardware TX queue,
causing reordering.

BTW, selecting hardware TX queues for forwarded flows based
on the rxhash is not a good solution, as that causes cache-line
bouncing between CPUs.

Apart from not using RPS on routers, I suppose people doing
forwarding will simply have to maintain a constant RPS table,
and forgo its local redirection capabilities.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox