Netdev List
 help / color / mirror / Atom feed
* [PATCH v6] Phonet: set the pipe handle using setsockopt
From: Hemant Vilas RAMDASI @ 2011-11-16  8:52 UTC (permalink / raw)
  To: netdev-owner
  Cc: netdev, remi.denis-courmont, Dinesh Kumar Sharma, Hemant Ramdasi

From: Dinesh Kumar Sharma <dinesh.sharma@stericsson.com>

This provides flexibility to set the pipe handle
using setsockopt. The pipe can be enabled (if disabled) later
using ioctl.

Signed-off-by: Hemant Ramdasi <hemant.ramdasi@stericsson.com>
Signed-off-by: Dinesh Kumar Sharma <dinesh.sharma@stericsson.com>
---
 include/linux/phonet.h |    3 +
 net/phonet/pep.c       |  103 ++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 99 insertions(+), 7 deletions(-)

diff --git a/include/linux/phonet.h b/include/linux/phonet.h
index 6fb1384..e80fefe 100644
--- a/include/linux/phonet.h
+++ b/include/linux/phonet.h
@@ -37,6 +37,8 @@
 #define PNPIPE_ENCAP		1
 #define PNPIPE_IFINDEX		2
 #define PNPIPE_HANDLE		3
+#define PNPIPE_ENABLE		4
+#define PNPIPE_INITSTATE	5
 
 #define PNADDR_ANY		0
 #define PNADDR_BROADCAST	0xFC
@@ -48,6 +50,7 @@
 
 /* ioctls */
 #define SIOCPNGETOBJECT		(SIOCPROTOPRIVATE + 0)
+#define SIOCPNENABLEPIPE	(SIOCPROTOPRIVATE + 13)
 #define SIOCPNADDRESOURCE	(SIOCPROTOPRIVATE + 14)
 #define SIOCPNDELRESOURCE	(SIOCPROTOPRIVATE + 15)
 
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index f17fd84..179a14a 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -533,6 +533,29 @@ static int pep_connresp_rcv(struct sock *sk, struct sk_buff *skb)
 	return pipe_handler_send_created_ind(sk);
 }
 
+static int pep_enableresp_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct pnpipehdr *hdr = pnp_hdr(skb);
+
+	if (hdr->error_code != PN_PIPE_NO_ERROR)
+		return -ECONNREFUSED;
+
+	return pep_indicate(sk, PNS_PIPE_ENABLED_IND, 0 /* sub-blocks */,
+		NULL, 0, GFP_ATOMIC);
+
+}
+
+static void pipe_start_flow_control(struct sock *sk)
+{
+	struct pep_sock *pn = pep_sk(sk);
+
+	if (!pn_flow_safe(pn->tx_fc)) {
+		atomic_set(&pn->tx_credits, 1);
+		sk->sk_write_space(sk);
+	}
+	pipe_grant_credits(sk, GFP_ATOMIC);
+}
+
 /* Queue an skb to an actively connected sock.
  * Socket lock must be held. */
 static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb)
@@ -578,13 +601,25 @@ static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb)
 			sk->sk_state = TCP_CLOSE_WAIT;
 			break;
 		}
+		if (pn->init_enable == PN_PIPE_DISABLE)
+			sk->sk_state = TCP_SYN_RECV;
+		else {
+			sk->sk_state = TCP_ESTABLISHED;
+			pipe_start_flow_control(sk);
+		}
+		break;
 
-		sk->sk_state = TCP_ESTABLISHED;
-		if (!pn_flow_safe(pn->tx_fc)) {
-			atomic_set(&pn->tx_credits, 1);
-			sk->sk_write_space(sk);
+	case PNS_PEP_ENABLE_RESP:
+		if (sk->sk_state != TCP_SYN_SENT)
+			break;
+
+		if (pep_enableresp_rcv(sk, skb)) {
+			sk->sk_state = TCP_CLOSE_WAIT;
+			break;
 		}
-		pipe_grant_credits(sk, GFP_ATOMIC);
+
+		sk->sk_state = TCP_ESTABLISHED;
+		pipe_start_flow_control(sk);
 		break;
 
 	case PNS_PEP_DISCONNECT_RESP:
@@ -863,14 +898,32 @@ static int pep_sock_connect(struct sock *sk, struct sockaddr *addr, int len)
 	int err;
 	u8 data[4] = { 0 /* sub-blocks */, PAD, PAD, PAD };
 
-	pn->pipe_handle = 1; /* anything but INVALID_HANDLE */
+	if (pn->pipe_handle == PN_PIPE_INVALID_HANDLE)
+		pn->pipe_handle = 1; /* anything but INVALID_HANDLE */
+
 	err = pipe_handler_request(sk, PNS_PEP_CONNECT_REQ,
-					PN_PIPE_ENABLE, data, 4);
+				pn->init_enable, data, 4);
 	if (err) {
 		pn->pipe_handle = PN_PIPE_INVALID_HANDLE;
 		return err;
 	}
+
+	sk->sk_state = TCP_SYN_SENT;
+
+	return 0;
+}
+
+static int pep_sock_enable(struct sock *sk, struct sockaddr *addr, int len)
+{
+	int err;
+
+	err = pipe_handler_request(sk, PNS_PEP_ENABLE_REQ, PAD,
+				NULL, 0);
+	if (err)
+		return err;
+
 	sk->sk_state = TCP_SYN_SENT;
+
 	return 0;
 }
 
@@ -894,6 +947,19 @@ static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg)
 			answ = 0;
 		release_sock(sk);
 		return put_user(answ, (int __user *)arg);
+		break;
+
+	case SIOCPNENABLEPIPE:
+		lock_sock(sk);
+		if (sk->sk_state == TCP_SYN_SENT)
+			answ =  -EBUSY;
+		else if (sk->sk_state == TCP_ESTABLISHED)
+			answ = -EISCONN;
+		else
+			answ = pep_sock_enable(sk, NULL, 0);
+		release_sock(sk);
+		return answ;
+		break;
 	}
 
 	return -ENOIOCTLCMD;
@@ -959,6 +1025,18 @@ static int pep_setsockopt(struct sock *sk, int level, int optname,
 		}
 		goto out_norel;
 
+	case PNPIPE_HANDLE:
+		if ((sk->sk_state == TCP_CLOSE) &&
+			(val >= 0) && (val < PN_PIPE_INVALID_HANDLE))
+			pn->pipe_handle = val;
+		else
+			err = -EINVAL;
+		break;
+
+	case PNPIPE_INITSTATE:
+		pn->init_enable = !!val;
+		break;
+
 	default:
 		err = -ENOPROTOOPT;
 	}
@@ -994,6 +1072,17 @@ static int pep_getsockopt(struct sock *sk, int level, int optname,
 			return -EINVAL;
 		break;
 
+	case PNPIPE_ENABLE:
+		if (sk->sk_state == TCP_ESTABLISHED)
+			val = 1;
+		else
+			val = 0;
+		break;
+
+	case PNPIPE_INITSTATE:
+		val = pn->init_enable;
+		break;
+
 	default:
 		return -ENOPROTOOPT;
 	}
-- 
1.7.4.3

^ permalink raw reply related

* RE: [PATCH] Phonet: set the pipe handle using setsockopt
From: Hemant-vilas RAMDASI @ 2011-11-16  8:46 UTC (permalink / raw)
  To: Rémi Denis-Courmont, netdev@vger.kernel.org
In-Reply-To: <3443203.gVgdO3ogFg@hector>

Remi,

> > > > +#define SIOPNPIPE_ENABLE	_IO(SIOCPNGAUTOCONF,   1)
> > >
> > > Does this even work? I am not an expert on this, but I would think
> that
> > > device-private controls are routed to the network device, not the
> > > socket. In
> > > any case, it does not seem right.
> >
> > Yes, it works. The ioctl is routed to per-socket functions.
> 
> Even if it works, sockets are probably not supposed to use the device-
> private
> ioctl() range, are they?
> 
> And why is this inside __KERNEL__ ?
OK, we will move it.

> > > Do you still need this read-only option?
> >
> > Yes.
> 
> Why and how?
This is needed for the user to poll the pipe state.

Regards,
Hemant

^ permalink raw reply

* Re: [patch -next v2] 6LoWPAN: double free in lowpan_fragment_xmit()
From: Alexander Smirnov @ 2011-11-16  8:42 UTC (permalink / raw)
  To: Dan Carpenter
  Cc: Dmitry Eremin-Solenikov, Sergey Lapin, David S. Miller,
	linux-zigbee-devel, netdev, kernel-janitors
In-Reply-To: <20111116083643.GA25612@mwanda>

2011/11/16 Dan Carpenter <dan.carpenter@oracle.com>:
> dev_queue_xmit() consumes its own skb, so the call to dev_kfree_skb()
> in lowpan_fragment_xmit() is a double free.
>
> Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
> ---
> v2: fixed commit message.
>
> diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c
> index 602f318..e4ecc1e 100644
> --- a/net/ieee802154/6lowpan.c
> +++ b/net/ieee802154/6lowpan.c
> @@ -980,9 +980,6 @@ lowpan_fragment_xmit(struct sk_buff *skb, u8 *head,
>
>        ret = dev_queue_xmit(frag);
>
> -       if (ret < 0)
> -               dev_kfree_skb(frag);
> -
>        return ret;
>  }
>
>
>

Acked-by: Alexander Smirnov <alex.bluesman.smirnov@gmail.com>

^ permalink raw reply

* [patch -next v2] 6LoWPAN: double free in lowpan_fragment_xmit()
From: Dan Carpenter @ 2011-11-16  8:36 UTC (permalink / raw)
  To: Dmitry Eremin-Solenikov, Alexander Smirnov
  Cc: Sergey Lapin, David S. Miller, linux-zigbee-devel, netdev,
	kernel-janitors
In-Reply-To: <20111116083254.GB4349@mwanda>

[-- Attachment #1: Type: text/plain, Size: 548 bytes --]

dev_queue_xmit() consumes its own skb, so the call to dev_kfree_skb()
in lowpan_fragment_xmit() is a double free.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
---
v2: fixed commit message.

diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c
index 602f318..e4ecc1e 100644
--- a/net/ieee802154/6lowpan.c
+++ b/net/ieee802154/6lowpan.c
@@ -980,9 +980,6 @@ lowpan_fragment_xmit(struct sk_buff *skb, u8 *head,
 
 	ret = dev_queue_xmit(frag);
 
-	if (ret < 0)
-		dev_kfree_skb(frag);
-
 	return ret;
 }
 


[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply related

* Re: [patch -next] 6LoWPAN: double free in lowpan_fragment_xmit()
From: Dan Carpenter @ 2011-11-16  8:32 UTC (permalink / raw)
  To: Dmitry Eremin-Solenikov, Alexander Smirnov
  Cc: Sergey Lapin, David S. Miller, linux-zigbee-devel, netdev,
	kernel-janitors
In-Reply-To: <20111116082138.GA10264@elgon.mountain>

[-- Attachment #1: Type: text/plain, Size: 339 bytes --]

On Wed, Nov 16, 2011 at 11:21:38AM +0300, Dan Carpenter wrote:
> dev_queue_xmit() consumes its own skb, so the call to dev_kfree_skb()
> ieee802154/6lowpan.clowpan_fragment_xmits a double free.
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Dur...  I messed up my commit message right before sending.  Will
resend.

regards,
dan carpenter



[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* [patch -next] 6LoWPAN: double free in lowpan_fragment_xmit()
From: Dan Carpenter @ 2011-11-16  8:21 UTC (permalink / raw)
  To: Dmitry Eremin-Solenikov, Alexander Smirnov
  Cc: Sergey Lapin, David S. Miller, linux-zigbee-devel, netdev,
	kernel-janitors

dev_queue_xmit() consumes its own skb, so the call to dev_kfree_skb()
ieee802154/6lowpan.clowpan_fragment_xmits a double free.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>

diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c
index 602f318..e4ecc1e 100644
--- a/net/ieee802154/6lowpan.c
+++ b/net/ieee802154/6lowpan.c
@@ -980,9 +980,6 @@ lowpan_fragment_xmit(struct sk_buff *skb, u8 *head,
 
 	ret = dev_queue_xmit(frag);
 
-	if (ret < 0)
-		dev_kfree_skb(frag);
-
 	return ret;
 }
 

^ permalink raw reply related

* Re: Hitting BUG_ON() from napi_enable in e1000e
From: Jeff Kirsher @ 2011-11-16  7:32 UTC (permalink / raw)
  To: Mike McElroy; +Cc: netdev
In-Reply-To: <4EC16DE3.5020701@stratus.com>

On Mon, Nov 14, 2011 at 11:37, Mike McElroy <mike.mcelroy@stratus.com> wrote:
>
> Hitting the BUG_ON in napi_enable(). Code inspection shows that this can
> only be triggered by calling napi_enable() twice without an intervening
> napi_disable().
>
> I saw the following sequence of events in the stack trace:
>
> 1) We simulated a cable pull using an Extreme switch.
> 2) e1000_tx_timeout() was entered.
> 3) e1000_reset_task() was called. Saw the message from e_err() in the
> console log.
> 4) e1000_reinit_locked was called. This function calls e1000_down() and
> e1000_up(). These functions call napi_disable() and napi_enable()
> respectively.
> 5) Then on another thread, a monitor task saw carrier was down and executed
> 'ip set link down' and 'ip set link up' commands.
> 6) Saw the '_E1000_RESETTING' warning from the e1000_close function.
> 7) Either e1000_open() executed between the e1000_down() and e1000_up()
> calls in step 4, or e1000_open() executed after the e1000_up() call.
> In either case, napi_enable() is called twice, which triggers the BUG_ON.
>
> This code sequence is present in the e1000 driver also.
>
> There are two bugs here:
> 1) The napi_enable() and napi_disable() should only be called in the
> e1000_open and e1000_close functions respectively
> 2) There is no synchronization preventing a call to the driver close while
> executing error processing.
>
> Here is a patch for the napi_enable BUG_ON:
>
> diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
> index 5ec1f99..e1af6fa 100755
> --- a/drivers/net/e1000e/netdev.c
> +++ b/drivers/net/e1000e/netdev.c
> @@ -4242,9 +4242,6 @@ int e1000e_up(struct e1000_adapter *adapter)
>
>        clear_bit(__E1000_DOWN, &adapter->state);
>
> -#ifdef CONFIG_E1000E_NAPI
> -       napi_enable(&adapter->napi);
> -#endif
>  #ifdef CONFIG_E1000E_MSIX
>        if (adapter->msix_entries)
>                e1000_configure_msix(adapter);
> @@ -4307,10 +4304,6 @@ void e1000e_down(struct e1000_adapter *adapter)
>        /* flush both disables and wait for them to finish */
>        e1e_flush();
>        usleep_range(10000, 20000);
> -
> -#ifdef CONFIG_E1000E_NAPI
> -       napi_disable(&adapter->napi);
> -#endif
>        e1000_irq_disable(adapter);
>
>        del_timer_sync(&adapter->watchdog_timer);
> @@ -4677,6 +4670,10 @@ static int e1000_close(struct net_device *netdev)
>
>        pm_runtime_get_sync(&pdev->dev);
>
> +#ifdef CONFIG_E1000E_NAPI
> +       napi_disable(&adapter->napi);
> +#endif
> +
>        if (!test_bit(__E1000_DOWN, &adapter->state)) {
>                e1000e_down(adapter);
>                e1000_free_irq(adapter);
>

Thanks, I will add this patch to my queue.

-- 
Cheers,
Jeff

^ permalink raw reply

* Re: [RFC] kvm tools: Implement multiple VQ for virtio-net
From: Michael S. Tsirkin @ 2011-11-16  7:23 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Krishna Kumar, gorcunov, kvm, Asias He, virtualization,
	Pekka Enberg, Sasha Levin, netdev, mingo, Stephen Hemminger
In-Reply-To: <877h31ortx.fsf@rustcorp.com.au>

On Wed, Nov 16, 2011 at 10:34:42AM +1030, Rusty Russell wrote:
> On Mon, 14 Nov 2011 15:05:07 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Mon, Nov 14, 2011 at 02:25:17PM +0200, Pekka Enberg wrote:
> > > On Mon, Nov 14, 2011 at 4:04 AM, Asias He <asias.hejun@gmail.com> wrote:
> > > > Why both the bandwidth and latency performance are dropping so dramatically
> > > > with multiple VQ?
> > > 
> > > What's the expected benefit from multiple VQs
> > 
> > Heh, the original patchset didn't mention this :) It really should.
> > They are supposed to speed up networking for high smp guests.
> 
> If we have one queue per guest CPU, does this allow us to run lockless?
> 
> Thanks,
> Rusty.

LLTX? It's supposed to be deprecated, isn't it?

-- 
MST

^ permalink raw reply

* Re: [PATCH V2] vlan:return error when real dev is enslaved
From: Weiping Pan @ 2011-11-16  7:16 UTC (permalink / raw)
  To: Nicolas de Pesloüan
  Cc: Patrick McHardy (maintainer:VLAN (802.1Q)),
	"David S. Miller" (maintainer:NETWORKING [GENERAL]),
	open list:VLAN (802.1Q), open list
In-Reply-To: <4EC2BB48.2070802@gmail.com>

On 11/16/2011 03:19 AM, Nicolas de Pesloüan wrote:
> Le 15/11/2011 13:44, Weiping Pan a écrit :
>> Qinhuibin reported a kernel panic when performing some vlan operations.
>> https://lkml.org/lkml/2011/11/6/218
>>
>> The operation is as below:
>> ifconfig eth2 up
>> modprobe bonding
>> modprobe 8021q
>> ifconfig bond0 up
>> ifenslave bond0 eth2
>> vconfig add eth2 3300
>> vconfig add bond0 33
>> vconfig rem eth2.3300
>>
>> the panic stack is as below:
>> [<ffffffffa002f1c9>] panic_event+0x49/0x70 [ipmi_msghandler]
>> [<ffffffff80378917>] notifier_call_chain+0x37/0x70
>> [<ffffffff80372122>] panic+0xa2/0x195
>> [<ffffffff80376ed8>] oops_end+0xd8/0x140
>> [<ffffffff8001bea7>] no_context+0xf7/0x280
>> [<ffffffff8001c1a5>] __bad_area_nosemaphore+0x175/0x250
>> [<ffffffff80376318>] page_fault+0x28/0x30
>> [<ffffffffa039dabd>] igb_vlan_rx_kill_vid+0x4d/0x100 [igb]
>> [<ffffffffa044045f>] bond_vlan_rx_kill_vid+0x9f/0x290 [bonding]
>> [<ffffffffa047e636>] unregister_vlan_dev+0x136/0x180 [8021q]
>> [<ffffffffa047ed20>] vlan_ioctl_handler+0x170/0x3f0 [8021q]
>> [<ffffffff802c1d3f>] sock_ioctl+0x21f/0x280
>> [<ffffffff800e6d7f>] vfs_ioctl+0x2f/0xb0
>> [<ffffffff800e726b>] do_vfs_ioctl+0x3cb/0x5a0
>> [<ffffffff800e74e1>] sys_ioctl+0xa1/0xb0
>> [<ffffffff80007388>] system_call_fastpath+0x16/0x1b
>> [<00007f108a2b8bd7>] 0x7f108a2b8bd7
>> And the nic is as below:
>> [root@localhost ~]# ethtool -i eth2
>> driver: igb
>> version: 3.0.6-k2
>> firmware-version: 1.2-1
>> bus-info: 0000:04:00.0
>> kernel version:
>> 2.6.32.12-0.7 also happen in 2.6.32-131
>>
>> For kernel 2.6.32, the reason of this bug is that when we do "vconfig 
>> add bond0 33",
>> adapter->vlgrp is overwritten in igb_vlan_rx_register. So when we do 
>> "vconfig rem
>> eth2.3300", it can't find the correct vlgrp.
>>
>> And this bug is avoided by vlan cleanup patchset from Jiri Pirko
>> <jpirko@redhat.com>, especially commit b2cb09b1a772(igb: do vlan 
>> cleanup).
>>
>> But it is not a correct operation to create a vlan interface on eth2
>> when it has been enslaved by bond0, so this patch returns an error
>> when the real dev is already enslaved.
>
> Why isn't this setup correct?
>
> Compare to bridge, where ebtables allow for some sort of sharing of 
> the physical interface between bridge and vlan.
>
> I think bonding should behave the same way instead of denying this setup.
>
>     Nicolas.
>
Hi, Nicolas,

After some investigation I agree with you that this setup is correct,
since we can  "switchport trunk allowed vlan 2-3" on the switch.

I can confirm that both bonding and bridge support this kind of setup.

bond0.2                                        br0.2
       |                                                |
   eth0 ----- eth0.3                          eth0 -----eth0.3

Both work fine (git commit 06236ac3726f1), so please discard this patch.

thanks
Weiping Pan

^ permalink raw reply

* Re: [PATCH] Phonet: set the pipe handle using setsockopt
From: Rémi Denis-Courmont @ 2011-11-16  6:30 UTC (permalink / raw)
  To: netdev@vger.kernel.org
In-Reply-To: <81C3A93C17462B4BBD7E272753C105791FB090B0B8@EXDCVYMBSTM005.EQ1STM.local>

Le Lundi 14 Novembre 2011 11:36:12 ext Hemant-vilas RAMDASI a écrit :
 > > sockaddr_pn *spn) /* Phonet device ioctl requests */
> > > 
> > >  #ifdef __KERNEL__
> > >  #define SIOCPNGAUTOCONF		(SIOCDEVPRIVATE + 0)
> > > 
> > > +#define SIOPNPIPE_ENABLE	_IO(SIOCPNGAUTOCONF,   1)
> > 
> > Does this even work? I am not an expert on this, but I would think that
> > device-private controls are routed to the network device, not the
> > socket. In
> > any case, it does not seem right.
> 
> Yes, it works. The ioctl is routed to per-socket functions.

Even if it works, sockets are probably not supposed to use the device-private 
ioctl() range, are they?

And why is this inside __KERNEL__ ?

> > > @@ -994,6 +1068,17 @@ static int pep_getsockopt(struct sock *sk, int
> > 
> > level,
> > 
> > > int optname, return -EINVAL;
> > > 
> > >  		break;
> > > 
> > > +	case PNPIPE_ENABLE:
> > > +		if (sk->sk_state == TCP_ESTABLISHED)
> > > +			val = 1;
> > > +		else
> > > +			val = 0;
> > > +		break;
> > 
> > Do you still need this read-only option?
> 
> Yes.

Why and how?

-- 
Rémi Denis-Courmont
http://www.remlab.net/

^ permalink raw reply

* Re: [RFC] kvm tools: Implement multiple VQ for virtio-net
From: jason wang @ 2011-11-16  6:10 UTC (permalink / raw)
  To: Krishna Kumar2
  Cc: penberg, kvm, Michael S. Tsirkin, Asias He, virtualization,
	gorcunov, Sasha Levin, netdev, mingo
In-Reply-To: <OFDA747DDD.8D1C8FD8-ON65257949.001837DA-65257949.0019D25F@in.ibm.com>

On 11/15/2011 12:44 PM, Krishna Kumar2 wrote:
> Sasha Levin <levinsasha928@gmail.com> wrote on 11/14/2011 03:45:40 PM:
>
>>> Why both the bandwidth and latency performance are dropping so
>>> dramatically with multiple VQ?
>> It looks like there's no hash sync between host and guest, which makes
>> the RX VQ change for every packet. This is my guess.
> Yes, I confirmed this happens for macvtap. I am
> using ixgbe - it calls skb_record_rx_queue when
> a skb is allocated, but sets rxhash when a packet
> arrives. Macvtap is relying on record_rx_queue
> first ahead of rxhash (as part of my patch making
> macvtap multiqueue), hence different skbs result
> in macvtap selecting different vq's.
>
> Reordering macvtap to use rxhash first results in
> all packets going to the same VQ. The code snippet
> is:
>
> {
> 	...
> 	if (!numvtaps)
>                 goto out;
>
> 	rxq = skb_get_rxhash(skb);
> 	if (rxq) {
> 		tap = rcu_dereference(vlan->taps[rxq % numvtaps]);
> 		if (tap)
> 			goto out;
> 	}
>
> 	if (likely(skb_rx_queue_recorded(skb))) {
> 		rxq = skb_get_rx_queue(skb);
>
> 		while (unlikely(rxq >= numvtaps))
> 			rxq -= numvtaps;
> 			tap = rcu_dereference(vlan->taps[rxq]);
> 			if (tap)
> 				goto out;
> 	}
> }
>
> I will submit a patch for macvtap separately. I am working
> towards the other issue pointed out - different vhost
> threads handling rx/tx of a single flow.
Hello Krishna:

Do you have any thoughts on how to solve the flow-handling issue?

Maybe some performance numbers first would be better; they would let us
know where we are. While testing my patchset, I found a big regression in
small-packet transmission, and more retransmissions were noticed. This
may also be a flow-affinity issue. One interesting thing would be to see
whether this happens with your patches :)

I've played with a basic flow director implementation based on my series,
which tries to make sure the packets of a flow are handled by the same
vhost thread/guest vcpu. This is done by:

- bind virtqueue to guest cpu
- record the hash to queue mapping when guest sending packets and use
this mapping to choose the virtqueue when forwarding packets to guest

Tests show some improvement when receiving packets from an external host
and when sending packets to the local host, but it hurts the performance of
sending packets to a remote host. This is not a perfect solution, as it
cannot handle the guest moving processes among vcpus; I plan to try
accelerated RFS and sharing the mapping between host and guest.

Anyway, this is just for receiving; small-packet sending needs more
thought.

Thanks

>
> thanks,
>
> - KK
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH v5] Phonet: set the pipe handle using setsockopt
From: Hemant Vilas RAMDASI @ 2011-11-16  5:44 UTC (permalink / raw)
  To: netdev-owner
  Cc: netdev, remi.denis-courmont, Dinesh Kumar Sharma, Hemant Ramdasi

From: Dinesh Kumar Sharma <dinesh.sharma@stericsson.com>

This provides flexibility to set the pipe handle
using setsockopt. The pipe can be enabled (if disabled) later
using ioctl.

Signed-off-by: Hemant Ramdasi <hemant.ramdasi@stericsson.com>
Signed-off-by: Dinesh Kumar Sharma <dinesh.sharma@stericsson.com>
---
 include/linux/phonet.h |    3 +
 net/phonet/pep.c       |  103 ++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 99 insertions(+), 7 deletions(-)

diff --git a/include/linux/phonet.h b/include/linux/phonet.h
index 6fb1384..4c00551 100644
--- a/include/linux/phonet.h
+++ b/include/linux/phonet.h
@@ -37,6 +37,8 @@
 #define PNPIPE_ENCAP		1
 #define PNPIPE_IFINDEX		2
 #define PNPIPE_HANDLE		3
+#define PNPIPE_ENABLE		4
+#define PNPIPE_INITSTATE	5
 
 #define PNADDR_ANY		0
 #define PNADDR_BROADCAST	0xFC
@@ -180,6 +182,7 @@ static inline __u8 pn_sockaddr_get_resource(const struct sockaddr_pn *spn)
 /* Phonet device ioctl requests */
 #ifdef __KERNEL__
 #define SIOCPNGAUTOCONF		(SIOCDEVPRIVATE + 0)
+#define SIOPNPIPE_ENABLE	_IO(SIOCPNGAUTOCONF,   1)
 
 struct if_phonet_autoconf {
 	uint8_t device;
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index f17fd84..6019b7e 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -533,6 +533,29 @@ static int pep_connresp_rcv(struct sock *sk, struct sk_buff *skb)
 	return pipe_handler_send_created_ind(sk);
 }
 
+static int pep_enableresp_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct pnpipehdr *hdr = pnp_hdr(skb);
+
+	if (hdr->error_code != PN_PIPE_NO_ERROR)
+		return -ECONNREFUSED;
+
+	return pep_indicate(sk, PNS_PIPE_ENABLED_IND, 0 /* sub-blocks */,
+		NULL, 0, GFP_ATOMIC);
+
+}
+
+static void pipe_start_flow_control(struct sock *sk)
+{
+	struct pep_sock *pn = pep_sk(sk);
+
+	if (!pn_flow_safe(pn->tx_fc)) {
+		atomic_set(&pn->tx_credits, 1);
+		sk->sk_write_space(sk);
+	}
+	pipe_grant_credits(sk, GFP_ATOMIC);
+}
+
 /* Queue an skb to an actively connected sock.
  * Socket lock must be held. */
 static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb)
@@ -578,13 +601,25 @@ static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb)
 			sk->sk_state = TCP_CLOSE_WAIT;
 			break;
 		}
+		if (pn->init_enable == PN_PIPE_DISABLE)
+			sk->sk_state = TCP_SYN_RECV;
+		else {
+			sk->sk_state = TCP_ESTABLISHED;
+			pipe_start_flow_control(sk);
+		}
+		break;
 
-		sk->sk_state = TCP_ESTABLISHED;
-		if (!pn_flow_safe(pn->tx_fc)) {
-			atomic_set(&pn->tx_credits, 1);
-			sk->sk_write_space(sk);
+	case PNS_PEP_ENABLE_RESP:
+		if (sk->sk_state != TCP_SYN_SENT)
+			break;
+
+		if (pep_enableresp_rcv(sk, skb)) {
+			sk->sk_state = TCP_CLOSE_WAIT;
+			break;
 		}
-		pipe_grant_credits(sk, GFP_ATOMIC);
+
+		sk->sk_state = TCP_ESTABLISHED;
+		pipe_start_flow_control(sk);
 		break;
 
 	case PNS_PEP_DISCONNECT_RESP:
@@ -863,14 +898,32 @@ static int pep_sock_connect(struct sock *sk, struct sockaddr *addr, int len)
 	int err;
 	u8 data[4] = { 0 /* sub-blocks */, PAD, PAD, PAD };
 
-	pn->pipe_handle = 1; /* anything but INVALID_HANDLE */
+	if (pn->pipe_handle == PN_PIPE_INVALID_HANDLE)
+		pn->pipe_handle = 1; /* anything but INVALID_HANDLE */
+
 	err = pipe_handler_request(sk, PNS_PEP_CONNECT_REQ,
-					PN_PIPE_ENABLE, data, 4);
+				pn->init_enable, data, 4);
 	if (err) {
 		pn->pipe_handle = PN_PIPE_INVALID_HANDLE;
 		return err;
 	}
+
+	sk->sk_state = TCP_SYN_SENT;
+
+	return 0;
+}
+
+static int pep_sock_enable(struct sock *sk, struct sockaddr *addr, int len)
+{
+	int err;
+
+	err = pipe_handler_request(sk, PNS_PEP_ENABLE_REQ, PAD,
+				NULL, 0);
+	if (err)
+		return err;
+
 	sk->sk_state = TCP_SYN_SENT;
+
 	return 0;
 }
 
@@ -894,6 +947,19 @@ static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg)
 			answ = 0;
 		release_sock(sk);
 		return put_user(answ, (int __user *)arg);
+		break;
+
+	case SIOPNPIPE_ENABLE:
+		lock_sock(sk);
+		if (sk->sk_state == TCP_SYN_SENT)
+			answ =  -EBUSY;
+		else if (sk->sk_state == TCP_ESTABLISHED)
+			answ = -EISCONN;
+		else
+			answ = pep_sock_enable(sk, NULL, 0);
+		release_sock(sk);
+		return answ;
+		break;
 	}
 
 	return -ENOIOCTLCMD;
@@ -959,6 +1025,18 @@ static int pep_setsockopt(struct sock *sk, int level, int optname,
 		}
 		goto out_norel;
 
+	case PNPIPE_HANDLE:
+		if ((sk->sk_state == TCP_CLOSE) &&
+			(val >= 0) && (val < PN_PIPE_INVALID_HANDLE))
+			pn->pipe_handle = val;
+		else
+			err = -EINVAL;
+		break;
+
+	case PNPIPE_INITSTATE:
+		pn->init_enable = !!val;
+		break;
+
 	default:
 		err = -ENOPROTOOPT;
 	}
@@ -994,6 +1072,17 @@ static int pep_getsockopt(struct sock *sk, int level, int optname,
 			return -EINVAL;
 		break;
 
+	case PNPIPE_ENABLE:
+		if (sk->sk_state == TCP_ESTABLISHED)
+			val = 1;
+		else
+			val = 0;
+		break;
+
+	case PNPIPE_INITSTATE:
+		val = pn->init_enable;
+		break;
+
 	default:
 		return -ENOPROTOOPT;
 	}
-- 
1.7.4.3

^ permalink raw reply related

* [PATCH net-next v5 06/10] forcedeth: allow to silence "TX timeout" debug messages
From: David Decotigny @ 2011-11-16  5:15 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
	Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
	Richard Jones, Ayaz Abdulla, Sameer Nanda, David Decotigny
In-Reply-To: <cover.1321420513.git.david.decotigny@google.com>

From: Sameer Nanda <snanda@google.com>

This adds a new module parameter "debug_tx_timeout" to silence most
debug messages in case of TX timeout. These messages don't provide a
signal/noise ratio high enough for production systems and, with ~30kB
logged each time, they tend to add to a cascade effect if the system
is already under stress (memory pressure, disk, etc.).

By default, the parameter is clear, meaning that only a single warning
will be reported.



Signed-off-by: David Decotigny <david.decotigny@google.com>
---
 drivers/net/ethernet/nvidia/forcedeth.c |   98 ++++++++++++++++++-------------
 1 files changed, 57 insertions(+), 41 deletions(-)

diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index fe17e42..9b917ff 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -892,6 +892,11 @@ enum {
 static int dma_64bit = NV_DMA_64BIT_ENABLED;
 
 /*
+ * Debug output control for tx_timeout
+ */
+static bool debug_tx_timeout = false;
+
+/*
  * Crossover Detection
  * Realtek 8201 phy + some OEM boards do not work properly.
  */
@@ -2477,56 +2482,64 @@ static void nv_tx_timeout(struct net_device *dev)
 	u32 status;
 	union ring_type put_tx;
 	int saved_tx_limit;
-	int i;
 
 	if (np->msi_flags & NV_MSI_X_ENABLED)
 		status = readl(base + NvRegMSIXIrqStatus) & NVREG_IRQSTAT_MASK;
 	else
 		status = readl(base + NvRegIrqStatus) & NVREG_IRQSTAT_MASK;
 
-	netdev_info(dev, "Got tx_timeout. irq: %08x\n", status);
+	netdev_warn(dev, "Got tx_timeout. irq status: %08x\n", status);
 
-	netdev_info(dev, "Ring at %lx\n", (unsigned long)np->ring_addr);
-	netdev_info(dev, "Dumping tx registers\n");
-	for (i = 0; i <= np->register_size; i += 32) {
-		netdev_info(dev,
-			    "%3x: %08x %08x %08x %08x %08x %08x %08x %08x\n",
-			    i,
-			    readl(base + i + 0), readl(base + i + 4),
-			    readl(base + i + 8), readl(base + i + 12),
-			    readl(base + i + 16), readl(base + i + 20),
-			    readl(base + i + 24), readl(base + i + 28));
-	}
-	netdev_info(dev, "Dumping tx ring\n");
-	for (i = 0; i < np->tx_ring_size; i += 4) {
-		if (!nv_optimized(np)) {
-			netdev_info(dev,
-				    "%03x: %08x %08x // %08x %08x // %08x %08x // %08x %08x\n",
-				    i,
-				    le32_to_cpu(np->tx_ring.orig[i].buf),
-				    le32_to_cpu(np->tx_ring.orig[i].flaglen),
-				    le32_to_cpu(np->tx_ring.orig[i+1].buf),
-				    le32_to_cpu(np->tx_ring.orig[i+1].flaglen),
-				    le32_to_cpu(np->tx_ring.orig[i+2].buf),
-				    le32_to_cpu(np->tx_ring.orig[i+2].flaglen),
-				    le32_to_cpu(np->tx_ring.orig[i+3].buf),
-				    le32_to_cpu(np->tx_ring.orig[i+3].flaglen));
-		} else {
+	if (unlikely(debug_tx_timeout)) {
+		int i;
+
+		netdev_info(dev, "Ring at %lx\n", (unsigned long)np->ring_addr);
+		netdev_info(dev, "Dumping tx registers\n");
+		for (i = 0; i <= np->register_size; i += 32) {
 			netdev_info(dev,
-				    "%03x: %08x %08x %08x // %08x %08x %08x // %08x %08x %08x // %08x %08x %08x\n",
+				    "%3x: %08x %08x %08x %08x "
+				    "%08x %08x %08x %08x\n",
 				    i,
-				    le32_to_cpu(np->tx_ring.ex[i].bufhigh),
-				    le32_to_cpu(np->tx_ring.ex[i].buflow),
-				    le32_to_cpu(np->tx_ring.ex[i].flaglen),
-				    le32_to_cpu(np->tx_ring.ex[i+1].bufhigh),
-				    le32_to_cpu(np->tx_ring.ex[i+1].buflow),
-				    le32_to_cpu(np->tx_ring.ex[i+1].flaglen),
-				    le32_to_cpu(np->tx_ring.ex[i+2].bufhigh),
-				    le32_to_cpu(np->tx_ring.ex[i+2].buflow),
-				    le32_to_cpu(np->tx_ring.ex[i+2].flaglen),
-				    le32_to_cpu(np->tx_ring.ex[i+3].bufhigh),
-				    le32_to_cpu(np->tx_ring.ex[i+3].buflow),
-				    le32_to_cpu(np->tx_ring.ex[i+3].flaglen));
+				    readl(base + i + 0), readl(base + i + 4),
+				    readl(base + i + 8), readl(base + i + 12),
+				    readl(base + i + 16), readl(base + i + 20),
+				    readl(base + i + 24), readl(base + i + 28));
+		}
+		netdev_info(dev, "Dumping tx ring\n");
+		for (i = 0; i < np->tx_ring_size; i += 4) {
+			if (!nv_optimized(np)) {
+				netdev_info(dev,
+					    "%03x: %08x %08x // %08x %08x "
+					    "// %08x %08x // %08x %08x\n",
+					    i,
+					    le32_to_cpu(np->tx_ring.orig[i].buf),
+					    le32_to_cpu(np->tx_ring.orig[i].flaglen),
+					    le32_to_cpu(np->tx_ring.orig[i+1].buf),
+					    le32_to_cpu(np->tx_ring.orig[i+1].flaglen),
+					    le32_to_cpu(np->tx_ring.orig[i+2].buf),
+					    le32_to_cpu(np->tx_ring.orig[i+2].flaglen),
+					    le32_to_cpu(np->tx_ring.orig[i+3].buf),
+					    le32_to_cpu(np->tx_ring.orig[i+3].flaglen));
+			} else {
+				netdev_info(dev,
+					    "%03x: %08x %08x %08x "
+					    "// %08x %08x %08x "
+					    "// %08x %08x %08x "
+					    "// %08x %08x %08x\n",
+					    i,
+					    le32_to_cpu(np->tx_ring.ex[i].bufhigh),
+					    le32_to_cpu(np->tx_ring.ex[i].buflow),
+					    le32_to_cpu(np->tx_ring.ex[i].flaglen),
+					    le32_to_cpu(np->tx_ring.ex[i+1].bufhigh),
+					    le32_to_cpu(np->tx_ring.ex[i+1].buflow),
+					    le32_to_cpu(np->tx_ring.ex[i+1].flaglen),
+					    le32_to_cpu(np->tx_ring.ex[i+2].bufhigh),
+					    le32_to_cpu(np->tx_ring.ex[i+2].buflow),
+					    le32_to_cpu(np->tx_ring.ex[i+2].flaglen),
+					    le32_to_cpu(np->tx_ring.ex[i+3].bufhigh),
+					    le32_to_cpu(np->tx_ring.ex[i+3].buflow),
+					    le32_to_cpu(np->tx_ring.ex[i+3].flaglen));
+			}
 		}
 	}
 
@@ -6156,6 +6169,9 @@ module_param(phy_cross, int, 0);
 MODULE_PARM_DESC(phy_cross, "Phy crossover detection for Realtek 8201 phy is enabled by setting to 1 and disabled by setting to 0.");
 module_param(phy_power_down, int, 0);
 MODULE_PARM_DESC(phy_power_down, "Power down phy and disable link when interface is down (1), or leave phy powered up (0).");
+module_param(debug_tx_timeout, bool, 0);
+MODULE_PARM_DESC(debug_tx_timeout,
+		 "Dump tx related registers and ring when tx_timeout happens");
 
 MODULE_AUTHOR("Manfred Spraul <manfred@colorfullife.com>");
 MODULE_DESCRIPTION("Reverse Engineered nForce ethernet driver");
-- 
1.7.3.1

^ permalink raw reply related

* [PATCH net-next v5 07/10] forcedeth: implement ndo_get_stats64() API
From: David Decotigny @ 2011-11-16  5:15 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
	Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
	Richard Jones, Ayaz Abdulla, David Decotigny
In-Reply-To: <cover.1321420513.git.david.decotigny@google.com>

This commit implements the ndo_get_stats64() API for forcedeth. Since
these stats are being updated from different contexts (process and
timer), this commit adds protection (locking + atomic variables).

Tested:
  - 16-way SMP x86_64 ->
    RX bytes:7244556582 (7.2 GB)  TX bytes:181904254 (181.9 MB)
  - pktgen + loopback: identical rx_bytes/tx_bytes and rx_packets/tx_packets



Signed-off-by: David Decotigny <david.decotigny@google.com>
---
 drivers/net/ethernet/nvidia/forcedeth.c |  182 ++++++++++++++++++++++++-------
 1 files changed, 141 insertions(+), 41 deletions(-)

diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index 9b917ff..6aeb0d6 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -692,6 +692,21 @@ struct nv_ethtool_stats {
 #define NV_DEV_STATISTICS_V2_COUNT (NV_DEV_STATISTICS_V3_COUNT - 3)
 #define NV_DEV_STATISTICS_V1_COUNT (NV_DEV_STATISTICS_V2_COUNT - 6)
 
+/* driver statistics */
+struct nv_driver_stat {
+	atomic_t delta;  /* increase since last nv_update_stats() */
+	u64 total;  /* cumulative, requires netdev_priv(dev)->stats_lock */
+};
+
+#define NV_DRIVER_STAT_ATOMIC_INC(ptr_stat) /* atomic */ \
+	({ atomic_inc(&(ptr_stat)->delta); })
+#define NV_DRIVER_STAT_ATOMIC_ADD(ptr_stat,increment) /* atomic */	\
+	({ atomic_add((increment), &(ptr_stat)->delta); })
+#define NV_DRIVER_STAT_UPDATE_TOTAL(ptr_stat) /* requires stats_lock */ \
+	({ (ptr_stat)->total += atomic_xchg(&(ptr_stat)->delta, 0); })
+#define NV_DRIVER_STAT_GET_TOTAL(ptr_stat) /* requires stats_lock */ \
+	((ptr_stat)->total)
+
 /* diagnostics */
 #define NV_TEST_COUNT_BASE 3
 #define NV_TEST_COUNT_EXTENDED 4
@@ -736,6 +751,12 @@ struct nv_skb_map {
  * - tx setup is lockless: it relies on netif_tx_lock. Actual submission
  *	needs netdev_priv(dev)->lock :-(
  * - set_multicast_list: preparation lockless, relies on netif_tx_lock.
+ *
+ * Stats are protected with stats_lock:
+ * - updated by nv_do_stats_poll (timer). This is meant to avoid
+ *   integer wraparound in the NIC stats registers, at low frequency
+ *   (0.1 Hz)
+ * - updated by nv_get_ethtool_stats + nv_get_stats64
  */
 
 /* in dev: base, irq */
@@ -745,9 +766,10 @@ struct fe_priv {
 	struct net_device *dev;
 	struct napi_struct napi;
 
-	/* General data:
-	 * Locking: spin_lock(&np->lock); */
+	/* stats are updated in syscall and timer */
+	spinlock_t stats_lock;
 	struct nv_ethtool_stats estats;
+
 	int in_shutdown;
 	u32 linkspeed;
 	int duplex;
@@ -798,6 +820,11 @@ struct fe_priv {
 	u32 nic_poll_irq;
 	int rx_ring_size;
 
+	/* RX software stats */
+	struct nv_driver_stat stat_rx_packets;
+	struct nv_driver_stat stat_rx_bytes; /* not always available in HW */
+	struct nv_driver_stat stat_rx_missed_errors;
+
 	/* media detection workaround.
 	 * Locking: Within irq hander or disable_irq+spin_lock(&np->lock);
 	 */
@@ -820,6 +847,11 @@ struct fe_priv {
 	struct nv_skb_map *tx_end_flip;
 	int tx_stop;
 
+	/* TX software stats */
+	struct nv_driver_stat stat_tx_packets; /* not always available in HW */
+	struct nv_driver_stat stat_tx_bytes;
+	struct nv_driver_stat stat_tx_dropped;
+
 	/* msi/msi-x fields */
 	u32 msi_flags;
 	struct msix_entry msi_x_entry[NV_MSI_X_MAX_VECTORS];
@@ -1635,11 +1667,19 @@ static void nv_mac_reset(struct net_device *dev)
 	pci_push(base);
 }
 
-static void nv_get_hw_stats(struct net_device *dev)
+/* Caller must appropriately lock netdev_priv(dev)->stats_lock */
+static void nv_update_stats(struct net_device *dev)
 {
 	struct fe_priv *np = netdev_priv(dev);
 	u8 __iomem *base = get_hwbase(dev);
 
+	/* If it happens that this is run in top-half context, then
+	 * replace the spin_lock of stats_lock with
+	 * spin_lock_irqsave() in calling functions. */
+	WARN_ONCE(in_irq(), "forcedeth: estats spin_lock(_bh) from top-half");
+	assert_spin_locked(&np->stats_lock);
+
+	/* query hardware */
 	np->estats.tx_bytes += readl(base + NvRegTxCnt);
 	np->estats.tx_zero_rexmt += readl(base + NvRegTxZeroReXmt);
 	np->estats.tx_one_rexmt += readl(base + NvRegTxOneReXmt);
@@ -1695,21 +1735,35 @@ static void nv_get_hw_stats(struct net_device *dev)
 		np->estats.tx_multicast += readl(base + NvRegTxMulticast);
 		np->estats.tx_broadcast += readl(base + NvRegTxBroadcast);
 	}
+
+	/* update software stats */
+	NV_DRIVER_STAT_UPDATE_TOTAL(&np->stat_rx_packets);
+	NV_DRIVER_STAT_UPDATE_TOTAL(&np->stat_rx_bytes);
+	NV_DRIVER_STAT_UPDATE_TOTAL(&np->stat_rx_missed_errors);
+
+	NV_DRIVER_STAT_UPDATE_TOTAL(&np->stat_tx_packets);
+	NV_DRIVER_STAT_UPDATE_TOTAL(&np->stat_tx_bytes);
+	NV_DRIVER_STAT_UPDATE_TOTAL(&np->stat_tx_dropped);
 }
 
 /*
- * nv_get_stats: dev->get_stats function
+ * nv_get_stats64: dev->ndo_get_stats64 function
  * Get latest stats value from the nic.
  * Called with read_lock(&dev_base_lock) held for read -
  * only synchronized against unregister_netdevice.
  */
-static struct net_device_stats *nv_get_stats(struct net_device *dev)
+static struct rtnl_link_stats64*
+nv_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage)
+	__acquires(&netdev_priv(dev)->stats_lock)
+	__releases(&netdev_priv(dev)->stats_lock)
 {
 	struct fe_priv *np = netdev_priv(dev);
 
 	/* If the nic supports hw counters then retrieve latest values */
-	if (np->driver_data & (DEV_HAS_STATISTICS_V1|DEV_HAS_STATISTICS_V2|DEV_HAS_STATISTICS_V3)) {
-		nv_get_hw_stats(dev);
+	if (np->driver_data & DEV_HAS_STATISTICS_V123) {
+		spin_lock_bh(&np->stats_lock);
+
+		nv_update_stats(dev);
 
 		/*
 		 * Note: because HW stats are not always available and
@@ -1721,17 +1775,40 @@ static struct net_device_stats *nv_get_stats(struct net_device *dev)
 		 * packet (Ethernet FCS CRC).
 		 */
 
-		/* copy to net_device stats */
-		dev->stats.tx_fifo_errors = np->estats.tx_fifo_errors;
-		dev->stats.tx_carrier_errors = np->estats.tx_carrier_errors;
-		dev->stats.rx_crc_errors = np->estats.rx_crc_errors;
-		dev->stats.rx_over_errors = np->estats.rx_over_errors;
-		dev->stats.rx_fifo_errors = np->estats.rx_drop_frame;
-		dev->stats.rx_errors = np->estats.rx_errors_total;
-		dev->stats.tx_errors = np->estats.tx_errors_total;
-	}
-
-	return &dev->stats;
+		/* generic stats */
+		storage->rx_packets = NV_DRIVER_STAT_GET_TOTAL(
+			&np->stat_rx_packets);
+		storage->tx_packets = NV_DRIVER_STAT_GET_TOTAL(
+			&np->stat_tx_packets);
+		storage->rx_bytes   = NV_DRIVER_STAT_GET_TOTAL(
+			&np->stat_rx_bytes);
+		storage->tx_bytes   = NV_DRIVER_STAT_GET_TOTAL(
+			&np->stat_tx_bytes);
+		storage->rx_errors  = np->estats.rx_errors_total;
+		storage->tx_errors  = np->estats.tx_errors_total;
+		storage->tx_dropped = NV_DRIVER_STAT_GET_TOTAL(
+			&np->stat_tx_dropped);
+
+		/* meaningful only when NIC supports stats v3 */
+		storage->multicast  = np->estats.rx_multicast;
+
+		/* detailed rx_errors */
+		storage->rx_length_errors = np->estats.rx_length_error;
+		storage->rx_over_errors   = np->estats.rx_over_errors;
+		storage->rx_crc_errors    = np->estats.rx_crc_errors;
+		storage->rx_frame_errors  = np->estats.rx_frame_align_error;
+		storage->rx_fifo_errors   = np->estats.rx_drop_frame;
+		storage->rx_missed_errors = NV_DRIVER_STAT_GET_TOTAL(
+			&np->stat_rx_missed_errors);
+
+		/* detailed tx_errors */
+		storage->tx_carrier_errors = np->estats.tx_carrier_errors;
+		storage->tx_fifo_errors    = np->estats.tx_fifo_errors;
+
+		spin_unlock_bh(&np->stats_lock);
+	}
+
+	return storage;
 }
 
 /*
@@ -1933,7 +2010,7 @@ static void nv_drain_tx(struct net_device *dev)
 			np->tx_ring.ex[i].buflow = 0;
 		}
 		if (nv_release_txskb(np, &np->tx_skb[i]))
-			dev->stats.tx_dropped++;
+			NV_DRIVER_STAT_ATOMIC_INC(&np->stat_tx_dropped);
 		np->tx_skb[i].dma = 0;
 		np->tx_skb[i].dma_len = 0;
 		np->tx_skb[i].dma_single = 0;
@@ -2390,11 +2467,15 @@ static int nv_tx_done(struct net_device *dev, int limit)
 		if (np->desc_ver == DESC_VER_1) {
 			if (flags & NV_TX_LASTPACKET) {
 				if (flags & NV_TX_ERROR) {
-					if ((flags & NV_TX_RETRYERROR) && !(flags & NV_TX_RETRYCOUNT_MASK))
+					if ((flags & NV_TX_RETRYERROR)
+					    && !(flags & NV_TX_RETRYCOUNT_MASK))
 						nv_legacybackoff_reseed(dev);
 				} else {
-					dev->stats.tx_packets++;
-					dev->stats.tx_bytes += np->get_tx_ctx->skb->len;
+					NV_DRIVER_STAT_ATOMIC_INC(
+						&np->stat_tx_packets);
+					NV_DRIVER_STAT_ATOMIC_ADD(
+						&np->stat_tx_bytes,
+						np->get_tx_ctx->skb->len);
 				}
 				dev_kfree_skb_any(np->get_tx_ctx->skb);
 				np->get_tx_ctx->skb = NULL;
@@ -2403,11 +2484,15 @@ static int nv_tx_done(struct net_device *dev, int limit)
 		} else {
 			if (flags & NV_TX2_LASTPACKET) {
 				if (flags & NV_TX2_ERROR) {
-					if ((flags & NV_TX2_RETRYERROR) && !(flags & NV_TX2_RETRYCOUNT_MASK))
+					if ((flags & NV_TX2_RETRYERROR)
+					    && !(flags & NV_TX2_RETRYCOUNT_MASK))
 						nv_legacybackoff_reseed(dev);
 				} else {
-					dev->stats.tx_packets++;
-					dev->stats.tx_bytes += np->get_tx_ctx->skb->len;
+					NV_DRIVER_STAT_ATOMIC_INC(
+						&np->stat_tx_packets);
+					NV_DRIVER_STAT_ATOMIC_ADD(
+						&np->stat_tx_bytes,
+						np->get_tx_ctx->skb->len);
 				}
 				dev_kfree_skb_any(np->get_tx_ctx->skb);
 				np->get_tx_ctx->skb = NULL;
@@ -2441,15 +2526,18 @@ static int nv_tx_done_optimized(struct net_device *dev, int limit)
 
 		if (flags & NV_TX2_LASTPACKET) {
 			if (flags & NV_TX2_ERROR) {
-				if ((flags & NV_TX2_RETRYERROR) && !(flags & NV_TX2_RETRYCOUNT_MASK)) {
+				if ((flags & NV_TX2_RETRYERROR)
+				    && !(flags & NV_TX2_RETRYCOUNT_MASK)) {
 					if (np->driver_data & DEV_HAS_GEAR_MODE)
 						nv_gear_backoff_reseed(dev);
 					else
 						nv_legacybackoff_reseed(dev);
 				}
 			} else {
-				dev->stats.tx_packets++;
-				dev->stats.tx_bytes += np->get_tx_ctx->skb->len;
+				NV_DRIVER_STAT_ATOMIC_INC(&np->stat_tx_packets);
+				NV_DRIVER_STAT_ATOMIC_ADD(
+					&np->stat_tx_bytes,
+					np->get_tx_ctx->skb->len);
 			}
 
 			dev_kfree_skb_any(np->get_tx_ctx->skb);
@@ -2663,7 +2751,7 @@ static int nv_rx_process(struct net_device *dev, int limit)
 					/* the rest are hard errors */
 					else {
 						if (flags & NV_RX_MISSEDFRAME)
-							dev->stats.rx_missed_errors++;
+							NV_DRIVER_STAT_ATOMIC_INC(&np->stat_rx_missed_errors);
 						dev_kfree_skb(skb);
 						goto next_pkt;
 					}
@@ -2706,8 +2794,8 @@ static int nv_rx_process(struct net_device *dev, int limit)
 		skb_put(skb, len);
 		skb->protocol = eth_type_trans(skb, dev);
 		napi_gro_receive(&np->napi, skb);
-		dev->stats.rx_packets++;
-		dev->stats.rx_bytes += len;
+		NV_DRIVER_STAT_ATOMIC_INC(&np->stat_rx_packets);
+		NV_DRIVER_STAT_ATOMIC_ADD(&np->stat_rx_bytes, len);
 next_pkt:
 		if (unlikely(np->get_rx.orig++ == np->last_rx.orig))
 			np->get_rx.orig = np->first_rx.orig;
@@ -2790,8 +2878,8 @@ static int nv_rx_process_optimized(struct net_device *dev, int limit)
 				__vlan_hwaccel_put_tag(skb, vid);
 			}
 			napi_gro_receive(&np->napi, skb);
-			dev->stats.rx_packets++;
-			dev->stats.rx_bytes += len;
+			NV_DRIVER_STAT_ATOMIC_INC(&np->stat_rx_packets);
+			NV_DRIVER_STAT_ATOMIC_ADD(&np->stat_rx_bytes, len);
 		} else {
 			dev_kfree_skb(skb);
 		}
@@ -4000,11 +4088,18 @@ static void nv_poll_controller(struct net_device *dev)
 #endif
 
 static void nv_do_stats_poll(unsigned long data)
+	__acquires(&netdev_priv(dev)->stats_lock)
+	__releases(&netdev_priv(dev)->stats_lock)
 {
 	struct net_device *dev = (struct net_device *) data;
 	struct fe_priv *np = netdev_priv(dev);
 
-	nv_get_hw_stats(dev);
+	/* If lock is currently taken, the stats are being refreshed
+	 * and hence fresh enough */
+	if (spin_trylock(&np->stats_lock)) {
+		nv_update_stats(dev);
+		spin_unlock(&np->stats_lock);
+	}
 
 	if (!np->in_shutdown)
 		mod_timer(&np->stats_poll,
@@ -4711,14 +4806,18 @@ static int nv_get_sset_count(struct net_device *dev, int sset)
 	}
 }
 
-static void nv_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *estats, u64 *buffer)
+static void nv_get_ethtool_stats(struct net_device *dev,
+				 struct ethtool_stats *estats, u64 *buffer)
+	__acquires(&netdev_priv(dev)->stats_lock)
+	__releases(&netdev_priv(dev)->stats_lock)
 {
 	struct fe_priv *np = netdev_priv(dev);
 
-	/* update stats */
-	nv_get_hw_stats(dev);
-
-	memcpy(buffer, &np->estats, nv_get_sset_count(dev, ETH_SS_STATS)*sizeof(u64));
+	spin_lock_bh(&np->stats_lock);
+	nv_update_stats(dev);
+	memcpy(buffer, &np->estats,
+	       nv_get_sset_count(dev, ETH_SS_STATS)*sizeof(u64));
+	spin_unlock_bh(&np->stats_lock);
 }
 
 static int nv_link_test(struct net_device *dev)
@@ -5362,7 +5461,7 @@ static int nv_close(struct net_device *dev)
 static const struct net_device_ops nv_netdev_ops = {
 	.ndo_open		= nv_open,
 	.ndo_stop		= nv_close,
-	.ndo_get_stats		= nv_get_stats,
+	.ndo_get_stats64	= nv_get_stats64,
 	.ndo_start_xmit		= nv_start_xmit,
 	.ndo_tx_timeout		= nv_tx_timeout,
 	.ndo_change_mtu		= nv_change_mtu,
@@ -5379,7 +5478,7 @@ static const struct net_device_ops nv_netdev_ops = {
 static const struct net_device_ops nv_netdev_ops_optimized = {
 	.ndo_open		= nv_open,
 	.ndo_stop		= nv_close,
-	.ndo_get_stats		= nv_get_stats,
+	.ndo_get_stats64	= nv_get_stats64,
 	.ndo_start_xmit		= nv_start_xmit_optimized,
 	.ndo_tx_timeout		= nv_tx_timeout,
 	.ndo_change_mtu		= nv_change_mtu,
@@ -5418,6 +5517,7 @@ static int __devinit nv_probe(struct pci_dev *pci_dev, const struct pci_device_i
 	np->dev = dev;
 	np->pci_dev = pci_dev;
 	spin_lock_init(&np->lock);
+	spin_lock_init(&np->stats_lock);
 	SET_NETDEV_DEV(dev, &pci_dev->dev);
 
 	init_timer(&np->oom_kick);
-- 
1.7.3.1

^ permalink raw reply related

* [PATCH net-next v5 03/10] kbuild: document RPS/XPS network Kconfig options
From: David Decotigny @ 2011-11-16  5:15 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
	Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
	Richard Jones, Ayaz Abdulla, David Decotigny
In-Reply-To: <cover.1321420513.git.david.decotigny@google.com>

This adds a description of the RPS/XPS options and allows them to be
changed at make menuconfig time.



Signed-off-by: David Decotigny <david.decotigny@google.com>
---
 net/Kconfig |   19 ++++++++++++++++---
 1 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/net/Kconfig b/net/Kconfig
index a073148..1422c34 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -217,20 +217,33 @@ source "net/dns_resolver/Kconfig"
 source "net/batman-adv/Kconfig"
 
 config RPS
-	boolean
+	boolean "Enable Receive Packet Steering"
 	depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
 	default y
+	help
+	  RPS distributes the load of received packet processing
+	  across multiple CPUs. If unsure, say Y.
 
 config RFS_ACCEL
-	boolean
+	boolean "Enable Hardware Acceleration of RFS"
 	depends on RPS && GENERIC_HARDIRQS
 	select CPU_RMAP
 	default y
+	help
+	  This is the hardware version of RPS. On multi-queue network
+	  devices, this configures the hardware to distribute the
+	  received packets across multiple CPUs. If unsure, say Y.
 
 config XPS
-	boolean
+	boolean "Enable Transmit Packet Steering"
 	depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
 	default y
+	help
+	  For multiqueue devices, XPS selects a transmit queue during
+	  packet transmission based on configuration. This is done by
+	  mapping the CPU transmitting the packet to a queue. XPS can
+	  reduce transmit network latency on SMP systems. If unsure,
+	  say Y.
 
 config HAVE_BPF_JIT
 	bool
-- 
1.7.3.1

^ permalink raw reply related

* [PATCH net-next v5 02/10] net-sysfs: fixed minor sparse warning
From: David Decotigny @ 2011-11-16  5:15 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
	Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
	Richard Jones, Ayaz Abdulla, David Decotigny
In-Reply-To: <cover.1321420513.git.david.decotigny@google.com>

This commit fixes the following sparse warning:
net/core/net-sysfs.c:921:6: warning: symbol 'numa_node' shadows an earlier one
include/linux/topology.h:222:1: originally declared here



Signed-off-by: David Decotigny <david.decotigny@google.com>
---
 net/core/net-sysfs.c |   12 ++++++------
 1 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index c71c434..a64382f 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -901,7 +901,7 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
 	struct xps_map *map, *new_map;
 	struct xps_dev_maps *dev_maps, *new_dev_maps;
 	int nonempty = 0;
-	int numa_node = -2;
+	int numa_node_id = -2;
 
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
@@ -944,10 +944,10 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
 		need_set = cpumask_test_cpu(cpu, mask) && cpu_online(cpu);
 #ifdef CONFIG_NUMA
 		if (need_set) {
-			if (numa_node == -2)
-				numa_node = cpu_to_node(cpu);
-			else if (numa_node != cpu_to_node(cpu))
-				numa_node = -1;
+			if (numa_node_id == -2)
+				numa_node_id = cpu_to_node(cpu);
+			else if (numa_node_id != cpu_to_node(cpu))
+				numa_node_id = -1;
 		}
 #endif
 		if (need_set && pos >= map_len) {
@@ -997,7 +997,7 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
 	if (dev_maps)
 		kfree_rcu(dev_maps, rcu);
 
-	netdev_queue_numa_node_write(queue, (numa_node >= 0) ? numa_node :
+	netdev_queue_numa_node_write(queue, (numa_node_id >= 0) ? numa_node_id :
 					    NUMA_NO_NODE);
 
 	mutex_unlock(&xps_map_mutex);
-- 
1.7.3.1

^ permalink raw reply related

* [PATCH net-next v5 05/10] forcedeth: Add messages to indicate using MSI or MSI-X
From: David Decotigny @ 2011-11-16  5:15 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
	Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
	Richard Jones, Ayaz Abdulla, Mike Ditto, David Decotigny
In-Reply-To: <cover.1321420513.git.david.decotigny@google.com>

From: Mike Ditto <mditto@google.com>

This adds a few kernel messages to indicate whether PCIe interrupts
are signaled with MSI or MSI-X.



Signed-off-by: David Decotigny <david.decotigny@google.com>
---
 drivers/net/ethernet/nvidia/forcedeth.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index 374625b..fe17e42 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -3810,6 +3810,7 @@ static int nv_request_irq(struct net_device *dev, int intr_test)
 				writel(0, base + NvRegMSIXMap0);
 				writel(0, base + NvRegMSIXMap1);
 			}
+			netdev_info(dev, "MSI-X enabled\n");
 		}
 	}
 	if (ret != 0 && np->msi_flags & NV_MSI_CAPABLE) {
@@ -3831,6 +3832,7 @@ static int nv_request_irq(struct net_device *dev, int intr_test)
 			writel(0, base + NvRegMSIMap1);
 			/* enable msi vector 0 */
 			writel(NVREG_MSI_VECTOR_0_ENABLED, base + NvRegMSIIrqMask);
+			netdev_info(dev, "MSI enabled\n");
 		}
 	}
 	if (ret != 0) {
-- 
1.7.3.1

^ permalink raw reply related

* [PATCH net-next v5 10/10] forcedeth: whitespace/indentation fixes
From: David Decotigny @ 2011-11-16  5:15 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
	Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
	Richard Jones, Ayaz Abdulla, David Decotigny
In-Reply-To: <cover.1321420513.git.david.decotigny@google.com>



Signed-off-by: David Decotigny <david.decotigny@google.com>
---
 drivers/net/ethernet/nvidia/forcedeth.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index 5d436b5..973cf79 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -65,7 +65,7 @@
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/prefetch.h>
-#include  <linux/io.h>
+#include <linux/io.h>
 
 #include <asm/irq.h>
 #include <asm/system.h>
-- 
1.7.3.1

^ permalink raw reply related

* [PATCH net-next v5 09/10] forcedeth: stats updated with a deferrable timer
From: David Decotigny @ 2011-11-16  5:15 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
	Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
	Richard Jones, Ayaz Abdulla, David Decotigny
In-Reply-To: <cover.1321420513.git.david.decotigny@google.com>

Mark the stats timer as deferrable: punctuality in waking the stats
timer callback doesn't matter much, as its only job is to avoid
integer wraparound.

We need at least 1 other timer to fire within 17s (fully loaded 1Gbps)
to avoid wrap-arounds. Desired period is still 10s.



Signed-off-by: David Decotigny <david.decotigny@google.com>
---
 drivers/net/ethernet/nvidia/forcedeth.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index d9b5a4d..5d436b5 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -5534,7 +5534,7 @@ static int __devinit nv_probe(struct pci_dev *pci_dev, const struct pci_device_i
 	init_timer(&np->nic_poll);
 	np->nic_poll.data = (unsigned long) dev;
 	np->nic_poll.function = nv_do_nic_poll;	/* timer handler */
-	init_timer(&np->stats_poll);
+	init_timer_deferrable(&np->stats_poll);
 	np->stats_poll.data = (unsigned long) dev;
 	np->stats_poll.function = nv_do_stats_poll;	/* timer handler */
 
-- 
1.7.3.1

^ permalink raw reply related

* [PATCH net-next v5 08/10] forcedeth: account for dropped RX frames
From: David Decotigny @ 2011-11-16  5:15 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
	Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
	Richard Jones, Ayaz Abdulla, David Decotigny
In-Reply-To: <cover.1321420513.git.david.decotigny@google.com>

This adds code to update the stats counter for dropped RX frames.



Signed-off-by: David Decotigny <david.decotigny@google.com>
---
 drivers/net/ethernet/nvidia/forcedeth.c |   12 ++++++++++--
 1 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index 6aeb0d6..d9b5a4d 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -824,6 +824,7 @@ struct fe_priv {
 	struct nv_driver_stat stat_rx_packets;
 	struct nv_driver_stat stat_rx_bytes; /* not always available in HW */
 	struct nv_driver_stat stat_rx_missed_errors;
+	struct nv_driver_stat stat_rx_dropped;
 
 	/* media detection workaround.
 	 * Locking: Within irq hander or disable_irq+spin_lock(&np->lock);
@@ -1739,6 +1740,7 @@ static void nv_update_stats(struct net_device *dev)
 	/* update software stats */
 	NV_DRIVER_STAT_UPDATE_TOTAL(&np->stat_rx_packets);
 	NV_DRIVER_STAT_UPDATE_TOTAL(&np->stat_rx_bytes);
+	NV_DRIVER_STAT_UPDATE_TOTAL(&np->stat_rx_dropped);
 	NV_DRIVER_STAT_UPDATE_TOTAL(&np->stat_rx_missed_errors);
 
 	NV_DRIVER_STAT_UPDATE_TOTAL(&np->stat_tx_packets);
@@ -1786,6 +1788,8 @@ nv_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage)
 			&np->stat_tx_bytes);
 		storage->rx_errors  = np->estats.rx_errors_total;
 		storage->tx_errors  = np->estats.tx_errors_total;
+		storage->rx_dropped = NV_DRIVER_STAT_GET_TOTAL(
+			&np->stat_rx_dropped);
 		storage->tx_dropped = NV_DRIVER_STAT_GET_TOTAL(
 			&np->stat_tx_dropped);
 
@@ -1841,8 +1845,10 @@ static int nv_alloc_rx(struct net_device *dev)
 				np->put_rx.orig = np->first_rx.orig;
 			if (unlikely(np->put_rx_ctx++ == np->last_rx_ctx))
 				np->put_rx_ctx = np->first_rx_ctx;
-		} else
+		} else {
+			NV_DRIVER_STAT_ATOMIC_INC(&np->stat_rx_dropped);
 			return 1;
+		}
 	}
 	return 0;
 }
@@ -1873,8 +1879,10 @@ static int nv_alloc_rx_optimized(struct net_device *dev)
 				np->put_rx.ex = np->first_rx.ex;
 			if (unlikely(np->put_rx_ctx++ == np->last_rx_ctx))
 				np->put_rx_ctx = np->first_rx_ctx;
-		} else
+		} else {
+			NV_DRIVER_STAT_ATOMIC_INC(&np->stat_rx_dropped);
 			return 1;
+		}
 	}
 	return 0;
 }
-- 
1.7.3.1

^ permalink raw reply related

* [PATCH net-next v5 04/10] net: provide counter for tx_timeout errors in sysfs
From: David Decotigny @ 2011-11-16  5:15 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
	Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
	Richard Jones, Ayaz Abdulla, David Decotigny
In-Reply-To: <cover.1321420513.git.david.decotigny@google.com>

This adds the /sys/class/net/DEV/queues/Q/tx_timeout attribute
containing the total number of timeout events on the given queue. It
is always available, independently of CONFIG_RPS/XPS.

Credits to Stephen Hemminger for a preliminary version of this patch.

Tested:
  without CONFIG_SYSFS (compilation only)
  with sysfs and without CONFIG_RPS & CONFIG_XPS
  with sysfs and without CONFIG_RPS
  with sysfs and without CONFIG_XPS
  with defaults



Signed-off-by: David Decotigny <david.decotigny@google.com>
---
 include/linux/netdevice.h |   12 ++++++++++--
 net/core/net-sysfs.c      |   37 +++++++++++++++++++++++++++++++------
 net/sched/sch_generic.c   |    1 +
 3 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cbeb586..9e6cf09 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -530,7 +530,7 @@ struct netdev_queue {
 	struct Qdisc		*qdisc;
 	unsigned long		state;
 	struct Qdisc		*qdisc_sleeping;
-#if defined(CONFIG_RPS) || defined(CONFIG_XPS)
+#ifdef CONFIG_SYSFS
 	struct kobject		kobj;
 #endif
 #if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
@@ -545,6 +545,12 @@ struct netdev_queue {
 	 * please use this field instead of dev->trans_start
 	 */
 	unsigned long		trans_start;
+
+	/*
+	 * Number of TX timeouts for this queue
+	 * (/sys/class/net/DEV/queues/Q/tx_timeout)
+	 */
+	unsigned long		trans_timeout;
 } ____cacheline_aligned_in_smp;
 
 static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
@@ -1184,9 +1190,11 @@ struct net_device {
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 
-#if defined(CONFIG_RPS) || defined(CONFIG_XPS)
+#ifdef CONFIG_SYSFS
 	struct kset		*queues_kset;
+#endif
 
+#ifdef CONFIG_RPS
 	struct netdev_rx_queue	*_rx;
 
 	/* Number of RX queues allocated at register_netdev() time */
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index a64382f..602b141 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -780,7 +780,7 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
 #endif
 }
 
-#ifdef CONFIG_XPS
+#ifdef CONFIG_SYSFS
 /*
  * netdev_queue sysfs structures and functions.
  */
@@ -826,6 +826,23 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = {
 	.store = netdev_queue_attr_store,
 };
 
+static ssize_t show_trans_timeout(struct netdev_queue *queue,
+				  struct netdev_queue_attribute *attribute,
+				  char *buf)
+{
+	unsigned long trans_timeout;
+
+	spin_lock_irq(&queue->_xmit_lock);
+	trans_timeout = queue->trans_timeout;
+	spin_unlock_irq(&queue->_xmit_lock);
+
+	return sprintf(buf, "%lu\n", trans_timeout);
+}
+
+static struct netdev_queue_attribute queue_trans_timeout =
+	__ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL);
+
+#ifdef CONFIG_XPS
 static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
 {
 	struct net_device *dev = queue->dev;
@@ -1020,12 +1037,17 @@ error:
 
 static struct netdev_queue_attribute xps_cpus_attribute =
     __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map);
+#endif /* CONFIG_XPS */
 
 static struct attribute *netdev_queue_default_attrs[] = {
+	&queue_trans_timeout.attr,
+#ifdef CONFIG_XPS
 	&xps_cpus_attribute.attr,
+#endif
 	NULL
 };
 
+#ifdef CONFIG_XPS
 static void netdev_queue_release(struct kobject *kobj)
 {
 	struct netdev_queue *queue = to_netdev_queue(kobj);
@@ -1076,10 +1098,13 @@ static void netdev_queue_release(struct kobject *kobj)
 	memset(kobj, 0, sizeof(*kobj));
 	dev_put(queue->dev);
 }
+#endif /* CONFIG_XPS */
 
 static struct kobj_type netdev_queue_ktype = {
 	.sysfs_ops = &netdev_queue_sysfs_ops,
+#ifdef CONFIG_XPS
 	.release = netdev_queue_release,
+#endif
 	.default_attrs = netdev_queue_default_attrs,
 };
 
@@ -1102,12 +1127,12 @@ static int netdev_queue_add_kobject(struct net_device *net, int index)
 
 	return error;
 }
-#endif /* CONFIG_XPS */
+#endif /* CONFIG_SYSFS */
 
 int
 netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
 {
-#ifdef CONFIG_XPS
+#ifdef CONFIG_SYSFS
 	int i;
 	int error = 0;
 
@@ -1125,14 +1150,14 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
 	return error;
 #else
 	return 0;
-#endif
+#endif /* CONFIG_SYSFS */
 }
 
 static int register_queue_kobjects(struct net_device *net)
 {
 	int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0;
 
-#if defined(CONFIG_RPS) || defined(CONFIG_XPS)
+#ifdef CONFIG_SYSFS
 	net->queues_kset = kset_create_and_add("queues",
 	    NULL, &net->dev.kobj);
 	if (!net->queues_kset)
@@ -1173,7 +1198,7 @@ static void remove_queue_kobjects(struct net_device *net)
 
 	net_rx_queue_update_kobjects(net, real_rx, 0);
 	netdev_queue_update_kobjects(net, real_tx, 0);
-#if defined(CONFIG_RPS) || defined(CONFIG_XPS)
+#ifdef CONFIG_SYSFS
 	kset_unregister(net->queues_kset);
 #endif
 }
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 69fca27..79ac145 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -246,6 +246,7 @@ static void dev_watchdog(unsigned long arg)
 				    time_after(jiffies, (trans_start +
 							 dev->watchdog_timeo))) {
 					some_queue_timedout = 1;
+					txq->trans_timeout++;
 					break;
 				}
 			}
-- 
1.7.3.1

^ permalink raw reply related

* [PATCH net-next v5 01/10] forcedeth: fix stats on hardware without extended stats support
From: David Decotigny @ 2011-11-16  5:15 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
	Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
	Richard Jones, Ayaz Abdulla, david decotigny
In-Reply-To: <cover.1321420513.git.david.decotigny@google.com>

From: david decotigny <david.decotigny@google.com>

This change makes sure that tx_packets/rx_bytes ifconfig counters are
updated even on NICs that don't provide hardware support for these
stats: they are now updated in software. For the sake of consistency,
we also now have tx_bytes updated in software (hardware counters
include ethernet CRC, and software doesn't account for it).

This reverts parts of:
 - "forcedeth: statistics optimization" (21828163b2)
 - "forcedeth: Improve stats counters" (0bdfea8ba8)
 - "forcedeth: remove unneeded stats updates" (4687f3f364)

Tested:
  pktgen + loopback (http://patchwork.ozlabs.org/patch/124698/)
  reports identical tx_packets/rx_packets and tx_bytes/rx_bytes.

Signed-off-by: David Decotigny <david.decotigny@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 898bdf2cb43eb0a962c397eb4dd1aec2c7211be2)


---
 drivers/net/ethernet/nvidia/forcedeth.c |   36 +++++++++++++++++++++++-------
 1 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index e8a5ae3..374625b 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -609,7 +609,7 @@ struct nv_ethtool_str {
 };
 
 static const struct nv_ethtool_str nv_estats_str[] = {
-	{ "tx_bytes" },
+	{ "tx_bytes" }, /* includes Ethernet FCS CRC */
 	{ "tx_zero_rexmt" },
 	{ "tx_one_rexmt" },
 	{ "tx_many_rexmt" },
@@ -637,7 +637,7 @@ static const struct nv_ethtool_str nv_estats_str[] = {
 	/* version 2 stats */
 	{ "tx_deferral" },
 	{ "tx_packets" },
-	{ "rx_bytes" },
+	{ "rx_bytes" }, /* includes Ethernet FCS CRC */
 	{ "tx_pause" },
 	{ "rx_pause" },
 	{ "rx_drop_frame" },
@@ -649,7 +649,7 @@ static const struct nv_ethtool_str nv_estats_str[] = {
 };
 
 struct nv_ethtool_stats {
-	u64 tx_bytes;
+	u64 tx_bytes; /* should be ifconfig->tx_bytes + 4*tx_packets */
 	u64 tx_zero_rexmt;
 	u64 tx_one_rexmt;
 	u64 tx_many_rexmt;
@@ -670,14 +670,14 @@ struct nv_ethtool_stats {
 	u64 rx_unicast;
 	u64 rx_multicast;
 	u64 rx_broadcast;
-	u64 rx_packets;
+	u64 rx_packets; /* should be ifconfig->rx_packets */
 	u64 rx_errors_total;
 	u64 tx_errors_total;
 
 	/* version 2 stats */
 	u64 tx_deferral;
-	u64 tx_packets;
-	u64 rx_bytes;
+	u64 tx_packets; /* should be ifconfig->tx_packets */
+	u64 rx_bytes;   /* should be ifconfig->rx_bytes + 4*rx_packets */
 	u64 tx_pause;
 	u64 rx_pause;
 	u64 rx_drop_frame;
@@ -1706,10 +1706,17 @@ static struct net_device_stats *nv_get_stats(struct net_device *dev)
 	if (np->driver_data & (DEV_HAS_STATISTICS_V1|DEV_HAS_STATISTICS_V2|DEV_HAS_STATISTICS_V3)) {
 		nv_get_hw_stats(dev);
 
+		/*
+		 * Note: because HW stats are not always available and
+		 * for consistency reasons, the following ifconfig
+		 * stats are managed by software: rx_bytes, tx_bytes,
+		 * rx_packets and tx_packets. The related hardware
+		 * stats reported by ethtool should be equivalent to
+		 * these ifconfig stats, with 4 additional bytes per
+		 * packet (Ethernet FCS CRC).
+		 */
+
 		/* copy to net_device stats */
-		dev->stats.tx_packets = np->estats.tx_packets;
-		dev->stats.rx_bytes = np->estats.rx_bytes;
-		dev->stats.tx_bytes = np->estats.tx_bytes;
 		dev->stats.tx_fifo_errors = np->estats.tx_fifo_errors;
 		dev->stats.tx_carrier_errors = np->estats.tx_carrier_errors;
 		dev->stats.rx_crc_errors = np->estats.rx_crc_errors;
@@ -2380,6 +2387,9 @@ static int nv_tx_done(struct net_device *dev, int limit)
 				if (flags & NV_TX_ERROR) {
 					if ((flags & NV_TX_RETRYERROR) && !(flags & NV_TX_RETRYCOUNT_MASK))
 						nv_legacybackoff_reseed(dev);
+				} else {
+					dev->stats.tx_packets++;
+					dev->stats.tx_bytes += np->get_tx_ctx->skb->len;
 				}
 				dev_kfree_skb_any(np->get_tx_ctx->skb);
 				np->get_tx_ctx->skb = NULL;
@@ -2390,6 +2400,9 @@ static int nv_tx_done(struct net_device *dev, int limit)
 				if (flags & NV_TX2_ERROR) {
 					if ((flags & NV_TX2_RETRYERROR) && !(flags & NV_TX2_RETRYCOUNT_MASK))
 						nv_legacybackoff_reseed(dev);
+				} else {
+					dev->stats.tx_packets++;
+					dev->stats.tx_bytes += np->get_tx_ctx->skb->len;
 				}
 				dev_kfree_skb_any(np->get_tx_ctx->skb);
 				np->get_tx_ctx->skb = NULL;
@@ -2429,6 +2442,9 @@ static int nv_tx_done_optimized(struct net_device *dev, int limit)
 					else
 						nv_legacybackoff_reseed(dev);
 				}
+			} else {
+				dev->stats.tx_packets++;
+				dev->stats.tx_bytes += np->get_tx_ctx->skb->len;
 			}
 
 			dev_kfree_skb_any(np->get_tx_ctx->skb);
@@ -2678,6 +2694,7 @@ static int nv_rx_process(struct net_device *dev, int limit)
 		skb->protocol = eth_type_trans(skb, dev);
 		napi_gro_receive(&np->napi, skb);
 		dev->stats.rx_packets++;
+		dev->stats.rx_bytes += len;
 next_pkt:
 		if (unlikely(np->get_rx.orig++ == np->last_rx.orig))
 			np->get_rx.orig = np->first_rx.orig;
@@ -2761,6 +2778,7 @@ static int nv_rx_process_optimized(struct net_device *dev, int limit)
 			}
 			napi_gro_receive(&np->napi, skb);
 			dev->stats.rx_packets++;
+			dev->stats.rx_bytes += len;
 		} else {
 			dev_kfree_skb(skb);
 		}
-- 
1.7.3.1

^ permalink raw reply related

* [PATCH net-next v5 00/10] net-sysfs+forcedeth: stats & debug enhancements
From: David Decotigny @ 2011-11-16  5:15 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
	Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
	Richard Jones, Ayaz Abdulla, David Decotigny

These changes provide a new tx_timeout sysfs attribute and implement
the ndo_get_stats64 API. They also add a few more stats and debugging
features for forcedeth. They ensure that stats updates are correct in
SMP systems, 32 or 64-bits.

Note: patch 1 is the cherry-pick of 898bdf2cb43e ("forcedeth: fix
stats on hardware without extended stats support")

Changes since v4:
  - tx_timeout counter now a generic sysfs attribute. Credits to
    Stephen Hemminger for initial implementation
  - revert get_stats64 to using atomic variables: see
    http://patchwork.ozlabs.org/patch/125861/ for motivations
  - dropped patch "expose module parameters in /sys/module" for now: I
    will work on this later, following Stephen's recommendations
    (http://patchwork.ozlabs.org/patch/125862/).

Changes since v3:
  - updated get_stats64 + rx_dropped patches to use u64_stats_sync.h
  - dropped indentation "whitespace/indentation fixes" (included in
    get_stats64 api patch)

Changes since v2:
  - patch 1/9 is the cherry-pick of 898bdf2cb43e ("forcedeth: fix
    stats on hardware without extended stats support")
  - removed patch 5/10 "stats for rx_packets based on hardware
    registers" because packets&bytes stats are updated in software
    only (898bdf2cb43e)

Changes since v1:
  - patch 1/10 is the same as
    http://patchwork.ozlabs.org/patch/125017/ (targeting net)
  - other patches updated to take patch 1/10 into account
  - various commit message updates


Tested:
  ~150Mbps incoming TCP, ethtool -S in a loop, x86_64 16-way:
     tx_bytes: 5441989419
     rx_packets: 5439224
     tx_timeout: 0
     tx_packets: 5456705
     rx_bytes: 5566763850

Tested:
  pktgen + loopback report same RX/TX packets and bytes stats

Tested:
  tests above with Kconfig DEBUG_PAGEALLOC DEBUG_MUTEXES
  DEBUG_SPINLOCK LOCKUP_DETECTOR DEBUG_RT_MUTEXES DEBUG_LOCK_ALLOC
  PROVE_LOCKING DEBUG_ATOMIC_SLEEP DEBUG_STACK_USAGE DEBUG_KOBJECT
  DEBUG_VM DEBUG_LIST DEBUG_SG DEBUG_NOTIFIERS TEST_KSTRTOX
  STRICT_DEVMEM DEBUG_STACKOVERFLOW


############################################
# Patch Set Summary:

David Decotigny (7):
  net-sysfs: fixed minor sparse warning
  kbuild: document RPS/XPS network Kconfig options
  net: provide counter for tx_timeout errors in sysfs
  forcedeth: implement ndo_get_stats64() API
  forcedeth: account for dropped RX frames
  forcedeth: stats updated with a deferrable timer
  forcedeth: whitespace/indentation fixes

Mike Ditto (1):
  forcedeth: Add messages to indicate using MSI or MSI-X

Sameer Nanda (1):
  forcedeth: allow to silence "TX timeout" debug messages

david decotigny (1):
  forcedeth: fix stats on hardware without extended stats support

 drivers/net/ethernet/nvidia/forcedeth.c |  320 ++++++++++++++++++++++---------
 include/linux/netdevice.h               |   12 +-
 net/Kconfig                             |   16 ++-
 net/core/net-sysfs.c                    |   49 ++++--
 net/sched/sch_generic.c                 |    1 +
 5 files changed, 293 insertions(+), 105 deletions(-)

-- 
1.7.3.1

^ permalink raw reply

* [PATCH net-next v4 00/10] net-sysfs+forcedeth: stats & debug enhancements
From: David Decotigny @ 2011-11-16  5:13 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
	Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
	Richard Jones, Ayaz Abdulla, David Decotigny

These changes provide a new tx_timeout sysfs attribute and implement
the ndo_get_stats64 API. They also add a few more stats and debugging
features for forcedeth. They ensure that stats updates are correct in
SMP systems, 32 or 64-bits.

Note: patch 1 is the cherry-pick of 898bdf2cb43e ("forcedeth: fix
stats on hardware without extended stats support")

Changes since v4:
  - tx_timeout counter now a generic sysfs attribute. Credits to
    Stephen Hemminger for initial implementation
  - revert get_stats64 to using atomic variables: see
    http://patchwork.ozlabs.org/patch/125861/ for motivations
  - dropped patch "expose module parameters in /sys/module" for now: I
    will work on this later, following Stephen's recommendations
    (http://patchwork.ozlabs.org/patch/125862/).

Changes since v3:
  - updated get_stats64 + rx_dropped patches to use u64_stats_sync.h
  - dropped indentation "whitespace/indentation fixes" (included in
    get_stats64 api patch)

Changes since v2:
  - patch 1/9 is the cherry-pick of 898bdf2cb43e ("forcedeth: fix
    stats on hardware without extended stats support")
  - removed patch 5/10 "stats for rx_packets based on hardware
    registers" because packets&bytes stats are updated in software
    only (898bdf2cb43e)

Changes since v1:
  - patch 1/10 is the same as
    http://patchwork.ozlabs.org/patch/125017/ (targeting net)
  - other patches updated to take patch 1/10 into account
  - various commit message updates


Tested:
  ~150Mbps incoming TCP, ethtool -S in a loop, x86_64 16-way:
     tx_bytes: 5441989419
     rx_packets: 5439224
     tx_timeout: 0
     tx_packets: 5456705
     rx_bytes: 5566763850

Tested:
  pktgen + loopback report same RX/TX packets and bytes stats

Tested:
  tests above with Kconfig DEBUG_PAGEALLOC DEBUG_MUTEXES
  DEBUG_SPINLOCK LOCKUP_DETECTOR DEBUG_RT_MUTEXES DEBUG_LOCK_ALLOC
  PROVE_LOCKING DEBUG_ATOMIC_SLEEP DEBUG_STACK_USAGE DEBUG_KOBJECT
  DEBUG_VM DEBUG_LIST DEBUG_SG DEBUG_NOTIFIERS TEST_KSTRTOX
  STRICT_DEVMEM DEBUG_STACKOVERFLOW


############################################
# Patch Set Summary:

David Decotigny (7):
  net-sysfs: fixed minor sparse warning
  kbuild: document RPS/XPS network Kconfig options
  net: provide counter for tx_timeout errors in sysfs
  forcedeth: implement ndo_get_stats64() API
  forcedeth: account for dropped RX frames
  forcedeth: stats updated with a deferrable timer
  forcedeth: whitespace/indentation fixes

Mike Ditto (1):
  forcedeth: Add messages to indicate using MSI or MSI-X

Sameer Nanda (1):
  forcedeth: allow to silence "TX timeout" debug messages

david decotigny (1):
  forcedeth: fix stats on hardware without extended stats support

 drivers/net/ethernet/nvidia/forcedeth.c |  320 ++++++++++++++++++++++---------
 include/linux/netdevice.h               |   12 +-
 net/Kconfig                             |   16 ++-
 net/core/net-sysfs.c                    |   49 ++++--
 net/sched/sch_generic.c                 |    1 +
 5 files changed, 293 insertions(+), 105 deletions(-)

-- 
1.7.3.1

^ permalink raw reply

* [PATCH iproute2] ss: report ecnseen
From: Eric Dumazet @ 2011-11-16  4:51 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <1318389501.3686.8.camel@edumazet-laptop>

Support ECNSEEN reporting in ss command.

ESTAB      0      0           10.170.73.123:4900
10.170.73.125:51001    uid:501 ino:385994 sk:f31e5f00
         mem:(r0,w0,f0,t0) ts sack ecn ecnseen bic wscale:8,8 rto:210
rtt:18.75/15 ato:40 cwnd:10 send 69.9Mbps rcv_space:32768

"ecn" means the TCP session negotiated ECN capability (TCP layer) at
setup time.

"ecnseen" means at least one frame with ECT(0) or ECT(1) or ECN (IP
layer) was received from peer.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/netinet/tcp.h |    1 +
 misc/ss.c             |    2 ++
 2 files changed, 3 insertions(+)

diff --git a/include/netinet/tcp.h b/include/netinet/tcp.h
index 282b29c..95a4fe6 100644
--- a/include/netinet/tcp.h
+++ b/include/netinet/tcp.h
@@ -172,6 +172,7 @@ enum
 # define TCPI_OPT_SACK		2
 # define TCPI_OPT_WSCALE	4
 # define TCPI_OPT_ECN		8
+# define TCPI_OPT_ECNSEEN	16
 
 /* Values for tcpi_state.  */
 enum tcp_ca_state
diff --git a/misc/ss.c b/misc/ss.c
index b00841b..778bf0a 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -1357,6 +1357,8 @@ static void tcp_show_info(const struct nlmsghdr *nlh, struct inet_diag_msg *r)
 				printf(" sack");
 			if (info->tcpi_options & TCPI_OPT_ECN)
 				printf(" ecn");
+			if (info->tcpi_options & TCPI_OPT_ECNSEEN)
+				printf(" ecnseen");
 		}
 
 		if (tb[INET_DIAG_CONG])

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox