Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH] myri10ge: 64bit slice stats
From: Eric Dumazet @ 2011-06-20 18:52 UTC (permalink / raw)
  To: Jon Mason; +Cc: David Miller, netdev, Andrew Gallatin
In-Reply-To: <1308595758.2658.16.camel@edumazet-laptop>

Le lundi 20 juin 2011 à 20:49 +0200, Eric Dumazet a écrit :

> Hmm, no sorry, this wont work on 32bit arches :(
> 
> Please take a look at various patches addressing this problem.
> 
> 

For example take a look at the once sent some hours ago :

http://patchwork.ozlabs.org/patch/101128/




^ permalink raw reply

* Re: [PATCH] myri10ge: 64bit slice stats
From: Eric Dumazet @ 2011-06-20 18:49 UTC (permalink / raw)
  To: Jon Mason; +Cc: David Miller, netdev, Andrew Gallatin
In-Reply-To: <20110620184635.GA18577@myri.com>

Le lundi 20 juin 2011 à 13:46 -0500, Jon Mason a écrit :
> There is a potential issue of internal 32bit stats wrapping before
> updating the external 64bit stats, thus leading to incorrect stats.
> Since there is no hardware necessity for these counters to be 32bit,
> change them to be 64bit to avoid this issue.
> 
> Patch suggested by Eric Dumazet.
> 
> Signed-off-by: Jon Mason <mason@myri.com>
> ---
>  drivers/net/myri10ge/myri10ge.c |   12 ++++++------
>  1 files changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c
> index 04e10f4..2a51de1 100644
> --- a/drivers/net/myri10ge/myri10ge.c
> +++ b/drivers/net/myri10ge/myri10ge.c
> @@ -170,12 +170,12 @@ struct myri10ge_rx_done {
>  };
>  
>  struct myri10ge_slice_netstats {
> -	unsigned long rx_packets;
> -	unsigned long tx_packets;
> -	unsigned long rx_bytes;
> -	unsigned long tx_bytes;
> -	unsigned long rx_dropped;
> -	unsigned long tx_dropped;
> +	u64 rx_packets;
> +	u64 tx_packets;
> +	u64 rx_bytes;
> +	u64 tx_bytes;
> +	u64 rx_dropped;
> +	u64 tx_dropped;
>  };
>  
>  struct myri10ge_slice_state {

Hmm, no sorry, this wont work on 32bit arches :(

Please take a look at various patches addressing this problem.




^ permalink raw reply

* [PATCH] myri10ge: 64bit slice stats
From: Jon Mason @ 2011-06-20 18:46 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Andrew Gallatin, Eric Dumazet

There is a potential issue of internal 32bit stats wrapping before
updating the external 64bit stats, thus leading to incorrect stats.
Since there is no hardware necessity for these counters to be 32bit,
change them to be 64bit to avoid this issue.

Patch suggested by Eric Dumazet.

Signed-off-by: Jon Mason <mason@myri.com>
---
 drivers/net/myri10ge/myri10ge.c |   12 ++++++------
 1 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c
index 04e10f4..2a51de1 100644
--- a/drivers/net/myri10ge/myri10ge.c
+++ b/drivers/net/myri10ge/myri10ge.c
@@ -170,12 +170,12 @@ struct myri10ge_rx_done {
 };
 
 struct myri10ge_slice_netstats {
-	unsigned long rx_packets;
-	unsigned long tx_packets;
-	unsigned long rx_bytes;
-	unsigned long tx_bytes;
-	unsigned long rx_dropped;
-	unsigned long tx_dropped;
+	u64 rx_packets;
+	u64 tx_packets;
+	u64 rx_bytes;
+	u64 tx_bytes;
+	u64 rx_dropped;
+	u64 tx_dropped;
 };
 
 struct myri10ge_slice_state {
-- 
1.7.5.4


^ permalink raw reply related

* Re: [PATCH v2 3/5] e100:  Support receiving errored frames.
From: Ben Greear @ 2011-06-20 18:06 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev
In-Reply-To: <1308592672.2701.146.camel@bwh-desktop>

On 06/20/2011 10:57 AM, Ben Hutchings wrote:
> On Sat, 2011-06-18 at 13:47 -0700, greearb@candelatech.com wrote:
>> From: Ben Greear<greearb@candelatech.com>
>>
>> This can be helpful when sniffing dodgy networks.
>>
>> Signed-off-by: Ben Greear<greearb@candelatech.com>
>> ---
>> :100644 100644 647d8c6... aad303d... M	drivers/net/e100.c
>>   drivers/net/e100.c |   42 ++++++++++++++++++++++++++++++++++++++++--
>>   1 files changed, 40 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/net/e100.c b/drivers/net/e100.c
>> index 647d8c6..aad303d 100644
>> --- a/drivers/net/e100.c
>> +++ b/drivers/net/e100.c
>> @@ -588,6 +588,7 @@ struct nic {
>>   		wol_magic          = (1<<  3),
>>   		ich_10h_workaround = (1<<  4),
>>   		save_rxfcs         = (1<<  5),
>> +		save_rxerr         = (1<<  6),
>>   	} flags					____cacheline_aligned;
>>
>>   	enum mac mac;
>> @@ -1126,9 +1127,13 @@ static void e100_configure(struct nic *nic, struct cb *cb, struct sk_buff *skb)
>>   		config->full_duplex_force = 0x1;	/* 1=force, 0=auto */
>>
>>   	if (nic->flags&  promiscuous || nic->loopback) {
>> +		config->promiscuous_mode = 0x1;		/* 1=on, 0=off */
>> +	}
>> +
>> +	if (nic->flags&  save_rxerr) {
>> +		config->rx_discard_overruns = 0x1;	/* 1=save, 0=discard */
>>   		config->rx_save_bad_frames = 0x1;	/* 1=save, 0=discard */
>>   		config->rx_discard_short_frames = 0x0;	/* 1=discard, 0=save */
>
> Any idea why these were previously set in promiscuous or loopback mode?

No, but it appears they would have been thrown away anyway, so I
think this change is safe.

>> -		config->promiscuous_mode = 0x1;		/* 1=on, 0=off */
>>   	}
>>
>>   	if (nic->flags&  save_rxfcs)
>> @@ -1983,7 +1988,18 @@ static int e100_rx_indicate(struct nic *nic, struct rx *rx,
>>   	skb_put(skb, actual_size);
>>   	skb->protocol = eth_type_trans(skb, nic->netdev);
>>
>> -	if (unlikely(!(rfd_status&  cb_ok))) {
>> +	if (unlikely(nic->flags&  save_rxerr)) {
>> +		if (!(rfd_status&  cb_ok)) {
>> +			skb->pkt_type = PACKET_INVALID;
>> +		} else if (actual_size>
>> +			   ETH_DATA_LEN + VLAN_ETH_HLEN + rxfcs_pad) {
>> +			nic->rx_over_length_errors++;
>> +			skb->pkt_type = PACKET_INVALID;
>> +		}
>> +		goto process_skb;
>> +	}
>> +
>> +	if (unlikely((nic->flags&  save_rxerr)&&  !(rfd_status&  cb_ok))) {
> [...]
>
> You're adding an if-statement to cover the save_rxerr case, and the
> existing if-else-statement should cover the !save_rxerr case - so you
> are missing a '!'.  But since the new if-statement's body ends with a
> goto, there should be no need to change the condition for the existing
> if-else statement at all.

Right..I will fix that.  I forgot to back that last chunk out
of some previous attempts at that code.

I'm going to have to re-spin this entire series after somehow
dealing with the FEATURES extension, so it will likely be a while.

Thanks,
Ben


-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com


^ permalink raw reply

* Re: [PATCH net-next-2.6] myricom: remove stats_lock
From: Jon Mason @ 2011-06-20 18:00 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, netdev, Andrew Gallatin, Brice Goglin,
	Stephen Hemminger
In-Reply-To: <1308550066.3539.129.camel@edumazet-laptop>

On Mon, Jun 20, 2011 at 1:07 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> stats_lock is non useless, lets remove it.
>
> Also, ndo_get_stats64() doesnt have to clear the stats, caller takes
> care of this.
>
> Note: folding 32bit fields in 64bit one is problematic when one of 32bit
> values wraps : SNMP reader will see a ~2^32 back change in a 64bit
> value. A future patch should fix this.

Good point, I'll push a patch shortly to increase the size of the
slice stats to 64b.

>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> CC: Stephen Hemminger <shemminger@vyatta.com>
> CC: Andrew Gallatin <gallatin@myri.com>
> CC: Brice Goglin <brice@myri.com>

Acked-by: Jon Mason <mason@myri.com>

> ---
>  drivers/net/myri10ge/myri10ge.c |   10 +++-------
>  1 file changed, 3 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c
> index 04e10f4..3e89a84 100644
> --- a/drivers/net/myri10ge/myri10ge.c
> +++ b/drivers/net/myri10ge/myri10ge.c
> @@ -210,7 +210,6 @@ struct myri10ge_priv {
>        int big_bytes;
>        int max_intr_slots;
>        struct net_device *dev;
> -       spinlock_t stats_lock;
>        u8 __iomem *sram;
>        int sram_size;
>        unsigned long board_span;
> @@ -1837,6 +1836,7 @@ myri10ge_get_ethtool_stats(struct net_device *netdev,
>        int i;
>
>        /* force stats update */
> +       memset(&link_stats, 0, sizeof(link_stats));
>        (void)myri10ge_get_stats(netdev, &link_stats);
>        for (i = 0; i < MYRI10GE_NET_STATS_LEN; i++)
>                data[i] = ((u64 *)&link_stats)[i];
> @@ -2981,12 +2981,10 @@ drop:
>  static struct rtnl_link_stats64 *myri10ge_get_stats(struct net_device *dev,
>                                                    struct rtnl_link_stats64 *stats)
>  {
> -       struct myri10ge_priv *mgp = netdev_priv(dev);
> -       struct myri10ge_slice_netstats *slice_stats;
> +       const struct myri10ge_priv *mgp = netdev_priv(dev);
> +       const struct myri10ge_slice_netstats *slice_stats;
>        int i;
>
> -       spin_lock(&mgp->stats_lock);
> -       memset(stats, 0, sizeof(*stats));
>        for (i = 0; i < mgp->num_slices; i++) {
>                slice_stats = &mgp->ss[i].stats;
>                stats->rx_packets += slice_stats->rx_packets;
> @@ -2996,7 +2994,6 @@ static struct rtnl_link_stats64 *myri10ge_get_stats(struct net_device *dev,
>                stats->rx_dropped += slice_stats->rx_dropped;
>                stats->tx_dropped += slice_stats->tx_dropped;
>        }
> -       spin_unlock(&mgp->stats_lock);
>        return stats;
>  }
>
> @@ -3966,7 +3963,6 @@ static int myri10ge_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
>        setup_timer(&mgp->watchdog_timer, myri10ge_watchdog_timer,
>                    (unsigned long)mgp);
>
> -       spin_lock_init(&mgp->stats_lock);
>        SET_ETHTOOL_OPS(netdev, &myri10ge_ethtool_ops);
>        INIT_WORK(&mgp->watchdog_work, myri10ge_watchdog);
>        status = register_netdev(netdev);
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply

* Re: [PATCH v2 3/5] e100:  Support receiving errored frames.
From: Ben Hutchings @ 2011-06-20 17:57 UTC (permalink / raw)
  To: greearb; +Cc: netdev
In-Reply-To: <1308430045-24816-4-git-send-email-greearb@candelatech.com>

On Sat, 2011-06-18 at 13:47 -0700, greearb@candelatech.com wrote:
> From: Ben Greear <greearb@candelatech.com>
> 
> This can be helpful when sniffing dodgy networks.
> 
> Signed-off-by: Ben Greear <greearb@candelatech.com>
> ---
> :100644 100644 647d8c6... aad303d... M	drivers/net/e100.c
>  drivers/net/e100.c |   42 ++++++++++++++++++++++++++++++++++++++++--
>  1 files changed, 40 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/e100.c b/drivers/net/e100.c
> index 647d8c6..aad303d 100644
> --- a/drivers/net/e100.c
> +++ b/drivers/net/e100.c
> @@ -588,6 +588,7 @@ struct nic {
>  		wol_magic          = (1 << 3),
>  		ich_10h_workaround = (1 << 4),
>  		save_rxfcs         = (1 << 5),
> +		save_rxerr         = (1 << 6),
>  	} flags					____cacheline_aligned;
>  
>  	enum mac mac;
> @@ -1126,9 +1127,13 @@ static void e100_configure(struct nic *nic, struct cb *cb, struct sk_buff *skb)
>  		config->full_duplex_force = 0x1;	/* 1=force, 0=auto */
>  
>  	if (nic->flags & promiscuous || nic->loopback) {
> +		config->promiscuous_mode = 0x1;		/* 1=on, 0=off */
> +	}
> +
> +	if (nic->flags & save_rxerr) {
> +		config->rx_discard_overruns = 0x1;	/* 1=save, 0=discard */
>  		config->rx_save_bad_frames = 0x1;	/* 1=save, 0=discard */
>  		config->rx_discard_short_frames = 0x0;	/* 1=discard, 0=save */

Any idea why these were previously set in promiscuous or loopback mode?

> -		config->promiscuous_mode = 0x1;		/* 1=on, 0=off */
>  	}
>  
>  	if (nic->flags & save_rxfcs)
> @@ -1983,7 +1988,18 @@ static int e100_rx_indicate(struct nic *nic, struct rx *rx,
>  	skb_put(skb, actual_size);
>  	skb->protocol = eth_type_trans(skb, nic->netdev);
>  
> -	if (unlikely(!(rfd_status & cb_ok))) {
> +	if (unlikely(nic->flags & save_rxerr)) {
> +		if (!(rfd_status & cb_ok)) {
> +			skb->pkt_type = PACKET_INVALID;
> +		} else if (actual_size >
> +			   ETH_DATA_LEN + VLAN_ETH_HLEN + rxfcs_pad) {
> +			nic->rx_over_length_errors++;
> +			skb->pkt_type = PACKET_INVALID;
> +		}
> +		goto process_skb;
> +	}
> +
> +	if (unlikely((nic->flags & save_rxerr) && !(rfd_status & cb_ok))) {
[...]

You're adding an if-statement to cover the save_rxerr case, and the
existing if-else-statement should cover the !save_rxerr case - so you
are missing a '!'.  But since the new if-statement's body ends with a
goto, there should be no need to change the condition for the existing
if-else statement at all.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: [PATCH 2/2] mv643xx_eth: fix race in trasmit path.
From: Eric Dumazet @ 2011-06-20 17:21 UTC (permalink / raw)
  To: Lennert Buytenhek; +Cc: Richard Cochran, netdev, David Miller, stable
In-Reply-To: <1308590350.2658.6.camel@edumazet-laptop>

Le lundi 20 juin 2011 à 19:19 +0200, Eric Dumazet a écrit :
> Le lundi 20 juin 2011 à 18:33 +0200, Lennert Buytenhek a écrit :
> > On Mon, Jun 20, 2011 at 09:48:07AM +0200, Richard Cochran wrote:
> > 
> > > Because the socket buffer is freed in the completion interrupt, it
> > > is not safe to access it after submitting it to the hardware.
> > 
> > Maybe I'm missing something here, but mv643xx_eth TX reclaim is done
> > from NAPI poll, under __netif_tx_lock(), while mv643xx_eth_xmit() also
> > runs under __netif_tx_lock().
> 
> See my previous answer. Its true this driver _currently_ holds tx queue
> lock in its TX completion. But that might/should change.
> 
> Goal is to make tx completion not use tx queue lock in fast path, like
> its done in tg3, bnx2, bnx2x ... and other recent drivers.
> 
> Its obviously correct to move skb->len access in start_xmit() before
> starting the IO, even if not a bug fix, it makes all drivers behave the
> same : When reviewing them, its easier not to worry about these possible
> use after free.
> 
> 

One random example found in drivers/staging/hv/netvsc_drv.c

Is the "net->stats.tx_bytes += skb->len;" found in line 188 is correct ?

Who knows ?

I believe its not correct ;)




^ permalink raw reply

* Re: [PATCH 2/2] mv643xx_eth: fix race in trasmit path.
From: Eric Dumazet @ 2011-06-20 17:19 UTC (permalink / raw)
  To: Lennert Buytenhek; +Cc: Richard Cochran, netdev, David Miller, stable
In-Reply-To: <20110620163316.GC994@wantstofly.org>

Le lundi 20 juin 2011 à 18:33 +0200, Lennert Buytenhek a écrit :
> On Mon, Jun 20, 2011 at 09:48:07AM +0200, Richard Cochran wrote:
> 
> > Because the socket buffer is freed in the completion interrupt, it
> > is not safe to access it after submitting it to the hardware.
> 
> Maybe I'm missing something here, but mv643xx_eth TX reclaim is done
> from NAPI poll, under __netif_tx_lock(), while mv643xx_eth_xmit() also
> runs under __netif_tx_lock().

See my previous answer. Its true this driver _currently_ holds tx queue
lock in its TX completion. But that might/should change.

Goal is to make tx completion not use tx queue lock in fast path, like
its done in tg3, bnx2, bnx2x ... and other recent drivers.

Its obviously correct to move skb->len access in start_xmit() before
starting the IO, even if not a bug fix, it makes all drivers behave the
same : When reviewing them, its easier not to worry about these possible
use after free.




^ permalink raw reply

* Re: [PATCH V3 10/11] mv643xx_eth: enable transmit time stamping.
From: Lennert Buytenhek @ 2011-06-20 16:35 UTC (permalink / raw)
  To: Richard Cochran; +Cc: netdev, David Miller, Eric Dumazet
In-Reply-To: <96e6a2899b55ac1272cee8ce63e2549e24a10386.1308556146.git.richard.cochran@omicron.at>

On Mon, Jun 20, 2011 at 09:51:32AM +0200, Richard Cochran wrote:

> This patch enables software (and phy device) transmit time stamping.
> Compile tested only.
> 
> Cc: Lennert Buytenhek <buytenh@wantstofly.org>
> Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
> ---
>  drivers/net/mv643xx_eth.c |    2 ++
>  1 files changed, 2 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c
> index 1b7d2c1..3671714 100644
> --- a/drivers/net/mv643xx_eth.c
> +++ b/drivers/net/mv643xx_eth.c
> @@ -847,6 +847,8 @@ no_csum:
>  	/* clear TX_END status */
>  	mp->work_tx_end &= ~(1 << txq->index);
>  
> +	skb_tx_timestamp(skb);
> +
>  	/* ensure all descriptors are written before poking hardware */
>  	wmb();
>  	txq_enable(txq);

This only timestamps the skb _after_ writing the final (cmd_sts) HW TX
descriptor word, and so if your previous patch is correct (which I don't
think it is), this would be buggy as well.

^ permalink raw reply

* Re: [PATCH 2/2] mv643xx_eth: fix race in trasmit path.
From: Lennert Buytenhek @ 2011-06-20 16:33 UTC (permalink / raw)
  To: Richard Cochran; +Cc: netdev, David Miller, Eric Dumazet, stable
In-Reply-To: <25a379578b71bf01f3c77ac76a193d26554f9e0c.1308555865.git.richard.cochran@omicron.at>

On Mon, Jun 20, 2011 at 09:48:07AM +0200, Richard Cochran wrote:

> Because the socket buffer is freed in the completion interrupt, it
> is not safe to access it after submitting it to the hardware.

Maybe I'm missing something here, but mv643xx_eth TX reclaim is done
from NAPI poll, under __netif_tx_lock(), while mv643xx_eth_xmit() also
runs under __netif_tx_lock().

^ permalink raw reply

* Re: [PATCH 10/14] SIWv2: Transmit path: siw_qp_tx.c
From: Bart Van Assche @ 2011-06-20 16:32 UTC (permalink / raw)
  To: Bernard Metzler
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <OF564FAAA3.FC8A4AB9-ONC12578B5.003C1E38-C12578B5.0044F0C1-Xeyd2O9EBijQT0dZR+AlfA@public.gmane.org>

On Mon, Jun 20, 2011 at 2:33 PM, Bernard Metzler <BMT-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org> wrote:
> bart.vanassche-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org wrote on 06/18/2011 07:47:29 PM:
> > On Thu, Jun 16, 2011 at 2:42 PM, Bernard Metzler <bmt-OA+xvbQnYDG0NRUx5Wjc1Q@public.gmane.orgm>
> wrote:
> > > +               if (tx_type == RDMAP_RDMA_READ_REQ) {
> >
> > This is what the compiler says about the above statement:
> >
> > drivers/infiniband/hw/siw/siw_qp_tx.c:1145:15: warning: comparison
> > between enum siw_wr_opcode and enum rdma_opcode
> >
> > So how can that statement be correct ?
> >
> it will not - must be changed to 'SIW_WR_RDMA_READ_REQ'...many thanks!
> I obviously used sparse in a wrong way:
> building within kernel tree with 'make C=1 CF="-Wsparse-all' does not give
> me the indicated warning (neither does setting CF="-Wenum-mismatch").

That warning was not reported by sparse but by gcc 4.5.

Bart.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 2/7] net: dcbnl, add multicast group for DCB
From: John Fastabend @ 2011-06-20 16:16 UTC (permalink / raw)
  To: Shmulik Ravid; +Cc: davem@davemloft.net, netdev@vger.kernel.org
In-Reply-To: <1308507279.15528.14.camel@lb-tlvb-shmulik.il.broadcom.com>

On 6/19/2011 11:14 AM, Shmulik Ravid wrote:
> 
> On Fri, 2011-06-17 at 14:16 -0700, John Fastabend wrote:
>> Now that dcbnl is being used in many cases by more
>> than a single agent it is beneficial to be notified
>> when some entity either driver or user space has
>> changed the DCB attributes.
>>
>> Today applications either end up polling the interface
>> or relying on a user space database to maintain the DCB
>> state and post events. Polling is a poor solution for
>> obvious reasons. And relying on a user space database
>> has its own downside. Namely it has created strange
>> boot dependencies requiring the database be populated
>> before any applications dependent on DCB attributes
>> starts or the application goes into a polling loop.
>> Populating the database requires negotiating link
>> setting with the peer and can take anywhere from less
>> than a second up to a few seconds depending on the switch
>> implementation.
>>
>> Perhaps more importantly if another application or an
>> embedded agent sets a DCB link attribute the database
>> has no way of knowing other than polling the kernel.
>> This prevents applications from responding quickly to
>> changes in link events which at least in the FCoE case
>> and probably any other protocols expecting a lossless
>> link may result in IO errors.
>>
>> By adding a multicast group for DCB we have clean way
>> to disseminate kernel DCB link attributes up to user
>> space. Avoiding the need for user space to maintain
>> a coherant database and disperse events that potentially
>> do not reflect the current link state.
>>
>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>> ---
>>
>>  include/linux/rtnetlink.h |    2 
>>  net/dcb/dcbnl.c           |  228 +++++++++++++++++++++++++++++----------------
>>  2 files changed, 147 insertions(+), 83 deletions(-)
>>
> [...]
>> +static int dcbnl_notify(struct net_device *dev, int event, int cmd,
>> +			u32 seq, u32 pid)
> [...]
> A driver supporting an embedded dcbx stack would want to directly invoke
> the notification routine, therefore it should be exported. I'd like also

We can export this function when we add support to the driver.

> to add support for a CEE notification. Also when the driver invokes the
> notification shouldn't it use a new event type something like
> RTM_NEWDCB?
>  

I'm not sure a RTM_NEWDCB event is needed. How would user space be expected
to handle the two events? FCoE needs to know if the link is lossless or not
it shouldn't care if a firmware agent is running or a user space LLDP agent.
Other applications (multipathd, iscsid) probably just want to know the current
DCB attributes.

Maybe I missed your point could you provide more details?

Thanks,
John

^ permalink raw reply

* Re: [PATCH 0/7] Series short description
From: John Fastabend @ 2011-06-20 16:05 UTC (permalink / raw)
  To: Shmulik Ravid; +Cc: davem@davemloft.net, netdev@vger.kernel.org
In-Reply-To: <1308506310.15528.6.camel@lb-tlvb-shmulik.il.broadcom.com>

On 6/19/2011 10:58 AM, Shmulik Ravid wrote:
> 
> On Fri, 2011-06-17 at 14:16 -0700, John Fastabend wrote:
>> The following series implements improvements to the DCB netlink
>> interface. This work was done to improve Application handling
>> and support firmware based LLDP agents.
>>
>> This adds a multicast group for DCB currently user space has
>> trouble keeping in sync across multiple applications and with
>> firmware agents making DCB attribute changes in the driver.
>>
> 
> I was working on something along the same lines and would like to add on
> top of these changes a notification for the CEE flavor.
>  

Yes please do. This would be helpful I just haven't got to it
yet. Anyways we will need something like this to support CEE
based firmware implementations.

Thanks,
John

^ permalink raw reply

* Re: [PATCH 2/2] proc: Usable inode numbers for the namespace file descriptors.
From: Serge E. Hallyn @ 2011-06-20 16:06 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Linux Containers, Alexey Dobriyan, netdev, David Lamparter,
	linux-kernel, Serge E. Hallyn
In-Reply-To: <m1mxhgav9c.fsf_-_@fess.ebiederm.org>

Quoting Eric W. Biederman (ebiederm@xmission.com):
> 
> Assign a unique proc inode to each namespace, yielding an
> identifier that userspace can use for identifying a namespace.
> 
> This has been a long requested feature and only blocked because
> a naive implementation would put the id in a global space and
> would ultimately require having a namespace for the names of
> namespaces, making migration and certain virtualization tricks
> impossible.
> 
> We still don't have per superblock inode numbers for proc, which
> appears necessary for application unaware checkpoint/restart and
> migrations (if the application is using namespace filedescriptors)
> but that is now allowd by the design if it becomes important.
> 
> I have preallocated the ipc and uts initial proc inode numbers so
> their structures can be statically initialized.
> 
> Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>

I've not looked at the setns patches enough, but from what I can see
here it looks good.

Acked-by: Serge Hallyn <serge@hallyn.com>

thanks,
-serge

> ---
>  fs/proc/namespaces.c          |    1 +
>  include/linux/ipc_namespace.h |    2 ++
>  include/linux/proc_fs.h       |    4 ++++
>  include/linux/utsname.h       |    1 +
>  include/net/net_namespace.h   |    2 ++
>  init/version.c                |    2 ++
>  ipc/msgutil.c                 |    2 ++
>  ipc/namespace.c               |   16 ++++++++++++++++
>  kernel/utsname.c              |   17 ++++++++++++++++-
>  net/core/net_namespace.c      |   24 ++++++++++++++++++++++++
>  10 files changed, 70 insertions(+), 1 deletions(-)
> 
> diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
> index be177f7..ddc2bb4 100644
> --- a/fs/proc/namespaces.c
> +++ b/fs/proc/namespaces.c
> @@ -54,6 +54,7 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
>  	ei->ns_ops    = ns_ops;
>  	ei->ns	      = ns;
>  
> +	inode->i_ino = ns_ops->inum(ei->ns);
>  	dentry->d_op = &pid_dentry_operations;
>  	d_add(dentry, inode);
>  	/* Close the race of the process dying before we return the dentry */
> diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
> index a6d1655..22a4dc4 100644
> --- a/include/linux/ipc_namespace.h
> +++ b/include/linux/ipc_namespace.h
> @@ -60,6 +60,8 @@ struct ipc_namespace {
>  
>  	/* user_ns which owns the ipc ns */
>  	struct user_namespace *user_ns;
> +
> +	unsigned int	proc_inum;
>  };
>  
>  extern struct ipc_namespace init_ipc_ns;
> diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
> index 3067b44..1aee7f0 100644
> --- a/include/linux/proc_fs.h
> +++ b/include/linux/proc_fs.h
> @@ -29,8 +29,11 @@ struct mm_struct;
>  
>  enum {
>  	PROC_ROOT_INO = 1,
> +	PROC_IPC_INIT_INO = 2,
> +	PROC_UTS_INIT_INO = 3,
>  };
>  
> +
>  /*
>   * This is not completely implemented yet. The idea is to
>   * create an in-memory tree (like the actual /proc filesystem
> @@ -257,6 +260,7 @@ struct proc_ns_operations {
>  	void *(*get)(struct task_struct *task);
>  	void (*put)(void *ns);
>  	int (*install)(struct nsproxy *nsproxy, void *ns);
> +	unsigned int (*inum)(void *ns);
>  };
>  extern const struct proc_ns_operations netns_operations;
>  extern const struct proc_ns_operations utsns_operations;
> diff --git a/include/linux/utsname.h b/include/linux/utsname.h
> index 4e5b021..03db764 100644
> --- a/include/linux/utsname.h
> +++ b/include/linux/utsname.h
> @@ -44,6 +44,7 @@ struct uts_namespace {
>  	struct kref kref;
>  	struct new_utsname name;
>  	struct user_namespace *user_ns;
> +	unsigned int proc_inum;
>  };
>  extern struct uts_namespace init_uts_ns;
>  
> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> index 2bf9ed9..4b85be2 100644
> --- a/include/net/net_namespace.h
> +++ b/include/net/net_namespace.h
> @@ -49,6 +49,8 @@ struct net {
>  	struct list_head	cleanup_list;	/* namespaces on death row */
>  	struct list_head	exit_list;	/* Use only net_mutex */
>  
> +	unsigned int		proc_inum;
> +
>  	struct proc_dir_entry 	*proc_net;
>  	struct proc_dir_entry 	*proc_net_stat;
>  
> diff --git a/init/version.c b/init/version.c
> index 86fe0cc..58170f1 100644
> --- a/init/version.c
> +++ b/init/version.c
> @@ -12,6 +12,7 @@
>  #include <linux/utsname.h>
>  #include <generated/utsrelease.h>
>  #include <linux/version.h>
> +#include <linux/proc_fs.h>
>  
>  #ifndef CONFIG_KALLSYMS
>  #define version(a) Version_ ## a
> @@ -34,6 +35,7 @@ struct uts_namespace init_uts_ns = {
>  		.domainname	= UTS_DOMAINNAME,
>  	},
>  	.user_ns = &init_user_ns,
> +	.proc_inum = PROC_UTS_INIT_INO,
>  };
>  EXPORT_SYMBOL_GPL(init_uts_ns);
>  
> diff --git a/ipc/msgutil.c b/ipc/msgutil.c
> index 8b5ce5d..f7da485 100644
> --- a/ipc/msgutil.c
> +++ b/ipc/msgutil.c
> @@ -14,6 +14,7 @@
>  #include <linux/slab.h>
>  #include <linux/ipc.h>
>  #include <linux/ipc_namespace.h>
> +#include <linux/proc_fs.h>
>  #include <asm/uaccess.h>
>  
>  #include "util.h"
> @@ -33,6 +34,7 @@ struct ipc_namespace init_ipc_ns = {
>  	.mq_msgsize_max  = DFLT_MSGSIZEMAX,
>  #endif
>  	.user_ns = &init_user_ns,
> +	.proc_inum = PROC_IPC_INIT_INO,
>  };
>  
>  atomic_t nr_ipc_ns = ATOMIC_INIT(1);
> diff --git a/ipc/namespace.c b/ipc/namespace.c
> index ce0a647..cd7f733 100644
> --- a/ipc/namespace.c
> +++ b/ipc/namespace.c
> @@ -26,9 +26,16 @@ static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk,
>  	if (ns == NULL)
>  		return ERR_PTR(-ENOMEM);
>  
> +	err = proc_alloc_inum(&ns->proc_inum);
> +	if (err) {
> +		kfree(ns);
> +		return ERR_PTR(err);
> +	}
> +
>  	atomic_set(&ns->count, 1);
>  	err = mq_init_ns(ns);
>  	if (err) {
> +		proc_free_inum(ns->proc_inum);
>  		kfree(ns);
>  		return ERR_PTR(err);
>  	}
> @@ -113,6 +120,7 @@ static void free_ipc_ns(struct ipc_namespace *ns)
>  	 */
>  	ipcns_notify(IPCNS_REMOVED);
>  	put_user_ns(ns->user_ns);
> +	proc_free_inum(ns->proc_inum);
>  	kfree(ns);
>  }
>  
> @@ -170,10 +178,18 @@ static int ipcns_install(struct nsproxy *nsproxy, void *ns)
>  	return 0;
>  }
>  
> +static unsigned int ipcns_inum(void *vp)
> +{
> +	struct ipc_namespace *ns = vp;
> +
> +	return ns->proc_inum;
> +}
> +
>  const struct proc_ns_operations ipcns_operations = {
>  	.name		= "ipc",
>  	.type		= CLONE_NEWIPC,
>  	.get		= ipcns_get,
>  	.put		= ipcns_put,
>  	.install	= ipcns_install,
> +	.inum		= ipcns_inum,
>  };
> diff --git a/kernel/utsname.c b/kernel/utsname.c
> index bff131b..3ab6a08 100644
> --- a/kernel/utsname.c
> +++ b/kernel/utsname.c
> @@ -36,11 +36,18 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
>  					  struct uts_namespace *old_ns)
>  {
>  	struct uts_namespace *ns;
> +	int err;
>  
>  	ns = create_uts_ns();
>  	if (!ns)
>  		return ERR_PTR(-ENOMEM);
>  
> +	err = proc_alloc_inum(&ns->proc_inum);
> +	if (err) {
> +		kfree(ns);
> +		return ERR_PTR(err);
> +	}
> +
>  	down_read(&uts_sem);
>  	memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
>  	ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns);
> @@ -78,6 +85,7 @@ void free_uts_ns(struct kref *kref)
>  
>  	ns = container_of(kref, struct uts_namespace, kref);
>  	put_user_ns(ns->user_ns);
> +	proc_free_inum(ns->proc_inum);
>  	kfree(ns);
>  }
>  
> @@ -110,11 +118,18 @@ static int utsns_install(struct nsproxy *nsproxy, void *ns)
>  	return 0;
>  }
>  
> +static unsigned int utsns_inum(void *vp)
> +{
> +	struct uts_namespace *ns = vp;
> +
> +	return ns->proc_inum;
> +}
> +
>  const struct proc_ns_operations utsns_operations = {
>  	.name		= "uts",
>  	.type		= CLONE_NEWUTS,
>  	.get		= utsns_get,
>  	.put		= utsns_put,
>  	.install	= utsns_install,
> +	.inum		= utsns_inum,
>  };
> -
> diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
> index e41e511..6199ec2 100644
> --- a/net/core/net_namespace.c
> +++ b/net/core/net_namespace.c
> @@ -358,6 +358,21 @@ struct net *get_net_ns_by_pid(pid_t pid)
>  }
>  EXPORT_SYMBOL_GPL(get_net_ns_by_pid);
>  
> +static __net_init int net_ns_net_init(struct net *net)
> +{
> +	return proc_alloc_inum(&net->proc_inum);
> +}
> +
> +static __net_exit void net_ns_net_exit(struct net *net)
> +{
> +	proc_free_inum(net->proc_inum);
> +}
> +
> +static struct pernet_operations __net_initdata net_ns_ops = {
> +	.init = net_ns_net_init,
> +	.exit = net_ns_net_exit,		
> +};
> +
>  static int __init net_ns_init(void)
>  {
>  	struct net_generic *ng;
> @@ -389,6 +404,8 @@ static int __init net_ns_init(void)
>  
>  	mutex_unlock(&net_mutex);
>  
> +	register_pernet_subsys(&net_ns_ops);
> +
>  	return 0;
>  }
>  
> @@ -616,11 +633,18 @@ static int netns_install(struct nsproxy *nsproxy, void *ns)
>  	return 0;
>  }
>  
> +static unsigned int netns_inum(void *ns)
> +{
> +	struct net *net = ns;
> +	return net->proc_inum;
> +}
> +
>  const struct proc_ns_operations netns_operations = {
>  	.name		= "net",
>  	.type		= CLONE_NEWNET,
>  	.get		= netns_get,
>  	.put		= netns_put,
>  	.install	= netns_install,
> +	.inum		= netns_inum,
>  };
>  #endif
> -- 
> 1.7.5.1.217.g4e3aa
> 

^ permalink raw reply

* Re: [PATCH net-next] ip: introduce ip_is_fragment helper inline function
From: Flavio Leitner @ 2011-06-20 15:35 UTC (permalink / raw)
  To: Paul Gortmaker; +Cc: davem, netdev
In-Reply-To: <1308334091-10031-1-git-send-email-paul.gortmaker@windriver.com>

On 06/17/2011 03:08 PM, Paul Gortmaker wrote:
> There are enough instances of this:
> 
>     iph->frag_off & htons(IP_MF | IP_OFFSET)
> 
> that a helper function is probably warranted.
> 
> Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
> ---
[...]
> diff --git a/include/net/ip.h b/include/net/ip.h
> index e9ea7c7..114a152 100644
> --- a/include/net/ip.h
> +++ b/include/net/ip.h
> @@ -250,6 +250,11 @@ int ip_decrease_ttl(struct iphdr *iph)
>  	return --iph->ttl;
>  }
>  
> +static inline int ip_is_fragment(const struct iphdr *iph)
> +{
> +	return iph->frag_off & htons(IP_MF | IP_OFFSET);
> +}
> +

Do we really need 'inline' there?

fbl

^ permalink raw reply

* Re: [RFC] Moving files around in drivers/net
From: Jon Mason @ 2011-06-20 15:23 UTC (permalink / raw)
  To: jeffrey.t.kirsher
  Cc: Joe Perches, netdev, David Miller, Paul Gortmaker, Jan Engelhardt,
	Andrew Gallatin
In-Reply-To: <1308535174.22851.26.camel@jtkirshe-mobl>

[snip]
>> > commit 83df87d655e25119eafaeb4225bc5eedaa82c6e0
>> > Author: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
>> > Date:   Fri May 13 02:24:46 2011 -0700
>> >
>> >    myri*: Move the Myricom drivers
>> >
>> >    Move the Myricom drivers into drivers/net/ethernet/myricom/ and
>> make
>> >    the necessary Kconfig and Makefile changes.
>> >
>> >    CC: Andrew Gallatin <gallatin@myri.com>
>> >    CC: Brice Goglin <brice@myri.com>
>> >    Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
>> >
>> >  MAINTAINERS                                        |    2 +-
>> >  drivers/net/Kconfig                                |   35
>> -------------
>> >  drivers/net/Makefile                               |    3 -
>> >  drivers/net/ethernet/Kconfig                       |    1 +
>> >  drivers/net/ethernet/Makefile                      |    1 +
>> >  drivers/net/ethernet/myricom/Kconfig               |   51
>> ++++++++++++++++++++
>> >  drivers/net/ethernet/myricom/Makefile              |    6 ++
>> >  .../net/{ => ethernet/myricom}/myri10ge/Makefile   |    0
>> >  .../net/{ => ethernet/myricom}/myri10ge/myri10ge.c |    0
>> >  .../{ => ethernet/myricom}/myri10ge/myri10ge_mcp.h |    0
>> >  .../myricom}/myri10ge/myri10ge_mcp_gen_header.h    |    0
>> >  drivers/net/{ => ethernet/myricom}/myri_sbus.c     |    0
>> >  drivers/net/{ => ethernet/myricom}/myri_sbus.h     |    0
>>
>> /me puts on Myricom hat
>>
>> I believe we want to put the driver above in the sbus/legacy
>> directory.  Better yet, move this driver to /dev/null.  There is no
>> possibility of ethernet mode on this adapter, so it's Myrinet only.
>> It won't inter-op with modern versions of Myrinet, and thus can only
>> work with legacy adapters.  There are no drivers for the PCI version
>> of this adapter, so it only can work on ~15year old hardware.  It's
>> long in the tooth, let's take it to the knackers.
>>
>> Thanks,
>> Jon
>
> That sounds good.  Since I am not the maintainer or author of the
> driver, I think such a move should come from Myricom.  I can drop the
> move of myri_sbus.[ch] from my patches to organize the driver/net/
> directory.


I'll submit a patch shortly to remove it.  Assuming that it will be
rejected, I'd recommend putting myri_sbus.* into
 drivers/net/myrinet/
similar to how you have an ethernet directory.

Thanks,
Jon

^ permalink raw reply

* Re: [PATCH 2/2] mv643xx_eth: fix race in trasmit path.
From: Eric Dumazet @ 2011-06-20 16:31 UTC (permalink / raw)
  To: mbizon; +Cc: Richard Cochran, netdev, David Miller, stable, Lennert Buytenhek
In-Reply-To: <1308578878.32446.31.camel@sakura.staff.proxad.net>

Le lundi 20 juin 2011 à 16:07 +0200, Maxime Bizon a écrit :
> On Mon, 2011-06-20 at 09:48 +0200, Richard Cochran wrote:
> 
> Hi Richard,
> 
> > Because the socket buffer is freed in the completion interrupt, it is not
> > safe to access it after submitting it to the hardware.
> 
> I don't see why.
> 
> skb is freed from txq_reclaim() which grabs the tx queue lock before,
> (hence the lockless __skb_queue_xxx() in both functions)
> 
> What am I missing ?

You are right, this driver still takes tx queue lock in its TX
completion path.

txq_reclaim() can currently run for a very long time, blocking other
cpus to make progress if blocked on tx queue lock.

So consider this patch as a cleanup :

Its better to make all drivers behave the same way

1) Either increment tx stats in their start_xmit(), and making sure they
dont access skb->len too late.

2) increment tx stats in their TX completion path.

Then we can work on making TX completion path not taking tx queue lock
in its fast path.



^ permalink raw reply

* Re: [PATCH 2/2] mv643xx_eth: fix race in trasmit path.
From: Maxime Bizon @ 2011-06-20 14:07 UTC (permalink / raw)
  To: Richard Cochran
  Cc: netdev, David Miller, Eric Dumazet, stable, Lennert Buytenhek
In-Reply-To: <25a379578b71bf01f3c77ac76a193d26554f9e0c.1308555865.git.richard.cochran@omicron.at>


On Mon, 2011-06-20 at 09:48 +0200, Richard Cochran wrote:

Hi Richard,

> Because the socket buffer is freed in the completion interrupt, it is not
> safe to access it after submitting it to the hardware.

I don't see why.

skb is freed from txq_reclaim() which grabs the tx queue lock before,
(hence the lockless __skb_queue_xxx() in both functions)

What am I missing ?

> Cc: stable@kernel.org
> Cc: Lennert Buytenhek <buytenh@wantstofly.org>
> Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
> ---
>  drivers/net/mv643xx_eth.c |    6 ++++--
>  1 files changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c
> index a5d9b1c..1b7d2c1 100644
> --- a/drivers/net/mv643xx_eth.c
> +++ b/drivers/net/mv643xx_eth.c
> @@ -859,7 +859,7 @@ no_csum:
>  static netdev_tx_t mv643xx_eth_xmit(struct sk_buff *skb, struct net_device *dev)
>  {
>  	struct mv643xx_eth_private *mp = netdev_priv(dev);
> -	int queue;
> +	int length, queue;
>  	struct tx_queue *txq;
>  	struct netdev_queue *nq;
>  
> @@ -881,10 +881,12 @@ static netdev_tx_t mv643xx_eth_xmit(struct sk_buff *skb, struct net_device *dev)
>  		return NETDEV_TX_OK;
>  	}
>  
> +	length = skb->len;
> +
>  	if (!txq_submit_skb(txq, skb)) {
>  		int entries_left;
>  
> -		txq->tx_bytes += skb->len;
> +		txq->tx_bytes += length;
>  		txq->tx_packets++;
>  
>  		entries_left = txq->tx_ring_size - txq->tx_desc_count;

-- 
Maxime



^ permalink raw reply

* Re: [PATCH 2/2] mv643xx_eth: fix race in trasmit path.
From: Eric Dumazet @ 2011-06-20 15:49 UTC (permalink / raw)
  To: Richard Cochran; +Cc: netdev, David Miller, stable, Lennert Buytenhek
In-Reply-To: <25a379578b71bf01f3c77ac76a193d26554f9e0c.1308555865.git.richard.cochran@omicron.at>

Le lundi 20 juin 2011 à 09:48 +0200, Richard Cochran a écrit :
> Because the socket buffer is freed in the completion interrupt, it is not
> safe to access it after submitting it to the hardware.
> 
> Cc: stable@kernel.org
> Cc: Lennert Buytenhek <buytenh@wantstofly.org>
> Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
> ---
>  drivers/net/mv643xx_eth.c |    6 ++++--
>  1 files changed, 4 insertions(+), 2 deletions(-)
> 

Acked-by: Eric Dumazet <eric.dumazet@gmail.com>




^ permalink raw reply

* Re: [PATCH 1/2] pxa168_eth: fix race in transmit path.
From: Eric Dumazet @ 2011-06-20 15:49 UTC (permalink / raw)
  To: Richard Cochran
  Cc: netdev, David Miller, stable, Sachin Sanap, Zhangfei Gao,
	Philip Rakity
In-Reply-To: <180237f9f567de29068e51b1442629509bb03e43.1308555865.git.richard.cochran@omicron.at>

Le lundi 20 juin 2011 à 09:48 +0200, Richard Cochran a écrit :
> Because the socket buffer is freed in the completion interrupt, it is not
> safe to access it after submitting it to the hardware.
> 
> Cc: stable@kernel.org
> Cc: Sachin Sanap <ssanap@marvell.com>
> Cc: Zhangfei Gao <zgao6@marvell.com>
> Cc: Philip Rakity <prakity@marvell.com>
> Signed-off-by: Richard Cochran <richard.cochran@omicron.at>

Acked-by: Eric Dumazet <eric.dumazet@gmail.com>



^ permalink raw reply

* Re: [net-next 04/17] ixgbevf: provide 64 bit statistics
From: Eric Dumazet @ 2011-06-20 15:46 UTC (permalink / raw)
  To: jeffrey.t.kirsher
  Cc: davem@davemloft.net, Stephen Hemminger, netdev@vger.kernel.org,
	gospo@redhat.com
In-Reply-To: <1308572991.22851.50.camel@jtkirshe-mobl>

Le lundi 20 juin 2011 à 05:29 -0700, Jeff Kirsher a écrit :

> I will drop Stephen's patch from this series and look forward to your
> fixes.  Thanks Eric.
> 
> 

Here is an updated patch :

[PATCH net-next] ixgbevf: provide 64 bit statistics

Compute statistics per ring using 64 bits, and provide
network device stats in 64 bits.

It should make this driver multiqueue operations faster (no more cache
line ping pongs on netdev->stats structure)

Use u64_stats_sync infrastructure so that its safe on 32bit arches as
well.

Based on a prior patch from Stephen Hemminger

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Stephen Hemminger <shemminger@vyatta.com>
---
 drivers/net/ixgbevf/ixgbevf.h      |    8 ++--
 drivers/net/ixgbevf/ixgbevf_main.c |   52 ++++++++++++++++++++++-----
 2 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ixgbevf/ixgbevf.h b/drivers/net/ixgbevf/ixgbevf.h
index b703f60..162cff1 100644
--- a/drivers/net/ixgbevf/ixgbevf.h
+++ b/drivers/net/ixgbevf/ixgbevf.h
@@ -32,6 +32,7 @@
 #include <linux/timer.h>
 #include <linux/io.h>
 #include <linux/netdevice.h>
+#include <linux/u64_stats_sync.h>
 
 #include "vf.h"
 
@@ -69,12 +70,13 @@ struct ixgbevf_ring {
 		struct ixgbevf_rx_buffer *rx_buffer_info;
 	};
 
+	u64			total_bytes;
+	u64			total_packets;
+	struct u64_stats_sync	syncp;
+
 	u16 head;
 	u16 tail;
 
-	unsigned int total_bytes;
-	unsigned int total_packets;
-
 	u16 reg_idx; /* holds the special value that gets the hardware register
 		      * offset associated with this ring, which is different
 		      * for DCB and RSS modes */
diff --git a/drivers/net/ixgbevf/ixgbevf_main.c b/drivers/net/ixgbevf/ixgbevf_main.c
index b2c5ecd..ee13e26 100644
--- a/drivers/net/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ixgbevf/ixgbevf_main.c
@@ -264,11 +264,10 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_adapter *adapter,
 		IXGBE_WRITE_REG(hw, IXGBE_VTEICS, tx_ring->v_idx);
 	}
 
+	u64_stats_update_begin(&tx_ring->syncp);
 	tx_ring->total_bytes += total_bytes;
 	tx_ring->total_packets += total_packets;
-
-	netdev->stats.tx_bytes += total_bytes;
-	netdev->stats.tx_packets += total_packets;
+	u64_stats_update_end(&tx_ring->syncp);
 
 	return count < tx_ring->work_limit;
 }
@@ -595,10 +594,10 @@ next_desc:
 	if (cleaned_count)
 		ixgbevf_alloc_rx_buffers(adapter, rx_ring, cleaned_count);
 
+	u64_stats_update_begin(&rx_ring->syncp);
 	rx_ring->total_packets += total_rx_packets;
 	rx_ring->total_bytes += total_rx_bytes;
-	adapter->netdev->stats.rx_bytes += total_rx_bytes;
-	adapter->netdev->stats.rx_packets += total_rx_packets;
+	u64_stats_update_end(&rx_ring->syncp);
 
 	return cleaned;
 }
@@ -2288,10 +2287,6 @@ void ixgbevf_update_stats(struct ixgbevf_adapter *adapter)
 				adapter->stats.vfgotc);
 	UPDATE_VF_COUNTER_32bit(IXGBE_VFMPRC, adapter->stats.last_vfmprc,
 				adapter->stats.vfmprc);
-
-	/* Fill out the OS statistics structure */
-	adapter->netdev->stats.multicast = adapter->stats.vfmprc -
-		adapter->stats.base_vfmprc;
 }
 
 /**
@@ -3248,11 +3243,50 @@ static void ixgbevf_shutdown(struct pci_dev *pdev)
 	pci_disable_device(pdev);
 }
 
+static struct rtnl_link_stats64 *ixgbevf_get_stats(struct net_device *netdev,
+						   struct rtnl_link_stats64 *stats)
+{
+	struct ixgbevf_adapter *adapter = netdev_priv(netdev);
+	unsigned int start;
+	u64 bytes, packets;
+	const struct ixgbevf_ring *ring;
+	int i;
+
+	ixgbevf_update_stats(adapter);
+
+	stats->multicast = adapter->stats.vfmprc - adapter->stats.base_vfmprc;
+
+	for (i = 0; i < adapter->num_rx_queues; i++) {
+		ring = &adapter->rx_ring[i];
+		do {
+			start = u64_stats_fetch_begin_bh(&ring->syncp);
+			bytes = ring->total_bytes;
+			packets = ring->total_packets;
+		} while (u64_stats_fetch_retry_bh(&ring->syncp, start));
+		stats->rx_bytes += bytes;
+		stats->rx_packets += packets;
+	}
+
+	for (i = 0; i < adapter->num_tx_queues; i++) {
+		ring = &adapter->tx_ring[i];
+		do {
+			start = u64_stats_fetch_begin_bh(&ring->syncp);
+			bytes = ring->total_bytes;
+			packets = ring->total_packets;
+		} while (u64_stats_fetch_retry_bh(&ring->syncp, start));
+		stats->tx_bytes += bytes;
+		stats->tx_packets += packets;
+	}
+
+	return stats;
+}
+
 static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_open		= &ixgbevf_open,
 	.ndo_stop		= &ixgbevf_close,
 	.ndo_start_xmit		= &ixgbevf_xmit_frame,
 	.ndo_set_rx_mode	= &ixgbevf_set_rx_mode,
+	.ndo_get_stats64	= ixgbevf_get_stats,
 	.ndo_set_multicast_list	= &ixgbevf_set_rx_mode,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= &ixgbevf_set_mac,



^ permalink raw reply related

* [PATCH] ethtool v2: NFC corrections
From: Sebastian Pöhn @ 2011-06-20 13:29 UTC (permalink / raw)
  To: Ben Hutchings, Linux Netdev; +Cc: Sebastian Pöhn, Alexander Duyck

The first patch has checked for IP_USER after FLOW_EXT setting! Corrected that ...

This patch:
# Adds an alias for ip4 called l4data pointing to spi (first 4 Layer 4
bytes)
# [TRIVIAL] Corrects the permutation of dst and src for ethernet
# Suggests to always set the ip_ver field of usr_ip to ETH_RX_NFC_IP4 at
least as long there is no opportunity to use others than IPv4 and there
is no frontend option to enter ip_ver.

Signed-off-by: Sebastian Poehn <sebastian.poehn@belden.com>
---

 ethtool.8.in |    5 +++++
 rxclass.c    |   11 ++++++++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/ethtool.8.in b/ethtool.8.in
index 7b1cdf5..0a64d75 100644
--- a/ethtool.8.in
+++ b/ethtool.8.in
@@ -296,6 +296,7 @@ ethtool \- query or control network driver and
hardware settings
 .BM src\-port
 .BM dst\-port
 .BM spi
+.BM l4data
 .BM vlan\-etype
 .BM vlan
 .BM user\-def
@@ -718,6 +719,10 @@ Specify the value of the security parameter index
field (applicable to
 AH/ESP packets)in the incoming packet to match along with an optional
 mask.  Valid for flow-types ip4, ah4, and esp4.
 .TP
+.BI l4data \ N \\fR\ [\\fPm \ N \\fR]\\fP
+Specify the value of the first 4 Bytes of Layer 4 in the incoming
packet to
+match along with an optional mask.  Valid for ip4 flow-type.
+.TP
 .BI vlan\-etype \ N \\fR\ [\\fPm \ N \\fR]\\fP
 Includes the VLAN tag Ethertype and an optional mask.
 .TP
diff --git a/rxclass.c b/rxclass.c
index ee486f7..b227901 100644
--- a/rxclass.c
+++ b/rxclass.c
@@ -622,6 +622,9 @@ static struct rule_opts rule_nfc_usr_ip4[] = {
 	{ "l4proto", OPT_U8, NFC_FLAG_PROTO,
 	  offsetof(struct ethtool_rx_flow_spec, h_u.usr_ip4_spec.proto),
 	  offsetof(struct ethtool_rx_flow_spec, m_u.usr_ip4_spec.proto) },
+	{ "l4data", OPT_BE32, NFC_FLAG_SPI,
+	  offsetof(struct ethtool_rx_flow_spec, h_u.usr_ip4_spec.l4_4_bytes),
+	  offsetof(struct ethtool_rx_flow_spec,
m_u.usr_ip4_spec.l4_4_bytes) },
 	{ "spi", OPT_BE32, NFC_FLAG_SPI,
 	  offsetof(struct ethtool_rx_flow_spec, h_u.usr_ip4_spec.l4_4_bytes),
 	  offsetof(struct ethtool_rx_flow_spec,
m_u.usr_ip4_spec.l4_4_bytes) },
@@ -648,11 +651,11 @@ static struct rule_opts rule_nfc_usr_ip4[] = {
 
 static struct rule_opts rule_nfc_ether[] = {
 	{ "src", OPT_MAC, NFC_FLAG_SADDR,
-	  offsetof(struct ethtool_rx_flow_spec, h_u.ether_spec.h_dest),
-	  offsetof(struct ethtool_rx_flow_spec, m_u.ether_spec.h_dest) },
-	{ "dst", OPT_MAC, NFC_FLAG_DADDR,
 	  offsetof(struct ethtool_rx_flow_spec, h_u.ether_spec.h_source),
 	  offsetof(struct ethtool_rx_flow_spec, m_u.ether_spec.h_source) },
+	{ "dst", OPT_MAC, NFC_FLAG_DADDR,
+	  offsetof(struct ethtool_rx_flow_spec, h_u.ether_spec.h_dest),
+	  offsetof(struct ethtool_rx_flow_spec, m_u.ether_spec.h_dest) },
 	{ "proto", OPT_BE16, NFC_FLAG_PROTO,
 	  offsetof(struct ethtool_rx_flow_spec, h_u.ether_spec.h_proto),
 	  offsetof(struct ethtool_rx_flow_spec, m_u.ether_spec.h_proto) },
@@ -1062,6 +1065,8 @@ int rxclass_parse_ruleopts(char **argp, int argc,
 		}
 	}
 
+	if (flow_type == IP_USER_FLOW)
+		fsp->h_u.usr_ip4_spec.ip_ver = ETH_RX_NFC_IP4;
 	if (flags & (NTUPLE_FLAG_VLAN | NTUPLE_FLAG_UDEF | NTUPLE_FLAG_VETH))
 		fsp->flow_type |= FLOW_EXT;
 



^ permalink raw reply related

* [PATCH 14/14] mm: Account for the number of times direct reclaimers get throttled
From: Mel Gorman @ 2011-06-20 13:12 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Neil Brown,
	Peter Zijlstra, Mel Gorman
In-Reply-To: <1308575540-25219-1-git-send-email-mgorman@suse.de>

Under significant pressure when writing back to network-backed storage,
direct reclaimers may get throttled. This is expected to be a
short-lived event and the processes get woken up again but processes do
get stalled. This patch counts how many times such stalling occurs. It's
up to the administrator whether to reduce these stalls by increasing
min_free_kbytes.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/vm_event_item.h |    1 +
 mm/vmscan.c                   |    1 +
 mm/vmstat.c                   |    1 +
 3 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 03b90cdc..652e5f3 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -29,6 +29,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		FOR_ALL_ZONES(PGSTEAL),
 		FOR_ALL_ZONES(PGSCAN_KSWAPD),
 		FOR_ALL_ZONES(PGSCAN_DIRECT),
+		PGSCAN_DIRECT_THROTTLE,
 #ifdef CONFIG_NUMA
 		PGSCAN_ZONE_RECLAIM_FAILED,
 #endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fe95e4f..f75a5f2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2189,6 +2189,7 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 		return;
 
 	/* Throttle */
+	count_vm_event(PGSCAN_DIRECT_THROTTLE);
 	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
 		pfmemalloc_watermark_ok(zone->zone_pgdat, high_zoneidx));
 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 20c18b7..0ab4a3d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -740,6 +740,7 @@ const char * const vmstat_text[] = {
 	TEXTS_FOR_ZONES("pgsteal")
 	TEXTS_FOR_ZONES("pgscan_kswapd")
 	TEXTS_FOR_ZONES("pgscan_direct")
+	"pgscan_direct_throttle",
 
 #ifdef CONFIG_NUMA
 	"zone_reclaim_failed",
-- 
1.7.3.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH 13/14] mm: Throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage
From: Mel Gorman @ 2011-06-20 13:12 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Neil Brown,
	Peter Zijlstra, Mel Gorman
In-Reply-To: <1308575540-25219-1-git-send-email-mgorman@suse.de>

If swap is backed by network storage such as NBD, there is a risk that a
large number of reclaimers can hang the system by consuming all
PF_MEMALLOC reserves. To avoid these hangs, the administrator must tune
min_free_kbytes in advance. This patch will throttle direct reclaimers
if half the PF_MEMALLOC reserves are in use as the system is at risk of
hanging.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/mmzone.h |    1 +
 mm/page_alloc.c        |    1 +
 mm/vmscan.c            |   54 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 0 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c928dac..5b32906 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -633,6 +633,7 @@ typedef struct pglist_data {
 					     range, including holes */
 	int node_id;
 	wait_queue_head_t kswapd_wait;
+	wait_queue_head_t pfmemalloc_wait;
 	struct task_struct *kswapd;
 	int kswapd_max_order;
 	enum zone_type classzone_idx;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ac779f5..00eea0f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4288,6 +4288,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 	pgdat_resize_init(pgdat);
 	pgdat->nr_zones = 0;
 	init_waitqueue_head(&pgdat->kswapd_wait);
+	init_waitqueue_head(&pgdat->pfmemalloc_wait);
 	pgdat->kswapd_max_order = 0;
 	pgdat_page_cgroup_init(pgdat);
 	
diff --git a/mm/vmscan.c b/mm/vmscan.c
index faa0a08..fe95e4f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2154,6 +2154,45 @@ out:
 	return 0;
 }
 
+static bool pfmemalloc_watermark_ok(pg_data_t *pgdat, int high_zoneidx)
+{
+	struct zone *zone;
+	unsigned long pfmemalloc_reserve = 0;
+	unsigned long free_pages = 0;
+	int i;
+
+	for (i = 0; i <= high_zoneidx; i++) {
+		zone = &pgdat->node_zones[i];
+		pfmemalloc_reserve += min_wmark_pages(zone);
+		free_pages += zone_page_state(zone, NR_FREE_PAGES);
+	}
+
+	return (free_pages > pfmemalloc_reserve / 2) ? true : false;
+}
+
+/*
+ * Throttle direct reclaimers if backing storage is backed by the network
+ * and the PFMEMALLOC reserve for the preferred node is getting dangerously
+ * depleted. kswapd will continue to make progress and wake the processes
+ * when the low watermark is reached
+ */
+static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+					nodemask_t *nodemask)
+{
+	struct zone *zone;
+	int high_zoneidx = gfp_zone(gfp_mask);
+	DEFINE_WAIT(wait);
+
+	/* Check if the pfmemalloc reserves are ok */
+	first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
+	if (pfmemalloc_watermark_ok(zone->zone_pgdat, high_zoneidx))
+		return;
+
+	/* Throttle */
+	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+		pfmemalloc_watermark_ok(zone->zone_pgdat, high_zoneidx));
+}
+
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
@@ -2173,6 +2212,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.gfp_mask = sc.gfp_mask,
 	};
 
+	throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
+
+	/*
+	 * Do not enter reclaim if fatal signal is pending. 1 is returned so
+	 * that the page allocator does not consider triggering OOM
+	 */
+	if (fatal_signal_pending(current))
+		return 1;
+
 	trace_mm_vmscan_direct_reclaim_begin(order,
 				sc.may_writepage,
 				gfp_mask);
@@ -2541,6 +2589,12 @@ loop_again:
 			}
 
 		}
+
+		/* Wake throttled direct reclaimers if low watermark is met */
+		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
+				pfmemalloc_watermark_ok(pgdat, MAX_NR_ZONES - 1))
+			wake_up_interruptible(&pgdat->pfmemalloc_wait);
+
 		if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
 			break;		/* kswapd: all done */
 		/*
-- 
1.7.3.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH 12/14] nbd: Set SOCK_MEMALLOC for access to PFMEMALLOC reserves
From: Mel Gorman @ 2011-06-20 13:12 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Neil Brown,
	Peter Zijlstra, Mel Gorman
In-Reply-To: <1308575540-25219-1-git-send-email-mgorman@suse.de>

Set SOCK_MEMALLOC on the NBD socket to allow access to PFMEMALLOC
reserves so pages backed by NBD, particularly if swap related,
can be cleaned to prevent the machine being deadlocked. It is
still possible that the PFMEMALLOC reserves get depleted resulting
in deadlock but this can be resolved by the administrator by
increasing min_free_kbytes.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 drivers/block/nbd.c |    7 ++++++-
 1 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index f533f33..ca7cd81 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -156,6 +156,7 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
 	struct msghdr msg;
 	struct kvec iov;
 	sigset_t blocked, oldset;
+	unsigned long pflags = current->flags;
 
 	if (unlikely(!sock)) {
 		printk(KERN_ERR "%s: Attempted %s on closed socket in sock_xmit\n",
@@ -168,8 +169,9 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
 	siginitsetinv(&blocked, sigmask(SIGKILL));
 	sigprocmask(SIG_SETMASK, &blocked, &oldset);
 
+	current->flags |= PF_MEMALLOC;
 	do {
-		sock->sk->sk_allocation = GFP_NOIO;
+		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
 		iov.iov_base = buf;
 		iov.iov_len = size;
 		msg.msg_name = NULL;
@@ -215,6 +217,7 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
 	} while (size > 0);
 
 	sigprocmask(SIG_SETMASK, &oldset, NULL);
+	tsk_restore_flags(current, pflags, PF_MEMALLOC);
 
 	return result;
 }
@@ -405,6 +408,8 @@ static int nbd_do_it(struct nbd_device *lo)
 
 	BUG_ON(lo->magic != LO_MAGIC);
 
+	sk_set_memalloc(lo->sock->sk);
+
 	lo->pid = current->pid;
 	ret = sysfs_create_file(&disk_to_dev(lo->disk)->kobj, &pid_attr.attr);
 	if (ret) {
-- 
1.7.3.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox