Netdev List
 help / color / mirror / Atom feed
* [PATCH 1/2] ethtool: Add generic structure and functions for named flags
From: Ben Hutchings @ 2010-05-18 16:32 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: netdev, sf-linux-drivers

This will be used to support named message type flags.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 ethtool.c |   41 +++++++++++++++++++++++++++++++++++++++++
 1 files changed, 41 insertions(+), 0 deletions(-)

diff --git a/ethtool.c b/ethtool.c
index 4226a67..7004b7f 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -355,6 +355,12 @@ struct cmdline_info {
 	void *ioctl_val;
 };
 
+struct named_flag {
+	const char *name;
+	u32 flag;
+	int *wanted;
+};
+
 static struct cmdline_info cmdline_gregs[] = {
 	{ "raw", CMDL_BOOL, &gregs_dump_raw, NULL },
 	{ "hex", CMDL_BOOL, &gregs_dump_hex, NULL },
@@ -520,6 +526,41 @@ static void parse_generic_cmdline(int argc, char **argp,
 	}
 }
 
+static void
+print_flags(const struct named_flag *flags, unsigned int n_flags, u32 value)
+{
+	const char *sep = "";
+
+	while (n_flags) {
+		if (value & flags->flag) {
+			printf("%s%s", sep, flags->name);
+			sep = " ";
+			value &= ~flags->flag;
+		}
+		++flags;
+		--n_flags;
+	}
+
+	/* Print any unrecognised flags in hex */
+	if (value)
+		printf("%s%#x", sep, value);
+}
+
+static u32
+update_flags(const struct named_flag *flags, unsigned int n_flags, u32 value)
+{
+	while (n_flags) {
+		if (*flags->wanted == 0)
+			value &= ~flags->flag;
+		else if (*flags->wanted == 1)
+			value |= flags->flag;
+		++flags;
+		--n_flags;
+	}
+
+	return value;
+}
+
 static int rxflow_str_to_type(const char *str)
 {
 	int flow_type = 0;
-- 
1.6.2.5


-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply related

* [PATCH 2/2] ethtool: Implement named message type flags
From: Ben Hutchings @ 2010-05-18 16:33 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: netdev, sf-linux-drivers
In-Reply-To: <1274200336.2113.0.camel@achroite.uk.solarflarecom.com>

Allow message type flags to be turned on and off by name.
Print the names of the currently set flags below the numeric value.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 ethtool.8 |   66 ++++++++++++++++++++++++++++++-
 ethtool.c |  127 +++++++++++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 178 insertions(+), 15 deletions(-)

diff --git a/ethtool.8 b/ethtool.8
index a7b43d5..5983d0e 100644
--- a/ethtool.8
+++ b/ethtool.8
@@ -200,7 +200,10 @@ ethtool \- Display or change ethernet card settings
 .RB [ wol \ \*(WO]
 .RB [ sopass \ \*(MA]
 .RB [ msglvl
-.IR N ]
+.IR N \ |
+.BI msglvl \ type
+.A1 on off
+.RB ...]
 
 .B ethtool \-n
 .I ethX
@@ -482,9 +485,66 @@ Disable (wake on nothing).  This option clears all previous options.
 .B sopass \*(MA\c
 Sets the SecureOn(tm) password.  The argument to this option must be 6
 bytes in ethernet MAC hex format (\*(MA).
-.TP
+.PP
 .BI msglvl \ N
-Sets the driver message level. Meanings differ per driver.
+.br
+.BI msglvl \ type
+.A1 on off
+.RB ...
+.RS
+Sets the driver message type flags by name or number. \fItype\fR
+names the type of message to enable or disable; \fIN\fR specifies the
+new flags numerically. The defined type names and numbers are:
+.PD 0
+.TP 12
+.B drv
+0x0001  General driver status
+.TP 12
+.B probe
+0x0002  Hardware probing
+.TP 12
+.B link
+0x0004  Link state
+.TP 12
+.B timer
+0x0008  Periodic status check
+.TP 12
+.B ifdown
+0x0010  Interface being brought down
+.TP 12
+.B ifup
+0x0020  Interface being brought up
+.TP 12
+.B rx_err
+0x0040  Receive error
+.TP 12
+.B tx_err
+0x0080  Transmit error
+.TP 12
+.B tx_queued
+0x0100  Transmit queueing
+.TP 12
+.B intr
+0x0200  Interrupt handling
+.TP 12
+.B tx_done
+0x0400  Transmit completion
+.TP 12
+.B rx_status
+0x0800  Receive completion
+.TP 12
+.B pktdata
+0x1000  Packet contents
+.TP 12
+.B hw
+0x2000  Hardware status
+.TP 12
+.B wol
+0x4000  Wake-on-LAN status
+.PP
+The precise meanings of these type flags differ between drivers.
+.PD
+.RE
 .TP
 .B \-n \-\-show-nfc
 Retrieves the receive network flow classification configurations.
diff --git a/ethtool.c b/ethtool.c
index 7004b7f..380a054 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -20,7 +20,6 @@
  *   * better man page (steal from mii-tool?)
  *   * fall back on SIOCMII* ioctl()s and possibly SIOCDEVPRIVATE*
  *   * abstract ioctls to allow for fallback modes of data gathering
- *   * symbolic names for msglvl bitmask
  */
 
 #ifdef HAVE_CONFIG_H
@@ -39,6 +38,7 @@
 #include <net/if.h>
 #include <sys/utsname.h>
 #include <limits.h>
+#include <ctype.h>
 
 #include <linux/sockios.h>
 #include "ethtool-util.h"
@@ -51,6 +51,26 @@
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #endif
 
+#ifndef HAVE_NETIF_MSG
+enum {
+	NETIF_MSG_DRV		= 0x0001,
+	NETIF_MSG_PROBE		= 0x0002,
+	NETIF_MSG_LINK		= 0x0004,
+	NETIF_MSG_TIMER		= 0x0008,
+	NETIF_MSG_IFDOWN	= 0x0010,
+	NETIF_MSG_IFUP		= 0x0020,
+	NETIF_MSG_RX_ERR	= 0x0040,
+	NETIF_MSG_TX_ERR	= 0x0080,
+	NETIF_MSG_TX_QUEUED	= 0x0100,
+	NETIF_MSG_INTR		= 0x0200,
+	NETIF_MSG_TX_DONE	= 0x0400,
+	NETIF_MSG_RX_STATUS	= 0x0800,
+	NETIF_MSG_PKTDATA	= 0x1000,
+	NETIF_MSG_HW		= 0x2000,
+	NETIF_MSG_WOL		= 0x4000,
+};
+#endif
+
 static int parse_wolopts(char *optstr, u32 *data);
 static char *unparse_wolopts(int wolopts);
 static int parse_sopass(char *src, unsigned char *dest);
@@ -126,7 +146,7 @@ static struct option {
 		"		[ xcvr internal|external ]\n"
 		"		[ wol p|u|m|b|a|g|s|d... ]\n"
 		"		[ sopass %x:%x:%x:%x:%x:%x ]\n"
-		"		[ msglvl %d ] \n" },
+		"		[ msglvl %d | msglvl type on|off ... ]\n" },
     { "-a", "--show-pause", MODE_GPAUSE, "Show pause options" },
     { "-A", "--pause", MODE_SPAUSE, "Set pause options",
       "		[ autoneg on|off ]\n"
@@ -311,7 +331,6 @@ static int wol_change = 0;
 static u8 sopass_wanted[SOPASS_MAX];
 static int sopass_change = 0;
 static int gwol_changed = 0; /* did anything in GWOL change? */
-static int msglvl_wanted = -1;
 static int phys_id_time = 0;
 static int gregs_changed = 0;
 static int gregs_dump_raw = 0;
@@ -335,6 +354,25 @@ static struct ethtool_rx_ntuple_flow_spec ntuple_fs;
 static char *flash_file = NULL;
 static int flash = -1;
 static int flash_region = -1;
+
+static int msglvl_changed = 0;
+static int msglvl_wanted = -1;
+static int msg_drv_wanted = -1;
+static int msg_probe_wanted = -1;
+static int msg_link_wanted = -1;
+static int msg_timer_wanted = -1;
+static int msg_ifdown_wanted = -1;
+static int msg_ifup_wanted = -1;
+static int msg_rx_err_wanted = -1;
+static int msg_tx_err_wanted = -1;
+static int msg_tx_queued_wanted = -1;
+static int msg_intr_wanted = -1;
+static int msg_tx_done_wanted = -1;
+static int msg_rx_status_wanted = -1;
+static int msg_pktdata_wanted = -1;
+static int msg_hw_wanted = -1;
+static int msg_wol_wanted = -1;
+
 static enum {
 	ONLINE=0,
 	OFFLINE,
@@ -447,6 +485,42 @@ static struct cmdline_info cmdline_ntuple[] = {
 	{ "action", CMDL_INT, &ntuple_fs.action, NULL },
 };
 
+static struct cmdline_info cmdline_msglvl[] = {
+	{ "drv", CMDL_BOOL, &msg_drv_wanted, NULL },
+	{ "probe", CMDL_BOOL, &msg_probe_wanted, NULL },
+	{ "link", CMDL_BOOL, &msg_link_wanted, NULL },
+	{ "timer", CMDL_BOOL, &msg_timer_wanted, NULL },
+	{ "ifdown", CMDL_BOOL, &msg_ifdown_wanted, NULL },
+	{ "ifup", CMDL_BOOL, &msg_ifup_wanted, NULL },
+	{ "rx_err", CMDL_BOOL, &msg_rx_err_wanted, NULL },
+	{ "tx_err", CMDL_BOOL, &msg_tx_err_wanted, NULL },
+	{ "tx_queued", CMDL_BOOL, &msg_tx_queued_wanted, NULL },
+	{ "intr", CMDL_BOOL, &msg_intr_wanted, NULL },
+	{ "tx_done", CMDL_BOOL, &msg_tx_done_wanted, NULL },
+	{ "rx_status", CMDL_BOOL, &msg_rx_status_wanted, NULL },
+	{ "pktdata", CMDL_BOOL, &msg_pktdata_wanted, NULL },
+	{ "hw", CMDL_BOOL, &msg_hw_wanted, NULL },
+	{ "wol", CMDL_BOOL, &msg_wol_wanted, NULL },
+};
+
+static struct named_flag flag_msglvl[] = {
+	{ "drv", NETIF_MSG_DRV, &msg_drv_wanted },
+	{ "probe", NETIF_MSG_PROBE, &msg_probe_wanted },
+	{ "link", NETIF_MSG_LINK, &msg_link_wanted },
+	{ "timer", NETIF_MSG_TIMER, &msg_timer_wanted },
+	{ "ifdown", NETIF_MSG_IFDOWN, &msg_ifdown_wanted },
+	{ "ifup", NETIF_MSG_IFUP, &msg_ifup_wanted },
+	{ "rx_err", NETIF_MSG_RX_ERR, &msg_rx_err_wanted },
+	{ "tx_err", NETIF_MSG_TX_ERR, &msg_tx_err_wanted },
+	{ "tx_queued", NETIF_MSG_TX_QUEUED, &msg_tx_queued_wanted },
+	{ "intr", NETIF_MSG_INTR, &msg_intr_wanted },
+	{ "tx_done", NETIF_MSG_TX_DONE, &msg_tx_done_wanted },
+	{ "rx_status", NETIF_MSG_RX_STATUS, &msg_rx_status_wanted },
+	{ "pktdata", NETIF_MSG_PKTDATA, &msg_pktdata_wanted },
+	{ "hw", NETIF_MSG_HW, &msg_hw_wanted },
+	{ "wol", NETIF_MSG_WOL, &msg_wol_wanted },
+};
+
 static int get_int(char *str, int base)
 {
 	long v;
@@ -877,7 +951,17 @@ static void parse_cmdline(int argc, char **argp)
 				i++;
 				if (i >= argc)
 					show_usage(1);
-				msglvl_wanted = get_int(argp[i], 0);
+				if (isdigit((unsigned char)argp[i][0])) {
+					msglvl_wanted = get_int(argp[i], 0);
+					msglvl_changed = 1;
+				} else {
+					parse_generic_cmdline(
+						argc, argp, i,
+						&msglvl_changed,
+						cmdline_msglvl,
+						ARRAY_SIZE(cmdline_msglvl));
+					i = argc;
+				}
 				break;
 			}
 			show_usage(1);
@@ -2203,8 +2287,11 @@ static int do_gset(int fd, struct ifreq *ifr)
 	ifr->ifr_data = (caddr_t)&edata;
 	err = send_ioctl(fd, ifr);
 	if (err == 0) {
-		fprintf(stdout, "	Current message level: 0x%08x (%d)\n",
+		fprintf(stdout, "	Current message level: 0x%08x (%d)\n"
+			"			       ",
 			edata.data, edata.data);
+		print_flags(flag_msglvl, ARRAY_SIZE(flag_msglvl), edata.data);
+		fprintf(stdout, "\n");
 		allfail = 0;
 	} else if (errno != EOPNOTSUPP) {
 		perror("Cannot get message level");
@@ -2327,15 +2414,31 @@ static int do_sset(int fd, struct ifreq *ifr)
 		}
 	}
 
-	if (msglvl_wanted != -1) {
+	if (msglvl_changed) {
 		struct ethtool_value edata;
 
-		edata.cmd = ETHTOOL_SMSGLVL;
-		edata.data = msglvl_wanted;
-		ifr->ifr_data = (caddr_t)&edata;;
-		err = send_ioctl(fd, ifr);
-		if (err < 0)
-			perror("Cannot set new msglvl");
+		if (msglvl_wanted == -1) {
+			edata.cmd = ETHTOOL_GMSGLVL;
+			ifr->ifr_data = (caddr_t)&edata;;
+			err = send_ioctl(fd, ifr);
+			if (err < 0)
+				perror("Cannot get msglvl");
+			else
+				msglvl_wanted = update_flags(
+					flag_msglvl, ARRAY_SIZE(flag_msglvl),
+					edata.data);
+		} else {
+			err = 0;
+		}
+
+		if (err == 0) {
+			edata.cmd = ETHTOOL_SMSGLVL;
+			edata.data = msglvl_wanted;
+			ifr->ifr_data = (caddr_t)&edata;;
+			err = send_ioctl(fd, ifr);
+			if (err < 0)
+				perror("Cannot set new msglvl");
+		}
 	}
 
 	return 0;
-- 
1.6.2.5

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply related

* Re: get beyond 1Gbps with pktgen on 10Gb nic?
From: Rick Jones @ 2010-05-18 16:50 UTC (permalink / raw)
  To: Jon Zhou; +Cc: netdev@vger.kernel.org
In-Reply-To: <4A6A2125329CFD4D8CC40C9E8ABCAB9F2497EFC00D@MILEXCH2.ds.jdsu.net>

Jon Zhou wrote:
> hi rick:
> 
> do you mean "TCP_NODELAY" will send with packet size as I expect
> without this option,netperf might sent packet with large size? (but
> eventually it will be splitted into MTU size?)

First things first - netperf only ever calls send() with the size you give it 
via the command line.

It is what happens after that which matters.  Specifically, then when/how TCP 
decides to send the data across the network.  Setting TCP_NODELAY will disable 
the Nagle Algorithm, which, 99 times out of 10, will cause each send() call by 
the application to be a separate TCP segment.  The 100th time out of 10, 
something like a retransmission or a zero window from the remote etc may still 
cause multiple small send() calls to be aggregated into larger segments.  How 
much larger will depend on the Maximum Segment Size (MSS) for the connection, 
the MTU is one of the inputs to the decision of what to use for the MSS.

At the end of this message is a bit of boilerplate I have on the aforementioned 
Nagle algorithm. It is a bit generic, not stack-specific.  It discusses issues 
beyond benchmarking considerations, so keep that in mind while you are reading it.

happy benchmarking,

rick jones


$ cat usenet_replies/nagle_algorithm

 > I'm not familiar with this issue, and I'm mostly ignorant about what
 > tcp does below the sockets interface. Can anybody briefly explain what
 > "nagle" is, and how and when to turn it off? Or point me to the
 > appropriate manual.

In broad terms, whenever an application does a send() call, the logic
of the Nagle algorithm is supposed to go something like this:

1) Is the quantity of data in this send, plus any queued, unsent data,
greater than the MSS (Maximum Segment Size) for this connection? If
yes, send the data in the user's send now (modulo any other
constraints such as receiver's advertised window and the TCP
congestion window). If no, go to 2.

2) Is the connection to the remote otherwise idle? That is, is there
no unACKed data outstanding on the network. If yes, send the data in
the user's send now. If no, queue the data and wait. Either the
application will continue to call send() with enough data to get to a
full MSS-worth of data, or the remote will ACK all the currently sent,
unACKed data, or our retransmission timer will expire.

Now, where applications run into trouble is when they have what might
be described as "write, write, read" behaviour, where they present
logically associated data to the transport in separate 'send' calls
and those sends are typically less than the MSS for the connection.
It isn't so much that they run afoul of Nagle as they run into issues
with the interaction of Nagle and the other heuristics operating on
the remote. In particular, the delayed ACK heuristics.

When a receiving TCP is deciding whether or not to send an ACK back to
the sender, in broad handwaving terms it goes through logic similar to
this:

a) is there data being sent back to the sender? if yes, piggy-back the
ACK on the data segment.

b) is there a window update being sent back to the sender? if yes,
piggy-back the ACK on the window update.

c) has the standalone ACK timer expired.

Window updates are generally triggered by the following heuristics:

i) would the window update be for a non-trivial fraction of the window
- typically somewhere at or above 1/4 the window, that is, has the
application "consumed" at least that much data? if yes, send a
window update. if no, check ii.

ii) would the window update be for, the application "consumed," at
least 2*MSS worth of data? if yes, send a window update, if no wait.

Now, going back to that write, write, read application, on the sending
side, the first write will be transmitted by TCP via logic rule 2 -
the connection is otherwise idle. However, the second small send will
be delayed as there is at that point unACKnowledged data outstanding
on the connection.

At the receiver, that small TCP segment will arrive and will be passed
to the application. The application does not have the entire app-level
message, so it will not send a reply (data to TCP) back. The typical
TCP window is much much larger than the MSS, so no window update would
be triggered by heuristic i. The data just arrived is < 2*MSS, so no
window update from heuristic ii. Since there is no window update, no
ACK is sent by heuristic b.

So, that leaves heuristic c - the standalone ACK timer. That ranges
anywhere between 50 and 200 milliseconds depending on the TCP stack in
use.

If you've read this far :) now we can take a look at the effect of
various things touted as "fixes" to applications experiencing this
interaction.  We take as our example a client-server application where
both the client and the server are implemented with a write of a small
application header, followed by application data.  First, the
"default" case which is with Nagle enabled (TCP_NODELAY _NOT_ set) and
with standard ACK behaviour:

               Client                     Server
              Req Header        ->
                                <-        Standalone ACK after Nms
              Req Data          ->
                                <-        Possible standalone ACK
                                <-        Rsp Header
              Standalone ACK    ->
                                <-        Rsp Data
     Possible standalone ACK    ->


For two "messages" we end-up with at least six segments on the wire.
The possible standalone ACKs will depend on whether the server's
response time, or client's think time is longer than the standalone
ACK interval on their respective sides. Now, if TCP_NODELAY is set we
see:


               Client                     Server
              Req Header        ->
              Req Data          ->
                                <-        Possible Standalone ACK after Nms
                                <-        Rsp Header
                                <-        Rsp Data
      Possible Standalone ACK   ->

In theory, we are down two four segments on the wire which seems good,
but frankly we can do better.  First though, consider what happens
when someone disables delayed ACKs

               Client                     Server
              Req Header        ->
                                <-        Immediate Standalone ACK
              Req Data          ->
                                <-        Immediate Standalone ACK
                                <-        Rsp Header
    Immediate Standalone ACK    ->
                                <-        Rsp Data
    Immediate Standalone ACK    ->

Now we definitly see 8 segments on the wire.  It will also be that way
if both TCP_NODELAY is set and delayed ACKs are disabled.

How about if the application did the "right" think in the first place?
That is sent the logically associated data at the same time:


               Client                     Server
              Request        ->
                             <-           Possible Standalone ACK
                                <-        Response
    Possible Standalone ACK    ->

We are down to two segments on the wire.

For "small" packets, the CPU cost is about the same regardless of data
or ACK.  This means that the application which is making the propper
gathering send call will spend far fewer CPU cycles in the networking
stack.

^ permalink raw reply

* [PATCH] net: Fix definition of netif_vdbg() when VERBOSE_DEBUG is not defined
From: Ben Hutchings @ 2010-05-18 16:56 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers, Joe Perches

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 include/linux/netdevice.h |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c1b2341..667c2a8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2314,7 +2314,7 @@ do {								\
 #define netif_vdbg(priv, type, dev, format, args...)		\
 ({								\
 	if (0)							\
-		netif_printk(KERN_DEBUG, dev, format, ##args);	\
+		netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \
 	0;							\
 })
 #endif
-- 
1.6.2.5

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply related

* Re: [PATCH 1/4] ipv6: Replace inet6_ifaddr->dead with state
From: Stephen Hemminger @ 2010-05-18 17:23 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David Miller, jbohac, yoshfuji, netdev
In-Reply-To: <E1OEKax-00022W-6K@gondolin.me.apana.org.au>

On Tue, 18 May 2010 21:04:19 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

>  
> +enum {
> +	INET6_IFADDR_STATE_DAD,
> +	INET6_IFADDR_STATE_POSTDAD,
> +	INET6_IFADDR_STATE_UP,
> +	INET6_IFADDR_STATE_DEAD,
> +};
> +
>  #ifdef __KERNEL__

Does this really need to be visible to user applications?

-- 

^ permalink raw reply

* Re: [0/4] Fix addrconf race conditions
From: Stephen Hemminger @ 2010-05-18 17:25 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David Miller, jbohac, yoshfuji, netdev
In-Reply-To: <20100518110243.GA7750@gondor.apana.org.au>

On Tue, 18 May 2010 21:02:43 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> On Fri, Apr 23, 2010 at 11:05:40PM +0800, Herbert Xu wrote:
> >
> > This stuff is more broken than I thought.  For example, we perform
> > a number of actions when DAD succeeds, e.g., joining an anycast
> > group.  However, this is not synchronised with respect to address
> > deletion at all, so if DAD succeeds just as someone deletes the
> > address, you can easily get stuck on that anycast group.
> > 
> > I will try to untangle this mess tomorrow.
> 
> Tomorrow took a while to arrive :)
> 
> Here is a first batch of patches.  Note that this is by no means
> a comprehensive fix for all the ndisc/addrconf race conditions.
> It is just a first step in trying to address the problems.
> 
> The patchset revolves around a new lock, ifp->state_lock.  I
> added it instead of trying to reuse the existing ifp->lock because
> the latter has serious nesting issues that prevent it from easily
> being used.  My long term plan is to restructure the locking and
> eventually phase out ifp->lock in favour of ifp->state_lock.

I wonder if so many fine grained locks are really necessary at
all.  Everything but timers looks like it is under RTNL mutex
already.

^ permalink raw reply

* [PATCH -next] bridge: fix build for CONFIG_SYSFS disabled
From: Stephen Hemminger @ 2010-05-18 17:28 UTC (permalink / raw)
  To: David Miller; +Cc: randy.dunlap, sfr, linux-next, linux-kernel, netdev
In-Reply-To: <20100517.223237.55871633.davem@davemloft.net>


From: Randy Dunlap <randy.dunlap@oracle.com>

Fix build when CONFIG_SYSFS is not enabled:
net/bridge/br_if.c:136: error: 'struct net_bridge_port' has no member named 'sysfs_name'

Note: dev->name == sysfs_name except when change name is in
progress, and we are protected from that by RTNL mutex.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Acked-off-by: Stephen Hemminger <shemminger@vyatta.com>


--- a/net/bridge/br_if.c	2010-05-17 10:51:48.638634187 -0700
+++ b/net/bridge/br_if.c	2010-05-18 10:22:28.892111158 -0700
@@ -133,7 +133,7 @@ static void del_nbp(struct net_bridge_po
 	struct net_bridge *br = p->br;
 	struct net_device *dev = p->dev;
 
-	sysfs_remove_link(br->ifobj, p->sysfs_name);
+	sysfs_remove_link(br->ifobj, p->dev->name);
 
 	dev_set_promiscuity(dev, -1);
 

^ permalink raw reply

* [PATCH] net-2.6 : fix dev_get_valid_name
From: Daniel Lezcano @ 2010-05-18 17:35 UTC (permalink / raw)
  To: davem; +Cc: opurdila, netdev

the commit:

commit d90310243fd750240755e217c5faa13e24f41536
Author: Octavian Purdila <opurdila@ixiacom.com>
Date:   Wed Nov 18 02:36:59 2009 +0000

    net: device name allocation cleanups

introduced a bug when there is a hash collision making impossible
to rename a device with eth%d. This bug is very hard to reproduce
and appears rarely.

The problem is coming from we don't pass a temporary buffer to
__dev_alloc_name but 'dev->name' which is modified by the function.

A detailed explanation is here:

http://marc.info/?l=linux-netdev&m=127417784011987&w=2

Signed-off-by: Daniel Lezcano <daniel.lezcano@free.fr>
---
 net/core/dev.c |   20 ++++++++++++--------
 1 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 264137f..4704a1a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -936,18 +936,22 @@ int dev_alloc_name(struct net_device *dev, const char *name)
 }
 EXPORT_SYMBOL(dev_alloc_name);
 
-static int dev_get_valid_name(struct net *net, const char *name, char *buf,
-			      bool fmt)
+static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
 {
+	struct net *net;
+
+	BUG_ON(!dev_net(dev));
+	net = dev_net(dev);
+
 	if (!dev_valid_name(name))
 		return -EINVAL;
 
 	if (fmt && strchr(name, '%'))
-		return __dev_alloc_name(net, name, buf);
+		return dev_alloc_name(dev, name);
 	else if (__dev_get_by_name(net, name))
 		return -EEXIST;
-	else if (buf != name)
-		strlcpy(buf, name, IFNAMSIZ);
+	else if (strncmp(dev->name, name, IFNAMSIZ))
+		 strlcpy(dev->name, name, IFNAMSIZ);
 
 	return 0;
 }
@@ -979,7 +983,7 @@ int dev_change_name(struct net_device *dev, const char *newname)
 
 	memcpy(oldname, dev->name, IFNAMSIZ);
 
-	err = dev_get_valid_name(net, newname, dev->name, 1);
+	err = dev_get_valid_name(dev, newname, 1);
 	if (err < 0)
 		return err;
 
@@ -5083,7 +5087,7 @@ int register_netdevice(struct net_device *dev)
 		}
 	}
 
-	ret = dev_get_valid_name(net, dev->name, dev->name, 0);
+	ret = dev_get_valid_name(dev, dev->name, 0);
 	if (ret)
 		goto err_uninit;
 
@@ -5661,7 +5665,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 		/* We get here if we can't use the current device name */
 		if (!pat)
 			goto out;
-		if (dev_get_valid_name(net, pat, dev->name, 1))
+		if (dev_get_valid_name(dev, pat, 1))
 			goto out;
 	}
 
-- 
1.7.0.4


^ permalink raw reply related

* Re: [PATCHv2] vhost-net: utilize PUBLISH_USED_IDX feature
From: Avi Kivity @ 2010-05-18 17:47 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: davem, Juan Quintela, Rusty Russell, Paul E. McKenney,
	Arnd Bergmann, kvm, virtualization, netdev, linux-kernel,
	alex.williamson, amit.shah
In-Reply-To: <20100518022105.GA23129@redhat.com>

On 05/18/2010 05:21 AM, Michael S. Tsirkin wrote:
> With PUBLISH_USED_IDX, guest tells us which used entries
> it has consumed. This can be used to reduce the number
> of interrupts: after we write a used entry, if the guest has not yet
> consumed the previous entry, or if the guest has already consumed the
> new entry, we do not need to interrupt.
> This imporves bandwidth by 30% under some workflows.
>    

Seems to be missing the cacheline alignment.

Rusty's clarification did not satisfy me, I think it's needed.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

^ permalink raw reply

* Re: [PATCH] net-2.6 : fix dev_get_valid_name
From: Stephen Hemminger @ 2010-05-18 17:54 UTC (permalink / raw)
  To: Daniel Lezcano; +Cc: davem, opurdila, netdev
In-Reply-To: <1274204112-8920-1-git-send-email-daniel.lezcano@free.fr>

On Tue, 18 May 2010 19:35:12 +0200
Daniel Lezcano <daniel.lezcano@free.fr> wrote:

> +static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
>  {
> +	struct net *net;
> +
> +	BUG_ON(!dev_net(dev));
> +	net = dev_net(dev);

This is not really part of the same fix. And not sure why it is added here?

-- 

^ permalink raw reply

* Re: [PATCH v2] Fix SJA1000 command register writes on SMP systems
From: Oliver Hartkopp @ 2010-05-18 18:56 UTC (permalink / raw)
  To: David Miller
  Cc: socketcan-core-0fE9KPoRgkgATYTw5x5z8w,
	netdev-u79uwXL29TY76Z2rM5mHXA, wg-5Yr1BZd7O62+XT7JhA+gdA
In-Reply-To: <20100517.223415.184853371.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>

On 18.05.2010 07:34, David Miller wrote:
> From: Oliver Hartkopp <socketcan-fJ+pQTUTwRTk1uMJSBkQmQ@public.gmane.org>
> Date: Mon, 17 May 2010 22:17:40 +0200
> 
>> diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c
>> index 145b1a7..2760085 100644
>> --- a/drivers/net/can/sja1000/sja1000.c
>> +++ b/drivers/net/can/sja1000/sja1000.c
>> @@ -84,6 +84,27 @@ static struct can_bittiming_const sja1000_bittiming_const = {
>>  	.brp_inc = 1,
>>  };
>>  +static void sja1000_write_cmdreg(struct sja1000_priv *priv, u8 val)
>> +{
>> +	/* the command register needs some locking on SMP systems */
>> +
>> +#ifdef CONFIG_SMP
> 
> Something is adding spurious leading spaces to lines in your patch.

Sorry. This was probably due to the v2 resend. Will fix.

> Also, please don't SMP conditionalize this code.  It makes it such that
> lock debugging et al. can't be used to check this code on uniprocessor.

Ok.

Will send an updated v3 patch shortly.

Thanks,
Oliver

^ permalink raw reply

* Re: [PATCH] net: avoid one atomic op per cloned skb
From: Eric Dumazet @ 2010-05-18 18:58 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <1274190047.8274.1.camel@edumazet-laptop>

Le mardi 18 mai 2010 à 15:40 +0200, Eric Dumazet a écrit :
> Hi David
> 
> I know you said 'only patches', but I found following patch small
> enough ?
> 

> Thanks
> 
> [PATCH] net: avoid one atomic op per cloned skb
> 
> skb_clone() can use atomic_set(clone_ref, 2) safely, because only
> current thread can possibly touch clone_ref at this point.
>    
> Add a WARN_ON_ONCE() for a while, to catch wrong assumptions.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
>  net/core/skbuff.c |    3 ++-
>  1 files changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index c543dd2..4444f15 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -628,7 +628,8 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
>  	    n->fclone == SKB_FCLONE_UNAVAILABLE) {
>  		atomic_t *fclone_ref = (atomic_t *) (n + 1);
>  		n->fclone = SKB_FCLONE_CLONE;
> -		atomic_inc(fclone_ref);
> +		WARN_ON_ONCE(atomic_read(fclone_ref) != 1);
> +		atomic_set(fclone_ref, 2);
>  	} else {
>  		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
>  		if (!n)
> 
> 

Oops, it needs more thinking, definitely not a 2.6.35 thing...

There would be a race between skb_clone() and kfree_skbmem()

kfree_skbmem() must perform the atomic_dec_and_test() before setting
skb->fclone to SKB_FCLONE_UNAVAILABLE.

Doing so avoids dirtying skb->fclone right before kmem_cache_free()...


V2 would be :

[RFC v2] net: avoid one atomic op per cloned skb

skb_clone() can use atomic_set(clone_ref, 2) safely, because only
current thread can possibly touch clone_ref at this point.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c543dd2..77d5a6b 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -370,13 +370,13 @@ static void kfree_skbmem(struct sk_buff *skb)
 		fclone_ref = (atomic_t *) (skb + 1);
 		other = skb - 1;
 
-		/* The clone portion is available for
-		 * fast-cloning again.
-		 */
-		skb->fclone = SKB_FCLONE_UNAVAILABLE;
-
 		if (atomic_dec_and_test(fclone_ref))
 			kmem_cache_free(skbuff_fclone_cache, other);
+		else
+			/* The clone portion is available for fast-cloning. 
+			 * Note this must be done after the fclone_ref change.
+			 */
+			skb->fclone = SKB_FCLONE_UNAVAILABLE;
 		break;
 	}
 }
@@ -628,7 +628,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 	    n->fclone == SKB_FCLONE_UNAVAILABLE) {
 		atomic_t *fclone_ref = (atomic_t *) (n + 1);
 		n->fclone = SKB_FCLONE_CLONE;
-		atomic_inc(fclone_ref);
+		atomic_set(fclone_ref, 2);
 	} else {
 		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
 		if (!n)



^ permalink raw reply related

* [PATCH v3] Fix SJA1000 command register writes on SMP systems
From: Oliver Hartkopp @ 2010-05-18 19:05 UTC (permalink / raw)
  To: David Miller
  Cc: SocketCAN Core Mailing List, Linux Netdev List,
	Wolfgang Grandegger

The SJA1000 command register is concurrently written in the rx-path to free
the receive buffer _and_ in the tx-path to start the transmission.

The SJA1000 data sheet, 6.4.4 COMMAND REGISTER (CMR) states:
"Between two commands at least one internal clock cycle is needed in
order to proceed. The internal clock is half of the external oscillator
frequency."

On SMP systems the current implementation leads to a write stall in the
tx-path, which can be solved by adding some general locking and some time
to settle the write_reg() operation for the command register.

Thanks to Klaus Hitschler for the original fix and detailed problem
description.

This patch applies on net-2.6 and (with some offsets) on net-next-2.6 .

Signed-off-by: Oliver Hartkopp <socketcan-fJ+pQTUTwRTk1uMJSBkQmQ@public.gmane.org>
Acked-by: Wolfgang Grandegger <wg-5Yr1BZd7O62+XT7JhA+gdA@public.gmane.org>

---

diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c
index 145b1a7..dd70d0c 100644
--- a/drivers/net/can/sja1000/sja1000.c
+++ b/drivers/net/can/sja1000/sja1000.c
@@ -84,6 +84,20 @@ static struct can_bittiming_const sja1000_bittiming_const = {
 	.brp_inc = 1,
 };
 
+static void sja1000_write_cmdreg(struct sja1000_priv *priv, u8 val)
+{
+	unsigned long flags;
+
+	/*
+	 * The command register needs some locking and time to settle
+	 * the write_reg() operation - especially on SMP systems.
+	 */
+	spin_lock_irqsave(&priv->cmdreg_lock, flags);
+	priv->write_reg(priv, REG_CMR, val);
+	priv->read_reg(priv, REG_SR);
+	spin_unlock_irqrestore(&priv->cmdreg_lock, flags);
+}
+
 static int sja1000_probe_chip(struct net_device *dev)
 {
 	struct sja1000_priv *priv = netdev_priv(dev);
@@ -297,7 +311,7 @@ static netdev_tx_t sja1000_start_xmit(struct sk_buff *skb,
 
 	can_put_echo_skb(skb, dev, 0);
 
-	priv->write_reg(priv, REG_CMR, CMD_TR);
+	sja1000_write_cmdreg(priv, CMD_TR);
 
 	return NETDEV_TX_OK;
 }
@@ -346,7 +360,7 @@ static void sja1000_rx(struct net_device *dev)
 	cf->can_id = id;
 
 	/* release receive buffer */
-	priv->write_reg(priv, REG_CMR, CMD_RRB);
+	sja1000_write_cmdreg(priv, CMD_RRB);
 
 	netif_rx(skb);
 
@@ -374,7 +388,7 @@ static int sja1000_err(struct net_device *dev, uint8_t isrc, uint8_t status)
 		cf->data[1] = CAN_ERR_CRTL_RX_OVERFLOW;
 		stats->rx_over_errors++;
 		stats->rx_errors++;
-		priv->write_reg(priv, REG_CMR, CMD_CDO);	/* clear bit */
+		sja1000_write_cmdreg(priv, CMD_CDO);	/* clear bit */
 	}
 
 	if (isrc & IRQ_EI) {
diff --git a/drivers/net/can/sja1000/sja1000.h b/drivers/net/can/sja1000/sja1000.h
index 97a622b..de8e778 100644
--- a/drivers/net/can/sja1000/sja1000.h
+++ b/drivers/net/can/sja1000/sja1000.h
@@ -167,6 +167,7 @@ struct sja1000_priv {
 
 	void __iomem *reg_base;	 /* ioremap'ed address to registers */
 	unsigned long irq_flags; /* for request_irq() */
+	spinlock_t cmdreg_lock;  /* lock for concurrent cmd register writes */
 
 	u16 flags;		/* custom mode flags */
 	u8 ocr;			/* output control register */

^ permalink raw reply related

* Re: [PATCH -next] bridge: fix build for CONFIG_SYSFS disabled
From: David Miller @ 2010-05-18 19:26 UTC (permalink / raw)
  To: shemminger; +Cc: randy.dunlap, sfr, linux-next, linux-kernel, netdev
In-Reply-To: <20100518102837.2622c3b1@nehalam>

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Tue, 18 May 2010 10:28:37 -0700

> 
> From: Randy Dunlap <randy.dunlap@oracle.com>
> 
> Fix build when CONFIG_SYSFS is not enabled:
> net/bridge/br_if.c:136: error: 'struct net_bridge_port' has no member named 'sysfs_name'
> 
> Note: dev->name == sysfs_name except when change name is in
> progress, and we are protected from that by RTNL mutex.
> 
> Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
> Acked-off-by: Stephen Hemminger <shemminger@vyatta.com>

It's "Acked-by" :-)

Applied, thanks!

^ permalink raw reply

* Re: [PATCH 06/11] netdev: bfin_mac: avoid tx skb overflows in the tx DMA ring
From: David Miller @ 2010-05-18 19:29 UTC (permalink / raw)
  To: sonic.adi; +Cc: netdev
In-Reply-To: <AANLkTimWjjXKwi_YLFPxBrCBu1qkiuaUKyo1UW5373lH@mail.gmail.com>

From: Sonic Zhang <sonic.adi@gmail.com>
Date: Tue, 18 May 2010 18:52:17 +0800

[ Please never drop the mailing list when you're trying to
  discuss something networking related with me, thanks.
  I've put the CC: back. ]

> On Mon, May 10, 2010 at 7:40 PM, David Miller <davem@davemloft.net> wrote:
>> From: Mike Frysinger <vapier@gentoo.org>
>> Date: Sun,  9 May 2010 06:18:52 -0400
>>
>>> From: Sonic Zhang <sonic.zhang@analog.com>
>>>
>>> Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
>>> Signed-off-by: Mike Frysinger <vapier@gentoo.org>
>>
>> This should never ever happen, it's a bug and you should print a warning
>> message when and if it does actually occur.
>>
>> At any point where your ->next pointer hits tx_list_head, the queue
>> should have been stopped by your driver and therefore the networking
>> core will never pass another packet to you.
> 
> To reduce the tx output overhead, the tx interrupt is not enabled and
> handled in this driver. So, the driver doesn't know when to restart
> the tx queue if the queue is stopped in ndo_start_xmit() at the point
> where next pointer hits tx_list_head.
> 
> In this case, although TX buffer list full rarely happens, the check
> is still a safeguard.

You can't do that, if you don't use the TX interrupt then you can leave
SKBs stale in your TX ring for indefinite periods of time which is
illegal.

SKBs hold onto resources that can't be held indefinitely, such as TCP
socket references and netfilter conntrack state.  So if you leave a
packet in your TX ring for a long time, there might be a TCP socket
that now cannot be closed and freed up because of that.

You must therefore free them very as soon as possible after the
hardware is done with them.

^ permalink raw reply

* Re: [PATCH] net-2.6 : fix dev_get_valid_name
From: Daniel Lezcano @ 2010-05-18 19:55 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: davem, opurdila, netdev
In-Reply-To: <20100518105451.05b060e8@nehalam>

On 05/18/2010 07:54 PM, Stephen Hemminger wrote:
> On Tue, 18 May 2010 19:35:12 +0200
> Daniel Lezcano<daniel.lezcano@free.fr>  wrote:
>
>    
>> +static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
>>   {
>> +	struct net *net;
>> +
>> +	BUG_ON(!dev_net(dev));
>> +	net = dev_net(dev);
>>      
> This is not really part of the same fix. And not sure why it is added here?
>    

I could have created a temporary buffer within the function and have 
passed it to __dev_alloc_name but this is exactly what does 
'dev_alloc_name'.
So 'dev_alloc_name' replaces '__dev_alloc_name' but takes a 'dev' 
parameter which is not passed to 'dev_get_valid_name'.
Instead of passing an extra parameter 'dev', I just replaced the 'net' 
parameter by the 'dev' parameter and extracted the 'net' pointer from it.

I have a few patches cleaning up these functions and the callers but I 
prefer to send them against net-next-2.6.

^ permalink raw reply

* [PATCH 3/3] cxgb4: notify upper drivers if the device is already up when they load
From: Dimitris Michailidis @ 2010-05-18 20:07 UTC (permalink / raw)
  To: netdev; +Cc: Dimitris Michailidis
In-Reply-To: <1274213233-26309-2-git-send-email-dm@chelsio.com>

Upper layer drivers aren't notified that a device is ready if their modules
load after the device becomes ready.  Add the missing notification.

Signed-off-by: Dimitris Michailidis <dm@chelsio.com>
---
 drivers/net/cxgb4/cxgb4_main.c |    3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/drivers/net/cxgb4/cxgb4_main.c b/drivers/net/cxgb4/cxgb4_main.c
index 87161ce..58045b0 100644
--- a/drivers/net/cxgb4/cxgb4_main.c
+++ b/drivers/net/cxgb4/cxgb4_main.c
@@ -2327,6 +2327,9 @@ static void uld_attach(struct adapter *adap, unsigned int uld)
 		register_netevent_notifier(&cxgb4_netevent_nb);
 		netevent_registered = true;
 	}
+
+	if (adap->flags & FULL_INIT_DONE)
+		ulds[uld].state_change(handle, CXGB4_STATE_UP);
 }
 
 static void attach_ulds(struct adapter *adap)
-- 
1.5.4


^ permalink raw reply related

* [PATCH 1/3] cxgb4: fix initial addition of MAC address
From: Dimitris Michailidis @ 2010-05-18 20:07 UTC (permalink / raw)
  To: netdev; +Cc: Dimitris Michailidis

The call to add the MAC address during link initialization wasn't adding the
address to one of the HW tables.  Fix.

Signed-off-by: Dimitris Michailidis <dm@chelsio.com>
---
 drivers/net/cxgb4/cxgb4_main.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/cxgb4/cxgb4_main.c b/drivers/net/cxgb4/cxgb4_main.c
index 90d375b..1f59971 100644
--- a/drivers/net/cxgb4/cxgb4_main.c
+++ b/drivers/net/cxgb4/cxgb4_main.c
@@ -315,7 +315,7 @@ static int link_start(struct net_device *dev)
 	if (ret == 0) {
 		ret = t4_change_mac(pi->adapter, 0, pi->viid,
 				    pi->xact_addr_filt, dev->dev_addr, true,
-				    false);
+				    true);
 		if (ret >= 0) {
 			pi->xact_addr_filt = ret;
 			ret = 0;
-- 
1.5.4


^ permalink raw reply related

* [PATCH 2/3] cxgb4: keep interrupts available when the ports are brought down
From: Dimitris Michailidis @ 2010-05-18 20:07 UTC (permalink / raw)
  To: netdev; +Cc: Dimitris Michailidis
In-Reply-To: <1274213233-26309-1-git-send-email-dm@chelsio.com>

The PF driver needs to remain alert while its ports are down to service
requests or errors from virtual functions or FW so keep interrupts and
queues available when the ports are brought down.  The change makes
open_device_map unnecessary so remove it.

Signed-off-by: Dimitris Michailidis <dm@chelsio.com>
---
 drivers/net/cxgb4/cxgb4.h      |    1 -
 drivers/net/cxgb4/cxgb4_main.c |   49 ++++++++++++++++++----------------------
 2 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/drivers/net/cxgb4/cxgb4.h b/drivers/net/cxgb4/cxgb4.h
index d3a5c34..dd1770e 100644
--- a/drivers/net/cxgb4/cxgb4.h
+++ b/drivers/net/cxgb4/cxgb4.h
@@ -477,7 +477,6 @@ struct adapter {
 	struct pci_dev *pdev;
 	struct device *pdev_dev;
 	unsigned long registered_device_map;
-	unsigned long open_device_map;
 	unsigned long flags;
 
 	const char *name;
diff --git a/drivers/net/cxgb4/cxgb4_main.c b/drivers/net/cxgb4/cxgb4_main.c
index 1f59971..87161ce 100644
--- a/drivers/net/cxgb4/cxgb4_main.c
+++ b/drivers/net/cxgb4/cxgb4_main.c
@@ -2433,23 +2433,17 @@ EXPORT_SYMBOL(cxgb4_unregister_uld);
  */
 static int cxgb_up(struct adapter *adap)
 {
-	int err = 0;
+	int err;
 
-	if (!(adap->flags & FULL_INIT_DONE)) {
-		err = setup_sge_queues(adap);
-		if (err)
-			goto out;
-		err = setup_rss(adap);
-		if (err) {
-			t4_free_sge_resources(adap);
-			goto out;
-		}
-		if (adap->flags & USING_MSIX)
-			name_msix_vecs(adap);
-		adap->flags |= FULL_INIT_DONE;
-	}
+	err = setup_sge_queues(adap);
+	if (err)
+		goto out;
+	err = setup_rss(adap);
+	if (err)
+		goto freeq;
 
 	if (adap->flags & USING_MSIX) {
+		name_msix_vecs(adap);
 		err = request_irq(adap->msix_info[0].vec, t4_nondata_intr, 0,
 				  adap->msix_info[0].desc, adap);
 		if (err)
@@ -2470,11 +2464,14 @@ static int cxgb_up(struct adapter *adap)
 	enable_rx(adap);
 	t4_sge_start(adap);
 	t4_intr_enable(adap);
+	adap->flags |= FULL_INIT_DONE;
 	notify_ulds(adap, CXGB4_STATE_UP);
  out:
 	return err;
  irq_err:
 	dev_err(adap->pdev_dev, "request_irq failed, err %d\n", err);
+ freeq:
+	t4_free_sge_resources(adap);
 	goto out;
 }
 
@@ -2490,6 +2487,9 @@ static void cxgb_down(struct adapter *adapter)
 	} else
 		free_irq(adapter->pdev->irq, adapter);
 	quiesce_rx(adapter);
+	t4_sge_stop(adapter);
+	t4_free_sge_resources(adapter);
+	adapter->flags &= ~FULL_INIT_DONE;
 }
 
 /*
@@ -2501,11 +2501,13 @@ static int cxgb_open(struct net_device *dev)
 	struct port_info *pi = netdev_priv(dev);
 	struct adapter *adapter = pi->adapter;
 
-	if (!adapter->open_device_map && (err = cxgb_up(adapter)) < 0)
-		return err;
+	if (!(adapter->flags & FULL_INIT_DONE)) {
+		err = cxgb_up(adapter);
+		if (err < 0)
+			return err;
+	}
 
 	dev->real_num_tx_queues = pi->nqsets;
-	set_bit(pi->tx_chan, &adapter->open_device_map);
 	link_start(dev);
 	netif_tx_start_all_queues(dev);
 	return 0;
@@ -2513,19 +2515,12 @@ static int cxgb_open(struct net_device *dev)
 
 static int cxgb_close(struct net_device *dev)
 {
-	int ret;
 	struct port_info *pi = netdev_priv(dev);
 	struct adapter *adapter = pi->adapter;
 
 	netif_tx_stop_all_queues(dev);
 	netif_carrier_off(dev);
-	ret = t4_enable_vi(adapter, 0, pi->viid, false, false);
-
-	clear_bit(pi->tx_chan, &adapter->open_device_map);
-
-	if (!adapter->open_device_map)
-		cxgb_down(adapter);
-	return 0;
+	return t4_enable_vi(adapter, 0, pi->viid, false, false);
 }
 
 static struct net_device_stats *cxgb_get_stats(struct net_device *dev)
@@ -3360,8 +3355,8 @@ static void __devexit remove_one(struct pci_dev *pdev)
 		if (adapter->debugfs_root)
 			debugfs_remove_recursive(adapter->debugfs_root);
 
-		t4_sge_stop(adapter);
-		t4_free_sge_resources(adapter);
+		if (adapter->flags & FULL_INIT_DONE)
+			cxgb_down(adapter);
 		t4_free_mem(adapter->l2t);
 		t4_free_mem(adapter->tids.tid_tab);
 		disable_msi(adapter);
-- 
1.5.4


^ permalink raw reply related

* Re: [PATCH v3] Fix SJA1000 command register writes on SMP systems
From: David Miller @ 2010-05-18 21:03 UTC (permalink / raw)
  To: socketcan; +Cc: wg, netdev, socketcan-core
In-Reply-To: <4BF2E512.1090109@hartkopp.net>

From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Tue, 18 May 2010 21:05:54 +0200

> The SJA1000 command register is concurrently written in the rx-path to free
> the receive buffer _and_ in the tx-path to start the transmission.
> 
> The SJA1000 data sheet, 6.4.4 COMMAND REGISTER (CMR) states:
> "Between two commands at least one internal clock cycle is needed in
> order to proceed. The internal clock is half of the external oscillator
> frequency."
> 
> On SMP systems the current implementation leads to a write stall in the
> tx-path, which can be solved by adding some general locking and some time
> to settle the write_reg() operation for the command register.
> 
> Thanks to Klaus Hitschler for the original fix and detailed problem
> description.
> 
> This patch applies on net-2.6 and (with some offsets) on net-next-2.6 .
> 
> Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
> Acked-by: Wolfgang Grandegger <wg@grandegger.com>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH] bfin_mac: fix memleak in mii_bus{probe|remove}
From: David Miller @ 2010-05-18 21:13 UTC (permalink / raw)
  To: kirjanov; +Cc: sonic.adi, michael.hennerich, cooloney, uclinux-dist-devel,
	netdev
In-Reply-To: <20100518113445.GA28386@coldcone>

From: Denis Kirjanov <kirjanov@gmail.com>
Date: Tue, 18 May 2010 15:34:46 +0400

> [PATCH] bfin_mac: fix memleak in mii_bus_{probe|remove}
> Fix memory leak with miibus->irq
> 
> Signed-off-by: Denis Kirjanov <dkirjanov@kernel.org>

I've already applied the original patch.

Therefore, it makes no sense to send a fresh complete version
of the original patch to fix this.

You'll need to send a relative patch against the other one to
fix the problem.

^ permalink raw reply

* Re: [PATCH] bfin_mac: fix memleak in mii_bus{probe|remove}
From: David Miller @ 2010-05-18 21:15 UTC (permalink / raw)
  To: kirjanov; +Cc: sonic.adi, michael.hennerich, cooloney, uclinux-dist-devel,
	netdev
In-Reply-To: <20100518.141318.45114681.davem@davemloft.net>

From: David Miller <davem@davemloft.net>
Date: Tue, 18 May 2010 14:13:18 -0700 (PDT)

> From: Denis Kirjanov <kirjanov@gmail.com>
> Date: Tue, 18 May 2010 15:34:46 +0400
> 
>> [PATCH] bfin_mac: fix memleak in mii_bus_{probe|remove}
>> Fix memory leak with miibus->irq
>> 
>> Signed-off-by: Denis Kirjanov <dkirjanov@kernel.org>
> 
> I've already applied the original patch.

Sorry, ignore this, I didn't see that this is a new patch.

^ permalink raw reply

* Re: [PATCH v3] Fix SJA1000 command register writes on SMP systems
From: Sam Ravnborg @ 2010-05-18 21:31 UTC (permalink / raw)
  To: Oliver Hartkopp
  Cc: David Miller, SocketCAN Core Mailing List, Linux Netdev List,
	Wolfgang Grandegger
In-Reply-To: <4BF2E512.1090109@hartkopp.net>

Hi Oliver.

> diff --git a/drivers/net/can/sja1000/sja1000.h b/drivers/net/can/sja1000/sja1000.h
> index 97a622b..de8e778 100644
> --- a/drivers/net/can/sja1000/sja1000.h
> +++ b/drivers/net/can/sja1000/sja1000.h
> @@ -167,6 +167,7 @@ struct sja1000_priv {
>  
>  	void __iomem *reg_base;	 /* ioremap'ed address to registers */
>  	unsigned long irq_flags; /* for request_irq() */
> +	spinlock_t cmdreg_lock;  /* lock for concurrent cmd register writes */
>  
>  	u16 flags;		/* custom mode flags */
>  	u8 ocr;			/* output control register */

You define your spinlock inside a struct so you cannot use
DEFINE_SPINLOCK().

But then you need to use spin_lock_init() - which I fail to see
you are doing in your patch.

	Sam

^ permalink raw reply

* [PATCH 1/2] cnic: Convert cnic_local_flags to atomic ops.
From: Michael Chan @ 2010-05-18 21:32 UTC (permalink / raw)
  To: davem; +Cc: netdev, Michael Chan

It is easier to expand the flags for other purposes because it does
not require a spin_lock.  The next bug fix patch needs a flag in
cnic_local_flags.

Signed-off-by: Michael Chan <mchan@broadcom.com>
---
 drivers/net/cnic.c |    6 +++---
 drivers/net/cnic.h |    5 ++---
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/cnic.c b/drivers/net/cnic.c
index 4b08b18..7c6d325 100644
--- a/drivers/net/cnic.c
+++ b/drivers/net/cnic.c
@@ -1143,12 +1143,12 @@ static int cnic_submit_bnx2_kwqes(struct cnic_dev *dev, struct kwqe *wqes[],
 
 	spin_lock_bh(&cp->cnic_ulp_lock);
 	if (num_wqes > cnic_kwq_avail(cp) &&
-	    !(cp->cnic_local_flags & CNIC_LCL_FL_KWQ_INIT)) {
+	    !test_bit(CNIC_LCL_FL_KWQ_INIT, &cp->cnic_local_flags)) {
 		spin_unlock_bh(&cp->cnic_ulp_lock);
 		return -EAGAIN;
 	}
 
-	cp->cnic_local_flags &= ~CNIC_LCL_FL_KWQ_INIT;
+	clear_bit(CNIC_LCL_FL_KWQ_INIT, &cp->cnic_local_flags);
 
 	prod = cp->kwq_prod_idx;
 	sw_prod = prod & MAX_KWQ_IDX;
@@ -3690,7 +3690,7 @@ static int cnic_start_bnx2_hw(struct cnic_dev *dev)
 	cp->max_kwq_idx = MAX_KWQ_IDX;
 	cp->kwq_prod_idx = 0;
 	cp->kwq_con_idx = 0;
-	cp->cnic_local_flags |= CNIC_LCL_FL_KWQ_INIT;
+	set_bit(CNIC_LCL_FL_KWQ_INIT, &cp->cnic_local_flags);
 
 	if (CHIP_NUM(cp) == CHIP_NUM_5706 || CHIP_NUM(cp) == CHIP_NUM_5708)
 		cp->kwq_con_idx_ptr = &sblk->status_rx_quick_consumer_index15;
diff --git a/drivers/net/cnic.h b/drivers/net/cnic.h
index a0d853d..4422497 100644
--- a/drivers/net/cnic.h
+++ b/drivers/net/cnic.h
@@ -179,9 +179,8 @@ struct cnic_local {
 #define ULP_F_CALL_PENDING	2
 	struct cnic_ulp_ops *ulp_ops[MAX_CNIC_ULP_TYPE];
 
-	/* protected by ulp_lock */
-	u32 cnic_local_flags;
-#define	CNIC_LCL_FL_KWQ_INIT	0x00000001
+	unsigned long cnic_local_flags;
+#define	CNIC_LCL_FL_KWQ_INIT		0x0
 
 	struct cnic_dev *dev;
 
-- 
1.6.4.GIT



^ permalink raw reply related

* [PATCH 2/2] cnic: Return SPQ credit to bnx2x after ring setup and shutdown.
From: Michael Chan @ 2010-05-18 21:32 UTC (permalink / raw)
  To: davem; +Cc: netdev, Michael Chan
In-Reply-To: <1274218373-16324-1-git-send-email-mchan@broadcom.com>

Everytime the iSCSI ring finishes setup or shutdown, we need to return
the SPQ (slow path queue) credit to the bnx2x driver.  Without this step,
the SPQ will eventually be full causing iSCSI to fail.  This can happen
after 3 or 4 MTU changes for example.

Add code to wait for these slow path commands to complete in the RX ring
and return the SPQ credit to bnx2x.

Signed-off-by: Michael Chan <mchan@broadcom.com>
---
 drivers/net/cnic.c |   71 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/net/cnic.h |    5 +++
 2 files changed, 75 insertions(+), 1 deletions(-)

diff --git a/drivers/net/cnic.c b/drivers/net/cnic.c
index 7c6d325..be90d35 100644
--- a/drivers/net/cnic.c
+++ b/drivers/net/cnic.c
@@ -2145,17 +2145,56 @@ static int cnic_get_kcqes(struct cnic_dev *dev, u16 hw_prod, u16 *sw_prod)
 	return last_cnt;
 }
 
+static int cnic_l2_completion(struct cnic_local *cp)
+{
+	u16 hw_cons, sw_cons;
+	union eth_rx_cqe *cqe, *cqe_ring = (union eth_rx_cqe *)
+					(cp->l2_ring + (2 * BCM_PAGE_SIZE));
+	u32 cmd;
+	int comp = 0;
+
+	if (!test_bit(CNIC_F_BNX2X_CLASS, &cp->dev->flags))
+		return 0;
+
+	hw_cons = *cp->rx_cons_ptr;
+	if ((hw_cons & BNX2X_MAX_RCQ_DESC_CNT) == BNX2X_MAX_RCQ_DESC_CNT)
+		hw_cons++;
+
+	sw_cons = cp->rx_cons;
+	while (sw_cons != hw_cons) {
+		u8 cqe_fp_flags;
+
+		cqe = &cqe_ring[sw_cons & BNX2X_MAX_RCQ_DESC_CNT];
+		cqe_fp_flags = cqe->fast_path_cqe.type_error_flags;
+		if (cqe_fp_flags & ETH_FAST_PATH_RX_CQE_TYPE) {
+			cmd = le32_to_cpu(cqe->ramrod_cqe.conn_and_cmd_data);
+			cmd >>= COMMON_RAMROD_ETH_RX_CQE_CMD_ID_SHIFT;
+			if (cmd == RAMROD_CMD_ID_ETH_CLIENT_SETUP ||
+			    cmd == RAMROD_CMD_ID_ETH_HALT)
+				comp++;
+		}
+		sw_cons = BNX2X_NEXT_RCQE(sw_cons);
+	}
+	return comp;
+}
+
 static void cnic_chk_pkt_rings(struct cnic_local *cp)
 {
 	u16 rx_cons = *cp->rx_cons_ptr;
 	u16 tx_cons = *cp->tx_cons_ptr;
+	int comp = 0;
 
 	if (cp->tx_cons != tx_cons || cp->rx_cons != rx_cons) {
+		if (test_bit(CNIC_LCL_FL_L2_WAIT, &cp->cnic_local_flags))
+			comp = cnic_l2_completion(cp);
+
 		cp->tx_cons = tx_cons;
 		cp->rx_cons = rx_cons;
 
 		uio_event_notify(cp->cnic_uinfo);
 	}
+	if (comp)
+		clear_bit(CNIC_LCL_FL_L2_WAIT, &cp->cnic_local_flags);
 }
 
 static int cnic_service_bnx2(void *data, void *status_blk)
@@ -4168,6 +4207,8 @@ static void cnic_init_rings(struct cnic_dev *dev)
 		for (i = 0; i < sizeof(struct ustorm_eth_rx_producers) / 4; i++)
 			CNIC_WR(dev, off + i * 4, ((u32 *) &rx_prods)[i]);
 
+		set_bit(CNIC_LCL_FL_L2_WAIT, &cp->cnic_local_flags);
+
 		cnic_init_bnx2x_tx_ring(dev);
 		cnic_init_bnx2x_rx_ring(dev);
 
@@ -4175,6 +4216,15 @@ static void cnic_init_rings(struct cnic_dev *dev)
 		l5_data.phy_address.hi = 0;
 		cnic_submit_kwqe_16(dev, RAMROD_CMD_ID_ETH_CLIENT_SETUP,
 			BNX2X_ISCSI_L2_CID, ETH_CONNECTION_TYPE, &l5_data);
+		i = 0;
+		while (test_bit(CNIC_LCL_FL_L2_WAIT, &cp->cnic_local_flags) &&
+		       ++i < 10)
+			msleep(1);
+
+		if (test_bit(CNIC_LCL_FL_L2_WAIT, &cp->cnic_local_flags))
+			netdev_err(dev->netdev,
+				"iSCSI CLIENT_SETUP did not complete\n");
+		cnic_kwq_completion(dev, 1);
 		cnic_ring_ctl(dev, BNX2X_ISCSI_L2_CID, cli, 1);
 	}
 }
@@ -4187,14 +4237,25 @@ static void cnic_shutdown_rings(struct cnic_dev *dev)
 		struct cnic_local *cp = dev->cnic_priv;
 		u32 cli = BNX2X_ISCSI_CL_ID(CNIC_E1HVN(cp));
 		union l5cm_specific_data l5_data;
+		int i;
 
 		cnic_ring_ctl(dev, BNX2X_ISCSI_L2_CID, cli, 0);
 
+		set_bit(CNIC_LCL_FL_L2_WAIT, &cp->cnic_local_flags);
+
 		l5_data.phy_address.lo = cli;
 		l5_data.phy_address.hi = 0;
 		cnic_submit_kwqe_16(dev, RAMROD_CMD_ID_ETH_HALT,
 			BNX2X_ISCSI_L2_CID, ETH_CONNECTION_TYPE, &l5_data);
-		msleep(10);
+		i = 0;
+		while (test_bit(CNIC_LCL_FL_L2_WAIT, &cp->cnic_local_flags) &&
+		       ++i < 10)
+			msleep(1);
+
+		if (test_bit(CNIC_LCL_FL_L2_WAIT, &cp->cnic_local_flags))
+			netdev_err(dev->netdev,
+				"iSCSI CLIENT_HALT did not complete\n");
+		cnic_kwq_completion(dev, 1);
 
 		memset(&l5_data, 0, sizeof(l5_data));
 		cnic_submit_kwqe_16(dev, RAMROD_CMD_ID_ETH_CFC_DEL,
@@ -4315,7 +4376,15 @@ static void cnic_stop_hw(struct cnic_dev *dev)
 {
 	if (test_bit(CNIC_F_CNIC_UP, &dev->flags)) {
 		struct cnic_local *cp = dev->cnic_priv;
+		int i = 0;
 
+		/* Need to wait for the ring shutdown event to complete
+		 * before clearing the CNIC_UP flag.
+		 */
+		while (cp->uio_dev != -1 && i < 15) {
+			msleep(100);
+			i++;
+		}
 		clear_bit(CNIC_F_CNIC_UP, &dev->flags);
 		rcu_assign_pointer(cp->ulp_ops[CNIC_ULP_L4], NULL);
 		synchronize_rcu();
diff --git a/drivers/net/cnic.h b/drivers/net/cnic.h
index 4422497..08b1235 100644
--- a/drivers/net/cnic.h
+++ b/drivers/net/cnic.h
@@ -181,6 +181,7 @@ struct cnic_local {
 
 	unsigned long cnic_local_flags;
 #define	CNIC_LCL_FL_KWQ_INIT		0x0
+#define	CNIC_LCL_FL_L2_WAIT		0x1
 
 	struct cnic_dev *dev;
 
@@ -348,6 +349,10 @@ struct bnx2x_bd_chain_next {
 #define BNX2X_RCQ_DESC_CNT		(BCM_PAGE_SIZE / sizeof(union eth_rx_cqe))
 #define BNX2X_MAX_RCQ_DESC_CNT		(BNX2X_RCQ_DESC_CNT - 1)
 
+#define BNX2X_NEXT_RCQE(x) (((x) & BNX2X_MAX_RCQ_DESC_CNT) ==		\
+		(BNX2X_MAX_RCQ_DESC_CNT - 1)) ?				\
+		((x) + 2) : ((x) + 1)
+
 #define BNX2X_DEF_SB_ID			16
 
 #define BNX2X_ISCSI_RX_SB_INDEX_NUM					\
-- 
1.6.4.GIT



^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox