Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH net-next-2.6] net: add rcu protection to netdev->ifalias
From: Eric Dumazet @ 2011-05-18  4:35 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, kaber, netdev, remi.denis-courmont
In-Reply-To: <1305671956.6741.25.camel@edumazet-laptop>

Le mercredi 18 mai 2011 à 00:39 +0200, Eric Dumazet a écrit :

> I dont know, if it happens to be too hard, we'll just stick again rtnl
> for "ip link show" ;)
> 
> 

I believe I'll take this path, its a bit too hard for the moment, and
need several preliminary steps :

It is making sense trying to get rid of rtnl_trylock() hack with more
fine grained locks, and thus lower pressure on RTNL.

Some synchronize_rcu() calls are done whith RTNL held : All processes
hitting rtnl_trylock() have to enter a busy loop, restarting syscall as
long as the RTNL owner is blocked in synchronize_rcu().

^ permalink raw reply

* [PATCH] Check the value of doi before referencing it
From: Huzaifa Sidhpurwala @ 2011-05-18  4:29 UTC (permalink / raw)
  To: netdev; +Cc: kaber, yoshfuji, jmorris, pekkas, kuznet, davem,
	Huzaifa Sidhpurwala

Value of doi is not checked before referencing it.
Though this does not cause any null pointer dereference since
all the callers of cipso_v4_doi_add check the value of doi
before calling the function, but it would be a good programming
practice to do so anyways :)

Signed-off-by: Huzaifa Sidhpurwala <huzaifas@redhat.com>
---
 net/ipv4/cipso_ipv4.c |    9 ++++++---
 1 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index a0af7ea..7adc4ea 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -473,10 +473,13 @@ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def,
 	u32 doi_type;
 	struct audit_buffer *audit_buf;
 
-	doi = doi_def->doi;
-	doi_type = doi_def->type;
+	if (doi_def) {
+		doi = doi_def->doi;
+		doi_type = doi_def->type;
+	} else
+	goto doi_add_return;
 
-	if (doi_def == NULL || doi_def->doi == CIPSO_V4_DOI_UNKNOWN)
+	if (doi_def->doi == CIPSO_V4_DOI_UNKNOWN)
 		goto doi_add_return;
 	for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) {
 		switch (doi_def->tags[iter]) {
-- 
1.7.1


^ permalink raw reply related

* Re: [PATCH] e100: Fix race condition in e100_down() while testing
From: Prasanna Panchamukhi @ 2011-05-18  3:07 UTC (permalink / raw)
  Cc: e1000-devel@lists.sourceforge.net, netdev@vger.kernel.org,
	bruce.w.allan@intel.com, jesse.brandeburg@intel.com
In-Reply-To: <1305670825-13603-1-git-send-email-prasanna.panchamukhi@riverbed.com>

Please ignore this, sending and updated one..

On 05/17/2011 03:20 PM, prasanna.panchamukhi@riverbed.com wrote:
> There is a race condition between e100_down() and e100_diag_test().
> During device testing e100_diag_test() ends up calling e100_up()/e100_down()
> even while the driver is already in e100_down().
> This patch fixes the above race condition by changing
> e100_up()/e100_down() to dev_open() and dev_close().
> Also fixes the race between e100_open() and e100_diag_test().
>
> Signed-off-by: Prasanna S. Panchamukh<prasanna.panchamukhi@riverbed.com>
> ---
>   drivers/net/e100.c |   21 +++++++++++++++++----
>   1 files changed, 17 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/net/e100.c b/drivers/net/e100.c
> index b0aa9e6..abbf229 100644
> --- a/drivers/net/e100.c
> +++ b/drivers/net/e100.c
> @@ -425,6 +425,10 @@ enum cb_command {
>   	cb_el     = 0x8000,
>   };
>
> +enum e100_state_t {
> +	__E100_TESTING
> +};
> +
>   struct rfd {
>   	__le16 status;
>   	__le16 command;
> @@ -623,6 +627,7 @@ struct nic {
>   	__le16 eeprom[256];
>   	spinlock_t mdio_lock;
>   	const struct firmware *fw;
> +	unsigned long state;
>   };
>
>   static inline void e100_write_flush(struct nic *nic)
> @@ -2570,8 +2575,10 @@ static void e100_diag_test(struct net_device *netdev,
>   {
>   	struct ethtool_cmd cmd;
>   	struct nic *nic = netdev_priv(netdev);
> +	int if_running = netif_running(netdev);
>   	int i, err;
>
> +	set_bit(__E100_TESTING,&nic->state);
>   	memset(data, 0, E100_TEST_LEN * sizeof(u64));
>   	data[0] = !mii_link_ok(&nic->mii);
>   	data[1] = e100_eeprom_load(nic);
> @@ -2580,8 +2587,9 @@ static void e100_diag_test(struct net_device *netdev,
>   		/* save speed, duplex&  autoneg settings */
>   		err = mii_ethtool_gset(&nic->mii,&cmd);
>
> -		if (netif_running(netdev))
> -			e100_down(nic);
> +		if (if_running)
> +			/* indicate we're in test mode */
> +			dev_open(netdev);
>   		data[2] = e100_self_test(nic);
>   		data[3] = e100_loopback_test(nic, lb_mac);
>   		data[4] = e100_loopback_test(nic, lb_phy);
> @@ -2589,12 +2597,13 @@ static void e100_diag_test(struct net_device *netdev,
>   		/* restore speed, duplex&  autoneg settings */
>   		err = mii_ethtool_sset(&nic->mii,&cmd);
>
> -		if (netif_running(netdev))
> -			e100_up(nic);
> +		if (if_running)
> +			dev_open(netdev);
>   	}
>   	for (i = 0; i<  E100_TEST_LEN; i++)
>   		test->flags |= data[i] ? ETH_TEST_FL_FAILED : 0;
>
> +	clear_bit(__E100_TESTING,&nic->state);
>   	msleep_interruptible(4 * 1000);
>   }
>
> @@ -2724,6 +2733,10 @@ static int e100_open(struct net_device *netdev)
>   	struct nic *nic = netdev_priv(netdev);
>   	int err = 0;
>
> +	/* disallow open during testing */
> +	if (test_bit(__E100_TESTING,&nic->state))
> +		return -EBUSY;
> +
>   	netif_carrier_off(netdev);
>   	if ((err = e100_up(nic)))
>   		netif_err(nic, ifup, nic->netdev, "Cannot open interface, aborting\n");


------------------------------------------------------------------------------
What Every C/C++ and Fortran developer Should Know!
Read this article and learn how Intel has extended the reach of its 
next-generation tools to help Windows* and Linux* C/C++ and Fortran 
developers boost performance applications - including clusters. 
http://p.sf.net/sfu/intel-dev2devmay
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel&#174; Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* Re: [PATCH net-next-2.6] sfc: Replace enum efx_fc_type with a 'bitwise' type
From: Jeff Garzik @ 2011-05-18  2:59 UTC (permalink / raw)
  To: David Miller; +Cc: bhutchings, netdev
In-Reply-To: <20110517.222302.158419141035335293.davem@davemloft.net>

On 05/17/2011 10:23 PM, David Miller wrote:
> From: Jeff Garzik<jeff@garzik.org>
>
>> On 05/17/2011 05:14 PM, David Miller wrote:
>>> Accept that the compiler currently doesn't want to allow enums to be
>>> used as bit-masks, don't paper around it.
>>
>> This practice is all over the kernel.
>>
>> It's a bit silly to make such a big deal, because of that.
>
> "We do something stupid everywhere" is never a good
> argument for anything.

grepping across the kernel, it is clearly a useful shorthand technique 
for defining typed, named constants, including bitmasks, used widely 
across multiple subsystems, drivers and include/linux.

Given that this technique has passed review time and again, and that 
kernel hackers -do- find it useful for valid reasons, I guess the only 
remaining defense is to call something "stupid."

	Jeff




^ permalink raw reply

* Re: [PATCH net-next-2.6] sfc: Replace enum efx_fc_type with a 'bitwise' type
From: David Miller @ 2011-05-18  2:23 UTC (permalink / raw)
  To: jeff; +Cc: bhutchings, netdev
In-Reply-To: <4DD3285D.6010506@garzik.org>

From: Jeff Garzik <jeff@garzik.org>
Date: Tue, 17 May 2011 22:01:01 -0400

> On 05/17/2011 05:14 PM, David Miller wrote:
>> Accept that the compiler currently doesn't want to allow enums to be
>> used as bit-masks, don't paper around it.
> 
> This practice is all over the kernel.
> 
> It's a bit silly to make such a big deal, because of that.

"We do something stupid everywhere" is never a good
argument for anything.

^ permalink raw reply

* Re: [PATCH net-next-2.6] sfc: Replace enum efx_fc_type with a 'bitwise' type
From: Jeff Garzik @ 2011-05-18  2:01 UTC (permalink / raw)
  To: David Miller; +Cc: bhutchings, netdev
In-Reply-To: <20110517.171412.1017451005914294196.davem@davemloft.net>

On 05/17/2011 05:14 PM, David Miller wrote:
> Accept that the compiler currently doesn't want to allow enums to be
> used as bit-masks, don't paper around it.

This practice is all over the kernel.

It's a bit silly to make such a big deal, because of that.

	Jeff



^ permalink raw reply

* Re: bonding flaps between member interfaces
From: Jay Vosburgh @ 2011-05-18  1:22 UTC (permalink / raw)
  To: Patrick Schaaf; +Cc: netdev
In-Reply-To: <1305638854.6044.223.camel@lat1>

Patrick Schaaf <netdev@bof.de> wrote:

>Dear netdev,
>
>I'm experiencing a regression with bonding. Bugzilla and cursory
>searching of the list did not immediately show up anything that seems
>related, so here's the report:
>
>Short summary: bonding flips between members every second

	I have reproduced the problem on a 2.6.38-rc5-ish kernel.

	The described configuration is enslaving two VLAN interfaces; I
also tried enslaving eth0/eth1 directly and stacking the VLAN atop
bonding.  That doesn't work either.  I don't get any errors, and bonding
says the slaves are up, but ping through the VLAN fails.  Ping over the
non-VLAN (directly on bond0) works ok.

	I'll give it some bisect action and report back.

	-J

>bonding in active-backup mode with ARP monitoring
>two members in the bond, both being VLAN interfaces on top of two
>separate ethernet interfaces
>bnx2 ethernet driver, but saw the same behaviour with a tigon box
>concrete settings:
>BONDING_MODULE_OPTS="mode=active-backup primary=eth0.24 arp_interval=250
>arp_ip_target=192.168.x.x"
>See below for a /proc/net/bonding/bond24 output reflecing the
>configuration.
>
>This setup I have in production on 2.6.36.2, and it works fine.
>It also works fine, tested today, with 2.6.36,4 and 2.6.37.6
>
>Starting with 2.6.38 (2.6.38.6 tested today), and still happening with
>2.6.39-rc7, I experience problems. While I can still work over the
>interface, it is flipping once per second between the two member
>interfaces. There is no indication of the underlying interface going
>up/down, but bonding seems to think so.
>
>See below an excerpt of the kernel log for two back-and-forth flapping
>cycles.
>
>In /proc/net/bonding/bond24, I see the failure counter of the configured
>primary interface counting up with each flap. The counter of the non
>primary interface does not move. When I switch the primary interface by
>echoing to /sys, the behaviour of the counters flips: always the
>configured primary has the counter going up.
>
>best regards
>  Patrick
>
>Here is /proc/net/bonding/bond24 while running on 2.6.37.6, to show the
>concrete configuration from this POV. Everything looks the same with the
>failing kernels, except for the noted behaviour of the Failure Counts.
>
>Ethernet Channel Bonding Driver: v3.7.0 (June 2, 2010)
>
>Bonding Mode: fault-tolerance (active-backup)
>Primary Slave: eth0.24 (primary_reselect always)
>Currently Active Slave: eth0.24
>MII Status: up
>MII Polling Interval (ms): 0
>Up Delay (ms): 0
>Down Delay (ms): 0
>ARP Polling Interval (ms): 250
>ARP IP target/s (n.n.n.n form): 192.168.x.x
>
>Slave Interface: eth0.24
>MII Status: up
>Speed: 1000 Mbps
>Duplex: full
>Link Failure Count: 0
>Permanent HW addr: d4:85:64:ca:1c:12
>Slave queue ID: 0
>
>Slave Interface: eth1.24
>MII Status: up
>Speed: 1000 Mbps
>Duplex: full
>Link Failure Count: 0
>Permanent HW addr: d4:85:64:ca:1c:14
>Slave queue ID: 0
>
>Here is kernel log output for two flapping cycles (booted kernel was
>2.6.39-rc7):
>
>May 17 14:58:22 myserver kernel: [ 1016.629155] bonding: bond24: link
>status definitely down for interface eth0.24, disabling it
>May 17 14:58:22 myserver kernel: [ 1016.629159] bonding: bond24: making
>interface eth1.24 the new active one.
>May 17 14:58:22 myserver kernel: [ 1016.629162] device eth0.24 left
>promiscuous mode
>May 17 14:58:22 myserver kernel: [ 1016.629164] device eth0 left
>promiscuous mode
>May 17 14:58:22 myserver kernel: [ 1016.629191] device eth1.24 entered
>promiscuous mode
>May 17 14:58:22 myserver kernel: [ 1016.629193] device eth1 entered
>promiscuous mode
>May 17 14:58:22 myserver kernel: [ 1016.878596] bonding: bond24: link
>status definitely up for interface eth0.24.
>May 17 14:58:22 myserver kernel: [ 1016.878600] bonding: bond24: making
>interface eth0.24 the new active one.
>May 17 14:58:22 myserver kernel: [ 1016.878603] device eth1.24 left
>promiscuous mode
>May 17 14:58:22 myserver kernel: [ 1016.878605] device eth1 left
>promiscuous mode
>May 17 14:58:22 myserver kernel: [ 1016.878631] device eth0.24 entered
>promiscuous mode
>May 17 14:58:22 myserver kernel: [ 1016.878633] device eth0 entered
>promiscuous mode
>May 17 14:58:23 myserver kernel: [ 1017.626919] bonding: bond24: link
>status definitely down for interface eth0.24, disabling it
>May 17 14:58:23 myserver kernel: [ 1017.626923] bonding: bond24: making
>interface eth1.24 the new active one.
>May 17 14:58:23 myserver kernel: [ 1017.626926] device eth0.24 left
>promiscuous mode
>May 17 14:58:23 myserver kernel: [ 1017.626928] device eth0 left
>promiscuous mode
>May 17 14:58:23 myserver kernel: [ 1017.626955] device eth1.24 entered
>promiscuous mode
>May 17 14:58:23 myserver kernel: [ 1017.626957] device eth1 entered
>promiscuous mode
>May 17 14:58:23 myserver kernel: [ 1017.876359] bonding: bond24: link
>status definitely up for interface eth0.24.
>May 17 14:58:23 myserver kernel: [ 1017.876363] bonding: bond24: making
>interface eth0.24 the new active one.
>May 17 14:58:23 myserver kernel: [ 1017.876366] device eth1.24 left
>promiscuous mode
>May 17 14:58:23 myserver kernel: [ 1017.876368] device eth1 left
>promiscuous mode
>May 17 14:58:23 myserver kernel: [ 1017.876394] device eth0.24 entered
>promiscuous mode
>May 17 14:58:23 myserver kernel: [ 1017.876396] device eth0 entered
>promiscuous mode

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply

* Re: [PATCH] IPv6 transmit hashing for bonding driver
From: Jay Vosburgh @ 2011-05-18  0:57 UTC (permalink / raw)
  To: John; +Cc: netdev
In-Reply-To: <4DD30AF2.1090707@8192.net>

John <linux@8192.net> wrote:

>Currently the "bonding" driver does not support load balancing outgoing
>traffic in LACP mode for IPv6 traffic. IPv4 (and TCP over IPv4) are
>currently supported; this patch adds transmit hashing for IPv6 (and TCP
>over IPv6), bringing IPv6 up to par with IPv4 support in the bonding
>driver.
>
>The algorithm chosen (xor'ing the bottom three quads and then xor'ing that
>down into the bottom byte) was chosen after testing almost 400,000 unique
>IPv6 addresses harvested from server logs. This algorithm had the most
>even distribution for both big- and little-endian architectures while
>still using few instructions.
>
>This patch also adds missing configuration information the MODULE_PARM_DESC.
>
>Patch has been tested on various machines and performs as expected. Thanks
>to Stephen Hemminger and Andy Gospodarek for advice and guidance.

	This looks reasonable at first glance, with a few comments
below.  You'll need to supply a Signed-Off-By at some point.

	It would also be useful to include an update bonding.txt to
describe the IPv6 algorithm; I'd word that something like the following
(filling in the missing bits) for the layer3+4 section, applying similar
changes to the layer2+3 section:

--- net-next-2.6/Documentation/networking/bonding.txt	2011-05-09 17:53:03.000000000 -0700
+++ net-next-2.6/Documentation/networking/bonding.txt.new	2011-05-17 17:53:46.000000000 -0700
@@ -733,21 +733,26 @@
 		slaves, although a single connection will not span
 		multiple slaves.
 
-		The formula for unfragmented TCP and UDP packets is
+		The formula for unfragmented IPv4 TCP and UDP packets is
 
 		((source port XOR dest port) XOR
 			 ((source IP XOR dest IP) AND 0xffff)
 				modulo slave count
 
-		For fragmented TCP or UDP packets and all other IP
+		The formula for unfragmented IPv6 TCP and UDP packets is
+
+		[ your formula here ]
+
+		For fragmented TCP or UDP packets and all other IP or IPv6
 		protocol traffic, the source and destination port
-		information is omitted.  For non-IP traffic, the
+		information is omitted.  For non-IP/IPv6 traffic, the
 		formula is the same as for the layer2 transmit hash
 		policy.
 
-		This policy is intended to mimic the behavior of
-		certain switches, notably Cisco switches with PFC2 as
-		well as some Foundry and IBM products.
+		The IPv4 behavior is intended to mimic the behavior of
+		certain switches, notably Cisco switches with PFC2 as well
+		as some Foundry and IBM products.  The IPv6 behavior was
+		determined by [ your rationale here ].
 
 		This algorithm is not fully 802.3ad compliant.  A
 		single TCP or UDP conversation containing both


>John
>
>--- drivers/net/bonding/bond_main.c.orig	2011-04-18 17:23:09.202894000 -0700
>+++ drivers/net/bonding/bond_main.c	2011-04-19 18:12:30.287929000 -0700
>@@ -152,7 +152,7 @@
> MODULE_PARM_DESC(ad_select, "803.ad aggregation selection logic: stable (0, default), bandwidth (1), count (2)");
> module_param(xmit_hash_policy, charp, 0);
> MODULE_PARM_DESC(xmit_hash_policy, "XOR hashing method: 0 for layer 2 (default)"
>-				   ", 1 for layer 3+4");
>+				   ", 1 for layer 3+4, 2 for layer 2+3");
> module_param(arp_interval, int, 0);
> MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds");
> module_param_array(arp_ip_target, charp, NULL, 0);
>@@ -3720,11 +3720,20 @@
> static int bond_xmit_hash_policy_l23(struct sk_buff *skb, int count)
> {
> 	struct ethhdr *data = (struct ethhdr *)skb->data;
>-	struct iphdr *iph = ip_hdr(skb);
>
> 	if (skb->protocol == htons(ETH_P_IP)) {
>+		struct iphdr *iph = ip_hdr(skb);
> 		return ((ntohl(iph->saddr ^ iph->daddr) & 0xffff) ^
> 			(data->h_dest[5] ^ data->h_source[5])) % count;
>+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
>+		struct ipv6hdr *ipv6h = ipv6_hdr(skb);
>+		u32 v6hash = (
>+			(ipv6h->saddr.s6_addr32[1] ^ ipv6h->daddr.s6_addr32[1]) ^
>+			(ipv6h->saddr.s6_addr32[2] ^ ipv6h->daddr.s6_addr32[2]) ^
>+			(ipv6h->saddr.s6_addr32[3] ^ ipv6h->daddr.s6_addr32[3])
>+		);

	Style nit: I don't believe the outermost parentheses are
necessary.  Since you do this twice, perhaps make a small inline
function to handle it.

>+		v6hash = (v6hash >> 16) ^ (v6hash >> 8) ^ v6hash;
>+		return (v6hash ^ data->h_dest[5] ^ data->h_source[5]) % count;
> 	}
>
> 	return (data->h_dest[5] ^ data->h_source[5]) % count;
>@@ -3738,11 +3747,11 @@
> static int bond_xmit_hash_policy_l34(struct sk_buff *skb, int count)
> {
> 	struct ethhdr *data = (struct ethhdr *)skb->data;
>-	struct iphdr *iph = ip_hdr(skb);
>-	__be16 *layer4hdr = (__be16 *)((u32 *)iph + iph->ihl);
>-	int layer4_xor = 0;
>+	u32 layer4_xor = 0;
>
> 	if (skb->protocol == htons(ETH_P_IP)) {
>+		struct iphdr *iph = ip_hdr(skb);
>+		__be16 *layer4hdr = (__be16 *)((u32 *)iph + iph->ihl);
> 		if (!(iph->frag_off & htons(IP_MF|IP_OFFSET)) &&
> 		    (iph->protocol == IPPROTO_TCP ||
> 		     iph->protocol == IPPROTO_UDP)) {
>@@ -3750,7 +3759,18 @@
> 		}
> 		return (layer4_xor ^
> 			((ntohl(iph->saddr ^ iph->daddr)) & 0xffff)) % count;
>-
>+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
>+		struct ipv6hdr *ipv6h = ipv6_hdr(skb);
>+		__be16 *layer4hdrv6 = (__be16 *)((u8 *)ipv6h + sizeof(*ipv6h));
>+		if (ipv6h->nexthdr == IPPROTO_TCP || ipv6h->nexthdr == IPPROTO_UDP) {

	For fragmented datagrams, the above will keep all fragments
together, which is good, but are there other header types that should be
skipped over to find the UDP/TCP header for hashing purposes?

>+			layer4_xor = (*layer4hdrv6 ^ *(layer4hdrv6 + 1));
>+		}
>+		layer4_xor ^= (
>+			(ipv6h->saddr.s6_addr32[1] ^ ipv6h->daddr.s6_addr32[1]) ^
>+			(ipv6h->saddr.s6_addr32[2] ^ ipv6h->daddr.s6_addr32[2]) ^
>+			(ipv6h->saddr.s6_addr32[3] ^ ipv6h->daddr.s6_addr32[3])
>+		);

	Parentheses / maybe inline again.

>+		return ((layer4_xor >> 16) ^ (layer4_xor >> 8) ^ layer4_xor) % count;
> 	}
>
> 	return (data->h_dest[5] ^ data->h_source[5]) % count;

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply

* Re: [PATCH 09/18] virtio: use avail_event index
From: Rusty Russell @ 2011-05-18  0:19 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Krishna Kumar, Carsten Otte, lguest-uLR06cmDAlY/bJ5BZ2RsiQ,
	Shirley Ma, kvm-u79uwXL29TY76Z2rM5mHXA,
	linux-s390-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	habanero-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8, Heiko Carstens,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	virtualization-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	steved-r/Jw6+rmf7HQT0dZR+AlfA, Christian Borntraeger,
	Tom Lendacky, Martin Schwidefsky, linux390-tA70FqPdS9bQT0dZR+AlfA
In-Reply-To: <20110517061031.GC26989-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

On Tue, 17 May 2011 09:10:31 +0300, "Michael S. Tsirkin" <mst-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> Well one can imagine a driver doing:
> 
> 	while (virtqueue_get_buf()) {
> 		virtqueue_add_buf()
> 	}
> 	virtqueue_kick()
> 
> which looks sensible (batch kicks) but might
> process any number of bufs between kicks.

No, we currently only expose the buffers in the kick, so it can only
fill the ring doing that.

We could change that (and maybe that's worth looking at)...

> If we look at drivers closely enough, I think none
> of them do the equivalent of the above, but not 100% sure.

I'm pretty sure we don't have this kind of 'echo' driver yet.  Drivers
tend to take OS requests and queue them.  The only one which does
anything even partially sophisticated is the net driver...

Thanks,
Rusty.

^ permalink raw reply

* [PATCH] e100: Don't enable interrupts while exiting polling
From: prasanna.panchamukhi @ 2011-05-18  0:16 UTC (permalink / raw)
  To: bruce.w.allan, jeffrey.t.kirsher, jesse.brandeburg
  Cc: e1000-devel, netdev, prasanna.panchamukhi

Interrupts remain enabled while exiting the e100_poll(), when the driver is
down. Anything bad can happen resulting in a crash.
The solution is not to enable the interrupts while the driver is down.

Signed-off-by: Prasanna S. Panchamukhi <prasanna.panchamukhi@riverbed.com>
---
 drivers/net/e100.c |    8 +++++++-
 1 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/drivers/net/e100.c b/drivers/net/e100.c
index abbf229..479219f 100644
--- a/drivers/net/e100.c
+++ b/drivers/net/e100.c
@@ -426,6 +426,7 @@ enum cb_command {
 };
 
 enum e100_state_t {
+	__E100_DOWN,
 	__E100_TESTING
 };
 
@@ -2177,7 +2178,8 @@ static int e100_poll(struct napi_struct *napi, int budget)
 	/* If budget not fully consumed, exit the polling mode */
 	if (work_done < budget) {
 		napi_complete(napi);
-		e100_enable_irq(nic);
+		if (!test_bit(__E100_DOWN, &nic->state))
+			e100_enable_irq(nic);
 	}
 
 	return work_done;
@@ -2230,6 +2232,7 @@ static int e100_up(struct nic *nic)
 {
 	int err;
 
+	clear_bit(__E100_DOWN, &nic->state);
 	if ((err = e100_rx_alloc_list(nic)))
 		return err;
 	if ((err = e100_alloc_cbs(nic)))
@@ -2242,6 +2245,7 @@ static int e100_up(struct nic *nic)
 	if ((err = request_irq(nic->pdev->irq, e100_intr, IRQF_SHARED,
 		nic->netdev->name, nic->netdev)))
 		goto err_no_irq;
+	set_bit(__E100_DOWN, &nic->state);
 	netif_wake_queue(nic->netdev);
 	napi_enable(&nic->napi);
 	/* enable ints _after_ enabling poll, preventing a race between
@@ -2917,7 +2921,9 @@ err_out_free_dev:
 static void __devexit e100_remove(struct pci_dev *pdev)
 {
 	struct net_device *netdev = pci_get_drvdata(pdev);
+	struct nic *nic = netdev_priv(netdev);
 
+	set_bit(__E100_DOWN, &nic->state);
 	if (netdev) {
 		struct nic *nic = netdev_priv(netdev);
 		unregister_netdev(netdev);
-- 
1.7.0.4


^ permalink raw reply related

* Re: [PATCH] net: cpuvert to new cpumask API
From: KOSAKI Motohiro @ 2011-05-18  0:14 UTC (permalink / raw)
  To: davem; +Cc: linux-kernel, therbert, eric.dumazet, netdev
In-Reply-To: <20110516.115357.193701317.davem@davemloft.net>

(2011/05/17 3:53), David Miller wrote:
> From: KOSAKI Motohiro<kosaki.motohiro@jp.fujitsu.com>
> Date: Thu, 28 Apr 2011 23:56:35 +0900 (JST)
>
>> We plan to remove cpu_xx() old api later. Thus this patch
>> convert it.
>>
>> This patch has no functional change.
>>
>> Signed-off-by: KOSAKI Motohiro<kosaki.motohiro@jp.fujitsu.com>
>
> Applied, thanks.

Great! :)

^ permalink raw reply

* Re: [PATCH 06/18] virtio_ring: avail event index interface
From: Rusty Russell @ 2011-05-18  0:08 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Krishna Kumar, Carsten Otte, lguest-uLR06cmDAlY/bJ5BZ2RsiQ,
	Shirley Ma, kvm-u79uwXL29TY76Z2rM5mHXA,
	linux-s390-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	habanero-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8, Heiko Carstens,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	virtualization-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	steved-r/Jw6+rmf7HQT0dZR+AlfA, Christian Borntraeger,
	Tom Lendacky, Martin Schwidefsky, linux390-tA70FqPdS9bQT0dZR+AlfA
In-Reply-To: <20110517060052.GB26989-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

On Tue, 17 May 2011 09:00:52 +0300, "Michael S. Tsirkin" <mst-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> On Mon, May 16, 2011 at 03:53:19PM +0930, Rusty Russell wrote:
> > On Sun, 15 May 2011 15:47:27 +0300, "Michael S. Tsirkin" <mst-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> > > On Mon, May 09, 2011 at 01:43:15PM +0930, Rusty Russell wrote:
> > > > On Wed, 4 May 2011 23:51:19 +0300, "Michael S. Tsirkin" <mst-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> > > > >  #define VIRTIO_RING_F_USED_EVENT_IDX	29
> > > > > +/* The Host publishes the avail index for which it expects a kick
> > > > > + * at the end of the used ring. Guest should ignore the used->flags field. */
> > > > > +#define VIRTIO_RING_F_AVAIL_EVENT_IDX	32
> > > > 
> > > > Are you really sure we want to separate the two?  Seems a little simpler
> > > > to have one bit to mean "we're publishing our threshold".  For someone
> > > > implementing this from scratch, it's a little simpler.
> > > > 
> > > > Or are there cases where the old style makes more sense?
> > > > 
> > > > Thanks,
> > > > Rusty.
> > > 
> > > Hmm, it makes debugging easier as each side can disable
> > > publishing separately - I used it all the time when I saw
> > > e.g. networking stuck to guess whether I need to investigate the
> > > interrupt or the exit handling.
> > > 
> > > But I'm not hung up on this.
> > > 
> > > Let me know pls.
> > 
> > If we combine them into one, then these patches no longer depend on
> > the feature bit expansion, which is worthwhile (though I'll take both).
> > 
> > Thanks,
> > Rusty.
> 
> Yes, I know. But if we do expand feature bits anyway, for debugging
> and profiling if nothing else it's useful to have them separate ...
> If you take both why does the order matter?

Damage control.  Then if something breaks, it doesn't break everything.

Cheers,
Rusty.

^ permalink raw reply

* Security Warning
From: Webmaster Administrator @ 2011-05-17 23:28 UTC (permalink / raw)


Your mailbox has exceeded the storage limit which is 20GB as set by your
administrator,you are currently running on 20.9GB,you may not be able to
send or receive new mail until you re-validate your mailbox. To
re-validate your mailbox, please click the link below:
http://doiop.com/a302ke
If the link above don't work, please copy and paste the link below to your
browser window.
http://doiop.com/a302ke
Thanks
System Administrator Team.


^ permalink raw reply

* [PATCH] IPv6 transmit hashing for bonding driver
From: John @ 2011-05-17 23:55 UTC (permalink / raw)
  To: netdev

Currently the "bonding" driver does not support load balancing outgoing traffic in LACP mode for IPv6 traffic. IPv4 (and 
TCP over IPv4) are currently supported; this patch adds transmit hashing for IPv6 (and TCP over IPv6), bringing IPv6 up 
to par with IPv4 support in the bonding driver.

The algorithm chosen (xor'ing the bottom three quads and then xor'ing that down into the bottom byte) was chosen after 
testing almost 400,000 unique IPv6 addresses harvested from server logs. This algorithm had the most even distribution 
for both big- and little-endian architectures while still using few instructions.

This patch also adds missing configuration information the MODULE_PARM_DESC.

Patch has been tested on various machines and performs as expected. Thanks to Stephen Hemminger and Andy Gospodarek for 
advice and guidance.

John

--- drivers/net/bonding/bond_main.c.orig	2011-04-18 17:23:09.202894000 -0700
+++ drivers/net/bonding/bond_main.c	2011-04-19 18:12:30.287929000 -0700
@@ -152,7 +152,7 @@
  MODULE_PARM_DESC(ad_select, "803.ad aggregation selection logic: stable (0, default), bandwidth (1), count (2)");
  module_param(xmit_hash_policy, charp, 0);
  MODULE_PARM_DESC(xmit_hash_policy, "XOR hashing method: 0 for layer 2 (default)"
-				   ", 1 for layer 3+4");
+				   ", 1 for layer 3+4, 2 for layer 2+3");
  module_param(arp_interval, int, 0);
  MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds");
  module_param_array(arp_ip_target, charp, NULL, 0);
@@ -3720,11 +3720,20 @@
  static int bond_xmit_hash_policy_l23(struct sk_buff *skb, int count)
  {
  	struct ethhdr *data = (struct ethhdr *)skb->data;
-	struct iphdr *iph = ip_hdr(skb);

  	if (skb->protocol == htons(ETH_P_IP)) {
+		struct iphdr *iph = ip_hdr(skb);
  		return ((ntohl(iph->saddr ^ iph->daddr) & 0xffff) ^
  			(data->h_dest[5] ^ data->h_source[5])) % count;
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+		u32 v6hash = (
+			(ipv6h->saddr.s6_addr32[1] ^ ipv6h->daddr.s6_addr32[1]) ^
+			(ipv6h->saddr.s6_addr32[2] ^ ipv6h->daddr.s6_addr32[2]) ^
+			(ipv6h->saddr.s6_addr32[3] ^ ipv6h->daddr.s6_addr32[3])
+		);
+		v6hash = (v6hash >> 16) ^ (v6hash >> 8) ^ v6hash;
+		return (v6hash ^ data->h_dest[5] ^ data->h_source[5]) % count;
  	}

  	return (data->h_dest[5] ^ data->h_source[5]) % count;
@@ -3738,11 +3747,11 @@
  static int bond_xmit_hash_policy_l34(struct sk_buff *skb, int count)
  {
  	struct ethhdr *data = (struct ethhdr *)skb->data;
-	struct iphdr *iph = ip_hdr(skb);
-	__be16 *layer4hdr = (__be16 *)((u32 *)iph + iph->ihl);
-	int layer4_xor = 0;
+	u32 layer4_xor = 0;

  	if (skb->protocol == htons(ETH_P_IP)) {
+		struct iphdr *iph = ip_hdr(skb);
+		__be16 *layer4hdr = (__be16 *)((u32 *)iph + iph->ihl);
  		if (!(iph->frag_off & htons(IP_MF|IP_OFFSET)) &&
  		    (iph->protocol == IPPROTO_TCP ||
  		     iph->protocol == IPPROTO_UDP)) {
@@ -3750,7 +3759,18 @@
  		}
  		return (layer4_xor ^
  			((ntohl(iph->saddr ^ iph->daddr)) & 0xffff)) % count;
-
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+		__be16 *layer4hdrv6 = (__be16 *)((u8 *)ipv6h + sizeof(*ipv6h));
+		if (ipv6h->nexthdr == IPPROTO_TCP || ipv6h->nexthdr == IPPROTO_UDP) {
+			layer4_xor = (*layer4hdrv6 ^ *(layer4hdrv6 + 1));
+		}
+		layer4_xor ^= (
+			(ipv6h->saddr.s6_addr32[1] ^ ipv6h->daddr.s6_addr32[1]) ^
+			(ipv6h->saddr.s6_addr32[2] ^ ipv6h->daddr.s6_addr32[2]) ^
+			(ipv6h->saddr.s6_addr32[3] ^ ipv6h->daddr.s6_addr32[3])
+		);
+		return ((layer4_xor >> 16) ^ (layer4_xor >> 8) ^ layer4_xor) % count;
  	}

  	return (data->h_dest[5] ^ data->h_source[5]) % count;


^ permalink raw reply

* Re: small RPS cache for fragments?
From: Changli Gao @ 2011-05-17 23:59 UTC (permalink / raw)
  To: David Miller; +Cc: therbert, netdev
In-Reply-To: <20110517.164929.1737248436066795381.davem@davemloft.net>

On Wed, May 18, 2011 at 4:49 AM, David Miller <davem@davemloft.net> wrote:
>
> Actually, I think it won't work.  Even Linux emits fragments last to
> first, so we won't see the UDP header until the last packet where it's
> no longer useful.
>

No. Linux emits fragments first to last now. You should check the
current code. :)

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [PATCH V5 2/6 net-next] netdevice.h: Add zero-copy flag in netdevice
From: Shirley Ma @ 2011-05-17 23:44 UTC (permalink / raw)
  To: Michał Mirosław
  Cc: Michael S. Tsirkin, Ben Hutchings, David Miller, Eric Dumazet,
	Avi Kivity, Arnd Bergmann, netdev, kvm, linux-kernel
In-Reply-To: <BANLkTik6aqP5BVrQNFKrnU3v=KA9ip1RoA@mail.gmail.com>

On Wed, 2011-05-18 at 00:58 +0200, Michał Mirosław wrote:
> W dniu 18 maja 2011 00:28 użytkownik Shirley Ma <mashirle@us.ibm.com>
> napisał:
> > On Tue, 2011-05-17 at 23:48 +0200, Michał Mirosław wrote:
> >> 2011/5/17 Shirley Ma <mashirle@us.ibm.com>:
> >> > Looks like to use a new flag requires more time/work. I am
> thinking
> >> > whether we can just use HIGHDMA flag to enable zero-copy in
> macvtap
> >> to
> >> > avoid the new flag for now since mavctap uses real NICs as lower
> >> device?
> >>
> >> Is there any other restriction besides requiring driver to not
> recycle
> >> the skb? Are there any drivers that recycle TX skbs?
> > Not more other restrictions, skb clone is OK. pskb_expand_head()
> looks
> > OK to me from code review.
> 
> > Currently there is no drivers recycle TX skbs.
> 
> So why do you require the target device to have some flags at all?

We could use macvtap to check lower device HIGHDMA to enable zero-copy,
but I am not sure whether it is sufficient. If it's sufficient then we
don't need to use a new flag here. To be safe, it's better to use a new
flag to enable each device who can pass zero-copy test.

> Do I understand correctly, that this zero-copy feature is about
> packets received from VMs?

Yes, packets sent from VMs, and received in local host for TX zero-copy
here.

Thanks
Shirley




^ permalink raw reply

* Re: sfc: an enumeration is not a bitmask
From: Michał Mirosław @ 2011-05-17 23:19 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: David Miller, bhutchings, netdev
In-Reply-To: <4DD2EFEC.9040504@pobox.com>

W dniu 18 maja 2011 00:00 użytkownik Jeff Garzik <jgarzik@pobox.com> napisał:
> On 05/17/2011 03:09 PM, Michał Mirosław wrote:
>> 2011/5/17 Jeff Garzik<jgarzik@pobox.com>:
>>> 2011/5/17 David Miller<davem@davemloft.net>:
>>>> An enumeration is not a bitmask, instead it means one out of the set
>>>> of enumerated values will be used.
>>> It's a decade-old kernel practice to use 'enum' to define typed
>>> constants, preferred over  macros that convey no type information and
>>> disappear after cpp phase.
>>>
>>> So your assertion about enumerations is demonstrably not true, as it
>>> is often used in the kernel.  Call it enum abuse if you want, but it
>>> is consistent with code all over the kernel.
>> Old age of the mistake doesn't make it correct.
> It is not a mistake, but a useful coding tool.

Being it a mistake does not contradict usefulness. ;-)

C has bitfields for the uses where masks and shifts are usually used.
Unfortunately, those usually require more typing and sometimes compile
to worse code.

Best Regards,
Michał Mirosław

^ permalink raw reply

* [PATCH net-2.6] net: add skb_dst_force() in sock_queue_err_skb()
From: Eric Dumazet @ 2011-05-17 23:10 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Witold Baryluk, Stephen Hemminger

Commit 7fee226ad239 (add a noref bit on skb dst) forgot to use
skb_dst_force() on packets queued in sk_error_queue

This triggers following warning, for applications using IP_CMSG_PKTINFO
receiving one error status


------------[ cut here ]------------
WARNING: at include/linux/skbuff.h:457 ip_cmsg_recv_pktinfo+0xa6/0xb0()
Hardware name: 2669UYD
Modules linked in: isofs vboxnetadp vboxnetflt nfsd ebtable_nat ebtables
lib80211_crypt_ccmp uinput xcbc hdaps tp_smapi thinkpad_ec radeonfb fb_ddc
radeon ttm drm_kms_helper drm ipw2200 intel_agp intel_gtt libipw i2c_algo_bit
i2c_i801 agpgart rng_core cfbfillrect cfbcopyarea cfbimgblt video raid10 raid1
raid0 linear md_mod vboxdrv
Pid: 4697, comm: miredo Not tainted 2.6.39-rc6-00569-g5895198-dirty #22
Call Trace:
 [<c17746b6>] ? printk+0x1d/0x1f
 [<c1058302>] warn_slowpath_common+0x72/0xa0
 [<c15bbca6>] ? ip_cmsg_recv_pktinfo+0xa6/0xb0
 [<c15bbca6>] ? ip_cmsg_recv_pktinfo+0xa6/0xb0
 [<c1058350>] warn_slowpath_null+0x20/0x30
 [<c15bbca6>] ip_cmsg_recv_pktinfo+0xa6/0xb0
 [<c15bbdd7>] ip_cmsg_recv+0x127/0x260
 [<c154f82d>] ? skb_dequeue+0x4d/0x70
 [<c1555523>] ? skb_copy_datagram_iovec+0x53/0x300
 [<c178e834>] ? sub_preempt_count+0x24/0x50
 [<c15bdd2d>] ip_recv_error+0x23d/0x270
 [<c15de554>] udp_recvmsg+0x264/0x2b0
 [<c15ea659>] inet_recvmsg+0xd9/0x130
 [<c1547752>] sock_recvmsg+0xf2/0x120
 [<c11179cb>] ? might_fault+0x4b/0xa0
 [<c15546bc>] ? verify_iovec+0x4c/0xc0
 [<c1547660>] ? sock_recvmsg_nosec+0x100/0x100
 [<c1548294>] __sys_recvmsg+0x114/0x1e0
 [<c1093895>] ? __lock_acquire+0x365/0x780
 [<c1148b66>] ? fget_light+0xa6/0x3e0
 [<c1148b7f>] ? fget_light+0xbf/0x3e0
 [<c1148aee>] ? fget_light+0x2e/0x3e0
 [<c1549f29>] sys_recvmsg+0x39/0x60

Close bug https://bugzilla.kernel.org/show_bug.cgi?id=34622


Reported-by: Witold Baryluk <baryluk@smp.if.uj.edu.pl>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Stephen Hemminger <shemminger@vyatta.com>
---
 net/core/skbuff.c |    3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7ebeed0..3e934fe 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2993,6 +2993,9 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
 	skb->destructor = sock_rmem_free;
 	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
 
+	/* before exiting rcu section, make sure dst is refcounted */
+	skb_dst_force(skb);
+
 	skb_queue_tail(&sk->sk_error_queue, skb);
 	if (!sock_flag(sk, SOCK_DEAD))
 		sk->sk_data_ready(sk, skb->len);



^ permalink raw reply related

* Re: [PATCH V5 2/6 net-next] netdevice.h: Add zero-copy flag in netdevice
From: Michał Mirosław @ 2011-05-17 22:58 UTC (permalink / raw)
  To: Shirley Ma
  Cc: Michael S. Tsirkin, Ben Hutchings, David Miller, Eric Dumazet,
	Avi Kivity, Arnd Bergmann, netdev, kvm, linux-kernel
In-Reply-To: <1305671318.10756.49.camel@localhost.localdomain>

W dniu 18 maja 2011 00:28 użytkownik Shirley Ma <mashirle@us.ibm.com> napisał:
> On Tue, 2011-05-17 at 23:48 +0200, Michał Mirosław wrote:
>> 2011/5/17 Shirley Ma <mashirle@us.ibm.com>:
>> > Looks like to use a new flag requires more time/work. I am thinking
>> > whether we can just use HIGHDMA flag to enable zero-copy in macvtap
>> to
>> > avoid the new flag for now since mavctap uses real NICs as lower
>> device?
>>
>> Is there any other restriction besides requiring driver to not recycle
>> the skb? Are there any drivers that recycle TX skbs?
> Not more other restrictions, skb clone is OK. pskb_expand_head() looks
> OK to me from code review.

> Currently there is no drivers recycle TX skbs.

So why do you require the target device to have some flags at all?

Do I understand correctly, that this zero-copy feature is about
packets received from VMs?

Best Regards,
Michał Mirosław

^ permalink raw reply

* Re: small RPS cache for fragments?
From: Rick Jones @ 2011-05-17 22:42 UTC (permalink / raw)
  To: David Miller; +Cc: andi, netdev
In-Reply-To: <20110517.175044.2057517197524794568.davem@davemloft.net>

On Tue, 2011-05-17 at 17:50 -0400, David Miller wrote:
> From: Andi Kleen <andi@firstfloor.org>
> Date: Tue, 17 May 2011 14:48:28 -0700
> 
> > David Miller <davem@davemloft.net> writes:
> > 
> >> Guys we can't time out fragments if we are not the final
> >> destination.
> > 
> > If you're not the final destination you should never even
> > try to reassemble them?
> > 
> > I'm probably missing something...
> 
> We're discussing the idea to do the defragmentation first
> so we can choose the flow properly and steer the packet
> to the correct cpu.
> 
> This also would allos each fragmented packet to traverse the
> stack only once (one route lookup etc.) instead of once per
> fragment.
> 
> Please read the rest of this thread, we have discussed this
> and now I'm repeating information solely for your benefit.

Well, I should probably be beaten with that stick too because I wasn't
thinking about forwarding, only being the destination system when I
broached the suggestion of doing RFS after reassembly.  I can see where
one *might* be able to do limited RPS when forwarding, but I didn't know
that RFS had been extended to forwarding.

Now though I see why you were rightfully concerned about timeouts -
given all the concerns about added latency from bufferbloat, I wouldn't
think that an additional 10 or perhaps even 1ms timeout on a reassembly
attempt to get the layer four header when forwarding would sit well with
folks - they will expect the fragments to flow through without
additional delay.

rick jones

^ permalink raw reply

* Re: [PATCH net-next-2.6] net: add rcu protection to netdev->ifalias
From: Eric Dumazet @ 2011-05-17 22:39 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, kaber, netdev, remi.denis-courmont
In-Reply-To: <20110517152503.2f7244f3@nehalam>

Le mardi 17 mai 2011 à 15:25 -0700, Stephen Hemminger a écrit :

> I am beginning to think the idea of optimizing case of "ip link show"
> running in background with adding/removing network devices is silly. 
> And switching to RCU leads to all sorts of corner cases where only
> partial data will be read. Is it really worth it?
> 

Just try "modprobe dummy numdummies=1000", and watch load average
raising, because so many processes want to hold RTNL, and the
rtnl_trylock() crazy thing make them spin (restart syscall, and find
RTNL locked again... wooo...)

Some workloads hit RTNL quite hard, and monitoring tools were sometime
frozen for unexpected long times. (before the patch, when we used to
hold RTNL in dump())

I dont know, if it happens to be too hard, we'll just stick again rtnl
for "ip link show" ;)

^ permalink raw reply

* Re: [PATCH V5 2/6 net-next] netdevice.h: Add zero-copy flag in netdevice
From: Shirley Ma @ 2011-05-17 22:28 UTC (permalink / raw)
  To: Michał Mirosław
  Cc: Michael S. Tsirkin, Ben Hutchings, David Miller, Eric Dumazet,
	Avi Kivity, Arnd Bergmann, netdev, kvm, linux-kernel
In-Reply-To: <BANLkTimvOK3tyo6+L5AD=so5DjiDPV-AAA@mail.gmail.com>

On Tue, 2011-05-17 at 23:48 +0200, Michał Mirosław wrote:
> 2011/5/17 Shirley Ma <mashirle@us.ibm.com>:
> > Hello Michael,
> >
> > Looks like to use a new flag requires more time/work. I am thinking
> > whether we can just use HIGHDMA flag to enable zero-copy in macvtap
> to
> > avoid the new flag for now since mavctap uses real NICs as lower
> device?
> 
> Is there any other restriction besides requiring driver to not recycle
> the skb? Are there any drivers that recycle TX skbs? 

Not more other restrictions, skb clone is OK. pskb_expand_head() looks
OK to me from code review.

Currently there is no drivers recycle TX skbs.

Thanks
Shirley


^ permalink raw reply

* Re: [PATCH net-next-2.6] net: add rcu protection to netdev->ifalias
From: Stephen Hemminger @ 2011-05-17 22:25 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, kaber, netdev, remi.denis-courmont
In-Reply-To: <1305670692.6741.14.camel@edumazet-laptop>

On Wed, 18 May 2011 00:18:12 +0200
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> Le lundi 02 mai 2011 à 15:27 -0700, David Miller a écrit :
> > From: Stephen Hemminger <shemminger@vyatta.com>
> > Date: Thu, 28 Apr 2011 08:43:37 -0700
> > 
> > > On Thu, 28 Apr 2011 10:56:07 +0200
> > > Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > > 
> > >> Four years ago, Patrick made a change to hold rtnl mutex during netlink
> > >> dump callbacks.
> > >> 
> > >> I believe it was a wrong move. This slows down concurrent dumps, making
> > >> good old /proc/net/ files faster than rtnetlink in some situations.
> > >> 
> > >> This occurred to me because one "ip link show dev ..." was _very_ slow
> > >> on a workload adding/removing network devices in background.
> > >> 
> > >> All dump callbacks are able to use RCU locking now, so this patch does
> > >> roughly a revert of commits :
> > >> 
> > >> 1c2d670f366 : [RTNETLINK]: Hold rtnl_mutex during netlink dump callbacks
> > >> 6313c1e0992 : [RTNETLINK]: Remove unnecessary locking in dump callbacks
> > >> 
> > >> This let writers fight for rtnl mutex and readers going full speed.
> > >> 
> > >> It also takes care of phonet : phonet_route_get() is now called from rcu
> > >> read section. I renamed it to phonet_route_get_rcu()
> > >> 
> > >> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> > >> Cc: Patrick McHardy <kaber@trash.net>
> > >> Cc: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
> > > 
> > > Acked-by: Stephen Hemminger <shemminger@vyatta.com>
> > 
> > Applied, thanks everyone!
> 
> I think we probably have to make some fixes, because rtnl_fill_ifinfo()
> can access fields that were previously protected with RTNL
> 
> Let's see if it's doable, before considering re-adding rtnl_lock() in
> rtnl_dump_ifinfo() ?
> 
> First step : dev->ifalias
> 
> Its late here, I'll continue the work tomorrow.
> 
> Thanks
> 
> [PATCH net-next-2.6] net: add rcu protection to netdev->ifalias
> 
> Avoid holding RTNL in show_ifalias() and rtnl_fill_ifinfo() to
> manipulate netdev->ifalias
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

I am beginning to think the idea of optimizing case of "ip link show"
running in background with adding/removing network devices is silly. 
And switching to RCU leads to all sorts of corner cases where only
partial data will be read. Is it really worth it?


^ permalink raw reply

* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
From: Shirley Ma @ 2011-05-17 22:21 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel
In-Reply-To: <20110517212827.GC7589@redhat.com>

On Wed, 2011-05-18 at 00:28 +0300, Michael S. Tsirkin wrote:
> On Tue, May 17, 2011 at 01:50:19PM -0700, Shirley Ma wrote:
> > Resubmit the patch with most update. This patch passed some
> > live-migration test against RHEL6.2. I will run more stress test w/i
> > live migration.
> > 
> > Signed-off-by: Shirley Ma <xma@us.ibm.com>
> 
> Cool. cleanup path needs a fix - are you use you can
> not use kobj or some other existing refcounting?
> 
> Is perf regressiion caused by tx ring overrun gone now?
> 
> I added some comments about how we might be aqble
> to complete requests out of order but it's not a must.

Oops, I used the old version vhost patch instead of most update one.

I will integrate your comments below and submit V6 patch instead.

thanks
Shirley

^ permalink raw reply

* [PATCH] e100: Fix race condition in e100_down() while testing
From: prasanna.panchamukhi @ 2011-05-17 22:20 UTC (permalink / raw)
  To: bruce.w.allan, jeffrey.t.kirsher, jesse.brandeburg
  Cc: e1000-devel, netdev, prasanna.panchamukhi

There is a race condition between e100_down() and e100_diag_test().
During device testing e100_diag_test() ends up calling e100_up()/e100_down()
even while the driver is already in e100_down().
This patch fixes the above race condition by changing
e100_up()/e100_down() to dev_open() and dev_close().
Also fixes the race between e100_open() and e100_diag_test().

Signed-off-by: Prasanna S. Panchamukh <prasanna.panchamukhi@riverbed.com>
---
 drivers/net/e100.c |   21 +++++++++++++++++----
 1 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/drivers/net/e100.c b/drivers/net/e100.c
index b0aa9e6..abbf229 100644
--- a/drivers/net/e100.c
+++ b/drivers/net/e100.c
@@ -425,6 +425,10 @@ enum cb_command {
 	cb_el     = 0x8000,
 };
 
+enum e100_state_t {
+	__E100_TESTING
+};
+
 struct rfd {
 	__le16 status;
 	__le16 command;
@@ -623,6 +627,7 @@ struct nic {
 	__le16 eeprom[256];
 	spinlock_t mdio_lock;
 	const struct firmware *fw;
+	unsigned long state;
 };
 
 static inline void e100_write_flush(struct nic *nic)
@@ -2570,8 +2575,10 @@ static void e100_diag_test(struct net_device *netdev,
 {
 	struct ethtool_cmd cmd;
 	struct nic *nic = netdev_priv(netdev);
+	int if_running = netif_running(netdev);
 	int i, err;
 
+	set_bit(__E100_TESTING, &nic->state);
 	memset(data, 0, E100_TEST_LEN * sizeof(u64));
 	data[0] = !mii_link_ok(&nic->mii);
 	data[1] = e100_eeprom_load(nic);
@@ -2580,8 +2587,9 @@ static void e100_diag_test(struct net_device *netdev,
 		/* save speed, duplex & autoneg settings */
 		err = mii_ethtool_gset(&nic->mii, &cmd);
 
-		if (netif_running(netdev))
-			e100_down(nic);
+		if (if_running)
+			/* indicate we're in test mode */
+			dev_open(netdev);
 		data[2] = e100_self_test(nic);
 		data[3] = e100_loopback_test(nic, lb_mac);
 		data[4] = e100_loopback_test(nic, lb_phy);
@@ -2589,12 +2597,13 @@ static void e100_diag_test(struct net_device *netdev,
 		/* restore speed, duplex & autoneg settings */
 		err = mii_ethtool_sset(&nic->mii, &cmd);
 
-		if (netif_running(netdev))
-			e100_up(nic);
+		if (if_running)
+			dev_open(netdev);
 	}
 	for (i = 0; i < E100_TEST_LEN; i++)
 		test->flags |= data[i] ? ETH_TEST_FL_FAILED : 0;
 
+	clear_bit(__E100_TESTING, &nic->state);
 	msleep_interruptible(4 * 1000);
 }
 
@@ -2724,6 +2733,10 @@ static int e100_open(struct net_device *netdev)
 	struct nic *nic = netdev_priv(netdev);
 	int err = 0;
 
+	/* disallow open during testing */
+	if (test_bit(__E100_TESTING, &nic->state))
+		return -EBUSY;
+
 	netif_carrier_off(netdev);
 	if ((err = e100_up(nic)))
 		netif_err(nic, ifup, nic->netdev, "Cannot open interface, aborting\n");
-- 
1.7.0.4


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox