Netdev List
 help / color / mirror / Atom feed
* [PATCH 2/4] ipv6: Use state_lock to protect ifa state
From: Herbert Xu @ 2010-05-18 11:04 UTC (permalink / raw)
  To: David Miller, jbohac, yoshfuji, netdev, shemminger
In-Reply-To: <20100518110243.GA7750@gondor.apana.org.au>

ipv6: Use state_lock to protect ifa state

This patch makes use of the new state_lock to synchronise between
updates to the ifa state.  This fixes the issue where a remotely
triggered address deletion (through DAD failure) coincides with a
local administrative address deletion, causing certain actions to
be performed twice incorrectly.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 net/ipv6/addrconf.c |   23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 14b2ccf..d8d7c63 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -711,13 +711,20 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
 {
 	struct inet6_ifaddr *ifa, **ifap;
 	struct inet6_dev *idev = ifp->idev;
+	int state;
 	int hash;
 	int deleted = 0, onlink = 0;
 	unsigned long expires = jiffies;
 
 	hash = ipv6_addr_hash(&ifp->addr);
 
+	spin_lock_bh(&ifp->state_lock);
+	state = ifp->state;
 	ifp->state = INET6_IFADDR_STATE_DEAD;
+	spin_unlock_bh(&ifp->state_lock);
+
+	if (state == INET6_IFADDR_STATE_DEAD)
+		goto out;
 
 	write_lock_bh(&addrconf_hash_lock);
 	for (ifap = &inet6_addr_lst[hash]; (ifa=*ifap) != NULL;
@@ -831,6 +838,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
 		dst_release(&rt->u.dst);
 	}
 
+out:
 	in6_ifa_put(ifp);
 }
 
@@ -2621,6 +2629,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 	struct inet6_dev *idev;
 	struct inet6_ifaddr *ifa, *keep_list, **bifa;
 	struct net *net = dev_net(dev);
+	int state;
 	int i;
 
 	ASSERT_RTNL();
@@ -2680,7 +2689,6 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 	while ((ifa = idev->tempaddr_list) != NULL) {
 		idev->tempaddr_list = ifa->tmp_next;
 		ifa->tmp_next = NULL;
-		ifa->state = INET6_IFADDR_STATE_DEAD;
 		write_unlock_bh(&idev->lock);
 		spin_lock_bh(&ifa->lock);
 
@@ -2723,14 +2731,25 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 
 			/* Flag it for later restoration when link comes up */
 			ifa->flags |= IFA_F_TENTATIVE;
+
+			write_unlock_bh(&idev->lock);
+
 			in6_ifa_hold(ifa);
 		} else {
+			write_unlock_bh(&idev->lock);
+			spin_lock_bh(&ifa->state_lock);
+			state = ifa->state;
 			ifa->state = INET6_IFADDR_STATE_DEAD;
+			spin_unlock_bh(&ifa->state_lock);
+
+			if (state == INET6_IFADDR_STATE_DEAD)
+				goto put_ifa;
 		}
-		write_unlock_bh(&idev->lock);
 
 		__ipv6_ifa_notify(RTM_DELADDR, ifa);
 		atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifa);
+
+put_ifa:
 		in6_ifa_put(ifa);
 
 		write_lock_bh(&idev->lock);

^ permalink raw reply related

* [PATCH 1/4] ipv6: Replace inet6_ifaddr->dead with state
From: Herbert Xu @ 2010-05-18 11:04 UTC (permalink / raw)
  To: David Miller, jbohac, yoshfuji, netdev, shemminger
In-Reply-To: <20100518110243.GA7750@gondor.apana.org.au>

ipv6: Replace inet6_ifaddr->dead with state

This patch replaces the boolean dead flag on inet6_ifaddr with
a state enum.  This allows us to roll back changes when deleting
an address according to whether DAD has completed or not.

This patch only adds the state field and does not change the logic.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 include/net/if_inet6.h |   12 ++++++++++--
 net/ipv6/addrconf.c    |   11 ++++++-----
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index 545d8b0..421a1e8 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -30,6 +30,13 @@
 #define IF_PREFIX_ONLINK	0x01
 #define IF_PREFIX_AUTOCONF	0x02
 
+enum {
+	INET6_IFADDR_STATE_DAD,
+	INET6_IFADDR_STATE_POSTDAD,
+	INET6_IFADDR_STATE_UP,
+	INET6_IFADDR_STATE_DEAD,
+};
+
 #ifdef __KERNEL__
 
 struct inet6_ifaddr {
@@ -40,6 +47,9 @@ struct inet6_ifaddr {
 	__u32			prefered_lft;
 	atomic_t		refcnt;
 	spinlock_t		lock;
+	spinlock_t		state_lock;
+
+	int			state;
 
 	__u8			probes;
 	__u8			flags;
@@ -62,8 +72,6 @@ struct inet6_ifaddr {
 	struct inet6_ifaddr	*ifpub;
 	int			regen_count;
 #endif
-
-	int			dead;
 };
 
 struct ip6_sf_socklist {
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 413054f..14b2ccf 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -539,7 +539,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
 	if (del_timer(&ifp->timer))
 		printk("Timer is still running, when freeing ifa=%p\n", ifp);
 
-	if (!ifp->dead) {
+	if (ifp->state != INET6_IFADDR_STATE_DEAD) {
 		printk("Freeing alive inet6 address %p\n", ifp);
 		return;
 	}
@@ -642,6 +642,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
 	ipv6_addr_copy(&ifa->addr, addr);
 
 	spin_lock_init(&ifa->lock);
+	spin_lock_init(&ifa->state_lock);
 	init_timer(&ifa->timer);
 	ifa->timer.data = (unsigned long) ifa;
 	ifa->scope = scope;
@@ -716,7 +717,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
 
 	hash = ipv6_addr_hash(&ifp->addr);
 
-	ifp->dead = 1;
+	ifp->state = INET6_IFADDR_STATE_DEAD;
 
 	write_lock_bh(&addrconf_hash_lock);
 	for (ifap = &inet6_addr_lst[hash]; (ifa=*ifap) != NULL;
@@ -2679,7 +2680,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 	while ((ifa = idev->tempaddr_list) != NULL) {
 		idev->tempaddr_list = ifa->tmp_next;
 		ifa->tmp_next = NULL;
-		ifa->dead = 1;
+		ifa->state = INET6_IFADDR_STATE_DEAD;
 		write_unlock_bh(&idev->lock);
 		spin_lock_bh(&ifa->lock);
 
@@ -2724,7 +2725,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 			ifa->flags |= IFA_F_TENTATIVE;
 			in6_ifa_hold(ifa);
 		} else {
-			ifa->dead = 1;
+			ifa->state = INET6_IFADDR_STATE_DEAD;
 		}
 		write_unlock_bh(&idev->lock);
 
@@ -2827,7 +2828,7 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags)
 	net_srandom(ifp->addr.s6_addr32[3]);
 
 	read_lock_bh(&idev->lock);
-	if (ifp->dead)
+	if (ifp->state == INET6_IFADDR_STATE_DEAD)
 		goto out;
 
 	spin_lock(&ifp->lock);

^ permalink raw reply related

* [PATCH 4/4] ipv6: Never schedule DAD timer on dead address
From: Herbert Xu @ 2010-05-18 11:04 UTC (permalink / raw)
  To: David Miller, jbohac, yoshfuji, netdev, shemminger
In-Reply-To: <20100518110243.GA7750@gondor.apana.org.au>

ipv6: Never schedule DAD timer on dead address

This patch ensures that all places that schedule the DAD timer
look at the address state in a safe manner before scheduling the
timer.  This ensures that we don't end up with pending timers
after deleting an address.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 net/ipv6/addrconf.c |   10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index e7e9643..dd72f9c 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2865,10 +2865,10 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags)
 	net_srandom(ifp->addr.s6_addr32[3]);
 
 	read_lock_bh(&idev->lock);
+	spin_lock(&ifp->lock);
 	if (ifp->state == INET6_IFADDR_STATE_DEAD)
 		goto out;
 
-	spin_lock(&ifp->lock);
 	if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
 	    idev->cnf.accept_dad < 1 ||
 	    !(ifp->flags&IFA_F_TENTATIVE) ||
@@ -2902,8 +2902,8 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags)
 		ip6_ins_rt(ifp->rt);
 
 	addrconf_dad_kick(ifp);
-	spin_unlock(&ifp->lock);
 out:
+	spin_unlock(&ifp->lock);
 	read_unlock_bh(&idev->lock);
 }
 
@@ -2923,6 +2923,12 @@ static void addrconf_dad_timer(unsigned long data)
 	}
 
 	spin_lock(&ifp->lock);
+	if (ifp->state == INET6_IFADDR_STATE_DEAD) {
+		spin_unlock(&ifp->lock);
+		read_unlock(&idev->lock);
+		goto out;
+	}
+
 	if (ifp->probes == 0) {
 		/*
 		 * DAD was successful

^ permalink raw reply related

* [PATCH 3/4] ipv6: Use POSTDAD state
From: Herbert Xu @ 2010-05-18 11:04 UTC (permalink / raw)
  To: David Miller, jbohac, yoshfuji, netdev, shemminger
In-Reply-To: <20100518110243.GA7750@gondor.apana.org.au>

ipv6: Use POSTDAD state

This patch makes use of the new POSTDAD state.  This prevents
a race between DAD completion and failure.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 net/ipv6/addrconf.c |   29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index d8d7c63..e7e9643 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1412,10 +1412,27 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed)
 		ipv6_del_addr(ifp);
 }
 
+static int addrconf_dad_end(struct inet6_ifaddr *ifp)
+{
+	int err = -ENOENT;
+
+	spin_lock(&ifp->state_lock);
+	if (ifp->state == INET6_IFADDR_STATE_DAD) {
+		ifp->state = INET6_IFADDR_STATE_POSTDAD;
+		err = 0;
+	}
+	spin_unlock(&ifp->state_lock);
+
+	return err;
+}
+
 void addrconf_dad_failure(struct inet6_ifaddr *ifp)
 {
 	struct inet6_dev *idev = ifp->idev;
 
+	if (addrconf_dad_end(ifp))
+		return;
+
 	if (net_ratelimit())
 		printk(KERN_INFO "%s: IPv6 duplicate address %pI6c detected!\n",
 			ifp->idev->dev->name, &ifp->addr);
@@ -2731,6 +2748,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 
 			/* Flag it for later restoration when link comes up */
 			ifa->flags |= IFA_F_TENTATIVE;
+			ifa->state = INET6_IFADDR_STATE_DAD;
 
 			write_unlock_bh(&idev->lock);
 
@@ -2895,6 +2913,9 @@ static void addrconf_dad_timer(unsigned long data)
 	struct inet6_dev *idev = ifp->idev;
 	struct in6_addr mcaddr;
 
+	if (!ifp->probes && addrconf_dad_end(ifp))
+		goto out;
+
 	read_lock(&idev->lock);
 	if (idev->dead || !(idev->if_flags & IF_READY)) {
 		read_unlock(&idev->lock);
@@ -2967,12 +2988,10 @@ static void addrconf_dad_run(struct inet6_dev *idev) {
 	read_lock_bh(&idev->lock);
 	for (ifp = idev->addr_list; ifp; ifp = ifp->if_next) {
 		spin_lock(&ifp->lock);
-		if (!(ifp->flags & IFA_F_TENTATIVE)) {
-			spin_unlock(&ifp->lock);
-			continue;
-		}
+		if (ifp->flags & IFA_F_TENTATIVE &&
+		    ifp->state == INET6_IFADDR_STATE_DAD)
+			addrconf_dad_kick(ifp);
 		spin_unlock(&ifp->lock);
-		addrconf_dad_kick(ifp);
 	}
 	read_unlock_bh(&idev->lock);
 }

^ permalink raw reply related

* RE: get beyond 1Gbps with pktgen on 10Gb nic?
From: Jon Zhou @ 2010-05-18 11:14 UTC (permalink / raw)
  To: Rick Jones; +Cc: netdev@vger.kernel.org
In-Reply-To: <4BEAF44E.8090201@hp.com>

hi rick:

do you mean "TCP_NODELAY" will send with packet size as I expect

without this option,netperf might sent packet with large size? (but eventually it will be splitted into MTU size?)

thanks
jon


-----Original Message-----
From: Rick Jones [mailto:rick.jones2@hp.com] 
Sent: Thursday, May 13, 2010 2:33 AM
To: Jesse Brandeburg
Cc: Jon Zhou; Ben Greear; Ben Hutchings; netdev@vger.kernel.org
Subject: Re: get beyond 1Gbps with pktgen on 10Gb nic?

Jesse Brandeburg wrote:
> On Tue, May 11, 2010 at 9:00 PM, Jon Zhou <Jon.Zhou@jdsu.com> wrote:
> 
>> I just used multi netperf instances to reach 900K pps/8Gb+ traffic on the
>> Broadcom 10G nic:

>>
>>command:
>>
>>for i in 1 2 3 4 5 6 7 8 9 10
>>do
>>netperf -l 60 -H 192.168.0.53 -- -m 60 -s 100M -S 100M &
>>done

100 Megabytes seems a trifle excessive as a socket buffer size.  I would suggest 
  lopping-off a few zeros and use 1M instead.  Or, one can let linux auto-tune 
the socket buffers/windows - just don't accept the socket buffer size reported 
by the classic netperf command - it is from the initial creation of the socket. 
  To get what it became by the end of the test one should use the "omni" tests. 
  Contact me offlist or via netperf-talk in the netperf.org domain for more on that.

>> the msg size was assigned as 64 bytes, but when I checked the file captured
>> by tcpdump, found that netperf sent many frames which are large than 64
>> bytes(i.e.4000-10K+ bytes) and these frames were truncated by tcpdump.
>>
>> so that the actual avg packet size is around 1500 bytes, but what I want is
>> avg packet: 300-400 bytes and reach 5Gb+.
>>
>>does it make sense?
> 
> 
> if you set the TCP_NODELAY (to disable nagle) option on netperf 

If he was seeing 4K to 10K byte frames in his tcpdump, that was likely TSO above 
and beyond nagle.  I was going to say it also suggests he was running tcpdump on 
the sending side rather than the receiver, but then there is LRO/GRO isn't there...

> (check netperf -t TCP_STREAM -- -h) then you should be able to control packet
>  size.

Or at least influence it meaningfully :)  If he was seeing 4K to 10K byte frames 
in his tcpdump, that was likely TSO above and beyond nagle.If there are packet 
losses and retransmissions, the retransmissions, which may or may not include 
new data may be larger.

The "netperf -t TCP_STREAM -- -h" to get test-specific help shown by Jesse will 
show the option you need to add to set TCP_NODELAY.  For additional descriptions 
of netperf command options:

http://www.netperf.org/svn/netperf2/tags/netperf-2.4.5/doc/netperf.html

For "quick and dirty" testing, the loop as it appears above is "ok" but I would 
suggest abusing the confidence intervals code to minimize the skew error:

http://www.netperf.org/svn/netperf2/tags/netperf-2.4.5/doc/netperf.html#Using-Netperf-to-Measure-Aggregate-Performance

happy benchmarking,

rick jones

> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply

* Re: [PATCHv2] virtio: put last seen used index into ring itself
From: Juan Quintela @ 2010-05-18 11:26 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Rusty Russell, Jiri Pirko, Shirley Ma, Amit Shah, Mark McLoughlin,
	netdev, linux-kernel, alex.williamson
In-Reply-To: <20100518021315.GA22852@redhat.com>

"Michael S. Tsirkin" <mst@redhat.com> wrote:
> Generally, the Host end of the virtio ring doesn't need to see where
> Guest is up to in consuming the ring.  However, to completely understand
> what's going on from the outside, this information must be exposed.
> For example, host can reduce the number of interrupts by detecting
> that the guest is currently handling previous buffers.
>
> Fortunately, we have room to expand: the ring is always a whole number
> of pages and there's hundreds of bytes of padding after the avail ring
> and the used ring, whatever the number of descriptors (which must be a
> power of 2).
>
> We add a feature bit so the guest can tell the host that it's writing
> out the current value there, if it wants to use that.
>
> This is based on a patch by Rusty Russell, with the main difference
> being that we dedicate a feature bit to guest to tell the host it is
> writing the used index.  This way we don't need to force host to publish
> the last available index until we have a use for it.
>
> Another difference is that while the feature helps virtio-net,
> there have been conflicting reports wrt virtio-blk.
> The reason is unknown, it could be due to the fact that
> virtio-blk does not bother to disable interrupts at all.
> So for now, this patch only acks this feature for -net.
>
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

It looks good.

Later, Juan.

^ permalink raw reply

* Re: [PATCHv2] vhost-net: utilize PUBLISH_USED_IDX feature
From: Juan Quintela @ 2010-05-18 11:31 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: davem, Rusty Russell, Paul E. McKenney, Arnd Bergmann, kvm,
	virtualization, netdev, linux-kernel, alex.williamson, amit.shah
In-Reply-To: <20100518022105.GA23129@redhat.com>

"Michael S. Tsirkin" <mst@redhat.com> wrote:
> With PUBLISH_USED_IDX, guest tells us which used entries
> it has consumed. This can be used to reduce the number
> of interrupts: after we write a used entry, if the guest has not yet
> consumed the previous entry, or if the guest has already consumed the
> new entry, we do not need to interrupt.
> This imporves bandwidth by 30% under some workflows.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>
> This is on top of Rusty's tree and depends on the virtio patch.
>
> Changes from v1:
> fix build

Minor nit if you have to respin it:

>  /* This actually signals the guest, using eventfd. */
> -void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
> +static void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
>  {
>  	__u16 flags;
> +	__u16 used;

local var named "used" here

>  	/* Flush out used index updates. This is paired
>  	 * with the barrier that the Guest executes when enabling
>  	 * interrupts. */
> @@ -1053,6 +1057,17 @@ void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
>  	     !vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY)))
>  		return;
>  
> +	if (vhost_has_feature(dev, VIRTIO_RING_F_PUBLISH_USED)) {
> +		__u16 used;

and a "more" local one also named "used" :)

I guess you want to remove the previous declaration, as var is only used
in this block.

> +		if (get_user(used, vq->last_used)) {
> +			vq_err(vq, "Failed to get last used idx");
> +			return;
> +		}
> +
> +		if (used != (u16)(vq->last_used_idx - 1))
> +			return;
> +	}
> +
>  	/* Signal the Guest tell them we used something up. */
>  	if (vq->call_ctx)
>  		eventfd_signal(vq->call_ctx, 1);

Rest of patch looks ok, and as a bonus un-export two functions.

Later, Juan.

^ permalink raw reply

* Re: [PATCH] bfin_mac: fix memleak in mii_bus{probe|remove}
From: Denis Kirjanov @ 2010-05-18 11:34 UTC (permalink / raw)
  To: Sonic Zhang
  Cc: davem, michael.hennerich, cooloney, uclinux-dist-devel, netdev
In-Reply-To: <AANLkTikcB3f0KmkcnDsV46Xrpm3WcCLQklxbLSJfMTlW@mail.gmail.com>

On Tue, May 18, 2010 at 18:05 +0800, Sonic Zhang wrote:
> 2010/5/18 Denis Kirjanov <kirjanov@gmail.com <kirjanov@gmail.com>:
> > Fix memory leak with miibus->irq
> >
> > Signed-off-by: Denis Kirjanov <dkirjanov@kernel.org>
> > ---
> >
> > drivers/net/bfin_mac.c |    2 ++
> >  1 files changed, 2 insertions(+), 0 deletions(-)
> >
> > diff --git a/drivers/net/bfin_mac.c b/drivers/net/bfin_mac.c
> > index 39a54ba..7a17b8a 100644
> > --- a/drivers/net/bfin_mac.c
> > +++ b/drivers/net/bfin_mac.c
> > @@ -1627,6 +1627,7 @@ static int __devinit bfin_mii_bus_probe(struct platform_device *pdev)
> >
> >  out_err_mdiobus_register:
> >        mdiobus_free(miibus);
> > +       kfree(miibus->irq);
> 
> Should you move this kfree before mdiobus_free?
> 
> >  out_err_alloc:
> >        peripheral_free_list(pin_req);
> >
> > @@ -1638,6 +1639,7 @@ static int __devexit bfin_mii_bus_remove(struct platform_device *pdev)
> >        struct mii_bus *miibus = platform_get_drvdata(pdev);
> >        platform_set_drvdata(pdev, NULL);
> >        mdiobus_unregister(miibus);
> > +       kfree(miibus->irq);
> >        mdiobus_free(miibus);
> >        peripheral_free_list(pin_req);
> >        return 0;
> > --
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >
Oops, yeah, fixed.
Thanks.

[PATCH] bfin_mac: fix memleak in mii_bus_{probe|remove}
Fix memory leak with miibus->irq

Signed-off-by: Denis Kirjanov <dkirjanov@kernel.org>
---
drivers/net/bfin_mac.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/net/bfin_mac.c b/drivers/net/bfin_mac.c
index 39a54ba..368f333 100644
--- a/drivers/net/bfin_mac.c
+++ b/drivers/net/bfin_mac.c
@@ -1626,6 +1626,7 @@ static int __devinit bfin_mii_bus_probe(struct platform_device *pdev)
 	return 0;
 
 out_err_mdiobus_register:
+	kfree(miibus->irq);
 	mdiobus_free(miibus);
 out_err_alloc:
 	peripheral_free_list(pin_req);
@@ -1638,6 +1639,7 @@ static int __devexit bfin_mii_bus_remove(struct platform_device *pdev)
 	struct mii_bus *miibus = platform_get_drvdata(pdev);
 	platform_set_drvdata(pdev, NULL);
 	mdiobus_unregister(miibus);
+	kfree(miibus->irq);
 	mdiobus_free(miibus);
 	peripheral_free_list(pin_req);
 	return 0;


^ permalink raw reply related

* [PATCH net-next-2.6] bonding: remove redundant checks from bonding_store_slaves
From: Jiri Pirko @ 2010-05-18 12:09 UTC (permalink / raw)
  To: netdev; +Cc: davem, fubar, bonding-devel

Remove checks that duplicates similar checks in bond_enslave.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>

diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 4e84cfc..6c44c07 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -243,7 +243,7 @@ static ssize_t bonding_store_slaves(struct device *d,
 
 	if (command[0] == '+') {
 
-		/* Got a slave name in ifname.  Is it already in the list? */
+		/* Got a slave name in ifname. */
 
 		dev = __dev_get_by_name(dev_net(bond->dev), ifname);
 		if (!dev) {
@@ -253,24 +253,6 @@ static ssize_t bonding_store_slaves(struct device *d,
 			goto out;
 		}
 
-		if (dev->flags & IFF_UP) {
-			pr_err("%s: Error: Unable to enslave %s because it is already up.\n",
-			       bond->dev->name, dev->name);
-			ret = -EPERM;
-			goto out;
-		}
-
-		read_lock(&bond->lock);
-		bond_for_each_slave(bond, slave, i)
-			if (slave->dev == dev) {
-				pr_err("%s: Interface %s is already enslaved!\n",
-				       bond->dev->name, ifname);
-				ret = -EPERM;
-				read_unlock(&bond->lock);
-				goto out;
-			}
-		read_unlock(&bond->lock);
-
 		pr_info("%s: Adding slave %s.\n", bond->dev->name, ifname);
 
 		/* If this is the first slave, then we need to set

^ permalink raw reply related

* Re: [PATCH net-next-2.6] bonding: move slave MTU handling from sysfs
From: Jiri Pirko @ 2010-05-18 12:21 UTC (permalink / raw)
  To: netdev; +Cc: davem, fubar, bonding-devel, monis
In-Reply-To: <20100517144731.GE2878@psychotron.lab.eng.brq.redhat.com>

Actually, please scratch this, the patch is not correct (res/ret). Will post the
correct one soon.

Jirka

Mon, May 17, 2010 at 04:47:31PM CEST, jpirko@redhat.com wrote:
>For some reason, MTU handling (storing, and restoring) is taking  place in
>bond_sysfs. The correct place for this code is in bond_enslave, bond_release.
>So move it there.
>
>Jirka
>
>Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>
>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index 5e12462..2c3f9db 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -1533,6 +1533,14 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
> 	 */
> 	new_slave->original_flags = slave_dev->flags;
> 
>+	/* Save slave's original mtu and then set it to match the bond */
>+	new_slave->original_mtu = slave_dev->mtu;
>+	res = dev_set_mtu(slave_dev, bond->dev->mtu);
>+	if (res) {
>+		pr_debug("Error %d calling dev_set_mtu\n", res);
>+		goto err_free;
>+	}
>+
> 	/*
> 	 * Save slave's original ("permanent") mac address for modes
> 	 * that need it, and for restoring it upon release, and then
>@@ -1550,7 +1558,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
> 		res = dev_set_mac_address(slave_dev, &addr);
> 		if (res) {
> 			pr_debug("Error %d calling set_mac_address\n", res);
>-			goto err_free;
>+			goto err_restore_mtu;
> 		}
> 	}
> 
>@@ -1785,6 +1793,9 @@ err_restore_mac:
> 		dev_set_mac_address(slave_dev, &addr);
> 	}
> 
>+err_restore_mtu:
>+	dev_set_mtu(slave_dev, new_slave->original_mtu);
>+
> err_free:
> 	kfree(new_slave);
> 
>@@ -1969,6 +1980,8 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
> 		dev_set_mac_address(slave_dev, &addr);
> 	}
> 
>+	dev_set_mtu(slave_dev, slave->original_mtu);
>+
> 	slave_dev->priv_flags &= ~(IFF_MASTER_8023AD | IFF_MASTER_ALB |
> 				   IFF_SLAVE_INACTIVE | IFF_BONDING |
> 				   IFF_SLAVE_NEEDARP);
>diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
>index 392e291..4e84cfc 100644
>--- a/drivers/net/bonding/bond_sysfs.c
>+++ b/drivers/net/bonding/bond_sysfs.c
>@@ -220,7 +220,6 @@ static ssize_t bonding_store_slaves(struct device *d,
> 	char command[IFNAMSIZ + 1] = { 0, };
> 	char *ifname;
> 	int i, res, ret = count;
>-	u32 original_mtu;
> 	struct slave *slave;
> 	struct net_device *dev = NULL;
> 	struct bonding *bond = to_bond(d);
>@@ -281,43 +280,22 @@ static ssize_t bonding_store_slaves(struct device *d,
> 			memcpy(bond->dev->dev_addr, dev->dev_addr,
> 			       dev->addr_len);
> 
>-		/* Set the slave's MTU to match the bond */
>-		original_mtu = dev->mtu;
>-		res = dev_set_mtu(dev, bond->dev->mtu);
>-		if (res) {
>-			ret = res;
>-			goto out;
>-		}
>-
> 		res = bond_enslave(bond->dev, dev);
>-		bond_for_each_slave(bond, slave, i)
>-			if (strnicmp(slave->dev->name, ifname, IFNAMSIZ) == 0)
>-				slave->original_mtu = original_mtu;
>-		if (res)
>-			ret = res;
> 
> 		goto out;
> 	}
> 
> 	if (command[0] == '-') {
> 		dev = NULL;
>-		original_mtu = 0;
> 		bond_for_each_slave(bond, slave, i)
> 			if (strnicmp(slave->dev->name, ifname, IFNAMSIZ) == 0) {
> 				dev = slave->dev;
>-				original_mtu = slave->original_mtu;
> 				break;
> 			}
> 		if (dev) {
> 			pr_info("%s: Removing slave %s\n",
> 				bond->dev->name, dev->name);
>-				res = bond_release(bond->dev, dev);
>-			if (res) {
>-				ret = res;
>-				goto out;
>-			}
>-			/* set the slave MTU to the default */
>-			dev_set_mtu(dev, original_mtu);
>+			res = bond_release(bond->dev, dev);
> 		} else {
> 			pr_err("unable to remove non-existent slave %s for bond %s.\n",
> 			       ifname, bond->dev->name);

^ permalink raw reply

* Re: [PATCH net-next-2.6] bonding: remove redundant checks from bonding_store_slaves
From: Jiri Pirko @ 2010-05-18 12:23 UTC (permalink / raw)
  To: netdev; +Cc: davem, fubar, bonding-devel
In-Reply-To: <20100518120944.GA2878@psychotron.lab.eng.brq.redhat.com>

Please scratch this one too, will repost it after I post 2nd version of
"[PATCH net-next-2.6] bonding: move slave MTU handling from sysfs"

Thanks, Jirka

Tue, May 18, 2010 at 02:09:45PM CEST, jpirko@redhat.com wrote:
>Remove checks that duplicates similar checks in bond_enslave.
>
>Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>
>diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
>index 4e84cfc..6c44c07 100644
>--- a/drivers/net/bonding/bond_sysfs.c
>+++ b/drivers/net/bonding/bond_sysfs.c
>@@ -243,7 +243,7 @@ static ssize_t bonding_store_slaves(struct device *d,
> 
> 	if (command[0] == '+') {
> 
>-		/* Got a slave name in ifname.  Is it already in the list? */
>+		/* Got a slave name in ifname. */
> 
> 		dev = __dev_get_by_name(dev_net(bond->dev), ifname);
> 		if (!dev) {
>@@ -253,24 +253,6 @@ static ssize_t bonding_store_slaves(struct device *d,
> 			goto out;
> 		}
> 
>-		if (dev->flags & IFF_UP) {
>-			pr_err("%s: Error: Unable to enslave %s because it is already up.\n",
>-			       bond->dev->name, dev->name);
>-			ret = -EPERM;
>-			goto out;
>-		}
>-
>-		read_lock(&bond->lock);
>-		bond_for_each_slave(bond, slave, i)
>-			if (slave->dev == dev) {
>-				pr_err("%s: Interface %s is already enslaved!\n",
>-				       bond->dev->name, ifname);
>-				ret = -EPERM;
>-				read_unlock(&bond->lock);
>-				goto out;
>-			}
>-		read_unlock(&bond->lock);
>-
> 		pr_info("%s: Adding slave %s.\n", bond->dev->name, ifname);
> 
> 		/* If this is the first slave, then we need to set

^ permalink raw reply

* Re: dev_get_valid_name buggy with hash collision
From: Octavian Purdila @ 2010-05-18 12:29 UTC (permalink / raw)
  To: Daniel Lezcano; +Cc: Linux Netdev List
In-Reply-To: <4BF26926.4070507@free.fr>

On Tuesday 18 May 2010 13:17:10 you wrote:

> the commit:
> 
> commit d90310243fd750240755e217c5faa13e24f41536
> Author: Octavian Purdila <opurdila@ixiacom.com>
> Date:   Wed Nov 18 02:36:59 2009 +0000
> 
>      net: device name allocation cleanups
> 
> introduced a bug when there is a hash collision making impossible to
> rename a device with eth%d
<snip>

Hi Daniel,

Thanks for the detailed explanation ! 

> 
> IMO, the bug is to pass the 'dev->name' parameter to __dev_alloc_name
> directly instead of a temporary name.

I agree.

> 
> The patch in attachment fix the problem but I am not sure we shouldn't
> go further and do more cleanup around this bug, so you may consider it
> as a RFC more than a fix. If it is acceptable, I will send it as a patch
> against net-2.6.
> 

The patch looks good to me, just one doubt here:

>--- net-2.6.orig/net/core/dev.c
>+++ net-2.6/net/core/dev.c
>@@ -936,18 +936,22 @@ int dev_alloc_name(struct net_device *de
> }
> EXPORT_SYMBOL(dev_alloc_name);
> 
>-static int dev_get_valid_name(struct net *net, const char *name, char *buf,
>-                             bool fmt)
>+static int dev_get_valid_name(struct net_device *dev, const char *name, bool 
fmt)
> {
>+       struct net *net;
>+
>+       BUG_ON(!dev_net(dev));
>+       net = dev_net(dev);
>+
>        if (!dev_valid_name(name))
>                return -EINVAL;
> 
>        if (fmt && strchr(name, '%'))
>-               return __dev_alloc_name(net, name, buf);
>+               return dev_alloc_name(dev, name);
>        else if (__dev_get_by_name(net, name))
>                return -EEXIST;
>-       else if (buf != name)
>-               strlcpy(buf, name, IFNAMSIZ);
>+       else if (strncmp(dev->name, name, IFNAMSIZ))
>+                strlcpy(dev->name, name, IFNAMSIZ);
> 

Why do the strncmp, can't we preserve the (buf != name) condition?

Thanks,
tavi

^ permalink raw reply

* Re: [PATCH net-next-2.6 2/2] bonding: allow user-controlled output slave selection
From: Andy Gospodarek @ 2010-05-18 12:57 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: Neil Horman, netdev
In-Reply-To: <12958.1274145714@death.nxdomain.ibm.com>

On Mon, May 17, 2010 at 9:21 PM, Jay Vosburgh <fubar@us.ibm.com> wrote:
> Jay Vosburgh <fubar@us.ibm.com> wrote:
> [...]
>>       For your patch, I'm exploring the idea of not setting
>>IFF_SLAVE_INACTIVE on "inactive" slaves for an "all_slaves_active"
>>option (I think that's a more descriptive name than "keep_all") instead
>>of adding a new KEEP_ALL flag bit to priv_flags.  Did you consider this
>>methodology and exclude it for some reason?
>
>        Following up to myself, I coded up approximately what I was
> talking about.  This doesn't require the extra priv_flag, and the sysfs
> _store is a little more complicated, but this appears to work (testing
> with ping -f after clearing the switch's MAC table to induce traffic
> flooding).  I didn't change the option name from "keep_all" here, but as
> far as the functionality goes, this seems to do what I think you want it
> to.
>

I can test it here to be sure, but overall this seems like a fine
approach.  The IFF_SLAVE_INACTIVE doesn't seem to be used for much
other than duplicate suppression at this point, so it seems like a
logical choice.

As for the original name 'keep_all,' I tried to use something that
user would find easy to understand.  Obviously not all users think
alike. :-)


>        -J
>
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index 5e12462..c80d2ff 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -106,6 +106,7 @@ static int arp_interval = BOND_LINK_ARP_INTERV;
>  static char *arp_ip_target[BOND_MAX_ARP_TARGETS];
>  static char *arp_validate;
>  static char *fail_over_mac;
> +static int keep_all    = 0;
>  static struct bond_params bonding_defaults;
>
>  module_param(max_bonds, int, 0);
> @@ -155,6 +156,9 @@ module_param(arp_validate, charp, 0);
>  MODULE_PARM_DESC(arp_validate, "validate src/dst of ARP probes: none (default), active, backup or all");
>  module_param(fail_over_mac, charp, 0);
>  MODULE_PARM_DESC(fail_over_mac, "For active-backup, do not set all slaves to the same MAC.  none (default), active or follow");
> +module_param(keep_all, int, 0);
> +MODULE_PARM_DESC(keep_all, "Keep all frames received on an interface"
> +                          "0 for never (default), 1 for always.");
>
>  /*----------------------------- Global variables ----------------------------*/
>
> @@ -4756,6 +4760,13 @@ static int bond_check_params(struct bond_params *params)
>                }
>        }
>
> +       if ((keep_all != 0) && (keep_all != 1)) {
> +               pr_warning("Warning: keep_all module parameter (%d), "
> +                          "not of valid value (0/1), so it was set to "
> +                          "0\n", keep_all);
> +               keep_all = 0;
> +       }
> +
>        /* reset values for TLB/ALB */
>        if ((bond_mode == BOND_MODE_TLB) ||
>            (bond_mode == BOND_MODE_ALB)) {
> @@ -4926,6 +4937,7 @@ static int bond_check_params(struct bond_params *params)
>        params->primary[0] = 0;
>        params->primary_reselect = primary_reselect_value;
>        params->fail_over_mac = fail_over_mac_value;
> +       params->keep_all = keep_all;
>
>        if (primary) {
>                strncpy(params->primary, primary, IFNAMSIZ);
> diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
> index b8bec08..73b3c03 100644
> --- a/drivers/net/bonding/bond_sysfs.c
> +++ b/drivers/net/bonding/bond_sysfs.c
> @@ -339,7 +339,6 @@ out:
>
>  static DEVICE_ATTR(slaves, S_IRUGO | S_IWUSR, bonding_show_slaves,
>                   bonding_store_slaves);
> -
>  /*
>  * Show and set the bonding mode.  The bond interface must be down to
>  * change the mode.
> @@ -1472,6 +1471,59 @@ static ssize_t bonding_show_ad_partner_mac(struct device *d,
>  }
>  static DEVICE_ATTR(ad_partner_mac, S_IRUGO, bonding_show_ad_partner_mac, NULL);
>
> +/*
> + * Show and set the keep_all flag.
> + */
> +static ssize_t bonding_show_keep(struct device *d,
> +                                struct device_attribute *attr,
> +                                char *buf)
> +{
> +       struct bonding *bond = to_bond(d);
> +
> +       return sprintf(buf, "%d\n", bond->params.keep_all);
> +}
> +
> +static ssize_t bonding_store_keep(struct device *d,
> +                                 struct device_attribute *attr,
> +                                 const char *buf, size_t count)
> +{
> +       int i, new_value, ret = count;
> +       struct bonding *bond = to_bond(d);
> +       struct slave *slave;
> +
> +       if (sscanf(buf, "%d", &new_value) != 1) {
> +               pr_err("%s: no keep_all value specified.\n",
> +                      bond->dev->name);
> +               ret = -EINVAL;
> +               goto out;
> +       }
> +
> +       if (new_value == bond->params.keep_all)
> +               goto out;
> +
> +       if ((new_value == 0) || (new_value == 1)) {
> +               bond->params.keep_all = new_value;
> +       } else {
> +               pr_info("%s: Ignoring invalid keep_all value %d.\n",
> +                       bond->dev->name, new_value);
> +               ret = -EINVAL;
> +               goto out;
> +       }
> +
> +       bond_for_each_slave(bond, slave, i) {
> +               if (slave->state == BOND_STATE_BACKUP) {
> +                       if (new_value)
> +                               slave->dev->priv_flags &= ~IFF_SLAVE_INACTIVE;
> +                       else
> +                               slave->dev->priv_flags |= IFF_SLAVE_INACTIVE;
> +               }
> +       }
> +out:
> +       return count;
> +}
> +static DEVICE_ATTR(keep_all, S_IRUGO | S_IWUSR,
> +                  bonding_show_keep, bonding_store_keep);
> +
>
>
>  static struct attribute *per_bond_attrs[] = {
> @@ -1499,6 +1551,7 @@ static struct attribute *per_bond_attrs[] = {
>        &dev_attr_ad_actor_key.attr,
>        &dev_attr_ad_partner_key.attr,
>        &dev_attr_ad_partner_mac.attr,
> +       &dev_attr_keep_all.attr,
>        NULL,
>  };
>
> diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
> index 2aa3367..ef7969b 100644
> --- a/drivers/net/bonding/bonding.h
> +++ b/drivers/net/bonding/bonding.h
> @@ -131,6 +131,7 @@ struct bond_params {
>        char primary[IFNAMSIZ];
>        int primary_reselect;
>        __be32 arp_targets[BOND_MAX_ARP_TARGETS];
> +       int keep_all;
>  };
>
>  struct bond_parm_tbl {
> @@ -291,7 +292,8 @@ static inline void bond_set_slave_inactive_flags(struct slave *slave)
>        struct bonding *bond = netdev_priv(slave->dev->master);
>        if (!bond_is_lb(bond))
>                slave->state = BOND_STATE_BACKUP;
> -       slave->dev->priv_flags |= IFF_SLAVE_INACTIVE;
> +       if (!bond->params.keep_all)
> +               slave->dev->priv_flags |= IFF_SLAVE_INACTIVE;
>        if (slave_do_arp_validate(bond, slave))
>                slave->dev->priv_flags |= IFF_SLAVE_NEEDARP;
>  }
>
>
> ---
>        -Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com
>

^ permalink raw reply

* Re: loosing IPMI-card by loading netconsole
From: Henning Fehrmann @ 2010-05-18 13:12 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Ronciak, John, Kirsher, Jeffrey T, Brandeburg, Jesse,
	Allan, Bruce W, Waskiewicz Jr, Peter P, netdev@vger.kernel.org,
	Carsten Aulbert
In-Reply-To: <4BEDCD87.4090602@kernel.org>


Hello,

> >> Yeap, sure, it would be effective but I kind of want to leave
> >> bisection as the last resort.  Bisection is a somewhat painful process
> >> especially when the machine isn't right next to you and someone who
> >> has overall knowledge can often identify the problem much easier with
> >> appropriate debugging info.
> > 
> > Well nothing jumps to mind in the netpoll/netconsole code and I haven't
> > heard any similar reports. My guess is it's something obscure, so I
> > think the sooner you start bisecting... Even one or two tests will get
> > us a lot closer to figuring out what changed in the last 1.5 years.
> 
> I see.  I was hoping it would ring a bell to someone.  We'll probably
> try to provide the info Jesse asked and if that doesn't lead anywhere
> start bisecting.


Let me re-describe the symptoms.
I am not loading any ipmi related modules and not the netconsole
module.
When booting out current 2.6.32 kernel we can not access the IPMI
remotely.

We had one case where the IPMI card was accessible while using this
kernel but probably due to the fact that eth0 was removed. We do not
consider this case anymore. 

This problem does not occur when using an older kernel. 

It has likely nothing to do with netconsole.

Here is the bisecting result:

The sha1 sum of the first bad commit is: 
6e50912a442947d5fafd296ca6fdcbeb36b163ff

Hence, the last good commit has:
b2f8f7525c8aa1fdd8ad8c72c832dfb571d5f768

Cheers,
Henning

^ permalink raw reply

* [PATCH] net: avoid one atomic op per cloned skb
From: Eric Dumazet @ 2010-05-18 13:40 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Hi David

I know you said 'only patches', but I found following patch small
enough ?

I have a followup patch to avoid two atomic ops per cloned skb on
dataref (helps TCP tx path) but will submit it for 2.6.36, since its
diffstat is a bit more than 3++- :)

Thanks

[PATCH] net: avoid one atomic op per cloned skb

skb_clone() can use atomic_set(clone_ref, 2) safely, because only
current thread can possibly touch clone_ref at this point.
   
Add a WARN_ON_ONCE() for a while, to catch wrong assumptions.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/core/skbuff.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c543dd2..4444f15 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -628,7 +628,8 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 	    n->fclone == SKB_FCLONE_UNAVAILABLE) {
 		atomic_t *fclone_ref = (atomic_t *) (n + 1);
 		n->fclone = SKB_FCLONE_CLONE;
-		atomic_inc(fclone_ref);
+		WARN_ON_ONCE(atomic_read(fclone_ref) != 1);
+		atomic_set(fclone_ref, 2);
 	} else {
 		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
 		if (!n)




^ permalink raw reply related

* Re: [PATCH] virtio: put last seen used index into ring itself
From: Rusty Russell @ 2010-05-18  7:41 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jiri Pirko, Shirley Ma, Amit Shah, Mark McLoughlin, netdev,
	linux-kernel, quintela, alex.williamson
In-Reply-To: <20100518004744.GA21359@redhat.com>

On Tue, 18 May 2010 10:17:44 am Michael S. Tsirkin wrote:
> Generally, the Host end of the virtio ring doesn't need to see where
> Guest is up to in consuming the ring.  However, to completely understand
> what's going on from the outside, this information must be exposed.
> For example, host can reduce the number of interrupts by detecting
> that the guest is currently handling previous buffers.

Thanks applied.

Cheers,
Rusty.

^ permalink raw reply

* Re: bnx2/BCM5709: why 5 interrupts on a 4 core system (2.6.33.3)
From: Krzysztof Olędzki @ 2010-05-18 14:22 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Michael Chan, netdev@vger.kernel.org
In-Reply-To: <1274045180.2299.38.camel@edumazet-laptop>

On 2010-05-16 23:26, Eric Dumazet wrote:

<CUT>

>> My normal workload is TCP and UDP based so if it is only ICMP then there
>> is no problem. Actually I have noticeably more UDP traffic than an
>> average network, mainly because of LWAPP/CAPWAP, so I'm interested in
>> good performance for both TCP and UDP.
>>
>> During my initial tests ICMP ping showed the same behavior like UDP/TCP
>> with iperf, so I sticked with it. I'll redo everyting with UDP and TCP
>> of course. :)
>>
>>>> BTW: With a normal router workload, should I expect big performance drop
>>>> when receiving and forwarding the same packet using different CPUs?
>>>> Bonding provides very important functionality, I'm not able to drop it. :(
>>>>
>>>
>>> Not sure what you mean by forwarding same packet using different CPUs.
>>> You probably meant different queues, because in normal case, only one
>>> cpu is involved (the one receiving the packet is also the one
>>> transmitting it, unless you have congestion or trafic shaping)
>>
>> I mean to receive it on a one CPU and to send it on a different one. I
>> would like to assing different vectors (eth1-0 .. eth1-4) to different
>> CPUs, but with bnx2x+bonding packets are received on queues 1-4 (eth1-1
>> .. eth1-4) and sent from queue 0 (eth1-0). So, for a one packet, two
>> different CPUs will be involved (RX on q1-q4, TX on q0).
>
> As I said, (unless you use RPS), one forwarded packet only uses one CPU.
> How tx queue is selected is another story. We try to do a 1-1 mapping.

OK, but with multi-queue NIC, I can assign each queue to a different 
CPU. So, while forwarding packets from a flow, I would like to assign 
the same queue on both input and output.

>>> If you have 4 cpus, you can use following patch and have a transparent
>>> bonding against multiqueue.
>>
>> Thanks! If I get it right: with the patch, packets should be sent using
>> the same CPU (queue?) that was used when receiving?
>
> Yes, for forwarding loads.
>
> (You might use 5 or 8 instead of 4, because its not clear to me if bnx2
> has 5 txqueues or 4 in your case)

Thank you. What happens if I set it to a lower/bigger value, than 
avaliable txqueues in a NIC?

Best regards,

			Krzysztof Olędzki

^ permalink raw reply

* Re: bnx2/BCM5709: why 5 interrupts on a 4 core system (2.6.33.3)
From: Eric Dumazet @ 2010-05-18 14:26 UTC (permalink / raw)
  To: Krzysztof Olędzki; +Cc: Michael Chan, netdev@vger.kernel.org
In-Reply-To: <4BF2A288.7040304@ans.pl>

Le mardi 18 mai 2010 à 16:22 +0200, Krzysztof Olędzki a écrit :

> Thank you. What happens if I set it to a lower/bigger value, than 
> avaliable txqueues in a NIC?

lower values -> same situation than today (not all txqueues will be
used)

bigger values -> it will be capped, so its only a bit more ram
allocated.




^ permalink raw reply

* Re: bnx2/BCM5709: why 5 interrupts on a 4 core system (2.6.33.3)
From: Krzysztof Olędzki @ 2010-05-18 14:55 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Michael Chan, netdev@vger.kernel.org
In-Reply-To: <1274192761.8274.3.camel@edumazet-laptop>

On 2010-05-18 16:26, Eric Dumazet wrote:
> Le mardi 18 mai 2010 à 16:22 +0200, Krzysztof Olędzki a écrit :
>
>> Thank you. What happens if I set it to a lower/bigger value, than
>> avaliable txqueues in a NIC?
>
> lower values ->  same situation than today (not all txqueues will be
> used)
>
> bigger values ->  it will be capped, so its only a bit more ram
> allocated.

So it is safe to put there little bigger value than needed. Thanks.

Best regards,

			Krzysztof Olędzki

^ permalink raw reply

* Re: dev_get_valid_name buggy with hash collision
From: Daniel Lezcano @ 2010-05-18 14:55 UTC (permalink / raw)
  To: Octavian Purdila; +Cc: Linux Netdev List
In-Reply-To: <201005181529.37420.opurdila@ixiacom.com>

On 05/18/2010 02:29 PM, Octavian Purdila wrote:
> On Tuesday 18 May 2010 13:17:10 you wrote:
>
>    
>> the commit:
>>
>> commit d90310243fd750240755e217c5faa13e24f41536
>> Author: Octavian Purdila<opurdila@ixiacom.com>
>> Date:   Wed Nov 18 02:36:59 2009 +0000
>>
>>       net: device name allocation cleanups
>>
>> introduced a bug when there is a hash collision making impossible to
>> rename a device with eth%d
>>
>>      

[ ... ]

>> --- net-2.6.orig/net/core/dev.c
>> +++ net-2.6/net/core/dev.c
>> @@ -936,18 +936,22 @@ int dev_alloc_name(struct net_device *de
>> }
>> EXPORT_SYMBOL(dev_alloc_name);
>>
>> -static int dev_get_valid_name(struct net *net, const char *name, char *buf,
>> -                             bool fmt)
>> +static int dev_get_valid_name(struct net_device *dev, const char *name, bool
>>      
> fmt)
>    
>> {
>> +       struct net *net;
>> +
>> +       BUG_ON(!dev_net(dev));
>> +       net = dev_net(dev);
>> +
>>         if (!dev_valid_name(name))
>>                 return -EINVAL;
>>
>>         if (fmt&&  strchr(name, '%'))
>> -               return __dev_alloc_name(net, name, buf);
>> +               return dev_alloc_name(dev, name);
>>         else if (__dev_get_by_name(net, name))
>>                 return -EEXIST;
>> -       else if (buf != name)
>> -               strlcpy(buf, name, IFNAMSIZ);
>> +       else if (strncmp(dev->name, name, IFNAMSIZ))
>> +                strlcpy(dev->name, name, IFNAMSIZ);
>>
>>      
> Why do the strncmp, can't we preserve the (buf != name) condition

The 'buf' parameter is no longer passed to the function. We have the 
'dev'  and the 'newname' parameters.
The pointer test was just to check 'dev_get_valid_name' was called from 
the 'register_netdevice' function context with 'dev_get_valid_name(net, 
dev->name, dev->name, 0)'. Comparing the strings is valid in this case.

Otherwise dev_get_valid_name is called from:

  *  "dev_change_net_namespace" with "dev%d" or "ifname" specified 
within the netlink message. Both are different pointers, the first will 
fall in the "if (fmt && strchr(name, '%'))".

  * "dev_change_name", where the pointers are different and the strings 
are different.

I think it is safe to do the string comparison here. But maybe there are 
a few simplifications (eg. remove fmt) to do.
If you agree, I will send this patch against net-2.6 and the 
simplifications against net-next-2.6.

Thanks
   -- Daniel



^ permalink raw reply

* Re: [PATCH] iproute2: rework SR-IOV VF support
From: Stephen Hemminger @ 2010-05-18 15:15 UTC (permalink / raw)
  To: Chris Wright; +Cc: netdev, Williams, Mitch A
In-Reply-To: <20100518075700.GE8301@sequoia.sous-sol.org>

On Tue, 18 May 2010 00:57:00 -0700
Chris Wright <chrisw@sous-sol.org> wrote:

> The kernel interface changed just before 2.6.34 was released.  This brings
> iproute2 in line with the current changes.  The VF portion of setlink is
> comprised of a set of nested attributes.
> 
>   IFLA_VFINFO_LIST (NESTED)
>     IFLA_VF_INFO (NESTED)
>       IFLA_VF_MAC
>       IFLA_VF_VLAN
>       IFLA_VF_TX_RATE
> 
> Signed-off-by: Chris Wright <chrisw@sous-sol.org>
> ---

Applied 

-- 

^ permalink raw reply

* [PATCH net-next] ixgbe: return error in set_rar when index out of range
From: Shirley Ma @ 2010-05-18 15:34 UTC (permalink / raw)
  To: jeffrey.t.kirsher; +Cc: e1000-devel, netdev, davem, kvm

Return -1 when set_rar index is out of range

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---
 
 drivers/net/ixgbe/ixgbe_common.c |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_common.c
b/drivers/net/ixgbe/ixgbe_common.c
index 1159d91..77b3cf4 100644
--- a/drivers/net/ixgbe/ixgbe_common.c
+++ b/drivers/net/ixgbe/ixgbe_common.c
@@ -1188,6 +1188,7 @@ s32 ixgbe_set_rar_generic(struct ixgbe_hw *hw, u32
index, u8 *addr, u32 vmdq,
 		IXGBE_WRITE_REG(hw, IXGBE_RAH(index), rar_high);
 	} else {
 		hw_dbg(hw, "RAR index %d is out of range.\n", index);
+		return -1;
 	}
 
 	return 0;



------------------------------------------------------------------------------

_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel&#174; Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply related

* Re: bnx2/BCM5709: why 5 interrupts on a 4 core system (2.6.33.3)
From: Krzysztof Olędzki @ 2010-05-18 15:35 UTC (permalink / raw)
  To: Michael Chan; +Cc: netdev@vger.kernel.org
In-Reply-To: <C27F8246C663564A84BB7AB3439772421B78147539@IRVEXCHCCR01.corp.ad.broadcom.com>

On 2010-05-16 20:51, Michael Chan wrote:
> Krzysztof Oledzki wrote:
>
>>
>> Why the driver registers 5 interrupts instead of 4? How to
>> limit it to 4?
>>
>
> The first vector (eth0-0) handles link interrupt and other slow
> path events.  It also has an RX ring for non-IP packets that are
> not hashed by the RSS hash.  The majority of the rx packets should
> be hashed to the rx rings eth0-1 - eth0-4, so I would assign these
> vectors to different CPUs.

Did some more test on a two 4 core CPUs (8 CPUs reported to the system) 
and on a two 4 core CPUs with HT (16 CPUs reported to the system) and in 
both cases there are 8 instead of 9 vectors: eth0-0 .. eth0-7 (irqs 61 
.. 68). However, dmesg shows that 9 interrupts are allocated:

bnx2 0000:01:00.0: irq 61 for MSI/MSI-X
bnx2 0000:01:00.0: irq 62 for MSI/MSI-X
bnx2 0000:01:00.0: irq 63 for MSI/MSI-X
bnx2 0000:01:00.0: irq 64 for MSI/MSI-X
bnx2 0000:01:00.0: irq 65 for MSI/MSI-X
bnx2 0000:01:00.0: irq 66 for MSI/MSI-X
bnx2 0000:01:00.0: irq 67 for MSI/MSI-X
bnx2 0000:01:00.0: irq 68 for MSI/MSI-X
bnx2 0000:01:00.0: irq 69 for MSI/MSI-X

It such case, which ring will be used for slow path and non-IP packets 
and why there is no additional queue like in a 4CPU case?

Best regards,

			Krzysztof Olędzki

^ permalink raw reply

* [PATCH net-next-2.6] bonding: move slave MTU handling from sysfs V2
From: Jiri Pirko @ 2010-05-18 15:42 UTC (permalink / raw)
  To: netdev; +Cc: davem, fubar, bonding-devel, monis

V1->V2: corrected res/ret use

For some reason, MTU handling (storing, and restoring) is taking  place in
bond_sysfs. The correct place for this code is in bond_enslave, bond_release.
So move it there.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 drivers/net/bonding/bond_main.c  |   15 ++++++++++++++-
 drivers/net/bonding/bond_sysfs.c |   22 ++--------------------
 2 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 5e12462..2c3f9db 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1533,6 +1533,14 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 	 */
 	new_slave->original_flags = slave_dev->flags;
 
+	/* Save slave's original mtu and then set it to match the bond */
+	new_slave->original_mtu = slave_dev->mtu;
+	res = dev_set_mtu(slave_dev, bond->dev->mtu);
+	if (res) {
+		pr_debug("Error %d calling dev_set_mtu\n", res);
+		goto err_free;
+	}
+
 	/*
 	 * Save slave's original ("permanent") mac address for modes
 	 * that need it, and for restoring it upon release, and then
@@ -1550,7 +1558,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 		res = dev_set_mac_address(slave_dev, &addr);
 		if (res) {
 			pr_debug("Error %d calling set_mac_address\n", res);
-			goto err_free;
+			goto err_restore_mtu;
 		}
 	}
 
@@ -1785,6 +1793,9 @@ err_restore_mac:
 		dev_set_mac_address(slave_dev, &addr);
 	}
 
+err_restore_mtu:
+	dev_set_mtu(slave_dev, new_slave->original_mtu);
+
 err_free:
 	kfree(new_slave);
 
@@ -1969,6 +1980,8 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
 		dev_set_mac_address(slave_dev, &addr);
 	}
 
+	dev_set_mtu(slave_dev, slave->original_mtu);
+
 	slave_dev->priv_flags &= ~(IFF_MASTER_8023AD | IFF_MASTER_ALB |
 				   IFF_SLAVE_INACTIVE | IFF_BONDING |
 				   IFF_SLAVE_NEEDARP);
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 392e291..29a7a8a 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -220,7 +220,6 @@ static ssize_t bonding_store_slaves(struct device *d,
 	char command[IFNAMSIZ + 1] = { 0, };
 	char *ifname;
 	int i, res, ret = count;
-	u32 original_mtu;
 	struct slave *slave;
 	struct net_device *dev = NULL;
 	struct bonding *bond = to_bond(d);
@@ -281,18 +280,7 @@ static ssize_t bonding_store_slaves(struct device *d,
 			memcpy(bond->dev->dev_addr, dev->dev_addr,
 			       dev->addr_len);
 
-		/* Set the slave's MTU to match the bond */
-		original_mtu = dev->mtu;
-		res = dev_set_mtu(dev, bond->dev->mtu);
-		if (res) {
-			ret = res;
-			goto out;
-		}
-
 		res = bond_enslave(bond->dev, dev);
-		bond_for_each_slave(bond, slave, i)
-			if (strnicmp(slave->dev->name, ifname, IFNAMSIZ) == 0)
-				slave->original_mtu = original_mtu;
 		if (res)
 			ret = res;
 
@@ -301,23 +289,17 @@ static ssize_t bonding_store_slaves(struct device *d,
 
 	if (command[0] == '-') {
 		dev = NULL;
-		original_mtu = 0;
 		bond_for_each_slave(bond, slave, i)
 			if (strnicmp(slave->dev->name, ifname, IFNAMSIZ) == 0) {
 				dev = slave->dev;
-				original_mtu = slave->original_mtu;
 				break;
 			}
 		if (dev) {
 			pr_info("%s: Removing slave %s\n",
 				bond->dev->name, dev->name);
-				res = bond_release(bond->dev, dev);
-			if (res) {
+			res = bond_release(bond->dev, dev);
+			if (res)
 				ret = res;
-				goto out;
-			}
-			/* set the slave MTU to the default */
-			dev_set_mtu(dev, original_mtu);
 		} else {
 			pr_err("unable to remove non-existent slave %s for bond %s.\n",
 			       ifname, bond->dev->name);
-- 
1.6.6.1


^ permalink raw reply related

* [PATCH net-next-2.6] bonding: remove redundant checks from bonding_store_slaves V2
From: Jiri Pirko @ 2010-05-18 15:44 UTC (permalink / raw)
  To: netdev; +Cc: davem, fubar, bonding-devel

(it's actually the same as v1)

Remove checks that duplicates similar checks in bond_enslave.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 drivers/net/bonding/bond_sysfs.c |   20 +-------------------
 1 files changed, 1 insertions(+), 19 deletions(-)

diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 29a7a8a..7911438 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -243,7 +243,7 @@ static ssize_t bonding_store_slaves(struct device *d,
 
 	if (command[0] == '+') {
 
-		/* Got a slave name in ifname.  Is it already in the list? */
+		/* Got a slave name in ifname. */
 
 		dev = __dev_get_by_name(dev_net(bond->dev), ifname);
 		if (!dev) {
@@ -253,24 +253,6 @@ static ssize_t bonding_store_slaves(struct device *d,
 			goto out;
 		}
 
-		if (dev->flags & IFF_UP) {
-			pr_err("%s: Error: Unable to enslave %s because it is already up.\n",
-			       bond->dev->name, dev->name);
-			ret = -EPERM;
-			goto out;
-		}
-
-		read_lock(&bond->lock);
-		bond_for_each_slave(bond, slave, i)
-			if (slave->dev == dev) {
-				pr_err("%s: Interface %s is already enslaved!\n",
-				       bond->dev->name, ifname);
-				ret = -EPERM;
-				read_unlock(&bond->lock);
-				goto out;
-			}
-		read_unlock(&bond->lock);
-
 		pr_info("%s: Adding slave %s.\n", bond->dev->name, ifname);
 
 		/* If this is the first slave, then we need to set
-- 
1.6.6.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox