Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 2/2] net: Add classid to sk_buff.
From: David Miller @ 2010-05-20  2:55 UTC (permalink / raw)
  To: bmb; +Cc: tgraf, nhorman, nhorman, eric.dumazet, herbert, netdev


We make this zero cost by moving queue_mapping into an existing
empty __u16 slot.  Thus making a __u32 available, which we use
for the 'classid'.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |    6 ++----
 1 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7c16f24..f847ec2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -367,12 +367,9 @@ struct sk_buff {
 #ifdef CONFIG_NET_CLS_ACT
 	__u16			tc_verd;	/* traffic control verdict */
 #endif
+	__u32			classid;
 #endif
 
-	__u16			queue_mapping;
-
-	/* 16 bit hole */
-
 #ifdef CONFIG_NET_DMA
 	dma_cookie_t		dma_cookie;
 #endif
@@ -385,6 +382,7 @@ struct sk_buff {
 	};
 
 	__u16			vlan_tci;
+	__u16			queue_mapping;
 
 	sk_buff_data_t		transport_header;
 	sk_buff_data_t		network_header;
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 1/2] ipv6: Store ndisc_nodeid in IP6CB().
From: David Miller @ 2010-05-20  2:55 UTC (permalink / raw)
  To: bmb; +Cc: tgraf, nhorman, nhorman, eric.dumazet, herbert, netdev


There is no reason we need to use up space in the generic
SKB area for this.  All packet input paths in ipv6 explicitly
clear out the IP6CB() area and therefore the default value
for ndisc_nodetype will be correct.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h   |    2 +-
 include/linux/skbuff.h |    9 ++-------
 net/ipv6/ndisc.c       |   11 ++++++-----
 net/ipv6/sit.c         |    9 ++++++---
 4 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index e0cc9a7..fc39add 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -247,7 +247,7 @@ struct inet6_skb_parm {
 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 	__u16			dsthao;
 #endif
-
+	__u8			ndisc_nodetype;
 #define IP6SKB_XFRM_TRANSFORMED	1
 #define IP6SKB_FORWARDED	2
 };
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 124f90c..7c16f24 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -369,14 +369,9 @@ struct sk_buff {
 #endif
 #endif
 
-	kmemcheck_bitfield_begin(flags2);
-	__u16			queue_mapping:16;
-#ifdef CONFIG_IPV6_NDISC_NODETYPE
-	__u8			ndisc_nodetype:2;
-#endif
-	kmemcheck_bitfield_end(flags2);
+	__u16			queue_mapping;
 
-	/* 0/14 bit hole */
+	/* 16 bit hole */
 
 #ifdef CONFIG_NET_DMA
 	dma_cookie_t		dma_cookie;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index da0a4d2..e7c0897 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1134,7 +1134,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 	}
 
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
-	if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) {
+	if (IP6CB(skb)->ndisc_nodetype == NDISC_NODETYPE_HOST) {
 		ND_PRINTK2(KERN_WARNING
 			   "ICMPv6 RA: from host or unauthorized router\n");
 		return;
@@ -1166,7 +1166,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
 	/* skip link-specific parameters from interior routers */
-	if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT)
+	if (IP6CB(skb)->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT)
 		goto skip_linkparms;
 #endif
 
@@ -1323,7 +1323,8 @@ skip_linkparms:
 		     p = ndisc_next_option(p, ndopts.nd_opts_ri_end)) {
 			struct route_info *ri = (struct route_info *)p;
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
-			if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT &&
+			if (IP6CB(skb)->ndisc_nodetype ==
+			    NDISC_NODETYPE_NODEFAULT &&
 			    ri->prefix_len == 0)
 				continue;
 #endif
@@ -1337,7 +1338,7 @@ skip_linkparms:
 
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
 	/* skip link-specific ndopts from interior routers */
-	if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT)
+	if (IP6CB(skb)->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT)
 		goto out;
 #endif
 
@@ -1405,7 +1406,7 @@ static void ndisc_redirect_rcv(struct sk_buff *skb)
 	u8 *lladdr = NULL;
 
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
-	switch (skb->ndisc_nodetype) {
+	switch (IP6CB(skb)->ndisc_nodetype) {
 	case NDISC_NODETYPE_HOST:
 	case NDISC_NODETYPE_NODEFAULT:
 		ND_PRINTK2(KERN_WARNING
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 5abae10..ac014e0 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -427,21 +427,24 @@ static int
 isatap_chksrc(struct sk_buff *skb, struct iphdr *iph, struct ip_tunnel *t)
 {
 	struct ip_tunnel_prl_entry *p;
+	struct inet6_skb_parm *cb;
 	int ok = 1;
 
+	cb = IP6CB(skb);
+
 	rcu_read_lock();
 	p = __ipip6_tunnel_locate_prl(t, iph->saddr);
 	if (p) {
 		if (p->flags & PRL_DEFAULT)
-			skb->ndisc_nodetype = NDISC_NODETYPE_DEFAULT;
+			cb->ndisc_nodetype = NDISC_NODETYPE_DEFAULT;
 		else
-			skb->ndisc_nodetype = NDISC_NODETYPE_NODEFAULT;
+			cb->ndisc_nodetype = NDISC_NODETYPE_NODEFAULT;
 	} else {
 		struct in6_addr *addr6 = &ipv6_hdr(skb)->saddr;
 		if (ipv6_addr_is_isatap(addr6) &&
 		    (addr6->s6_addr32[3] == iph->saddr) &&
 		    ipv6_chk_prefix(addr6, t->dev))
-			skb->ndisc_nodetype = NDISC_NODETYPE_HOST;
+			cb->ndisc_nodetype = NDISC_NODETYPE_HOST;
 		else
 			ok = 0;
 	}
-- 
1.7.0.4


^ permalink raw reply related

* Re: tun: Use netif_receive_skb instead of netif_rx
From: David Miller @ 2010-05-20  2:55 UTC (permalink / raw)
  To: bmb; +Cc: tgraf, nhorman, nhorman, eric.dumazet, herbert, netdev
In-Reply-To: <4BF4517F.1010206@athenacr.com>

Since it seems now inevitable to me that we'll need to store the
'classid' in struct sk_buff, I've prepared two patches to do that at
zero cost.

Someone please go with this.

^ permalink raw reply

* Re: how many msi (msi-x) vectors can be setup?
From: Yinghai @ 2010-05-20  2:53 UTC (permalink / raw)
  To: zhou rui; +Cc: netdev
In-Reply-To: <AANLkTinV8MHMEySnJWzqr0fUdEb_sbJ484y86VpvtlhP@mail.gmail.com>

On 05/19/2010 07:04 PM, zhou rui wrote:
> it is kernel2.6.27, Intel(R) Xeon(R) CPU           E5540,64bit,16
> processors.,so there should be 16 vectors?

are you using sles 11 and x2apic?

YH

^ permalink raw reply

* Re: [PATCH] net: fix problem in dequeuing from input_pkt_queue
From: Tom Herbert @ 2010-05-20  2:48 UTC (permalink / raw)
  To: Changli Gao; +Cc: davem, eric.dumazet, netdev
In-Reply-To: <AANLkTinZHzZykWYWu9vOmK5ydtcB6ToVBKG1RKYSuSn4@mail.gmail.com>

>> It should be okay?  process_backlog only runs in softirq so bottom
>> halves are already disabled, and I don't think flush_backlog runs out
>> of an interrupt.
>>
>
> Oh no. It is an IRQ handler.
>
Very well, I will fix that.

Now I'm wondering, though, what the purpose of flush_backlog is...
since __netif_receive_skb is called with interrupts enabled, it's
obvious flush_backlog won't catch all the skb's that reference the
device that is going away.  Is there a reason these packets need to be
flushed and can't just be processed?

>  on_each_cpu(flush_backlog, dev, 1);
> ...
> int on_each_cpu(void (*func) (void *info), void *info, int wait)
> {
>        int ret = 0;
>
>        preempt_disable();
>        ret = smp_call_function(func, info, wait);
>        local_irq_disable();
>        func(info);
>        local_irq_enable();
>        preempt_enable();
>        return ret;
> }
>
> --
> Regards，
> Changli Gao(xiaosuo@gmail.com)
>

^ permalink raw reply

* Re: [GIT] Networking
From: David Miller @ 2010-05-20  2:34 UTC (permalink / raw)
  To: torvalds; +Cc: akpm, netdev, linux-kernel
In-Reply-To: <20100518.233752.256882583.davem@davemloft.net>

From: David Miller <davem@davemloft.net>
Date: Tue, 18 May 2010 23:37:52 -0700 (PDT)

> Please pull, thanks a lot!
> 
> The following changes since commit 537b60d17894b7c19a6060feae40299d7109d6e7:
>   Linus Torvalds (1):
>         Merge branch 'x86-uv-for-linus' of git://git.kernel.org/.../tip/linux-2.6-tip
> 
> are available in the git repository at:
> 
>   master.kernel.org:/pub/scm/linux/kernel/git/davem/net-next-2.6.git master

Ping?

^ permalink raw reply

* Re: how many msi (msi-x) vectors can be setup?
From: zhou rui @ 2010-05-20  2:04 UTC (permalink / raw)
  To: Yinghai Lu; +Cc: netdev
In-Reply-To: <AANLkTimrQPhPomBC-x-E0b9BKDUmO2j2flZp_Qm2aAtK@mail.gmail.com>

it is kernel2.6.27, Intel(R) Xeon(R) CPU           E5540,64bit,16
processors.,so there should be 16 vectors?



On Thu, May 20, 2010 at 4:33 AM, Yinghai Lu <yinghai@kernel.org> wrote:
> On Wed, May 19, 2010 at 8:18 AM, zhou rui <wirelesser@gmail.com> wrote:
>> hi there:
>> how many msi (msi-x) vectors can be setup?
>> the number is limited by hardware resource(nic), or kernel ?
>> I found that the driver (broadcom 57711 ver 1.5.12) tried to request
>> 16 queues on my kernel2.6.27,but only 2  available
>> will it be increased if I update the driver or kernel?
>> and there is a limitiation in the system? if the other devices have
>> already occupied too many MSI vectors then it is not enough.
>
> from kernel 2.6.19 x86_64, there is per-cpu vector irq support.
>
> depends your system : CPU num? 64bit or 32bit.
>
> YH
>

^ permalink raw reply

* Re: HFSC classes going out of bounds, regression in recent kernels?
From: Denys Fedorysychenko @ 2010-05-20  1:34 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: netdev, Jeff Garzik, Eric Dumazet
In-Reply-To: <4BBB5543.40607@trash.net>

I am trying to track down an HFSC bug.
It seems that, in the end, it is most probably related to PSCHED_SHIFT; I am 
testing again. I will try to do a complete clean build; maybe last time some .o 
was left over or I forgot to do make clean.

SM_SHIFT in HFSC is calculated as 30 - PSCHED_SHIFT, and it is shifted too 
much (or not enough) with the new changes (ISM_SHIFT seems wrong too). So it is 
most probably an overflow or not enough resolution.
I will try to change PSCHED_SHIFT back to confirm that; at least I have found 
a way to reproduce the bug.

Additionally, in sch_hfsc.c I noticed it is mentioned that PSCHED_SHIFT 10 is 
one tick per 1024us, but when I try to recalculate the table (in the source 
comments), it doesn't fit my calculations based on 1024us/tick; it fits well 
with 1024 nanoseconds per tick.

Was it 1024ns per tick before and 64ns per tick now? Or is it microseconds (us)?

^ permalink raw reply

* Re: [RFC] netem: correlated loss generation (v3)
From: Hagen Paul Pfeifer @ 2010-05-20  0:43 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Stefano Salsano, David Miller, Fabio Ludovici, netdev, netem
In-Reply-To: <20100519171733.71c24539@nehalam>

* Stephen Hemminger | 2010-05-19 17:17:33 [-0700]:

>The old model was useful, but it really didn't do correlated loss.
>For legacy, the old syntax will go through the same code and generate
>the same result.

Is this really necessary? The right thing is to fix the broken behavior! If
the new patch provides this, great.  Imagine a network analysis for a PhD
dissertation based on a broken correlation algorithm - all of the results are
misleading and wrong. No one deserves this ... ;-)

If the current algorithm is broken then the mechanism must be fixed. Preserving
compatibility is counterproductive in this case.

>tc qdisc change dev eth0 root netem 
>      loss 2 10                              # compat syntax
>      loss random 2 10                       # same as above
>      loss deterministic file                # loss model based on bitmap
>      loss state p13 [p31 [p32 [p23 [p14]]]] # 4 state 
>      loss model  p [r [1-h [1-k]]]          # gilbert elliot model
>
>Any suggestions for better syntax are appreciated.

Not at the moment, looks clear and understandable.


Cheers, Hagen

-- 
Hagen Paul Pfeifer <hagen@jauu.net>  ||  http://jauu.net/
Telephone: +49 174 5455209           ||  Key Id: 0x98350C22
Key Fingerprint: 490F 557B 6C48 6D7E 5706 2EA2 4A22 8D45 9835 0C22


^ permalink raw reply

* Re: [RFC] netem: correlated loss generation (v3)
From: Stefano Salsano @ 2010-05-20  0:22 UTC (permalink / raw)
  To: Hagen Paul Pfeifer
  Cc: Stephen Hemminger, David Miller, Fabio Ludovici, netdev, netem
In-Reply-To: <20100519230433.GE5146@nuttenaction>

Hagen Paul Pfeifer wrote:
> * Stefano Salsano | 2010-05-20 00:52:00 [+0200]:
> 
>> So my opinion is that the need to emulate "correlated" loss patterns
>> is not academic, but it is a real need from industry... of course we
>> can debate if it is a "niche" requirement or not
> 
> netem is not in the processing hot path, so there is no issue to add an
> additional component. If there are some[TM] users and it is usable, I am
> fine with this patch!
> 
>> tc qdisc change dev wlan0 root netem loss 2 10
>>
>> because this produces broken results...
> 
> How to model this specific network characteristic (2% loss, correlation 10%)
> with your modifications? Can you give us an example?
> 

The definition of "correlation" for the correlated loss was 
intrinsically broken.

We can now use two models to introduce correlated loss events.

One is called GI (General and Intuitive), where the "burst length" of 
consecutive loss events is used to measure correlation, so the second 
(optional) parameter is not the "correlation" but the burst length:

tc qdisc add dev wlan0 root netem loss_GI ploss burst_length

for example if ploss = 2% then the burst length for uncorrelated loss 
will be 1/(1-ploss) = 1 / 0.98 ~= 1.02

this means that you will almost always have isolated loss events if 
burst_length is 1.02

everything greater than 1.02 for burst_length will add a correlation in 
the loss patterns, for example:

tc qdisc add dev wlan0 root netem loss_GI 2 3

will mean that the loss events will be grouped in bursts of average 
length 3 (to keep the 2% loss this will result in less frequent loss 
bursts, but with more consecutive losses per burst)

The second model is called the Gilbert-Elliott model; you have to input two 
parameters p and r:

tc qdisc add dev eth0 root netem loss_gilb_ell p r

p and r are related to ploss and burst_length in the following way:
ploss = p/(p+r)
burst_length = 1/r

Cheers,
Stefano

PS Thank you for your question! It was important to clarify the new 
approach with such an example. We will soon add this discussion to the 
documentation available at 
http://netgroup.uniroma2.it/twiki/bin/view.cgi/Main/NetemCLG


> Cheers, Hagen
> 


-- 
*******************************************************************
Stefano Salsano
Dipartimento Ingegneria Elettronica
Universita' di Roma "Tor Vergata"
Via del Politecnico, 1 - 00133 Roma - ITALY

http://netgroup.uniroma2.it/Stefano_Salsano/

E-mail  : stefano.salsano@uniroma2.it
Cell.   : +39 320 4307310
Office  : (Tel.) +39 06 72597770  (Fax.) +39 06 72597435
*******************************************************************

^ permalink raw reply

* Re: [RFC] netem: correlated loss generation (v3)
From: Stephen Hemminger @ 2010-05-20  0:17 UTC (permalink / raw)
  To: Hagen Paul Pfeifer
  Cc: Stefano Salsano, David Miller, Fabio Ludovici, netdev, netem
In-Reply-To: <20100519230433.GE5146@nuttenaction>

On Thu, 20 May 2010 01:04:33 +0200
Hagen Paul Pfeifer <hagen@jauu.net> wrote:

> * Stefano Salsano | 2010-05-20 00:52:00 [+0200]:
> 
> >So my opinion is that the need to emulate "correlated" loss patterns
> >is not academic, but it is a real need from industry... of course we
> >can debate if it is a "niche" requirement or not
> 
> netem is not in the processing hot path, so there is no issue to add an
> additional component. If there are some[TM] users and it is usable, I am
> fine with this patch!
> 
> >tc qdisc change dev wlan0 root netem loss 2 10
> >
> >because this produces broken results...
> 
> How to model this specific network characteristic (2% loss, correlation 10%)
> with your modifications? Can you give us an example?

The old model was useful, but it really didn't do correlated loss.
For legacy, the old syntax will go through the same code and generate
the same result.

iproute2 syntax is not finalized but, plan is simplified version of
the NetemCLG paper.

tc qdisc change dev eth0 root netem 
      loss 2 10                              # compat syntax
      loss random 2 10                       # same as above
      loss deterministic file                # loss model based on bitmap
      loss state p13 [p31 [p32 [p23 [p14]]]] # 4 state 
      loss model  p [r [1-h [1-k]]]          # gilbert elliot model

Any suggestions for better syntax are appreciated.

^ permalink raw reply

* Re: [PATCH] net: fix problem in dequeuing from input_pkt_queue
From: Changli Gao @ 2010-05-20  0:09 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, eric.dumazet, netdev
In-Reply-To: <AANLkTilxZdJifPtBANrDNdYWxiAMLY6AwA0KYm1yWjle@mail.gmail.com>

On Thu, May 20, 2010 at 7:58 AM, Tom Herbert <therbert@google.com> wrote:
>>>        napi->weight = weight_p;
>>> -       local_irq_disable();
>>>        while (work < quota) {
>>>                struct sk_buff *skb;
>>>                unsigned int qlen;
>>>
>>>                while ((skb = __skb_dequeue(&sd->process_queue))) {
>>> -                       local_irq_enable();
>>
>> we need to keep local irq disabled. If not, flush_backlog may be
>> called, and it will access sd->process_queue.
>>
>
> It should be okay?  process_backlog only runs in softirq so bottom
> halves are already disabled, and I don't think flush_backlog runs out
> of an interrupt.
>

Oh no. It is an IRQ handler.

  on_each_cpu(flush_backlog, dev, 1);
...
int on_each_cpu(void (*func) (void *info), void *info, int wait)
{
        int ret = 0;

        preempt_disable();
        ret = smp_call_function(func, info, wait);
        local_irq_disable();
        func(info);
        local_irq_enable();
        preempt_enable();
        return ret;
}

-- 
Regards，
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [PATCH net-next-2.6] bonding: move slave MTU handling from sysfs V2
From: Jay Vosburgh @ 2010-05-20  0:07 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: netdev, davem, bonding-devel, monis
In-Reply-To: <20100518154016.GD2878@psychotron.lab.eng.brq.redhat.com>

Jiri Pirko <jpirko@redhat.com> wrote:

>V1->V2: corrected res/ret use
>
>For some reason, MTU handling (storing, and restoring) is taking  place in
>bond_sysfs. The correct place for this code is in bond_enslave, bond_release.
>So move it there.

	In principle this looks ok, as do the other patches, but none of
them apply to net-next-2.6 for me except for the "optimize
tlb_get_least_loaded_slave" patch.  It looks like you left out a patch,
see below.

>Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>---
> drivers/net/bonding/bond_main.c  |   15 ++++++++++++++-
> drivers/net/bonding/bond_sysfs.c |   22 ++--------------------
> 2 files changed, 16 insertions(+), 21 deletions(-)
>
>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index 5e12462..2c3f9db 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -1533,6 +1533,14 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
> 	 */
> 	new_slave->original_flags = slave_dev->flags;
>
>+	/* Save slave's original mtu and then set it to match the bond */
>+	new_slave->original_mtu = slave_dev->mtu;
>+	res = dev_set_mtu(slave_dev, bond->dev->mtu);
>+	if (res) {
>+		pr_debug("Error %d calling dev_set_mtu\n", res);
>+		goto err_free;
>+	}
>+
> 	/*
> 	 * Save slave's original ("permanent") mac address for modes
> 	 * that need it, and for restoring it upon release, and then
>@@ -1550,7 +1558,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
> 		res = dev_set_mac_address(slave_dev, &addr);
> 		if (res) {
> 			pr_debug("Error %d calling set_mac_address\n", res);
>-			goto err_free;
>+			goto err_restore_mtu;
> 		}
> 	}
>
>@@ -1785,6 +1793,9 @@ err_restore_mac:
> 		dev_set_mac_address(slave_dev, &addr);
> 	}
>
>+err_restore_mtu:
>+	dev_set_mtu(slave_dev, new_slave->original_mtu);
>+
> err_free:
> 	kfree(new_slave);
>
>@@ -1969,6 +1980,8 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
> 		dev_set_mac_address(slave_dev, &addr);
> 	}
>
>+	dev_set_mtu(slave_dev, slave->original_mtu);
>+
> 	slave_dev->priv_flags &= ~(IFF_MASTER_8023AD | IFF_MASTER_ALB |
> 				   IFF_SLAVE_INACTIVE | IFF_BONDING |
> 				   IFF_SLAVE_NEEDARP);
>diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
>index 392e291..29a7a8a 100644
>--- a/drivers/net/bonding/bond_sysfs.c
>+++ b/drivers/net/bonding/bond_sysfs.c
>@@ -220,7 +220,6 @@ static ssize_t bonding_store_slaves(struct device *d,
> 	char command[IFNAMSIZ + 1] = { 0, };
> 	char *ifname;
> 	int i, res, ret = count;
>-	u32 original_mtu;
> 	struct slave *slave;
> 	struct net_device *dev = NULL;
> 	struct bonding *bond = to_bond(d);

	This chunk doesn't apply to net-next-2.6 because your context
doesn't match; it looks like you've removed the variable "found" in your
"before" source.  On closer inspection, "found" isn't actually used
meaningfully, so I'm guessing you removed it in a prior patch but didn't
submit that patch.

	If that's the case, could you repost the whole series, with
sequence numbers?

	-J

>@@ -281,18 +280,7 @@ static ssize_t bonding_store_slaves(struct device *d,
> 			memcpy(bond->dev->dev_addr, dev->dev_addr,
> 			       dev->addr_len);
>
>-		/* Set the slave's MTU to match the bond */
>-		original_mtu = dev->mtu;
>-		res = dev_set_mtu(dev, bond->dev->mtu);
>-		if (res) {
>-			ret = res;
>-			goto out;
>-		}
>-
> 		res = bond_enslave(bond->dev, dev);
>-		bond_for_each_slave(bond, slave, i)
>-			if (strnicmp(slave->dev->name, ifname, IFNAMSIZ) == 0)
>-				slave->original_mtu = original_mtu;
> 		if (res)
> 			ret = res;
>
>@@ -301,23 +289,17 @@ static ssize_t bonding_store_slaves(struct device *d,
>
> 	if (command[0] == '-') {
> 		dev = NULL;
>-		original_mtu = 0;
> 		bond_for_each_slave(bond, slave, i)
> 			if (strnicmp(slave->dev->name, ifname, IFNAMSIZ) == 0) {
> 				dev = slave->dev;
>-				original_mtu = slave->original_mtu;
> 				break;
> 			}
> 		if (dev) {
> 			pr_info("%s: Removing slave %s\n",
> 				bond->dev->name, dev->name);
>-				res = bond_release(bond->dev, dev);
>-			if (res) {
>+			res = bond_release(bond->dev, dev);
>+			if (res)
> 				ret = res;
>-				goto out;
>-			}
>-			/* set the slave MTU to the default */
>-			dev_set_mtu(dev, original_mtu);
> 		} else {
> 			pr_err("unable to remove non-existent slave %s for bond %s.\n",
> 			       ifname, bond->dev->name);
>-- 
>1.6.6.1

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply

* Re: [PATCH] net: fix problem in dequeuing from input_pkt_queue
From: Tom Herbert @ 2010-05-19 23:58 UTC (permalink / raw)
  To: Changli Gao; +Cc: davem, eric.dumazet, netdev
In-Reply-To: <AANLkTinvSgc3MTAJZEmhBqmRpYlHdOZ6HQ_j43_m8XX1@mail.gmail.com>

>>        napi->weight = weight_p;
>> -       local_irq_disable();
>>        while (work < quota) {
>>                struct sk_buff *skb;
>>                unsigned int qlen;
>>
>>                while ((skb = __skb_dequeue(&sd->process_queue))) {
>> -                       local_irq_enable();
>
> we need to keep local irq disabled. If not, flush_backlog may be
> called, and it will access sd->process_queue.
>

It should be okay?  process_backlog only runs in softirq so bottom
halves are already disabled, and I don't think flush_backlog runs out
of an interrupt.

>
> --
> Regards，
> Changli Gao(xiaosuo@gmail.com)
>

^ permalink raw reply

* Re: [PATCH] net: fix problem in dequeuing from input_pkt_queue
From: Changli Gao @ 2010-05-19 23:45 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, eric.dumazet, netdev
In-Reply-To: <alpine.DEB.1.00.1005191440440.23271@pokey.mtv.corp.google.com>

On Thu, May 20, 2010 at 5:47 AM, Tom Herbert <therbert@google.com> wrote:
> Fix some issues introduced in batch skb dequeuing for input_pkt_queue.
> The primary issue it that the queue head must be incremented only
> after a packet has been processed, that is only after
> __netif_receive_skb has been called.  This is needed for the mechanism
> to prevent OOO packet in RFS.  Also when flushing the input_pkt_queue
> and process_queue, the process queue should be done first to prevent
> OOO packets.
>
> Because the input_pkt_queue has been effectively split into two queues,
> the calculation of the tail ptr is no longer correct.  The correct value
> would be head+input_pkt_queue->len+process_queue->len.  To avoid
> this calculation we added an explict input_queue_tail in softnet_data.
> The tail value is simply incremented when queuing to input_pkt_queue.
>
> In process_backlog the processing of the packet queue can be done
> without irq's being disabled.
>
>  static int napi_gro_complete(struct sk_buff *skb)
> @@ -3320,26 +3319,24 @@ static int process_backlog(struct napi_struct *napi, int quota)
>        }
>  #endif
>        napi->weight = weight_p;
> -       local_irq_disable();
>        while (work < quota) {
>                struct sk_buff *skb;
>                unsigned int qlen;
>
>                while ((skb = __skb_dequeue(&sd->process_queue))) {
> -                       local_irq_enable();

we need to keep local irq disabled. If not, flush_backlog may be
called, and it will access sd->process_queue.


-- 
Regards，
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [RFC] netem: correlated loss generation (v3)
From: Stefano Salsano @ 2010-05-19 22:52 UTC (permalink / raw)
  To: Hagen Paul Pfeifer
  Cc: Stephen Hemminger, David Miller, Fabio Ludovici, netdev, netem
In-Reply-To: <20100519214239.GD5146@nuttenaction>

Hagen Paul Pfeifer wrote:
> * Stephen Hemminger | 2010-05-17 20:56:21 [-0700]:
> 
> Why mainline? I questioning the advantage for the big audience, it looks like
> a academic only piece of software - correct me if I'm wrong.
>

as an author, I can only give a "biased" point of view... anyway our 
work started from a cooperation with an industry which needed to test 
its solutions for fax/modem over IP under correlated loss.  When we put 
our first version on the netem list, we were asked by people from 
another industry to add the feature of loss patterns coming out from a 
deterministic table.

So my opinion is that the need to emulate "correlated" loss patterns is 
not academic, but it is a real need from industry... of course we can 
debate if it is a "niche" requirement or not

> The authors pointed to some weak points in the implementation of the current
> loss/correlation logic. But this "fix", add another - complicated component -
> and let the broken components untouched ...

Leaving or removing the broken component is an independent issue.

Maybe we should allow the old syntax to be used like this:

tc qdisc change dev wlan0 root netem loss 2

because it was working OK, and we should disallow using the old model 
in this way:

tc qdisc change dev wlan0 root netem loss 2 10

because this produces broken results...

BR,
Stefano

> 
> HGN
> 

-- 
*******************************************************************
Stefano Salsano
Dipartimento Ingegneria Elettronica
Universita' di Roma "Tor Vergata"
Via del Politecnico, 1 - 00133 Roma - ITALY

http://netgroup.uniroma2.it/Stefano_Salsano/

E-mail  : stefano.salsano@uniroma2.it
Cell.   : +39 320 4307310
Office  : (Tel.) +39 06 72597770  (Fax.) +39 06 72597435
*******************************************************************

^ permalink raw reply

* Re: [RFC] netem: correlated loss generation (v3)
From: Hagen Paul Pfeifer @ 2010-05-19 23:04 UTC (permalink / raw)
  To: Stefano Salsano
  Cc: Stephen Hemminger, David Miller, Fabio Ludovici, netdev, netem
In-Reply-To: <4BF46B90.1000806@uniroma2.it>

* Stefano Salsano | 2010-05-20 00:52:00 [+0200]:

>So my opinion is that the need to emulate "correlated" loss patterns
>is not academic, but it is a real need from industry... of course we
>can debate if it is a "niche" requirement or not

netem is not in the processing hot path, so there is no issue to add an
additional component. If there are some[TM] users and it is usable, I am
fine with this patch!

>tc qdisc change dev wlan0 root netem loss 2 10
>
>because this produces broken results...

How to model this specific network characteristic (2% loss, correlation 10%)
with your modifications? Can you give us an example?

Cheers, Hagen

-- 
Hagen Paul Pfeifer <hagen@jauu.net>  ||  http://jauu.net/
Telephone: +49 174 5455209           ||  Key Id: 0x98350C22
Key Fingerprint: 490F 557B 6C48 6D7E 5706 2EA2 4A22 8D45 9835 0C22

^ permalink raw reply

* [net-next PATCH v2] ixgbe:add support for a new 82599 10G Base-T device
From: Jeff Kirsher @ 2010-05-19 22:46 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Mallikarjuna R Chilakala, Jeff Kirsher

From: Mallikarjuna R Chilakala <mallikarjuna.chilakala@intel.com>

This adds support for a new copper device for 82599, device id 0x151c.
This 82599 10GBase-T device uses the PHY's internal temperature sensor
to guard against over-temp conditions. In this scenario the PHY will be
put in a low power mode and link will no longer be able to transmit or
receive any data. When this occurs, the over-temp interrupt is latched
and driver logs this error message. A HW reset or power cycle is
required to clear this status.

Signed-off-by: Mallikarjuna R Chilakala <mallikarjuna.chilakala@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/ixgbe/ixgbe.h       |    3 ++
 drivers/net/ixgbe/ixgbe_82598.c |    1 +
 drivers/net/ixgbe/ixgbe_82599.c |    1 +
 drivers/net/ixgbe/ixgbe_main.c  |   69 +++++++++++++++++++++++++++++++++++++++
 drivers/net/ixgbe/ixgbe_phy.c   |   31 ++++++++++++++++++
 drivers/net/ixgbe/ixgbe_phy.h   |    3 ++
 drivers/net/ixgbe/ixgbe_type.h  |    4 ++
 7 files changed, 112 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
index d0ea3d6..ffae480 100644
--- a/drivers/net/ixgbe/ixgbe.h
+++ b/drivers/net/ixgbe/ixgbe.h
@@ -360,6 +360,7 @@ struct ixgbe_adapter {
 	u32 flags2;
 #define IXGBE_FLAG2_RSC_CAPABLE                 (u32)(1)
 #define IXGBE_FLAG2_RSC_ENABLED                 (u32)(1 << 1)
+#define IXGBE_FLAG2_TEMP_SENSOR_CAPABLE         (u32)(1 << 2)
 /* default to trying for four seconds */
 #define IXGBE_TRY_LINK_TIMEOUT (4 * HZ)
 
@@ -407,6 +408,8 @@ struct ixgbe_adapter {
 	u16 eeprom_version;
 
 	int node;
+	struct work_struct check_overtemp_task;
+	u32 interrupt_event;
 
 	/* SR-IOV */
 	DECLARE_BITMAP(active_vfs, IXGBE_MAX_VF_FUNCTIONS);
diff --git a/drivers/net/ixgbe/ixgbe_82598.c b/drivers/net/ixgbe/ixgbe_82598.c
index f2b7ff4..9c02d60 100644
--- a/drivers/net/ixgbe/ixgbe_82598.c
+++ b/drivers/net/ixgbe/ixgbe_82598.c
@@ -1236,6 +1236,7 @@ static struct ixgbe_phy_operations phy_ops_82598 = {
 	.setup_link		= &ixgbe_setup_phy_link_generic,
 	.setup_link_speed	= &ixgbe_setup_phy_link_speed_generic,
 	.read_i2c_eeprom	= &ixgbe_read_i2c_eeprom_82598,
+	.check_overtemp   = &ixgbe_tn_check_overtemp,
 };
 
 struct ixgbe_info ixgbe_82598_info = {
diff --git a/drivers/net/ixgbe/ixgbe_82599.c b/drivers/net/ixgbe/ixgbe_82599.c
index e9706eb..a4e2901 100644
--- a/drivers/net/ixgbe/ixgbe_82599.c
+++ b/drivers/net/ixgbe/ixgbe_82599.c
@@ -2395,6 +2395,7 @@ static struct ixgbe_phy_operations phy_ops_82599 = {
 	.write_i2c_byte         = &ixgbe_write_i2c_byte_generic,
 	.read_i2c_eeprom        = &ixgbe_read_i2c_eeprom_generic,
 	.write_i2c_eeprom       = &ixgbe_write_i2c_eeprom_generic,
+	.check_overtemp         = &ixgbe_tn_check_overtemp,
 };
 
 struct ixgbe_info ixgbe_82599_info = {
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 9551cbb..d571d10 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -108,6 +108,8 @@ static DEFINE_PCI_DEVICE_TABLE(ixgbe_pci_tbl) = {
 	 board_82599 },
 	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_CX4),
 	 board_82599 },
+	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_T3_LOM),
+	 board_82599 },
 	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_COMBO_BACKPLANE),
 	 board_82599 },
 
@@ -1618,6 +1620,48 @@ static void ixgbe_set_itr_msix(struct ixgbe_q_vector *q_vector)
 	}
 }
 
+/**
+ * ixgbe_check_overtemp_task - worker thread to check over temperature
+ * @work: pointer to work_struct containing our data
+ **/
+static void ixgbe_check_overtemp_task(struct work_struct *work)
+{
+	struct ixgbe_adapter *adapter = container_of(work,
+	                                             struct ixgbe_adapter,
+	                                             check_overtemp_task);
+	struct ixgbe_hw *hw = &adapter->hw;
+	u32 eicr = adapter->interrupt_event;
+
+	if (adapter->flags2 & IXGBE_FLAG2_TEMP_SENSOR_CAPABLE) {
+		switch (hw->device_id) {
+		case IXGBE_DEV_ID_82599_T3_LOM: {
+			u32 autoneg;
+			bool link_up = false;
+
+			if (hw->mac.ops.check_link)
+				hw->mac.ops.check_link(hw, &autoneg, &link_up, false);
+
+			if (((eicr & IXGBE_EICR_GPI_SDP0) && (!link_up)) ||
+			    (eicr & IXGBE_EICR_LSC))
+				/* Check if this is due to overtemp */
+				if (hw->phy.ops.check_overtemp(hw) == IXGBE_ERR_OVERTEMP)
+					break;
+			}
+			return;
+		default:
+			if (!(eicr & IXGBE_EICR_GPI_SDP0))
+				return;
+			break;
+		}
+		DPRINTK(DRV, ERR, "Network adapter has been stopped because it "
+		        "has over heated. Restart the computer. If the problem "
+		        "persists, power off the system and replace the "
+		        "adapter\n");
+		/* write to clear the interrupt */
+		IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_GPI_SDP0);
+	}
+}
+
 static void ixgbe_check_fan_failure(struct ixgbe_adapter *adapter, u32 eicr)
 {
 	struct ixgbe_hw *hw = &adapter->hw;
@@ -1689,6 +1733,10 @@ static irqreturn_t ixgbe_msix_lsc(int irq, void *data)
 
 	if (hw->mac.type == ixgbe_mac_82599EB) {
 		ixgbe_check_sfp_event(adapter, eicr);
+		adapter->interrupt_event = eicr;
+		if ((adapter->flags2 & IXGBE_FLAG2_TEMP_SENSOR_CAPABLE) &&
+		    ((eicr & IXGBE_EICR_GPI_SDP0) || (eicr & IXGBE_EICR_LSC)))
+			schedule_work(&adapter->check_overtemp_task);
 
 		/* Handle Flow Director Full threshold interrupt */
 		if (eicr & IXGBE_EICR_FLOW_DIR) {
@@ -2190,6 +2238,8 @@ static inline void ixgbe_irq_enable(struct ixgbe_adapter *adapter)
 	u32 mask;
 
 	mask = (IXGBE_EIMS_ENABLE_MASK & ~IXGBE_EIMS_RTX_QUEUE);
+	if (adapter->flags2 & IXGBE_FLAG2_TEMP_SENSOR_CAPABLE)
+		mask |= IXGBE_EIMS_GPI_SDP0;
 	if (adapter->flags & IXGBE_FLAG_FAN_FAIL_CAPABLE)
 		mask |= IXGBE_EIMS_GPI_SDP1;
 	if (adapter->hw.mac.type == ixgbe_mac_82599EB) {
@@ -2250,6 +2300,9 @@ static irqreturn_t ixgbe_intr(int irq, void *data)
 		ixgbe_check_sfp_event(adapter, eicr);
 
 	ixgbe_check_fan_failure(adapter, eicr);
+	if ((adapter->flags2 & IXGBE_FLAG2_TEMP_SENSOR_CAPABLE) &&
+	    ((eicr & IXGBE_EICR_GPI_SDP0) || (eicr & IXGBE_EICR_LSC)))
+		schedule_work(&adapter->check_overtemp_task);
 
 	if (napi_schedule_prep(&(q_vector->napi))) {
 		adapter->tx_ring[0]->total_packets = 0;
@@ -3265,6 +3318,13 @@ static int ixgbe_up_complete(struct ixgbe_adapter *adapter)
 		IXGBE_WRITE_REG(hw, IXGBE_EIAM, IXGBE_EICS_RTX_QUEUE);
 	}
 
+	/* Enable Thermal over heat sensor interrupt */
+	if (adapter->flags2 & IXGBE_FLAG2_TEMP_SENSOR_CAPABLE) {
+		gpie = IXGBE_READ_REG(hw, IXGBE_GPIE);
+		gpie |= IXGBE_SDP0_GPIEN;
+		IXGBE_WRITE_REG(hw, IXGBE_GPIE, gpie);
+	}
+
 	/* Enable fan failure interrupt if media type is copper */
 	if (adapter->flags & IXGBE_FLAG_FAN_FAIL_CAPABLE) {
 		gpie = IXGBE_READ_REG(hw, IXGBE_GPIE);
@@ -3666,6 +3726,9 @@ void ixgbe_down(struct ixgbe_adapter *adapter)
 	    adapter->flags & IXGBE_FLAG_FDIR_PERFECT_CAPABLE)
 		cancel_work_sync(&adapter->fdir_reinit_task);
 
+	if (adapter->flags2 & IXGBE_FLAG2_TEMP_SENSOR_CAPABLE)
+		cancel_work_sync(&adapter->check_overtemp_task);
+
 	/* disable transmits in the hardware now that interrupts are off */
 	for (i = 0; i < adapter->num_tx_queues; i++) {
 		j = adapter->tx_ring[i]->reg_idx;
@@ -4645,6 +4708,8 @@ static int __devinit ixgbe_sw_init(struct ixgbe_adapter *adapter)
 		adapter->max_msix_q_vectors = MAX_MSIX_Q_VECTORS_82599;
 		adapter->flags2 |= IXGBE_FLAG2_RSC_CAPABLE;
 		adapter->flags2 |= IXGBE_FLAG2_RSC_ENABLED;
+		if (hw->device_id == IXGBE_DEV_ID_82599_T3_LOM)
+			adapter->flags2 |= IXGBE_FLAG2_TEMP_SENSOR_CAPABLE;
 		if (dev->features & NETIF_F_NTUPLE) {
 			/* Flow Director perfect filter enabled */
 			adapter->flags |= IXGBE_FLAG_FDIR_PERFECT_CAPABLE;
@@ -6561,7 +6626,9 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
 	}
 
 	/* reset_hw fills in the perm_addr as well */
+	hw->phy.reset_if_overtemp = true;
 	err = hw->mac.ops.reset_hw(hw);
+	hw->phy.reset_if_overtemp = false;
 	if (err == IXGBE_ERR_SFP_NOT_PRESENT &&
 	    hw->mac.type == ixgbe_mac_82598EB) {
 		/*
@@ -6730,6 +6797,8 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
 	    adapter->flags & IXGBE_FLAG_FDIR_PERFECT_CAPABLE)
 		INIT_WORK(&adapter->fdir_reinit_task, ixgbe_fdir_reinit_task);
 
+	if (adapter->flags2 & IXGBE_FLAG2_TEMP_SENSOR_CAPABLE)
+		INIT_WORK(&adapter->check_overtemp_task, ixgbe_check_overtemp_task);
 #ifdef CONFIG_IXGBE_DCA
 	if (dca_add_requester(&pdev->dev) == 0) {
 		adapter->flags |= IXGBE_FLAG_DCA_ENABLED;
diff --git a/drivers/net/ixgbe/ixgbe_phy.c b/drivers/net/ixgbe/ixgbe_phy.c
index 22d21af..9c8fb85 100644
--- a/drivers/net/ixgbe/ixgbe_phy.c
+++ b/drivers/net/ixgbe/ixgbe_phy.c
@@ -135,6 +135,11 @@ static enum ixgbe_phy_type ixgbe_get_phy_type_from_id(u32 phy_id)
  **/
 s32 ixgbe_reset_phy_generic(struct ixgbe_hw *hw)
 {
+	/* Don't reset PHY if it's shut down due to overtemp. */
+	if (!hw->phy.reset_if_overtemp &&
+	    (IXGBE_ERR_OVERTEMP == hw->phy.ops.check_overtemp(hw)))
+		return 0;
+
 	/*
 	 * Perform soft PHY reset to the PHY_XS.
 	 * This will cause a soft reset to the PHY
@@ -1345,3 +1350,29 @@ s32 ixgbe_get_phy_firmware_version_tnx(struct ixgbe_hw *hw,
 	return status;
 }
 
+/**
+ *  ixgbe_tn_check_overtemp - Checks if an overtemp occurred.
+ *  @hw: pointer to hardware structure
+ *
+ *  Checks if the LASI temp alarm status was triggered due to overtemp
+ **/
+s32 ixgbe_tn_check_overtemp(struct ixgbe_hw *hw)
+{
+	s32 status = 0;
+	u16 phy_data = 0;
+
+	if (hw->device_id != IXGBE_DEV_ID_82599_T3_LOM)
+		goto out;
+
+	/* Check that the LASI temp alarm status was triggered */
+	hw->phy.ops.read_reg(hw, IXGBE_TN_LASI_STATUS_REG,
+	                     MDIO_MMD_PMAPMD, &phy_data);
+
+	if (!(phy_data & IXGBE_TN_LASI_STATUS_TEMP_ALARM))
+		goto out;
+
+	status = IXGBE_ERR_OVERTEMP;
+out:
+	return status;
+}
+
diff --git a/drivers/net/ixgbe/ixgbe_phy.h b/drivers/net/ixgbe/ixgbe_phy.h
index c9c5459..ef4ba83 100644
--- a/drivers/net/ixgbe/ixgbe_phy.h
+++ b/drivers/net/ixgbe/ixgbe_phy.h
@@ -80,6 +80,8 @@
 #define IXGBE_I2C_T_SU_STO  4
 #define IXGBE_I2C_T_BUF     5
 
+#define IXGBE_TN_LASI_STATUS_REG        0x9005
+#define IXGBE_TN_LASI_STATUS_TEMP_ALARM 0x0008
 
 s32 ixgbe_init_phy_ops_generic(struct ixgbe_hw *hw);
 s32 ixgbe_identify_phy_generic(struct ixgbe_hw *hw);
@@ -106,6 +108,7 @@ s32 ixgbe_identify_sfp_module_generic(struct ixgbe_hw *hw);
 s32 ixgbe_get_sfp_init_sequence_offsets(struct ixgbe_hw *hw,
                                         u16 *list_offset,
                                         u16 *data_offset);
+s32 ixgbe_tn_check_overtemp(struct ixgbe_hw *hw);
 s32 ixgbe_read_i2c_byte_generic(struct ixgbe_hw *hw, u8 byte_offset,
                                 u8 dev_addr, u8 *data);
 s32 ixgbe_write_i2c_byte_generic(struct ixgbe_hw *hw, u8 byte_offset,
diff --git a/drivers/net/ixgbe/ixgbe_type.h b/drivers/net/ixgbe/ixgbe_type.h
index 39b9be8..2eb6e15 100644
--- a/drivers/net/ixgbe/ixgbe_type.h
+++ b/drivers/net/ixgbe/ixgbe_type.h
@@ -51,6 +51,7 @@
 #define IXGBE_DEV_ID_82599_KX4           0x10F7
 #define IXGBE_DEV_ID_82599_KX4_MEZZ      0x1514
 #define IXGBE_DEV_ID_82599_KR            0x1517
+#define IXGBE_DEV_ID_82599_T3_LOM        0x151C
 #define IXGBE_DEV_ID_82599_CX4           0x10F9
 #define IXGBE_DEV_ID_82599_SFP           0x10FB
 #define IXGBE_DEV_ID_82599_SFP_EM        0x1507
@@ -2470,6 +2471,7 @@ struct ixgbe_phy_operations {
 	s32 (*write_i2c_byte)(struct ixgbe_hw *, u8, u8, u8);
 	s32 (*read_i2c_eeprom)(struct ixgbe_hw *, u8 , u8 *);
 	s32 (*write_i2c_eeprom)(struct ixgbe_hw *, u8, u8);
+	s32 (*check_overtemp)(struct ixgbe_hw *);
 };
 
 struct ixgbe_eeprom_info {
@@ -2518,6 +2520,7 @@ struct ixgbe_phy_info {
 	enum ixgbe_smart_speed          smart_speed;
 	bool                            smart_speed_active;
 	bool                            multispeed_fiber;
+	bool                            reset_if_overtemp;
 };
 
 #include "ixgbe_mbx.h"
@@ -2605,6 +2608,7 @@ struct ixgbe_info {
 #define IXGBE_ERR_FDIR_REINIT_FAILED            -23
 #define IXGBE_ERR_EEPROM_VERSION                -24
 #define IXGBE_ERR_NO_SPACE                      -25
+#define IXGBE_ERR_OVERTEMP                      -26
 #define IXGBE_NOT_IMPLEMENTED                   0x7FFFFFFF
 
 #endif /* _IXGBE_TYPE_H_ */


^ permalink raw reply related

* Re: [PATCH] vhost-net: utilize PUBLISH_USED_IDX feature
From: Michael S. Tsirkin @ 2010-05-19 22:27 UTC (permalink / raw)
  To: Avi Kivity
  Cc: davem, Juan Quintela, Rusty Russell, Paul E. McKenney,
	Arnd Bergmann, kvm, virtualization, netdev, linux-kernel,
	alex.williamson, amit.shah
In-Reply-To: <4BF41A33.8090309@redhat.com>

On Wed, May 19, 2010 at 08:04:51PM +0300, Avi Kivity wrote:
> On 05/18/2010 04:19 AM, Michael S. Tsirkin wrote:
>> With PUBLISH_USED_IDX, guest tells us which used entries
>> it has consumed. This can be used to reduce the number
>> of interrupts: after we write a used entry, if the guest has not yet
>> consumed the previous entry, or if the guest has already consumed the
>> new entry, we do not need to interrupt.
>> This improves bandwidth by 30% under some workflows.
>>
>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>> ---
>>
>> Rusty, Dave, this patch depends on the patch
>> "virtio: put last seen used index into ring itself"
>> which is currently destined at Rusty's tree.
>> Rusty, if you are taking that one for 2.6.35, please
>> take this one as well.
>> Dave, any objections?
>>    
>
> I object: I think the index should have its own cacheline,

The issue here is that host/guest do not know each
other's cache line size. I guess we could just put it
at offset 128 or something like that ... Rusty?

> and that it should be documented before merging.

I think you meant to object to the virtio patch, not this one.  This
patch does not introduce new layout, just implements host support.
virtio spec patch will follow: it is not part of linux tree so
there is no patch dependency.

> -- 
> Do not meddle in the internals of kernels, for they are subtle and quick to panic.

^ permalink raw reply

* [net-next PATCH] ixgbe:add support for a new 82599 10G Base-T device
From: Jeff Kirsher @ 2010-05-19 22:16 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Mallikarjuna R Chilakala, Jeff Kirsher

From: Mallikarjuna R Chilakala <mallikarjuna.chilakala@intel.com>

This adds support for a new copper device for 82599, device id 0x151c.
This 82599 10GBase-T device uses the PHY's internal temperature sensor
to guard against over-temp conditions. In this scenario the PHY will be
put in a low power mode and link will no longer be able to transmit or
receive any data. When this occurs, the over-temp interrupt is latched
and the driver logs an error message. A HW reset or power cycle is
required to clear this status.

Signed-off-by: Mallikarjuna R Chilakala <mallikarjuna.chilakala@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/ixgbe/ixgbe.h       |    3 ++
 drivers/net/ixgbe/ixgbe_82598.c |    1 +
 drivers/net/ixgbe/ixgbe_82599.c |    1 +
 drivers/net/ixgbe/ixgbe_main.c  |   68 +++++++++++++++++++++++++++++++++++++++
 drivers/net/ixgbe/ixgbe_phy.c   |   31 ++++++++++++++++++
 drivers/net/ixgbe/ixgbe_phy.h   |    3 ++
 drivers/net/ixgbe/ixgbe_type.h  |    4 ++
 7 files changed, 111 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
index d0ea3d6..ffae480 100644
--- a/drivers/net/ixgbe/ixgbe.h
+++ b/drivers/net/ixgbe/ixgbe.h
@@ -360,6 +360,7 @@ struct ixgbe_adapter {
 	u32 flags2;
 #define IXGBE_FLAG2_RSC_CAPABLE                 (u32)(1)
 #define IXGBE_FLAG2_RSC_ENABLED                 (u32)(1 << 1)
+#define IXGBE_FLAG2_TEMP_SENSOR_CAPABLE         (u32)(1 << 2)
 /* default to trying for four seconds */
 #define IXGBE_TRY_LINK_TIMEOUT (4 * HZ)
 
@@ -407,6 +408,8 @@ struct ixgbe_adapter {
 	u16 eeprom_version;
 
 	int node;
+	struct work_struct check_overtemp_task;
+	u32 interrupt_event;
 
 	/* SR-IOV */
 	DECLARE_BITMAP(active_vfs, IXGBE_MAX_VF_FUNCTIONS);
diff --git a/drivers/net/ixgbe/ixgbe_82598.c b/drivers/net/ixgbe/ixgbe_82598.c
index f2b7ff4..9c02d60 100644
--- a/drivers/net/ixgbe/ixgbe_82598.c
+++ b/drivers/net/ixgbe/ixgbe_82598.c
@@ -1236,6 +1236,7 @@ static struct ixgbe_phy_operations phy_ops_82598 = {
 	.setup_link		= &ixgbe_setup_phy_link_generic,
 	.setup_link_speed	= &ixgbe_setup_phy_link_speed_generic,
 	.read_i2c_eeprom	= &ixgbe_read_i2c_eeprom_82598,
+	.check_overtemp   = &ixgbe_tn_check_overtemp,
 };
 
 struct ixgbe_info ixgbe_82598_info = {
diff --git a/drivers/net/ixgbe/ixgbe_82599.c b/drivers/net/ixgbe/ixgbe_82599.c
index e9706eb..a4e2901 100644
--- a/drivers/net/ixgbe/ixgbe_82599.c
+++ b/drivers/net/ixgbe/ixgbe_82599.c
@@ -2395,6 +2395,7 @@ static struct ixgbe_phy_operations phy_ops_82599 = {
 	.write_i2c_byte         = &ixgbe_write_i2c_byte_generic,
 	.read_i2c_eeprom        = &ixgbe_read_i2c_eeprom_generic,
 	.write_i2c_eeprom       = &ixgbe_write_i2c_eeprom_generic,
+	.check_overtemp         = &ixgbe_tn_check_overtemp,
 };
 
 struct ixgbe_info ixgbe_82599_info = {
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 9551cbb..3ee702b 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -108,6 +108,8 @@ static DEFINE_PCI_DEVICE_TABLE(ixgbe_pci_tbl) = {
 	 board_82599 },
 	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_CX4),
 	 board_82599 },
+	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_T3_LOM),
+	 board_82599 },
 	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_COMBO_BACKPLANE),
 	 board_82599 },
 
@@ -1618,6 +1620,47 @@ static void ixgbe_set_itr_msix(struct ixgbe_q_vector *q_vector)
 	}
 }
 
+/**
+ * ixgbe_check_overtemp_task - worker thread to check over temperature
+ * @work: pointer to work_struct containing our data
+ **/
+static void ixgbe_check_overtemp_task(struct work_struct *work)
+{
+	struct ixgbe_adapter *adapter = container_of(work,
+	                                             struct ixgbe_adapter,
+	                                             check_overtemp_task);
+	struct ixgbe_hw *hw = &adapter->hw;
+	u32 eicr = adapter->interrupt_event;
+
+	if (adapter->flags2 & IXGBE_FLAG2_TEMP_SENSOR_CAPABLE) {
+		switch (hw->device_id) {
+		case IXGBE_DEV_ID_82599_T3_LOM: {
+			u32 autoneg;
+			bool link_up = false;
+
+			if (hw->mac.ops.check_link)
+				hw->mac.ops.check_link(hw, &autoneg, &link_up, false);
+
+			if (((eicr & IXGBE_EICR_GPI_SDP0) && (!link_up)) ||
+			    (eicr & IXGBE_EICR_LSC))
+				/* Check if this is due to overtemp */
+				if (hw->phy.ops.check_overtemp(hw) == IXGBE_ERR_OVERTEMP)
+					break;
+			}
+			return;
+		default:
+			if (!(eicr & IXGBE_EICR_GPI_SDP0))
+				return;
+			break;
+		}
+		e_crit("Network adapter has been stopped because it has "
+		        "over heated. Restart the computer. If the problem persists, "
+		        "power off the system and replace the adapter\n");
+		/* write to clear the interrupt */
+		IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_GPI_SDP0);
+	}
+}
+
 static void ixgbe_check_fan_failure(struct ixgbe_adapter *adapter, u32 eicr)
 {
 	struct ixgbe_hw *hw = &adapter->hw;
@@ -1689,6 +1732,10 @@ static irqreturn_t ixgbe_msix_lsc(int irq, void *data)
 
 	if (hw->mac.type == ixgbe_mac_82599EB) {
 		ixgbe_check_sfp_event(adapter, eicr);
+		adapter->interrupt_event = eicr;
+		if ((adapter->flags2 & IXGBE_FLAG2_TEMP_SENSOR_CAPABLE) &&
+		    ((eicr & IXGBE_EICR_GPI_SDP0) || (eicr & IXGBE_EICR_LSC)))
+			schedule_work(&adapter->check_overtemp_task);
 
 		/* Handle Flow Director Full threshold interrupt */
 		if (eicr & IXGBE_EICR_FLOW_DIR) {
@@ -2190,6 +2237,8 @@ static inline void ixgbe_irq_enable(struct ixgbe_adapter *adapter)
 	u32 mask;
 
 	mask = (IXGBE_EIMS_ENABLE_MASK & ~IXGBE_EIMS_RTX_QUEUE);
+	if (adapter->flags2 & IXGBE_FLAG2_TEMP_SENSOR_CAPABLE)
+		mask |= IXGBE_EIMS_GPI_SDP0;
 	if (adapter->flags & IXGBE_FLAG_FAN_FAIL_CAPABLE)
 		mask |= IXGBE_EIMS_GPI_SDP1;
 	if (adapter->hw.mac.type == ixgbe_mac_82599EB) {
@@ -2250,6 +2299,9 @@ static irqreturn_t ixgbe_intr(int irq, void *data)
 		ixgbe_check_sfp_event(adapter, eicr);
 
 	ixgbe_check_fan_failure(adapter, eicr);
+	if ((adapter->flags2 & IXGBE_FLAG2_TEMP_SENSOR_CAPABLE) &&
+	    ((eicr & IXGBE_EICR_GPI_SDP0) || (eicr & IXGBE_EICR_LSC)))
+		schedule_work(&adapter->check_overtemp_task);
 
 	if (napi_schedule_prep(&(q_vector->napi))) {
 		adapter->tx_ring[0]->total_packets = 0;
@@ -3265,6 +3317,13 @@ static int ixgbe_up_complete(struct ixgbe_adapter *adapter)
 		IXGBE_WRITE_REG(hw, IXGBE_EIAM, IXGBE_EICS_RTX_QUEUE);
 	}
 
+	/* Enable Thermal over heat sensor interrupt */
+	if (adapter->flags2 & IXGBE_FLAG2_TEMP_SENSOR_CAPABLE) {
+		gpie = IXGBE_READ_REG(hw, IXGBE_GPIE);
+		gpie |= IXGBE_SDP0_GPIEN;
+		IXGBE_WRITE_REG(hw, IXGBE_GPIE, gpie);
+	}
+
 	/* Enable fan failure interrupt if media type is copper */
 	if (adapter->flags & IXGBE_FLAG_FAN_FAIL_CAPABLE) {
 		gpie = IXGBE_READ_REG(hw, IXGBE_GPIE);
@@ -3666,6 +3725,9 @@ void ixgbe_down(struct ixgbe_adapter *adapter)
 	    adapter->flags & IXGBE_FLAG_FDIR_PERFECT_CAPABLE)
 		cancel_work_sync(&adapter->fdir_reinit_task);
 
+	if (adapter->flags2 & IXGBE_FLAG2_TEMP_SENSOR_CAPABLE)
+		cancel_work_sync(&adapter->check_overtemp_task);
+
 	/* disable transmits in the hardware now that interrupts are off */
 	for (i = 0; i < adapter->num_tx_queues; i++) {
 		j = adapter->tx_ring[i]->reg_idx;
@@ -4645,6 +4707,8 @@ static int __devinit ixgbe_sw_init(struct ixgbe_adapter *adapter)
 		adapter->max_msix_q_vectors = MAX_MSIX_Q_VECTORS_82599;
 		adapter->flags2 |= IXGBE_FLAG2_RSC_CAPABLE;
 		adapter->flags2 |= IXGBE_FLAG2_RSC_ENABLED;
+		if (hw->device_id == IXGBE_DEV_ID_82599_T3_LOM)
+			adapter->flags2 |= IXGBE_FLAG2_TEMP_SENSOR_CAPABLE;
 		if (dev->features & NETIF_F_NTUPLE) {
 			/* Flow Director perfect filter enabled */
 			adapter->flags |= IXGBE_FLAG_FDIR_PERFECT_CAPABLE;
@@ -6561,7 +6625,9 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
 	}
 
 	/* reset_hw fills in the perm_addr as well */
+	hw->phy.reset_if_overtemp = true;
 	err = hw->mac.ops.reset_hw(hw);
+	hw->phy.reset_if_overtemp = false;
 	if (err == IXGBE_ERR_SFP_NOT_PRESENT &&
 	    hw->mac.type == ixgbe_mac_82598EB) {
 		/*
@@ -6730,6 +6796,8 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
 	    adapter->flags & IXGBE_FLAG_FDIR_PERFECT_CAPABLE)
 		INIT_WORK(&adapter->fdir_reinit_task, ixgbe_fdir_reinit_task);
 
+	if (adapter->flags2 & IXGBE_FLAG2_TEMP_SENSOR_CAPABLE)
+		INIT_WORK(&adapter->check_overtemp_task, ixgbe_check_overtemp_task);
 #ifdef CONFIG_IXGBE_DCA
 	if (dca_add_requester(&pdev->dev) == 0) {
 		adapter->flags |= IXGBE_FLAG_DCA_ENABLED;
diff --git a/drivers/net/ixgbe/ixgbe_phy.c b/drivers/net/ixgbe/ixgbe_phy.c
index 22d21af..9c8fb85 100644
--- a/drivers/net/ixgbe/ixgbe_phy.c
+++ b/drivers/net/ixgbe/ixgbe_phy.c
@@ -135,6 +135,11 @@ static enum ixgbe_phy_type ixgbe_get_phy_type_from_id(u32 phy_id)
  **/
 s32 ixgbe_reset_phy_generic(struct ixgbe_hw *hw)
 {
+	/* Don't reset PHY if it's shut down due to overtemp. */
+	if (!hw->phy.reset_if_overtemp &&
+	    (IXGBE_ERR_OVERTEMP == hw->phy.ops.check_overtemp(hw)))
+		return 0;
+
 	/*
 	 * Perform soft PHY reset to the PHY_XS.
 	 * This will cause a soft reset to the PHY
@@ -1345,3 +1350,29 @@ s32 ixgbe_get_phy_firmware_version_tnx(struct ixgbe_hw *hw,
 	return status;
 }
 
+/**
+ *  ixgbe_tn_check_overtemp - Checks if an overtemp occurred.
+ *  @hw: pointer to hardware structure
+ *
+ *  Checks if the LASI temp alarm status was triggered due to overtemp
+ **/
+s32 ixgbe_tn_check_overtemp(struct ixgbe_hw *hw)
+{
+	s32 status = 0;
+	u16 phy_data = 0;
+
+	if (hw->device_id != IXGBE_DEV_ID_82599_T3_LOM)
+		goto out;
+
+	/* Check that the LASI temp alarm status was triggered */
+	hw->phy.ops.read_reg(hw, IXGBE_TN_LASI_STATUS_REG,
+	                     MDIO_MMD_PMAPMD, &phy_data);
+
+	if (!(phy_data & IXGBE_TN_LASI_STATUS_TEMP_ALARM))
+		goto out;
+
+	status = IXGBE_ERR_OVERTEMP;
+out:
+	return status;
+}
+
diff --git a/drivers/net/ixgbe/ixgbe_phy.h b/drivers/net/ixgbe/ixgbe_phy.h
index c9c5459..ef4ba83 100644
--- a/drivers/net/ixgbe/ixgbe_phy.h
+++ b/drivers/net/ixgbe/ixgbe_phy.h
@@ -80,6 +80,8 @@
 #define IXGBE_I2C_T_SU_STO  4
 #define IXGBE_I2C_T_BUF     5
 
+#define IXGBE_TN_LASI_STATUS_REG        0x9005
+#define IXGBE_TN_LASI_STATUS_TEMP_ALARM 0x0008
 
 s32 ixgbe_init_phy_ops_generic(struct ixgbe_hw *hw);
 s32 ixgbe_identify_phy_generic(struct ixgbe_hw *hw);
@@ -106,6 +108,7 @@ s32 ixgbe_identify_sfp_module_generic(struct ixgbe_hw *hw);
 s32 ixgbe_get_sfp_init_sequence_offsets(struct ixgbe_hw *hw,
                                         u16 *list_offset,
                                         u16 *data_offset);
+s32 ixgbe_tn_check_overtemp(struct ixgbe_hw *hw);
 s32 ixgbe_read_i2c_byte_generic(struct ixgbe_hw *hw, u8 byte_offset,
                                 u8 dev_addr, u8 *data);
 s32 ixgbe_write_i2c_byte_generic(struct ixgbe_hw *hw, u8 byte_offset,
diff --git a/drivers/net/ixgbe/ixgbe_type.h b/drivers/net/ixgbe/ixgbe_type.h
index 39b9be8..2eb6e15 100644
--- a/drivers/net/ixgbe/ixgbe_type.h
+++ b/drivers/net/ixgbe/ixgbe_type.h
@@ -51,6 +51,7 @@
 #define IXGBE_DEV_ID_82599_KX4           0x10F7
 #define IXGBE_DEV_ID_82599_KX4_MEZZ      0x1514
 #define IXGBE_DEV_ID_82599_KR            0x1517
+#define IXGBE_DEV_ID_82599_T3_LOM        0x151C
 #define IXGBE_DEV_ID_82599_CX4           0x10F9
 #define IXGBE_DEV_ID_82599_SFP           0x10FB
 #define IXGBE_DEV_ID_82599_SFP_EM        0x1507
@@ -2470,6 +2471,7 @@ struct ixgbe_phy_operations {
 	s32 (*write_i2c_byte)(struct ixgbe_hw *, u8, u8, u8);
 	s32 (*read_i2c_eeprom)(struct ixgbe_hw *, u8 , u8 *);
 	s32 (*write_i2c_eeprom)(struct ixgbe_hw *, u8, u8);
+	s32 (*check_overtemp)(struct ixgbe_hw *);
 };
 
 struct ixgbe_eeprom_info {
@@ -2518,6 +2520,7 @@ struct ixgbe_phy_info {
 	enum ixgbe_smart_speed          smart_speed;
 	bool                            smart_speed_active;
 	bool                            multispeed_fiber;
+	bool                            reset_if_overtemp;
 };
 
 #include "ixgbe_mbx.h"
@@ -2605,6 +2608,7 @@ struct ixgbe_info {
 #define IXGBE_ERR_FDIR_REINIT_FAILED            -23
 #define IXGBE_ERR_EEPROM_VERSION                -24
 #define IXGBE_ERR_NO_SPACE                      -25
+#define IXGBE_ERR_OVERTEMP                      -26
 #define IXGBE_NOT_IMPLEMENTED                   0x7FFFFFFF
 
 #endif /* _IXGBE_TYPE_H_ */


^ permalink raw reply related

* Re: [RFC] netem: correlated loss generation (v3)
From: Hagen Paul Pfeifer @ 2010-05-19 21:42 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Stefano Salsano, David Miller, Fabio Ludovici, netdev, netem
In-Reply-To: <20100517205621.036a06e0@nehalam>

* Stephen Hemminger | 2010-05-17 20:56:21 [-0700]:

>Subject: netem - revised correlated loss generator
>
>This is a patch originated with Stefano Salsano and Fabio Ludovici.
>It provides several alternative loss models for use with netem.
>There are two state machine based models and one table driven model.
>
>To simplify the original code:
>   * eliminated the debugging messages and statistics
>   * reformatted for clarity
>   * changed API to nested attribute relating to loss
>   * changed the table to always loop across bits
>   * only allocate parameters needed
>
>Still untested, for comment only...
>Should have tested version before 2.6.35 merge window closes.

Why mainline? I question the advantage for a wide audience; it looks like
an academic-only piece of software - correct me if I'm wrong.

The authors pointed to some weak points in the implementation of the current
loss/correlation logic. But this "fix" adds another - complicated - component
and leaves the broken components untouched ...

HGN

-- 
Hagen Paul Pfeifer <hagen@jauu.net>  ||  http://jauu.net/
Telephone: +49 174 5455209           ||  Key Id: 0x98350C22
Key Fingerprint: 490F 557B 6C48 6D7E 5706 2EA2 4A22 8D45 9835 0C22

^ permalink raw reply

* [PATCH] net: fix problem in dequeuing from input_pkt_queue
From: Tom Herbert @ 2010-05-19 21:47 UTC (permalink / raw)
  To: davem; +Cc: eric.dumazet, xiaosuo, netdev

Fix some issues introduced in batch skb dequeuing for input_pkt_queue.
The primary issue is that the queue head must be incremented only
after a packet has been processed, that is only after
__netif_receive_skb has been called.  This is needed for the mechanism
to prevent OOO packet in RFS.  Also when flushing the input_pkt_queue
and process_queue, the process queue should be done first to prevent
OOO packets.

Because the input_pkt_queue has been effectively split into two queues,
the calculation of the tail ptr is no longer correct.  The correct value
would be head+input_pkt_queue->len+process_queue->len.  To avoid
this calculation we added an explicit input_queue_tail in softnet_data.
The tail value is simply incremented when queuing to input_pkt_queue.

In process_backlog the processing of the packet queue can be done
without irq's being disabled.

Made dropped in softnet_data to be "unsigned int" for consistency.

Signed-off-by: Tom Herbert <therbert@google.com>
---
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c3487a6..bc0bc85 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1403,17 +1403,25 @@ struct softnet_data {
 	struct softnet_data	*rps_ipi_next;
 	unsigned int		cpu;
 	unsigned int		input_queue_head;
+	unsigned int		input_queue_tail;
 #endif
-	unsigned		dropped;
+	unsigned int		dropped;
 	struct sk_buff_head	input_pkt_queue;
 	struct napi_struct	backlog;
 };
 
-static inline void input_queue_head_add(struct softnet_data *sd,
-					unsigned int len)
+static inline void input_queue_head_incr(struct softnet_data *sd)
 {
 #ifdef CONFIG_RPS
-	sd->input_queue_head += len;
+	sd->input_queue_head++;
+#endif
+}
+
+static inline void input_queue_tail_incr_save(struct softnet_data *sd,
+					      unsigned int *qtail)
+{
+#ifdef CONFIG_RPS
+	*qtail = ++sd->input_queue_tail;
 #endif
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 6c82065..be7d475 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2426,10 +2426,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 		if (skb_queue_len(&sd->input_pkt_queue)) {
 enqueue:
 			__skb_queue_tail(&sd->input_pkt_queue, skb);
-#ifdef CONFIG_RPS
-			*qtail = sd->input_queue_head +
-					skb_queue_len(&sd->input_pkt_queue);
-#endif
+			input_queue_tail_incr_save(sd, qtail);
 			rps_unlock(sd);
 			local_irq_restore(flags);
 			return NET_RX_SUCCESS;
@@ -2959,22 +2956,24 @@ static void flush_backlog(void *arg)
 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
 	struct sk_buff *skb, *tmp;
 
-	rps_lock(sd);
-	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
+	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 		if (skb->dev == dev) {
-			__skb_unlink(skb, &sd->input_pkt_queue);
+			__skb_unlink(skb, &sd->process_queue);
 			kfree_skb(skb);
-			input_queue_head_add(sd, 1);
+			input_queue_head_incr(sd);
 		}
 	}
-	rps_unlock(sd);
 
-	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
+	rps_lock(sd);
+	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
 		if (skb->dev == dev) {
-			__skb_unlink(skb, &sd->process_queue);
+			__skb_unlink(skb, &sd->input_pkt_queue);
 			kfree_skb(skb);
+			input_queue_head_incr(sd);
 		}
 	}
+	rps_unlock(sd);
+
 }
 
 static int napi_gro_complete(struct sk_buff *skb)
@@ -3320,26 +3319,24 @@ static int process_backlog(struct napi_struct *napi, int quota)
 	}
 #endif
 	napi->weight = weight_p;
-	local_irq_disable();
 	while (work < quota) {
 		struct sk_buff *skb;
 		unsigned int qlen;
 
 		while ((skb = __skb_dequeue(&sd->process_queue))) {
-			local_irq_enable();
 			__netif_receive_skb(skb);
+			input_queue_head_incr(sd);
 			if (++work >= quota)
 				return work;
-			local_irq_disable();
 		}
 
+		local_irq_disable();
 		rps_lock(sd);
 		qlen = skb_queue_len(&sd->input_pkt_queue);
-		if (qlen) {
-			input_queue_head_add(sd, qlen);
+		if (qlen)
 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
 						   &sd->process_queue);
-		}
+
 		if (qlen < quota - work) {
 			/*
 			 * Inline a custom version of __napi_complete().
@@ -3354,8 +3351,8 @@ static int process_backlog(struct napi_struct *napi, int quota)
 			quota = work + qlen;
 		}
 		rps_unlock(sd);
+		local_irq_enable();
 	}
-	local_irq_enable();
 
 	return work;
 }
@@ -5679,12 +5676,14 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 	local_irq_enable();
 
 	/* Process offline CPU's input_pkt_queue */
-	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
+	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
 		netif_rx(skb);
-		input_queue_head_add(oldsd, 1);
+		input_queue_head_incr(oldsd);
 	}
-	while ((skb = __skb_dequeue(&oldsd->process_queue)))
+	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
 		netif_rx(skb);
+		input_queue_head_incr(oldsd);
+	}
 
 	return NOTIFY_OK;
 }

^ permalink raw reply related

* Re: tun: Use netif_receive_skb instead of netif_rx
From: Brian Bloniarz @ 2010-05-19 21:00 UTC (permalink / raw)
  To: tgraf
  Cc: Neil Horman, Neil Horman, Eric Dumazet, Herbert Xu,
	David S. Miller, netdev
In-Reply-To: <1274302191.3148.2.camel@lsx.localdomain>

On 05/19/2010 04:49 PM, Thomas Graf wrote:
> On Wed, 2010-05-19 at 14:00 -0400, Neil Horman wrote: 
>> I'm currently testing this, unfortunately, and it's not breaking anything, but it
>> doesn't allow cgroups to classify frames coming from tun interfaces.  I'm still
>> investigating, but I think the issue is that, because we call local_bh_disable
>> with this patch, we wind up raising the count at SOFTIRQ_OFFSET in preempt_count
>> for the task.  Since the cgroup classifier has this check:
>>
>> if (softirq_count() != SOFTIRQ_OFFSET))
>> 	return -1;
>>
>> We still fail to classify the frame.  the cgroup classifier is assuming that any
>> frame arriving with a softirq count of 1 means we came directly from the
>> dev_queue_xmit routine and is safe to check current().  Any less than that, and
>> something is wrong (as we at least need the local_bh_disable in dev_queue_xmit),
>> and any more implies that we have nested calls to local_bh_disable, meaning
>> we're really handling a softirq context.
> 
> It is a hack but the only method to check for softirq context I found. I
> would favor using a flag if there was one.

Eric probably has some thoughts on this -- his scheduler-batching patch RFC
from last year needed the same bit of info:
http://patchwork.ozlabs.org/patch/24536/
(see the changes to trace_softirq_context).

^ permalink raw reply

* Re: tun: Use netif_receive_skb instead of netif_rx
From: Thomas Graf @ 2010-05-19 20:49 UTC (permalink / raw)
  To: Neil Horman
  Cc: Neil Horman, Eric Dumazet, Herbert Xu, David S. Miller, netdev
In-Reply-To: <20100519180053.GC26519@hmsreliant.think-freely.org>

On Wed, 2010-05-19 at 14:00 -0400, Neil Horman wrote: 
> I'm currently testing this, unfortunately, and it's not breaking anything, but it
> doesn't allow cgroups to classify frames coming from tun interfaces.  I'm still
> investigating, but I think the issue is that, because we call local_bh_disable
> with this patch, we wind up raising the count at SOFTIRQ_OFFSET in preempt_count
> for the task.  Since the cgroup classifier has this check:
> 
> if (softirq_count() != SOFTIRQ_OFFSET)
> 	return -1;
> 
> We still fail to classify the frame.  The cgroup classifier is assuming that any
> frame arriving with a softirq count of 1 means we came directly from the
> dev_queue_xmit routine and is safe to check current().  Any less than that, and
> something is wrong (as we at least need the local_bh_disable in dev_queue_xmit),
> and any more implies that we have nested calls to local_bh_disable, meaning
> we're really handling a softirq context.

It is a hack but the only method to check for softirq context I found. I
would favor using a flag if there was one.


^ permalink raw reply

* Re: how many msi (msi-x) vectors can be setup?
From: Yinghai Lu @ 2010-05-19 20:33 UTC (permalink / raw)
  To: zhou rui; +Cc: netdev
In-Reply-To: <AANLkTikT3ThBnO8q3AvhS9t4jUu8npOwYmjY6WUMhseq@mail.gmail.com>

On Wed, May 19, 2010 at 8:18 AM, zhou rui <wirelesser@gmail.com> wrote:
> hi there:
> how many msi (msi-x) vectors can be setup?
> the number is limited by hardware resource(nic), or kernel ?
> I found that the driver (broadcom 57711 ver 1.5.12) tried to request
> 16 queues on my kernel 2.6.27, but only 2 are available
> will it be increased if I update the driver or kernel?
> and is there a limitation in the system? if the other devices have
> already occupied too many MSI vectors then it is not enough.

from kernel 2.6.19 x86_64, there is per-cpu vector irq support.

depends on your system: CPU num? 64bit or 32bit.

YH

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox